diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,53815 @@
+{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23049, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003904724716907458, + "grad_norm": 22.37471580505371, + "learning_rate": 6.000000000000001e-08, + "loss": 1.5848, + "step": 3 + }, + { + "epoch": 0.0007809449433814916, + "grad_norm": 24.825096130371094, + "learning_rate": 1.2000000000000002e-07, + "loss": 1.4111, + "step": 6 + }, + { + "epoch": 0.0011714174150722373, + "grad_norm": 22.30377769470215, + "learning_rate": 1.8e-07, + "loss": 1.4883, + "step": 9 + }, + { + "epoch": 0.0015618898867629833, + "grad_norm": 22.412364959716797, + "learning_rate": 2.4000000000000003e-07, + "loss": 1.5305, + "step": 12 + }, + { + "epoch": 0.001952362358453729, + "grad_norm": 22.382173538208008, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.4476, + "step": 15 + }, + { + "epoch": 0.0023428348301444747, + "grad_norm": 18.971206665039062, + "learning_rate": 3.6e-07, + "loss": 1.3828, + "step": 18 + }, + { + "epoch": 0.002733307301835221, + "grad_norm": 17.42239761352539, + "learning_rate": 4.2000000000000006e-07, + "loss": 1.4491, + "step": 21 + }, + { + "epoch": 0.0031237797735259665, + "grad_norm": 16.123905181884766, + "learning_rate": 4.800000000000001e-07, + "loss": 1.418, + "step": 24 + }, + { + "epoch": 0.003514252245216712, + "grad_norm": 15.892184257507324, + "learning_rate": 5.4e-07, + "loss": 1.4415, + "step": 27 + }, + { + "epoch": 0.003904724716907458, + "grad_norm": 12.987159729003906, + "learning_rate": 6.000000000000001e-07, + "loss": 1.3907, + "step": 30 + }, + { + "epoch": 0.004295197188598204, + "grad_norm": 11.818194389343262, + "learning_rate": 6.6e-07, + "loss": 1.421, + "step": 33 + }, + { + "epoch": 0.004685669660288949, + "grad_norm": 11.758996963500977, + "learning_rate": 7.2e-07, + "loss": 1.2863, + "step": 36 + }, + { + "epoch": 0.005076142131979695, + "grad_norm": 11.128711700439453, + "learning_rate": 7.8e-07, + "loss": 1.2689, + "step": 39 + }, + { + "epoch": 0.005466614603670442, + "grad_norm": 10.793634414672852, + "learning_rate": 8.400000000000001e-07, + "loss": 1.3767, + "step": 42 + }, + { + "epoch": 0.005857087075361187, + "grad_norm": 10.585275650024414, + "learning_rate": 9.000000000000001e-07, + "loss": 1.2601, + "step": 45 + }, + { + "epoch": 0.006247559547051933, + "grad_norm": 9.969756126403809, + "learning_rate": 9.600000000000001e-07, + "loss": 1.1707, + "step": 48 + }, + { + "epoch": 0.006638032018742679, + "grad_norm": 9.342447280883789, + "learning_rate": 1.02e-06, + "loss": 1.1647, + "step": 51 + }, + { + "epoch": 0.007028504490433424, + "grad_norm": 9.255729675292969, + "learning_rate": 1.08e-06, + "loss": 1.1294, + "step": 54 + }, + { + "epoch": 0.00741897696212417, + "grad_norm": 8.989304542541504, + "learning_rate": 1.14e-06, + "loss": 1.1128, + "step": 57 + }, + { + "epoch": 0.007809449433814916, + "grad_norm": 8.631978034973145, + "learning_rate": 1.2000000000000002e-06, + "loss": 1.2147, + "step": 60 + }, + { + "epoch": 0.008199921905505662, + "grad_norm": 9.135863304138184, + "learning_rate": 1.26e-06, + "loss": 1.052, + "step": 63 + }, + { + "epoch": 0.008590394377196407, + "grad_norm": 7.280301570892334, + "learning_rate": 1.32e-06, + "loss": 0.9579, + "step": 66 + }, + {
+ "epoch": 0.008980866848887154, + "grad_norm": 6.385473728179932, + "learning_rate": 1.3800000000000001e-06, + "loss": 1.1031, + "step": 69 + }, + { + "epoch": 0.009371339320577899, + "grad_norm": 4.986073017120361, + "learning_rate": 1.44e-06, + "loss": 0.9746, + "step": 72 + }, + { + "epoch": 0.009761811792268645, + "grad_norm": 4.2078423500061035, + "learning_rate": 1.5e-06, + "loss": 1.0148, + "step": 75 + }, + { + "epoch": 0.01015228426395939, + "grad_norm": 4.854979038238525, + "learning_rate": 1.56e-06, + "loss": 1.005, + "step": 78 + }, + { + "epoch": 0.010542756735650137, + "grad_norm": 3.9257068634033203, + "learning_rate": 1.6200000000000002e-06, + "loss": 0.8129, + "step": 81 + }, + { + "epoch": 0.010933229207340883, + "grad_norm": 4.077581882476807, + "learning_rate": 1.6800000000000002e-06, + "loss": 1.0132, + "step": 84 + }, + { + "epoch": 0.011323701679031628, + "grad_norm": 4.484386444091797, + "learning_rate": 1.74e-06, + "loss": 0.9022, + "step": 87 + }, + { + "epoch": 0.011714174150722375, + "grad_norm": 3.997361898422241, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.8801, + "step": 90 + }, + { + "epoch": 0.01210464662241312, + "grad_norm": 3.9567580223083496, + "learning_rate": 1.8600000000000002e-06, + "loss": 0.9124, + "step": 93 + }, + { + "epoch": 0.012495119094103866, + "grad_norm": 4.3611297607421875, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.939, + "step": 96 + }, + { + "epoch": 0.012885591565794611, + "grad_norm": 7.97072172164917, + "learning_rate": 1.98e-06, + "loss": 0.9517, + "step": 99 + }, + { + "epoch": 0.013276064037485357, + "grad_norm": 3.360591173171997, + "learning_rate": 2.04e-06, + "loss": 0.7842, + "step": 102 + }, + { + "epoch": 0.013666536509176102, + "grad_norm": 8.188989639282227, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.9067, + "step": 105 + }, + { + "epoch": 0.014057008980866849, + "grad_norm": 5.126707077026367, + "learning_rate": 2.16e-06, + "loss": 0.8402, + "step": 108 + }, + { + "epoch": 0.014447481452557595, + "grad_norm": 3.1572999954223633, + "learning_rate": 2.2200000000000003e-06, + "loss": 0.7993, + "step": 111 + }, + { + "epoch": 0.01483795392424834, + "grad_norm": 3.5879690647125244, + "learning_rate": 2.28e-06, + "loss": 0.8593, + "step": 114 + }, + { + "epoch": 0.015228426395939087, + "grad_norm": 3.5023837089538574, + "learning_rate": 2.3400000000000005e-06, + "loss": 0.8434, + "step": 117 + }, + { + "epoch": 0.015618898867629832, + "grad_norm": 3.2916157245635986, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.8904, + "step": 120 + }, + { + "epoch": 0.016009371339320577, + "grad_norm": 4.094881534576416, + "learning_rate": 2.46e-06, + "loss": 0.8106, + "step": 123 + }, + { + "epoch": 0.016399843811011325, + "grad_norm": 7.149028301239014, + "learning_rate": 2.52e-06, + "loss": 0.8551, + "step": 126 + }, + { + "epoch": 0.01679031628270207, + "grad_norm": 3.3029215335845947, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.8387, + "step": 129 + }, + { + "epoch": 0.017180788754392814, + "grad_norm": 3.166794538497925, + "learning_rate": 2.64e-06, + "loss": 0.819, + "step": 132 + }, + { + "epoch": 0.017571261226083563, + "grad_norm": 4.197257995605469, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.7449, + "step": 135 + }, + { + "epoch": 0.017961733697774308, + "grad_norm": 3.171112060546875, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.7152, + "step": 138 + }, + { + "epoch": 0.018352206169465052, + "grad_norm": 3.2386510372161865, + "learning_rate": 
2.82e-06, + "loss": 0.7647, + "step": 141 + }, + { + "epoch": 0.018742678641155797, + "grad_norm": 3.2060446739196777, + "learning_rate": 2.88e-06, + "loss": 0.737, + "step": 144 + }, + { + "epoch": 0.019133151112846546, + "grad_norm": 3.846658706665039, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.7577, + "step": 147 + }, + { + "epoch": 0.01952362358453729, + "grad_norm": 3.3511500358581543, + "learning_rate": 3e-06, + "loss": 0.7303, + "step": 150 + }, + { + "epoch": 0.019914096056228035, + "grad_norm": 3.5826847553253174, + "learning_rate": 3.0600000000000003e-06, + "loss": 0.8347, + "step": 153 + }, + { + "epoch": 0.02030456852791878, + "grad_norm": 3.3204636573791504, + "learning_rate": 3.12e-06, + "loss": 0.7353, + "step": 156 + }, + { + "epoch": 0.02069504099960953, + "grad_norm": 3.548901319503784, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.8361, + "step": 159 + }, + { + "epoch": 0.021085513471300273, + "grad_norm": 3.251668691635132, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.7772, + "step": 162 + }, + { + "epoch": 0.021475985942991018, + "grad_norm": 3.0531423091888428, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.724, + "step": 165 + }, + { + "epoch": 0.021866458414681766, + "grad_norm": 3.8247272968292236, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.6697, + "step": 168 + }, + { + "epoch": 0.02225693088637251, + "grad_norm": 6.110504627227783, + "learning_rate": 3.4200000000000007e-06, + "loss": 0.8024, + "step": 171 + }, + { + "epoch": 0.022647403358063256, + "grad_norm": 3.497457504272461, + "learning_rate": 3.48e-06, + "loss": 0.7605, + "step": 174 + }, + { + "epoch": 0.023037875829754, + "grad_norm": 3.874572515487671, + "learning_rate": 3.54e-06, + "loss": 0.8033, + "step": 177 + }, + { + "epoch": 0.02342834830144475, + "grad_norm": 3.286510467529297, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.7136, + "step": 180 + }, + { + "epoch": 0.023818820773135494, + "grad_norm": 3.194321870803833, + "learning_rate": 3.66e-06, + "loss": 0.804, + "step": 183 + }, + { + "epoch": 0.02420929324482624, + "grad_norm": 3.3385202884674072, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.7735, + "step": 186 + }, + { + "epoch": 0.024599765716516987, + "grad_norm": 3.5652225017547607, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.8019, + "step": 189 + }, + { + "epoch": 0.024990238188207732, + "grad_norm": 3.887373447418213, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.732, + "step": 192 + }, + { + "epoch": 0.025380710659898477, + "grad_norm": 3.134536027908325, + "learning_rate": 3.900000000000001e-06, + "loss": 0.7466, + "step": 195 + }, + { + "epoch": 0.025771183131589222, + "grad_norm": 3.26898455619812, + "learning_rate": 3.96e-06, + "loss": 0.7262, + "step": 198 + }, + { + "epoch": 0.02616165560327997, + "grad_norm": 3.5477030277252197, + "learning_rate": 4.0200000000000005e-06, + "loss": 0.8538, + "step": 201 + }, + { + "epoch": 0.026552128074970715, + "grad_norm": 3.780203104019165, + "learning_rate": 4.08e-06, + "loss": 0.6764, + "step": 204 + }, + { + "epoch": 0.02694260054666146, + "grad_norm": 3.250094413757324, + "learning_rate": 4.14e-06, + "loss": 0.8611, + "step": 207 + }, + { + "epoch": 0.027333073018352205, + "grad_norm": 5.036427021026611, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.8276, + "step": 210 + }, + { + "epoch": 0.027723545490042953, + "grad_norm": 2.8822755813598633, + "learning_rate": 4.26e-06, + "loss": 0.8168, + "step": 213 + }, + { + "epoch": 
0.028114017961733698, + "grad_norm": 3.374403953552246, + "learning_rate": 4.32e-06, + "loss": 0.8104, + "step": 216 + }, + { + "epoch": 0.028504490433424443, + "grad_norm": 4.382564544677734, + "learning_rate": 4.38e-06, + "loss": 0.769, + "step": 219 + }, + { + "epoch": 0.02889496290511519, + "grad_norm": 4.98586893081665, + "learning_rate": 4.440000000000001e-06, + "loss": 0.8022, + "step": 222 + }, + { + "epoch": 0.029285435376805936, + "grad_norm": 3.1688528060913086, + "learning_rate": 4.5e-06, + "loss": 0.7269, + "step": 225 + }, + { + "epoch": 0.02967590784849668, + "grad_norm": 3.384666681289673, + "learning_rate": 4.56e-06, + "loss": 0.7402, + "step": 228 + }, + { + "epoch": 0.030066380320187425, + "grad_norm": 3.3539681434631348, + "learning_rate": 4.620000000000001e-06, + "loss": 0.8786, + "step": 231 + }, + { + "epoch": 0.030456852791878174, + "grad_norm": 3.280231237411499, + "learning_rate": 4.680000000000001e-06, + "loss": 0.6507, + "step": 234 + }, + { + "epoch": 0.03084732526356892, + "grad_norm": 2.9844179153442383, + "learning_rate": 4.74e-06, + "loss": 0.7314, + "step": 237 + }, + { + "epoch": 0.031237797735259663, + "grad_norm": 5.182901382446289, + "learning_rate": 4.800000000000001e-06, + "loss": 0.7851, + "step": 240 + }, + { + "epoch": 0.03162827020695041, + "grad_norm": 4.9192023277282715, + "learning_rate": 4.86e-06, + "loss": 0.7062, + "step": 243 + }, + { + "epoch": 0.03201874267864115, + "grad_norm": 3.2586448192596436, + "learning_rate": 4.92e-06, + "loss": 0.778, + "step": 246 + }, + { + "epoch": 0.032409215150331905, + "grad_norm": 3.3434247970581055, + "learning_rate": 4.980000000000001e-06, + "loss": 0.7214, + "step": 249 + }, + { + "epoch": 0.03279968762202265, + "grad_norm": 3.2206249237060547, + "learning_rate": 5.04e-06, + "loss": 0.75, + "step": 252 + }, + { + "epoch": 0.033190160093713394, + "grad_norm": 4.057199954986572, + "learning_rate": 5.1e-06, + "loss": 0.8155, + "step": 255 + }, + { + "epoch": 0.03358063256540414, + "grad_norm": 3.0813217163085938, + "learning_rate": 5.1600000000000006e-06, + "loss": 0.7603, + "step": 258 + }, + { + "epoch": 0.033971105037094884, + "grad_norm": 3.3523905277252197, + "learning_rate": 5.220000000000001e-06, + "loss": 0.738, + "step": 261 + }, + { + "epoch": 0.03436157750878563, + "grad_norm": 5.289391994476318, + "learning_rate": 5.28e-06, + "loss": 0.6952, + "step": 264 + }, + { + "epoch": 0.034752049980476374, + "grad_norm": 3.419605016708374, + "learning_rate": 5.3400000000000005e-06, + "loss": 0.7415, + "step": 267 + }, + { + "epoch": 0.035142522452167126, + "grad_norm": 2.9998910427093506, + "learning_rate": 5.400000000000001e-06, + "loss": 0.813, + "step": 270 + }, + { + "epoch": 0.03553299492385787, + "grad_norm": 3.2090444564819336, + "learning_rate": 5.460000000000001e-06, + "loss": 0.7232, + "step": 273 + }, + { + "epoch": 0.035923467395548615, + "grad_norm": 4.806464672088623, + "learning_rate": 5.5200000000000005e-06, + "loss": 0.865, + "step": 276 + }, + { + "epoch": 0.03631393986723936, + "grad_norm": 3.2608954906463623, + "learning_rate": 5.580000000000001e-06, + "loss": 0.7011, + "step": 279 + }, + { + "epoch": 0.036704412338930105, + "grad_norm": 3.387782096862793, + "learning_rate": 5.64e-06, + "loss": 0.7094, + "step": 282 + }, + { + "epoch": 0.03709488481062085, + "grad_norm": 3.7122697830200195, + "learning_rate": 5.7e-06, + "loss": 0.7027, + "step": 285 + }, + { + "epoch": 0.037485357282311595, + "grad_norm": 2.901911497116089, + "learning_rate": 5.76e-06, + "loss": 0.6686, + "step": 288 
+ }, + { + "epoch": 0.03787582975400234, + "grad_norm": 4.1798996925354, + "learning_rate": 5.82e-06, + "loss": 0.7313, + "step": 291 + }, + { + "epoch": 0.03826630222569309, + "grad_norm": 3.4261515140533447, + "learning_rate": 5.8800000000000005e-06, + "loss": 0.7415, + "step": 294 + }, + { + "epoch": 0.038656774697383836, + "grad_norm": 3.0005760192871094, + "learning_rate": 5.94e-06, + "loss": 0.7753, + "step": 297 + }, + { + "epoch": 0.03904724716907458, + "grad_norm": 3.1258742809295654, + "learning_rate": 6e-06, + "loss": 0.6959, + "step": 300 + }, + { + "epoch": 0.039437719640765326, + "grad_norm": 4.299694538116455, + "learning_rate": 6.0600000000000004e-06, + "loss": 0.6857, + "step": 303 + }, + { + "epoch": 0.03982819211245607, + "grad_norm": 3.4530158042907715, + "learning_rate": 6.120000000000001e-06, + "loss": 0.7072, + "step": 306 + }, + { + "epoch": 0.040218664584146815, + "grad_norm": 3.939462900161743, + "learning_rate": 6.18e-06, + "loss": 0.7783, + "step": 309 + }, + { + "epoch": 0.04060913705583756, + "grad_norm": 3.1602437496185303, + "learning_rate": 6.24e-06, + "loss": 0.7953, + "step": 312 + }, + { + "epoch": 0.04099960952752831, + "grad_norm": 2.8542044162750244, + "learning_rate": 6.300000000000001e-06, + "loss": 0.779, + "step": 315 + }, + { + "epoch": 0.04139008199921906, + "grad_norm": 3.5136380195617676, + "learning_rate": 6.360000000000001e-06, + "loss": 0.755, + "step": 318 + }, + { + "epoch": 0.0417805544709098, + "grad_norm": 3.060673713684082, + "learning_rate": 6.42e-06, + "loss": 0.7327, + "step": 321 + }, + { + "epoch": 0.04217102694260055, + "grad_norm": 3.174912691116333, + "learning_rate": 6.480000000000001e-06, + "loss": 0.7553, + "step": 324 + }, + { + "epoch": 0.04256149941429129, + "grad_norm": 3.189807653427124, + "learning_rate": 6.540000000000001e-06, + "loss": 0.815, + "step": 327 + }, + { + "epoch": 0.042951971885982036, + "grad_norm": 2.7848079204559326, + "learning_rate": 6.600000000000001e-06, + "loss": 0.6674, + "step": 330 + }, + { + "epoch": 0.04334244435767278, + "grad_norm": 3.377772331237793, + "learning_rate": 6.660000000000001e-06, + "loss": 0.6562, + "step": 333 + }, + { + "epoch": 0.04373291682936353, + "grad_norm": 3.255612373352051, + "learning_rate": 6.720000000000001e-06, + "loss": 0.71, + "step": 336 + }, + { + "epoch": 0.04412338930105428, + "grad_norm": 3.997131824493408, + "learning_rate": 6.780000000000001e-06, + "loss": 0.7364, + "step": 339 + }, + { + "epoch": 0.04451386177274502, + "grad_norm": 3.373277425765991, + "learning_rate": 6.8400000000000014e-06, + "loss": 0.7041, + "step": 342 + }, + { + "epoch": 0.04490433424443577, + "grad_norm": 3.901888608932495, + "learning_rate": 6.9e-06, + "loss": 0.6684, + "step": 345 + }, + { + "epoch": 0.04529480671612651, + "grad_norm": 3.7340526580810547, + "learning_rate": 6.96e-06, + "loss": 0.7079, + "step": 348 + }, + { + "epoch": 0.04568527918781726, + "grad_norm": 3.2724201679229736, + "learning_rate": 7.0200000000000006e-06, + "loss": 0.7308, + "step": 351 + }, + { + "epoch": 0.046075751659508, + "grad_norm": 2.8971149921417236, + "learning_rate": 7.08e-06, + "loss": 0.7238, + "step": 354 + }, + { + "epoch": 0.046466224131198754, + "grad_norm": 2.782917022705078, + "learning_rate": 7.14e-06, + "loss": 0.637, + "step": 357 + }, + { + "epoch": 0.0468566966028895, + "grad_norm": 4.860075950622559, + "learning_rate": 7.2000000000000005e-06, + "loss": 0.6668, + "step": 360 + }, + { + "epoch": 0.04724716907458024, + "grad_norm": 3.7025034427642822, + "learning_rate": 
7.260000000000001e-06, + "loss": 0.7802, + "step": 363 + }, + { + "epoch": 0.04763764154627099, + "grad_norm": 3.697002410888672, + "learning_rate": 7.32e-06, + "loss": 0.7448, + "step": 366 + }, + { + "epoch": 0.04802811401796173, + "grad_norm": 4.358737468719482, + "learning_rate": 7.3800000000000005e-06, + "loss": 0.7448, + "step": 369 + }, + { + "epoch": 0.04841858648965248, + "grad_norm": 3.1445984840393066, + "learning_rate": 7.440000000000001e-06, + "loss": 0.7037, + "step": 372 + }, + { + "epoch": 0.04880905896134322, + "grad_norm": 5.41956090927124, + "learning_rate": 7.500000000000001e-06, + "loss": 0.7322, + "step": 375 + }, + { + "epoch": 0.049199531433033974, + "grad_norm": 3.0740654468536377, + "learning_rate": 7.5600000000000005e-06, + "loss": 0.773, + "step": 378 + }, + { + "epoch": 0.04959000390472472, + "grad_norm": 3.2865071296691895, + "learning_rate": 7.620000000000001e-06, + "loss": 0.7685, + "step": 381 + }, + { + "epoch": 0.049980476376415464, + "grad_norm": 3.2605721950531006, + "learning_rate": 7.680000000000001e-06, + "loss": 0.7097, + "step": 384 + }, + { + "epoch": 0.05037094884810621, + "grad_norm": 2.969805955886841, + "learning_rate": 7.74e-06, + "loss": 0.5873, + "step": 387 + }, + { + "epoch": 0.050761421319796954, + "grad_norm": 3.413188934326172, + "learning_rate": 7.800000000000002e-06, + "loss": 0.7874, + "step": 390 + }, + { + "epoch": 0.0511518937914877, + "grad_norm": 4.274326324462891, + "learning_rate": 7.860000000000001e-06, + "loss": 0.7592, + "step": 393 + }, + { + "epoch": 0.051542366263178443, + "grad_norm": 2.9528450965881348, + "learning_rate": 7.92e-06, + "loss": 0.6821, + "step": 396 + }, + { + "epoch": 0.05193283873486919, + "grad_norm": 2.8550336360931396, + "learning_rate": 7.980000000000002e-06, + "loss": 0.6741, + "step": 399 + }, + { + "epoch": 0.05232331120655994, + "grad_norm": 2.7226080894470215, + "learning_rate": 8.040000000000001e-06, + "loss": 0.5926, + "step": 402 + }, + { + "epoch": 0.052713783678250685, + "grad_norm": 2.704489231109619, + "learning_rate": 8.1e-06, + "loss": 0.7503, + "step": 405 + }, + { + "epoch": 0.05310425614994143, + "grad_norm": 3.636996269226074, + "learning_rate": 8.16e-06, + "loss": 0.7294, + "step": 408 + }, + { + "epoch": 0.053494728621632175, + "grad_norm": 3.1197781562805176, + "learning_rate": 8.220000000000001e-06, + "loss": 0.7368, + "step": 411 + }, + { + "epoch": 0.05388520109332292, + "grad_norm": 3.0387392044067383, + "learning_rate": 8.28e-06, + "loss": 0.752, + "step": 414 + }, + { + "epoch": 0.054275673565013664, + "grad_norm": 3.60607647895813, + "learning_rate": 8.34e-06, + "loss": 0.7767, + "step": 417 + }, + { + "epoch": 0.05466614603670441, + "grad_norm": 3.62844181060791, + "learning_rate": 8.400000000000001e-06, + "loss": 0.6498, + "step": 420 + }, + { + "epoch": 0.05505661850839516, + "grad_norm": 3.531270980834961, + "learning_rate": 8.46e-06, + "loss": 0.7256, + "step": 423 + }, + { + "epoch": 0.055447090980085906, + "grad_norm": 3.4062726497650146, + "learning_rate": 8.52e-06, + "loss": 0.7756, + "step": 426 + }, + { + "epoch": 0.05583756345177665, + "grad_norm": 3.633138656616211, + "learning_rate": 8.580000000000001e-06, + "loss": 0.7159, + "step": 429 + }, + { + "epoch": 0.056228035923467395, + "grad_norm": 4.716714859008789, + "learning_rate": 8.64e-06, + "loss": 0.7503, + "step": 432 + }, + { + "epoch": 0.05661850839515814, + "grad_norm": 3.8363606929779053, + "learning_rate": 8.700000000000001e-06, + "loss": 0.708, + "step": 435 + }, + { + "epoch": 
0.057008980866848885, + "grad_norm": 3.476318597793579, + "learning_rate": 8.76e-06, + "loss": 0.6928, + "step": 438 + }, + { + "epoch": 0.05739945333853963, + "grad_norm": 4.243283748626709, + "learning_rate": 8.82e-06, + "loss": 0.5729, + "step": 441 + }, + { + "epoch": 0.05778992581023038, + "grad_norm": 18.63896942138672, + "learning_rate": 8.880000000000001e-06, + "loss": 0.893, + "step": 444 + }, + { + "epoch": 0.05818039828192113, + "grad_norm": 3.8758761882781982, + "learning_rate": 8.94e-06, + "loss": 0.655, + "step": 447 + }, + { + "epoch": 0.05857087075361187, + "grad_norm": 2.645306348800659, + "learning_rate": 9e-06, + "loss": 0.6287, + "step": 450 + }, + { + "epoch": 0.058961343225302616, + "grad_norm": 2.911987066268921, + "learning_rate": 9.060000000000001e-06, + "loss": 0.7451, + "step": 453 + }, + { + "epoch": 0.05935181569699336, + "grad_norm": 3.4977638721466064, + "learning_rate": 9.12e-06, + "loss": 0.6821, + "step": 456 + }, + { + "epoch": 0.059742288168684106, + "grad_norm": 3.464571714401245, + "learning_rate": 9.180000000000002e-06, + "loss": 0.6449, + "step": 459 + }, + { + "epoch": 0.06013276064037485, + "grad_norm": 3.0574846267700195, + "learning_rate": 9.240000000000001e-06, + "loss": 0.7833, + "step": 462 + }, + { + "epoch": 0.0605232331120656, + "grad_norm": 3.181323766708374, + "learning_rate": 9.3e-06, + "loss": 0.759, + "step": 465 + }, + { + "epoch": 0.06091370558375635, + "grad_norm": 2.9368088245391846, + "learning_rate": 9.360000000000002e-06, + "loss": 0.681, + "step": 468 + }, + { + "epoch": 0.06130417805544709, + "grad_norm": 2.9919021129608154, + "learning_rate": 9.42e-06, + "loss": 0.6649, + "step": 471 + }, + { + "epoch": 0.06169465052713784, + "grad_norm": 3.4639129638671875, + "learning_rate": 9.48e-06, + "loss": 0.6816, + "step": 474 + }, + { + "epoch": 0.06208512299882858, + "grad_norm": 4.285403251647949, + "learning_rate": 9.54e-06, + "loss": 0.8209, + "step": 477 + }, + { + "epoch": 0.06247559547051933, + "grad_norm": 2.8827459812164307, + "learning_rate": 9.600000000000001e-06, + "loss": 0.7042, + "step": 480 + }, + { + "epoch": 0.06286606794221007, + "grad_norm": 3.176536798477173, + "learning_rate": 9.66e-06, + "loss": 0.7312, + "step": 483 + }, + { + "epoch": 0.06325654041390082, + "grad_norm": 4.091725826263428, + "learning_rate": 9.72e-06, + "loss": 0.6797, + "step": 486 + }, + { + "epoch": 0.06364701288559156, + "grad_norm": 3.53378963470459, + "learning_rate": 9.780000000000001e-06, + "loss": 0.6557, + "step": 489 + }, + { + "epoch": 0.0640374853572823, + "grad_norm": 3.440068006515503, + "learning_rate": 9.84e-06, + "loss": 0.6972, + "step": 492 + }, + { + "epoch": 0.06442795782897305, + "grad_norm": 3.1737592220306396, + "learning_rate": 9.9e-06, + "loss": 0.6832, + "step": 495 + }, + { + "epoch": 0.06481843030066381, + "grad_norm": 2.9271981716156006, + "learning_rate": 9.960000000000001e-06, + "loss": 0.7136, + "step": 498 + }, + { + "epoch": 0.06520890277235455, + "grad_norm": 4.281307697296143, + "learning_rate": 9.999999951472807e-06, + "loss": 0.7126, + "step": 501 + }, + { + "epoch": 0.0655993752440453, + "grad_norm": 4.331837177276611, + "learning_rate": 9.999999223564915e-06, + "loss": 0.6736, + "step": 504 + }, + { + "epoch": 0.06598984771573604, + "grad_norm": 3.0359110832214355, + "learning_rate": 9.999997622167676e-06, + "loss": 0.7754, + "step": 507 + }, + { + "epoch": 0.06638032018742679, + "grad_norm": 2.992680072784424, + "learning_rate": 9.999995147281374e-06, + "loss": 0.8101, + "step": 510 + }, + { + 
"epoch": 0.06677079265911753, + "grad_norm": 2.822829484939575, + "learning_rate": 9.999991798906435e-06, + "loss": 0.6705, + "step": 513 + }, + { + "epoch": 0.06716126513080828, + "grad_norm": 3.342939615249634, + "learning_rate": 9.999987577043449e-06, + "loss": 0.7045, + "step": 516 + }, + { + "epoch": 0.06755173760249902, + "grad_norm": 4.064047813415527, + "learning_rate": 9.999982481693151e-06, + "loss": 0.7007, + "step": 519 + }, + { + "epoch": 0.06794221007418977, + "grad_norm": 2.783782720565796, + "learning_rate": 9.999976512856434e-06, + "loss": 0.5655, + "step": 522 + }, + { + "epoch": 0.06833268254588051, + "grad_norm": 2.931049346923828, + "learning_rate": 9.999969670534335e-06, + "loss": 0.677, + "step": 525 + }, + { + "epoch": 0.06872315501757126, + "grad_norm": 3.2958552837371826, + "learning_rate": 9.999961954728054e-06, + "loss": 0.7741, + "step": 528 + }, + { + "epoch": 0.069113627489262, + "grad_norm": 2.8830935955047607, + "learning_rate": 9.999953365438939e-06, + "loss": 0.6607, + "step": 531 + }, + { + "epoch": 0.06950409996095275, + "grad_norm": 3.2814905643463135, + "learning_rate": 9.99994390266849e-06, + "loss": 0.7587, + "step": 534 + }, + { + "epoch": 0.06989457243264349, + "grad_norm": 4.126666069030762, + "learning_rate": 9.999933566418358e-06, + "loss": 0.7806, + "step": 537 + }, + { + "epoch": 0.07028504490433425, + "grad_norm": 3.1992428302764893, + "learning_rate": 9.99992235669035e-06, + "loss": 0.7045, + "step": 540 + }, + { + "epoch": 0.070675517376025, + "grad_norm": 3.3059449195861816, + "learning_rate": 9.999910273486427e-06, + "loss": 0.8061, + "step": 543 + }, + { + "epoch": 0.07106598984771574, + "grad_norm": 5.540506839752197, + "learning_rate": 9.999897316808695e-06, + "loss": 0.7123, + "step": 546 + }, + { + "epoch": 0.07145646231940649, + "grad_norm": 2.840941905975342, + "learning_rate": 9.999883486659421e-06, + "loss": 0.6211, + "step": 549 + }, + { + "epoch": 0.07184693479109723, + "grad_norm": 2.806445837020874, + "learning_rate": 9.99986878304102e-06, + "loss": 0.7114, + "step": 552 + }, + { + "epoch": 0.07223740726278798, + "grad_norm": 4.31725549697876, + "learning_rate": 9.99985320595606e-06, + "loss": 0.6363, + "step": 555 + }, + { + "epoch": 0.07262787973447872, + "grad_norm": 3.1201353073120117, + "learning_rate": 9.999836755407264e-06, + "loss": 0.7572, + "step": 558 + }, + { + "epoch": 0.07301835220616947, + "grad_norm": 4.234785079956055, + "learning_rate": 9.999819431397506e-06, + "loss": 0.6565, + "step": 561 + }, + { + "epoch": 0.07340882467786021, + "grad_norm": 3.2616043090820312, + "learning_rate": 9.999801233929808e-06, + "loss": 0.7319, + "step": 564 + }, + { + "epoch": 0.07379929714955095, + "grad_norm": 2.796783924102783, + "learning_rate": 9.999782163007357e-06, + "loss": 0.7293, + "step": 567 + }, + { + "epoch": 0.0741897696212417, + "grad_norm": 3.4508984088897705, + "learning_rate": 9.999762218633478e-06, + "loss": 0.8077, + "step": 570 + }, + { + "epoch": 0.07458024209293244, + "grad_norm": 2.6913444995880127, + "learning_rate": 9.999741400811656e-06, + "loss": 0.6777, + "step": 573 + }, + { + "epoch": 0.07497071456462319, + "grad_norm": 3.599764108657837, + "learning_rate": 9.99971970954553e-06, + "loss": 0.7533, + "step": 576 + }, + { + "epoch": 0.07536118703631393, + "grad_norm": 3.3088502883911133, + "learning_rate": 9.999697144838889e-06, + "loss": 0.605, + "step": 579 + }, + { + "epoch": 0.07575165950800468, + "grad_norm": 3.051250457763672, + "learning_rate": 9.999673706695676e-06, + "loss": 0.736, + 
"step": 582 + }, + { + "epoch": 0.07614213197969544, + "grad_norm": 2.775729179382324, + "learning_rate": 9.999649395119983e-06, + "loss": 0.7246, + "step": 585 + }, + { + "epoch": 0.07653260445138618, + "grad_norm": 3.2636523246765137, + "learning_rate": 9.999624210116057e-06, + "loss": 0.8448, + "step": 588 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 2.841562032699585, + "learning_rate": 9.999598151688301e-06, + "loss": 0.6813, + "step": 591 + }, + { + "epoch": 0.07731354939476767, + "grad_norm": 3.2505574226379395, + "learning_rate": 9.999571219841264e-06, + "loss": 0.6421, + "step": 594 + }, + { + "epoch": 0.07770402186645842, + "grad_norm": 4.958602428436279, + "learning_rate": 9.999543414579655e-06, + "loss": 0.7551, + "step": 597 + }, + { + "epoch": 0.07809449433814916, + "grad_norm": 3.9999706745147705, + "learning_rate": 9.999514735908326e-06, + "loss": 0.6695, + "step": 600 + }, + { + "epoch": 0.0784849668098399, + "grad_norm": 4.219988822937012, + "learning_rate": 9.999485183832291e-06, + "loss": 0.7231, + "step": 603 + }, + { + "epoch": 0.07887543928153065, + "grad_norm": 2.954352855682373, + "learning_rate": 9.999454758356713e-06, + "loss": 0.6714, + "step": 606 + }, + { + "epoch": 0.0792659117532214, + "grad_norm": 4.073672294616699, + "learning_rate": 9.999423459486906e-06, + "loss": 0.7234, + "step": 609 + }, + { + "epoch": 0.07965638422491214, + "grad_norm": 3.4436042308807373, + "learning_rate": 9.999391287228337e-06, + "loss": 0.8006, + "step": 612 + }, + { + "epoch": 0.08004685669660289, + "grad_norm": 3.66202974319458, + "learning_rate": 9.999358241586627e-06, + "loss": 0.6682, + "step": 615 + }, + { + "epoch": 0.08043732916829363, + "grad_norm": 4.19435977935791, + "learning_rate": 9.99932432256755e-06, + "loss": 0.7799, + "step": 618 + }, + { + "epoch": 0.08082780163998438, + "grad_norm": 3.3048112392425537, + "learning_rate": 9.99928953017703e-06, + "loss": 0.7087, + "step": 621 + }, + { + "epoch": 0.08121827411167512, + "grad_norm": 2.822140693664551, + "learning_rate": 9.999253864421147e-06, + "loss": 0.6894, + "step": 624 + }, + { + "epoch": 0.08160874658336588, + "grad_norm": 2.9324841499328613, + "learning_rate": 9.99921732530613e-06, + "loss": 0.6938, + "step": 627 + }, + { + "epoch": 0.08199921905505662, + "grad_norm": 3.84236478805542, + "learning_rate": 9.999179912838364e-06, + "loss": 0.783, + "step": 630 + }, + { + "epoch": 0.08238969152674737, + "grad_norm": 3.0340070724487305, + "learning_rate": 9.999141627024384e-06, + "loss": 0.7381, + "step": 633 + }, + { + "epoch": 0.08278016399843811, + "grad_norm": 4.351922512054443, + "learning_rate": 9.999102467870877e-06, + "loss": 0.7803, + "step": 636 + }, + { + "epoch": 0.08317063647012886, + "grad_norm": 2.6362740993499756, + "learning_rate": 9.999062435384686e-06, + "loss": 0.598, + "step": 639 + }, + { + "epoch": 0.0835611089418196, + "grad_norm": 3.186671495437622, + "learning_rate": 9.999021529572806e-06, + "loss": 0.7747, + "step": 642 + }, + { + "epoch": 0.08395158141351035, + "grad_norm": 3.305826187133789, + "learning_rate": 9.99897975044238e-06, + "loss": 0.7638, + "step": 645 + }, + { + "epoch": 0.0843420538852011, + "grad_norm": 3.8169655799865723, + "learning_rate": 9.998937098000705e-06, + "loss": 0.7015, + "step": 648 + }, + { + "epoch": 0.08473252635689184, + "grad_norm": 3.1275665760040283, + "learning_rate": 9.99889357225524e-06, + "loss": 0.6977, + "step": 651 + }, + { + "epoch": 0.08512299882858258, + "grad_norm": 3.6872782707214355, + "learning_rate": 9.998849173213581e-06, + 
"loss": 0.7307, + "step": 654 + }, + { + "epoch": 0.08551347130027333, + "grad_norm": 3.6949827671051025, + "learning_rate": 9.998803900883487e-06, + "loss": 0.6681, + "step": 657 + }, + { + "epoch": 0.08590394377196407, + "grad_norm": 3.2044758796691895, + "learning_rate": 9.99875775527287e-06, + "loss": 0.6699, + "step": 660 + }, + { + "epoch": 0.08629441624365482, + "grad_norm": 3.4927258491516113, + "learning_rate": 9.998710736389787e-06, + "loss": 0.6393, + "step": 663 + }, + { + "epoch": 0.08668488871534556, + "grad_norm": 3.450648784637451, + "learning_rate": 9.998662844242456e-06, + "loss": 0.6751, + "step": 666 + }, + { + "epoch": 0.08707536118703631, + "grad_norm": 3.036738157272339, + "learning_rate": 9.99861407883924e-06, + "loss": 0.7606, + "step": 669 + }, + { + "epoch": 0.08746583365872707, + "grad_norm": 3.3343772888183594, + "learning_rate": 9.998564440188661e-06, + "loss": 0.7669, + "step": 672 + }, + { + "epoch": 0.08785630613041781, + "grad_norm": 3.513052225112915, + "learning_rate": 9.998513928299389e-06, + "loss": 0.6073, + "step": 675 + }, + { + "epoch": 0.08824677860210856, + "grad_norm": 2.9392402172088623, + "learning_rate": 9.998462543180249e-06, + "loss": 0.6474, + "step": 678 + }, + { + "epoch": 0.0886372510737993, + "grad_norm": 3.289640188217163, + "learning_rate": 9.998410284840217e-06, + "loss": 0.6522, + "step": 681 + }, + { + "epoch": 0.08902772354549005, + "grad_norm": 5.727564811706543, + "learning_rate": 9.998357153288425e-06, + "loss": 0.6354, + "step": 684 + }, + { + "epoch": 0.08941819601718079, + "grad_norm": 2.87939715385437, + "learning_rate": 9.998303148534153e-06, + "loss": 0.6943, + "step": 687 + }, + { + "epoch": 0.08980866848887153, + "grad_norm": 5.636757850646973, + "learning_rate": 9.998248270586837e-06, + "loss": 0.651, + "step": 690 + }, + { + "epoch": 0.09019914096056228, + "grad_norm": 4.383195400238037, + "learning_rate": 9.99819251945606e-06, + "loss": 0.7694, + "step": 693 + }, + { + "epoch": 0.09058961343225302, + "grad_norm": 3.0773353576660156, + "learning_rate": 9.998135895151567e-06, + "loss": 0.6747, + "step": 696 + }, + { + "epoch": 0.09098008590394377, + "grad_norm": 3.2680556774139404, + "learning_rate": 9.998078397683246e-06, + "loss": 0.7254, + "step": 699 + }, + { + "epoch": 0.09137055837563451, + "grad_norm": 7.307571887969971, + "learning_rate": 9.998020027061145e-06, + "loss": 0.671, + "step": 702 + }, + { + "epoch": 0.09176103084732526, + "grad_norm": 2.9415576457977295, + "learning_rate": 9.99796078329546e-06, + "loss": 0.7709, + "step": 705 + }, + { + "epoch": 0.092151503319016, + "grad_norm": 2.814495086669922, + "learning_rate": 9.997900666396539e-06, + "loss": 0.6947, + "step": 708 + }, + { + "epoch": 0.09254197579070675, + "grad_norm": 5.5829081535339355, + "learning_rate": 9.997839676374885e-06, + "loss": 0.73, + "step": 711 + }, + { + "epoch": 0.09293244826239751, + "grad_norm": 5.697535037994385, + "learning_rate": 9.997777813241154e-06, + "loss": 0.6227, + "step": 714 + }, + { + "epoch": 0.09332292073408825, + "grad_norm": 3.0062732696533203, + "learning_rate": 9.997715077006152e-06, + "loss": 0.6035, + "step": 717 + }, + { + "epoch": 0.093713393205779, + "grad_norm": 2.8236165046691895, + "learning_rate": 9.997651467680843e-06, + "loss": 0.7009, + "step": 720 + }, + { + "epoch": 0.09410386567746974, + "grad_norm": 2.7727997303009033, + "learning_rate": 9.997586985276333e-06, + "loss": 0.6508, + "step": 723 + }, + { + "epoch": 0.09449433814916049, + "grad_norm": 3.65282940864563, + "learning_rate": 
9.99752162980389e-06, + "loss": 0.7025, + "step": 726 + }, + { + "epoch": 0.09488481062085123, + "grad_norm": 3.1343119144439697, + "learning_rate": 9.997455401274932e-06, + "loss": 0.7112, + "step": 729 + }, + { + "epoch": 0.09527528309254198, + "grad_norm": 3.504101037979126, + "learning_rate": 9.99738829970103e-06, + "loss": 0.7705, + "step": 732 + }, + { + "epoch": 0.09566575556423272, + "grad_norm": 2.7946465015411377, + "learning_rate": 9.997320325093903e-06, + "loss": 0.5736, + "step": 735 + }, + { + "epoch": 0.09605622803592347, + "grad_norm": 3.221226215362549, + "learning_rate": 9.99725147746543e-06, + "loss": 0.647, + "step": 738 + }, + { + "epoch": 0.09644670050761421, + "grad_norm": 5.345529079437256, + "learning_rate": 9.997181756827634e-06, + "loss": 0.5896, + "step": 741 + }, + { + "epoch": 0.09683717297930496, + "grad_norm": 3.8715436458587646, + "learning_rate": 9.9971111631927e-06, + "loss": 0.7666, + "step": 744 + }, + { + "epoch": 0.0972276454509957, + "grad_norm": 2.9973673820495605, + "learning_rate": 9.997039696572956e-06, + "loss": 0.6106, + "step": 747 + }, + { + "epoch": 0.09761811792268645, + "grad_norm": 5.8889851570129395, + "learning_rate": 9.996967356980891e-06, + "loss": 0.6416, + "step": 750 + }, + { + "epoch": 0.09800859039437719, + "grad_norm": 3.953415632247925, + "learning_rate": 9.99689414442914e-06, + "loss": 0.744, + "step": 753 + }, + { + "epoch": 0.09839906286606795, + "grad_norm": 4.9584760665893555, + "learning_rate": 9.996820058930495e-06, + "loss": 0.6435, + "step": 756 + }, + { + "epoch": 0.0987895353377587, + "grad_norm": 2.774200916290283, + "learning_rate": 9.996745100497898e-06, + "loss": 0.6054, + "step": 759 + }, + { + "epoch": 0.09918000780944944, + "grad_norm": 4.4339118003845215, + "learning_rate": 9.996669269144442e-06, + "loss": 0.6135, + "step": 762 + }, + { + "epoch": 0.09957048028114018, + "grad_norm": 3.3275766372680664, + "learning_rate": 9.996592564883376e-06, + "loss": 0.7489, + "step": 765 + }, + { + "epoch": 0.09996095275283093, + "grad_norm": 3.1275625228881836, + "learning_rate": 9.996514987728101e-06, + "loss": 0.825, + "step": 768 + }, + { + "epoch": 0.10035142522452167, + "grad_norm": 3.67669415473938, + "learning_rate": 9.99643653769217e-06, + "loss": 0.8093, + "step": 771 + }, + { + "epoch": 0.10074189769621242, + "grad_norm": 3.200958013534546, + "learning_rate": 9.996357214789284e-06, + "loss": 0.7373, + "step": 774 + }, + { + "epoch": 0.10113237016790316, + "grad_norm": 2.847730875015259, + "learning_rate": 9.996277019033305e-06, + "loss": 0.7335, + "step": 777 + }, + { + "epoch": 0.10152284263959391, + "grad_norm": 4.33748722076416, + "learning_rate": 9.99619595043824e-06, + "loss": 0.6688, + "step": 780 + }, + { + "epoch": 0.10191331511128465, + "grad_norm": 3.9746432304382324, + "learning_rate": 9.996114009018254e-06, + "loss": 0.7776, + "step": 783 + }, + { + "epoch": 0.1023037875829754, + "grad_norm": 3.600278854370117, + "learning_rate": 9.996031194787661e-06, + "loss": 0.7403, + "step": 786 + }, + { + "epoch": 0.10269426005466614, + "grad_norm": 2.957000494003296, + "learning_rate": 9.995947507760928e-06, + "loss": 0.7485, + "step": 789 + }, + { + "epoch": 0.10308473252635689, + "grad_norm": 4.908221244812012, + "learning_rate": 9.995862947952676e-06, + "loss": 0.748, + "step": 792 + }, + { + "epoch": 0.10347520499804763, + "grad_norm": 3.816086769104004, + "learning_rate": 9.995777515377677e-06, + "loss": 0.6787, + "step": 795 + }, + { + "epoch": 0.10386567746973838, + "grad_norm": 2.948234796524048, + 
"learning_rate": 9.995691210050854e-06, + "loss": 0.7267, + "step": 798 + }, + { + "epoch": 0.10425614994142914, + "grad_norm": 2.818983793258667, + "learning_rate": 9.995604031987287e-06, + "loss": 0.6914, + "step": 801 + }, + { + "epoch": 0.10464662241311988, + "grad_norm": 3.0057449340820312, + "learning_rate": 9.995515981202206e-06, + "loss": 0.7294, + "step": 804 + }, + { + "epoch": 0.10503709488481063, + "grad_norm": 3.676535129547119, + "learning_rate": 9.99542705771099e-06, + "loss": 0.6236, + "step": 807 + }, + { + "epoch": 0.10542756735650137, + "grad_norm": 3.7774033546447754, + "learning_rate": 9.995337261529176e-06, + "loss": 0.6749, + "step": 810 + }, + { + "epoch": 0.10581803982819211, + "grad_norm": 3.2406165599823, + "learning_rate": 9.995246592672451e-06, + "loss": 0.729, + "step": 813 + }, + { + "epoch": 0.10620851229988286, + "grad_norm": 5.026638984680176, + "learning_rate": 9.995155051156657e-06, + "loss": 0.7358, + "step": 816 + }, + { + "epoch": 0.1065989847715736, + "grad_norm": 3.2842941284179688, + "learning_rate": 9.995062636997783e-06, + "loss": 0.6659, + "step": 819 + }, + { + "epoch": 0.10698945724326435, + "grad_norm": 2.9089906215667725, + "learning_rate": 9.994969350211974e-06, + "loss": 0.6672, + "step": 822 + }, + { + "epoch": 0.1073799297149551, + "grad_norm": 2.6798436641693115, + "learning_rate": 9.994875190815527e-06, + "loss": 0.6423, + "step": 825 + }, + { + "epoch": 0.10777040218664584, + "grad_norm": 2.918283462524414, + "learning_rate": 9.994780158824891e-06, + "loss": 0.7637, + "step": 828 + }, + { + "epoch": 0.10816087465833658, + "grad_norm": 3.470376491546631, + "learning_rate": 9.99468425425667e-06, + "loss": 0.7711, + "step": 831 + }, + { + "epoch": 0.10855134713002733, + "grad_norm": 3.4783475399017334, + "learning_rate": 9.994587477127617e-06, + "loss": 0.8156, + "step": 834 + }, + { + "epoch": 0.10894181960171807, + "grad_norm": 2.7268190383911133, + "learning_rate": 9.994489827454638e-06, + "loss": 0.6253, + "step": 837 + }, + { + "epoch": 0.10933229207340882, + "grad_norm": 3.936289072036743, + "learning_rate": 9.994391305254795e-06, + "loss": 0.6787, + "step": 840 + }, + { + "epoch": 0.10972276454509958, + "grad_norm": 3.084895610809326, + "learning_rate": 9.994291910545296e-06, + "loss": 0.6393, + "step": 843 + }, + { + "epoch": 0.11011323701679032, + "grad_norm": 2.9369547367095947, + "learning_rate": 9.994191643343508e-06, + "loss": 0.6388, + "step": 846 + }, + { + "epoch": 0.11050370948848107, + "grad_norm": 2.8913955688476562, + "learning_rate": 9.994090503666945e-06, + "loss": 0.6447, + "step": 849 + }, + { + "epoch": 0.11089418196017181, + "grad_norm": 2.990058183670044, + "learning_rate": 9.993988491533277e-06, + "loss": 0.6725, + "step": 852 + }, + { + "epoch": 0.11128465443186256, + "grad_norm": 3.0597927570343018, + "learning_rate": 9.993885606960325e-06, + "loss": 0.7052, + "step": 855 + }, + { + "epoch": 0.1116751269035533, + "grad_norm": 4.286864280700684, + "learning_rate": 9.993781849966064e-06, + "loss": 0.6557, + "step": 858 + }, + { + "epoch": 0.11206559937524405, + "grad_norm": 2.70944881439209, + "learning_rate": 9.993677220568619e-06, + "loss": 0.7614, + "step": 861 + }, + { + "epoch": 0.11245607184693479, + "grad_norm": 2.6077942848205566, + "learning_rate": 9.993571718786268e-06, + "loss": 0.713, + "step": 864 + }, + { + "epoch": 0.11284654431862554, + "grad_norm": 2.833029270172119, + "learning_rate": 9.993465344637443e-06, + "loss": 0.6649, + "step": 867 + }, + { + "epoch": 0.11323701679031628, + 
"grad_norm": 2.6848433017730713, + "learning_rate": 9.993358098140727e-06, + "loss": 0.7228, + "step": 870 + }, + { + "epoch": 0.11362748926200703, + "grad_norm": 3.404714822769165, + "learning_rate": 9.993249979314857e-06, + "loss": 0.6024, + "step": 873 + }, + { + "epoch": 0.11401796173369777, + "grad_norm": 4.099708557128906, + "learning_rate": 9.993140988178718e-06, + "loss": 0.664, + "step": 876 + }, + { + "epoch": 0.11440843420538852, + "grad_norm": 2.908158302307129, + "learning_rate": 9.993031124751353e-06, + "loss": 0.7663, + "step": 879 + }, + { + "epoch": 0.11479890667707926, + "grad_norm": 2.5075185298919678, + "learning_rate": 9.992920389051955e-06, + "loss": 0.7454, + "step": 882 + }, + { + "epoch": 0.11518937914877002, + "grad_norm": 3.0805585384368896, + "learning_rate": 9.992808781099868e-06, + "loss": 0.7226, + "step": 885 + }, + { + "epoch": 0.11557985162046076, + "grad_norm": 3.1145894527435303, + "learning_rate": 9.992696300914591e-06, + "loss": 0.732, + "step": 888 + }, + { + "epoch": 0.11597032409215151, + "grad_norm": 3.146247386932373, + "learning_rate": 9.992582948515772e-06, + "loss": 0.785, + "step": 891 + }, + { + "epoch": 0.11636079656384225, + "grad_norm": 3.7035303115844727, + "learning_rate": 9.992468723923216e-06, + "loss": 0.7226, + "step": 894 + }, + { + "epoch": 0.116751269035533, + "grad_norm": 3.59032940864563, + "learning_rate": 9.992353627156876e-06, + "loss": 0.6496, + "step": 897 + }, + { + "epoch": 0.11714174150722374, + "grad_norm": 3.1689419746398926, + "learning_rate": 9.992237658236859e-06, + "loss": 0.6467, + "step": 900 + }, + { + "epoch": 0.11753221397891449, + "grad_norm": 2.9618992805480957, + "learning_rate": 9.992120817183427e-06, + "loss": 0.6797, + "step": 903 + }, + { + "epoch": 0.11792268645060523, + "grad_norm": 2.657771587371826, + "learning_rate": 9.992003104016988e-06, + "loss": 0.6924, + "step": 906 + }, + { + "epoch": 0.11831315892229598, + "grad_norm": 2.938518524169922, + "learning_rate": 9.99188451875811e-06, + "loss": 0.722, + "step": 909 + }, + { + "epoch": 0.11870363139398672, + "grad_norm": 2.878551959991455, + "learning_rate": 9.991765061427508e-06, + "loss": 0.6749, + "step": 912 + }, + { + "epoch": 0.11909410386567747, + "grad_norm": 3.0917224884033203, + "learning_rate": 9.99164473204605e-06, + "loss": 0.634, + "step": 915 + }, + { + "epoch": 0.11948457633736821, + "grad_norm": 3.893941879272461, + "learning_rate": 9.991523530634758e-06, + "loss": 0.7202, + "step": 918 + }, + { + "epoch": 0.11987504880905896, + "grad_norm": 12.49660873413086, + "learning_rate": 9.991401457214807e-06, + "loss": 0.7348, + "step": 921 + }, + { + "epoch": 0.1202655212807497, + "grad_norm": 2.8306703567504883, + "learning_rate": 9.99127851180752e-06, + "loss": 0.7267, + "step": 924 + }, + { + "epoch": 0.12065599375244045, + "grad_norm": 3.2142062187194824, + "learning_rate": 9.99115469443438e-06, + "loss": 0.6601, + "step": 927 + }, + { + "epoch": 0.1210464662241312, + "grad_norm": 2.776210308074951, + "learning_rate": 9.991030005117013e-06, + "loss": 0.671, + "step": 930 + }, + { + "epoch": 0.12143693869582195, + "grad_norm": 3.657951593399048, + "learning_rate": 9.990904443877203e-06, + "loss": 0.6504, + "step": 933 + }, + { + "epoch": 0.1218274111675127, + "grad_norm": 2.87703800201416, + "learning_rate": 9.990778010736885e-06, + "loss": 0.7212, + "step": 936 + }, + { + "epoch": 0.12221788363920344, + "grad_norm": 4.6955718994140625, + "learning_rate": 9.99065070571815e-06, + "loss": 0.6704, + "step": 939 + }, + { + "epoch": 
0.12260835611089418, + "grad_norm": 4.890464782714844, + "learning_rate": 9.990522528843236e-06, + "loss": 0.587, + "step": 942 + }, + { + "epoch": 0.12299882858258493, + "grad_norm": 3.7932112216949463, + "learning_rate": 9.990393480134532e-06, + "loss": 0.6647, + "step": 945 + }, + { + "epoch": 0.12338930105427567, + "grad_norm": 2.9928741455078125, + "learning_rate": 9.990263559614589e-06, + "loss": 0.7145, + "step": 948 + }, + { + "epoch": 0.12377977352596642, + "grad_norm": 2.880676031112671, + "learning_rate": 9.990132767306097e-06, + "loss": 0.6792, + "step": 951 + }, + { + "epoch": 0.12417024599765716, + "grad_norm": 2.795924186706543, + "learning_rate": 9.990001103231909e-06, + "loss": 0.6022, + "step": 954 + }, + { + "epoch": 0.12456071846934791, + "grad_norm": 2.9928035736083984, + "learning_rate": 9.989868567415027e-06, + "loss": 0.6714, + "step": 957 + }, + { + "epoch": 0.12495119094103865, + "grad_norm": 2.6535706520080566, + "learning_rate": 9.989735159878601e-06, + "loss": 0.6269, + "step": 960 + }, + { + "epoch": 0.1253416634127294, + "grad_norm": 2.8510921001434326, + "learning_rate": 9.98960088064594e-06, + "loss": 0.7881, + "step": 963 + }, + { + "epoch": 0.12573213588442014, + "grad_norm": 2.8703200817108154, + "learning_rate": 9.989465729740504e-06, + "loss": 0.641, + "step": 966 + }, + { + "epoch": 0.1261226083561109, + "grad_norm": 2.8146934509277344, + "learning_rate": 9.989329707185899e-06, + "loss": 0.5499, + "step": 969 + }, + { + "epoch": 0.12651308082780163, + "grad_norm": 2.848714590072632, + "learning_rate": 9.989192813005891e-06, + "loss": 0.6357, + "step": 972 + }, + { + "epoch": 0.12690355329949238, + "grad_norm": 3.125911235809326, + "learning_rate": 9.989055047224393e-06, + "loss": 0.8084, + "step": 975 + }, + { + "epoch": 0.12729402577118312, + "grad_norm": 3.852508306503296, + "learning_rate": 9.988916409865476e-06, + "loss": 0.6652, + "step": 978 + }, + { + "epoch": 0.12768449824287387, + "grad_norm": 3.4313900470733643, + "learning_rate": 9.988776900953356e-06, + "loss": 0.6909, + "step": 981 + }, + { + "epoch": 0.1280749707145646, + "grad_norm": 2.9528424739837646, + "learning_rate": 9.988636520512407e-06, + "loss": 0.6335, + "step": 984 + }, + { + "epoch": 0.12846544318625536, + "grad_norm": 3.1923022270202637, + "learning_rate": 9.988495268567152e-06, + "loss": 0.7599, + "step": 987 + }, + { + "epoch": 0.1288559156579461, + "grad_norm": 2.8430709838867188, + "learning_rate": 9.988353145142267e-06, + "loss": 0.659, + "step": 990 + }, + { + "epoch": 0.12924638812963687, + "grad_norm": 4.609011173248291, + "learning_rate": 9.988210150262582e-06, + "loss": 0.7288, + "step": 993 + }, + { + "epoch": 0.12963686060132762, + "grad_norm": 3.1884047985076904, + "learning_rate": 9.98806628395308e-06, + "loss": 0.8857, + "step": 996 + }, + { + "epoch": 0.13002733307301836, + "grad_norm": 4.605678081512451, + "learning_rate": 9.987921546238888e-06, + "loss": 0.6314, + "step": 999 + }, + { + "epoch": 0.1304178055447091, + "grad_norm": 2.8281877040863037, + "learning_rate": 9.987775937145297e-06, + "loss": 0.6079, + "step": 1002 + }, + { + "epoch": 0.13080827801639985, + "grad_norm": 7.240689277648926, + "learning_rate": 9.987629456697741e-06, + "loss": 0.6683, + "step": 1005 + }, + { + "epoch": 0.1311987504880906, + "grad_norm": 3.1431357860565186, + "learning_rate": 9.987482104921813e-06, + "loss": 0.7741, + "step": 1008 + }, + { + "epoch": 0.13158922295978134, + "grad_norm": 3.569730043411255, + "learning_rate": 9.98733388184325e-06, + "loss": 0.83, + "step": 
1011 + }, + { + "epoch": 0.1319796954314721, + "grad_norm": 2.689365863800049, + "learning_rate": 9.987184787487953e-06, + "loss": 0.7279, + "step": 1014 + }, + { + "epoch": 0.13237016790316283, + "grad_norm": 2.8707399368286133, + "learning_rate": 9.987034821881965e-06, + "loss": 0.7021, + "step": 1017 + }, + { + "epoch": 0.13276064037485358, + "grad_norm": 2.8549368381500244, + "learning_rate": 9.986883985051485e-06, + "loss": 0.6305, + "step": 1020 + }, + { + "epoch": 0.13315111284654432, + "grad_norm": 2.8607091903686523, + "learning_rate": 9.986732277022862e-06, + "loss": 0.647, + "step": 1023 + }, + { + "epoch": 0.13354158531823507, + "grad_norm": 3.3489599227905273, + "learning_rate": 9.986579697822601e-06, + "loss": 0.624, + "step": 1026 + }, + { + "epoch": 0.1339320577899258, + "grad_norm": 3.7060370445251465, + "learning_rate": 9.986426247477358e-06, + "loss": 0.7278, + "step": 1029 + }, + { + "epoch": 0.13432253026161656, + "grad_norm": 3.295768976211548, + "learning_rate": 9.98627192601394e-06, + "loss": 0.6298, + "step": 1032 + }, + { + "epoch": 0.1347130027333073, + "grad_norm": 4.761738300323486, + "learning_rate": 9.986116733459303e-06, + "loss": 0.8255, + "step": 1035 + }, + { + "epoch": 0.13510347520499805, + "grad_norm": 2.9362332820892334, + "learning_rate": 9.985960669840564e-06, + "loss": 0.6209, + "step": 1038 + }, + { + "epoch": 0.1354939476766888, + "grad_norm": 2.879631519317627, + "learning_rate": 9.985803735184986e-06, + "loss": 0.6995, + "step": 1041 + }, + { + "epoch": 0.13588442014837954, + "grad_norm": 2.8256990909576416, + "learning_rate": 9.985645929519983e-06, + "loss": 0.6906, + "step": 1044 + }, + { + "epoch": 0.13627489262007028, + "grad_norm": 2.7403573989868164, + "learning_rate": 9.985487252873125e-06, + "loss": 0.5904, + "step": 1047 + }, + { + "epoch": 0.13666536509176103, + "grad_norm": 2.701652765274048, + "learning_rate": 9.98532770527213e-06, + "loss": 0.6527, + "step": 1050 + }, + { + "epoch": 0.13705583756345177, + "grad_norm": 2.5606539249420166, + "learning_rate": 9.985167286744875e-06, + "loss": 0.7061, + "step": 1053 + }, + { + "epoch": 0.13744631003514252, + "grad_norm": 2.821444511413574, + "learning_rate": 9.98500599731938e-06, + "loss": 0.671, + "step": 1056 + }, + { + "epoch": 0.13783678250683326, + "grad_norm": 3.4483652114868164, + "learning_rate": 9.984843837023826e-06, + "loss": 0.5973, + "step": 1059 + }, + { + "epoch": 0.138227254978524, + "grad_norm": 2.7346315383911133, + "learning_rate": 9.984680805886538e-06, + "loss": 0.6508, + "step": 1062 + }, + { + "epoch": 0.13861772745021475, + "grad_norm": 3.0941390991210938, + "learning_rate": 9.984516903936002e-06, + "loss": 0.6686, + "step": 1065 + }, + { + "epoch": 0.1390081999219055, + "grad_norm": 2.95706844329834, + "learning_rate": 9.984352131200847e-06, + "loss": 0.7592, + "step": 1068 + }, + { + "epoch": 0.13939867239359624, + "grad_norm": 2.627673864364624, + "learning_rate": 9.984186487709862e-06, + "loss": 0.6032, + "step": 1071 + }, + { + "epoch": 0.13978914486528699, + "grad_norm": 3.3096377849578857, + "learning_rate": 9.984019973491981e-06, + "loss": 0.7385, + "step": 1074 + }, + { + "epoch": 0.14017961733697773, + "grad_norm": 2.916146993637085, + "learning_rate": 9.983852588576296e-06, + "loss": 0.6303, + "step": 1077 + }, + { + "epoch": 0.1405700898086685, + "grad_norm": 6.1797614097595215, + "learning_rate": 9.983684332992049e-06, + "loss": 0.6442, + "step": 1080 + }, + { + "epoch": 0.14096056228035925, + "grad_norm": 4.810859203338623, + "learning_rate": 
9.983515206768633e-06, + "loss": 0.7992, + "step": 1083 + }, + { + "epoch": 0.14135103475205, + "grad_norm": 2.918363094329834, + "learning_rate": 9.983345209935593e-06, + "loss": 0.6568, + "step": 1086 + }, + { + "epoch": 0.14174150722374074, + "grad_norm": 2.725175380706787, + "learning_rate": 9.983174342522628e-06, + "loss": 0.6201, + "step": 1089 + }, + { + "epoch": 0.14213197969543148, + "grad_norm": 3.9198555946350098, + "learning_rate": 9.983002604559591e-06, + "loss": 0.6546, + "step": 1092 + }, + { + "epoch": 0.14252245216712223, + "grad_norm": 2.7483971118927, + "learning_rate": 9.98282999607648e-06, + "loss": 0.7721, + "step": 1095 + }, + { + "epoch": 0.14291292463881297, + "grad_norm": 3.3657147884368896, + "learning_rate": 9.982656517103451e-06, + "loss": 0.7828, + "step": 1098 + }, + { + "epoch": 0.14330339711050372, + "grad_norm": 2.881145715713501, + "learning_rate": 9.982482167670811e-06, + "loss": 0.6587, + "step": 1101 + }, + { + "epoch": 0.14369386958219446, + "grad_norm": 3.116957187652588, + "learning_rate": 9.982306947809016e-06, + "loss": 0.8168, + "step": 1104 + }, + { + "epoch": 0.1440843420538852, + "grad_norm": 3.0553340911865234, + "learning_rate": 9.98213085754868e-06, + "loss": 0.6254, + "step": 1107 + }, + { + "epoch": 0.14447481452557595, + "grad_norm": 2.685007095336914, + "learning_rate": 9.981953896920564e-06, + "loss": 0.6368, + "step": 1110 + }, + { + "epoch": 0.1448652869972667, + "grad_norm": 3.4758572578430176, + "learning_rate": 9.981776065955583e-06, + "loss": 0.6683, + "step": 1113 + }, + { + "epoch": 0.14525575946895744, + "grad_norm": 3.288086414337158, + "learning_rate": 9.981597364684804e-06, + "loss": 0.6544, + "step": 1116 + }, + { + "epoch": 0.14564623194064819, + "grad_norm": 2.9728589057922363, + "learning_rate": 9.981417793139443e-06, + "loss": 0.7256, + "step": 1119 + }, + { + "epoch": 0.14603670441233893, + "grad_norm": 4.481566429138184, + "learning_rate": 9.981237351350874e-06, + "loss": 0.6933, + "step": 1122 + }, + { + "epoch": 0.14642717688402968, + "grad_norm": 3.00492000579834, + "learning_rate": 9.98105603935062e-06, + "loss": 0.7278, + "step": 1125 + }, + { + "epoch": 0.14681764935572042, + "grad_norm": 3.2771825790405273, + "learning_rate": 9.980873857170352e-06, + "loss": 0.7472, + "step": 1128 + }, + { + "epoch": 0.14720812182741116, + "grad_norm": 4.5184550285339355, + "learning_rate": 9.980690804841901e-06, + "loss": 0.6744, + "step": 1131 + }, + { + "epoch": 0.1475985942991019, + "grad_norm": 3.881673574447632, + "learning_rate": 9.980506882397246e-06, + "loss": 0.5923, + "step": 1134 + }, + { + "epoch": 0.14798906677079265, + "grad_norm": 3.5104172229766846, + "learning_rate": 9.980322089868512e-06, + "loss": 0.6073, + "step": 1137 + }, + { + "epoch": 0.1483795392424834, + "grad_norm": 4.261860370635986, + "learning_rate": 9.980136427287989e-06, + "loss": 0.6716, + "step": 1140 + }, + { + "epoch": 0.14877001171417414, + "grad_norm": 3.353560209274292, + "learning_rate": 9.979949894688108e-06, + "loss": 0.7559, + "step": 1143 + }, + { + "epoch": 0.1491604841858649, + "grad_norm": 3.719203233718872, + "learning_rate": 9.979762492101456e-06, + "loss": 0.7475, + "step": 1146 + }, + { + "epoch": 0.14955095665755563, + "grad_norm": 3.833672523498535, + "learning_rate": 9.979574219560773e-06, + "loss": 0.7024, + "step": 1149 + }, + { + "epoch": 0.14994142912924638, + "grad_norm": 3.349752426147461, + "learning_rate": 9.97938507709895e-06, + "loss": 0.681, + "step": 1152 + }, + { + "epoch": 0.15033190160093712, + "grad_norm": 
3.8509340286254883, + "learning_rate": 9.979195064749029e-06, + "loss": 0.7446, + "step": 1155 + }, + { + "epoch": 0.15072237407262787, + "grad_norm": 3.9146859645843506, + "learning_rate": 9.979004182544204e-06, + "loss": 0.6898, + "step": 1158 + }, + { + "epoch": 0.1511128465443186, + "grad_norm": 3.239340305328369, + "learning_rate": 9.978812430517824e-06, + "loss": 0.7282, + "step": 1161 + }, + { + "epoch": 0.15150331901600936, + "grad_norm": 4.0539727210998535, + "learning_rate": 9.978619808703385e-06, + "loss": 0.6493, + "step": 1164 + }, + { + "epoch": 0.15189379148770013, + "grad_norm": 3.1297805309295654, + "learning_rate": 9.978426317134538e-06, + "loss": 0.7316, + "step": 1167 + }, + { + "epoch": 0.15228426395939088, + "grad_norm": 3.2664248943328857, + "learning_rate": 9.978231955845089e-06, + "loss": 0.6631, + "step": 1170 + }, + { + "epoch": 0.15267473643108162, + "grad_norm": 3.597419023513794, + "learning_rate": 9.978036724868989e-06, + "loss": 0.661, + "step": 1173 + }, + { + "epoch": 0.15306520890277237, + "grad_norm": 2.828190565109253, + "learning_rate": 9.977840624240345e-06, + "loss": 0.6447, + "step": 1176 + }, + { + "epoch": 0.1534556813744631, + "grad_norm": 2.887118339538574, + "learning_rate": 9.977643653993415e-06, + "loss": 0.7544, + "step": 1179 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 4.281754016876221, + "learning_rate": 9.977445814162612e-06, + "loss": 0.7907, + "step": 1182 + }, + { + "epoch": 0.1542366263178446, + "grad_norm": 2.87988018989563, + "learning_rate": 9.977247104782496e-06, + "loss": 0.6872, + "step": 1185 + }, + { + "epoch": 0.15462709878953534, + "grad_norm": 3.0348126888275146, + "learning_rate": 9.97704752588778e-06, + "loss": 0.6695, + "step": 1188 + }, + { + "epoch": 0.1550175712612261, + "grad_norm": 2.9127049446105957, + "learning_rate": 9.976847077513331e-06, + "loss": 0.6395, + "step": 1191 + }, + { + "epoch": 0.15540804373291683, + "grad_norm": 2.811537981033325, + "learning_rate": 9.976645759694167e-06, + "loss": 0.6382, + "step": 1194 + }, + { + "epoch": 0.15579851620460758, + "grad_norm": 3.1080002784729004, + "learning_rate": 9.976443572465462e-06, + "loss": 0.6556, + "step": 1197 + }, + { + "epoch": 0.15618898867629832, + "grad_norm": 3.4639227390289307, + "learning_rate": 9.97624051586253e-06, + "loss": 0.6893, + "step": 1200 + }, + { + "epoch": 0.15657946114798907, + "grad_norm": 3.166987895965576, + "learning_rate": 9.97603658992085e-06, + "loss": 0.669, + "step": 1203 + }, + { + "epoch": 0.1569699336196798, + "grad_norm": 3.154684543609619, + "learning_rate": 9.975831794676045e-06, + "loss": 0.7224, + "step": 1206 + }, + { + "epoch": 0.15736040609137056, + "grad_norm": 4.444057464599609, + "learning_rate": 9.975626130163893e-06, + "loss": 0.6632, + "step": 1209 + }, + { + "epoch": 0.1577508785630613, + "grad_norm": 2.8680520057678223, + "learning_rate": 9.975419596420325e-06, + "loss": 0.7838, + "step": 1212 + }, + { + "epoch": 0.15814135103475205, + "grad_norm": 2.656010866165161, + "learning_rate": 9.975212193481419e-06, + "loss": 0.6376, + "step": 1215 + }, + { + "epoch": 0.1585318235064428, + "grad_norm": 2.9363276958465576, + "learning_rate": 9.975003921383409e-06, + "loss": 0.7675, + "step": 1218 + }, + { + "epoch": 0.15892229597813354, + "grad_norm": 2.935682535171509, + "learning_rate": 9.97479478016268e-06, + "loss": 0.749, + "step": 1221 + }, + { + "epoch": 0.15931276844982428, + "grad_norm": 4.171043395996094, + "learning_rate": 9.974584769855768e-06, + "loss": 0.64, + "step": 1224 + }, + { + 
"epoch": 0.15970324092151503, + "grad_norm": 3.761404037475586, + "learning_rate": 9.974373890499363e-06, + "loss": 0.6779, + "step": 1227 + }, + { + "epoch": 0.16009371339320577, + "grad_norm": 3.7097084522247314, + "learning_rate": 9.974162142130302e-06, + "loss": 0.6745, + "step": 1230 + }, + { + "epoch": 0.16048418586489652, + "grad_norm": 2.918391704559326, + "learning_rate": 9.97394952478558e-06, + "loss": 0.6632, + "step": 1233 + }, + { + "epoch": 0.16087465833658726, + "grad_norm": 2.916435956954956, + "learning_rate": 9.97373603850234e-06, + "loss": 0.7137, + "step": 1236 + }, + { + "epoch": 0.161265130808278, + "grad_norm": 4.109777450561523, + "learning_rate": 9.973521683317877e-06, + "loss": 0.6859, + "step": 1239 + }, + { + "epoch": 0.16165560327996875, + "grad_norm": 3.2378978729248047, + "learning_rate": 9.973306459269639e-06, + "loss": 0.6296, + "step": 1242 + }, + { + "epoch": 0.1620460757516595, + "grad_norm": 3.034580707550049, + "learning_rate": 9.973090366395223e-06, + "loss": 0.6648, + "step": 1245 + }, + { + "epoch": 0.16243654822335024, + "grad_norm": 2.901724100112915, + "learning_rate": 9.972873404732383e-06, + "loss": 0.6546, + "step": 1248 + }, + { + "epoch": 0.16282702069504099, + "grad_norm": 4.608283519744873, + "learning_rate": 9.972655574319022e-06, + "loss": 0.6832, + "step": 1251 + }, + { + "epoch": 0.16321749316673176, + "grad_norm": 2.890130043029785, + "learning_rate": 9.972436875193191e-06, + "loss": 0.7566, + "step": 1254 + }, + { + "epoch": 0.1636079656384225, + "grad_norm": 2.8199384212493896, + "learning_rate": 9.972217307393099e-06, + "loss": 0.654, + "step": 1257 + }, + { + "epoch": 0.16399843811011325, + "grad_norm": 3.453725576400757, + "learning_rate": 9.971996870957104e-06, + "loss": 0.6728, + "step": 1260 + }, + { + "epoch": 0.164388910581804, + "grad_norm": 2.9947872161865234, + "learning_rate": 9.971775565923715e-06, + "loss": 0.8324, + "step": 1263 + }, + { + "epoch": 0.16477938305349474, + "grad_norm": 4.272884368896484, + "learning_rate": 9.971553392331593e-06, + "loss": 0.6809, + "step": 1266 + }, + { + "epoch": 0.16516985552518548, + "grad_norm": 3.4897780418395996, + "learning_rate": 9.971330350219553e-06, + "loss": 0.6167, + "step": 1269 + }, + { + "epoch": 0.16556032799687623, + "grad_norm": 3.0097784996032715, + "learning_rate": 9.971106439626559e-06, + "loss": 0.5921, + "step": 1272 + }, + { + "epoch": 0.16595080046856697, + "grad_norm": 5.600336074829102, + "learning_rate": 9.970881660591727e-06, + "loss": 0.6879, + "step": 1275 + }, + { + "epoch": 0.16634127294025772, + "grad_norm": 2.6105566024780273, + "learning_rate": 9.970656013154326e-06, + "loss": 0.651, + "step": 1278 + }, + { + "epoch": 0.16673174541194846, + "grad_norm": 2.573819875717163, + "learning_rate": 9.970429497353777e-06, + "loss": 0.6906, + "step": 1281 + }, + { + "epoch": 0.1671222178836392, + "grad_norm": 2.971766233444214, + "learning_rate": 9.97020211322965e-06, + "loss": 0.6863, + "step": 1284 + }, + { + "epoch": 0.16751269035532995, + "grad_norm": 2.8079757690429688, + "learning_rate": 9.96997386082167e-06, + "loss": 0.6304, + "step": 1287 + }, + { + "epoch": 0.1679031628270207, + "grad_norm": 3.1326887607574463, + "learning_rate": 9.969744740169713e-06, + "loss": 0.6407, + "step": 1290 + }, + { + "epoch": 0.16829363529871144, + "grad_norm": 2.8672449588775635, + "learning_rate": 9.969514751313803e-06, + "loss": 0.764, + "step": 1293 + }, + { + "epoch": 0.1686841077704022, + "grad_norm": 2.8906450271606445, + "learning_rate": 9.969283894294121e-06, + 
"loss": 0.8143, + "step": 1296 + }, + { + "epoch": 0.16907458024209293, + "grad_norm": 2.724889039993286, + "learning_rate": 9.969052169150997e-06, + "loss": 0.6327, + "step": 1299 + }, + { + "epoch": 0.16946505271378368, + "grad_norm": 2.980803966522217, + "learning_rate": 9.968819575924911e-06, + "loss": 0.6832, + "step": 1302 + }, + { + "epoch": 0.16985552518547442, + "grad_norm": 2.6396119594573975, + "learning_rate": 9.9685861146565e-06, + "loss": 0.6276, + "step": 1305 + }, + { + "epoch": 0.17024599765716517, + "grad_norm": 5.2581257820129395, + "learning_rate": 9.968351785386545e-06, + "loss": 0.6789, + "step": 1308 + }, + { + "epoch": 0.1706364701288559, + "grad_norm": 2.702031373977661, + "learning_rate": 9.968116588155986e-06, + "loss": 0.7108, + "step": 1311 + }, + { + "epoch": 0.17102694260054666, + "grad_norm": 2.8001067638397217, + "learning_rate": 9.967880523005911e-06, + "loss": 0.6184, + "step": 1314 + }, + { + "epoch": 0.1714174150722374, + "grad_norm": 2.5163309574127197, + "learning_rate": 9.967643589977559e-06, + "loss": 0.6862, + "step": 1317 + }, + { + "epoch": 0.17180788754392814, + "grad_norm": 3.189988374710083, + "learning_rate": 9.967405789112322e-06, + "loss": 0.6618, + "step": 1320 + }, + { + "epoch": 0.1721983600156189, + "grad_norm": 3.1581954956054688, + "learning_rate": 9.967167120451744e-06, + "loss": 0.6224, + "step": 1323 + }, + { + "epoch": 0.17258883248730963, + "grad_norm": 4.420681953430176, + "learning_rate": 9.966927584037518e-06, + "loss": 0.7317, + "step": 1326 + }, + { + "epoch": 0.17297930495900038, + "grad_norm": 2.627751350402832, + "learning_rate": 9.966687179911494e-06, + "loss": 0.706, + "step": 1329 + }, + { + "epoch": 0.17336977743069112, + "grad_norm": 2.905385971069336, + "learning_rate": 9.966445908115668e-06, + "loss": 0.6165, + "step": 1332 + }, + { + "epoch": 0.17376024990238187, + "grad_norm": 2.9616243839263916, + "learning_rate": 9.966203768692189e-06, + "loss": 0.619, + "step": 1335 + }, + { + "epoch": 0.17415072237407261, + "grad_norm": 3.9424779415130615, + "learning_rate": 9.965960761683358e-06, + "loss": 0.6366, + "step": 1338 + }, + { + "epoch": 0.1745411948457634, + "grad_norm": 2.880329132080078, + "learning_rate": 9.965716887131628e-06, + "loss": 0.7437, + "step": 1341 + }, + { + "epoch": 0.17493166731745413, + "grad_norm": 2.994891405105591, + "learning_rate": 9.965472145079606e-06, + "loss": 0.7427, + "step": 1344 + }, + { + "epoch": 0.17532213978914488, + "grad_norm": 3.0199975967407227, + "learning_rate": 9.965226535570047e-06, + "loss": 0.6787, + "step": 1347 + }, + { + "epoch": 0.17571261226083562, + "grad_norm": 4.0724873542785645, + "learning_rate": 9.964980058645856e-06, + "loss": 0.6798, + "step": 1350 + }, + { + "epoch": 0.17610308473252637, + "grad_norm": 3.5381436347961426, + "learning_rate": 9.964732714350093e-06, + "loss": 0.6456, + "step": 1353 + }, + { + "epoch": 0.1764935572042171, + "grad_norm": 2.9115025997161865, + "learning_rate": 9.964484502725972e-06, + "loss": 0.6812, + "step": 1356 + }, + { + "epoch": 0.17688402967590786, + "grad_norm": 2.9412219524383545, + "learning_rate": 9.964235423816851e-06, + "loss": 0.6854, + "step": 1359 + }, + { + "epoch": 0.1772745021475986, + "grad_norm": 3.002401351928711, + "learning_rate": 9.963985477666242e-06, + "loss": 0.6684, + "step": 1362 + }, + { + "epoch": 0.17766497461928935, + "grad_norm": 5.3304877281188965, + "learning_rate": 9.963734664317816e-06, + "loss": 0.7724, + "step": 1365 + }, + { + "epoch": 0.1780554470909801, + "grad_norm": 
3.2181010246276855, + "learning_rate": 9.963482983815385e-06, + "loss": 0.6604, + "step": 1368 + }, + { + "epoch": 0.17844591956267083, + "grad_norm": 3.477856397628784, + "learning_rate": 9.963230436202918e-06, + "loss": 0.6447, + "step": 1371 + }, + { + "epoch": 0.17883639203436158, + "grad_norm": 3.450572967529297, + "learning_rate": 9.962977021524535e-06, + "loss": 0.5988, + "step": 1374 + }, + { + "epoch": 0.17922686450605232, + "grad_norm": 4.131798267364502, + "learning_rate": 9.962722739824506e-06, + "loss": 0.6665, + "step": 1377 + }, + { + "epoch": 0.17961733697774307, + "grad_norm": 3.188093423843384, + "learning_rate": 9.962467591147256e-06, + "loss": 0.6613, + "step": 1380 + }, + { + "epoch": 0.18000780944943381, + "grad_norm": 3.239835023880005, + "learning_rate": 9.962211575537357e-06, + "loss": 0.6192, + "step": 1383 + }, + { + "epoch": 0.18039828192112456, + "grad_norm": 3.1334681510925293, + "learning_rate": 9.961954693039535e-06, + "loss": 0.6414, + "step": 1386 + }, + { + "epoch": 0.1807887543928153, + "grad_norm": 2.906036376953125, + "learning_rate": 9.961696943698667e-06, + "loss": 0.7304, + "step": 1389 + }, + { + "epoch": 0.18117922686450605, + "grad_norm": 4.601681709289551, + "learning_rate": 9.961438327559778e-06, + "loss": 0.6493, + "step": 1392 + }, + { + "epoch": 0.1815696993361968, + "grad_norm": 3.0899908542633057, + "learning_rate": 9.961178844668054e-06, + "loss": 0.6764, + "step": 1395 + }, + { + "epoch": 0.18196017180788754, + "grad_norm": 3.128690242767334, + "learning_rate": 9.96091849506882e-06, + "loss": 0.7436, + "step": 1398 + }, + { + "epoch": 0.18235064427957828, + "grad_norm": 4.097222805023193, + "learning_rate": 9.960657278807562e-06, + "loss": 0.6236, + "step": 1401 + }, + { + "epoch": 0.18274111675126903, + "grad_norm": 2.8826756477355957, + "learning_rate": 9.960395195929915e-06, + "loss": 0.7283, + "step": 1404 + }, + { + "epoch": 0.18313158922295977, + "grad_norm": 2.9129467010498047, + "learning_rate": 9.96013224648166e-06, + "loss": 0.7447, + "step": 1407 + }, + { + "epoch": 0.18352206169465052, + "grad_norm": 3.429777145385742, + "learning_rate": 9.959868430508737e-06, + "loss": 0.5577, + "step": 1410 + }, + { + "epoch": 0.18391253416634126, + "grad_norm": 6.315993785858154, + "learning_rate": 9.959603748057234e-06, + "loss": 0.5291, + "step": 1413 + }, + { + "epoch": 0.184303006638032, + "grad_norm": 2.7502191066741943, + "learning_rate": 9.959338199173387e-06, + "loss": 0.7087, + "step": 1416 + }, + { + "epoch": 0.18469347910972275, + "grad_norm": 3.25874662399292, + "learning_rate": 9.959071783903592e-06, + "loss": 0.6868, + "step": 1419 + }, + { + "epoch": 0.1850839515814135, + "grad_norm": 2.6682872772216797, + "learning_rate": 9.958804502294388e-06, + "loss": 0.598, + "step": 1422 + }, + { + "epoch": 0.18547442405310427, + "grad_norm": 4.8309221267700195, + "learning_rate": 9.95853635439247e-06, + "loss": 0.7511, + "step": 1425 + }, + { + "epoch": 0.18586489652479501, + "grad_norm": 5.207489013671875, + "learning_rate": 9.95826734024468e-06, + "loss": 0.7661, + "step": 1428 + }, + { + "epoch": 0.18625536899648576, + "grad_norm": 2.6510419845581055, + "learning_rate": 9.95799745989802e-06, + "loss": 0.6397, + "step": 1431 + }, + { + "epoch": 0.1866458414681765, + "grad_norm": 3.0544328689575195, + "learning_rate": 9.957726713399631e-06, + "loss": 0.6224, + "step": 1434 + }, + { + "epoch": 0.18703631393986725, + "grad_norm": 2.7472503185272217, + "learning_rate": 9.957455100796815e-06, + "loss": 0.6854, + "step": 1437 + }, + { + 
"epoch": 0.187426786411558, + "grad_norm": 3.071814775466919, + "learning_rate": 9.957182622137022e-06, + "loss": 0.6508, + "step": 1440 + }, + { + "epoch": 0.18781725888324874, + "grad_norm": 2.8780770301818848, + "learning_rate": 9.956909277467854e-06, + "loss": 0.6678, + "step": 1443 + }, + { + "epoch": 0.18820773135493948, + "grad_norm": 3.1620678901672363, + "learning_rate": 9.956635066837062e-06, + "loss": 0.6834, + "step": 1446 + }, + { + "epoch": 0.18859820382663023, + "grad_norm": 2.79729962348938, + "learning_rate": 9.956359990292552e-06, + "loss": 0.6528, + "step": 1449 + }, + { + "epoch": 0.18898867629832097, + "grad_norm": 2.7644762992858887, + "learning_rate": 9.956084047882377e-06, + "loss": 0.7051, + "step": 1452 + }, + { + "epoch": 0.18937914877001172, + "grad_norm": 3.1195924282073975, + "learning_rate": 9.955807239654746e-06, + "loss": 0.7568, + "step": 1455 + }, + { + "epoch": 0.18976962124170246, + "grad_norm": 2.780245065689087, + "learning_rate": 9.955529565658017e-06, + "loss": 0.55, + "step": 1458 + }, + { + "epoch": 0.1901600937133932, + "grad_norm": 5.350193977355957, + "learning_rate": 9.955251025940696e-06, + "loss": 0.6525, + "step": 1461 + }, + { + "epoch": 0.19055056618508395, + "grad_norm": 2.754183530807495, + "learning_rate": 9.954971620551446e-06, + "loss": 0.6989, + "step": 1464 + }, + { + "epoch": 0.1909410386567747, + "grad_norm": 2.848510980606079, + "learning_rate": 9.954691349539076e-06, + "loss": 0.7376, + "step": 1467 + }, + { + "epoch": 0.19133151112846544, + "grad_norm": 4.006906509399414, + "learning_rate": 9.954410212952551e-06, + "loss": 0.68, + "step": 1470 + }, + { + "epoch": 0.1917219836001562, + "grad_norm": 2.6759417057037354, + "learning_rate": 9.954128210840985e-06, + "loss": 0.6434, + "step": 1473 + }, + { + "epoch": 0.19211245607184693, + "grad_norm": 2.9079430103302, + "learning_rate": 9.953845343253643e-06, + "loss": 0.6194, + "step": 1476 + }, + { + "epoch": 0.19250292854353768, + "grad_norm": 2.6229822635650635, + "learning_rate": 9.953561610239941e-06, + "loss": 0.6018, + "step": 1479 + }, + { + "epoch": 0.19289340101522842, + "grad_norm": 2.8164374828338623, + "learning_rate": 9.953277011849444e-06, + "loss": 0.6276, + "step": 1482 + }, + { + "epoch": 0.19328387348691917, + "grad_norm": 3.2668967247009277, + "learning_rate": 9.952991548131876e-06, + "loss": 0.722, + "step": 1485 + }, + { + "epoch": 0.1936743459586099, + "grad_norm": 2.878713846206665, + "learning_rate": 9.952705219137102e-06, + "loss": 0.7137, + "step": 1488 + }, + { + "epoch": 0.19406481843030066, + "grad_norm": 2.783433437347412, + "learning_rate": 9.952418024915146e-06, + "loss": 0.6569, + "step": 1491 + }, + { + "epoch": 0.1944552909019914, + "grad_norm": 3.105311870574951, + "learning_rate": 9.95212996551618e-06, + "loss": 0.6587, + "step": 1494 + }, + { + "epoch": 0.19484576337368215, + "grad_norm": 6.641456604003906, + "learning_rate": 9.951841040990527e-06, + "loss": 0.6604, + "step": 1497 + }, + { + "epoch": 0.1952362358453729, + "grad_norm": 3.2183666229248047, + "learning_rate": 9.951551251388661e-06, + "loss": 0.74, + "step": 1500 + }, + { + "epoch": 0.19562670831706364, + "grad_norm": 2.9036731719970703, + "learning_rate": 9.951260596761208e-06, + "loss": 0.6216, + "step": 1503 + }, + { + "epoch": 0.19601718078875438, + "grad_norm": 3.435842752456665, + "learning_rate": 9.950969077158944e-06, + "loss": 0.7316, + "step": 1506 + }, + { + "epoch": 0.19640765326044513, + "grad_norm": 2.714198350906372, + "learning_rate": 9.950676692632797e-06, + 
"loss": 0.566, + "step": 1509 + }, + { + "epoch": 0.1967981257321359, + "grad_norm": 3.2397942543029785, + "learning_rate": 9.950383443233848e-06, + "loss": 0.702, + "step": 1512 + }, + { + "epoch": 0.19718859820382664, + "grad_norm": 2.9357926845550537, + "learning_rate": 9.950089329013324e-06, + "loss": 0.7299, + "step": 1515 + }, + { + "epoch": 0.1975790706755174, + "grad_norm": 4.893215179443359, + "learning_rate": 9.949794350022609e-06, + "loss": 0.7201, + "step": 1518 + }, + { + "epoch": 0.19796954314720813, + "grad_norm": 2.9395291805267334, + "learning_rate": 9.949498506313232e-06, + "loss": 0.7369, + "step": 1521 + }, + { + "epoch": 0.19836001561889888, + "grad_norm": 2.985732078552246, + "learning_rate": 9.949201797936882e-06, + "loss": 0.7187, + "step": 1524 + }, + { + "epoch": 0.19875048809058962, + "grad_norm": 3.205756425857544, + "learning_rate": 9.948904224945386e-06, + "loss": 0.6903, + "step": 1527 + }, + { + "epoch": 0.19914096056228037, + "grad_norm": 3.1247010231018066, + "learning_rate": 9.948605787390735e-06, + "loss": 0.6934, + "step": 1530 + }, + { + "epoch": 0.1995314330339711, + "grad_norm": 2.7591116428375244, + "learning_rate": 9.948306485325061e-06, + "loss": 0.6708, + "step": 1533 + }, + { + "epoch": 0.19992190550566186, + "grad_norm": 3.138434648513794, + "learning_rate": 9.948006318800657e-06, + "loss": 0.7368, + "step": 1536 + }, + { + "epoch": 0.2003123779773526, + "grad_norm": 6.248907089233398, + "learning_rate": 9.947705287869956e-06, + "loss": 0.8298, + "step": 1539 + }, + { + "epoch": 0.20070285044904335, + "grad_norm": 4.321200847625732, + "learning_rate": 9.947403392585548e-06, + "loss": 0.6983, + "step": 1542 + }, + { + "epoch": 0.2010933229207341, + "grad_norm": 2.919874429702759, + "learning_rate": 9.947100633000178e-06, + "loss": 0.6522, + "step": 1545 + }, + { + "epoch": 0.20148379539242484, + "grad_norm": 4.117648601531982, + "learning_rate": 9.946797009166732e-06, + "loss": 0.5987, + "step": 1548 + }, + { + "epoch": 0.20187426786411558, + "grad_norm": 3.4389774799346924, + "learning_rate": 9.946492521138258e-06, + "loss": 0.7094, + "step": 1551 + }, + { + "epoch": 0.20226474033580633, + "grad_norm": 2.9114327430725098, + "learning_rate": 9.946187168967944e-06, + "loss": 0.7512, + "step": 1554 + }, + { + "epoch": 0.20265521280749707, + "grad_norm": 2.9621176719665527, + "learning_rate": 9.94588095270914e-06, + "loss": 0.6346, + "step": 1557 + }, + { + "epoch": 0.20304568527918782, + "grad_norm": 4.908132553100586, + "learning_rate": 9.945573872415334e-06, + "loss": 0.7007, + "step": 1560 + }, + { + "epoch": 0.20343615775087856, + "grad_norm": 3.1218419075012207, + "learning_rate": 9.94526592814018e-06, + "loss": 0.6411, + "step": 1563 + }, + { + "epoch": 0.2038266302225693, + "grad_norm": 2.8320937156677246, + "learning_rate": 9.944957119937471e-06, + "loss": 0.576, + "step": 1566 + }, + { + "epoch": 0.20421710269426005, + "grad_norm": 2.7374327182769775, + "learning_rate": 9.944647447861154e-06, + "loss": 0.5977, + "step": 1569 + }, + { + "epoch": 0.2046075751659508, + "grad_norm": 3.095136880874634, + "learning_rate": 9.944336911965332e-06, + "loss": 0.6583, + "step": 1572 + }, + { + "epoch": 0.20499804763764154, + "grad_norm": 3.028735399246216, + "learning_rate": 9.944025512304251e-06, + "loss": 0.6727, + "step": 1575 + }, + { + "epoch": 0.20538852010933228, + "grad_norm": 4.234666347503662, + "learning_rate": 9.943713248932314e-06, + "loss": 0.6762, + "step": 1578 + }, + { + "epoch": 0.20577899258102303, + "grad_norm": 
2.7858617305755615, + "learning_rate": 9.943400121904074e-06, + "loss": 0.7175, + "step": 1581 + }, + { + "epoch": 0.20616946505271377, + "grad_norm": 2.894981861114502, + "learning_rate": 9.943086131274231e-06, + "loss": 0.6666, + "step": 1584 + }, + { + "epoch": 0.20655993752440452, + "grad_norm": 2.6675071716308594, + "learning_rate": 9.94277127709764e-06, + "loss": 0.6473, + "step": 1587 + }, + { + "epoch": 0.20695040999609526, + "grad_norm": 2.9758174419403076, + "learning_rate": 9.942455559429304e-06, + "loss": 0.6711, + "step": 1590 + }, + { + "epoch": 0.207340882467786, + "grad_norm": 4.3356781005859375, + "learning_rate": 9.94213897832438e-06, + "loss": 0.6205, + "step": 1593 + }, + { + "epoch": 0.20773135493947675, + "grad_norm": 3.1128265857696533, + "learning_rate": 9.941821533838172e-06, + "loss": 0.677, + "step": 1596 + }, + { + "epoch": 0.20812182741116753, + "grad_norm": 3.0443339347839355, + "learning_rate": 9.941503226026139e-06, + "loss": 0.7659, + "step": 1599 + }, + { + "epoch": 0.20851229988285827, + "grad_norm": 3.6503114700317383, + "learning_rate": 9.941184054943888e-06, + "loss": 0.6364, + "step": 1602 + }, + { + "epoch": 0.20890277235454902, + "grad_norm": 3.119065761566162, + "learning_rate": 9.940864020647178e-06, + "loss": 0.6696, + "step": 1605 + }, + { + "epoch": 0.20929324482623976, + "grad_norm": 2.870157241821289, + "learning_rate": 9.940543123191916e-06, + "loss": 0.7786, + "step": 1608 + }, + { + "epoch": 0.2096837172979305, + "grad_norm": 2.8346638679504395, + "learning_rate": 9.940221362634165e-06, + "loss": 0.7752, + "step": 1611 + }, + { + "epoch": 0.21007418976962125, + "grad_norm": 2.9750826358795166, + "learning_rate": 9.939898739030135e-06, + "loss": 0.6307, + "step": 1614 + }, + { + "epoch": 0.210464662241312, + "grad_norm": 3.138920307159424, + "learning_rate": 9.939575252436186e-06, + "loss": 0.8262, + "step": 1617 + }, + { + "epoch": 0.21085513471300274, + "grad_norm": 2.8036742210388184, + "learning_rate": 9.939250902908832e-06, + "loss": 0.801, + "step": 1620 + }, + { + "epoch": 0.21124560718469348, + "grad_norm": 2.724499225616455, + "learning_rate": 9.938925690504737e-06, + "loss": 0.65, + "step": 1623 + }, + { + "epoch": 0.21163607965638423, + "grad_norm": 3.0661230087280273, + "learning_rate": 9.938599615280713e-06, + "loss": 0.7127, + "step": 1626 + }, + { + "epoch": 0.21202655212807497, + "grad_norm": 2.5042243003845215, + "learning_rate": 9.938272677293727e-06, + "loss": 0.6147, + "step": 1629 + }, + { + "epoch": 0.21241702459976572, + "grad_norm": 3.2207114696502686, + "learning_rate": 9.937944876600891e-06, + "loss": 0.7055, + "step": 1632 + }, + { + "epoch": 0.21280749707145646, + "grad_norm": 3.2957639694213867, + "learning_rate": 9.937616213259474e-06, + "loss": 0.7794, + "step": 1635 + }, + { + "epoch": 0.2131979695431472, + "grad_norm": 3.5852959156036377, + "learning_rate": 9.93728668732689e-06, + "loss": 0.6852, + "step": 1638 + }, + { + "epoch": 0.21358844201483795, + "grad_norm": 3.0781290531158447, + "learning_rate": 9.936956298860711e-06, + "loss": 0.7384, + "step": 1641 + }, + { + "epoch": 0.2139789144865287, + "grad_norm": 2.6453282833099365, + "learning_rate": 9.93662504791865e-06, + "loss": 0.6695, + "step": 1644 + }, + { + "epoch": 0.21436938695821944, + "grad_norm": 3.183838367462158, + "learning_rate": 9.93629293455858e-06, + "loss": 0.7207, + "step": 1647 + }, + { + "epoch": 0.2147598594299102, + "grad_norm": 2.969679832458496, + "learning_rate": 9.935959958838519e-06, + "loss": 0.6558, + "step": 1650 + }, + { + 
"epoch": 0.21515033190160093, + "grad_norm": 2.812744617462158, + "learning_rate": 9.935626120816636e-06, + "loss": 0.6737, + "step": 1653 + }, + { + "epoch": 0.21554080437329168, + "grad_norm": 2.789893627166748, + "learning_rate": 9.935291420551252e-06, + "loss": 0.6972, + "step": 1656 + }, + { + "epoch": 0.21593127684498242, + "grad_norm": 2.601032018661499, + "learning_rate": 9.934955858100838e-06, + "loss": 0.6458, + "step": 1659 + }, + { + "epoch": 0.21632174931667317, + "grad_norm": 3.639371395111084, + "learning_rate": 9.93461943352402e-06, + "loss": 0.7118, + "step": 1662 + }, + { + "epoch": 0.2167122217883639, + "grad_norm": 3.122035264968872, + "learning_rate": 9.934282146879568e-06, + "loss": 0.7283, + "step": 1665 + }, + { + "epoch": 0.21710269426005466, + "grad_norm": 2.9482860565185547, + "learning_rate": 9.933943998226403e-06, + "loss": 0.6477, + "step": 1668 + }, + { + "epoch": 0.2174931667317454, + "grad_norm": 2.530093193054199, + "learning_rate": 9.933604987623603e-06, + "loss": 0.6032, + "step": 1671 + }, + { + "epoch": 0.21788363920343615, + "grad_norm": 2.5820152759552, + "learning_rate": 9.93326511513039e-06, + "loss": 0.6156, + "step": 1674 + }, + { + "epoch": 0.2182741116751269, + "grad_norm": 3.059326410293579, + "learning_rate": 9.93292438080614e-06, + "loss": 0.6979, + "step": 1677 + }, + { + "epoch": 0.21866458414681764, + "grad_norm": 3.3234145641326904, + "learning_rate": 9.932582784710377e-06, + "loss": 0.6619, + "step": 1680 + }, + { + "epoch": 0.21905505661850838, + "grad_norm": 2.865112066268921, + "learning_rate": 9.932240326902777e-06, + "loss": 0.6367, + "step": 1683 + }, + { + "epoch": 0.21944552909019915, + "grad_norm": 2.950484037399292, + "learning_rate": 9.93189700744317e-06, + "loss": 0.7528, + "step": 1686 + }, + { + "epoch": 0.2198360015618899, + "grad_norm": 2.690485715866089, + "learning_rate": 9.931552826391529e-06, + "loss": 0.6232, + "step": 1689 + }, + { + "epoch": 0.22022647403358064, + "grad_norm": 2.7388739585876465, + "learning_rate": 9.931207783807984e-06, + "loss": 0.7912, + "step": 1692 + }, + { + "epoch": 0.2206169465052714, + "grad_norm": 2.996173620223999, + "learning_rate": 9.930861879752814e-06, + "loss": 0.7042, + "step": 1695 + }, + { + "epoch": 0.22100741897696213, + "grad_norm": 3.6289913654327393, + "learning_rate": 9.930515114286446e-06, + "loss": 0.713, + "step": 1698 + }, + { + "epoch": 0.22139789144865288, + "grad_norm": 2.484739065170288, + "learning_rate": 9.93016748746946e-06, + "loss": 0.7148, + "step": 1701 + }, + { + "epoch": 0.22178836392034362, + "grad_norm": 3.885227918624878, + "learning_rate": 9.929818999362585e-06, + "loss": 0.5977, + "step": 1704 + }, + { + "epoch": 0.22217883639203437, + "grad_norm": 3.2628114223480225, + "learning_rate": 9.929469650026705e-06, + "loss": 0.6511, + "step": 1707 + }, + { + "epoch": 0.2225693088637251, + "grad_norm": 2.676222085952759, + "learning_rate": 9.929119439522843e-06, + "loss": 0.7086, + "step": 1710 + }, + { + "epoch": 0.22295978133541586, + "grad_norm": 2.7704615592956543, + "learning_rate": 9.928768367912186e-06, + "loss": 0.6214, + "step": 1713 + }, + { + "epoch": 0.2233502538071066, + "grad_norm": 2.829817295074463, + "learning_rate": 9.928416435256062e-06, + "loss": 0.7286, + "step": 1716 + }, + { + "epoch": 0.22374072627879735, + "grad_norm": 4.687877178192139, + "learning_rate": 9.928063641615958e-06, + "loss": 0.6466, + "step": 1719 + }, + { + "epoch": 0.2241311987504881, + "grad_norm": 2.777306079864502, + "learning_rate": 9.9277099870535e-06, + "loss": 
0.7183, + "step": 1722 + }, + { + "epoch": 0.22452167122217884, + "grad_norm": 3.198352575302124, + "learning_rate": 9.927355471630475e-06, + "loss": 0.6335, + "step": 1725 + }, + { + "epoch": 0.22491214369386958, + "grad_norm": 3.063441276550293, + "learning_rate": 9.927000095408814e-06, + "loss": 0.7046, + "step": 1728 + }, + { + "epoch": 0.22530261616556033, + "grad_norm": 2.7282402515411377, + "learning_rate": 9.926643858450602e-06, + "loss": 0.6636, + "step": 1731 + }, + { + "epoch": 0.22569308863725107, + "grad_norm": 3.712812900543213, + "learning_rate": 9.926286760818072e-06, + "loss": 0.763, + "step": 1734 + }, + { + "epoch": 0.22608356110894182, + "grad_norm": 2.7162492275238037, + "learning_rate": 9.925928802573608e-06, + "loss": 0.7452, + "step": 1737 + }, + { + "epoch": 0.22647403358063256, + "grad_norm": 2.689542770385742, + "learning_rate": 9.925569983779744e-06, + "loss": 0.6457, + "step": 1740 + }, + { + "epoch": 0.2268645060523233, + "grad_norm": 3.449857234954834, + "learning_rate": 9.925210304499168e-06, + "loss": 0.6226, + "step": 1743 + }, + { + "epoch": 0.22725497852401405, + "grad_norm": 3.3526997566223145, + "learning_rate": 9.92484976479471e-06, + "loss": 0.6801, + "step": 1746 + }, + { + "epoch": 0.2276454509957048, + "grad_norm": 3.0590097904205322, + "learning_rate": 9.924488364729362e-06, + "loss": 0.7149, + "step": 1749 + }, + { + "epoch": 0.22803592346739554, + "grad_norm": 4.266837120056152, + "learning_rate": 9.924126104366255e-06, + "loss": 0.6816, + "step": 1752 + }, + { + "epoch": 0.22842639593908629, + "grad_norm": 2.6770737171173096, + "learning_rate": 9.923762983768674e-06, + "loss": 0.7303, + "step": 1755 + }, + { + "epoch": 0.22881686841077703, + "grad_norm": 2.922563314437866, + "learning_rate": 9.92339900300006e-06, + "loss": 0.6389, + "step": 1758 + }, + { + "epoch": 0.22920734088246777, + "grad_norm": 3.8620433807373047, + "learning_rate": 9.923034162123996e-06, + "loss": 0.6251, + "step": 1761 + }, + { + "epoch": 0.22959781335415852, + "grad_norm": 3.186556339263916, + "learning_rate": 9.922668461204222e-06, + "loss": 0.8476, + "step": 1764 + }, + { + "epoch": 0.22998828582584926, + "grad_norm": 3.0054855346679688, + "learning_rate": 9.922301900304622e-06, + "loss": 0.6719, + "step": 1767 + }, + { + "epoch": 0.23037875829754004, + "grad_norm": 3.260510206222534, + "learning_rate": 9.921934479489236e-06, + "loss": 0.6796, + "step": 1770 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 3.021167755126953, + "learning_rate": 9.921566198822252e-06, + "loss": 0.667, + "step": 1773 + }, + { + "epoch": 0.23115970324092153, + "grad_norm": 3.6485886573791504, + "learning_rate": 9.921197058368005e-06, + "loss": 0.7357, + "step": 1776 + }, + { + "epoch": 0.23155017571261227, + "grad_norm": 2.8014872074127197, + "learning_rate": 9.920827058190984e-06, + "loss": 0.8078, + "step": 1779 + }, + { + "epoch": 0.23194064818430302, + "grad_norm": 2.7139265537261963, + "learning_rate": 9.92045619835583e-06, + "loss": 0.7207, + "step": 1782 + }, + { + "epoch": 0.23233112065599376, + "grad_norm": 2.5168399810791016, + "learning_rate": 9.920084478927327e-06, + "loss": 0.5873, + "step": 1785 + }, + { + "epoch": 0.2327215931276845, + "grad_norm": 3.7160627841949463, + "learning_rate": 9.919711899970417e-06, + "loss": 0.7382, + "step": 1788 + }, + { + "epoch": 0.23311206559937525, + "grad_norm": 2.766188383102417, + "learning_rate": 9.919338461550188e-06, + "loss": 0.6981, + "step": 1791 + }, + { + "epoch": 0.233502538071066, + "grad_norm": 3.4832241535186768, + 
"learning_rate": 9.918964163731878e-06, + "loss": 0.6844, + "step": 1794 + }, + { + "epoch": 0.23389301054275674, + "grad_norm": 2.998603582382202, + "learning_rate": 9.918589006580877e-06, + "loss": 0.6986, + "step": 1797 + }, + { + "epoch": 0.23428348301444749, + "grad_norm": 2.618968963623047, + "learning_rate": 9.918212990162724e-06, + "loss": 0.6967, + "step": 1800 + }, + { + "epoch": 0.23467395548613823, + "grad_norm": 3.121068000793457, + "learning_rate": 9.917836114543105e-06, + "loss": 0.5803, + "step": 1803 + }, + { + "epoch": 0.23506442795782898, + "grad_norm": 2.8870410919189453, + "learning_rate": 9.917458379787865e-06, + "loss": 0.66, + "step": 1806 + }, + { + "epoch": 0.23545490042951972, + "grad_norm": 3.4096627235412598, + "learning_rate": 9.917079785962991e-06, + "loss": 0.644, + "step": 1809 + }, + { + "epoch": 0.23584537290121046, + "grad_norm": 3.3763647079467773, + "learning_rate": 9.916700333134622e-06, + "loss": 0.7743, + "step": 1812 + }, + { + "epoch": 0.2362358453729012, + "grad_norm": 3.2882561683654785, + "learning_rate": 9.916320021369049e-06, + "loss": 0.7217, + "step": 1815 + }, + { + "epoch": 0.23662631784459195, + "grad_norm": 2.849679946899414, + "learning_rate": 9.91593885073271e-06, + "loss": 0.6473, + "step": 1818 + }, + { + "epoch": 0.2370167903162827, + "grad_norm": 2.5509214401245117, + "learning_rate": 9.915556821292194e-06, + "loss": 0.6598, + "step": 1821 + }, + { + "epoch": 0.23740726278797344, + "grad_norm": 4.192689895629883, + "learning_rate": 9.915173933114243e-06, + "loss": 0.737, + "step": 1824 + }, + { + "epoch": 0.2377977352596642, + "grad_norm": 3.805640459060669, + "learning_rate": 9.914790186265747e-06, + "loss": 0.6514, + "step": 1827 + }, + { + "epoch": 0.23818820773135493, + "grad_norm": 2.7506203651428223, + "learning_rate": 9.914405580813744e-06, + "loss": 0.6088, + "step": 1830 + }, + { + "epoch": 0.23857868020304568, + "grad_norm": 2.9429638385772705, + "learning_rate": 9.914020116825425e-06, + "loss": 0.6348, + "step": 1833 + }, + { + "epoch": 0.23896915267473642, + "grad_norm": 2.8457984924316406, + "learning_rate": 9.913633794368128e-06, + "loss": 0.6043, + "step": 1836 + }, + { + "epoch": 0.23935962514642717, + "grad_norm": 3.119004011154175, + "learning_rate": 9.913246613509344e-06, + "loss": 0.6326, + "step": 1839 + }, + { + "epoch": 0.2397500976181179, + "grad_norm": 2.674212694168091, + "learning_rate": 9.912858574316714e-06, + "loss": 0.6505, + "step": 1842 + }, + { + "epoch": 0.24014057008980866, + "grad_norm": 4.301478862762451, + "learning_rate": 9.912469676858025e-06, + "loss": 0.6864, + "step": 1845 + }, + { + "epoch": 0.2405310425614994, + "grad_norm": 3.199939727783203, + "learning_rate": 9.912079921201216e-06, + "loss": 0.716, + "step": 1848 + }, + { + "epoch": 0.24092151503319015, + "grad_norm": 3.2009308338165283, + "learning_rate": 9.911689307414381e-06, + "loss": 0.7442, + "step": 1851 + }, + { + "epoch": 0.2413119875048809, + "grad_norm": 4.463657855987549, + "learning_rate": 9.911297835565755e-06, + "loss": 0.6488, + "step": 1854 + }, + { + "epoch": 0.24170245997657167, + "grad_norm": 2.5389511585235596, + "learning_rate": 9.91090550572373e-06, + "loss": 0.6598, + "step": 1857 + }, + { + "epoch": 0.2420929324482624, + "grad_norm": 2.7800350189208984, + "learning_rate": 9.910512317956845e-06, + "loss": 0.6538, + "step": 1860 + }, + { + "epoch": 0.24248340491995315, + "grad_norm": 4.617105007171631, + "learning_rate": 9.910118272333787e-06, + "loss": 0.6534, + "step": 1863 + }, + { + "epoch": 
0.2428738773916439, + "grad_norm": 4.014925479888916, + "learning_rate": 9.909723368923397e-06, + "loss": 0.7108, + "step": 1866 + }, + { + "epoch": 0.24326434986333464, + "grad_norm": 3.7771573066711426, + "learning_rate": 9.909327607794663e-06, + "loss": 0.7403, + "step": 1869 + }, + { + "epoch": 0.2436548223350254, + "grad_norm": 4.131522178649902, + "learning_rate": 9.908930989016723e-06, + "loss": 0.6846, + "step": 1872 + }, + { + "epoch": 0.24404529480671613, + "grad_norm": 2.5078864097595215, + "learning_rate": 9.908533512658867e-06, + "loss": 0.7045, + "step": 1875 + }, + { + "epoch": 0.24443576727840688, + "grad_norm": 3.6496095657348633, + "learning_rate": 9.90813517879053e-06, + "loss": 0.6804, + "step": 1878 + }, + { + "epoch": 0.24482623975009762, + "grad_norm": 3.153731107711792, + "learning_rate": 9.907735987481306e-06, + "loss": 0.77, + "step": 1881 + }, + { + "epoch": 0.24521671222178837, + "grad_norm": 3.234714984893799, + "learning_rate": 9.907335938800925e-06, + "loss": 0.6396, + "step": 1884 + }, + { + "epoch": 0.2456071846934791, + "grad_norm": 4.209356784820557, + "learning_rate": 9.906935032819283e-06, + "loss": 0.6937, + "step": 1887 + }, + { + "epoch": 0.24599765716516986, + "grad_norm": 2.8862154483795166, + "learning_rate": 9.906533269606412e-06, + "loss": 0.7209, + "step": 1890 + }, + { + "epoch": 0.2463881296368606, + "grad_norm": 3.8933069705963135, + "learning_rate": 9.9061306492325e-06, + "loss": 0.7016, + "step": 1893 + }, + { + "epoch": 0.24677860210855135, + "grad_norm": 3.536663770675659, + "learning_rate": 9.905727171767885e-06, + "loss": 0.6479, + "step": 1896 + }, + { + "epoch": 0.2471690745802421, + "grad_norm": 2.742396354675293, + "learning_rate": 9.905322837283054e-06, + "loss": 0.7213, + "step": 1899 + }, + { + "epoch": 0.24755954705193284, + "grad_norm": 3.2880959510803223, + "learning_rate": 9.904917645848642e-06, + "loss": 0.7038, + "step": 1902 + }, + { + "epoch": 0.24795001952362358, + "grad_norm": 2.86783504486084, + "learning_rate": 9.904511597535435e-06, + "loss": 0.7333, + "step": 1905 + }, + { + "epoch": 0.24834049199531433, + "grad_norm": 2.66743803024292, + "learning_rate": 9.904104692414372e-06, + "loss": 0.6619, + "step": 1908 + }, + { + "epoch": 0.24873096446700507, + "grad_norm": 2.8109350204467773, + "learning_rate": 9.903696930556534e-06, + "loss": 0.6984, + "step": 1911 + }, + { + "epoch": 0.24912143693869582, + "grad_norm": 4.756025791168213, + "learning_rate": 9.903288312033158e-06, + "loss": 0.6782, + "step": 1914 + }, + { + "epoch": 0.24951190941038656, + "grad_norm": 3.219858169555664, + "learning_rate": 9.902878836915628e-06, + "loss": 0.7427, + "step": 1917 + }, + { + "epoch": 0.2499023818820773, + "grad_norm": 2.6937522888183594, + "learning_rate": 9.902468505275481e-06, + "loss": 0.6722, + "step": 1920 + }, + { + "epoch": 0.2502928543537681, + "grad_norm": 3.728074073791504, + "learning_rate": 9.9020573171844e-06, + "loss": 0.667, + "step": 1923 + }, + { + "epoch": 0.2506833268254588, + "grad_norm": 2.520673990249634, + "learning_rate": 9.901645272714216e-06, + "loss": 0.6062, + "step": 1926 + }, + { + "epoch": 0.25107379929714957, + "grad_norm": 3.0606751441955566, + "learning_rate": 9.901232371936916e-06, + "loss": 0.6204, + "step": 1929 + }, + { + "epoch": 0.2514642717688403, + "grad_norm": 3.0006275177001953, + "learning_rate": 9.90081861492463e-06, + "loss": 0.7612, + "step": 1932 + }, + { + "epoch": 0.25185474424053106, + "grad_norm": 3.9713451862335205, + "learning_rate": 9.900404001749643e-06, + "loss": 
0.7894, + "step": 1935 + }, + { + "epoch": 0.2522452167122218, + "grad_norm": 2.761209726333618, + "learning_rate": 9.899988532484386e-06, + "loss": 0.6158, + "step": 1938 + }, + { + "epoch": 0.25263568918391255, + "grad_norm": 2.8086750507354736, + "learning_rate": 9.89957220720144e-06, + "loss": 0.6952, + "step": 1941 + }, + { + "epoch": 0.25302616165560327, + "grad_norm": 5.129540920257568, + "learning_rate": 9.899155025973535e-06, + "loss": 0.6314, + "step": 1944 + }, + { + "epoch": 0.25341663412729404, + "grad_norm": 2.605057954788208, + "learning_rate": 9.898736988873555e-06, + "loss": 0.7729, + "step": 1947 + }, + { + "epoch": 0.25380710659898476, + "grad_norm": 4.0138044357299805, + "learning_rate": 9.898318095974529e-06, + "loss": 0.7074, + "step": 1950 + }, + { + "epoch": 0.25419757907067553, + "grad_norm": 2.733633041381836, + "learning_rate": 9.897898347349635e-06, + "loss": 0.6754, + "step": 1953 + }, + { + "epoch": 0.25458805154236624, + "grad_norm": 2.873657703399658, + "learning_rate": 9.897477743072203e-06, + "loss": 0.6609, + "step": 1956 + }, + { + "epoch": 0.254978524014057, + "grad_norm": 3.4984524250030518, + "learning_rate": 9.897056283215713e-06, + "loss": 0.6305, + "step": 1959 + }, + { + "epoch": 0.25536899648574773, + "grad_norm": 2.7125892639160156, + "learning_rate": 9.896633967853793e-06, + "loss": 0.6463, + "step": 1962 + }, + { + "epoch": 0.2557594689574385, + "grad_norm": 2.670562505722046, + "learning_rate": 9.896210797060218e-06, + "loss": 0.6669, + "step": 1965 + }, + { + "epoch": 0.2561499414291292, + "grad_norm": 3.608865261077881, + "learning_rate": 9.895786770908918e-06, + "loss": 0.6898, + "step": 1968 + }, + { + "epoch": 0.25654041390082, + "grad_norm": 3.5786221027374268, + "learning_rate": 9.895361889473969e-06, + "loss": 0.7525, + "step": 1971 + }, + { + "epoch": 0.2569308863725107, + "grad_norm": 3.3023688793182373, + "learning_rate": 9.894936152829595e-06, + "loss": 0.7672, + "step": 1974 + }, + { + "epoch": 0.2573213588442015, + "grad_norm": 2.93623948097229, + "learning_rate": 9.894509561050173e-06, + "loss": 0.7114, + "step": 1977 + }, + { + "epoch": 0.2577118313158922, + "grad_norm": 3.6028199195861816, + "learning_rate": 9.894082114210226e-06, + "loss": 0.6342, + "step": 1980 + }, + { + "epoch": 0.258102303787583, + "grad_norm": 3.522853374481201, + "learning_rate": 9.893653812384432e-06, + "loss": 0.6205, + "step": 1983 + }, + { + "epoch": 0.25849277625927375, + "grad_norm": 2.625506639480591, + "learning_rate": 9.893224655647609e-06, + "loss": 0.6737, + "step": 1986 + }, + { + "epoch": 0.25888324873096447, + "grad_norm": 3.548536777496338, + "learning_rate": 9.892794644074735e-06, + "loss": 0.6586, + "step": 1989 + }, + { + "epoch": 0.25927372120265524, + "grad_norm": 4.54545783996582, + "learning_rate": 9.892363777740928e-06, + "loss": 0.7415, + "step": 1992 + }, + { + "epoch": 0.25966419367434596, + "grad_norm": 2.687030792236328, + "learning_rate": 9.89193205672146e-06, + "loss": 0.5988, + "step": 1995 + }, + { + "epoch": 0.26005466614603673, + "grad_norm": 4.913947582244873, + "learning_rate": 9.891499481091755e-06, + "loss": 0.707, + "step": 1998 + }, + { + "epoch": 0.26044513861772745, + "grad_norm": 2.828240394592285, + "learning_rate": 9.891066050927381e-06, + "loss": 0.6659, + "step": 2001 + }, + { + "epoch": 0.2608356110894182, + "grad_norm": 2.456120014190674, + "learning_rate": 9.890631766304054e-06, + "loss": 0.743, + "step": 2004 + }, + { + "epoch": 0.26122608356110893, + "grad_norm": 3.4322896003723145, + "learning_rate": 
9.890196627297649e-06, + "loss": 0.6586, + "step": 2007 + }, + { + "epoch": 0.2616165560327997, + "grad_norm": 2.4019174575805664, + "learning_rate": 9.88976063398418e-06, + "loss": 0.6277, + "step": 2010 + }, + { + "epoch": 0.2620070285044904, + "grad_norm": 2.6000640392303467, + "learning_rate": 9.889323786439815e-06, + "loss": 0.6874, + "step": 2013 + }, + { + "epoch": 0.2623975009761812, + "grad_norm": 2.7879741191864014, + "learning_rate": 9.88888608474087e-06, + "loss": 0.6844, + "step": 2016 + }, + { + "epoch": 0.2627879734478719, + "grad_norm": 2.620607852935791, + "learning_rate": 9.888447528963809e-06, + "loss": 0.6139, + "step": 2019 + }, + { + "epoch": 0.2631784459195627, + "grad_norm": 3.1650354862213135, + "learning_rate": 9.88800811918525e-06, + "loss": 0.6203, + "step": 2022 + }, + { + "epoch": 0.2635689183912534, + "grad_norm": 2.8393919467926025, + "learning_rate": 9.887567855481955e-06, + "loss": 0.6931, + "step": 2025 + }, + { + "epoch": 0.2639593908629442, + "grad_norm": 3.6173481941223145, + "learning_rate": 9.88712673793084e-06, + "loss": 0.6675, + "step": 2028 + }, + { + "epoch": 0.2643498633346349, + "grad_norm": 3.9958341121673584, + "learning_rate": 9.88668476660896e-06, + "loss": 0.6857, + "step": 2031 + }, + { + "epoch": 0.26474033580632567, + "grad_norm": 2.9784352779388428, + "learning_rate": 9.886241941593535e-06, + "loss": 0.7343, + "step": 2034 + }, + { + "epoch": 0.2651308082780164, + "grad_norm": 3.043720006942749, + "learning_rate": 9.885798262961921e-06, + "loss": 0.6138, + "step": 2037 + }, + { + "epoch": 0.26552128074970716, + "grad_norm": 2.8800082206726074, + "learning_rate": 9.88535373079163e-06, + "loss": 0.7174, + "step": 2040 + }, + { + "epoch": 0.2659117532213979, + "grad_norm": 2.96820330619812, + "learning_rate": 9.884908345160318e-06, + "loss": 0.6844, + "step": 2043 + }, + { + "epoch": 0.26630222569308865, + "grad_norm": 4.006218910217285, + "learning_rate": 9.884462106145794e-06, + "loss": 0.5799, + "step": 2046 + }, + { + "epoch": 0.26669269816477936, + "grad_norm": 4.129579544067383, + "learning_rate": 9.884015013826015e-06, + "loss": 0.7025, + "step": 2049 + }, + { + "epoch": 0.26708317063647014, + "grad_norm": 6.496399879455566, + "learning_rate": 9.88356706827909e-06, + "loss": 0.6598, + "step": 2052 + }, + { + "epoch": 0.26747364310816085, + "grad_norm": 10.179603576660156, + "learning_rate": 9.88311826958327e-06, + "loss": 0.7294, + "step": 2055 + }, + { + "epoch": 0.2678641155798516, + "grad_norm": 4.363855838775635, + "learning_rate": 9.882668617816962e-06, + "loss": 0.6529, + "step": 2058 + }, + { + "epoch": 0.26825458805154234, + "grad_norm": 2.7566702365875244, + "learning_rate": 9.882218113058716e-06, + "loss": 0.6574, + "step": 2061 + }, + { + "epoch": 0.2686450605232331, + "grad_norm": 2.922041416168213, + "learning_rate": 9.88176675538724e-06, + "loss": 0.6732, + "step": 2064 + }, + { + "epoch": 0.26903553299492383, + "grad_norm": 4.118600368499756, + "learning_rate": 9.881314544881377e-06, + "loss": 0.6912, + "step": 2067 + }, + { + "epoch": 0.2694260054666146, + "grad_norm": 2.772783041000366, + "learning_rate": 9.880861481620134e-06, + "loss": 0.7012, + "step": 2070 + }, + { + "epoch": 0.2698164779383054, + "grad_norm": 2.855739116668701, + "learning_rate": 9.88040756568266e-06, + "loss": 0.7134, + "step": 2073 + }, + { + "epoch": 0.2702069504099961, + "grad_norm": 2.6456918716430664, + "learning_rate": 9.879952797148249e-06, + "loss": 0.6548, + "step": 2076 + }, + { + "epoch": 0.27059742288168687, + "grad_norm": 
2.943904399871826, + "learning_rate": 9.87949717609635e-06, + "loss": 0.6378, + "step": 2079 + }, + { + "epoch": 0.2709878953533776, + "grad_norm": 2.9040253162384033, + "learning_rate": 9.87904070260656e-06, + "loss": 0.649, + "step": 2082 + }, + { + "epoch": 0.27137836782506836, + "grad_norm": 2.9723544120788574, + "learning_rate": 9.878583376758623e-06, + "loss": 0.705, + "step": 2085 + }, + { + "epoch": 0.2717688402967591, + "grad_norm": 3.080580949783325, + "learning_rate": 9.878125198632433e-06, + "loss": 0.7454, + "step": 2088 + }, + { + "epoch": 0.27215931276844985, + "grad_norm": 2.676499843597412, + "learning_rate": 9.877666168308034e-06, + "loss": 0.7742, + "step": 2091 + }, + { + "epoch": 0.27254978524014056, + "grad_norm": 2.981276035308838, + "learning_rate": 9.877206285865614e-06, + "loss": 0.7173, + "step": 2094 + }, + { + "epoch": 0.27294025771183134, + "grad_norm": 2.9143402576446533, + "learning_rate": 9.876745551385519e-06, + "loss": 0.6747, + "step": 2097 + }, + { + "epoch": 0.27333073018352205, + "grad_norm": 2.7891063690185547, + "learning_rate": 9.876283964948232e-06, + "loss": 0.5657, + "step": 2100 + }, + { + "epoch": 0.2737212026552128, + "grad_norm": 2.83236026763916, + "learning_rate": 9.875821526634397e-06, + "loss": 0.7013, + "step": 2103 + }, + { + "epoch": 0.27411167512690354, + "grad_norm": 2.9020628929138184, + "learning_rate": 9.875358236524798e-06, + "loss": 0.6061, + "step": 2106 + }, + { + "epoch": 0.2745021475985943, + "grad_norm": 2.637514352798462, + "learning_rate": 9.874894094700372e-06, + "loss": 0.6832, + "step": 2109 + }, + { + "epoch": 0.27489262007028503, + "grad_norm": 2.8690152168273926, + "learning_rate": 9.874429101242202e-06, + "loss": 0.7544, + "step": 2112 + }, + { + "epoch": 0.2752830925419758, + "grad_norm": 3.029694080352783, + "learning_rate": 9.873963256231522e-06, + "loss": 0.7404, + "step": 2115 + }, + { + "epoch": 0.2756735650136665, + "grad_norm": 3.4346494674682617, + "learning_rate": 9.873496559749716e-06, + "loss": 0.6089, + "step": 2118 + }, + { + "epoch": 0.2760640374853573, + "grad_norm": 2.6336498260498047, + "learning_rate": 9.873029011878312e-06, + "loss": 0.7007, + "step": 2121 + }, + { + "epoch": 0.276454509957048, + "grad_norm": 2.4632461071014404, + "learning_rate": 9.872560612698992e-06, + "loss": 0.6416, + "step": 2124 + }, + { + "epoch": 0.2768449824287388, + "grad_norm": 2.5596401691436768, + "learning_rate": 9.872091362293581e-06, + "loss": 0.6107, + "step": 2127 + }, + { + "epoch": 0.2772354549004295, + "grad_norm": 3.3999154567718506, + "learning_rate": 9.87162126074406e-06, + "loss": 0.6935, + "step": 2130 + }, + { + "epoch": 0.2776259273721203, + "grad_norm": 3.2630927562713623, + "learning_rate": 9.871150308132554e-06, + "loss": 0.6553, + "step": 2133 + }, + { + "epoch": 0.278016399843811, + "grad_norm": 3.5304367542266846, + "learning_rate": 9.870678504541336e-06, + "loss": 0.7273, + "step": 2136 + }, + { + "epoch": 0.27840687231550176, + "grad_norm": 3.2613844871520996, + "learning_rate": 9.87020585005283e-06, + "loss": 0.6989, + "step": 2139 + }, + { + "epoch": 0.2787973447871925, + "grad_norm": 2.5427675247192383, + "learning_rate": 9.869732344749605e-06, + "loss": 0.7172, + "step": 2142 + }, + { + "epoch": 0.27918781725888325, + "grad_norm": 2.583522081375122, + "learning_rate": 9.869257988714386e-06, + "loss": 0.688, + "step": 2145 + }, + { + "epoch": 0.27957828973057397, + "grad_norm": 3.7895023822784424, + "learning_rate": 9.86878278203004e-06, + "loss": 0.6906, + "step": 2148 + }, + { + 
"epoch": 0.27996876220226474, + "grad_norm": 2.4927175045013428, + "learning_rate": 9.868306724779584e-06, + "loss": 0.661, + "step": 2151 + }, + { + "epoch": 0.28035923467395546, + "grad_norm": 2.5533857345581055, + "learning_rate": 9.867829817046183e-06, + "loss": 0.6139, + "step": 2154 + }, + { + "epoch": 0.28074970714564623, + "grad_norm": 3.4451935291290283, + "learning_rate": 9.867352058913156e-06, + "loss": 0.78, + "step": 2157 + }, + { + "epoch": 0.281140179617337, + "grad_norm": 2.7176883220672607, + "learning_rate": 9.866873450463963e-06, + "loss": 0.7304, + "step": 2160 + }, + { + "epoch": 0.2815306520890277, + "grad_norm": 2.8105108737945557, + "learning_rate": 9.866393991782215e-06, + "loss": 0.6057, + "step": 2163 + }, + { + "epoch": 0.2819211245607185, + "grad_norm": 4.555068492889404, + "learning_rate": 9.865913682951675e-06, + "loss": 0.7168, + "step": 2166 + }, + { + "epoch": 0.2823115970324092, + "grad_norm": 4.762774467468262, + "learning_rate": 9.865432524056252e-06, + "loss": 0.6895, + "step": 2169 + }, + { + "epoch": 0.2827020695041, + "grad_norm": 2.9473941326141357, + "learning_rate": 9.86495051518e-06, + "loss": 0.6986, + "step": 2172 + }, + { + "epoch": 0.2830925419757907, + "grad_norm": 3.883134603500366, + "learning_rate": 9.86446765640713e-06, + "loss": 0.6505, + "step": 2175 + }, + { + "epoch": 0.2834830144474815, + "grad_norm": 2.467970132827759, + "learning_rate": 9.863983947821993e-06, + "loss": 0.7058, + "step": 2178 + }, + { + "epoch": 0.2838734869191722, + "grad_norm": 2.5856029987335205, + "learning_rate": 9.863499389509092e-06, + "loss": 0.6564, + "step": 2181 + }, + { + "epoch": 0.28426395939086296, + "grad_norm": 3.4759273529052734, + "learning_rate": 9.86301398155308e-06, + "loss": 0.644, + "step": 2184 + }, + { + "epoch": 0.2846544318625537, + "grad_norm": 3.2896201610565186, + "learning_rate": 9.862527724038755e-06, + "loss": 0.6862, + "step": 2187 + }, + { + "epoch": 0.28504490433424445, + "grad_norm": 3.064748764038086, + "learning_rate": 9.862040617051065e-06, + "loss": 0.6644, + "step": 2190 + }, + { + "epoch": 0.28543537680593517, + "grad_norm": 2.8241119384765625, + "learning_rate": 9.861552660675109e-06, + "loss": 0.6167, + "step": 2193 + }, + { + "epoch": 0.28582584927762594, + "grad_norm": 2.6539626121520996, + "learning_rate": 9.86106385499613e-06, + "loss": 0.6695, + "step": 2196 + }, + { + "epoch": 0.28621632174931666, + "grad_norm": 3.7717602252960205, + "learning_rate": 9.86057420009952e-06, + "loss": 0.7306, + "step": 2199 + }, + { + "epoch": 0.28660679422100743, + "grad_norm": 3.5839555263519287, + "learning_rate": 9.860083696070825e-06, + "loss": 0.6344, + "step": 2202 + }, + { + "epoch": 0.28699726669269815, + "grad_norm": 2.718700408935547, + "learning_rate": 9.85959234299573e-06, + "loss": 0.6781, + "step": 2205 + }, + { + "epoch": 0.2873877391643889, + "grad_norm": 3.010596990585327, + "learning_rate": 9.859100140960079e-06, + "loss": 0.7763, + "step": 2208 + }, + { + "epoch": 0.28777821163607964, + "grad_norm": 2.9677681922912598, + "learning_rate": 9.858607090049851e-06, + "loss": 0.6675, + "step": 2211 + }, + { + "epoch": 0.2881686841077704, + "grad_norm": 2.874814510345459, + "learning_rate": 9.858113190351189e-06, + "loss": 0.8113, + "step": 2214 + }, + { + "epoch": 0.28855915657946113, + "grad_norm": 2.915309190750122, + "learning_rate": 9.857618441950372e-06, + "loss": 0.7219, + "step": 2217 + }, + { + "epoch": 0.2889496290511519, + "grad_norm": 3.3547255992889404, + "learning_rate": 9.857122844933831e-06, + "loss": 
0.689, + "step": 2220 + }, + { + "epoch": 0.2893401015228426, + "grad_norm": 2.769690752029419, + "learning_rate": 9.856626399388146e-06, + "loss": 0.6282, + "step": 2223 + }, + { + "epoch": 0.2897305739945334, + "grad_norm": 2.8011720180511475, + "learning_rate": 9.856129105400048e-06, + "loss": 0.6347, + "step": 2226 + }, + { + "epoch": 0.2901210464662241, + "grad_norm": 4.004727363586426, + "learning_rate": 9.855630963056411e-06, + "loss": 0.6332, + "step": 2229 + }, + { + "epoch": 0.2905115189379149, + "grad_norm": 3.3377795219421387, + "learning_rate": 9.855131972444259e-06, + "loss": 0.7273, + "step": 2232 + }, + { + "epoch": 0.2909019914096056, + "grad_norm": 4.1533379554748535, + "learning_rate": 9.854632133650765e-06, + "loss": 0.6631, + "step": 2235 + }, + { + "epoch": 0.29129246388129637, + "grad_norm": 3.070554256439209, + "learning_rate": 9.85413144676325e-06, + "loss": 0.6178, + "step": 2238 + }, + { + "epoch": 0.2916829363529871, + "grad_norm": 4.225495338439941, + "learning_rate": 9.853629911869182e-06, + "loss": 0.6509, + "step": 2241 + }, + { + "epoch": 0.29207340882467786, + "grad_norm": 3.233717441558838, + "learning_rate": 9.853127529056182e-06, + "loss": 0.7106, + "step": 2244 + }, + { + "epoch": 0.29246388129636863, + "grad_norm": 2.8367974758148193, + "learning_rate": 9.852624298412008e-06, + "loss": 0.6732, + "step": 2247 + }, + { + "epoch": 0.29285435376805935, + "grad_norm": 3.421760320663452, + "learning_rate": 9.852120220024579e-06, + "loss": 0.7217, + "step": 2250 + }, + { + "epoch": 0.2932448262397501, + "grad_norm": 3.042901039123535, + "learning_rate": 9.851615293981956e-06, + "loss": 0.7075, + "step": 2253 + }, + { + "epoch": 0.29363529871144084, + "grad_norm": 2.7977664470672607, + "learning_rate": 9.851109520372346e-06, + "loss": 0.6137, + "step": 2256 + }, + { + "epoch": 0.2940257711831316, + "grad_norm": 2.4614906311035156, + "learning_rate": 9.85060289928411e-06, + "loss": 0.621, + "step": 2259 + }, + { + "epoch": 0.29441624365482233, + "grad_norm": 2.9462318420410156, + "learning_rate": 9.85009543080575e-06, + "loss": 0.7196, + "step": 2262 + }, + { + "epoch": 0.2948067161265131, + "grad_norm": 2.9534754753112793, + "learning_rate": 9.849587115025923e-06, + "loss": 0.6909, + "step": 2265 + }, + { + "epoch": 0.2951971885982038, + "grad_norm": 2.725046157836914, + "learning_rate": 9.849077952033427e-06, + "loss": 0.676, + "step": 2268 + }, + { + "epoch": 0.2955876610698946, + "grad_norm": 2.515141487121582, + "learning_rate": 9.848567941917216e-06, + "loss": 0.6987, + "step": 2271 + }, + { + "epoch": 0.2959781335415853, + "grad_norm": 2.8634941577911377, + "learning_rate": 9.848057084766382e-06, + "loss": 0.6337, + "step": 2274 + }, + { + "epoch": 0.2963686060132761, + "grad_norm": 3.2547008991241455, + "learning_rate": 9.847545380670176e-06, + "loss": 0.6618, + "step": 2277 + }, + { + "epoch": 0.2967590784849668, + "grad_norm": 3.159667730331421, + "learning_rate": 9.847032829717991e-06, + "loss": 0.6848, + "step": 2280 + }, + { + "epoch": 0.29714955095665757, + "grad_norm": 3.1204230785369873, + "learning_rate": 9.846519431999366e-06, + "loss": 0.6681, + "step": 2283 + }, + { + "epoch": 0.2975400234283483, + "grad_norm": 2.5976922512054443, + "learning_rate": 9.846005187603992e-06, + "loss": 0.8029, + "step": 2286 + }, + { + "epoch": 0.29793049590003906, + "grad_norm": 3.485644578933716, + "learning_rate": 9.845490096621706e-06, + "loss": 0.6945, + "step": 2289 + }, + { + "epoch": 0.2983209683717298, + "grad_norm": 3.9062230587005615, + 
"learning_rate": 9.844974159142494e-06, + "loss": 0.6611, + "step": 2292 + }, + { + "epoch": 0.29871144084342055, + "grad_norm": 2.7590322494506836, + "learning_rate": 9.844457375256488e-06, + "loss": 0.6508, + "step": 2295 + }, + { + "epoch": 0.29910191331511127, + "grad_norm": 2.883934736251831, + "learning_rate": 9.84393974505397e-06, + "loss": 0.7574, + "step": 2298 + }, + { + "epoch": 0.29949238578680204, + "grad_norm": 2.7723381519317627, + "learning_rate": 9.843421268625371e-06, + "loss": 0.604, + "step": 2301 + }, + { + "epoch": 0.29988285825849276, + "grad_norm": 2.5612845420837402, + "learning_rate": 9.842901946061263e-06, + "loss": 0.6271, + "step": 2304 + }, + { + "epoch": 0.30027333073018353, + "grad_norm": 2.4496188163757324, + "learning_rate": 9.842381777452373e-06, + "loss": 0.6317, + "step": 2307 + }, + { + "epoch": 0.30066380320187425, + "grad_norm": 2.936002731323242, + "learning_rate": 9.841860762889574e-06, + "loss": 0.6229, + "step": 2310 + }, + { + "epoch": 0.301054275673565, + "grad_norm": 2.9964542388916016, + "learning_rate": 9.841338902463885e-06, + "loss": 0.667, + "step": 2313 + }, + { + "epoch": 0.30144474814525574, + "grad_norm": 3.358015775680542, + "learning_rate": 9.840816196266475e-06, + "loss": 0.6421, + "step": 2316 + }, + { + "epoch": 0.3018352206169465, + "grad_norm": 3.8633599281311035, + "learning_rate": 9.840292644388659e-06, + "loss": 0.6912, + "step": 2319 + }, + { + "epoch": 0.3022256930886372, + "grad_norm": 2.9916861057281494, + "learning_rate": 9.839768246921901e-06, + "loss": 0.6766, + "step": 2322 + }, + { + "epoch": 0.302616165560328, + "grad_norm": 3.3866844177246094, + "learning_rate": 9.83924300395781e-06, + "loss": 0.6397, + "step": 2325 + }, + { + "epoch": 0.3030066380320187, + "grad_norm": 2.5590834617614746, + "learning_rate": 9.838716915588148e-06, + "loss": 0.7354, + "step": 2328 + }, + { + "epoch": 0.3033971105037095, + "grad_norm": 2.6998791694641113, + "learning_rate": 9.83818998190482e-06, + "loss": 0.7159, + "step": 2331 + }, + { + "epoch": 0.30378758297540026, + "grad_norm": 2.2209694385528564, + "learning_rate": 9.837662202999879e-06, + "loss": 0.677, + "step": 2334 + }, + { + "epoch": 0.304178055447091, + "grad_norm": 3.1843392848968506, + "learning_rate": 9.83713357896553e-06, + "loss": 0.6385, + "step": 2337 + }, + { + "epoch": 0.30456852791878175, + "grad_norm": 2.818430185317993, + "learning_rate": 9.836604109894118e-06, + "loss": 0.643, + "step": 2340 + }, + { + "epoch": 0.30495900039047247, + "grad_norm": 2.380825996398926, + "learning_rate": 9.836073795878144e-06, + "loss": 0.6909, + "step": 2343 + }, + { + "epoch": 0.30534947286216324, + "grad_norm": 2.392238140106201, + "learning_rate": 9.835542637010253e-06, + "loss": 0.6184, + "step": 2346 + }, + { + "epoch": 0.30573994533385396, + "grad_norm": 3.895524263381958, + "learning_rate": 9.835010633383234e-06, + "loss": 0.735, + "step": 2349 + }, + { + "epoch": 0.30613041780554473, + "grad_norm": 2.8263940811157227, + "learning_rate": 9.834477785090028e-06, + "loss": 0.6762, + "step": 2352 + }, + { + "epoch": 0.30652089027723545, + "grad_norm": 3.5129168033599854, + "learning_rate": 9.833944092223725e-06, + "loss": 0.746, + "step": 2355 + }, + { + "epoch": 0.3069113627489262, + "grad_norm": 2.720536947250366, + "learning_rate": 9.833409554877558e-06, + "loss": 0.6833, + "step": 2358 + }, + { + "epoch": 0.30730183522061694, + "grad_norm": 2.6948177814483643, + "learning_rate": 9.83287417314491e-06, + "loss": 0.6223, + "step": 2361 + }, + { + "epoch": 
0.3076923076923077, + "grad_norm": 2.750875234603882, + "learning_rate": 9.832337947119311e-06, + "loss": 0.7024, + "step": 2364 + }, + { + "epoch": 0.3080827801639984, + "grad_norm": 3.7236342430114746, + "learning_rate": 9.831800876894436e-06, + "loss": 0.7258, + "step": 2367 + }, + { + "epoch": 0.3084732526356892, + "grad_norm": 2.5590686798095703, + "learning_rate": 9.831262962564114e-06, + "loss": 0.6718, + "step": 2370 + }, + { + "epoch": 0.3088637251073799, + "grad_norm": 3.692631483078003, + "learning_rate": 9.830724204222316e-06, + "loss": 0.6383, + "step": 2373 + }, + { + "epoch": 0.3092541975790707, + "grad_norm": 2.6844470500946045, + "learning_rate": 9.830184601963162e-06, + "loss": 0.6747, + "step": 2376 + }, + { + "epoch": 0.3096446700507614, + "grad_norm": 3.532243490219116, + "learning_rate": 9.82964415588092e-06, + "loss": 0.72, + "step": 2379 + }, + { + "epoch": 0.3100351425224522, + "grad_norm": 2.9188520908355713, + "learning_rate": 9.829102866070002e-06, + "loss": 0.6895, + "step": 2382 + }, + { + "epoch": 0.3104256149941429, + "grad_norm": 2.847670316696167, + "learning_rate": 9.828560732624974e-06, + "loss": 0.7294, + "step": 2385 + }, + { + "epoch": 0.31081608746583367, + "grad_norm": 2.8164069652557373, + "learning_rate": 9.828017755640543e-06, + "loss": 0.7019, + "step": 2388 + }, + { + "epoch": 0.3112065599375244, + "grad_norm": 2.716837167739868, + "learning_rate": 9.827473935211567e-06, + "loss": 0.7291, + "step": 2391 + }, + { + "epoch": 0.31159703240921516, + "grad_norm": 2.867050886154175, + "learning_rate": 9.82692927143305e-06, + "loss": 0.6845, + "step": 2394 + }, + { + "epoch": 0.3119875048809059, + "grad_norm": 2.4712677001953125, + "learning_rate": 9.826383764400143e-06, + "loss": 0.5568, + "step": 2397 + }, + { + "epoch": 0.31237797735259665, + "grad_norm": 2.6688764095306396, + "learning_rate": 9.825837414208147e-06, + "loss": 0.6441, + "step": 2400 + }, + { + "epoch": 0.31276844982428736, + "grad_norm": 2.8412606716156006, + "learning_rate": 9.825290220952507e-06, + "loss": 0.7189, + "step": 2403 + }, + { + "epoch": 0.31315892229597814, + "grad_norm": 4.064339637756348, + "learning_rate": 9.824742184728815e-06, + "loss": 0.595, + "step": 2406 + }, + { + "epoch": 0.31354939476766885, + "grad_norm": 2.6639394760131836, + "learning_rate": 9.824193305632814e-06, + "loss": 0.6398, + "step": 2409 + }, + { + "epoch": 0.3139398672393596, + "grad_norm": 3.3248283863067627, + "learning_rate": 9.823643583760389e-06, + "loss": 0.6377, + "step": 2412 + }, + { + "epoch": 0.31433033971105034, + "grad_norm": 4.568389415740967, + "learning_rate": 9.823093019207578e-06, + "loss": 0.687, + "step": 2415 + }, + { + "epoch": 0.3147208121827411, + "grad_norm": 3.337682008743286, + "learning_rate": 9.822541612070566e-06, + "loss": 0.6771, + "step": 2418 + }, + { + "epoch": 0.3151112846544319, + "grad_norm": 2.871934652328491, + "learning_rate": 9.821989362445676e-06, + "loss": 0.7428, + "step": 2421 + }, + { + "epoch": 0.3155017571261226, + "grad_norm": 3.3546926975250244, + "learning_rate": 9.82143627042939e-06, + "loss": 0.6353, + "step": 2424 + }, + { + "epoch": 0.3158922295978134, + "grad_norm": 2.4146947860717773, + "learning_rate": 9.820882336118332e-06, + "loss": 0.6301, + "step": 2427 + }, + { + "epoch": 0.3162827020695041, + "grad_norm": 2.7984747886657715, + "learning_rate": 9.820327559609268e-06, + "loss": 0.5877, + "step": 2430 + }, + { + "epoch": 0.31667317454119487, + "grad_norm": 4.265294551849365, + "learning_rate": 9.819771940999123e-06, + "loss": 0.6227, 
+ "step": 2433 + }, + { + "epoch": 0.3170636470128856, + "grad_norm": 2.921192169189453, + "learning_rate": 9.819215480384956e-06, + "loss": 0.6851, + "step": 2436 + }, + { + "epoch": 0.31745411948457636, + "grad_norm": 2.67498517036438, + "learning_rate": 9.818658177863985e-06, + "loss": 0.5766, + "step": 2439 + }, + { + "epoch": 0.3178445919562671, + "grad_norm": 2.93373703956604, + "learning_rate": 9.818100033533567e-06, + "loss": 0.7525, + "step": 2442 + }, + { + "epoch": 0.31823506442795785, + "grad_norm": 2.9454703330993652, + "learning_rate": 9.817541047491209e-06, + "loss": 0.7152, + "step": 2445 + }, + { + "epoch": 0.31862553689964856, + "grad_norm": 2.5796053409576416, + "learning_rate": 9.816981219834565e-06, + "loss": 0.6848, + "step": 2448 + }, + { + "epoch": 0.31901600937133934, + "grad_norm": 3.1265947818756104, + "learning_rate": 9.816420550661434e-06, + "loss": 0.7028, + "step": 2451 + }, + { + "epoch": 0.31940648184303005, + "grad_norm": 2.7802746295928955, + "learning_rate": 9.815859040069766e-06, + "loss": 0.6456, + "step": 2454 + }, + { + "epoch": 0.3197969543147208, + "grad_norm": 3.0092685222625732, + "learning_rate": 9.815296688157654e-06, + "loss": 0.6739, + "step": 2457 + }, + { + "epoch": 0.32018742678641154, + "grad_norm": 3.065697193145752, + "learning_rate": 9.81473349502334e-06, + "loss": 0.6604, + "step": 2460 + }, + { + "epoch": 0.3205778992581023, + "grad_norm": 2.455078601837158, + "learning_rate": 9.814169460765215e-06, + "loss": 0.6433, + "step": 2463 + }, + { + "epoch": 0.32096837172979303, + "grad_norm": 2.841989040374756, + "learning_rate": 9.813604585481812e-06, + "loss": 0.6915, + "step": 2466 + }, + { + "epoch": 0.3213588442014838, + "grad_norm": 2.2834136486053467, + "learning_rate": 9.813038869271814e-06, + "loss": 0.634, + "step": 2469 + }, + { + "epoch": 0.3217493166731745, + "grad_norm": 3.262573719024658, + "learning_rate": 9.81247231223405e-06, + "loss": 0.6987, + "step": 2472 + }, + { + "epoch": 0.3221397891448653, + "grad_norm": 2.7389028072357178, + "learning_rate": 9.811904914467498e-06, + "loss": 0.6553, + "step": 2475 + }, + { + "epoch": 0.322530261616556, + "grad_norm": 2.779263973236084, + "learning_rate": 9.81133667607128e-06, + "loss": 0.7449, + "step": 2478 + }, + { + "epoch": 0.3229207340882468, + "grad_norm": 2.815535306930542, + "learning_rate": 9.810767597144668e-06, + "loss": 0.5643, + "step": 2481 + }, + { + "epoch": 0.3233112065599375, + "grad_norm": 2.794158458709717, + "learning_rate": 9.810197677787074e-06, + "loss": 0.7033, + "step": 2484 + }, + { + "epoch": 0.3237016790316283, + "grad_norm": 3.1413052082061768, + "learning_rate": 9.809626918098066e-06, + "loss": 0.6482, + "step": 2487 + }, + { + "epoch": 0.324092151503319, + "grad_norm": 2.798581838607788, + "learning_rate": 9.809055318177353e-06, + "loss": 0.6147, + "step": 2490 + }, + { + "epoch": 0.32448262397500977, + "grad_norm": 3.2870383262634277, + "learning_rate": 9.808482878124794e-06, + "loss": 0.6716, + "step": 2493 + }, + { + "epoch": 0.3248730964467005, + "grad_norm": 2.5646088123321533, + "learning_rate": 9.807909598040392e-06, + "loss": 0.6022, + "step": 2496 + }, + { + "epoch": 0.32526356891839125, + "grad_norm": 3.979936361312866, + "learning_rate": 9.807335478024297e-06, + "loss": 0.6804, + "step": 2499 + }, + { + "epoch": 0.32565404139008197, + "grad_norm": 3.786860704421997, + "learning_rate": 9.806760518176806e-06, + "loss": 0.6807, + "step": 2502 + }, + { + "epoch": 0.32604451386177274, + "grad_norm": 3.001896619796753, + "learning_rate": 
9.806184718598365e-06, + "loss": 0.6799, + "step": 2505 + }, + { + "epoch": 0.3264349863334635, + "grad_norm": 2.965797185897827, + "learning_rate": 9.805608079389566e-06, + "loss": 0.7123, + "step": 2508 + }, + { + "epoch": 0.32682545880515423, + "grad_norm": 5.5680460929870605, + "learning_rate": 9.805030600651143e-06, + "loss": 0.6111, + "step": 2511 + }, + { + "epoch": 0.327215931276845, + "grad_norm": 3.2845263481140137, + "learning_rate": 9.804452282483983e-06, + "loss": 0.614, + "step": 2514 + }, + { + "epoch": 0.3276064037485357, + "grad_norm": 2.4378509521484375, + "learning_rate": 9.803873124989116e-06, + "loss": 0.712, + "step": 2517 + }, + { + "epoch": 0.3279968762202265, + "grad_norm": 2.590458393096924, + "learning_rate": 9.803293128267721e-06, + "loss": 0.62, + "step": 2520 + }, + { + "epoch": 0.3283873486919172, + "grad_norm": 2.5005409717559814, + "learning_rate": 9.802712292421121e-06, + "loss": 0.7454, + "step": 2523 + }, + { + "epoch": 0.328777821163608, + "grad_norm": 3.139564037322998, + "learning_rate": 9.802130617550788e-06, + "loss": 0.7651, + "step": 2526 + }, + { + "epoch": 0.3291682936352987, + "grad_norm": 3.2621004581451416, + "learning_rate": 9.801548103758335e-06, + "loss": 0.7262, + "step": 2529 + }, + { + "epoch": 0.3295587661069895, + "grad_norm": 2.5339832305908203, + "learning_rate": 9.800964751145533e-06, + "loss": 0.6914, + "step": 2532 + }, + { + "epoch": 0.3299492385786802, + "grad_norm": 3.3442342281341553, + "learning_rate": 9.800380559814284e-06, + "loss": 0.6403, + "step": 2535 + }, + { + "epoch": 0.33033971105037097, + "grad_norm": 2.522374153137207, + "learning_rate": 9.799795529866654e-06, + "loss": 0.6177, + "step": 2538 + }, + { + "epoch": 0.3307301835220617, + "grad_norm": 2.8395447731018066, + "learning_rate": 9.79920966140484e-06, + "loss": 0.6954, + "step": 2541 + }, + { + "epoch": 0.33112065599375246, + "grad_norm": 2.558065891265869, + "learning_rate": 9.798622954531194e-06, + "loss": 0.6864, + "step": 2544 + }, + { + "epoch": 0.3315111284654432, + "grad_norm": 3.8972573280334473, + "learning_rate": 9.798035409348214e-06, + "loss": 0.6951, + "step": 2547 + }, + { + "epoch": 0.33190160093713394, + "grad_norm": 3.3120248317718506, + "learning_rate": 9.797447025958542e-06, + "loss": 0.5917, + "step": 2550 + }, + { + "epoch": 0.33229207340882466, + "grad_norm": 2.999908685684204, + "learning_rate": 9.796857804464966e-06, + "loss": 0.6607, + "step": 2553 + }, + { + "epoch": 0.33268254588051543, + "grad_norm": 2.7225866317749023, + "learning_rate": 9.796267744970423e-06, + "loss": 0.7139, + "step": 2556 + }, + { + "epoch": 0.33307301835220615, + "grad_norm": 2.655848741531372, + "learning_rate": 9.795676847577995e-06, + "loss": 0.7146, + "step": 2559 + }, + { + "epoch": 0.3334634908238969, + "grad_norm": 3.1451730728149414, + "learning_rate": 9.795085112390909e-06, + "loss": 0.5869, + "step": 2562 + }, + { + "epoch": 0.33385396329558764, + "grad_norm": 3.549569845199585, + "learning_rate": 9.794492539512544e-06, + "loss": 0.6792, + "step": 2565 + }, + { + "epoch": 0.3342444357672784, + "grad_norm": 2.678586959838867, + "learning_rate": 9.793899129046417e-06, + "loss": 0.5984, + "step": 2568 + }, + { + "epoch": 0.33463490823896913, + "grad_norm": 2.842653274536133, + "learning_rate": 9.793304881096195e-06, + "loss": 0.6587, + "step": 2571 + }, + { + "epoch": 0.3350253807106599, + "grad_norm": 2.715420722961426, + "learning_rate": 9.792709795765695e-06, + "loss": 0.6969, + "step": 2574 + }, + { + "epoch": 0.3354158531823506, + "grad_norm": 
3.2462103366851807, + "learning_rate": 9.792113873158877e-06, + "loss": 0.7084, + "step": 2577 + }, + { + "epoch": 0.3358063256540414, + "grad_norm": 3.362494468688965, + "learning_rate": 9.791517113379847e-06, + "loss": 0.7104, + "step": 2580 + }, + { + "epoch": 0.3361967981257321, + "grad_norm": 2.433021306991577, + "learning_rate": 9.790919516532856e-06, + "loss": 0.6128, + "step": 2583 + }, + { + "epoch": 0.3365872705974229, + "grad_norm": 2.534708261489868, + "learning_rate": 9.790321082722306e-06, + "loss": 0.6569, + "step": 2586 + }, + { + "epoch": 0.3369777430691136, + "grad_norm": 2.7385549545288086, + "learning_rate": 9.789721812052738e-06, + "loss": 0.6337, + "step": 2589 + }, + { + "epoch": 0.3373682155408044, + "grad_norm": 2.9946787357330322, + "learning_rate": 9.789121704628845e-06, + "loss": 0.6794, + "step": 2592 + }, + { + "epoch": 0.33775868801249515, + "grad_norm": 2.673137664794922, + "learning_rate": 9.788520760555467e-06, + "loss": 0.6952, + "step": 2595 + }, + { + "epoch": 0.33814916048418586, + "grad_norm": 2.8383102416992188, + "learning_rate": 9.787918979937584e-06, + "loss": 0.7175, + "step": 2598 + }, + { + "epoch": 0.33853963295587663, + "grad_norm": 2.792264223098755, + "learning_rate": 9.78731636288033e-06, + "loss": 0.6472, + "step": 2601 + }, + { + "epoch": 0.33893010542756735, + "grad_norm": 3.444775342941284, + "learning_rate": 9.786712909488976e-06, + "loss": 0.6164, + "step": 2604 + }, + { + "epoch": 0.3393205778992581, + "grad_norm": 2.6704118251800537, + "learning_rate": 9.786108619868948e-06, + "loss": 0.6913, + "step": 2607 + }, + { + "epoch": 0.33971105037094884, + "grad_norm": 3.128438949584961, + "learning_rate": 9.785503494125812e-06, + "loss": 0.6493, + "step": 2610 + }, + { + "epoch": 0.3401015228426396, + "grad_norm": 3.342224359512329, + "learning_rate": 9.784897532365283e-06, + "loss": 0.7241, + "step": 2613 + }, + { + "epoch": 0.34049199531433033, + "grad_norm": 3.057779550552368, + "learning_rate": 9.78429073469322e-06, + "loss": 0.6189, + "step": 2616 + }, + { + "epoch": 0.3408824677860211, + "grad_norm": 2.809861898422241, + "learning_rate": 9.783683101215632e-06, + "loss": 0.6926, + "step": 2619 + }, + { + "epoch": 0.3412729402577118, + "grad_norm": 2.8063440322875977, + "learning_rate": 9.783074632038669e-06, + "loss": 0.7242, + "step": 2622 + }, + { + "epoch": 0.3416634127294026, + "grad_norm": 2.8597209453582764, + "learning_rate": 9.78246532726863e-06, + "loss": 0.6391, + "step": 2625 + }, + { + "epoch": 0.3420538852010933, + "grad_norm": 2.6011080741882324, + "learning_rate": 9.78185518701196e-06, + "loss": 0.6819, + "step": 2628 + }, + { + "epoch": 0.3424443576727841, + "grad_norm": 3.5617270469665527, + "learning_rate": 9.781244211375247e-06, + "loss": 0.733, + "step": 2631 + }, + { + "epoch": 0.3428348301444748, + "grad_norm": 3.6274831295013428, + "learning_rate": 9.78063240046523e-06, + "loss": 0.7171, + "step": 2634 + }, + { + "epoch": 0.3432253026161656, + "grad_norm": 2.6554691791534424, + "learning_rate": 9.780019754388786e-06, + "loss": 0.7321, + "step": 2637 + }, + { + "epoch": 0.3436157750878563, + "grad_norm": 2.574505090713501, + "learning_rate": 9.779406273252949e-06, + "loss": 0.7734, + "step": 2640 + }, + { + "epoch": 0.34400624755954706, + "grad_norm": 4.732491970062256, + "learning_rate": 9.77879195716489e-06, + "loss": 0.7968, + "step": 2643 + }, + { + "epoch": 0.3443967200312378, + "grad_norm": 3.508035182952881, + "learning_rate": 9.778176806231931e-06, + "loss": 0.6802, + "step": 2646 + }, + { + "epoch": 
0.34478719250292855, + "grad_norm": 2.745506763458252, + "learning_rate": 9.777560820561533e-06, + "loss": 0.6712, + "step": 2649 + }, + { + "epoch": 0.34517766497461927, + "grad_norm": 2.6209089756011963, + "learning_rate": 9.776944000261313e-06, + "loss": 0.6477, + "step": 2652 + }, + { + "epoch": 0.34556813744631004, + "grad_norm": 3.0945277214050293, + "learning_rate": 9.776326345439024e-06, + "loss": 0.6644, + "step": 2655 + }, + { + "epoch": 0.34595860991800076, + "grad_norm": 3.156196117401123, + "learning_rate": 9.77570785620257e-06, + "loss": 0.6713, + "step": 2658 + }, + { + "epoch": 0.34634908238969153, + "grad_norm": 3.839165210723877, + "learning_rate": 9.77508853266e-06, + "loss": 0.7557, + "step": 2661 + }, + { + "epoch": 0.34673955486138225, + "grad_norm": 3.0956363677978516, + "learning_rate": 9.77446837491951e-06, + "loss": 0.7111, + "step": 2664 + }, + { + "epoch": 0.347130027333073, + "grad_norm": 2.516638994216919, + "learning_rate": 9.773847383089439e-06, + "loss": 0.6385, + "step": 2667 + }, + { + "epoch": 0.34752049980476374, + "grad_norm": 2.5300917625427246, + "learning_rate": 9.773225557278272e-06, + "loss": 0.6205, + "step": 2670 + }, + { + "epoch": 0.3479109722764545, + "grad_norm": 2.5988070964813232, + "learning_rate": 9.77260289759464e-06, + "loss": 0.6469, + "step": 2673 + }, + { + "epoch": 0.34830144474814523, + "grad_norm": 2.493129014968872, + "learning_rate": 9.771979404147324e-06, + "loss": 0.6266, + "step": 2676 + }, + { + "epoch": 0.348691917219836, + "grad_norm": 2.7826688289642334, + "learning_rate": 9.771355077045244e-06, + "loss": 0.6741, + "step": 2679 + }, + { + "epoch": 0.3490823896915268, + "grad_norm": 2.9177565574645996, + "learning_rate": 9.77072991639747e-06, + "loss": 0.764, + "step": 2682 + }, + { + "epoch": 0.3494728621632175, + "grad_norm": 3.730968475341797, + "learning_rate": 9.770103922313215e-06, + "loss": 0.5789, + "step": 2685 + }, + { + "epoch": 0.34986333463490826, + "grad_norm": 2.848114490509033, + "learning_rate": 9.76947709490184e-06, + "loss": 0.6656, + "step": 2688 + }, + { + "epoch": 0.350253807106599, + "grad_norm": 3.0486176013946533, + "learning_rate": 9.768849434272851e-06, + "loss": 0.6857, + "step": 2691 + }, + { + "epoch": 0.35064427957828975, + "grad_norm": 2.6924357414245605, + "learning_rate": 9.768220940535897e-06, + "loss": 0.6939, + "step": 2694 + }, + { + "epoch": 0.35103475204998047, + "grad_norm": 3.603853225708008, + "learning_rate": 9.767591613800775e-06, + "loss": 0.6395, + "step": 2697 + }, + { + "epoch": 0.35142522452167124, + "grad_norm": 2.700711250305176, + "learning_rate": 9.76696145417743e-06, + "loss": 0.6732, + "step": 2700 + }, + { + "epoch": 0.35181569699336196, + "grad_norm": 2.3906078338623047, + "learning_rate": 9.766330461775944e-06, + "loss": 0.5556, + "step": 2703 + }, + { + "epoch": 0.35220616946505273, + "grad_norm": 2.9107656478881836, + "learning_rate": 9.765698636706555e-06, + "loss": 0.7257, + "step": 2706 + }, + { + "epoch": 0.35259664193674345, + "grad_norm": 2.9227166175842285, + "learning_rate": 9.765065979079639e-06, + "loss": 0.5716, + "step": 2709 + }, + { + "epoch": 0.3529871144084342, + "grad_norm": 3.1230244636535645, + "learning_rate": 9.764432489005722e-06, + "loss": 0.6969, + "step": 2712 + }, + { + "epoch": 0.35337758688012494, + "grad_norm": 2.984856128692627, + "learning_rate": 9.763798166595473e-06, + "loss": 0.6435, + "step": 2715 + }, + { + "epoch": 0.3537680593518157, + "grad_norm": 2.4585700035095215, + "learning_rate": 9.763163011959702e-06, + "loss": 
0.7096, + "step": 2718 + }, + { + "epoch": 0.35415853182350643, + "grad_norm": 2.7415413856506348, + "learning_rate": 9.762527025209377e-06, + "loss": 0.7252, + "step": 2721 + }, + { + "epoch": 0.3545490042951972, + "grad_norm": 3.0851871967315674, + "learning_rate": 9.761890206455597e-06, + "loss": 0.6221, + "step": 2724 + }, + { + "epoch": 0.3549394767668879, + "grad_norm": 4.111907482147217, + "learning_rate": 9.761252555809616e-06, + "loss": 0.6155, + "step": 2727 + }, + { + "epoch": 0.3553299492385787, + "grad_norm": 3.664571762084961, + "learning_rate": 9.76061407338283e-06, + "loss": 0.6705, + "step": 2730 + }, + { + "epoch": 0.3557204217102694, + "grad_norm": 2.795311689376831, + "learning_rate": 9.75997475928678e-06, + "loss": 0.6785, + "step": 2733 + }, + { + "epoch": 0.3561108941819602, + "grad_norm": 3.0778064727783203, + "learning_rate": 9.759334613633154e-06, + "loss": 0.7244, + "step": 2736 + }, + { + "epoch": 0.3565013666536509, + "grad_norm": 3.36773943901062, + "learning_rate": 9.758693636533782e-06, + "loss": 0.6062, + "step": 2739 + }, + { + "epoch": 0.35689183912534167, + "grad_norm": 2.6564176082611084, + "learning_rate": 9.758051828100643e-06, + "loss": 0.7016, + "step": 2742 + }, + { + "epoch": 0.3572823115970324, + "grad_norm": 2.544980049133301, + "learning_rate": 9.75740918844586e-06, + "loss": 0.6046, + "step": 2745 + }, + { + "epoch": 0.35767278406872316, + "grad_norm": 3.575075626373291, + "learning_rate": 9.756765717681698e-06, + "loss": 0.663, + "step": 2748 + }, + { + "epoch": 0.3580632565404139, + "grad_norm": 2.658778429031372, + "learning_rate": 9.756121415920572e-06, + "loss": 0.6775, + "step": 2751 + }, + { + "epoch": 0.35845372901210465, + "grad_norm": 2.7959978580474854, + "learning_rate": 9.755476283275042e-06, + "loss": 0.7469, + "step": 2754 + }, + { + "epoch": 0.35884420148379537, + "grad_norm": 2.5027709007263184, + "learning_rate": 9.754830319857809e-06, + "loss": 0.6239, + "step": 2757 + }, + { + "epoch": 0.35923467395548614, + "grad_norm": 2.6545188426971436, + "learning_rate": 9.75418352578172e-06, + "loss": 0.6756, + "step": 2760 + }, + { + "epoch": 0.3596251464271769, + "grad_norm": 2.5996038913726807, + "learning_rate": 9.753535901159772e-06, + "loss": 0.7895, + "step": 2763 + }, + { + "epoch": 0.36001561889886763, + "grad_norm": 2.747310161590576, + "learning_rate": 9.752887446105101e-06, + "loss": 0.6933, + "step": 2766 + }, + { + "epoch": 0.3604060913705584, + "grad_norm": 2.4019601345062256, + "learning_rate": 9.752238160730994e-06, + "loss": 0.5939, + "step": 2769 + }, + { + "epoch": 0.3607965638422491, + "grad_norm": 3.0222368240356445, + "learning_rate": 9.751588045150875e-06, + "loss": 0.7407, + "step": 2772 + }, + { + "epoch": 0.3611870363139399, + "grad_norm": 3.187483310699463, + "learning_rate": 9.750937099478322e-06, + "loss": 0.6874, + "step": 2775 + }, + { + "epoch": 0.3615775087856306, + "grad_norm": 2.897294282913208, + "learning_rate": 9.750285323827051e-06, + "loss": 0.7485, + "step": 2778 + }, + { + "epoch": 0.3619679812573214, + "grad_norm": 2.5609171390533447, + "learning_rate": 9.749632718310927e-06, + "loss": 0.6426, + "step": 2781 + }, + { + "epoch": 0.3623584537290121, + "grad_norm": 2.4874250888824463, + "learning_rate": 9.74897928304396e-06, + "loss": 0.6134, + "step": 2784 + }, + { + "epoch": 0.36274892620070287, + "grad_norm": 3.737605333328247, + "learning_rate": 9.748325018140301e-06, + "loss": 0.661, + "step": 2787 + }, + { + "epoch": 0.3631393986723936, + "grad_norm": 2.8321633338928223, + 
"learning_rate": 9.747669923714252e-06, + "loss": 0.7641, + "step": 2790 + }, + { + "epoch": 0.36352987114408436, + "grad_norm": 2.5597903728485107, + "learning_rate": 9.747013999880255e-06, + "loss": 0.6625, + "step": 2793 + }, + { + "epoch": 0.3639203436157751, + "grad_norm": 2.5689401626586914, + "learning_rate": 9.746357246752898e-06, + "loss": 0.6729, + "step": 2796 + }, + { + "epoch": 0.36431081608746585, + "grad_norm": 2.8747706413269043, + "learning_rate": 9.745699664446914e-06, + "loss": 0.6406, + "step": 2799 + }, + { + "epoch": 0.36470128855915657, + "grad_norm": 2.724287271499634, + "learning_rate": 9.745041253077183e-06, + "loss": 0.6439, + "step": 2802 + }, + { + "epoch": 0.36509176103084734, + "grad_norm": 3.9166243076324463, + "learning_rate": 9.744382012758727e-06, + "loss": 0.6298, + "step": 2805 + }, + { + "epoch": 0.36548223350253806, + "grad_norm": 3.197875499725342, + "learning_rate": 9.743721943606715e-06, + "loss": 0.6032, + "step": 2808 + }, + { + "epoch": 0.36587270597422883, + "grad_norm": 2.6559009552001953, + "learning_rate": 9.743061045736457e-06, + "loss": 0.6065, + "step": 2811 + }, + { + "epoch": 0.36626317844591955, + "grad_norm": 2.4632251262664795, + "learning_rate": 9.742399319263414e-06, + "loss": 0.6398, + "step": 2814 + }, + { + "epoch": 0.3666536509176103, + "grad_norm": 3.159674882888794, + "learning_rate": 9.741736764303185e-06, + "loss": 0.5769, + "step": 2817 + }, + { + "epoch": 0.36704412338930104, + "grad_norm": 2.6667957305908203, + "learning_rate": 9.74107338097152e-06, + "loss": 0.7085, + "step": 2820 + }, + { + "epoch": 0.3674345958609918, + "grad_norm": 2.641826629638672, + "learning_rate": 9.740409169384308e-06, + "loss": 0.6944, + "step": 2823 + }, + { + "epoch": 0.3678250683326825, + "grad_norm": 3.9662795066833496, + "learning_rate": 9.739744129657586e-06, + "loss": 0.6217, + "step": 2826 + }, + { + "epoch": 0.3682155408043733, + "grad_norm": 3.6376278400421143, + "learning_rate": 9.739078261907537e-06, + "loss": 0.7337, + "step": 2829 + }, + { + "epoch": 0.368606013276064, + "grad_norm": 3.675191640853882, + "learning_rate": 9.738411566250485e-06, + "loss": 0.6678, + "step": 2832 + }, + { + "epoch": 0.3689964857477548, + "grad_norm": 2.5543577671051025, + "learning_rate": 9.7377440428029e-06, + "loss": 0.577, + "step": 2835 + }, + { + "epoch": 0.3693869582194455, + "grad_norm": 3.843170166015625, + "learning_rate": 9.737075691681398e-06, + "loss": 0.7626, + "step": 2838 + }, + { + "epoch": 0.3697774306911363, + "grad_norm": 2.62917423248291, + "learning_rate": 9.736406513002737e-06, + "loss": 0.6669, + "step": 2841 + }, + { + "epoch": 0.370167903162827, + "grad_norm": 2.891164541244507, + "learning_rate": 9.735736506883822e-06, + "loss": 0.6709, + "step": 2844 + }, + { + "epoch": 0.37055837563451777, + "grad_norm": 2.4671757221221924, + "learning_rate": 9.735065673441702e-06, + "loss": 0.5747, + "step": 2847 + }, + { + "epoch": 0.37094884810620854, + "grad_norm": 3.6108286380767822, + "learning_rate": 9.73439401279357e-06, + "loss": 0.6485, + "step": 2850 + }, + { + "epoch": 0.37133932057789926, + "grad_norm": 2.4093971252441406, + "learning_rate": 9.733721525056764e-06, + "loss": 0.6907, + "step": 2853 + }, + { + "epoch": 0.37172979304959003, + "grad_norm": 4.289933681488037, + "learning_rate": 9.733048210348767e-06, + "loss": 0.6965, + "step": 2856 + }, + { + "epoch": 0.37212026552128075, + "grad_norm": 2.8142073154449463, + "learning_rate": 9.732374068787202e-06, + "loss": 0.7055, + "step": 2859 + }, + { + "epoch": 
0.3725107379929715, + "grad_norm": 2.733341693878174, + "learning_rate": 9.731699100489845e-06, + "loss": 0.7459, + "step": 2862 + }, + { + "epoch": 0.37290121046466224, + "grad_norm": 3.4828221797943115, + "learning_rate": 9.731023305574608e-06, + "loss": 0.6431, + "step": 2865 + }, + { + "epoch": 0.373291682936353, + "grad_norm": 2.436321258544922, + "learning_rate": 9.730346684159553e-06, + "loss": 0.6435, + "step": 2868 + }, + { + "epoch": 0.3736821554080437, + "grad_norm": 2.3857946395874023, + "learning_rate": 9.729669236362882e-06, + "loss": 0.6081, + "step": 2871 + }, + { + "epoch": 0.3740726278797345, + "grad_norm": 3.848266839981079, + "learning_rate": 9.728990962302946e-06, + "loss": 0.6367, + "step": 2874 + }, + { + "epoch": 0.3744631003514252, + "grad_norm": 2.9143803119659424, + "learning_rate": 9.72831186209824e-06, + "loss": 0.6555, + "step": 2877 + }, + { + "epoch": 0.374853572823116, + "grad_norm": 2.645496129989624, + "learning_rate": 9.727631935867394e-06, + "loss": 0.6999, + "step": 2880 + }, + { + "epoch": 0.3752440452948067, + "grad_norm": 2.874675750732422, + "learning_rate": 9.726951183729196e-06, + "loss": 0.6976, + "step": 2883 + }, + { + "epoch": 0.3756345177664975, + "grad_norm": 2.555006742477417, + "learning_rate": 9.726269605802569e-06, + "loss": 0.6693, + "step": 2886 + }, + { + "epoch": 0.3760249902381882, + "grad_norm": 2.870241641998291, + "learning_rate": 9.725587202206588e-06, + "loss": 0.6434, + "step": 2889 + }, + { + "epoch": 0.37641546270987897, + "grad_norm": 4.20858097076416, + "learning_rate": 9.724903973060461e-06, + "loss": 0.7726, + "step": 2892 + }, + { + "epoch": 0.3768059351815697, + "grad_norm": 2.6733803749084473, + "learning_rate": 9.72421991848355e-06, + "loss": 0.7146, + "step": 2895 + }, + { + "epoch": 0.37719640765326046, + "grad_norm": 2.6210622787475586, + "learning_rate": 9.723535038595358e-06, + "loss": 0.7285, + "step": 2898 + }, + { + "epoch": 0.3775868801249512, + "grad_norm": 3.7322444915771484, + "learning_rate": 9.722849333515532e-06, + "loss": 0.7671, + "step": 2901 + }, + { + "epoch": 0.37797735259664195, + "grad_norm": 3.78657603263855, + "learning_rate": 9.722162803363863e-06, + "loss": 0.6344, + "step": 2904 + }, + { + "epoch": 0.37836782506833266, + "grad_norm": 3.663477659225464, + "learning_rate": 9.721475448260286e-06, + "loss": 0.6887, + "step": 2907 + }, + { + "epoch": 0.37875829754002344, + "grad_norm": 2.8419249057769775, + "learning_rate": 9.72078726832488e-06, + "loss": 0.7098, + "step": 2910 + }, + { + "epoch": 0.37914877001171415, + "grad_norm": 5.145716190338135, + "learning_rate": 9.72009826367787e-06, + "loss": 0.6945, + "step": 2913 + }, + { + "epoch": 0.3795392424834049, + "grad_norm": 3.3549532890319824, + "learning_rate": 9.719408434439623e-06, + "loss": 0.6757, + "step": 2916 + }, + { + "epoch": 0.37992971495509564, + "grad_norm": 2.720118284225464, + "learning_rate": 9.71871778073065e-06, + "loss": 0.7617, + "step": 2919 + }, + { + "epoch": 0.3803201874267864, + "grad_norm": 2.3909714221954346, + "learning_rate": 9.718026302671608e-06, + "loss": 0.6084, + "step": 2922 + }, + { + "epoch": 0.38071065989847713, + "grad_norm": 2.6086878776550293, + "learning_rate": 9.717334000383297e-06, + "loss": 0.6376, + "step": 2925 + }, + { + "epoch": 0.3811011323701679, + "grad_norm": 2.780679702758789, + "learning_rate": 9.716640873986658e-06, + "loss": 0.712, + "step": 2928 + }, + { + "epoch": 0.3814916048418586, + "grad_norm": 2.720557451248169, + "learning_rate": 9.715946923602781e-06, + "loss": 0.643, + 
"step": 2931 + }, + { + "epoch": 0.3818820773135494, + "grad_norm": 5.16661262512207, + "learning_rate": 9.715252149352898e-06, + "loss": 0.705, + "step": 2934 + }, + { + "epoch": 0.38227254978524017, + "grad_norm": 3.2445664405822754, + "learning_rate": 9.714556551358385e-06, + "loss": 0.6784, + "step": 2937 + }, + { + "epoch": 0.3826630222569309, + "grad_norm": 3.182467460632324, + "learning_rate": 9.713860129740759e-06, + "loss": 0.714, + "step": 2940 + }, + { + "epoch": 0.38305349472862166, + "grad_norm": 2.596245288848877, + "learning_rate": 9.713162884621686e-06, + "loss": 0.5894, + "step": 2943 + }, + { + "epoch": 0.3834439672003124, + "grad_norm": 2.317925453186035, + "learning_rate": 9.71246481612297e-06, + "loss": 0.5575, + "step": 2946 + }, + { + "epoch": 0.38383443967200315, + "grad_norm": 2.5028085708618164, + "learning_rate": 9.711765924366567e-06, + "loss": 0.6765, + "step": 2949 + }, + { + "epoch": 0.38422491214369386, + "grad_norm": 2.46987247467041, + "learning_rate": 9.711066209474568e-06, + "loss": 0.7498, + "step": 2952 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 2.608255386352539, + "learning_rate": 9.710365671569214e-06, + "loss": 0.718, + "step": 2955 + }, + { + "epoch": 0.38500585708707535, + "grad_norm": 3.6582038402557373, + "learning_rate": 9.709664310772884e-06, + "loss": 0.7416, + "step": 2958 + }, + { + "epoch": 0.3853963295587661, + "grad_norm": 3.149799108505249, + "learning_rate": 9.708962127208105e-06, + "loss": 0.746, + "step": 2961 + }, + { + "epoch": 0.38578680203045684, + "grad_norm": 2.3268182277679443, + "learning_rate": 9.708259120997551e-06, + "loss": 0.5427, + "step": 2964 + }, + { + "epoch": 0.3861772745021476, + "grad_norm": 2.4898290634155273, + "learning_rate": 9.707555292264035e-06, + "loss": 0.6849, + "step": 2967 + }, + { + "epoch": 0.38656774697383833, + "grad_norm": 2.5410478115081787, + "learning_rate": 9.706850641130512e-06, + "loss": 0.6692, + "step": 2970 + }, + { + "epoch": 0.3869582194455291, + "grad_norm": 2.798302173614502, + "learning_rate": 9.706145167720082e-06, + "loss": 0.5719, + "step": 2973 + }, + { + "epoch": 0.3873486919172198, + "grad_norm": 3.7535929679870605, + "learning_rate": 9.705438872155993e-06, + "loss": 0.7151, + "step": 2976 + }, + { + "epoch": 0.3877391643889106, + "grad_norm": 2.450899124145508, + "learning_rate": 9.70473175456163e-06, + "loss": 0.7171, + "step": 2979 + }, + { + "epoch": 0.3881296368606013, + "grad_norm": 3.063359260559082, + "learning_rate": 9.704023815060528e-06, + "loss": 0.7145, + "step": 2982 + }, + { + "epoch": 0.3885201093322921, + "grad_norm": 2.760772228240967, + "learning_rate": 9.703315053776362e-06, + "loss": 0.7893, + "step": 2985 + }, + { + "epoch": 0.3889105818039828, + "grad_norm": 2.4443037509918213, + "learning_rate": 9.702605470832948e-06, + "loss": 0.6993, + "step": 2988 + }, + { + "epoch": 0.3893010542756736, + "grad_norm": 2.651404619216919, + "learning_rate": 9.701895066354255e-06, + "loss": 0.6193, + "step": 2991 + }, + { + "epoch": 0.3896915267473643, + "grad_norm": 3.2075984477996826, + "learning_rate": 9.701183840464383e-06, + "loss": 0.6428, + "step": 2994 + }, + { + "epoch": 0.39008199921905506, + "grad_norm": 2.8586585521698, + "learning_rate": 9.700471793287582e-06, + "loss": 0.7017, + "step": 2997 + }, + { + "epoch": 0.3904724716907458, + "grad_norm": 2.386969804763794, + "learning_rate": 9.69975892494825e-06, + "loss": 0.5862, + "step": 3000 + }, + { + "epoch": 0.39086294416243655, + "grad_norm": 2.74922776222229, + "learning_rate": 
9.69904523557092e-06, + "loss": 0.6879, + "step": 3003 + }, + { + "epoch": 0.39125341663412727, + "grad_norm": 2.8284764289855957, + "learning_rate": 9.698330725280271e-06, + "loss": 0.6019, + "step": 3006 + }, + { + "epoch": 0.39164388910581804, + "grad_norm": 4.191535472869873, + "learning_rate": 9.69761539420113e-06, + "loss": 0.6605, + "step": 3009 + }, + { + "epoch": 0.39203436157750876, + "grad_norm": 2.883096218109131, + "learning_rate": 9.696899242458462e-06, + "loss": 0.7188, + "step": 3012 + }, + { + "epoch": 0.39242483404919953, + "grad_norm": 4.879593372344971, + "learning_rate": 9.696182270177377e-06, + "loss": 0.6858, + "step": 3015 + }, + { + "epoch": 0.39281530652089025, + "grad_norm": 3.190108060836792, + "learning_rate": 9.695464477483127e-06, + "loss": 0.7377, + "step": 3018 + }, + { + "epoch": 0.393205778992581, + "grad_norm": 2.7659029960632324, + "learning_rate": 9.694745864501113e-06, + "loss": 0.6033, + "step": 3021 + }, + { + "epoch": 0.3935962514642718, + "grad_norm": 2.5670714378356934, + "learning_rate": 9.694026431356872e-06, + "loss": 0.6231, + "step": 3024 + }, + { + "epoch": 0.3939867239359625, + "grad_norm": 2.614166498184204, + "learning_rate": 9.693306178176086e-06, + "loss": 0.696, + "step": 3027 + }, + { + "epoch": 0.3943771964076533, + "grad_norm": 2.583684206008911, + "learning_rate": 9.692585105084588e-06, + "loss": 0.6598, + "step": 3030 + }, + { + "epoch": 0.394767668879344, + "grad_norm": 2.6008903980255127, + "learning_rate": 9.691863212208342e-06, + "loss": 0.6207, + "step": 3033 + }, + { + "epoch": 0.3951581413510348, + "grad_norm": 2.974407196044922, + "learning_rate": 9.691140499673462e-06, + "loss": 0.6982, + "step": 3036 + }, + { + "epoch": 0.3955486138227255, + "grad_norm": 3.109865665435791, + "learning_rate": 9.690416967606207e-06, + "loss": 0.8394, + "step": 3039 + }, + { + "epoch": 0.39593908629441626, + "grad_norm": 2.6200222969055176, + "learning_rate": 9.689692616132975e-06, + "loss": 0.6467, + "step": 3042 + }, + { + "epoch": 0.396329558766107, + "grad_norm": 2.555178642272949, + "learning_rate": 9.688967445380306e-06, + "loss": 0.7058, + "step": 3045 + }, + { + "epoch": 0.39672003123779775, + "grad_norm": 2.755859375, + "learning_rate": 9.688241455474892e-06, + "loss": 0.637, + "step": 3048 + }, + { + "epoch": 0.39711050370948847, + "grad_norm": 2.5034401416778564, + "learning_rate": 9.687514646543557e-06, + "loss": 0.7212, + "step": 3051 + }, + { + "epoch": 0.39750097618117924, + "grad_norm": 3.507744550704956, + "learning_rate": 9.686787018713273e-06, + "loss": 0.6072, + "step": 3054 + }, + { + "epoch": 0.39789144865286996, + "grad_norm": 3.1820497512817383, + "learning_rate": 9.686058572111157e-06, + "loss": 0.7151, + "step": 3057 + }, + { + "epoch": 0.39828192112456073, + "grad_norm": 2.979086399078369, + "learning_rate": 9.685329306864468e-06, + "loss": 0.8117, + "step": 3060 + }, + { + "epoch": 0.39867239359625145, + "grad_norm": 2.7010393142700195, + "learning_rate": 9.684599223100604e-06, + "loss": 0.6791, + "step": 3063 + }, + { + "epoch": 0.3990628660679422, + "grad_norm": 2.8512115478515625, + "learning_rate": 9.68386832094711e-06, + "loss": 0.6964, + "step": 3066 + }, + { + "epoch": 0.39945333853963294, + "grad_norm": 2.7704756259918213, + "learning_rate": 9.683136600531674e-06, + "loss": 0.7149, + "step": 3069 + }, + { + "epoch": 0.3998438110113237, + "grad_norm": 2.513730764389038, + "learning_rate": 9.682404061982129e-06, + "loss": 0.6607, + "step": 3072 + }, + { + "epoch": 0.40023428348301443, + "grad_norm": 
2.799468517303467, + "learning_rate": 9.681670705426442e-06, + "loss": 0.6806, + "step": 3075 + }, + { + "epoch": 0.4006247559547052, + "grad_norm": 4.188741683959961, + "learning_rate": 9.680936530992731e-06, + "loss": 0.5816, + "step": 3078 + }, + { + "epoch": 0.4010152284263959, + "grad_norm": 2.830998659133911, + "learning_rate": 9.680201538809257e-06, + "loss": 0.6774, + "step": 3081 + }, + { + "epoch": 0.4014057008980867, + "grad_norm": 2.5481221675872803, + "learning_rate": 9.679465729004419e-06, + "loss": 0.5939, + "step": 3084 + }, + { + "epoch": 0.4017961733697774, + "grad_norm": 2.98976469039917, + "learning_rate": 9.678729101706763e-06, + "loss": 0.6127, + "step": 3087 + }, + { + "epoch": 0.4021866458414682, + "grad_norm": 3.0015671253204346, + "learning_rate": 9.677991657044973e-06, + "loss": 0.6858, + "step": 3090 + }, + { + "epoch": 0.4025771183131589, + "grad_norm": 3.737126350402832, + "learning_rate": 9.677253395147886e-06, + "loss": 0.7177, + "step": 3093 + }, + { + "epoch": 0.40296759078484967, + "grad_norm": 2.3965842723846436, + "learning_rate": 9.676514316144468e-06, + "loss": 0.6822, + "step": 3096 + }, + { + "epoch": 0.4033580632565404, + "grad_norm": 2.6666600704193115, + "learning_rate": 9.675774420163835e-06, + "loss": 0.6958, + "step": 3099 + }, + { + "epoch": 0.40374853572823116, + "grad_norm": 3.0011026859283447, + "learning_rate": 9.675033707335249e-06, + "loss": 0.7197, + "step": 3102 + }, + { + "epoch": 0.4041390081999219, + "grad_norm": 2.5443384647369385, + "learning_rate": 9.674292177788109e-06, + "loss": 0.6865, + "step": 3105 + }, + { + "epoch": 0.40452948067161265, + "grad_norm": 2.5568745136260986, + "learning_rate": 9.67354983165196e-06, + "loss": 0.624, + "step": 3108 + }, + { + "epoch": 0.4049199531433034, + "grad_norm": 4.514736652374268, + "learning_rate": 9.672806669056486e-06, + "loss": 0.6752, + "step": 3111 + }, + { + "epoch": 0.40531042561499414, + "grad_norm": 2.690507650375366, + "learning_rate": 9.672062690131516e-06, + "loss": 0.7393, + "step": 3114 + }, + { + "epoch": 0.4057008980866849, + "grad_norm": 3.8276305198669434, + "learning_rate": 9.671317895007025e-06, + "loss": 0.6249, + "step": 3117 + }, + { + "epoch": 0.40609137055837563, + "grad_norm": 2.517906427383423, + "learning_rate": 9.670572283813123e-06, + "loss": 0.5731, + "step": 3120 + }, + { + "epoch": 0.4064818430300664, + "grad_norm": 2.6286120414733887, + "learning_rate": 9.669825856680068e-06, + "loss": 0.6554, + "step": 3123 + }, + { + "epoch": 0.4068723155017571, + "grad_norm": 2.6419596672058105, + "learning_rate": 9.669078613738263e-06, + "loss": 0.6567, + "step": 3126 + }, + { + "epoch": 0.4072627879734479, + "grad_norm": 2.7058355808258057, + "learning_rate": 9.668330555118243e-06, + "loss": 0.8195, + "step": 3129 + }, + { + "epoch": 0.4076532604451386, + "grad_norm": 2.805713176727295, + "learning_rate": 9.667581680950698e-06, + "loss": 0.7018, + "step": 3132 + }, + { + "epoch": 0.4080437329168294, + "grad_norm": 2.345257520675659, + "learning_rate": 9.66683199136645e-06, + "loss": 0.6237, + "step": 3135 + }, + { + "epoch": 0.4084342053885201, + "grad_norm": 2.4524874687194824, + "learning_rate": 9.666081486496472e-06, + "loss": 0.6889, + "step": 3138 + }, + { + "epoch": 0.40882467786021087, + "grad_norm": 2.8738114833831787, + "learning_rate": 9.665330166471875e-06, + "loss": 0.6038, + "step": 3141 + }, + { + "epoch": 0.4092151503319016, + "grad_norm": 3.0100152492523193, + "learning_rate": 9.664578031423913e-06, + "loss": 0.6482, + "step": 3144 + }, + { + 
"epoch": 0.40960562280359236, + "grad_norm": 2.748427391052246, + "learning_rate": 9.663825081483979e-06, + "loss": 0.6699, + "step": 3147 + }, + { + "epoch": 0.4099960952752831, + "grad_norm": 2.4630000591278076, + "learning_rate": 9.663071316783617e-06, + "loss": 0.6064, + "step": 3150 + }, + { + "epoch": 0.41038656774697385, + "grad_norm": 3.131740093231201, + "learning_rate": 9.662316737454505e-06, + "loss": 0.6362, + "step": 3153 + }, + { + "epoch": 0.41077704021866457, + "grad_norm": 2.545337200164795, + "learning_rate": 9.661561343628465e-06, + "loss": 0.6804, + "step": 3156 + }, + { + "epoch": 0.41116751269035534, + "grad_norm": 2.7401156425476074, + "learning_rate": 9.660805135437468e-06, + "loss": 0.679, + "step": 3159 + }, + { + "epoch": 0.41155798516204606, + "grad_norm": 2.6069376468658447, + "learning_rate": 9.660048113013616e-06, + "loss": 0.6729, + "step": 3162 + }, + { + "epoch": 0.41194845763373683, + "grad_norm": 2.470309257507324, + "learning_rate": 9.659290276489164e-06, + "loss": 0.6852, + "step": 3165 + }, + { + "epoch": 0.41233893010542755, + "grad_norm": 2.4761130809783936, + "learning_rate": 9.658531625996502e-06, + "loss": 0.6221, + "step": 3168 + }, + { + "epoch": 0.4127294025771183, + "grad_norm": 2.603297472000122, + "learning_rate": 9.657772161668164e-06, + "loss": 0.645, + "step": 3171 + }, + { + "epoch": 0.41311987504880904, + "grad_norm": 5.437543869018555, + "learning_rate": 9.657011883636828e-06, + "loss": 0.7422, + "step": 3174 + }, + { + "epoch": 0.4135103475204998, + "grad_norm": 2.724668025970459, + "learning_rate": 9.656250792035314e-06, + "loss": 0.6882, + "step": 3177 + }, + { + "epoch": 0.4139008199921905, + "grad_norm": 2.5859644412994385, + "learning_rate": 9.655488886996582e-06, + "loss": 0.5831, + "step": 3180 + }, + { + "epoch": 0.4142912924638813, + "grad_norm": 2.3966543674468994, + "learning_rate": 9.654726168653733e-06, + "loss": 0.5881, + "step": 3183 + }, + { + "epoch": 0.414681764935572, + "grad_norm": 2.52030611038208, + "learning_rate": 9.653962637140016e-06, + "loss": 0.6998, + "step": 3186 + }, + { + "epoch": 0.4150722374072628, + "grad_norm": 2.5317649841308594, + "learning_rate": 9.653198292588816e-06, + "loss": 0.7423, + "step": 3189 + }, + { + "epoch": 0.4154627098789535, + "grad_norm": 2.806925058364868, + "learning_rate": 9.652433135133666e-06, + "loss": 0.7737, + "step": 3192 + }, + { + "epoch": 0.4158531823506443, + "grad_norm": 2.7214372158050537, + "learning_rate": 9.651667164908232e-06, + "loss": 0.6412, + "step": 3195 + }, + { + "epoch": 0.41624365482233505, + "grad_norm": 3.378222942352295, + "learning_rate": 9.65090038204633e-06, + "loss": 0.5841, + "step": 3198 + }, + { + "epoch": 0.41663412729402577, + "grad_norm": 5.052935600280762, + "learning_rate": 9.650132786681916e-06, + "loss": 0.6651, + "step": 3201 + }, + { + "epoch": 0.41702459976571654, + "grad_norm": 3.4297244548797607, + "learning_rate": 9.649364378949087e-06, + "loss": 0.6839, + "step": 3204 + }, + { + "epoch": 0.41741507223740726, + "grad_norm": 3.513136148452759, + "learning_rate": 9.64859515898208e-06, + "loss": 0.727, + "step": 3207 + }, + { + "epoch": 0.41780554470909803, + "grad_norm": 3.3228838443756104, + "learning_rate": 9.64782512691528e-06, + "loss": 0.7235, + "step": 3210 + }, + { + "epoch": 0.41819601718078875, + "grad_norm": 3.5547192096710205, + "learning_rate": 9.647054282883207e-06, + "loss": 0.7343, + "step": 3213 + }, + { + "epoch": 0.4185864896524795, + "grad_norm": 2.559657096862793, + "learning_rate": 9.646282627020527e-06, + 
"loss": 0.5984, + "step": 3216 + }, + { + "epoch": 0.41897696212417024, + "grad_norm": 2.8096330165863037, + "learning_rate": 9.645510159462047e-06, + "loss": 0.7274, + "step": 3219 + }, + { + "epoch": 0.419367434595861, + "grad_norm": 2.950134038925171, + "learning_rate": 9.644736880342714e-06, + "loss": 0.6025, + "step": 3222 + }, + { + "epoch": 0.4197579070675517, + "grad_norm": 2.600390911102295, + "learning_rate": 9.643962789797619e-06, + "loss": 0.7445, + "step": 3225 + }, + { + "epoch": 0.4201483795392425, + "grad_norm": 2.848917245864868, + "learning_rate": 9.643187887961993e-06, + "loss": 0.6197, + "step": 3228 + }, + { + "epoch": 0.4205388520109332, + "grad_norm": 2.5057928562164307, + "learning_rate": 9.642412174971214e-06, + "loss": 0.7009, + "step": 3231 + }, + { + "epoch": 0.420929324482624, + "grad_norm": 3.8111495971679688, + "learning_rate": 9.641635650960792e-06, + "loss": 0.589, + "step": 3234 + }, + { + "epoch": 0.4213197969543147, + "grad_norm": 4.075084686279297, + "learning_rate": 9.640858316066387e-06, + "loss": 0.6813, + "step": 3237 + }, + { + "epoch": 0.4217102694260055, + "grad_norm": 3.2589192390441895, + "learning_rate": 9.640080170423796e-06, + "loss": 0.6352, + "step": 3240 + }, + { + "epoch": 0.4221007418976962, + "grad_norm": 3.0924248695373535, + "learning_rate": 9.63930121416896e-06, + "loss": 0.581, + "step": 3243 + }, + { + "epoch": 0.42249121436938697, + "grad_norm": 3.0321390628814697, + "learning_rate": 9.638521447437965e-06, + "loss": 0.7448, + "step": 3246 + }, + { + "epoch": 0.4228816868410777, + "grad_norm": 2.8482680320739746, + "learning_rate": 9.637740870367028e-06, + "loss": 0.7065, + "step": 3249 + }, + { + "epoch": 0.42327215931276846, + "grad_norm": 2.815291404724121, + "learning_rate": 9.636959483092518e-06, + "loss": 0.5734, + "step": 3252 + }, + { + "epoch": 0.4236626317844592, + "grad_norm": 2.5750129222869873, + "learning_rate": 9.636177285750942e-06, + "loss": 0.6645, + "step": 3255 + }, + { + "epoch": 0.42405310425614995, + "grad_norm": 2.154196262359619, + "learning_rate": 9.635394278478947e-06, + "loss": 0.6641, + "step": 3258 + }, + { + "epoch": 0.42444357672784067, + "grad_norm": 2.7113664150238037, + "learning_rate": 9.634610461413322e-06, + "loss": 0.6695, + "step": 3261 + }, + { + "epoch": 0.42483404919953144, + "grad_norm": 2.7851827144622803, + "learning_rate": 9.633825834691e-06, + "loss": 0.6974, + "step": 3264 + }, + { + "epoch": 0.42522452167122216, + "grad_norm": 2.6589601039886475, + "learning_rate": 9.633040398449052e-06, + "loss": 0.7072, + "step": 3267 + }, + { + "epoch": 0.42561499414291293, + "grad_norm": 2.8155033588409424, + "learning_rate": 9.632254152824693e-06, + "loss": 0.652, + "step": 3270 + }, + { + "epoch": 0.42600546661460365, + "grad_norm": 3.5873613357543945, + "learning_rate": 9.63146709795528e-06, + "loss": 0.6695, + "step": 3273 + }, + { + "epoch": 0.4263959390862944, + "grad_norm": 2.7894959449768066, + "learning_rate": 9.630679233978308e-06, + "loss": 0.6908, + "step": 3276 + }, + { + "epoch": 0.42678641155798513, + "grad_norm": 2.545112371444702, + "learning_rate": 9.629890561031414e-06, + "loss": 0.6758, + "step": 3279 + }, + { + "epoch": 0.4271768840296759, + "grad_norm": 2.591367483139038, + "learning_rate": 9.629101079252379e-06, + "loss": 0.6303, + "step": 3282 + }, + { + "epoch": 0.4275673565013667, + "grad_norm": 2.6541688442230225, + "learning_rate": 9.628310788779125e-06, + "loss": 0.728, + "step": 3285 + }, + { + "epoch": 0.4279578289730574, + "grad_norm": 2.391240358352661, + 
"learning_rate": 9.627519689749711e-06, + "loss": 0.6469, + "step": 3288 + }, + { + "epoch": 0.42834830144474817, + "grad_norm": 2.4447021484375, + "learning_rate": 9.626727782302343e-06, + "loss": 0.6447, + "step": 3291 + }, + { + "epoch": 0.4287387739164389, + "grad_norm": 2.9437851905822754, + "learning_rate": 9.625935066575364e-06, + "loss": 0.7597, + "step": 3294 + }, + { + "epoch": 0.42912924638812966, + "grad_norm": 2.9596848487854004, + "learning_rate": 9.625141542707261e-06, + "loss": 0.7102, + "step": 3297 + }, + { + "epoch": 0.4295197188598204, + "grad_norm": 3.0737996101379395, + "learning_rate": 9.62434721083666e-06, + "loss": 0.597, + "step": 3300 + }, + { + "epoch": 0.42991019133151115, + "grad_norm": 2.6365230083465576, + "learning_rate": 9.62355207110233e-06, + "loss": 0.6877, + "step": 3303 + }, + { + "epoch": 0.43030066380320187, + "grad_norm": 3.259915351867676, + "learning_rate": 9.62275612364318e-06, + "loss": 0.6127, + "step": 3306 + }, + { + "epoch": 0.43069113627489264, + "grad_norm": 2.5275659561157227, + "learning_rate": 9.621959368598259e-06, + "loss": 0.5959, + "step": 3309 + }, + { + "epoch": 0.43108160874658336, + "grad_norm": 2.8045878410339355, + "learning_rate": 9.62116180610676e-06, + "loss": 0.582, + "step": 3312 + }, + { + "epoch": 0.43147208121827413, + "grad_norm": 2.7796387672424316, + "learning_rate": 9.620363436308015e-06, + "loss": 0.714, + "step": 3315 + }, + { + "epoch": 0.43186255368996485, + "grad_norm": 2.583375930786133, + "learning_rate": 9.619564259341497e-06, + "loss": 0.7435, + "step": 3318 + }, + { + "epoch": 0.4322530261616556, + "grad_norm": 2.662533760070801, + "learning_rate": 9.618764275346821e-06, + "loss": 0.6005, + "step": 3321 + }, + { + "epoch": 0.43264349863334634, + "grad_norm": 2.9329593181610107, + "learning_rate": 9.617963484463744e-06, + "loss": 0.6643, + "step": 3324 + }, + { + "epoch": 0.4330339711050371, + "grad_norm": 2.6769914627075195, + "learning_rate": 9.61716188683216e-06, + "loss": 0.745, + "step": 3327 + }, + { + "epoch": 0.4334244435767278, + "grad_norm": 2.9171302318573, + "learning_rate": 9.616359482592108e-06, + "loss": 0.7305, + "step": 3330 + }, + { + "epoch": 0.4338149160484186, + "grad_norm": 3.290548086166382, + "learning_rate": 9.615556271883766e-06, + "loss": 0.7274, + "step": 3333 + }, + { + "epoch": 0.4342053885201093, + "grad_norm": 2.999380588531494, + "learning_rate": 9.61475225484745e-06, + "loss": 0.628, + "step": 3336 + }, + { + "epoch": 0.4345958609918001, + "grad_norm": 2.652371644973755, + "learning_rate": 9.613947431623627e-06, + "loss": 0.6668, + "step": 3339 + }, + { + "epoch": 0.4349863334634908, + "grad_norm": 2.5840189456939697, + "learning_rate": 9.613141802352893e-06, + "loss": 0.664, + "step": 3342 + }, + { + "epoch": 0.4353768059351816, + "grad_norm": 2.5839924812316895, + "learning_rate": 9.61233536717599e-06, + "loss": 0.7491, + "step": 3345 + }, + { + "epoch": 0.4357672784068723, + "grad_norm": 2.6329665184020996, + "learning_rate": 9.6115281262338e-06, + "loss": 0.6435, + "step": 3348 + }, + { + "epoch": 0.43615775087856307, + "grad_norm": 2.31581974029541, + "learning_rate": 9.61072007966735e-06, + "loss": 0.6642, + "step": 3351 + }, + { + "epoch": 0.4365482233502538, + "grad_norm": 3.6810152530670166, + "learning_rate": 9.609911227617802e-06, + "loss": 0.677, + "step": 3354 + }, + { + "epoch": 0.43693869582194456, + "grad_norm": 2.4327340126037598, + "learning_rate": 9.609101570226458e-06, + "loss": 0.6408, + "step": 3357 + }, + { + "epoch": 0.4373291682936353, + 
"grad_norm": 2.6379623413085938, + "learning_rate": 9.608291107634767e-06, + "loss": 0.6083, + "step": 3360 + }, + { + "epoch": 0.43771964076532605, + "grad_norm": 2.7450592517852783, + "learning_rate": 9.607479839984313e-06, + "loss": 0.6663, + "step": 3363 + }, + { + "epoch": 0.43811011323701676, + "grad_norm": 2.3480405807495117, + "learning_rate": 9.606667767416824e-06, + "loss": 0.5952, + "step": 3366 + }, + { + "epoch": 0.43850058570870754, + "grad_norm": 2.8158650398254395, + "learning_rate": 9.605854890074169e-06, + "loss": 0.745, + "step": 3369 + }, + { + "epoch": 0.4388910581803983, + "grad_norm": 2.310330390930176, + "learning_rate": 9.60504120809835e-06, + "loss": 0.5277, + "step": 3372 + }, + { + "epoch": 0.439281530652089, + "grad_norm": 2.6338181495666504, + "learning_rate": 9.604226721631525e-06, + "loss": 0.6792, + "step": 3375 + }, + { + "epoch": 0.4396720031237798, + "grad_norm": 2.5001606941223145, + "learning_rate": 9.603411430815974e-06, + "loss": 0.61, + "step": 3378 + }, + { + "epoch": 0.4400624755954705, + "grad_norm": 2.8085408210754395, + "learning_rate": 9.60259533579413e-06, + "loss": 0.6936, + "step": 3381 + }, + { + "epoch": 0.4404529480671613, + "grad_norm": 2.701272964477539, + "learning_rate": 9.601778436708564e-06, + "loss": 0.584, + "step": 3384 + }, + { + "epoch": 0.440843420538852, + "grad_norm": 2.5402166843414307, + "learning_rate": 9.600960733701988e-06, + "loss": 0.6306, + "step": 3387 + }, + { + "epoch": 0.4412338930105428, + "grad_norm": 2.4810187816619873, + "learning_rate": 9.600142226917248e-06, + "loss": 0.6259, + "step": 3390 + }, + { + "epoch": 0.4416243654822335, + "grad_norm": 2.939995765686035, + "learning_rate": 9.599322916497338e-06, + "loss": 0.6119, + "step": 3393 + }, + { + "epoch": 0.44201483795392427, + "grad_norm": 2.663740873336792, + "learning_rate": 9.598502802585392e-06, + "loss": 0.6582, + "step": 3396 + }, + { + "epoch": 0.442405310425615, + "grad_norm": 2.5204548835754395, + "learning_rate": 9.597681885324679e-06, + "loss": 0.7289, + "step": 3399 + }, + { + "epoch": 0.44279578289730576, + "grad_norm": 2.44008207321167, + "learning_rate": 9.596860164858612e-06, + "loss": 0.5731, + "step": 3402 + }, + { + "epoch": 0.4431862553689965, + "grad_norm": 2.584868907928467, + "learning_rate": 9.596037641330746e-06, + "loss": 0.6289, + "step": 3405 + }, + { + "epoch": 0.44357672784068725, + "grad_norm": 3.060908079147339, + "learning_rate": 9.595214314884773e-06, + "loss": 0.7144, + "step": 3408 + }, + { + "epoch": 0.44396720031237796, + "grad_norm": 2.6084563732147217, + "learning_rate": 9.594390185664526e-06, + "loss": 0.6185, + "step": 3411 + }, + { + "epoch": 0.44435767278406874, + "grad_norm": 2.7041709423065186, + "learning_rate": 9.593565253813978e-06, + "loss": 0.6793, + "step": 3414 + }, + { + "epoch": 0.44474814525575945, + "grad_norm": 2.9432315826416016, + "learning_rate": 9.592739519477243e-06, + "loss": 0.6493, + "step": 3417 + }, + { + "epoch": 0.4451386177274502, + "grad_norm": 2.6161787509918213, + "learning_rate": 9.591912982798576e-06, + "loss": 0.6717, + "step": 3420 + }, + { + "epoch": 0.44552909019914094, + "grad_norm": 3.7241930961608887, + "learning_rate": 9.591085643922372e-06, + "loss": 0.7395, + "step": 3423 + }, + { + "epoch": 0.4459195626708317, + "grad_norm": 2.6377596855163574, + "learning_rate": 9.590257502993164e-06, + "loss": 0.733, + "step": 3426 + }, + { + "epoch": 0.44631003514252243, + "grad_norm": 2.6171798706054688, + "learning_rate": 9.589428560155627e-06, + "loss": 0.6539, + "step": 3429 + 
}, + { + "epoch": 0.4467005076142132, + "grad_norm": 3.0598702430725098, + "learning_rate": 9.588598815554573e-06, + "loss": 0.7271, + "step": 3432 + }, + { + "epoch": 0.4470909800859039, + "grad_norm": 2.866503953933716, + "learning_rate": 9.587768269334962e-06, + "loss": 0.6864, + "step": 3435 + }, + { + "epoch": 0.4474814525575947, + "grad_norm": 2.591628074645996, + "learning_rate": 9.586936921641884e-06, + "loss": 0.6335, + "step": 3438 + }, + { + "epoch": 0.4478719250292854, + "grad_norm": 2.506284236907959, + "learning_rate": 9.586104772620575e-06, + "loss": 0.6376, + "step": 3441 + }, + { + "epoch": 0.4482623975009762, + "grad_norm": 2.7341339588165283, + "learning_rate": 9.585271822416412e-06, + "loss": 0.7072, + "step": 3444 + }, + { + "epoch": 0.4486528699726669, + "grad_norm": 2.406806230545044, + "learning_rate": 9.584438071174904e-06, + "loss": 0.5989, + "step": 3447 + }, + { + "epoch": 0.4490433424443577, + "grad_norm": 2.4429965019226074, + "learning_rate": 9.583603519041713e-06, + "loss": 0.6158, + "step": 3450 + }, + { + "epoch": 0.4494338149160484, + "grad_norm": 2.7472376823425293, + "learning_rate": 9.582768166162628e-06, + "loss": 0.6195, + "step": 3453 + }, + { + "epoch": 0.44982428738773916, + "grad_norm": 2.816849946975708, + "learning_rate": 9.581932012683583e-06, + "loss": 0.7533, + "step": 3456 + }, + { + "epoch": 0.45021475985942994, + "grad_norm": 2.6621181964874268, + "learning_rate": 9.581095058750658e-06, + "loss": 0.6995, + "step": 3459 + }, + { + "epoch": 0.45060523233112065, + "grad_norm": 2.8741328716278076, + "learning_rate": 9.580257304510062e-06, + "loss": 0.6234, + "step": 3462 + }, + { + "epoch": 0.4509957048028114, + "grad_norm": 2.814319372177124, + "learning_rate": 9.579418750108149e-06, + "loss": 0.7944, + "step": 3465 + }, + { + "epoch": 0.45138617727450214, + "grad_norm": 2.745290517807007, + "learning_rate": 9.578579395691417e-06, + "loss": 0.6016, + "step": 3468 + }, + { + "epoch": 0.4517766497461929, + "grad_norm": 4.0121541023254395, + "learning_rate": 9.577739241406494e-06, + "loss": 0.6709, + "step": 3471 + }, + { + "epoch": 0.45216712221788363, + "grad_norm": 2.6668434143066406, + "learning_rate": 9.576898287400155e-06, + "loss": 0.6304, + "step": 3474 + }, + { + "epoch": 0.4525575946895744, + "grad_norm": 2.569227933883667, + "learning_rate": 9.576056533819316e-06, + "loss": 0.6098, + "step": 3477 + }, + { + "epoch": 0.4529480671612651, + "grad_norm": 2.43481183052063, + "learning_rate": 9.575213980811027e-06, + "loss": 0.6843, + "step": 3480 + }, + { + "epoch": 0.4533385396329559, + "grad_norm": 2.772892713546753, + "learning_rate": 9.57437062852248e-06, + "loss": 0.6885, + "step": 3483 + }, + { + "epoch": 0.4537290121046466, + "grad_norm": 2.392486333847046, + "learning_rate": 9.573526477101006e-06, + "loss": 0.6022, + "step": 3486 + }, + { + "epoch": 0.4541194845763374, + "grad_norm": 2.679281234741211, + "learning_rate": 9.572681526694079e-06, + "loss": 0.6724, + "step": 3489 + }, + { + "epoch": 0.4545099570480281, + "grad_norm": 2.970829725265503, + "learning_rate": 9.571835777449307e-06, + "loss": 0.6936, + "step": 3492 + }, + { + "epoch": 0.4549004295197189, + "grad_norm": 2.3695054054260254, + "learning_rate": 9.570989229514445e-06, + "loss": 0.5728, + "step": 3495 + }, + { + "epoch": 0.4552909019914096, + "grad_norm": 2.7799527645111084, + "learning_rate": 9.57014188303738e-06, + "loss": 0.6046, + "step": 3498 + }, + { + "epoch": 0.45568137446310036, + "grad_norm": 2.7889013290405273, + "learning_rate": 9.569293738166141e-06, 
+ "loss": 0.7424, + "step": 3501 + }, + { + "epoch": 0.4560718469347911, + "grad_norm": 4.006762504577637, + "learning_rate": 9.568444795048899e-06, + "loss": 0.62, + "step": 3504 + }, + { + "epoch": 0.45646231940648185, + "grad_norm": 2.510829448699951, + "learning_rate": 9.567595053833963e-06, + "loss": 0.6677, + "step": 3507 + }, + { + "epoch": 0.45685279187817257, + "grad_norm": 2.6628668308258057, + "learning_rate": 9.566744514669777e-06, + "loss": 0.6483, + "step": 3510 + }, + { + "epoch": 0.45724326434986334, + "grad_norm": 2.8696441650390625, + "learning_rate": 9.565893177704934e-06, + "loss": 0.6337, + "step": 3513 + }, + { + "epoch": 0.45763373682155406, + "grad_norm": 2.641186237335205, + "learning_rate": 9.565041043088157e-06, + "loss": 0.6788, + "step": 3516 + }, + { + "epoch": 0.45802420929324483, + "grad_norm": 3.316789150238037, + "learning_rate": 9.564188110968314e-06, + "loss": 0.6791, + "step": 3519 + }, + { + "epoch": 0.45841468176493555, + "grad_norm": 2.8417608737945557, + "learning_rate": 9.563334381494409e-06, + "loss": 0.6841, + "step": 3522 + }, + { + "epoch": 0.4588051542366263, + "grad_norm": 3.1960368156433105, + "learning_rate": 9.562479854815587e-06, + "loss": 0.6391, + "step": 3525 + }, + { + "epoch": 0.45919562670831704, + "grad_norm": 2.585097074508667, + "learning_rate": 9.561624531081132e-06, + "loss": 0.7299, + "step": 3528 + }, + { + "epoch": 0.4595860991800078, + "grad_norm": 2.7368311882019043, + "learning_rate": 9.56076841044047e-06, + "loss": 0.6428, + "step": 3531 + }, + { + "epoch": 0.45997657165169853, + "grad_norm": 2.4325826168060303, + "learning_rate": 9.559911493043159e-06, + "loss": 0.617, + "step": 3534 + }, + { + "epoch": 0.4603670441233893, + "grad_norm": 2.53668212890625, + "learning_rate": 9.559053779038903e-06, + "loss": 0.6855, + "step": 3537 + }, + { + "epoch": 0.4607575165950801, + "grad_norm": 3.207590341567993, + "learning_rate": 9.558195268577543e-06, + "loss": 0.6649, + "step": 3540 + }, + { + "epoch": 0.4611479890667708, + "grad_norm": 2.7838118076324463, + "learning_rate": 9.557335961809059e-06, + "loss": 0.6823, + "step": 3543 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 2.8149774074554443, + "learning_rate": 9.556475858883569e-06, + "loss": 0.6601, + "step": 3546 + }, + { + "epoch": 0.4619289340101523, + "grad_norm": 2.708364725112915, + "learning_rate": 9.555614959951333e-06, + "loss": 0.7265, + "step": 3549 + }, + { + "epoch": 0.46231940648184305, + "grad_norm": 3.0776867866516113, + "learning_rate": 9.554753265162746e-06, + "loss": 0.573, + "step": 3552 + }, + { + "epoch": 0.46270987895353377, + "grad_norm": 2.5604937076568604, + "learning_rate": 9.553890774668347e-06, + "loss": 0.6187, + "step": 3555 + }, + { + "epoch": 0.46310035142522454, + "grad_norm": 3.412839651107788, + "learning_rate": 9.553027488618806e-06, + "loss": 0.6175, + "step": 3558 + }, + { + "epoch": 0.46349082389691526, + "grad_norm": 2.88525128364563, + "learning_rate": 9.552163407164945e-06, + "loss": 0.7192, + "step": 3561 + }, + { + "epoch": 0.46388129636860603, + "grad_norm": 3.187467098236084, + "learning_rate": 9.551298530457711e-06, + "loss": 0.6381, + "step": 3564 + }, + { + "epoch": 0.46427176884029675, + "grad_norm": 2.5330684185028076, + "learning_rate": 9.5504328586482e-06, + "loss": 0.7419, + "step": 3567 + }, + { + "epoch": 0.4646622413119875, + "grad_norm": 3.115802049636841, + "learning_rate": 9.54956639188764e-06, + "loss": 0.6623, + "step": 3570 + }, + { + "epoch": 0.46505271378367824, + "grad_norm": 2.369229555130005, + 
"learning_rate": 9.548699130327401e-06, + "loss": 0.6891, + "step": 3573 + }, + { + "epoch": 0.465443186255369, + "grad_norm": 3.0128095149993896, + "learning_rate": 9.547831074118995e-06, + "loss": 0.6068, + "step": 3576 + }, + { + "epoch": 0.46583365872705973, + "grad_norm": 2.500648021697998, + "learning_rate": 9.546962223414067e-06, + "loss": 0.7091, + "step": 3579 + }, + { + "epoch": 0.4662241311987505, + "grad_norm": 2.3269920349121094, + "learning_rate": 9.546092578364403e-06, + "loss": 0.6648, + "step": 3582 + }, + { + "epoch": 0.4666146036704412, + "grad_norm": 2.717773675918579, + "learning_rate": 9.545222139121931e-06, + "loss": 0.5747, + "step": 3585 + }, + { + "epoch": 0.467005076142132, + "grad_norm": 3.3133246898651123, + "learning_rate": 9.544350905838712e-06, + "loss": 0.6512, + "step": 3588 + }, + { + "epoch": 0.4673955486138227, + "grad_norm": 2.5333547592163086, + "learning_rate": 9.54347887866695e-06, + "loss": 0.5804, + "step": 3591 + }, + { + "epoch": 0.4677860210855135, + "grad_norm": 2.5862746238708496, + "learning_rate": 9.542606057758986e-06, + "loss": 0.6361, + "step": 3594 + }, + { + "epoch": 0.4681764935572042, + "grad_norm": 2.544423818588257, + "learning_rate": 9.5417324432673e-06, + "loss": 0.6235, + "step": 3597 + }, + { + "epoch": 0.46856696602889497, + "grad_norm": 2.701850652694702, + "learning_rate": 9.540858035344509e-06, + "loss": 0.725, + "step": 3600 + }, + { + "epoch": 0.4689574385005857, + "grad_norm": 2.778111457824707, + "learning_rate": 9.539982834143373e-06, + "loss": 0.5713, + "step": 3603 + }, + { + "epoch": 0.46934791097227646, + "grad_norm": 4.258044719696045, + "learning_rate": 9.539106839816787e-06, + "loss": 0.7481, + "step": 3606 + }, + { + "epoch": 0.4697383834439672, + "grad_norm": 4.864741325378418, + "learning_rate": 9.538230052517786e-06, + "loss": 0.7349, + "step": 3609 + }, + { + "epoch": 0.47012885591565795, + "grad_norm": 2.560973882675171, + "learning_rate": 9.53735247239954e-06, + "loss": 0.5515, + "step": 3612 + }, + { + "epoch": 0.47051932838734867, + "grad_norm": 2.455519676208496, + "learning_rate": 9.536474099615362e-06, + "loss": 0.6881, + "step": 3615 + }, + { + "epoch": 0.47090980085903944, + "grad_norm": 2.9638755321502686, + "learning_rate": 9.535594934318702e-06, + "loss": 0.6421, + "step": 3618 + }, + { + "epoch": 0.47130027333073016, + "grad_norm": 3.095149517059326, + "learning_rate": 9.53471497666315e-06, + "loss": 0.6678, + "step": 3621 + }, + { + "epoch": 0.47169074580242093, + "grad_norm": 3.0638108253479004, + "learning_rate": 9.533834226802431e-06, + "loss": 0.6058, + "step": 3624 + }, + { + "epoch": 0.4720812182741117, + "grad_norm": 2.333308696746826, + "learning_rate": 9.53295268489041e-06, + "loss": 0.5788, + "step": 3627 + }, + { + "epoch": 0.4724716907458024, + "grad_norm": 3.2323508262634277, + "learning_rate": 9.53207035108109e-06, + "loss": 0.7214, + "step": 3630 + }, + { + "epoch": 0.4728621632174932, + "grad_norm": 2.5078964233398438, + "learning_rate": 9.531187225528615e-06, + "loss": 0.7241, + "step": 3633 + }, + { + "epoch": 0.4732526356891839, + "grad_norm": 2.5901105403900146, + "learning_rate": 9.530303308387263e-06, + "loss": 0.707, + "step": 3636 + }, + { + "epoch": 0.4736431081608747, + "grad_norm": 2.5510504245758057, + "learning_rate": 9.529418599811455e-06, + "loss": 0.7274, + "step": 3639 + }, + { + "epoch": 0.4740335806325654, + "grad_norm": 2.424675703048706, + "learning_rate": 9.528533099955745e-06, + "loss": 0.6885, + "step": 3642 + }, + { + "epoch": 0.47442405310425617, + 
"grad_norm": 3.0174453258514404, + "learning_rate": 9.527646808974828e-06, + "loss": 0.6603, + "step": 3645 + }, + { + "epoch": 0.4748145255759469, + "grad_norm": 2.4409093856811523, + "learning_rate": 9.52675972702354e-06, + "loss": 0.684, + "step": 3648 + }, + { + "epoch": 0.47520499804763766, + "grad_norm": 2.731062889099121, + "learning_rate": 9.52587185425685e-06, + "loss": 0.7147, + "step": 3651 + }, + { + "epoch": 0.4755954705193284, + "grad_norm": 2.9121267795562744, + "learning_rate": 9.524983190829868e-06, + "loss": 0.7414, + "step": 3654 + }, + { + "epoch": 0.47598594299101915, + "grad_norm": 2.500781297683716, + "learning_rate": 9.52409373689784e-06, + "loss": 0.6857, + "step": 3657 + }, + { + "epoch": 0.47637641546270987, + "grad_norm": 2.709920644760132, + "learning_rate": 9.523203492616158e-06, + "loss": 0.6614, + "step": 3660 + }, + { + "epoch": 0.47676688793440064, + "grad_norm": 2.8489487171173096, + "learning_rate": 9.522312458140338e-06, + "loss": 0.5616, + "step": 3663 + }, + { + "epoch": 0.47715736040609136, + "grad_norm": 3.2674407958984375, + "learning_rate": 9.521420633626045e-06, + "loss": 0.5813, + "step": 3666 + }, + { + "epoch": 0.47754783287778213, + "grad_norm": 2.6726903915405273, + "learning_rate": 9.52052801922908e-06, + "loss": 0.6633, + "step": 3669 + }, + { + "epoch": 0.47793830534947285, + "grad_norm": 2.6989340782165527, + "learning_rate": 9.51963461510538e-06, + "loss": 0.6985, + "step": 3672 + }, + { + "epoch": 0.4783287778211636, + "grad_norm": 2.936947822570801, + "learning_rate": 9.51874042141102e-06, + "loss": 0.6472, + "step": 3675 + }, + { + "epoch": 0.47871925029285434, + "grad_norm": 2.9338979721069336, + "learning_rate": 9.517845438302213e-06, + "loss": 0.6153, + "step": 3678 + }, + { + "epoch": 0.4791097227645451, + "grad_norm": 2.5126919746398926, + "learning_rate": 9.516949665935314e-06, + "loss": 0.6328, + "step": 3681 + }, + { + "epoch": 0.4795001952362358, + "grad_norm": 2.4342703819274902, + "learning_rate": 9.516053104466811e-06, + "loss": 0.6888, + "step": 3684 + }, + { + "epoch": 0.4798906677079266, + "grad_norm": 3.6857855319976807, + "learning_rate": 9.51515575405333e-06, + "loss": 0.614, + "step": 3687 + }, + { + "epoch": 0.4802811401796173, + "grad_norm": 2.602360963821411, + "learning_rate": 9.514257614851639e-06, + "loss": 0.7064, + "step": 3690 + }, + { + "epoch": 0.4806716126513081, + "grad_norm": 3.5609123706817627, + "learning_rate": 9.513358687018636e-06, + "loss": 0.78, + "step": 3693 + }, + { + "epoch": 0.4810620851229988, + "grad_norm": 2.5246832370758057, + "learning_rate": 9.512458970711366e-06, + "loss": 0.6885, + "step": 3696 + }, + { + "epoch": 0.4814525575946896, + "grad_norm": 2.706555128097534, + "learning_rate": 9.51155846608701e-06, + "loss": 0.659, + "step": 3699 + }, + { + "epoch": 0.4818430300663803, + "grad_norm": 3.1037871837615967, + "learning_rate": 9.510657173302878e-06, + "loss": 0.6932, + "step": 3702 + }, + { + "epoch": 0.48223350253807107, + "grad_norm": 2.7866525650024414, + "learning_rate": 9.509755092516427e-06, + "loss": 0.6329, + "step": 3705 + }, + { + "epoch": 0.4826239750097618, + "grad_norm": 2.528336524963379, + "learning_rate": 9.508852223885248e-06, + "loss": 0.5991, + "step": 3708 + }, + { + "epoch": 0.48301444748145256, + "grad_norm": 2.683349609375, + "learning_rate": 9.507948567567073e-06, + "loss": 0.6348, + "step": 3711 + }, + { + "epoch": 0.48340491995314333, + "grad_norm": 3.169064998626709, + "learning_rate": 9.507044123719764e-06, + "loss": 0.758, + "step": 3714 + }, + { + 
"epoch": 0.48379539242483405, + "grad_norm": 2.669492721557617, + "learning_rate": 9.50613889250133e-06, + "loss": 0.6458, + "step": 3717 + }, + { + "epoch": 0.4841858648965248, + "grad_norm": 4.006582736968994, + "learning_rate": 9.50523287406991e-06, + "loss": 0.6128, + "step": 3720 + }, + { + "epoch": 0.48457633736821554, + "grad_norm": 2.645681858062744, + "learning_rate": 9.504326068583784e-06, + "loss": 0.651, + "step": 3723 + }, + { + "epoch": 0.4849668098399063, + "grad_norm": 2.4632298946380615, + "learning_rate": 9.503418476201371e-06, + "loss": 0.5938, + "step": 3726 + }, + { + "epoch": 0.485357282311597, + "grad_norm": 2.407679557800293, + "learning_rate": 9.502510097081223e-06, + "loss": 0.7057, + "step": 3729 + }, + { + "epoch": 0.4857477547832878, + "grad_norm": 2.6753084659576416, + "learning_rate": 9.501600931382034e-06, + "loss": 0.7167, + "step": 3732 + }, + { + "epoch": 0.4861382272549785, + "grad_norm": 2.707786798477173, + "learning_rate": 9.500690979262632e-06, + "loss": 0.7135, + "step": 3735 + }, + { + "epoch": 0.4865286997266693, + "grad_norm": 2.5371201038360596, + "learning_rate": 9.499780240881981e-06, + "loss": 0.7335, + "step": 3738 + }, + { + "epoch": 0.48691917219836, + "grad_norm": 2.661376953125, + "learning_rate": 9.498868716399192e-06, + "loss": 0.6258, + "step": 3741 + }, + { + "epoch": 0.4873096446700508, + "grad_norm": 2.647146463394165, + "learning_rate": 9.4979564059735e-06, + "loss": 0.7245, + "step": 3744 + }, + { + "epoch": 0.4877001171417415, + "grad_norm": 3.4181723594665527, + "learning_rate": 9.497043309764289e-06, + "loss": 0.7563, + "step": 3747 + }, + { + "epoch": 0.48809058961343227, + "grad_norm": 3.5437850952148438, + "learning_rate": 9.496129427931069e-06, + "loss": 0.7247, + "step": 3750 + }, + { + "epoch": 0.488481062085123, + "grad_norm": 3.7329277992248535, + "learning_rate": 9.495214760633498e-06, + "loss": 0.6169, + "step": 3753 + }, + { + "epoch": 0.48887153455681376, + "grad_norm": 2.8779706954956055, + "learning_rate": 9.494299308031365e-06, + "loss": 0.6749, + "step": 3756 + }, + { + "epoch": 0.4892620070285045, + "grad_norm": 2.7706687450408936, + "learning_rate": 9.493383070284597e-06, + "loss": 0.6352, + "step": 3759 + }, + { + "epoch": 0.48965247950019525, + "grad_norm": 2.6291885375976562, + "learning_rate": 9.49246604755326e-06, + "loss": 0.5958, + "step": 3762 + }, + { + "epoch": 0.49004295197188597, + "grad_norm": 2.48679256439209, + "learning_rate": 9.491548239997555e-06, + "loss": 0.6365, + "step": 3765 + }, + { + "epoch": 0.49043342444357674, + "grad_norm": 2.522580862045288, + "learning_rate": 9.490629647777821e-06, + "loss": 0.6723, + "step": 3768 + }, + { + "epoch": 0.49082389691526745, + "grad_norm": 2.4899022579193115, + "learning_rate": 9.489710271054533e-06, + "loss": 0.651, + "step": 3771 + }, + { + "epoch": 0.4912143693869582, + "grad_norm": 3.645275592803955, + "learning_rate": 9.488790109988308e-06, + "loss": 0.6087, + "step": 3774 + }, + { + "epoch": 0.49160484185864894, + "grad_norm": 2.5229358673095703, + "learning_rate": 9.487869164739893e-06, + "loss": 0.6372, + "step": 3777 + }, + { + "epoch": 0.4919953143303397, + "grad_norm": 2.5674540996551514, + "learning_rate": 9.486947435470176e-06, + "loss": 0.585, + "step": 3780 + }, + { + "epoch": 0.49238578680203043, + "grad_norm": 2.749277353286743, + "learning_rate": 9.486024922340181e-06, + "loss": 0.6834, + "step": 3783 + }, + { + "epoch": 0.4927762592737212, + "grad_norm": 2.4357573986053467, + "learning_rate": 9.485101625511071e-06, + "loss": 
0.6562, + "step": 3786 + }, + { + "epoch": 0.4931667317454119, + "grad_norm": 2.85579776763916, + "learning_rate": 9.48417754514414e-06, + "loss": 0.6401, + "step": 3789 + }, + { + "epoch": 0.4935572042171027, + "grad_norm": 2.402381658554077, + "learning_rate": 9.483252681400825e-06, + "loss": 0.7112, + "step": 3792 + }, + { + "epoch": 0.4939476766887934, + "grad_norm": 3.6302082538604736, + "learning_rate": 9.4823270344427e-06, + "loss": 0.71, + "step": 3795 + }, + { + "epoch": 0.4943381491604842, + "grad_norm": 2.943791627883911, + "learning_rate": 9.48140060443147e-06, + "loss": 0.657, + "step": 3798 + }, + { + "epoch": 0.49472862163217496, + "grad_norm": 3.03796648979187, + "learning_rate": 9.480473391528982e-06, + "loss": 0.7657, + "step": 3801 + }, + { + "epoch": 0.4951190941038657, + "grad_norm": 3.2505815029144287, + "learning_rate": 9.479545395897219e-06, + "loss": 0.6439, + "step": 3804 + }, + { + "epoch": 0.49550956657555645, + "grad_norm": 2.497419834136963, + "learning_rate": 9.478616617698297e-06, + "loss": 0.7109, + "step": 3807 + }, + { + "epoch": 0.49590003904724717, + "grad_norm": 2.562572956085205, + "learning_rate": 9.477687057094476e-06, + "loss": 0.7191, + "step": 3810 + }, + { + "epoch": 0.49629051151893794, + "grad_norm": 2.6476757526397705, + "learning_rate": 9.476756714248142e-06, + "loss": 0.6237, + "step": 3813 + }, + { + "epoch": 0.49668098399062866, + "grad_norm": 2.27634859085083, + "learning_rate": 9.475825589321831e-06, + "loss": 0.635, + "step": 3816 + }, + { + "epoch": 0.4970714564623194, + "grad_norm": 2.4250454902648926, + "learning_rate": 9.474893682478205e-06, + "loss": 0.6073, + "step": 3819 + }, + { + "epoch": 0.49746192893401014, + "grad_norm": 2.7182376384735107, + "learning_rate": 9.473960993880068e-06, + "loss": 0.6665, + "step": 3822 + }, + { + "epoch": 0.4978524014057009, + "grad_norm": 2.8492777347564697, + "learning_rate": 9.473027523690355e-06, + "loss": 0.5861, + "step": 3825 + }, + { + "epoch": 0.49824287387739163, + "grad_norm": 2.5110251903533936, + "learning_rate": 9.472093272072142e-06, + "loss": 0.5882, + "step": 3828 + }, + { + "epoch": 0.4986333463490824, + "grad_norm": 2.5196032524108887, + "learning_rate": 9.471158239188645e-06, + "loss": 0.7131, + "step": 3831 + }, + { + "epoch": 0.4990238188207731, + "grad_norm": 2.7564871311187744, + "learning_rate": 9.470222425203209e-06, + "loss": 0.7012, + "step": 3834 + }, + { + "epoch": 0.4994142912924639, + "grad_norm": 2.5324935913085938, + "learning_rate": 9.469285830279318e-06, + "loss": 0.6357, + "step": 3837 + }, + { + "epoch": 0.4998047637641546, + "grad_norm": 2.3627541065216064, + "learning_rate": 9.468348454580596e-06, + "loss": 0.6458, + "step": 3840 + }, + { + "epoch": 0.5001952362358454, + "grad_norm": 3.514322519302368, + "learning_rate": 9.467410298270798e-06, + "loss": 0.5593, + "step": 3843 + }, + { + "epoch": 0.5005857087075362, + "grad_norm": 2.6262550354003906, + "learning_rate": 9.46647136151382e-06, + "loss": 0.6549, + "step": 3846 + }, + { + "epoch": 0.5009761811792268, + "grad_norm": 3.208850860595703, + "learning_rate": 9.46553164447369e-06, + "loss": 0.6028, + "step": 3849 + }, + { + "epoch": 0.5013666536509176, + "grad_norm": 2.4970576763153076, + "learning_rate": 9.464591147314577e-06, + "loss": 0.712, + "step": 3852 + }, + { + "epoch": 0.5017571261226084, + "grad_norm": 3.3862645626068115, + "learning_rate": 9.463649870200782e-06, + "loss": 0.5893, + "step": 3855 + }, + { + "epoch": 0.5021475985942991, + "grad_norm": 5.486196041107178, + "learning_rate": 
9.462707813296746e-06, + "loss": 0.7407, + "step": 3858 + }, + { + "epoch": 0.5025380710659898, + "grad_norm": 2.571362018585205, + "learning_rate": 9.46176497676704e-06, + "loss": 0.5659, + "step": 3861 + }, + { + "epoch": 0.5029285435376806, + "grad_norm": 2.665663957595825, + "learning_rate": 9.460821360776382e-06, + "loss": 0.7724, + "step": 3864 + }, + { + "epoch": 0.5033190160093713, + "grad_norm": 3.4792850017547607, + "learning_rate": 9.459876965489614e-06, + "loss": 0.6739, + "step": 3867 + }, + { + "epoch": 0.5037094884810621, + "grad_norm": 3.114149332046509, + "learning_rate": 9.458931791071723e-06, + "loss": 0.6073, + "step": 3870 + }, + { + "epoch": 0.5040999609527528, + "grad_norm": 4.276490211486816, + "learning_rate": 9.457985837687832e-06, + "loss": 0.7193, + "step": 3873 + }, + { + "epoch": 0.5044904334244436, + "grad_norm": 2.727604389190674, + "learning_rate": 9.457039105503188e-06, + "loss": 0.6752, + "step": 3876 + }, + { + "epoch": 0.5048809058961343, + "grad_norm": 2.9666244983673096, + "learning_rate": 9.456091594683192e-06, + "loss": 0.7623, + "step": 3879 + }, + { + "epoch": 0.5052713783678251, + "grad_norm": 2.6261138916015625, + "learning_rate": 9.455143305393367e-06, + "loss": 0.7193, + "step": 3882 + }, + { + "epoch": 0.5056618508395159, + "grad_norm": 3.4070770740509033, + "learning_rate": 9.454194237799379e-06, + "loss": 0.6537, + "step": 3885 + }, + { + "epoch": 0.5060523233112065, + "grad_norm": 2.4243500232696533, + "learning_rate": 9.453244392067028e-06, + "loss": 0.6781, + "step": 3888 + }, + { + "epoch": 0.5064427957828973, + "grad_norm": 3.1091771125793457, + "learning_rate": 9.45229376836225e-06, + "loss": 0.6486, + "step": 3891 + }, + { + "epoch": 0.5068332682545881, + "grad_norm": 2.519580841064453, + "learning_rate": 9.451342366851117e-06, + "loss": 0.6303, + "step": 3894 + }, + { + "epoch": 0.5072237407262788, + "grad_norm": 2.6116557121276855, + "learning_rate": 9.450390187699838e-06, + "loss": 0.6344, + "step": 3897 + }, + { + "epoch": 0.5076142131979695, + "grad_norm": 2.69616961479187, + "learning_rate": 9.449437231074755e-06, + "loss": 0.6245, + "step": 3900 + }, + { + "epoch": 0.5080046856696603, + "grad_norm": 2.675161600112915, + "learning_rate": 9.448483497142347e-06, + "loss": 0.6087, + "step": 3903 + }, + { + "epoch": 0.5083951581413511, + "grad_norm": 2.4945101737976074, + "learning_rate": 9.44752898606923e-06, + "loss": 0.6146, + "step": 3906 + }, + { + "epoch": 0.5087856306130418, + "grad_norm": 2.615118980407715, + "learning_rate": 9.446573698022155e-06, + "loss": 0.8205, + "step": 3909 + }, + { + "epoch": 0.5091761030847325, + "grad_norm": 4.189955234527588, + "learning_rate": 9.445617633168012e-06, + "loss": 0.6343, + "step": 3912 + }, + { + "epoch": 0.5095665755564233, + "grad_norm": 2.6967740058898926, + "learning_rate": 9.444660791673818e-06, + "loss": 0.58, + "step": 3915 + }, + { + "epoch": 0.509957048028114, + "grad_norm": 2.989888906478882, + "learning_rate": 9.443703173706734e-06, + "loss": 0.6818, + "step": 3918 + }, + { + "epoch": 0.5103475204998048, + "grad_norm": 3.485555410385132, + "learning_rate": 9.442744779434054e-06, + "loss": 0.612, + "step": 3921 + }, + { + "epoch": 0.5107379929714955, + "grad_norm": 2.7845234870910645, + "learning_rate": 9.441785609023208e-06, + "loss": 0.6826, + "step": 3924 + }, + { + "epoch": 0.5111284654431862, + "grad_norm": 2.6713454723358154, + "learning_rate": 9.44082566264176e-06, + "loss": 0.6219, + "step": 3927 + }, + { + "epoch": 0.511518937914877, + "grad_norm": 
3.361051321029663, + "learning_rate": 9.43986494045741e-06, + "loss": 0.669, + "step": 3930 + }, + { + "epoch": 0.5119094103865678, + "grad_norm": 3.308380365371704, + "learning_rate": 9.438903442637997e-06, + "loss": 0.6721, + "step": 3933 + }, + { + "epoch": 0.5122998828582584, + "grad_norm": 2.7254140377044678, + "learning_rate": 9.43794116935149e-06, + "loss": 0.6372, + "step": 3936 + }, + { + "epoch": 0.5126903553299492, + "grad_norm": 2.9412028789520264, + "learning_rate": 9.436978120765996e-06, + "loss": 0.7481, + "step": 3939 + }, + { + "epoch": 0.51308082780164, + "grad_norm": 2.711512804031372, + "learning_rate": 9.436014297049758e-06, + "loss": 0.7198, + "step": 3942 + }, + { + "epoch": 0.5134713002733308, + "grad_norm": 2.667794942855835, + "learning_rate": 9.435049698371155e-06, + "loss": 0.681, + "step": 3945 + }, + { + "epoch": 0.5138617727450214, + "grad_norm": 3.5074334144592285, + "learning_rate": 9.4340843248987e-06, + "loss": 0.6905, + "step": 3948 + }, + { + "epoch": 0.5142522452167122, + "grad_norm": 3.1205289363861084, + "learning_rate": 9.43311817680104e-06, + "loss": 0.7021, + "step": 3951 + }, + { + "epoch": 0.514642717688403, + "grad_norm": 2.667708396911621, + "learning_rate": 9.432151254246961e-06, + "loss": 0.7106, + "step": 3954 + }, + { + "epoch": 0.5150331901600937, + "grad_norm": 2.348116874694824, + "learning_rate": 9.431183557405383e-06, + "loss": 0.5503, + "step": 3957 + }, + { + "epoch": 0.5154236626317844, + "grad_norm": 3.321404457092285, + "learning_rate": 9.430215086445358e-06, + "loss": 0.6023, + "step": 3960 + }, + { + "epoch": 0.5158141351034752, + "grad_norm": 2.7746543884277344, + "learning_rate": 9.429245841536079e-06, + "loss": 0.678, + "step": 3963 + }, + { + "epoch": 0.516204607575166, + "grad_norm": 2.634472131729126, + "learning_rate": 9.428275822846868e-06, + "loss": 0.6451, + "step": 3966 + }, + { + "epoch": 0.5165950800468567, + "grad_norm": 2.5320868492126465, + "learning_rate": 9.427305030547185e-06, + "loss": 0.6206, + "step": 3969 + }, + { + "epoch": 0.5169855525185475, + "grad_norm": 2.5869338512420654, + "learning_rate": 9.42633346480663e-06, + "loss": 0.6766, + "step": 3972 + }, + { + "epoch": 0.5173760249902382, + "grad_norm": 2.589397430419922, + "learning_rate": 9.425361125794928e-06, + "loss": 0.7054, + "step": 3975 + }, + { + "epoch": 0.5177664974619289, + "grad_norm": 2.9746224880218506, + "learning_rate": 9.424388013681947e-06, + "loss": 0.7086, + "step": 3978 + }, + { + "epoch": 0.5181569699336197, + "grad_norm": 2.565460205078125, + "learning_rate": 9.42341412863769e-06, + "loss": 0.6858, + "step": 3981 + }, + { + "epoch": 0.5185474424053105, + "grad_norm": 2.5512335300445557, + "learning_rate": 9.422439470832288e-06, + "loss": 0.6332, + "step": 3984 + }, + { + "epoch": 0.5189379148770011, + "grad_norm": 3.2780306339263916, + "learning_rate": 9.421464040436012e-06, + "loss": 0.664, + "step": 3987 + }, + { + "epoch": 0.5193283873486919, + "grad_norm": 2.374025821685791, + "learning_rate": 9.420487837619272e-06, + "loss": 0.5543, + "step": 3990 + }, + { + "epoch": 0.5197188598203827, + "grad_norm": 2.818821907043457, + "learning_rate": 9.419510862552608e-06, + "loss": 0.6181, + "step": 3993 + }, + { + "epoch": 0.5201093322920735, + "grad_norm": 2.3839192390441895, + "learning_rate": 9.41853311540669e-06, + "loss": 0.6783, + "step": 3996 + }, + { + "epoch": 0.5204998047637641, + "grad_norm": 2.5750620365142822, + "learning_rate": 9.417554596352334e-06, + "loss": 0.6101, + "step": 3999 + }, + { + "epoch": 
0.5208902772354549, + "grad_norm": 2.7210500240325928, + "learning_rate": 9.416575305560482e-06, + "loss": 0.749, + "step": 4002 + }, + { + "epoch": 0.5212807497071457, + "grad_norm": 2.856265068054199, + "learning_rate": 9.415595243202217e-06, + "loss": 0.6108, + "step": 4005 + }, + { + "epoch": 0.5216712221788364, + "grad_norm": 2.2772696018218994, + "learning_rate": 9.41461440944875e-06, + "loss": 0.7071, + "step": 4008 + }, + { + "epoch": 0.5220616946505271, + "grad_norm": 2.629115104675293, + "learning_rate": 9.413632804471434e-06, + "loss": 0.6378, + "step": 4011 + }, + { + "epoch": 0.5224521671222179, + "grad_norm": 2.853085517883301, + "learning_rate": 9.412650428441752e-06, + "loss": 0.6871, + "step": 4014 + }, + { + "epoch": 0.5228426395939086, + "grad_norm": 2.5856900215148926, + "learning_rate": 9.411667281531322e-06, + "loss": 0.6206, + "step": 4017 + }, + { + "epoch": 0.5232331120655994, + "grad_norm": 2.705491065979004, + "learning_rate": 9.4106833639119e-06, + "loss": 0.5966, + "step": 4020 + }, + { + "epoch": 0.5236235845372901, + "grad_norm": 2.5959479808807373, + "learning_rate": 9.409698675755372e-06, + "loss": 0.6265, + "step": 4023 + }, + { + "epoch": 0.5240140570089808, + "grad_norm": 2.661630630493164, + "learning_rate": 9.408713217233762e-06, + "loss": 0.6688, + "step": 4026 + }, + { + "epoch": 0.5244045294806716, + "grad_norm": 2.437685966491699, + "learning_rate": 9.40772698851923e-06, + "loss": 0.6566, + "step": 4029 + }, + { + "epoch": 0.5247950019523624, + "grad_norm": 2.658416986465454, + "learning_rate": 9.406739989784061e-06, + "loss": 0.679, + "step": 4032 + }, + { + "epoch": 0.5251854744240531, + "grad_norm": 3.007324695587158, + "learning_rate": 9.40575222120069e-06, + "loss": 0.6227, + "step": 4035 + }, + { + "epoch": 0.5255759468957438, + "grad_norm": 2.6084835529327393, + "learning_rate": 9.404763682941673e-06, + "loss": 0.6149, + "step": 4038 + }, + { + "epoch": 0.5259664193674346, + "grad_norm": 3.3151960372924805, + "learning_rate": 9.403774375179707e-06, + "loss": 0.6682, + "step": 4041 + }, + { + "epoch": 0.5263568918391254, + "grad_norm": 2.847337007522583, + "learning_rate": 9.402784298087622e-06, + "loss": 0.596, + "step": 4044 + }, + { + "epoch": 0.526747364310816, + "grad_norm": 3.080592393875122, + "learning_rate": 9.401793451838382e-06, + "loss": 0.7028, + "step": 4047 + }, + { + "epoch": 0.5271378367825068, + "grad_norm": 2.5581657886505127, + "learning_rate": 9.400801836605085e-06, + "loss": 0.6566, + "step": 4050 + }, + { + "epoch": 0.5275283092541976, + "grad_norm": 2.55700421333313, + "learning_rate": 9.399809452560966e-06, + "loss": 0.6734, + "step": 4053 + }, + { + "epoch": 0.5279187817258884, + "grad_norm": 2.44401478767395, + "learning_rate": 9.398816299879392e-06, + "loss": 0.5712, + "step": 4056 + }, + { + "epoch": 0.5283092541975791, + "grad_norm": 2.4714529514312744, + "learning_rate": 9.397822378733864e-06, + "loss": 0.6925, + "step": 4059 + }, + { + "epoch": 0.5286997266692698, + "grad_norm": 3.4595723152160645, + "learning_rate": 9.396827689298018e-06, + "loss": 0.6653, + "step": 4062 + }, + { + "epoch": 0.5290901991409606, + "grad_norm": 2.528628349304199, + "learning_rate": 9.395832231745624e-06, + "loss": 0.6362, + "step": 4065 + }, + { + "epoch": 0.5294806716126513, + "grad_norm": 2.4336354732513428, + "learning_rate": 9.394836006250587e-06, + "loss": 0.6348, + "step": 4068 + }, + { + "epoch": 0.5298711440843421, + "grad_norm": 2.7131993770599365, + "learning_rate": 9.393839012986944e-06, + "loss": 0.675, + "step": 4071 
+ }, + { + "epoch": 0.5302616165560328, + "grad_norm": 2.5495808124542236, + "learning_rate": 9.39284125212887e-06, + "loss": 0.6013, + "step": 4074 + }, + { + "epoch": 0.5306520890277235, + "grad_norm": 2.3943541049957275, + "learning_rate": 9.39184272385067e-06, + "loss": 0.653, + "step": 4077 + }, + { + "epoch": 0.5310425614994143, + "grad_norm": 2.781402111053467, + "learning_rate": 9.390843428326785e-06, + "loss": 0.7808, + "step": 4080 + }, + { + "epoch": 0.5314330339711051, + "grad_norm": 3.896851062774658, + "learning_rate": 9.38984336573179e-06, + "loss": 0.6671, + "step": 4083 + }, + { + "epoch": 0.5318235064427957, + "grad_norm": 2.4791414737701416, + "learning_rate": 9.388842536240395e-06, + "loss": 0.7199, + "step": 4086 + }, + { + "epoch": 0.5322139789144865, + "grad_norm": 2.551300048828125, + "learning_rate": 9.387840940027439e-06, + "loss": 0.5786, + "step": 4089 + }, + { + "epoch": 0.5326044513861773, + "grad_norm": 2.9271490573883057, + "learning_rate": 9.386838577267906e-06, + "loss": 0.755, + "step": 4092 + }, + { + "epoch": 0.5329949238578681, + "grad_norm": 2.4005637168884277, + "learning_rate": 9.385835448136897e-06, + "loss": 0.662, + "step": 4095 + }, + { + "epoch": 0.5333853963295587, + "grad_norm": 2.360596179962158, + "learning_rate": 9.384831552809665e-06, + "loss": 0.6733, + "step": 4098 + }, + { + "epoch": 0.5337758688012495, + "grad_norm": 2.809666395187378, + "learning_rate": 9.383826891461583e-06, + "loss": 0.5828, + "step": 4101 + }, + { + "epoch": 0.5341663412729403, + "grad_norm": 2.80281138420105, + "learning_rate": 9.382821464268166e-06, + "loss": 0.6344, + "step": 4104 + }, + { + "epoch": 0.534556813744631, + "grad_norm": 2.5210893154144287, + "learning_rate": 9.38181527140506e-06, + "loss": 0.6828, + "step": 4107 + }, + { + "epoch": 0.5349472862163217, + "grad_norm": 3.6144542694091797, + "learning_rate": 9.380808313048045e-06, + "loss": 0.7309, + "step": 4110 + }, + { + "epoch": 0.5353377586880125, + "grad_norm": 2.6584270000457764, + "learning_rate": 9.379800589373032e-06, + "loss": 0.6842, + "step": 4113 + }, + { + "epoch": 0.5357282311597032, + "grad_norm": 2.685014247894287, + "learning_rate": 9.378792100556069e-06, + "loss": 0.64, + "step": 4116 + }, + { + "epoch": 0.536118703631394, + "grad_norm": 2.667910099029541, + "learning_rate": 9.37778284677334e-06, + "loss": 0.7603, + "step": 4119 + }, + { + "epoch": 0.5365091761030847, + "grad_norm": 2.820080041885376, + "learning_rate": 9.376772828201155e-06, + "loss": 0.6083, + "step": 4122 + }, + { + "epoch": 0.5368996485747755, + "grad_norm": 2.422607898712158, + "learning_rate": 9.375762045015966e-06, + "loss": 0.6028, + "step": 4125 + }, + { + "epoch": 0.5372901210464662, + "grad_norm": 3.0713932514190674, + "learning_rate": 9.374750497394352e-06, + "loss": 0.7543, + "step": 4128 + }, + { + "epoch": 0.537680593518157, + "grad_norm": 2.311880111694336, + "learning_rate": 9.373738185513028e-06, + "loss": 0.6959, + "step": 4131 + }, + { + "epoch": 0.5380710659898477, + "grad_norm": 2.619349241256714, + "learning_rate": 9.372725109548846e-06, + "loss": 0.7416, + "step": 4134 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 2.9388935565948486, + "learning_rate": 9.371711269678786e-06, + "loss": 0.737, + "step": 4137 + }, + { + "epoch": 0.5388520109332292, + "grad_norm": 2.5478711128234863, + "learning_rate": 9.370696666079964e-06, + "loss": 0.7358, + "step": 4140 + }, + { + "epoch": 0.53924248340492, + "grad_norm": 4.10902214050293, + "learning_rate": 9.369681298929629e-06, + "loss": 0.6521, 
+ "step": 4143 + }, + { + "epoch": 0.5396329558766108, + "grad_norm": 2.6128830909729004, + "learning_rate": 9.368665168405163e-06, + "loss": 0.663, + "step": 4146 + }, + { + "epoch": 0.5400234283483014, + "grad_norm": 3.5728492736816406, + "learning_rate": 9.367648274684082e-06, + "loss": 0.6235, + "step": 4149 + }, + { + "epoch": 0.5404139008199922, + "grad_norm": 3.0014986991882324, + "learning_rate": 9.366630617944037e-06, + "loss": 0.607, + "step": 4152 + }, + { + "epoch": 0.540804373291683, + "grad_norm": 2.6722731590270996, + "learning_rate": 9.365612198362807e-06, + "loss": 0.6592, + "step": 4155 + }, + { + "epoch": 0.5411948457633737, + "grad_norm": 3.075171947479248, + "learning_rate": 9.364593016118311e-06, + "loss": 0.7503, + "step": 4158 + }, + { + "epoch": 0.5415853182350644, + "grad_norm": 2.6275832653045654, + "learning_rate": 9.363573071388598e-06, + "loss": 0.6354, + "step": 4161 + }, + { + "epoch": 0.5419757907067552, + "grad_norm": 2.640997886657715, + "learning_rate": 9.362552364351849e-06, + "loss": 0.6437, + "step": 4164 + }, + { + "epoch": 0.5423662631784459, + "grad_norm": 4.610156059265137, + "learning_rate": 9.361530895186378e-06, + "loss": 0.7229, + "step": 4167 + }, + { + "epoch": 0.5427567356501367, + "grad_norm": 2.4628376960754395, + "learning_rate": 9.360508664070634e-06, + "loss": 0.7439, + "step": 4170 + }, + { + "epoch": 0.5431472081218274, + "grad_norm": 3.297962188720703, + "learning_rate": 9.359485671183202e-06, + "loss": 0.7502, + "step": 4173 + }, + { + "epoch": 0.5435376805935181, + "grad_norm": 2.7493808269500732, + "learning_rate": 9.358461916702793e-06, + "loss": 0.5738, + "step": 4176 + }, + { + "epoch": 0.5439281530652089, + "grad_norm": 2.37622332572937, + "learning_rate": 9.357437400808256e-06, + "loss": 0.6252, + "step": 4179 + }, + { + "epoch": 0.5443186255368997, + "grad_norm": 2.4607114791870117, + "learning_rate": 9.356412123678572e-06, + "loss": 0.6813, + "step": 4182 + }, + { + "epoch": 0.5447090980085904, + "grad_norm": 3.2492547035217285, + "learning_rate": 9.355386085492855e-06, + "loss": 0.6283, + "step": 4185 + }, + { + "epoch": 0.5450995704802811, + "grad_norm": 2.3244526386260986, + "learning_rate": 9.35435928643035e-06, + "loss": 0.5893, + "step": 4188 + }, + { + "epoch": 0.5454900429519719, + "grad_norm": 3.75888729095459, + "learning_rate": 9.353331726670438e-06, + "loss": 0.7102, + "step": 4191 + }, + { + "epoch": 0.5458805154236627, + "grad_norm": 2.8710289001464844, + "learning_rate": 9.352303406392634e-06, + "loss": 0.6786, + "step": 4194 + }, + { + "epoch": 0.5462709878953533, + "grad_norm": 2.649484634399414, + "learning_rate": 9.351274325776578e-06, + "loss": 0.6359, + "step": 4197 + }, + { + "epoch": 0.5466614603670441, + "grad_norm": 2.539466381072998, + "learning_rate": 9.350244485002051e-06, + "loss": 0.7931, + "step": 4200 + }, + { + "epoch": 0.5470519328387349, + "grad_norm": 2.964155912399292, + "learning_rate": 9.349213884248967e-06, + "loss": 0.6971, + "step": 4203 + }, + { + "epoch": 0.5474424053104257, + "grad_norm": 2.4749128818511963, + "learning_rate": 9.348182523697365e-06, + "loss": 0.6917, + "step": 4206 + }, + { + "epoch": 0.5478328777821163, + "grad_norm": 2.6950347423553467, + "learning_rate": 9.347150403527422e-06, + "loss": 0.6089, + "step": 4209 + }, + { + "epoch": 0.5482233502538071, + "grad_norm": 3.4852073192596436, + "learning_rate": 9.34611752391945e-06, + "loss": 0.6896, + "step": 4212 + }, + { + "epoch": 0.5486138227254979, + "grad_norm": 2.6837730407714844, + "learning_rate": 
9.34508388505389e-06, + "loss": 0.6311, + "step": 4215 + }, + { + "epoch": 0.5490042951971886, + "grad_norm": 3.1561684608459473, + "learning_rate": 9.344049487111316e-06, + "loss": 0.6248, + "step": 4218 + }, + { + "epoch": 0.5493947676688793, + "grad_norm": 3.7130353450775146, + "learning_rate": 9.343014330272432e-06, + "loss": 0.6496, + "step": 4221 + }, + { + "epoch": 0.5497852401405701, + "grad_norm": 3.125425338745117, + "learning_rate": 9.341978414718084e-06, + "loss": 0.7285, + "step": 4224 + }, + { + "epoch": 0.5501757126122608, + "grad_norm": 2.448007583618164, + "learning_rate": 9.34094174062924e-06, + "loss": 0.7151, + "step": 4227 + }, + { + "epoch": 0.5505661850839516, + "grad_norm": 2.704099655151367, + "learning_rate": 9.339904308187006e-06, + "loss": 0.7632, + "step": 4230 + }, + { + "epoch": 0.5509566575556424, + "grad_norm": 2.6874077320098877, + "learning_rate": 9.33886611757262e-06, + "loss": 0.6707, + "step": 4233 + }, + { + "epoch": 0.551347130027333, + "grad_norm": 2.497955799102783, + "learning_rate": 9.33782716896745e-06, + "loss": 0.6996, + "step": 4236 + }, + { + "epoch": 0.5517376024990238, + "grad_norm": 2.5674867630004883, + "learning_rate": 9.336787462553001e-06, + "loss": 0.6761, + "step": 4239 + }, + { + "epoch": 0.5521280749707146, + "grad_norm": 2.8246846199035645, + "learning_rate": 9.335746998510902e-06, + "loss": 0.7313, + "step": 4242 + }, + { + "epoch": 0.5525185474424054, + "grad_norm": 3.6085305213928223, + "learning_rate": 9.334705777022926e-06, + "loss": 0.7069, + "step": 4245 + }, + { + "epoch": 0.552909019914096, + "grad_norm": 2.782156229019165, + "learning_rate": 9.333663798270969e-06, + "loss": 0.6735, + "step": 4248 + }, + { + "epoch": 0.5532994923857868, + "grad_norm": 2.7994630336761475, + "learning_rate": 9.332621062437064e-06, + "loss": 0.6201, + "step": 4251 + }, + { + "epoch": 0.5536899648574776, + "grad_norm": 2.498595952987671, + "learning_rate": 9.331577569703374e-06, + "loss": 0.7614, + "step": 4254 + }, + { + "epoch": 0.5540804373291683, + "grad_norm": 2.729330062866211, + "learning_rate": 9.330533320252193e-06, + "loss": 0.6992, + "step": 4257 + }, + { + "epoch": 0.554470909800859, + "grad_norm": 2.681333303451538, + "learning_rate": 9.32948831426595e-06, + "loss": 0.7586, + "step": 4260 + }, + { + "epoch": 0.5548613822725498, + "grad_norm": 2.6096370220184326, + "learning_rate": 9.32844255192721e-06, + "loss": 0.6834, + "step": 4263 + }, + { + "epoch": 0.5552518547442405, + "grad_norm": 2.6331887245178223, + "learning_rate": 9.32739603341866e-06, + "loss": 0.6116, + "step": 4266 + }, + { + "epoch": 0.5556423272159313, + "grad_norm": 2.9159796237945557, + "learning_rate": 9.326348758923127e-06, + "loss": 0.6944, + "step": 4269 + }, + { + "epoch": 0.556032799687622, + "grad_norm": 2.7244224548339844, + "learning_rate": 9.325300728623567e-06, + "loss": 0.7559, + "step": 4272 + }, + { + "epoch": 0.5564232721593128, + "grad_norm": 2.5639004707336426, + "learning_rate": 9.324251942703068e-06, + "loss": 0.7335, + "step": 4275 + }, + { + "epoch": 0.5568137446310035, + "grad_norm": 2.4377241134643555, + "learning_rate": 9.323202401344852e-06, + "loss": 0.667, + "step": 4278 + }, + { + "epoch": 0.5572042171026943, + "grad_norm": 2.6609203815460205, + "learning_rate": 9.322152104732272e-06, + "loss": 0.6463, + "step": 4281 + }, + { + "epoch": 0.557594689574385, + "grad_norm": 3.7862682342529297, + "learning_rate": 9.321101053048812e-06, + "loss": 0.6735, + "step": 4284 + }, + { + "epoch": 0.5579851620460757, + "grad_norm": 
3.0916101932525635, + "learning_rate": 9.320049246478086e-06, + "loss": 0.6867, + "step": 4287 + }, + { + "epoch": 0.5583756345177665, + "grad_norm": 2.4427428245544434, + "learning_rate": 9.318996685203848e-06, + "loss": 0.6473, + "step": 4290 + }, + { + "epoch": 0.5587661069894573, + "grad_norm": 2.9090192317962646, + "learning_rate": 9.317943369409973e-06, + "loss": 0.6461, + "step": 4293 + }, + { + "epoch": 0.5591565794611479, + "grad_norm": 5.3703742027282715, + "learning_rate": 9.316889299280475e-06, + "loss": 0.716, + "step": 4296 + }, + { + "epoch": 0.5595470519328387, + "grad_norm": 4.018325328826904, + "learning_rate": 9.315834474999498e-06, + "loss": 0.716, + "step": 4299 + }, + { + "epoch": 0.5599375244045295, + "grad_norm": 2.85209059715271, + "learning_rate": 9.314778896751317e-06, + "loss": 0.6133, + "step": 4302 + }, + { + "epoch": 0.5603279968762203, + "grad_norm": 2.487459182739258, + "learning_rate": 9.31372256472034e-06, + "loss": 0.6144, + "step": 4305 + }, + { + "epoch": 0.5607184693479109, + "grad_norm": 2.43287992477417, + "learning_rate": 9.312665479091106e-06, + "loss": 0.5989, + "step": 4308 + }, + { + "epoch": 0.5611089418196017, + "grad_norm": 3.1222102642059326, + "learning_rate": 9.311607640048286e-06, + "loss": 0.5524, + "step": 4311 + }, + { + "epoch": 0.5614994142912925, + "grad_norm": 2.515286445617676, + "learning_rate": 9.31054904777668e-06, + "loss": 0.6501, + "step": 4314 + }, + { + "epoch": 0.5618898867629832, + "grad_norm": 2.938828706741333, + "learning_rate": 9.309489702461223e-06, + "loss": 0.6829, + "step": 4317 + }, + { + "epoch": 0.562280359234674, + "grad_norm": 2.4656574726104736, + "learning_rate": 9.308429604286982e-06, + "loss": 0.6517, + "step": 4320 + }, + { + "epoch": 0.5626708317063647, + "grad_norm": 3.7046689987182617, + "learning_rate": 9.307368753439153e-06, + "loss": 0.6415, + "step": 4323 + }, + { + "epoch": 0.5630613041780554, + "grad_norm": 3.0883255004882812, + "learning_rate": 9.306307150103064e-06, + "loss": 0.7135, + "step": 4326 + }, + { + "epoch": 0.5634517766497462, + "grad_norm": 2.469595432281494, + "learning_rate": 9.305244794464174e-06, + "loss": 0.5753, + "step": 4329 + }, + { + "epoch": 0.563842249121437, + "grad_norm": 2.6837077140808105, + "learning_rate": 9.304181686708077e-06, + "loss": 0.6382, + "step": 4332 + }, + { + "epoch": 0.5642327215931277, + "grad_norm": 2.8714263439178467, + "learning_rate": 9.303117827020493e-06, + "loss": 0.7308, + "step": 4335 + }, + { + "epoch": 0.5646231940648184, + "grad_norm": 2.2070488929748535, + "learning_rate": 9.302053215587276e-06, + "loss": 0.7811, + "step": 4338 + }, + { + "epoch": 0.5650136665365092, + "grad_norm": 2.573568105697632, + "learning_rate": 9.300987852594414e-06, + "loss": 0.7276, + "step": 4341 + }, + { + "epoch": 0.5654041390082, + "grad_norm": 2.5360209941864014, + "learning_rate": 9.299921738228023e-06, + "loss": 0.6433, + "step": 4344 + }, + { + "epoch": 0.5657946114798906, + "grad_norm": 3.1989572048187256, + "learning_rate": 9.298854872674348e-06, + "loss": 0.6491, + "step": 4347 + }, + { + "epoch": 0.5661850839515814, + "grad_norm": 2.791049003601074, + "learning_rate": 9.297787256119772e-06, + "loss": 0.6953, + "step": 4350 + }, + { + "epoch": 0.5665755564232722, + "grad_norm": 3.882077693939209, + "learning_rate": 9.296718888750802e-06, + "loss": 0.697, + "step": 4353 + }, + { + "epoch": 0.566966028894963, + "grad_norm": 2.5977842807769775, + "learning_rate": 9.295649770754082e-06, + "loss": 0.6448, + "step": 4356 + }, + { + "epoch": 
0.5673565013666536, + "grad_norm": 2.584838628768921, + "learning_rate": 9.294579902316382e-06, + "loss": 0.6445, + "step": 4359 + }, + { + "epoch": 0.5677469738383444, + "grad_norm": 3.0195114612579346, + "learning_rate": 9.293509283624611e-06, + "loss": 0.7209, + "step": 4362 + }, + { + "epoch": 0.5681374463100352, + "grad_norm": 3.4873313903808594, + "learning_rate": 9.292437914865798e-06, + "loss": 0.7565, + "step": 4365 + }, + { + "epoch": 0.5685279187817259, + "grad_norm": 2.704925298690796, + "learning_rate": 9.291365796227111e-06, + "loss": 0.8317, + "step": 4368 + }, + { + "epoch": 0.5689183912534166, + "grad_norm": 2.3734493255615234, + "learning_rate": 9.290292927895848e-06, + "loss": 0.6708, + "step": 4371 + }, + { + "epoch": 0.5693088637251074, + "grad_norm": 2.2318577766418457, + "learning_rate": 9.289219310059437e-06, + "loss": 0.6154, + "step": 4374 + }, + { + "epoch": 0.5696993361967981, + "grad_norm": 3.829906702041626, + "learning_rate": 9.288144942905432e-06, + "loss": 0.7834, + "step": 4377 + }, + { + "epoch": 0.5700898086684889, + "grad_norm": 2.0775136947631836, + "learning_rate": 9.28706982662153e-06, + "loss": 0.5858, + "step": 4380 + }, + { + "epoch": 0.5704802811401796, + "grad_norm": 2.4510486125946045, + "learning_rate": 9.285993961395548e-06, + "loss": 0.5897, + "step": 4383 + }, + { + "epoch": 0.5708707536118703, + "grad_norm": 2.438112735748291, + "learning_rate": 9.284917347415435e-06, + "loss": 0.6553, + "step": 4386 + }, + { + "epoch": 0.5712612260835611, + "grad_norm": 3.0546696186065674, + "learning_rate": 9.283839984869276e-06, + "loss": 0.6921, + "step": 4389 + }, + { + "epoch": 0.5716516985552519, + "grad_norm": 2.234543561935425, + "learning_rate": 9.282761873945285e-06, + "loss": 0.5942, + "step": 4392 + }, + { + "epoch": 0.5720421710269425, + "grad_norm": 2.697658061981201, + "learning_rate": 9.281683014831804e-06, + "loss": 0.6324, + "step": 4395 + }, + { + "epoch": 0.5724326434986333, + "grad_norm": 2.6841681003570557, + "learning_rate": 9.280603407717306e-06, + "loss": 0.5907, + "step": 4398 + }, + { + "epoch": 0.5728231159703241, + "grad_norm": 2.358556032180786, + "learning_rate": 9.2795230527904e-06, + "loss": 0.6638, + "step": 4401 + }, + { + "epoch": 0.5732135884420149, + "grad_norm": 4.04594087600708, + "learning_rate": 9.278441950239819e-06, + "loss": 0.6402, + "step": 4404 + }, + { + "epoch": 0.5736040609137056, + "grad_norm": 2.5940234661102295, + "learning_rate": 9.277360100254428e-06, + "loss": 0.6327, + "step": 4407 + }, + { + "epoch": 0.5739945333853963, + "grad_norm": 2.4827663898468018, + "learning_rate": 9.276277503023226e-06, + "loss": 0.6645, + "step": 4410 + }, + { + "epoch": 0.5743850058570871, + "grad_norm": 2.8493151664733887, + "learning_rate": 9.27519415873534e-06, + "loss": 0.6459, + "step": 4413 + }, + { + "epoch": 0.5747754783287778, + "grad_norm": 2.4425408840179443, + "learning_rate": 9.274110067580031e-06, + "loss": 0.5905, + "step": 4416 + }, + { + "epoch": 0.5751659508004686, + "grad_norm": 2.5381839275360107, + "learning_rate": 9.273025229746683e-06, + "loss": 0.7284, + "step": 4419 + }, + { + "epoch": 0.5755564232721593, + "grad_norm": 3.9269378185272217, + "learning_rate": 9.271939645424816e-06, + "loss": 0.6981, + "step": 4422 + }, + { + "epoch": 0.57594689574385, + "grad_norm": 2.5297110080718994, + "learning_rate": 9.27085331480408e-06, + "loss": 0.7223, + "step": 4425 + }, + { + "epoch": 0.5763373682155408, + "grad_norm": 2.864983320236206, + "learning_rate": 9.269766238074255e-06, + "loss": 0.7924, + 
"step": 4428 + }, + { + "epoch": 0.5767278406872316, + "grad_norm": 2.4377658367156982, + "learning_rate": 9.26867841542525e-06, + "loss": 0.6703, + "step": 4431 + }, + { + "epoch": 0.5771183131589223, + "grad_norm": 2.9342570304870605, + "learning_rate": 9.267589847047105e-06, + "loss": 0.6622, + "step": 4434 + }, + { + "epoch": 0.577508785630613, + "grad_norm": 2.2394070625305176, + "learning_rate": 9.266500533129994e-06, + "loss": 0.7219, + "step": 4437 + }, + { + "epoch": 0.5778992581023038, + "grad_norm": 3.120809316635132, + "learning_rate": 9.265410473864214e-06, + "loss": 0.6276, + "step": 4440 + }, + { + "epoch": 0.5782897305739946, + "grad_norm": 4.084946155548096, + "learning_rate": 9.264319669440197e-06, + "loss": 0.6136, + "step": 4443 + }, + { + "epoch": 0.5786802030456852, + "grad_norm": 3.8687705993652344, + "learning_rate": 9.263228120048504e-06, + "loss": 0.698, + "step": 4446 + }, + { + "epoch": 0.579070675517376, + "grad_norm": 2.582995891571045, + "learning_rate": 9.262135825879827e-06, + "loss": 0.6726, + "step": 4449 + }, + { + "epoch": 0.5794611479890668, + "grad_norm": 3.0313069820404053, + "learning_rate": 9.261042787124987e-06, + "loss": 0.6088, + "step": 4452 + }, + { + "epoch": 0.5798516204607576, + "grad_norm": 2.4391238689422607, + "learning_rate": 9.259949003974938e-06, + "loss": 0.6239, + "step": 4455 + }, + { + "epoch": 0.5802420929324482, + "grad_norm": 2.8740012645721436, + "learning_rate": 9.258854476620758e-06, + "loss": 0.6486, + "step": 4458 + }, + { + "epoch": 0.580632565404139, + "grad_norm": 2.8367395401000977, + "learning_rate": 9.257759205253662e-06, + "loss": 0.7247, + "step": 4461 + }, + { + "epoch": 0.5810230378758298, + "grad_norm": 2.558208703994751, + "learning_rate": 9.256663190064987e-06, + "loss": 0.6522, + "step": 4464 + }, + { + "epoch": 0.5814135103475205, + "grad_norm": 2.159445285797119, + "learning_rate": 9.25556643124621e-06, + "loss": 0.5987, + "step": 4467 + }, + { + "epoch": 0.5818039828192112, + "grad_norm": 2.406489849090576, + "learning_rate": 9.25446892898893e-06, + "loss": 0.7873, + "step": 4470 + }, + { + "epoch": 0.582194455290902, + "grad_norm": 2.4776813983917236, + "learning_rate": 9.253370683484876e-06, + "loss": 0.6753, + "step": 4473 + }, + { + "epoch": 0.5825849277625927, + "grad_norm": 2.6619131565093994, + "learning_rate": 9.252271694925913e-06, + "loss": 0.7201, + "step": 4476 + }, + { + "epoch": 0.5829754002342835, + "grad_norm": 2.598264455795288, + "learning_rate": 9.25117196350403e-06, + "loss": 0.679, + "step": 4479 + }, + { + "epoch": 0.5833658727059742, + "grad_norm": 3.2932846546173096, + "learning_rate": 9.250071489411348e-06, + "loss": 0.6224, + "step": 4482 + }, + { + "epoch": 0.583756345177665, + "grad_norm": 2.7969069480895996, + "learning_rate": 9.248970272840116e-06, + "loss": 0.5868, + "step": 4485 + }, + { + "epoch": 0.5841468176493557, + "grad_norm": 2.904038190841675, + "learning_rate": 9.247868313982719e-06, + "loss": 0.6435, + "step": 4488 + }, + { + "epoch": 0.5845372901210465, + "grad_norm": 2.6889734268188477, + "learning_rate": 9.246765613031661e-06, + "loss": 0.6655, + "step": 4491 + }, + { + "epoch": 0.5849277625927373, + "grad_norm": 2.590397834777832, + "learning_rate": 9.245662170179586e-06, + "loss": 0.6866, + "step": 4494 + }, + { + "epoch": 0.5853182350644279, + "grad_norm": 2.5878326892852783, + "learning_rate": 9.24455798561926e-06, + "loss": 0.5923, + "step": 4497 + }, + { + "epoch": 0.5857087075361187, + "grad_norm": 3.6354682445526123, + "learning_rate": 
9.243453059543586e-06, + "loss": 0.7215, + "step": 4500 + }, + { + "epoch": 0.5860991800078095, + "grad_norm": 2.499769687652588, + "learning_rate": 9.242347392145587e-06, + "loss": 0.7189, + "step": 4503 + }, + { + "epoch": 0.5864896524795002, + "grad_norm": 2.806671619415283, + "learning_rate": 9.241240983618423e-06, + "loss": 0.8219, + "step": 4506 + }, + { + "epoch": 0.5868801249511909, + "grad_norm": 2.4375195503234863, + "learning_rate": 9.240133834155382e-06, + "loss": 0.7025, + "step": 4509 + }, + { + "epoch": 0.5872705974228817, + "grad_norm": 3.987379789352417, + "learning_rate": 9.239025943949882e-06, + "loss": 0.6628, + "step": 4512 + }, + { + "epoch": 0.5876610698945725, + "grad_norm": 2.5904123783111572, + "learning_rate": 9.237917313195465e-06, + "loss": 0.6761, + "step": 4515 + }, + { + "epoch": 0.5880515423662632, + "grad_norm": 2.4467389583587646, + "learning_rate": 9.236807942085809e-06, + "loss": 0.5364, + "step": 4518 + }, + { + "epoch": 0.5884420148379539, + "grad_norm": 3.339595317840576, + "learning_rate": 9.235697830814718e-06, + "loss": 0.6519, + "step": 4521 + }, + { + "epoch": 0.5888324873096447, + "grad_norm": 4.496737003326416, + "learning_rate": 9.234586979576127e-06, + "loss": 0.6588, + "step": 4524 + }, + { + "epoch": 0.5892229597813354, + "grad_norm": 2.4723331928253174, + "learning_rate": 9.2334753885641e-06, + "loss": 0.6806, + "step": 4527 + }, + { + "epoch": 0.5896134322530262, + "grad_norm": 2.589547872543335, + "learning_rate": 9.232363057972828e-06, + "loss": 0.5903, + "step": 4530 + }, + { + "epoch": 0.5900039047247169, + "grad_norm": 3.4298384189605713, + "learning_rate": 9.231249987996632e-06, + "loss": 0.6546, + "step": 4533 + }, + { + "epoch": 0.5903943771964076, + "grad_norm": 2.480419158935547, + "learning_rate": 9.230136178829967e-06, + "loss": 0.7137, + "step": 4536 + }, + { + "epoch": 0.5907848496680984, + "grad_norm": 2.6856725215911865, + "learning_rate": 9.229021630667407e-06, + "loss": 0.6591, + "step": 4539 + }, + { + "epoch": 0.5911753221397892, + "grad_norm": 4.8847975730896, + "learning_rate": 9.227906343703668e-06, + "loss": 0.756, + "step": 4542 + }, + { + "epoch": 0.5915657946114798, + "grad_norm": 2.5259408950805664, + "learning_rate": 9.226790318133583e-06, + "loss": 0.592, + "step": 4545 + }, + { + "epoch": 0.5919562670831706, + "grad_norm": 3.381965160369873, + "learning_rate": 9.225673554152122e-06, + "loss": 0.7512, + "step": 4548 + }, + { + "epoch": 0.5923467395548614, + "grad_norm": 2.3359241485595703, + "learning_rate": 9.224556051954381e-06, + "loss": 0.6277, + "step": 4551 + }, + { + "epoch": 0.5927372120265522, + "grad_norm": 2.8028383255004883, + "learning_rate": 9.223437811735583e-06, + "loss": 0.7311, + "step": 4554 + }, + { + "epoch": 0.5931276844982428, + "grad_norm": 2.1525771617889404, + "learning_rate": 9.222318833691085e-06, + "loss": 0.6426, + "step": 4557 + }, + { + "epoch": 0.5935181569699336, + "grad_norm": 2.6036198139190674, + "learning_rate": 9.22119911801637e-06, + "loss": 0.6085, + "step": 4560 + }, + { + "epoch": 0.5939086294416244, + "grad_norm": 2.456698417663574, + "learning_rate": 9.220078664907048e-06, + "loss": 0.6745, + "step": 4563 + }, + { + "epoch": 0.5942991019133151, + "grad_norm": 3.952479600906372, + "learning_rate": 9.218957474558862e-06, + "loss": 0.6955, + "step": 4566 + }, + { + "epoch": 0.5946895743850058, + "grad_norm": 3.0322844982147217, + "learning_rate": 9.217835547167682e-06, + "loss": 0.7331, + "step": 4569 + }, + { + "epoch": 0.5950800468566966, + "grad_norm": 
2.9084672927856445, + "learning_rate": 9.216712882929503e-06, + "loss": 0.6295, + "step": 4572 + }, + { + "epoch": 0.5954705193283873, + "grad_norm": 2.524294376373291, + "learning_rate": 9.215589482040455e-06, + "loss": 0.6615, + "step": 4575 + }, + { + "epoch": 0.5958609918000781, + "grad_norm": 2.533430576324463, + "learning_rate": 9.21446534469679e-06, + "loss": 0.6768, + "step": 4578 + }, + { + "epoch": 0.5962514642717689, + "grad_norm": 2.4197540283203125, + "learning_rate": 9.213340471094899e-06, + "loss": 0.688, + "step": 4581 + }, + { + "epoch": 0.5966419367434596, + "grad_norm": 2.344994068145752, + "learning_rate": 9.212214861431289e-06, + "loss": 0.7125, + "step": 4584 + }, + { + "epoch": 0.5970324092151503, + "grad_norm": 2.436912775039673, + "learning_rate": 9.211088515902604e-06, + "loss": 0.6781, + "step": 4587 + }, + { + "epoch": 0.5974228816868411, + "grad_norm": 2.6664328575134277, + "learning_rate": 9.209961434705614e-06, + "loss": 0.6564, + "step": 4590 + }, + { + "epoch": 0.5978133541585319, + "grad_norm": 3.217984437942505, + "learning_rate": 9.20883361803722e-06, + "loss": 0.6577, + "step": 4593 + }, + { + "epoch": 0.5982038266302225, + "grad_norm": 2.4378035068511963, + "learning_rate": 9.207705066094445e-06, + "loss": 0.5963, + "step": 4596 + }, + { + "epoch": 0.5985942991019133, + "grad_norm": 2.4753284454345703, + "learning_rate": 9.206575779074448e-06, + "loss": 0.6725, + "step": 4599 + }, + { + "epoch": 0.5989847715736041, + "grad_norm": 2.7560603618621826, + "learning_rate": 9.20544575717451e-06, + "loss": 0.6058, + "step": 4602 + }, + { + "epoch": 0.5993752440452949, + "grad_norm": 2.4613637924194336, + "learning_rate": 9.204315000592046e-06, + "loss": 0.7212, + "step": 4605 + }, + { + "epoch": 0.5997657165169855, + "grad_norm": 2.4374942779541016, + "learning_rate": 9.203183509524596e-06, + "loss": 0.5989, + "step": 4608 + }, + { + "epoch": 0.6001561889886763, + "grad_norm": 2.783355236053467, + "learning_rate": 9.202051284169829e-06, + "loss": 0.6998, + "step": 4611 + }, + { + "epoch": 0.6005466614603671, + "grad_norm": 2.803696870803833, + "learning_rate": 9.200918324725543e-06, + "loss": 0.6278, + "step": 4614 + }, + { + "epoch": 0.6009371339320578, + "grad_norm": 3.1450366973876953, + "learning_rate": 9.199784631389663e-06, + "loss": 0.7003, + "step": 4617 + }, + { + "epoch": 0.6013276064037485, + "grad_norm": 3.2626290321350098, + "learning_rate": 9.198650204360241e-06, + "loss": 0.5592, + "step": 4620 + }, + { + "epoch": 0.6017180788754393, + "grad_norm": 2.694031238555908, + "learning_rate": 9.197515043835463e-06, + "loss": 0.6437, + "step": 4623 + }, + { + "epoch": 0.60210855134713, + "grad_norm": 3.2066242694854736, + "learning_rate": 9.196379150013638e-06, + "loss": 0.6952, + "step": 4626 + }, + { + "epoch": 0.6024990238188208, + "grad_norm": 3.181091547012329, + "learning_rate": 9.195242523093202e-06, + "loss": 0.6107, + "step": 4629 + }, + { + "epoch": 0.6028894962905115, + "grad_norm": 3.754946231842041, + "learning_rate": 9.194105163272722e-06, + "loss": 0.7689, + "step": 4632 + }, + { + "epoch": 0.6032799687622022, + "grad_norm": 3.5265393257141113, + "learning_rate": 9.192967070750895e-06, + "loss": 0.6393, + "step": 4635 + }, + { + "epoch": 0.603670441233893, + "grad_norm": 3.310450315475464, + "learning_rate": 9.191828245726539e-06, + "loss": 0.718, + "step": 4638 + }, + { + "epoch": 0.6040609137055838, + "grad_norm": 2.7861340045928955, + "learning_rate": 9.19068868839861e-06, + "loss": 0.6306, + "step": 4641 + }, + { + "epoch": 
0.6044513861772745, + "grad_norm": 2.8099071979522705, + "learning_rate": 9.189548398966181e-06, + "loss": 0.7781, + "step": 4644 + }, + { + "epoch": 0.6048418586489652, + "grad_norm": 2.284885883331299, + "learning_rate": 9.18840737762846e-06, + "loss": 0.5758, + "step": 4647 + }, + { + "epoch": 0.605232331120656, + "grad_norm": 2.39620304107666, + "learning_rate": 9.187265624584782e-06, + "loss": 0.5665, + "step": 4650 + }, + { + "epoch": 0.6056228035923468, + "grad_norm": 2.680431604385376, + "learning_rate": 9.186123140034607e-06, + "loss": 0.7102, + "step": 4653 + }, + { + "epoch": 0.6060132760640374, + "grad_norm": 2.5489349365234375, + "learning_rate": 9.184979924177527e-06, + "loss": 0.61, + "step": 4656 + }, + { + "epoch": 0.6064037485357282, + "grad_norm": 2.9585001468658447, + "learning_rate": 9.183835977213257e-06, + "loss": 0.6092, + "step": 4659 + }, + { + "epoch": 0.606794221007419, + "grad_norm": 2.7119574546813965, + "learning_rate": 9.182691299341643e-06, + "loss": 0.6834, + "step": 4662 + }, + { + "epoch": 0.6071846934791097, + "grad_norm": 2.6506659984588623, + "learning_rate": 9.181545890762661e-06, + "loss": 0.6116, + "step": 4665 + }, + { + "epoch": 0.6075751659508005, + "grad_norm": 2.5166819095611572, + "learning_rate": 9.180399751676407e-06, + "loss": 0.6865, + "step": 4668 + }, + { + "epoch": 0.6079656384224912, + "grad_norm": 3.43406343460083, + "learning_rate": 9.17925288228311e-06, + "loss": 0.6894, + "step": 4671 + }, + { + "epoch": 0.608356110894182, + "grad_norm": 2.9682881832122803, + "learning_rate": 9.178105282783127e-06, + "loss": 0.6195, + "step": 4674 + }, + { + "epoch": 0.6087465833658727, + "grad_norm": 4.136783123016357, + "learning_rate": 9.17695695337694e-06, + "loss": 0.5971, + "step": 4677 + }, + { + "epoch": 0.6091370558375635, + "grad_norm": 2.4573512077331543, + "learning_rate": 9.175807894265161e-06, + "loss": 0.7037, + "step": 4680 + }, + { + "epoch": 0.6095275283092542, + "grad_norm": 2.2731971740722656, + "learning_rate": 9.174658105648526e-06, + "loss": 0.6454, + "step": 4683 + }, + { + "epoch": 0.6099180007809449, + "grad_norm": 3.480694532394409, + "learning_rate": 9.173507587727904e-06, + "loss": 0.6724, + "step": 4686 + }, + { + "epoch": 0.6103084732526357, + "grad_norm": 2.483057975769043, + "learning_rate": 9.172356340704285e-06, + "loss": 0.7026, + "step": 4689 + }, + { + "epoch": 0.6106989457243265, + "grad_norm": 2.4843204021453857, + "learning_rate": 9.171204364778791e-06, + "loss": 0.6642, + "step": 4692 + }, + { + "epoch": 0.6110894181960171, + "grad_norm": 2.6703476905822754, + "learning_rate": 9.17005166015267e-06, + "loss": 0.5992, + "step": 4695 + }, + { + "epoch": 0.6114798906677079, + "grad_norm": 2.2671613693237305, + "learning_rate": 9.168898227027296e-06, + "loss": 0.6149, + "step": 4698 + }, + { + "epoch": 0.6118703631393987, + "grad_norm": 2.83113169670105, + "learning_rate": 9.167744065604171e-06, + "loss": 0.7512, + "step": 4701 + }, + { + "epoch": 0.6122608356110895, + "grad_norm": 2.2886712551116943, + "learning_rate": 9.166589176084925e-06, + "loss": 0.603, + "step": 4704 + }, + { + "epoch": 0.6126513080827801, + "grad_norm": 2.4751598834991455, + "learning_rate": 9.165433558671318e-06, + "loss": 0.6055, + "step": 4707 + }, + { + "epoch": 0.6130417805544709, + "grad_norm": 2.5859594345092773, + "learning_rate": 9.164277213565228e-06, + "loss": 0.6932, + "step": 4710 + }, + { + "epoch": 0.6134322530261617, + "grad_norm": 2.6246237754821777, + "learning_rate": 9.163120140968671e-06, + "loss": 0.5861, + "step": 
4713 + }, + { + "epoch": 0.6138227254978524, + "grad_norm": 2.5987634658813477, + "learning_rate": 9.161962341083784e-06, + "loss": 0.729, + "step": 4716 + }, + { + "epoch": 0.6142131979695431, + "grad_norm": 2.6038060188293457, + "learning_rate": 9.160803814112829e-06, + "loss": 0.6701, + "step": 4719 + }, + { + "epoch": 0.6146036704412339, + "grad_norm": 3.065664768218994, + "learning_rate": 9.159644560258201e-06, + "loss": 0.7233, + "step": 4722 + }, + { + "epoch": 0.6149941429129246, + "grad_norm": 2.4268383979797363, + "learning_rate": 9.15848457972242e-06, + "loss": 0.6326, + "step": 4725 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 2.269376277923584, + "learning_rate": 9.157323872708131e-06, + "loss": 0.6076, + "step": 4728 + }, + { + "epoch": 0.6157750878563061, + "grad_norm": 2.622344970703125, + "learning_rate": 9.156162439418108e-06, + "loss": 0.6793, + "step": 4731 + }, + { + "epoch": 0.6161655603279969, + "grad_norm": 3.469761371612549, + "learning_rate": 9.15500028005525e-06, + "loss": 0.7301, + "step": 4734 + }, + { + "epoch": 0.6165560327996876, + "grad_norm": 2.781172752380371, + "learning_rate": 9.153837394822582e-06, + "loss": 0.7908, + "step": 4737 + }, + { + "epoch": 0.6169465052713784, + "grad_norm": 2.506148099899292, + "learning_rate": 9.15267378392326e-06, + "loss": 0.7075, + "step": 4740 + }, + { + "epoch": 0.6173369777430691, + "grad_norm": 3.1186656951904297, + "learning_rate": 9.151509447560566e-06, + "loss": 0.5936, + "step": 4743 + }, + { + "epoch": 0.6177274502147598, + "grad_norm": 2.5772218704223633, + "learning_rate": 9.150344385937904e-06, + "loss": 0.6029, + "step": 4746 + }, + { + "epoch": 0.6181179226864506, + "grad_norm": 2.423085927963257, + "learning_rate": 9.149178599258807e-06, + "loss": 0.7071, + "step": 4749 + }, + { + "epoch": 0.6185083951581414, + "grad_norm": 3.1963913440704346, + "learning_rate": 9.14801208772694e-06, + "loss": 0.6297, + "step": 4752 + }, + { + "epoch": 0.6188988676298322, + "grad_norm": 2.7055928707122803, + "learning_rate": 9.146844851546087e-06, + "loss": 0.7194, + "step": 4755 + }, + { + "epoch": 0.6192893401015228, + "grad_norm": 2.607105016708374, + "learning_rate": 9.145676890920161e-06, + "loss": 0.6897, + "step": 4758 + }, + { + "epoch": 0.6196798125732136, + "grad_norm": 2.645681381225586, + "learning_rate": 9.144508206053203e-06, + "loss": 0.636, + "step": 4761 + }, + { + "epoch": 0.6200702850449044, + "grad_norm": 2.521937131881714, + "learning_rate": 9.143338797149381e-06, + "loss": 0.6608, + "step": 4764 + }, + { + "epoch": 0.6204607575165951, + "grad_norm": 2.7200021743774414, + "learning_rate": 9.142168664412987e-06, + "loss": 0.6228, + "step": 4767 + }, + { + "epoch": 0.6208512299882858, + "grad_norm": 3.465608835220337, + "learning_rate": 9.140997808048442e-06, + "loss": 0.6801, + "step": 4770 + }, + { + "epoch": 0.6212417024599766, + "grad_norm": 2.4828827381134033, + "learning_rate": 9.139826228260292e-06, + "loss": 0.6134, + "step": 4773 + }, + { + "epoch": 0.6216321749316673, + "grad_norm": 2.365074634552002, + "learning_rate": 9.138653925253207e-06, + "loss": 0.5044, + "step": 4776 + }, + { + "epoch": 0.6220226474033581, + "grad_norm": 2.630094051361084, + "learning_rate": 9.137480899231987e-06, + "loss": 0.5772, + "step": 4779 + }, + { + "epoch": 0.6224131198750488, + "grad_norm": 2.5148115158081055, + "learning_rate": 9.13630715040156e-06, + "loss": 0.6968, + "step": 4782 + }, + { + "epoch": 0.6228035923467395, + "grad_norm": 2.7790770530700684, + "learning_rate": 9.135132678966975e-06, + 
"loss": 0.5712, + "step": 4785 + }, + { + "epoch": 0.6231940648184303, + "grad_norm": 2.780951738357544, + "learning_rate": 9.13395748513341e-06, + "loss": 0.6046, + "step": 4788 + }, + { + "epoch": 0.6235845372901211, + "grad_norm": 2.728588819503784, + "learning_rate": 9.132781569106168e-06, + "loss": 0.734, + "step": 4791 + }, + { + "epoch": 0.6239750097618117, + "grad_norm": 2.4009926319122314, + "learning_rate": 9.131604931090681e-06, + "loss": 0.62, + "step": 4794 + }, + { + "epoch": 0.6243654822335025, + "grad_norm": 3.470907211303711, + "learning_rate": 9.130427571292503e-06, + "loss": 0.7862, + "step": 4797 + }, + { + "epoch": 0.6247559547051933, + "grad_norm": 3.43316912651062, + "learning_rate": 9.129249489917317e-06, + "loss": 0.651, + "step": 4800 + }, + { + "epoch": 0.6251464271768841, + "grad_norm": 2.8522374629974365, + "learning_rate": 9.128070687170933e-06, + "loss": 0.5899, + "step": 4803 + }, + { + "epoch": 0.6255368996485747, + "grad_norm": 2.6155104637145996, + "learning_rate": 9.126891163259286e-06, + "loss": 0.6208, + "step": 4806 + }, + { + "epoch": 0.6259273721202655, + "grad_norm": 2.6993188858032227, + "learning_rate": 9.125710918388431e-06, + "loss": 0.6091, + "step": 4809 + }, + { + "epoch": 0.6263178445919563, + "grad_norm": 3.6405208110809326, + "learning_rate": 9.12452995276456e-06, + "loss": 0.6169, + "step": 4812 + }, + { + "epoch": 0.626708317063647, + "grad_norm": 2.4731392860412598, + "learning_rate": 9.123348266593983e-06, + "loss": 0.6637, + "step": 4815 + }, + { + "epoch": 0.6270987895353377, + "grad_norm": 2.377974271774292, + "learning_rate": 9.122165860083136e-06, + "loss": 0.6365, + "step": 4818 + }, + { + "epoch": 0.6274892620070285, + "grad_norm": 2.286585569381714, + "learning_rate": 9.120982733438587e-06, + "loss": 0.6843, + "step": 4821 + }, + { + "epoch": 0.6278797344787193, + "grad_norm": 2.7373149394989014, + "learning_rate": 9.119798886867025e-06, + "loss": 0.6896, + "step": 4824 + }, + { + "epoch": 0.62827020695041, + "grad_norm": 2.6947765350341797, + "learning_rate": 9.118614320575261e-06, + "loss": 0.6867, + "step": 4827 + }, + { + "epoch": 0.6286606794221007, + "grad_norm": 2.763946771621704, + "learning_rate": 9.117429034770241e-06, + "loss": 0.6705, + "step": 4830 + }, + { + "epoch": 0.6290511518937915, + "grad_norm": 2.622236728668213, + "learning_rate": 9.116243029659032e-06, + "loss": 0.7195, + "step": 4833 + }, + { + "epoch": 0.6294416243654822, + "grad_norm": 2.6941754817962646, + "learning_rate": 9.115056305448822e-06, + "loss": 0.7228, + "step": 4836 + }, + { + "epoch": 0.629832096837173, + "grad_norm": 3.360145092010498, + "learning_rate": 9.113868862346934e-06, + "loss": 0.679, + "step": 4839 + }, + { + "epoch": 0.6302225693088638, + "grad_norm": 2.4699013233184814, + "learning_rate": 9.11268070056081e-06, + "loss": 0.6504, + "step": 4842 + }, + { + "epoch": 0.6306130417805544, + "grad_norm": 2.265643835067749, + "learning_rate": 9.111491820298019e-06, + "loss": 0.6572, + "step": 4845 + }, + { + "epoch": 0.6310035142522452, + "grad_norm": 2.609482765197754, + "learning_rate": 9.110302221766257e-06, + "loss": 0.643, + "step": 4848 + }, + { + "epoch": 0.631393986723936, + "grad_norm": 2.8599019050598145, + "learning_rate": 9.109111905173342e-06, + "loss": 0.7884, + "step": 4851 + }, + { + "epoch": 0.6317844591956268, + "grad_norm": 2.703747272491455, + "learning_rate": 9.107920870727223e-06, + "loss": 0.6817, + "step": 4854 + }, + { + "epoch": 0.6321749316673174, + "grad_norm": 2.2183678150177, + "learning_rate": 
9.106729118635968e-06, + "loss": 0.6101, + "step": 4857 + }, + { + "epoch": 0.6325654041390082, + "grad_norm": 2.5604469776153564, + "learning_rate": 9.105536649107778e-06, + "loss": 0.7131, + "step": 4860 + }, + { + "epoch": 0.632955876610699, + "grad_norm": 2.3223118782043457, + "learning_rate": 9.10434346235097e-06, + "loss": 0.6237, + "step": 4863 + }, + { + "epoch": 0.6333463490823897, + "grad_norm": 2.4612014293670654, + "learning_rate": 9.103149558573995e-06, + "loss": 0.695, + "step": 4866 + }, + { + "epoch": 0.6337368215540804, + "grad_norm": 3.208245277404785, + "learning_rate": 9.101954937985422e-06, + "loss": 0.6685, + "step": 4869 + }, + { + "epoch": 0.6341272940257712, + "grad_norm": 3.3025028705596924, + "learning_rate": 9.100759600793951e-06, + "loss": 0.6657, + "step": 4872 + }, + { + "epoch": 0.6345177664974619, + "grad_norm": 2.8037331104278564, + "learning_rate": 9.099563547208406e-06, + "loss": 0.573, + "step": 4875 + }, + { + "epoch": 0.6349082389691527, + "grad_norm": 2.62872052192688, + "learning_rate": 9.098366777437733e-06, + "loss": 0.7397, + "step": 4878 + }, + { + "epoch": 0.6352987114408434, + "grad_norm": 2.564282178878784, + "learning_rate": 9.097169291691007e-06, + "loss": 0.626, + "step": 4881 + }, + { + "epoch": 0.6356891839125342, + "grad_norm": 2.9302821159362793, + "learning_rate": 9.095971090177421e-06, + "loss": 0.7665, + "step": 4884 + }, + { + "epoch": 0.6360796563842249, + "grad_norm": 2.7549326419830322, + "learning_rate": 9.094772173106305e-06, + "loss": 0.7412, + "step": 4887 + }, + { + "epoch": 0.6364701288559157, + "grad_norm": 2.602814197540283, + "learning_rate": 9.093572540687104e-06, + "loss": 0.6038, + "step": 4890 + }, + { + "epoch": 0.6368606013276064, + "grad_norm": 2.4310333728790283, + "learning_rate": 9.09237219312939e-06, + "loss": 0.6858, + "step": 4893 + }, + { + "epoch": 0.6372510737992971, + "grad_norm": 2.8889222145080566, + "learning_rate": 9.091171130642866e-06, + "loss": 0.6245, + "step": 4896 + }, + { + "epoch": 0.6376415462709879, + "grad_norm": 2.956254005432129, + "learning_rate": 9.089969353437351e-06, + "loss": 0.7212, + "step": 4899 + }, + { + "epoch": 0.6380320187426787, + "grad_norm": 2.366194248199463, + "learning_rate": 9.088766861722793e-06, + "loss": 0.6385, + "step": 4902 + }, + { + "epoch": 0.6384224912143693, + "grad_norm": 2.7603938579559326, + "learning_rate": 9.087563655709266e-06, + "loss": 0.7277, + "step": 4905 + }, + { + "epoch": 0.6388129636860601, + "grad_norm": 3.687945604324341, + "learning_rate": 9.086359735606969e-06, + "loss": 0.6555, + "step": 4908 + }, + { + "epoch": 0.6392034361577509, + "grad_norm": 3.259951591491699, + "learning_rate": 9.085155101626221e-06, + "loss": 0.7351, + "step": 4911 + }, + { + "epoch": 0.6395939086294417, + "grad_norm": 2.4649972915649414, + "learning_rate": 9.083949753977471e-06, + "loss": 0.5857, + "step": 4914 + }, + { + "epoch": 0.6399843811011323, + "grad_norm": 2.663567304611206, + "learning_rate": 9.08274369287129e-06, + "loss": 0.648, + "step": 4917 + }, + { + "epoch": 0.6403748535728231, + "grad_norm": 2.2157647609710693, + "learning_rate": 9.081536918518377e-06, + "loss": 0.6431, + "step": 4920 + }, + { + "epoch": 0.6407653260445139, + "grad_norm": 2.5896902084350586, + "learning_rate": 9.080329431129548e-06, + "loss": 0.6592, + "step": 4923 + }, + { + "epoch": 0.6411557985162046, + "grad_norm": 3.887195348739624, + "learning_rate": 9.07912123091575e-06, + "loss": 0.7259, + "step": 4926 + }, + { + "epoch": 0.6415462709878954, + "grad_norm": 
2.6314144134521484, + "learning_rate": 9.077912318088057e-06, + "loss": 0.5699, + "step": 4929 + }, + { + "epoch": 0.6419367434595861, + "grad_norm": 2.7441248893737793, + "learning_rate": 9.076702692857661e-06, + "loss": 0.6653, + "step": 4932 + }, + { + "epoch": 0.6423272159312768, + "grad_norm": 3.4282479286193848, + "learning_rate": 9.075492355435878e-06, + "loss": 0.634, + "step": 4935 + }, + { + "epoch": 0.6427176884029676, + "grad_norm": 2.2707557678222656, + "learning_rate": 9.074281306034156e-06, + "loss": 0.6404, + "step": 4938 + }, + { + "epoch": 0.6431081608746584, + "grad_norm": 2.613741397857666, + "learning_rate": 9.073069544864062e-06, + "loss": 0.7175, + "step": 4941 + }, + { + "epoch": 0.643498633346349, + "grad_norm": 3.953160285949707, + "learning_rate": 9.071857072137284e-06, + "loss": 0.6577, + "step": 4944 + }, + { + "epoch": 0.6438891058180398, + "grad_norm": 2.4851486682891846, + "learning_rate": 9.070643888065642e-06, + "loss": 0.6288, + "step": 4947 + }, + { + "epoch": 0.6442795782897306, + "grad_norm": 2.4990899562835693, + "learning_rate": 9.069429992861078e-06, + "loss": 0.7342, + "step": 4950 + }, + { + "epoch": 0.6446700507614214, + "grad_norm": 2.3601980209350586, + "learning_rate": 9.068215386735655e-06, + "loss": 0.568, + "step": 4953 + }, + { + "epoch": 0.645060523233112, + "grad_norm": 2.9806814193725586, + "learning_rate": 9.06700006990156e-06, + "loss": 0.6322, + "step": 4956 + }, + { + "epoch": 0.6454509957048028, + "grad_norm": 3.1821587085723877, + "learning_rate": 9.06578404257111e-06, + "loss": 0.7119, + "step": 4959 + }, + { + "epoch": 0.6458414681764936, + "grad_norm": 2.242640972137451, + "learning_rate": 9.064567304956741e-06, + "loss": 0.7047, + "step": 4962 + }, + { + "epoch": 0.6462319406481843, + "grad_norm": 2.693079710006714, + "learning_rate": 9.063349857271015e-06, + "loss": 0.6865, + "step": 4965 + }, + { + "epoch": 0.646622413119875, + "grad_norm": 3.0986106395721436, + "learning_rate": 9.062131699726615e-06, + "loss": 0.6241, + "step": 4968 + }, + { + "epoch": 0.6470128855915658, + "grad_norm": 2.6668357849121094, + "learning_rate": 9.060912832536354e-06, + "loss": 0.7457, + "step": 4971 + }, + { + "epoch": 0.6474033580632566, + "grad_norm": 2.3594837188720703, + "learning_rate": 9.059693255913165e-06, + "loss": 0.6704, + "step": 4974 + }, + { + "epoch": 0.6477938305349473, + "grad_norm": 3.7347023487091064, + "learning_rate": 9.058472970070102e-06, + "loss": 0.6353, + "step": 4977 + }, + { + "epoch": 0.648184303006638, + "grad_norm": 2.6183836460113525, + "learning_rate": 9.05725197522035e-06, + "loss": 0.6586, + "step": 4980 + }, + { + "epoch": 0.6485747754783288, + "grad_norm": 2.691641330718994, + "learning_rate": 9.056030271577213e-06, + "loss": 0.6442, + "step": 4983 + }, + { + "epoch": 0.6489652479500195, + "grad_norm": 2.6118125915527344, + "learning_rate": 9.054807859354122e-06, + "loss": 0.6896, + "step": 4986 + }, + { + "epoch": 0.6493557204217103, + "grad_norm": 2.557551383972168, + "learning_rate": 9.053584738764625e-06, + "loss": 0.7078, + "step": 4989 + }, + { + "epoch": 0.649746192893401, + "grad_norm": 2.4774162769317627, + "learning_rate": 9.052360910022404e-06, + "loss": 0.7401, + "step": 4992 + }, + { + "epoch": 0.6501366653650917, + "grad_norm": 2.230543851852417, + "learning_rate": 9.051136373341256e-06, + "loss": 0.6496, + "step": 4995 + }, + { + "epoch": 0.6505271378367825, + "grad_norm": 3.3752076625823975, + "learning_rate": 9.049911128935104e-06, + "loss": 0.5981, + "step": 4998 + }, + { + "epoch": 
0.6509176103084733, + "grad_norm": 2.407163619995117, + "learning_rate": 9.048685177018001e-06, + "loss": 0.6235, + "step": 5001 + }, + { + "epoch": 0.6513080827801639, + "grad_norm": 2.324744939804077, + "learning_rate": 9.047458517804113e-06, + "loss": 0.6085, + "step": 5004 + }, + { + "epoch": 0.6516985552518547, + "grad_norm": 2.3298988342285156, + "learning_rate": 9.046231151507738e-06, + "loss": 0.5624, + "step": 5007 + }, + { + "epoch": 0.6520890277235455, + "grad_norm": 3.9053573608398438, + "learning_rate": 9.04500307834329e-06, + "loss": 0.687, + "step": 5010 + }, + { + "epoch": 0.6524795001952363, + "grad_norm": 2.682884931564331, + "learning_rate": 9.043774298525316e-06, + "loss": 0.6969, + "step": 5013 + }, + { + "epoch": 0.652869972666927, + "grad_norm": 2.7601308822631836, + "learning_rate": 9.04254481226848e-06, + "loss": 0.6236, + "step": 5016 + }, + { + "epoch": 0.6532604451386177, + "grad_norm": 2.39310622215271, + "learning_rate": 9.041314619787566e-06, + "loss": 0.6217, + "step": 5019 + }, + { + "epoch": 0.6536509176103085, + "grad_norm": 2.353529453277588, + "learning_rate": 9.040083721297493e-06, + "loss": 0.6112, + "step": 5022 + }, + { + "epoch": 0.6540413900819992, + "grad_norm": 2.530123472213745, + "learning_rate": 9.03885211701329e-06, + "loss": 0.6053, + "step": 5025 + }, + { + "epoch": 0.65443186255369, + "grad_norm": 2.888646125793457, + "learning_rate": 9.037619807150121e-06, + "loss": 0.6883, + "step": 5028 + }, + { + "epoch": 0.6548223350253807, + "grad_norm": 2.598736524581909, + "learning_rate": 9.036386791923265e-06, + "loss": 0.6944, + "step": 5031 + }, + { + "epoch": 0.6552128074970714, + "grad_norm": 2.7701828479766846, + "learning_rate": 9.035153071548127e-06, + "loss": 0.6272, + "step": 5034 + }, + { + "epoch": 0.6556032799687622, + "grad_norm": 2.3290834426879883, + "learning_rate": 9.033918646240236e-06, + "loss": 0.6815, + "step": 5037 + }, + { + "epoch": 0.655993752440453, + "grad_norm": 2.8352549076080322, + "learning_rate": 9.032683516215245e-06, + "loss": 0.662, + "step": 5040 + }, + { + "epoch": 0.6563842249121437, + "grad_norm": 4.3119659423828125, + "learning_rate": 9.031447681688926e-06, + "loss": 0.6342, + "step": 5043 + }, + { + "epoch": 0.6567746973838344, + "grad_norm": 2.828580379486084, + "learning_rate": 9.03021114287718e-06, + "loss": 0.7723, + "step": 5046 + }, + { + "epoch": 0.6571651698555252, + "grad_norm": 2.502751350402832, + "learning_rate": 9.028973899996022e-06, + "loss": 0.6425, + "step": 5049 + }, + { + "epoch": 0.657555642327216, + "grad_norm": 2.7101762294769287, + "learning_rate": 9.027735953261603e-06, + "loss": 0.6139, + "step": 5052 + }, + { + "epoch": 0.6579461147989066, + "grad_norm": 2.368980646133423, + "learning_rate": 9.026497302890184e-06, + "loss": 0.6776, + "step": 5055 + }, + { + "epoch": 0.6583365872705974, + "grad_norm": 2.5557427406311035, + "learning_rate": 9.025257949098158e-06, + "loss": 0.6162, + "step": 5058 + }, + { + "epoch": 0.6587270597422882, + "grad_norm": 2.6219143867492676, + "learning_rate": 9.024017892102036e-06, + "loss": 0.6914, + "step": 5061 + }, + { + "epoch": 0.659117532213979, + "grad_norm": 6.491698741912842, + "learning_rate": 9.022777132118452e-06, + "loss": 0.7507, + "step": 5064 + }, + { + "epoch": 0.6595080046856696, + "grad_norm": 2.6775612831115723, + "learning_rate": 9.021535669364167e-06, + "loss": 0.7287, + "step": 5067 + }, + { + "epoch": 0.6598984771573604, + "grad_norm": 3.4174671173095703, + "learning_rate": 9.020293504056061e-06, + "loss": 0.7409, + "step": 5070 
+ }, + { + "epoch": 0.6602889496290512, + "grad_norm": 2.259105682373047, + "learning_rate": 9.019050636411135e-06, + "loss": 0.5771, + "step": 5073 + }, + { + "epoch": 0.6606794221007419, + "grad_norm": 2.384159803390503, + "learning_rate": 9.01780706664652e-06, + "loss": 0.6882, + "step": 5076 + }, + { + "epoch": 0.6610698945724326, + "grad_norm": 2.471353054046631, + "learning_rate": 9.016562794979463e-06, + "loss": 0.6606, + "step": 5079 + }, + { + "epoch": 0.6614603670441234, + "grad_norm": 2.462852716445923, + "learning_rate": 9.015317821627332e-06, + "loss": 0.6092, + "step": 5082 + }, + { + "epoch": 0.6618508395158141, + "grad_norm": 2.5678861141204834, + "learning_rate": 9.01407214680763e-06, + "loss": 0.7437, + "step": 5085 + }, + { + "epoch": 0.6622413119875049, + "grad_norm": 2.7121429443359375, + "learning_rate": 9.012825770737963e-06, + "loss": 0.6702, + "step": 5088 + }, + { + "epoch": 0.6626317844591956, + "grad_norm": 2.5541470050811768, + "learning_rate": 9.011578693636078e-06, + "loss": 0.6909, + "step": 5091 + }, + { + "epoch": 0.6630222569308863, + "grad_norm": 3.0044548511505127, + "learning_rate": 9.010330915719834e-06, + "loss": 0.6704, + "step": 5094 + }, + { + "epoch": 0.6634127294025771, + "grad_norm": 2.6041769981384277, + "learning_rate": 9.009082437207215e-06, + "loss": 0.6867, + "step": 5097 + }, + { + "epoch": 0.6638032018742679, + "grad_norm": 2.428577423095703, + "learning_rate": 9.00783325831633e-06, + "loss": 0.6128, + "step": 5100 + }, + { + "epoch": 0.6641936743459587, + "grad_norm": 2.460480213165283, + "learning_rate": 9.006583379265405e-06, + "loss": 0.6392, + "step": 5103 + }, + { + "epoch": 0.6645841468176493, + "grad_norm": 2.7471256256103516, + "learning_rate": 9.00533280027279e-06, + "loss": 0.6698, + "step": 5106 + }, + { + "epoch": 0.6649746192893401, + "grad_norm": 2.6716361045837402, + "learning_rate": 9.004081521556965e-06, + "loss": 0.7285, + "step": 5109 + }, + { + "epoch": 0.6653650917610309, + "grad_norm": 2.468386173248291, + "learning_rate": 9.002829543336518e-06, + "loss": 0.6738, + "step": 5112 + }, + { + "epoch": 0.6657555642327216, + "grad_norm": 2.630680561065674, + "learning_rate": 9.001576865830173e-06, + "loss": 0.7629, + "step": 5115 + }, + { + "epoch": 0.6661460367044123, + "grad_norm": 2.4180047512054443, + "learning_rate": 9.000323489256766e-06, + "loss": 0.6747, + "step": 5118 + }, + { + "epoch": 0.6665365091761031, + "grad_norm": 4.522125720977783, + "learning_rate": 8.999069413835262e-06, + "loss": 0.6331, + "step": 5121 + }, + { + "epoch": 0.6669269816477938, + "grad_norm": 2.297905445098877, + "learning_rate": 8.997814639784743e-06, + "loss": 0.5958, + "step": 5124 + }, + { + "epoch": 0.6673174541194846, + "grad_norm": 2.3950231075286865, + "learning_rate": 8.996559167324417e-06, + "loss": 0.6209, + "step": 5127 + }, + { + "epoch": 0.6677079265911753, + "grad_norm": 2.4264347553253174, + "learning_rate": 8.995302996673613e-06, + "loss": 0.6492, + "step": 5130 + }, + { + "epoch": 0.668098399062866, + "grad_norm": 3.127281427383423, + "learning_rate": 8.99404612805178e-06, + "loss": 0.6959, + "step": 5133 + }, + { + "epoch": 0.6684888715345568, + "grad_norm": 2.8744657039642334, + "learning_rate": 8.99278856167849e-06, + "loss": 0.6416, + "step": 5136 + }, + { + "epoch": 0.6688793440062476, + "grad_norm": 2.994250535964966, + "learning_rate": 8.991530297773437e-06, + "loss": 0.6651, + "step": 5139 + }, + { + "epoch": 0.6692698164779383, + "grad_norm": 2.4786083698272705, + "learning_rate": 8.99027133655644e-06, + 
"loss": 0.6271, + "step": 5142 + }, + { + "epoch": 0.669660288949629, + "grad_norm": 2.418321371078491, + "learning_rate": 8.989011678247433e-06, + "loss": 0.6906, + "step": 5145 + }, + { + "epoch": 0.6700507614213198, + "grad_norm": 2.5830347537994385, + "learning_rate": 8.987751323066479e-06, + "loss": 0.7363, + "step": 5148 + }, + { + "epoch": 0.6704412338930106, + "grad_norm": 2.7174127101898193, + "learning_rate": 8.986490271233757e-06, + "loss": 0.7439, + "step": 5151 + }, + { + "epoch": 0.6708317063647012, + "grad_norm": 2.870361804962158, + "learning_rate": 8.985228522969571e-06, + "loss": 0.6398, + "step": 5154 + }, + { + "epoch": 0.671222178836392, + "grad_norm": 2.797898292541504, + "learning_rate": 8.983966078494346e-06, + "loss": 0.6451, + "step": 5157 + }, + { + "epoch": 0.6716126513080828, + "grad_norm": 2.4643352031707764, + "learning_rate": 8.98270293802863e-06, + "loss": 0.6963, + "step": 5160 + }, + { + "epoch": 0.6720031237797736, + "grad_norm": 2.715022325515747, + "learning_rate": 8.981439101793086e-06, + "loss": 0.6041, + "step": 5163 + }, + { + "epoch": 0.6723935962514642, + "grad_norm": 2.511801242828369, + "learning_rate": 8.980174570008506e-06, + "loss": 0.6198, + "step": 5166 + }, + { + "epoch": 0.672784068723155, + "grad_norm": 2.53893780708313, + "learning_rate": 8.978909342895806e-06, + "loss": 0.6393, + "step": 5169 + }, + { + "epoch": 0.6731745411948458, + "grad_norm": 2.6536126136779785, + "learning_rate": 8.977643420676009e-06, + "loss": 0.7114, + "step": 5172 + }, + { + "epoch": 0.6735650136665365, + "grad_norm": 2.3529465198516846, + "learning_rate": 8.976376803570278e-06, + "loss": 0.6438, + "step": 5175 + }, + { + "epoch": 0.6739554861382272, + "grad_norm": 2.5893778800964355, + "learning_rate": 8.975109491799883e-06, + "loss": 0.661, + "step": 5178 + }, + { + "epoch": 0.674345958609918, + "grad_norm": 2.3905746936798096, + "learning_rate": 8.973841485586224e-06, + "loss": 0.5583, + "step": 5181 + }, + { + "epoch": 0.6747364310816087, + "grad_norm": 2.6573519706726074, + "learning_rate": 8.972572785150815e-06, + "loss": 0.6492, + "step": 5184 + }, + { + "epoch": 0.6751269035532995, + "grad_norm": 2.810039520263672, + "learning_rate": 8.971303390715299e-06, + "loss": 0.6346, + "step": 5187 + }, + { + "epoch": 0.6755173760249903, + "grad_norm": 3.9205002784729004, + "learning_rate": 8.970033302501433e-06, + "loss": 0.7605, + "step": 5190 + }, + { + "epoch": 0.675907848496681, + "grad_norm": 2.3390591144561768, + "learning_rate": 8.968762520731103e-06, + "loss": 0.6231, + "step": 5193 + }, + { + "epoch": 0.6762983209683717, + "grad_norm": 2.8558874130249023, + "learning_rate": 8.96749104562631e-06, + "loss": 0.6479, + "step": 5196 + }, + { + "epoch": 0.6766887934400625, + "grad_norm": 2.8038036823272705, + "learning_rate": 8.966218877409173e-06, + "loss": 0.7772, + "step": 5199 + }, + { + "epoch": 0.6770792659117533, + "grad_norm": 2.364079475402832, + "learning_rate": 8.964946016301946e-06, + "loss": 0.5847, + "step": 5202 + }, + { + "epoch": 0.6774697383834439, + "grad_norm": 2.699650764465332, + "learning_rate": 8.963672462526991e-06, + "loss": 0.667, + "step": 5205 + }, + { + "epoch": 0.6778602108551347, + "grad_norm": 2.889913320541382, + "learning_rate": 8.962398216306794e-06, + "loss": 0.5679, + "step": 5208 + }, + { + "epoch": 0.6782506833268255, + "grad_norm": 3.220335006713867, + "learning_rate": 8.961123277863965e-06, + "loss": 0.7115, + "step": 5211 + }, + { + "epoch": 0.6786411557985162, + "grad_norm": 2.591503143310547, + "learning_rate": 
8.959847647421231e-06, + "loss": 0.6948, + "step": 5214 + }, + { + "epoch": 0.6790316282702069, + "grad_norm": 2.682509183883667, + "learning_rate": 8.958571325201446e-06, + "loss": 0.5903, + "step": 5217 + }, + { + "epoch": 0.6794221007418977, + "grad_norm": 3.3060030937194824, + "learning_rate": 8.957294311427575e-06, + "loss": 0.6707, + "step": 5220 + }, + { + "epoch": 0.6798125732135885, + "grad_norm": 2.3943710327148438, + "learning_rate": 8.956016606322715e-06, + "loss": 0.6522, + "step": 5223 + }, + { + "epoch": 0.6802030456852792, + "grad_norm": 2.6341614723205566, + "learning_rate": 8.954738210110075e-06, + "loss": 0.5324, + "step": 5226 + }, + { + "epoch": 0.6805935181569699, + "grad_norm": 3.0761325359344482, + "learning_rate": 8.953459123012988e-06, + "loss": 0.6379, + "step": 5229 + }, + { + "epoch": 0.6809839906286607, + "grad_norm": 2.637105703353882, + "learning_rate": 8.952179345254912e-06, + "loss": 0.7148, + "step": 5232 + }, + { + "epoch": 0.6813744631003514, + "grad_norm": 2.6795005798339844, + "learning_rate": 8.950898877059417e-06, + "loss": 0.6578, + "step": 5235 + }, + { + "epoch": 0.6817649355720422, + "grad_norm": 2.5985288619995117, + "learning_rate": 8.9496177186502e-06, + "loss": 0.6964, + "step": 5238 + }, + { + "epoch": 0.6821554080437329, + "grad_norm": 3.457732677459717, + "learning_rate": 8.948335870251075e-06, + "loss": 0.7052, + "step": 5241 + }, + { + "epoch": 0.6825458805154236, + "grad_norm": 2.462507724761963, + "learning_rate": 8.947053332085981e-06, + "loss": 0.6498, + "step": 5244 + }, + { + "epoch": 0.6829363529871144, + "grad_norm": 2.4393460750579834, + "learning_rate": 8.945770104378973e-06, + "loss": 0.6244, + "step": 5247 + }, + { + "epoch": 0.6833268254588052, + "grad_norm": 2.7880349159240723, + "learning_rate": 8.944486187354229e-06, + "loss": 0.7032, + "step": 5250 + }, + { + "epoch": 0.6837172979304958, + "grad_norm": 2.685492753982544, + "learning_rate": 8.943201581236045e-06, + "loss": 0.6221, + "step": 5253 + }, + { + "epoch": 0.6841077704021866, + "grad_norm": 3.0528624057769775, + "learning_rate": 8.94191628624884e-06, + "loss": 0.6866, + "step": 5256 + }, + { + "epoch": 0.6844982428738774, + "grad_norm": 2.485928773880005, + "learning_rate": 8.940630302617153e-06, + "loss": 0.6147, + "step": 5259 + }, + { + "epoch": 0.6848887153455682, + "grad_norm": 2.2878990173339844, + "learning_rate": 8.939343630565643e-06, + "loss": 0.5663, + "step": 5262 + }, + { + "epoch": 0.6852791878172588, + "grad_norm": 3.4641735553741455, + "learning_rate": 8.938056270319086e-06, + "loss": 0.6713, + "step": 5265 + }, + { + "epoch": 0.6856696602889496, + "grad_norm": 2.376103162765503, + "learning_rate": 8.936768222102382e-06, + "loss": 0.6678, + "step": 5268 + }, + { + "epoch": 0.6860601327606404, + "grad_norm": 2.2345285415649414, + "learning_rate": 8.935479486140556e-06, + "loss": 0.5991, + "step": 5271 + }, + { + "epoch": 0.6864506052323311, + "grad_norm": 2.701521873474121, + "learning_rate": 8.934190062658738e-06, + "loss": 0.687, + "step": 5274 + }, + { + "epoch": 0.6868410777040219, + "grad_norm": 2.6253838539123535, + "learning_rate": 8.932899951882195e-06, + "loss": 0.6385, + "step": 5277 + }, + { + "epoch": 0.6872315501757126, + "grad_norm": 2.780701160430908, + "learning_rate": 8.931609154036303e-06, + "loss": 0.6819, + "step": 5280 + }, + { + "epoch": 0.6876220226474034, + "grad_norm": 2.612517833709717, + "learning_rate": 8.930317669346565e-06, + "loss": 0.5871, + "step": 5283 + }, + { + "epoch": 0.6880124951190941, + "grad_norm": 
2.6523735523223877, + "learning_rate": 8.929025498038595e-06, + "loss": 0.6296, + "step": 5286 + }, + { + "epoch": 0.6884029675907849, + "grad_norm": 3.8099377155303955, + "learning_rate": 8.927732640338138e-06, + "loss": 0.7596, + "step": 5289 + }, + { + "epoch": 0.6887934400624756, + "grad_norm": 2.856313705444336, + "learning_rate": 8.92643909647105e-06, + "loss": 0.7508, + "step": 5292 + }, + { + "epoch": 0.6891839125341663, + "grad_norm": 3.0642218589782715, + "learning_rate": 8.925144866663313e-06, + "loss": 0.6367, + "step": 5295 + }, + { + "epoch": 0.6895743850058571, + "grad_norm": 2.504683017730713, + "learning_rate": 8.923849951141025e-06, + "loss": 0.5706, + "step": 5298 + }, + { + "epoch": 0.6899648574775479, + "grad_norm": 2.745274543762207, + "learning_rate": 8.922554350130404e-06, + "loss": 0.6845, + "step": 5301 + }, + { + "epoch": 0.6903553299492385, + "grad_norm": 2.6270129680633545, + "learning_rate": 8.921258063857792e-06, + "loss": 0.7274, + "step": 5304 + }, + { + "epoch": 0.6907458024209293, + "grad_norm": 2.979092597961426, + "learning_rate": 8.919961092549643e-06, + "loss": 0.6765, + "step": 5307 + }, + { + "epoch": 0.6911362748926201, + "grad_norm": 3.506718873977661, + "learning_rate": 8.91866343643254e-06, + "loss": 0.6857, + "step": 5310 + }, + { + "epoch": 0.6915267473643109, + "grad_norm": 3.513763189315796, + "learning_rate": 8.917365095733176e-06, + "loss": 0.6291, + "step": 5313 + }, + { + "epoch": 0.6919172198360015, + "grad_norm": 2.3708810806274414, + "learning_rate": 8.91606607067837e-06, + "loss": 0.5938, + "step": 5316 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 2.5921242237091064, + "learning_rate": 8.914766361495063e-06, + "loss": 0.6986, + "step": 5319 + }, + { + "epoch": 0.6926981647793831, + "grad_norm": 2.9242255687713623, + "learning_rate": 8.913465968410307e-06, + "loss": 0.7117, + "step": 5322 + }, + { + "epoch": 0.6930886372510738, + "grad_norm": 2.42401123046875, + "learning_rate": 8.912164891651277e-06, + "loss": 0.6327, + "step": 5325 + }, + { + "epoch": 0.6934791097227645, + "grad_norm": 2.31184458732605, + "learning_rate": 8.910863131445273e-06, + "loss": 0.6479, + "step": 5328 + }, + { + "epoch": 0.6938695821944553, + "grad_norm": 2.514890193939209, + "learning_rate": 8.909560688019705e-06, + "loss": 0.7545, + "step": 5331 + }, + { + "epoch": 0.694260054666146, + "grad_norm": 2.294757127761841, + "learning_rate": 8.908257561602112e-06, + "loss": 0.6594, + "step": 5334 + }, + { + "epoch": 0.6946505271378368, + "grad_norm": 2.3671836853027344, + "learning_rate": 8.906953752420142e-06, + "loss": 0.6085, + "step": 5337 + }, + { + "epoch": 0.6950409996095275, + "grad_norm": 2.5829970836639404, + "learning_rate": 8.905649260701571e-06, + "loss": 0.7096, + "step": 5340 + }, + { + "epoch": 0.6954314720812182, + "grad_norm": 2.5177972316741943, + "learning_rate": 8.904344086674292e-06, + "loss": 0.6784, + "step": 5343 + }, + { + "epoch": 0.695821944552909, + "grad_norm": 2.783193349838257, + "learning_rate": 8.903038230566314e-06, + "loss": 0.7778, + "step": 5346 + }, + { + "epoch": 0.6962124170245998, + "grad_norm": 2.4683210849761963, + "learning_rate": 8.901731692605767e-06, + "loss": 0.6198, + "step": 5349 + }, + { + "epoch": 0.6966028894962905, + "grad_norm": 2.2853734493255615, + "learning_rate": 8.900424473020904e-06, + "loss": 0.6472, + "step": 5352 + }, + { + "epoch": 0.6969933619679812, + "grad_norm": 3.5069949626922607, + "learning_rate": 8.899116572040087e-06, + "loss": 0.6187, + "step": 5355 + }, + { + "epoch": 
0.697383834439672, + "grad_norm": 2.6812729835510254, + "learning_rate": 8.897807989891809e-06, + "loss": 0.6091, + "step": 5358 + }, + { + "epoch": 0.6977743069113628, + "grad_norm": 2.5712661743164062, + "learning_rate": 8.896498726804677e-06, + "loss": 0.6686, + "step": 5361 + }, + { + "epoch": 0.6981647793830535, + "grad_norm": 3.0275492668151855, + "learning_rate": 8.895188783007412e-06, + "loss": 0.6555, + "step": 5364 + }, + { + "epoch": 0.6985552518547442, + "grad_norm": 2.5897321701049805, + "learning_rate": 8.893878158728861e-06, + "loss": 0.6643, + "step": 5367 + }, + { + "epoch": 0.698945724326435, + "grad_norm": 2.7661631107330322, + "learning_rate": 8.892566854197988e-06, + "loss": 0.6861, + "step": 5370 + }, + { + "epoch": 0.6993361967981258, + "grad_norm": 2.3814573287963867, + "learning_rate": 8.891254869643873e-06, + "loss": 0.6581, + "step": 5373 + }, + { + "epoch": 0.6997266692698165, + "grad_norm": 2.452768564224243, + "learning_rate": 8.88994220529572e-06, + "loss": 0.6112, + "step": 5376 + }, + { + "epoch": 0.7001171417415072, + "grad_norm": 2.68501615524292, + "learning_rate": 8.888628861382846e-06, + "loss": 0.604, + "step": 5379 + }, + { + "epoch": 0.700507614213198, + "grad_norm": 2.675839424133301, + "learning_rate": 8.88731483813469e-06, + "loss": 0.7262, + "step": 5382 + }, + { + "epoch": 0.7008980866848887, + "grad_norm": 4.175928592681885, + "learning_rate": 8.88600013578081e-06, + "loss": 0.6624, + "step": 5385 + }, + { + "epoch": 0.7012885591565795, + "grad_norm": 2.8697991371154785, + "learning_rate": 8.884684754550882e-06, + "loss": 0.6547, + "step": 5388 + }, + { + "epoch": 0.7016790316282702, + "grad_norm": 4.71966552734375, + "learning_rate": 8.8833686946747e-06, + "loss": 0.591, + "step": 5391 + }, + { + "epoch": 0.7020695040999609, + "grad_norm": 2.5646259784698486, + "learning_rate": 8.882051956382175e-06, + "loss": 0.7158, + "step": 5394 + }, + { + "epoch": 0.7024599765716517, + "grad_norm": 2.482074737548828, + "learning_rate": 8.88073453990334e-06, + "loss": 0.6647, + "step": 5397 + }, + { + "epoch": 0.7028504490433425, + "grad_norm": 2.8584282398223877, + "learning_rate": 8.879416445468344e-06, + "loss": 0.5627, + "step": 5400 + }, + { + "epoch": 0.7032409215150331, + "grad_norm": 2.51839280128479, + "learning_rate": 8.878097673307458e-06, + "loss": 0.7744, + "step": 5403 + }, + { + "epoch": 0.7036313939867239, + "grad_norm": 2.4203269481658936, + "learning_rate": 8.876778223651067e-06, + "loss": 0.5973, + "step": 5406 + }, + { + "epoch": 0.7040218664584147, + "grad_norm": 2.3395378589630127, + "learning_rate": 8.875458096729674e-06, + "loss": 0.5872, + "step": 5409 + }, + { + "epoch": 0.7044123389301055, + "grad_norm": 2.6769402027130127, + "learning_rate": 8.874137292773906e-06, + "loss": 0.6168, + "step": 5412 + }, + { + "epoch": 0.7048028114017961, + "grad_norm": 2.5752854347229004, + "learning_rate": 8.872815812014501e-06, + "loss": 0.7544, + "step": 5415 + }, + { + "epoch": 0.7051932838734869, + "grad_norm": 2.6731069087982178, + "learning_rate": 8.871493654682321e-06, + "loss": 0.6455, + "step": 5418 + }, + { + "epoch": 0.7055837563451777, + "grad_norm": 2.6608500480651855, + "learning_rate": 8.870170821008346e-06, + "loss": 0.6383, + "step": 5421 + }, + { + "epoch": 0.7059742288168684, + "grad_norm": 3.0593783855438232, + "learning_rate": 8.86884731122367e-06, + "loss": 0.6723, + "step": 5424 + }, + { + "epoch": 0.7063647012885591, + "grad_norm": 2.809638738632202, + "learning_rate": 8.867523125559504e-06, + "loss": 0.6422, + "step": 
5427 + }, + { + "epoch": 0.7067551737602499, + "grad_norm": 2.5025877952575684, + "learning_rate": 8.866198264247187e-06, + "loss": 0.687, + "step": 5430 + }, + { + "epoch": 0.7071456462319406, + "grad_norm": 3.1199793815612793, + "learning_rate": 8.864872727518168e-06, + "loss": 0.5947, + "step": 5433 + }, + { + "epoch": 0.7075361187036314, + "grad_norm": 2.5214881896972656, + "learning_rate": 8.863546515604012e-06, + "loss": 0.6269, + "step": 5436 + }, + { + "epoch": 0.7079265911753222, + "grad_norm": 2.481205940246582, + "learning_rate": 8.862219628736409e-06, + "loss": 0.7641, + "step": 5439 + }, + { + "epoch": 0.7083170636470129, + "grad_norm": 2.585766553878784, + "learning_rate": 8.86089206714716e-06, + "loss": 0.698, + "step": 5442 + }, + { + "epoch": 0.7087075361187036, + "grad_norm": 3.276780843734741, + "learning_rate": 8.859563831068188e-06, + "loss": 0.6059, + "step": 5445 + }, + { + "epoch": 0.7090980085903944, + "grad_norm": 2.3243212699890137, + "learning_rate": 8.858234920731536e-06, + "loss": 0.6083, + "step": 5448 + }, + { + "epoch": 0.7094884810620852, + "grad_norm": 2.7607672214508057, + "learning_rate": 8.856905336369359e-06, + "loss": 0.7022, + "step": 5451 + }, + { + "epoch": 0.7098789535337758, + "grad_norm": 2.5045323371887207, + "learning_rate": 8.855575078213933e-06, + "loss": 0.5898, + "step": 5454 + }, + { + "epoch": 0.7102694260054666, + "grad_norm": 2.378309488296509, + "learning_rate": 8.854244146497654e-06, + "loss": 0.6493, + "step": 5457 + }, + { + "epoch": 0.7106598984771574, + "grad_norm": 3.019075870513916, + "learning_rate": 8.852912541453029e-06, + "loss": 0.5836, + "step": 5460 + }, + { + "epoch": 0.7110503709488482, + "grad_norm": 2.7453722953796387, + "learning_rate": 8.85158026331269e-06, + "loss": 0.59, + "step": 5463 + }, + { + "epoch": 0.7114408434205388, + "grad_norm": 2.3514020442962646, + "learning_rate": 8.85024731230938e-06, + "loss": 0.5568, + "step": 5466 + }, + { + "epoch": 0.7118313158922296, + "grad_norm": 2.4078609943389893, + "learning_rate": 8.848913688675963e-06, + "loss": 0.694, + "step": 5469 + }, + { + "epoch": 0.7122217883639204, + "grad_norm": 2.3529157638549805, + "learning_rate": 8.847579392645425e-06, + "loss": 0.6763, + "step": 5472 + }, + { + "epoch": 0.7126122608356111, + "grad_norm": 2.9866087436676025, + "learning_rate": 8.846244424450858e-06, + "loss": 0.6423, + "step": 5475 + }, + { + "epoch": 0.7130027333073018, + "grad_norm": 2.784130334854126, + "learning_rate": 8.844908784325483e-06, + "loss": 0.6326, + "step": 5478 + }, + { + "epoch": 0.7133932057789926, + "grad_norm": 2.18365740776062, + "learning_rate": 8.84357247250263e-06, + "loss": 0.5958, + "step": 5481 + }, + { + "epoch": 0.7137836782506833, + "grad_norm": 2.3948042392730713, + "learning_rate": 8.842235489215755e-06, + "loss": 0.7437, + "step": 5484 + }, + { + "epoch": 0.7141741507223741, + "grad_norm": 2.9066920280456543, + "learning_rate": 8.84089783469842e-06, + "loss": 0.6481, + "step": 5487 + }, + { + "epoch": 0.7145646231940648, + "grad_norm": 2.770456075668335, + "learning_rate": 8.839559509184317e-06, + "loss": 0.6289, + "step": 5490 + }, + { + "epoch": 0.7149550956657555, + "grad_norm": 2.8254899978637695, + "learning_rate": 8.838220512907241e-06, + "loss": 0.6341, + "step": 5493 + }, + { + "epoch": 0.7153455681374463, + "grad_norm": 2.9626049995422363, + "learning_rate": 8.836880846101118e-06, + "loss": 0.6394, + "step": 5496 + }, + { + "epoch": 0.7157360406091371, + "grad_norm": 2.709442377090454, + "learning_rate": 8.835540508999982e-06, + 
"loss": 0.7887, + "step": 5499 + }, + { + "epoch": 0.7161265130808278, + "grad_norm": 2.8093841075897217, + "learning_rate": 8.834199501837988e-06, + "loss": 0.5902, + "step": 5502 + }, + { + "epoch": 0.7165169855525185, + "grad_norm": 2.7774314880371094, + "learning_rate": 8.832857824849407e-06, + "loss": 0.6338, + "step": 5505 + }, + { + "epoch": 0.7169074580242093, + "grad_norm": 4.112159729003906, + "learning_rate": 8.831515478268627e-06, + "loss": 0.6495, + "step": 5508 + }, + { + "epoch": 0.7172979304959001, + "grad_norm": 3.2125117778778076, + "learning_rate": 8.830172462330155e-06, + "loss": 0.639, + "step": 5511 + }, + { + "epoch": 0.7176884029675907, + "grad_norm": 2.521543264389038, + "learning_rate": 8.828828777268609e-06, + "loss": 0.6834, + "step": 5514 + }, + { + "epoch": 0.7180788754392815, + "grad_norm": 2.344866991043091, + "learning_rate": 8.827484423318731e-06, + "loss": 0.7033, + "step": 5517 + }, + { + "epoch": 0.7184693479109723, + "grad_norm": 3.047065496444702, + "learning_rate": 8.826139400715377e-06, + "loss": 0.6674, + "step": 5520 + }, + { + "epoch": 0.718859820382663, + "grad_norm": 2.916560411453247, + "learning_rate": 8.824793709693517e-06, + "loss": 0.6385, + "step": 5523 + }, + { + "epoch": 0.7192502928543538, + "grad_norm": 2.3283605575561523, + "learning_rate": 8.823447350488243e-06, + "loss": 0.6017, + "step": 5526 + }, + { + "epoch": 0.7196407653260445, + "grad_norm": 3.2105703353881836, + "learning_rate": 8.822100323334761e-06, + "loss": 0.635, + "step": 5529 + }, + { + "epoch": 0.7200312377977353, + "grad_norm": 2.4558160305023193, + "learning_rate": 8.820752628468391e-06, + "loss": 0.6822, + "step": 5532 + }, + { + "epoch": 0.720421710269426, + "grad_norm": 2.4689717292785645, + "learning_rate": 8.819404266124575e-06, + "loss": 0.7866, + "step": 5535 + }, + { + "epoch": 0.7208121827411168, + "grad_norm": 2.5155982971191406, + "learning_rate": 8.818055236538872e-06, + "loss": 0.6903, + "step": 5538 + }, + { + "epoch": 0.7212026552128075, + "grad_norm": 3.5505385398864746, + "learning_rate": 8.816705539946948e-06, + "loss": 0.5982, + "step": 5541 + }, + { + "epoch": 0.7215931276844982, + "grad_norm": 2.4933745861053467, + "learning_rate": 8.815355176584595e-06, + "loss": 0.6077, + "step": 5544 + }, + { + "epoch": 0.721983600156189, + "grad_norm": 3.3649566173553467, + "learning_rate": 8.81400414668772e-06, + "loss": 0.7257, + "step": 5547 + }, + { + "epoch": 0.7223740726278798, + "grad_norm": 2.446016550064087, + "learning_rate": 8.812652450492345e-06, + "loss": 0.5711, + "step": 5550 + }, + { + "epoch": 0.7227645450995704, + "grad_norm": 2.715980052947998, + "learning_rate": 8.811300088234607e-06, + "loss": 0.6866, + "step": 5553 + }, + { + "epoch": 0.7231550175712612, + "grad_norm": 2.724425792694092, + "learning_rate": 8.80994706015076e-06, + "loss": 0.6597, + "step": 5556 + }, + { + "epoch": 0.723545490042952, + "grad_norm": 3.384270429611206, + "learning_rate": 8.808593366477177e-06, + "loss": 0.6568, + "step": 5559 + }, + { + "epoch": 0.7239359625146428, + "grad_norm": 2.415548086166382, + "learning_rate": 8.807239007450345e-06, + "loss": 0.6081, + "step": 5562 + }, + { + "epoch": 0.7243264349863334, + "grad_norm": 2.3857581615448, + "learning_rate": 8.805883983306869e-06, + "loss": 0.7436, + "step": 5565 + }, + { + "epoch": 0.7247169074580242, + "grad_norm": 2.494887351989746, + "learning_rate": 8.804528294283466e-06, + "loss": 0.6451, + "step": 5568 + }, + { + "epoch": 0.725107379929715, + "grad_norm": 2.5483858585357666, + "learning_rate": 
8.803171940616974e-06, + "loss": 0.6226, + "step": 5571 + }, + { + "epoch": 0.7254978524014057, + "grad_norm": 2.5574679374694824, + "learning_rate": 8.801814922544345e-06, + "loss": 0.6289, + "step": 5574 + }, + { + "epoch": 0.7258883248730964, + "grad_norm": 2.4144797325134277, + "learning_rate": 8.800457240302646e-06, + "loss": 0.6128, + "step": 5577 + }, + { + "epoch": 0.7262787973447872, + "grad_norm": 2.4427857398986816, + "learning_rate": 8.799098894129063e-06, + "loss": 0.6263, + "step": 5580 + }, + { + "epoch": 0.726669269816478, + "grad_norm": 2.785493850708008, + "learning_rate": 8.797739884260896e-06, + "loss": 0.7186, + "step": 5583 + }, + { + "epoch": 0.7270597422881687, + "grad_norm": 2.493211030960083, + "learning_rate": 8.79638021093556e-06, + "loss": 0.6333, + "step": 5586 + }, + { + "epoch": 0.7274502147598594, + "grad_norm": 2.7701425552368164, + "learning_rate": 8.795019874390587e-06, + "loss": 0.7146, + "step": 5589 + }, + { + "epoch": 0.7278406872315502, + "grad_norm": 2.20003604888916, + "learning_rate": 8.793658874863626e-06, + "loss": 0.5846, + "step": 5592 + }, + { + "epoch": 0.7282311597032409, + "grad_norm": 3.9232680797576904, + "learning_rate": 8.79229721259244e-06, + "loss": 0.6657, + "step": 5595 + }, + { + "epoch": 0.7286216321749317, + "grad_norm": 2.5265204906463623, + "learning_rate": 8.790934887814908e-06, + "loss": 0.6148, + "step": 5598 + }, + { + "epoch": 0.7290121046466224, + "grad_norm": 2.7512199878692627, + "learning_rate": 8.789571900769028e-06, + "loss": 0.6684, + "step": 5601 + }, + { + "epoch": 0.7294025771183131, + "grad_norm": 2.3781023025512695, + "learning_rate": 8.788208251692908e-06, + "loss": 0.7028, + "step": 5604 + }, + { + "epoch": 0.7297930495900039, + "grad_norm": 2.4492669105529785, + "learning_rate": 8.786843940824775e-06, + "loss": 0.5972, + "step": 5607 + }, + { + "epoch": 0.7301835220616947, + "grad_norm": 2.4138023853302, + "learning_rate": 8.785478968402972e-06, + "loss": 0.5792, + "step": 5610 + }, + { + "epoch": 0.7305739945333855, + "grad_norm": 3.772378921508789, + "learning_rate": 8.784113334665958e-06, + "loss": 0.7113, + "step": 5613 + }, + { + "epoch": 0.7309644670050761, + "grad_norm": 2.438852071762085, + "learning_rate": 8.782747039852304e-06, + "loss": 0.6087, + "step": 5616 + }, + { + "epoch": 0.7313549394767669, + "grad_norm": 2.447324514389038, + "learning_rate": 8.7813800842007e-06, + "loss": 0.6334, + "step": 5619 + }, + { + "epoch": 0.7317454119484577, + "grad_norm": 2.2790205478668213, + "learning_rate": 8.78001246794995e-06, + "loss": 0.6197, + "step": 5622 + }, + { + "epoch": 0.7321358844201484, + "grad_norm": 2.459559917449951, + "learning_rate": 8.778644191338974e-06, + "loss": 0.6726, + "step": 5625 + }, + { + "epoch": 0.7325263568918391, + "grad_norm": 2.6263234615325928, + "learning_rate": 8.777275254606808e-06, + "loss": 0.6843, + "step": 5628 + }, + { + "epoch": 0.7329168293635299, + "grad_norm": 2.590893507003784, + "learning_rate": 8.775905657992599e-06, + "loss": 0.7584, + "step": 5631 + }, + { + "epoch": 0.7333073018352206, + "grad_norm": 2.9429869651794434, + "learning_rate": 8.774535401735616e-06, + "loss": 0.687, + "step": 5634 + }, + { + "epoch": 0.7336977743069114, + "grad_norm": 2.8409743309020996, + "learning_rate": 8.773164486075238e-06, + "loss": 0.7379, + "step": 5637 + }, + { + "epoch": 0.7340882467786021, + "grad_norm": 2.462329864501953, + "learning_rate": 8.771792911250963e-06, + "loss": 0.6646, + "step": 5640 + }, + { + "epoch": 0.7344787192502928, + "grad_norm": 
2.554640769958496, + "learning_rate": 8.770420677502401e-06, + "loss": 0.7051, + "step": 5643 + }, + { + "epoch": 0.7348691917219836, + "grad_norm": 2.4346702098846436, + "learning_rate": 8.769047785069277e-06, + "loss": 0.6723, + "step": 5646 + }, + { + "epoch": 0.7352596641936744, + "grad_norm": 2.7797698974609375, + "learning_rate": 8.767674234191436e-06, + "loss": 0.6458, + "step": 5649 + }, + { + "epoch": 0.735650136665365, + "grad_norm": 2.8324296474456787, + "learning_rate": 8.76630002510883e-06, + "loss": 0.7206, + "step": 5652 + }, + { + "epoch": 0.7360406091370558, + "grad_norm": 2.280325412750244, + "learning_rate": 8.764925158061537e-06, + "loss": 0.6426, + "step": 5655 + }, + { + "epoch": 0.7364310816087466, + "grad_norm": 3.168109655380249, + "learning_rate": 8.763549633289737e-06, + "loss": 0.6371, + "step": 5658 + }, + { + "epoch": 0.7368215540804374, + "grad_norm": 3.2661867141723633, + "learning_rate": 8.762173451033731e-06, + "loss": 0.6164, + "step": 5661 + }, + { + "epoch": 0.737212026552128, + "grad_norm": 4.669281959533691, + "learning_rate": 8.760796611533939e-06, + "loss": 0.6704, + "step": 5664 + }, + { + "epoch": 0.7376024990238188, + "grad_norm": 2.702472686767578, + "learning_rate": 8.75941911503089e-06, + "loss": 0.6053, + "step": 5667 + }, + { + "epoch": 0.7379929714955096, + "grad_norm": 2.3921306133270264, + "learning_rate": 8.758040961765233e-06, + "loss": 0.6229, + "step": 5670 + }, + { + "epoch": 0.7383834439672003, + "grad_norm": 2.605787515640259, + "learning_rate": 8.756662151977724e-06, + "loss": 0.7242, + "step": 5673 + }, + { + "epoch": 0.738773916438891, + "grad_norm": 2.5881075859069824, + "learning_rate": 8.755282685909239e-06, + "loss": 0.6878, + "step": 5676 + }, + { + "epoch": 0.7391643889105818, + "grad_norm": 2.3970558643341064, + "learning_rate": 8.753902563800769e-06, + "loss": 0.6808, + "step": 5679 + }, + { + "epoch": 0.7395548613822726, + "grad_norm": 2.6305997371673584, + "learning_rate": 8.752521785893418e-06, + "loss": 0.6692, + "step": 5682 + }, + { + "epoch": 0.7399453338539633, + "grad_norm": 3.6381564140319824, + "learning_rate": 8.751140352428406e-06, + "loss": 0.6898, + "step": 5685 + }, + { + "epoch": 0.740335806325654, + "grad_norm": 2.4845845699310303, + "learning_rate": 8.749758263647066e-06, + "loss": 0.6876, + "step": 5688 + }, + { + "epoch": 0.7407262787973448, + "grad_norm": 2.307579517364502, + "learning_rate": 8.748375519790846e-06, + "loss": 0.5223, + "step": 5691 + }, + { + "epoch": 0.7411167512690355, + "grad_norm": 2.4577105045318604, + "learning_rate": 8.746992121101306e-06, + "loss": 0.6736, + "step": 5694 + }, + { + "epoch": 0.7415072237407263, + "grad_norm": 2.539513111114502, + "learning_rate": 8.745608067820127e-06, + "loss": 0.6554, + "step": 5697 + }, + { + "epoch": 0.7418976962124171, + "grad_norm": 2.9040579795837402, + "learning_rate": 8.744223360189097e-06, + "loss": 0.6961, + "step": 5700 + }, + { + "epoch": 0.7422881686841077, + "grad_norm": 3.254425525665283, + "learning_rate": 8.742837998450122e-06, + "loss": 0.6879, + "step": 5703 + }, + { + "epoch": 0.7426786411557985, + "grad_norm": 3.117102861404419, + "learning_rate": 8.741451982845224e-06, + "loss": 0.6168, + "step": 5706 + }, + { + "epoch": 0.7430691136274893, + "grad_norm": 2.4279301166534424, + "learning_rate": 8.740065313616536e-06, + "loss": 0.6354, + "step": 5709 + }, + { + "epoch": 0.7434595860991801, + "grad_norm": 2.2769625186920166, + "learning_rate": 8.738677991006304e-06, + "loss": 0.6411, + "step": 5712 + }, + { + "epoch": 
0.7438500585708707, + "grad_norm": 2.597215175628662, + "learning_rate": 8.737290015256892e-06, + "loss": 0.6494, + "step": 5715 + }, + { + "epoch": 0.7442405310425615, + "grad_norm": 3.5001959800720215, + "learning_rate": 8.735901386610777e-06, + "loss": 0.6644, + "step": 5718 + }, + { + "epoch": 0.7446310035142523, + "grad_norm": 2.5007054805755615, + "learning_rate": 8.73451210531055e-06, + "loss": 0.643, + "step": 5721 + }, + { + "epoch": 0.745021475985943, + "grad_norm": 2.652787923812866, + "learning_rate": 8.733122171598914e-06, + "loss": 0.7299, + "step": 5724 + }, + { + "epoch": 0.7454119484576337, + "grad_norm": 2.6559813022613525, + "learning_rate": 8.731731585718687e-06, + "loss": 0.6918, + "step": 5727 + }, + { + "epoch": 0.7458024209293245, + "grad_norm": 2.646437644958496, + "learning_rate": 8.730340347912803e-06, + "loss": 0.6691, + "step": 5730 + }, + { + "epoch": 0.7461928934010152, + "grad_norm": 2.6184146404266357, + "learning_rate": 8.728948458424307e-06, + "loss": 0.6599, + "step": 5733 + }, + { + "epoch": 0.746583365872706, + "grad_norm": 2.406266927719116, + "learning_rate": 8.72755591749636e-06, + "loss": 0.6671, + "step": 5736 + }, + { + "epoch": 0.7469738383443967, + "grad_norm": 3.493818759918213, + "learning_rate": 8.726162725372237e-06, + "loss": 0.6482, + "step": 5739 + }, + { + "epoch": 0.7473643108160875, + "grad_norm": 2.1511340141296387, + "learning_rate": 8.724768882295324e-06, + "loss": 0.6, + "step": 5742 + }, + { + "epoch": 0.7477547832877782, + "grad_norm": 2.556877851486206, + "learning_rate": 8.723374388509123e-06, + "loss": 0.6613, + "step": 5745 + }, + { + "epoch": 0.748145255759469, + "grad_norm": 2.497506856918335, + "learning_rate": 8.721979244257247e-06, + "loss": 0.6198, + "step": 5748 + }, + { + "epoch": 0.7485357282311597, + "grad_norm": 2.378761053085327, + "learning_rate": 8.72058344978343e-06, + "loss": 0.5909, + "step": 5751 + }, + { + "epoch": 0.7489262007028504, + "grad_norm": 3.043630838394165, + "learning_rate": 8.71918700533151e-06, + "loss": 0.6554, + "step": 5754 + }, + { + "epoch": 0.7493166731745412, + "grad_norm": 2.5283005237579346, + "learning_rate": 8.717789911145445e-06, + "loss": 0.6827, + "step": 5757 + }, + { + "epoch": 0.749707145646232, + "grad_norm": 2.631117105484009, + "learning_rate": 8.716392167469303e-06, + "loss": 0.6304, + "step": 5760 + }, + { + "epoch": 0.7500976181179226, + "grad_norm": 2.4751923084259033, + "learning_rate": 8.714993774547267e-06, + "loss": 0.7159, + "step": 5763 + }, + { + "epoch": 0.7504880905896134, + "grad_norm": 3.3558480739593506, + "learning_rate": 8.713594732623635e-06, + "loss": 0.7029, + "step": 5766 + }, + { + "epoch": 0.7508785630613042, + "grad_norm": 2.266209125518799, + "learning_rate": 8.712195041942814e-06, + "loss": 0.7196, + "step": 5769 + }, + { + "epoch": 0.751269035532995, + "grad_norm": 2.9685750007629395, + "learning_rate": 8.71079470274933e-06, + "loss": 0.6283, + "step": 5772 + }, + { + "epoch": 0.7516595080046856, + "grad_norm": 2.5255727767944336, + "learning_rate": 8.709393715287817e-06, + "loss": 0.5652, + "step": 5775 + }, + { + "epoch": 0.7520499804763764, + "grad_norm": 2.282809257507324, + "learning_rate": 8.707992079803025e-06, + "loss": 0.5898, + "step": 5778 + }, + { + "epoch": 0.7524404529480672, + "grad_norm": 2.4005398750305176, + "learning_rate": 8.706589796539818e-06, + "loss": 0.7173, + "step": 5781 + }, + { + "epoch": 0.7528309254197579, + "grad_norm": 3.0054335594177246, + "learning_rate": 8.70518686574317e-06, + "loss": 0.6631, + "step": 5784 + 
}, + { + "epoch": 0.7532213978914487, + "grad_norm": 2.390732526779175, + "learning_rate": 8.703783287658172e-06, + "loss": 0.7253, + "step": 5787 + }, + { + "epoch": 0.7536118703631394, + "grad_norm": 2.417182445526123, + "learning_rate": 8.702379062530026e-06, + "loss": 0.6376, + "step": 5790 + }, + { + "epoch": 0.7540023428348301, + "grad_norm": 2.577763795852661, + "learning_rate": 8.700974190604045e-06, + "loss": 0.621, + "step": 5793 + }, + { + "epoch": 0.7543928153065209, + "grad_norm": 2.8259828090667725, + "learning_rate": 8.69956867212566e-06, + "loss": 0.624, + "step": 5796 + }, + { + "epoch": 0.7547832877782117, + "grad_norm": 3.2486746311187744, + "learning_rate": 8.698162507340408e-06, + "loss": 0.6333, + "step": 5799 + }, + { + "epoch": 0.7551737602499023, + "grad_norm": 2.8928191661834717, + "learning_rate": 8.696755696493949e-06, + "loss": 0.6203, + "step": 5802 + }, + { + "epoch": 0.7555642327215931, + "grad_norm": 3.078284502029419, + "learning_rate": 8.695348239832045e-06, + "loss": 0.6175, + "step": 5805 + }, + { + "epoch": 0.7559547051932839, + "grad_norm": 2.5001373291015625, + "learning_rate": 8.693940137600578e-06, + "loss": 0.639, + "step": 5808 + }, + { + "epoch": 0.7563451776649747, + "grad_norm": 2.3842241764068604, + "learning_rate": 8.692531390045538e-06, + "loss": 0.6531, + "step": 5811 + }, + { + "epoch": 0.7567356501366653, + "grad_norm": 2.515578269958496, + "learning_rate": 8.691121997413035e-06, + "loss": 0.6603, + "step": 5814 + }, + { + "epoch": 0.7571261226083561, + "grad_norm": 2.7876977920532227, + "learning_rate": 8.689711959949282e-06, + "loss": 0.6454, + "step": 5817 + }, + { + "epoch": 0.7575165950800469, + "grad_norm": 3.4954264163970947, + "learning_rate": 8.688301277900613e-06, + "loss": 0.7285, + "step": 5820 + }, + { + "epoch": 0.7579070675517376, + "grad_norm": 2.9758989810943604, + "learning_rate": 8.686889951513468e-06, + "loss": 0.6519, + "step": 5823 + }, + { + "epoch": 0.7582975400234283, + "grad_norm": 2.79794979095459, + "learning_rate": 8.685477981034407e-06, + "loss": 0.6698, + "step": 5826 + }, + { + "epoch": 0.7586880124951191, + "grad_norm": 2.7669429779052734, + "learning_rate": 8.684065366710093e-06, + "loss": 0.7602, + "step": 5829 + }, + { + "epoch": 0.7590784849668099, + "grad_norm": 2.231877326965332, + "learning_rate": 8.682652108787312e-06, + "loss": 0.6112, + "step": 5832 + }, + { + "epoch": 0.7594689574385006, + "grad_norm": 2.530808210372925, + "learning_rate": 8.681238207512955e-06, + "loss": 0.5664, + "step": 5835 + }, + { + "epoch": 0.7598594299101913, + "grad_norm": 3.670186758041382, + "learning_rate": 8.679823663134025e-06, + "loss": 0.5716, + "step": 5838 + }, + { + "epoch": 0.7602499023818821, + "grad_norm": 2.7073922157287598, + "learning_rate": 8.678408475897643e-06, + "loss": 0.556, + "step": 5841 + }, + { + "epoch": 0.7606403748535728, + "grad_norm": 2.874410390853882, + "learning_rate": 8.67699264605104e-06, + "loss": 0.6784, + "step": 5844 + }, + { + "epoch": 0.7610308473252636, + "grad_norm": 2.438572883605957, + "learning_rate": 8.675576173841555e-06, + "loss": 0.7022, + "step": 5847 + }, + { + "epoch": 0.7614213197969543, + "grad_norm": 2.563101053237915, + "learning_rate": 8.674159059516645e-06, + "loss": 0.7107, + "step": 5850 + }, + { + "epoch": 0.761811792268645, + "grad_norm": 2.388939380645752, + "learning_rate": 8.672741303323877e-06, + "loss": 0.6116, + "step": 5853 + }, + { + "epoch": 0.7622022647403358, + "grad_norm": 2.337643623352051, + "learning_rate": 8.671322905510931e-06, + "loss": 
0.6408, + "step": 5856 + }, + { + "epoch": 0.7625927372120266, + "grad_norm": 2.49255633354187, + "learning_rate": 8.669903866325594e-06, + "loss": 0.6909, + "step": 5859 + }, + { + "epoch": 0.7629832096837172, + "grad_norm": 2.3588013648986816, + "learning_rate": 8.668484186015775e-06, + "loss": 0.6309, + "step": 5862 + }, + { + "epoch": 0.763373682155408, + "grad_norm": 2.3805925846099854, + "learning_rate": 8.667063864829483e-06, + "loss": 0.6424, + "step": 5865 + }, + { + "epoch": 0.7637641546270988, + "grad_norm": 3.148681879043579, + "learning_rate": 8.665642903014851e-06, + "loss": 0.6926, + "step": 5868 + }, + { + "epoch": 0.7641546270987896, + "grad_norm": 2.75685977935791, + "learning_rate": 8.664221300820114e-06, + "loss": 0.6597, + "step": 5871 + }, + { + "epoch": 0.7645450995704803, + "grad_norm": 2.3916969299316406, + "learning_rate": 8.662799058493625e-06, + "loss": 0.5482, + "step": 5874 + }, + { + "epoch": 0.764935572042171, + "grad_norm": 2.5081396102905273, + "learning_rate": 8.661376176283844e-06, + "loss": 0.6723, + "step": 5877 + }, + { + "epoch": 0.7653260445138618, + "grad_norm": 2.1820735931396484, + "learning_rate": 8.659952654439348e-06, + "loss": 0.6866, + "step": 5880 + }, + { + "epoch": 0.7657165169855525, + "grad_norm": 2.611650228500366, + "learning_rate": 8.658528493208825e-06, + "loss": 0.7078, + "step": 5883 + }, + { + "epoch": 0.7661069894572433, + "grad_norm": 2.291048765182495, + "learning_rate": 8.657103692841067e-06, + "loss": 0.6261, + "step": 5886 + }, + { + "epoch": 0.766497461928934, + "grad_norm": 2.417051076889038, + "learning_rate": 8.655678253584989e-06, + "loss": 0.6115, + "step": 5889 + }, + { + "epoch": 0.7668879344006247, + "grad_norm": 2.867833137512207, + "learning_rate": 8.65425217568961e-06, + "loss": 0.7631, + "step": 5892 + }, + { + "epoch": 0.7672784068723155, + "grad_norm": 2.566713571548462, + "learning_rate": 8.652825459404065e-06, + "loss": 0.5086, + "step": 5895 + }, + { + "epoch": 0.7676688793440063, + "grad_norm": 2.6429190635681152, + "learning_rate": 8.651398104977595e-06, + "loss": 0.5853, + "step": 5898 + }, + { + "epoch": 0.768059351815697, + "grad_norm": 2.9396679401397705, + "learning_rate": 8.649970112659558e-06, + "loss": 0.5673, + "step": 5901 + }, + { + "epoch": 0.7684498242873877, + "grad_norm": 2.601750612258911, + "learning_rate": 8.648541482699422e-06, + "loss": 0.7371, + "step": 5904 + }, + { + "epoch": 0.7688402967590785, + "grad_norm": 2.7551345825195312, + "learning_rate": 8.647112215346763e-06, + "loss": 0.6192, + "step": 5907 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 2.5565147399902344, + "learning_rate": 8.645682310851273e-06, + "loss": 0.6002, + "step": 5910 + }, + { + "epoch": 0.7696212417024599, + "grad_norm": 3.5521817207336426, + "learning_rate": 8.644251769462751e-06, + "loss": 0.6178, + "step": 5913 + }, + { + "epoch": 0.7700117141741507, + "grad_norm": 2.3843579292297363, + "learning_rate": 8.642820591431114e-06, + "loss": 0.5701, + "step": 5916 + }, + { + "epoch": 0.7704021866458415, + "grad_norm": 2.349097490310669, + "learning_rate": 8.641388777006381e-06, + "loss": 0.6391, + "step": 5919 + }, + { + "epoch": 0.7707926591175323, + "grad_norm": 2.2657618522644043, + "learning_rate": 8.639956326438688e-06, + "loss": 0.6661, + "step": 5922 + }, + { + "epoch": 0.7711831315892229, + "grad_norm": 3.2351880073547363, + "learning_rate": 8.638523239978285e-06, + "loss": 0.6367, + "step": 5925 + }, + { + "epoch": 0.7715736040609137, + "grad_norm": 2.358475923538208, + "learning_rate": 
8.637089517875524e-06, + "loss": 0.6163, + "step": 5928 + }, + { + "epoch": 0.7719640765326045, + "grad_norm": 2.8347368240356445, + "learning_rate": 8.635655160380879e-06, + "loss": 0.7084, + "step": 5931 + }, + { + "epoch": 0.7723545490042952, + "grad_norm": 2.7897112369537354, + "learning_rate": 8.634220167744922e-06, + "loss": 0.612, + "step": 5934 + }, + { + "epoch": 0.7727450214759859, + "grad_norm": 2.506141424179077, + "learning_rate": 8.632784540218348e-06, + "loss": 0.6386, + "step": 5937 + }, + { + "epoch": 0.7731354939476767, + "grad_norm": 2.703279495239258, + "learning_rate": 8.631348278051956e-06, + "loss": 0.6336, + "step": 5940 + }, + { + "epoch": 0.7735259664193674, + "grad_norm": 2.3759536743164062, + "learning_rate": 8.62991138149666e-06, + "loss": 0.6074, + "step": 5943 + }, + { + "epoch": 0.7739164388910582, + "grad_norm": 3.115846872329712, + "learning_rate": 8.628473850803484e-06, + "loss": 0.6421, + "step": 5946 + }, + { + "epoch": 0.7743069113627489, + "grad_norm": 3.065495729446411, + "learning_rate": 8.627035686223557e-06, + "loss": 0.6935, + "step": 5949 + }, + { + "epoch": 0.7746973838344396, + "grad_norm": 2.664783477783203, + "learning_rate": 8.625596888008127e-06, + "loss": 0.6864, + "step": 5952 + }, + { + "epoch": 0.7750878563061304, + "grad_norm": 2.6792876720428467, + "learning_rate": 8.624157456408547e-06, + "loss": 0.7312, + "step": 5955 + }, + { + "epoch": 0.7754783287778212, + "grad_norm": 2.283553123474121, + "learning_rate": 8.622717391676284e-06, + "loss": 0.6556, + "step": 5958 + }, + { + "epoch": 0.775868801249512, + "grad_norm": 3.2782957553863525, + "learning_rate": 8.621276694062915e-06, + "loss": 0.6916, + "step": 5961 + }, + { + "epoch": 0.7762592737212026, + "grad_norm": 2.4025683403015137, + "learning_rate": 8.619835363820123e-06, + "loss": 0.8082, + "step": 5964 + }, + { + "epoch": 0.7766497461928934, + "grad_norm": 2.7000598907470703, + "learning_rate": 8.61839340119971e-06, + "loss": 0.6322, + "step": 5967 + }, + { + "epoch": 0.7770402186645842, + "grad_norm": 3.0906264781951904, + "learning_rate": 8.61695080645358e-06, + "loss": 0.6954, + "step": 5970 + }, + { + "epoch": 0.7774306911362749, + "grad_norm": 2.45308256149292, + "learning_rate": 8.615507579833754e-06, + "loss": 0.7327, + "step": 5973 + }, + { + "epoch": 0.7778211636079656, + "grad_norm": 2.349186420440674, + "learning_rate": 8.61406372159236e-06, + "loss": 0.6414, + "step": 5976 + }, + { + "epoch": 0.7782116360796564, + "grad_norm": 2.472166061401367, + "learning_rate": 8.612619231981636e-06, + "loss": 0.6639, + "step": 5979 + }, + { + "epoch": 0.7786021085513471, + "grad_norm": 2.506831645965576, + "learning_rate": 8.611174111253932e-06, + "loss": 0.5965, + "step": 5982 + }, + { + "epoch": 0.7789925810230379, + "grad_norm": 3.1906239986419678, + "learning_rate": 8.609728359661709e-06, + "loss": 0.6799, + "step": 5985 + }, + { + "epoch": 0.7793830534947286, + "grad_norm": 3.2461135387420654, + "learning_rate": 8.608281977457534e-06, + "loss": 0.7414, + "step": 5988 + }, + { + "epoch": 0.7797735259664194, + "grad_norm": 2.6268630027770996, + "learning_rate": 8.606834964894089e-06, + "loss": 0.655, + "step": 5991 + }, + { + "epoch": 0.7801639984381101, + "grad_norm": 2.3125433921813965, + "learning_rate": 8.605387322224162e-06, + "loss": 0.6857, + "step": 5994 + }, + { + "epoch": 0.7805544709098009, + "grad_norm": 3.1327126026153564, + "learning_rate": 8.603939049700655e-06, + "loss": 0.7118, + "step": 5997 + }, + { + "epoch": 0.7809449433814916, + "grad_norm": 
3.7016682624816895, + "learning_rate": 8.602490147576579e-06, + "loss": 0.7118, + "step": 6000 + }, + { + "epoch": 0.7813354158531823, + "grad_norm": 2.4639270305633545, + "learning_rate": 8.601040616105053e-06, + "loss": 0.7133, + "step": 6003 + }, + { + "epoch": 0.7817258883248731, + "grad_norm": 2.384366989135742, + "learning_rate": 8.599590455539304e-06, + "loss": 0.6218, + "step": 6006 + }, + { + "epoch": 0.7821163607965639, + "grad_norm": 3.026371717453003, + "learning_rate": 8.598139666132676e-06, + "loss": 0.6333, + "step": 6009 + }, + { + "epoch": 0.7825068332682545, + "grad_norm": 2.3918018341064453, + "learning_rate": 8.596688248138618e-06, + "loss": 0.5672, + "step": 6012 + }, + { + "epoch": 0.7828973057399453, + "grad_norm": 2.7614986896514893, + "learning_rate": 8.59523620181069e-06, + "loss": 0.68, + "step": 6015 + }, + { + "epoch": 0.7832877782116361, + "grad_norm": 2.838771343231201, + "learning_rate": 8.59378352740256e-06, + "loss": 0.7742, + "step": 6018 + }, + { + "epoch": 0.7836782506833269, + "grad_norm": 2.317967176437378, + "learning_rate": 8.592330225168008e-06, + "loss": 0.6576, + "step": 6021 + }, + { + "epoch": 0.7840687231550175, + "grad_norm": 2.4622199535369873, + "learning_rate": 8.590876295360924e-06, + "loss": 0.706, + "step": 6024 + }, + { + "epoch": 0.7844591956267083, + "grad_norm": 2.2674942016601562, + "learning_rate": 8.589421738235304e-06, + "loss": 0.5825, + "step": 6027 + }, + { + "epoch": 0.7848496680983991, + "grad_norm": 2.866095781326294, + "learning_rate": 8.587966554045259e-06, + "loss": 0.6909, + "step": 6030 + }, + { + "epoch": 0.7852401405700898, + "grad_norm": 2.193502902984619, + "learning_rate": 8.586510743045002e-06, + "loss": 0.7236, + "step": 6033 + }, + { + "epoch": 0.7856306130417805, + "grad_norm": 2.5779576301574707, + "learning_rate": 8.585054305488866e-06, + "loss": 0.6158, + "step": 6036 + }, + { + "epoch": 0.7860210855134713, + "grad_norm": 2.6559667587280273, + "learning_rate": 8.583597241631283e-06, + "loss": 0.5794, + "step": 6039 + }, + { + "epoch": 0.786411557985162, + "grad_norm": 2.5779333114624023, + "learning_rate": 8.582139551726801e-06, + "loss": 0.6935, + "step": 6042 + }, + { + "epoch": 0.7868020304568528, + "grad_norm": 2.5629148483276367, + "learning_rate": 8.580681236030075e-06, + "loss": 0.615, + "step": 6045 + }, + { + "epoch": 0.7871925029285436, + "grad_norm": 2.830688238143921, + "learning_rate": 8.579222294795871e-06, + "loss": 0.6147, + "step": 6048 + }, + { + "epoch": 0.7875829754002343, + "grad_norm": 2.249699115753174, + "learning_rate": 8.57776272827906e-06, + "loss": 0.6282, + "step": 6051 + }, + { + "epoch": 0.787973447871925, + "grad_norm": 2.603372097015381, + "learning_rate": 8.576302536734628e-06, + "loss": 0.8085, + "step": 6054 + }, + { + "epoch": 0.7883639203436158, + "grad_norm": 2.5415706634521484, + "learning_rate": 8.574841720417666e-06, + "loss": 0.733, + "step": 6057 + }, + { + "epoch": 0.7887543928153066, + "grad_norm": 2.7278621196746826, + "learning_rate": 8.573380279583374e-06, + "loss": 0.6182, + "step": 6060 + }, + { + "epoch": 0.7891448652869972, + "grad_norm": 2.673105001449585, + "learning_rate": 8.571918214487068e-06, + "loss": 0.702, + "step": 6063 + }, + { + "epoch": 0.789535337758688, + "grad_norm": 2.781696081161499, + "learning_rate": 8.570455525384161e-06, + "loss": 0.6388, + "step": 6066 + }, + { + "epoch": 0.7899258102303788, + "grad_norm": 2.392241954803467, + "learning_rate": 8.568992212530187e-06, + "loss": 0.5784, + "step": 6069 + }, + { + "epoch": 
0.7903162827020696, + "grad_norm": 3.162004232406616, + "learning_rate": 8.56752827618078e-06, + "loss": 0.7068, + "step": 6072 + }, + { + "epoch": 0.7907067551737602, + "grad_norm": 2.509340763092041, + "learning_rate": 8.566063716591689e-06, + "loss": 0.6832, + "step": 6075 + }, + { + "epoch": 0.791097227645451, + "grad_norm": 3.036722421646118, + "learning_rate": 8.56459853401877e-06, + "loss": 0.7436, + "step": 6078 + }, + { + "epoch": 0.7914877001171418, + "grad_norm": 2.521820306777954, + "learning_rate": 8.563132728717983e-06, + "loss": 0.6142, + "step": 6081 + }, + { + "epoch": 0.7918781725888325, + "grad_norm": 2.626577377319336, + "learning_rate": 8.561666300945406e-06, + "loss": 0.6366, + "step": 6084 + }, + { + "epoch": 0.7922686450605232, + "grad_norm": 2.6800007820129395, + "learning_rate": 8.560199250957218e-06, + "loss": 0.6728, + "step": 6087 + }, + { + "epoch": 0.792659117532214, + "grad_norm": 2.4256107807159424, + "learning_rate": 8.55873157900971e-06, + "loss": 0.6083, + "step": 6090 + }, + { + "epoch": 0.7930495900039047, + "grad_norm": 2.7973265647888184, + "learning_rate": 8.557263285359282e-06, + "loss": 0.6473, + "step": 6093 + }, + { + "epoch": 0.7934400624755955, + "grad_norm": 4.817023277282715, + "learning_rate": 8.55579437026244e-06, + "loss": 0.6608, + "step": 6096 + }, + { + "epoch": 0.7938305349472862, + "grad_norm": 2.4599454402923584, + "learning_rate": 8.554324833975805e-06, + "loss": 0.5633, + "step": 6099 + }, + { + "epoch": 0.7942210074189769, + "grad_norm": 2.911656618118286, + "learning_rate": 8.552854676756097e-06, + "loss": 0.683, + "step": 6102 + }, + { + "epoch": 0.7946114798906677, + "grad_norm": 2.656649589538574, + "learning_rate": 8.551383898860152e-06, + "loss": 0.7645, + "step": 6105 + }, + { + "epoch": 0.7950019523623585, + "grad_norm": 2.6597232818603516, + "learning_rate": 8.54991250054491e-06, + "loss": 0.6028, + "step": 6108 + }, + { + "epoch": 0.7953924248340491, + "grad_norm": 2.368178367614746, + "learning_rate": 8.548440482067422e-06, + "loss": 0.6649, + "step": 6111 + }, + { + "epoch": 0.7957828973057399, + "grad_norm": 2.8005642890930176, + "learning_rate": 8.546967843684846e-06, + "loss": 0.6525, + "step": 6114 + }, + { + "epoch": 0.7961733697774307, + "grad_norm": 2.426015853881836, + "learning_rate": 8.54549458565445e-06, + "loss": 0.5669, + "step": 6117 + }, + { + "epoch": 0.7965638422491215, + "grad_norm": 2.4316413402557373, + "learning_rate": 8.544020708233608e-06, + "loss": 0.6233, + "step": 6120 + }, + { + "epoch": 0.7969543147208121, + "grad_norm": 2.4489283561706543, + "learning_rate": 8.542546211679806e-06, + "loss": 0.7188, + "step": 6123 + }, + { + "epoch": 0.7973447871925029, + "grad_norm": 3.5121216773986816, + "learning_rate": 8.541071096250633e-06, + "loss": 0.6304, + "step": 6126 + }, + { + "epoch": 0.7977352596641937, + "grad_norm": 2.336344003677368, + "learning_rate": 8.539595362203787e-06, + "loss": 0.6539, + "step": 6129 + }, + { + "epoch": 0.7981257321358844, + "grad_norm": 2.4872500896453857, + "learning_rate": 8.538119009797079e-06, + "loss": 0.6592, + "step": 6132 + }, + { + "epoch": 0.7985162046075752, + "grad_norm": 2.25089955329895, + "learning_rate": 8.536642039288421e-06, + "loss": 0.6421, + "step": 6135 + }, + { + "epoch": 0.7989066770792659, + "grad_norm": 4.216074466705322, + "learning_rate": 8.53516445093584e-06, + "loss": 0.6712, + "step": 6138 + }, + { + "epoch": 0.7992971495509567, + "grad_norm": 3.146594285964966, + "learning_rate": 8.533686244997466e-06, + "loss": 0.7159, + "step": 6141 
+ }, + { + "epoch": 0.7996876220226474, + "grad_norm": 2.209770441055298, + "learning_rate": 8.53220742173154e-06, + "loss": 0.6019, + "step": 6144 + }, + { + "epoch": 0.8000780944943382, + "grad_norm": 3.0704033374786377, + "learning_rate": 8.530727981396406e-06, + "loss": 0.7287, + "step": 6147 + }, + { + "epoch": 0.8004685669660289, + "grad_norm": 2.567683219909668, + "learning_rate": 8.529247924250524e-06, + "loss": 0.6043, + "step": 6150 + }, + { + "epoch": 0.8008590394377196, + "grad_norm": 3.0786571502685547, + "learning_rate": 8.527767250552452e-06, + "loss": 0.6735, + "step": 6153 + }, + { + "epoch": 0.8012495119094104, + "grad_norm": 2.3544533252716064, + "learning_rate": 8.526285960560864e-06, + "loss": 0.6422, + "step": 6156 + }, + { + "epoch": 0.8016399843811012, + "grad_norm": 2.1804707050323486, + "learning_rate": 8.524804054534535e-06, + "loss": 0.6256, + "step": 6159 + }, + { + "epoch": 0.8020304568527918, + "grad_norm": 2.7374136447906494, + "learning_rate": 8.523321532732354e-06, + "loss": 0.5924, + "step": 6162 + }, + { + "epoch": 0.8024209293244826, + "grad_norm": 2.6108615398406982, + "learning_rate": 8.521838395413312e-06, + "loss": 0.7202, + "step": 6165 + }, + { + "epoch": 0.8028114017961734, + "grad_norm": 2.354628086090088, + "learning_rate": 8.520354642836512e-06, + "loss": 0.6135, + "step": 6168 + }, + { + "epoch": 0.8032018742678642, + "grad_norm": 2.340169668197632, + "learning_rate": 8.518870275261161e-06, + "loss": 0.6044, + "step": 6171 + }, + { + "epoch": 0.8035923467395548, + "grad_norm": 2.519382953643799, + "learning_rate": 8.517385292946578e-06, + "loss": 0.6896, + "step": 6174 + }, + { + "epoch": 0.8039828192112456, + "grad_norm": 2.6927080154418945, + "learning_rate": 8.515899696152183e-06, + "loss": 0.5729, + "step": 6177 + }, + { + "epoch": 0.8043732916829364, + "grad_norm": 2.767089605331421, + "learning_rate": 8.514413485137505e-06, + "loss": 0.5569, + "step": 6180 + }, + { + "epoch": 0.8047637641546271, + "grad_norm": 3.1286277770996094, + "learning_rate": 8.512926660162186e-06, + "loss": 0.6846, + "step": 6183 + }, + { + "epoch": 0.8051542366263178, + "grad_norm": 3.36635684967041, + "learning_rate": 8.511439221485971e-06, + "loss": 0.6434, + "step": 6186 + }, + { + "epoch": 0.8055447090980086, + "grad_norm": 2.8603038787841797, + "learning_rate": 8.50995116936871e-06, + "loss": 0.7117, + "step": 6189 + }, + { + "epoch": 0.8059351815696993, + "grad_norm": 2.3195858001708984, + "learning_rate": 8.508462504070363e-06, + "loss": 0.6657, + "step": 6192 + }, + { + "epoch": 0.8063256540413901, + "grad_norm": 2.69417142868042, + "learning_rate": 8.506973225850996e-06, + "loss": 0.6849, + "step": 6195 + }, + { + "epoch": 0.8067161265130808, + "grad_norm": 2.8202831745147705, + "learning_rate": 8.505483334970787e-06, + "loss": 0.7059, + "step": 6198 + }, + { + "epoch": 0.8071065989847716, + "grad_norm": 2.3420493602752686, + "learning_rate": 8.503992831690011e-06, + "loss": 0.6732, + "step": 6201 + }, + { + "epoch": 0.8074970714564623, + "grad_norm": 2.349193811416626, + "learning_rate": 8.502501716269061e-06, + "loss": 0.6318, + "step": 6204 + }, + { + "epoch": 0.8078875439281531, + "grad_norm": 2.51961350440979, + "learning_rate": 8.501009988968427e-06, + "loss": 0.6682, + "step": 6207 + }, + { + "epoch": 0.8082780163998438, + "grad_norm": 2.5512888431549072, + "learning_rate": 8.499517650048715e-06, + "loss": 0.6768, + "step": 6210 + }, + { + "epoch": 0.8086684888715345, + "grad_norm": 2.58780837059021, + "learning_rate": 8.498024699770631e-06, + 
"loss": 0.6196, + "step": 6213 + }, + { + "epoch": 0.8090589613432253, + "grad_norm": 3.9888927936553955, + "learning_rate": 8.49653113839499e-06, + "loss": 0.7719, + "step": 6216 + }, + { + "epoch": 0.8094494338149161, + "grad_norm": 3.109391927719116, + "learning_rate": 8.495036966182716e-06, + "loss": 0.7418, + "step": 6219 + }, + { + "epoch": 0.8098399062866068, + "grad_norm": 2.7313342094421387, + "learning_rate": 8.493542183394835e-06, + "loss": 0.7156, + "step": 6222 + }, + { + "epoch": 0.8102303787582975, + "grad_norm": 2.2368297576904297, + "learning_rate": 8.492046790292485e-06, + "loss": 0.673, + "step": 6225 + }, + { + "epoch": 0.8106208512299883, + "grad_norm": 2.565585136413574, + "learning_rate": 8.490550787136906e-06, + "loss": 0.6379, + "step": 6228 + }, + { + "epoch": 0.811011323701679, + "grad_norm": 2.4983272552490234, + "learning_rate": 8.489054174189448e-06, + "loss": 0.6459, + "step": 6231 + }, + { + "epoch": 0.8114017961733698, + "grad_norm": 2.1908957958221436, + "learning_rate": 8.487556951711567e-06, + "loss": 0.6178, + "step": 6234 + }, + { + "epoch": 0.8117922686450605, + "grad_norm": 2.3688669204711914, + "learning_rate": 8.486059119964822e-06, + "loss": 0.7055, + "step": 6237 + }, + { + "epoch": 0.8121827411167513, + "grad_norm": 2.3758630752563477, + "learning_rate": 8.484560679210883e-06, + "loss": 0.5999, + "step": 6240 + }, + { + "epoch": 0.812573213588442, + "grad_norm": 2.6127567291259766, + "learning_rate": 8.483061629711522e-06, + "loss": 0.7506, + "step": 6243 + }, + { + "epoch": 0.8129636860601328, + "grad_norm": 3.369610548019409, + "learning_rate": 8.481561971728622e-06, + "loss": 0.6454, + "step": 6246 + }, + { + "epoch": 0.8133541585318235, + "grad_norm": 2.357160806655884, + "learning_rate": 8.480061705524173e-06, + "loss": 0.621, + "step": 6249 + }, + { + "epoch": 0.8137446310035142, + "grad_norm": 2.6496059894561768, + "learning_rate": 8.47856083136026e-06, + "loss": 0.7154, + "step": 6252 + }, + { + "epoch": 0.814135103475205, + "grad_norm": 2.5359816551208496, + "learning_rate": 8.47705934949909e-06, + "loss": 0.7051, + "step": 6255 + }, + { + "epoch": 0.8145255759468958, + "grad_norm": 2.283336877822876, + "learning_rate": 8.475557260202966e-06, + "loss": 0.646, + "step": 6258 + }, + { + "epoch": 0.8149160484185864, + "grad_norm": 2.6512975692749023, + "learning_rate": 8.474054563734303e-06, + "loss": 0.6226, + "step": 6261 + }, + { + "epoch": 0.8153065208902772, + "grad_norm": 2.991300582885742, + "learning_rate": 8.472551260355612e-06, + "loss": 0.7599, + "step": 6264 + }, + { + "epoch": 0.815696993361968, + "grad_norm": 2.309941530227661, + "learning_rate": 8.471047350329523e-06, + "loss": 0.5883, + "step": 6267 + }, + { + "epoch": 0.8160874658336588, + "grad_norm": 2.3625473976135254, + "learning_rate": 8.469542833918762e-06, + "loss": 0.7385, + "step": 6270 + }, + { + "epoch": 0.8164779383053494, + "grad_norm": 2.453748941421509, + "learning_rate": 8.46803771138617e-06, + "loss": 0.5488, + "step": 6273 + }, + { + "epoch": 0.8168684107770402, + "grad_norm": 3.088710069656372, + "learning_rate": 8.466531982994684e-06, + "loss": 0.6956, + "step": 6276 + }, + { + "epoch": 0.817258883248731, + "grad_norm": 2.4059526920318604, + "learning_rate": 8.465025649007352e-06, + "loss": 0.5862, + "step": 6279 + }, + { + "epoch": 0.8176493557204217, + "grad_norm": 3.4859941005706787, + "learning_rate": 8.463518709687328e-06, + "loss": 0.5537, + "step": 6282 + }, + { + "epoch": 0.8180398281921124, + "grad_norm": 3.3922863006591797, + "learning_rate": 
8.462011165297873e-06, + "loss": 0.6667, + "step": 6285 + }, + { + "epoch": 0.8184303006638032, + "grad_norm": 3.5943069458007812, + "learning_rate": 8.46050301610235e-06, + "loss": 0.6736, + "step": 6288 + }, + { + "epoch": 0.818820773135494, + "grad_norm": 2.6332778930664062, + "learning_rate": 8.45899426236423e-06, + "loss": 0.7386, + "step": 6291 + }, + { + "epoch": 0.8192112456071847, + "grad_norm": 3.304346799850464, + "learning_rate": 8.45748490434709e-06, + "loss": 0.7207, + "step": 6294 + }, + { + "epoch": 0.8196017180788754, + "grad_norm": 2.711082935333252, + "learning_rate": 8.455974942314612e-06, + "loss": 0.6716, + "step": 6297 + }, + { + "epoch": 0.8199921905505662, + "grad_norm": 2.3270423412323, + "learning_rate": 8.454464376530579e-06, + "loss": 0.634, + "step": 6300 + }, + { + "epoch": 0.8203826630222569, + "grad_norm": 2.4965059757232666, + "learning_rate": 8.452953207258888e-06, + "loss": 0.6086, + "step": 6303 + }, + { + "epoch": 0.8207731354939477, + "grad_norm": 2.57068133354187, + "learning_rate": 8.451441434763534e-06, + "loss": 0.6383, + "step": 6306 + }, + { + "epoch": 0.8211636079656385, + "grad_norm": 3.073572874069214, + "learning_rate": 8.449929059308623e-06, + "loss": 0.6085, + "step": 6309 + }, + { + "epoch": 0.8215540804373291, + "grad_norm": 2.8972861766815186, + "learning_rate": 8.448416081158363e-06, + "loss": 0.76, + "step": 6312 + }, + { + "epoch": 0.8219445529090199, + "grad_norm": 2.6822032928466797, + "learning_rate": 8.446902500577067e-06, + "loss": 0.6039, + "step": 6315 + }, + { + "epoch": 0.8223350253807107, + "grad_norm": 2.42512845993042, + "learning_rate": 8.445388317829157e-06, + "loss": 0.6074, + "step": 6318 + }, + { + "epoch": 0.8227254978524015, + "grad_norm": 2.6248953342437744, + "learning_rate": 8.443873533179156e-06, + "loss": 0.7003, + "step": 6321 + }, + { + "epoch": 0.8231159703240921, + "grad_norm": 3.51706862449646, + "learning_rate": 8.442358146891692e-06, + "loss": 0.6427, + "step": 6324 + }, + { + "epoch": 0.8235064427957829, + "grad_norm": 2.4549152851104736, + "learning_rate": 8.440842159231503e-06, + "loss": 0.662, + "step": 6327 + }, + { + "epoch": 0.8238969152674737, + "grad_norm": 2.4990108013153076, + "learning_rate": 8.439325570463426e-06, + "loss": 0.7352, + "step": 6330 + }, + { + "epoch": 0.8242873877391644, + "grad_norm": 3.4981563091278076, + "learning_rate": 8.437808380852408e-06, + "loss": 0.7033, + "step": 6333 + }, + { + "epoch": 0.8246778602108551, + "grad_norm": 3.689436674118042, + "learning_rate": 8.436290590663498e-06, + "loss": 0.6423, + "step": 6336 + }, + { + "epoch": 0.8250683326825459, + "grad_norm": 2.182603359222412, + "learning_rate": 8.43477220016185e-06, + "loss": 0.6086, + "step": 6339 + }, + { + "epoch": 0.8254588051542366, + "grad_norm": 2.4376380443573, + "learning_rate": 8.433253209612727e-06, + "loss": 0.6321, + "step": 6342 + }, + { + "epoch": 0.8258492776259274, + "grad_norm": 2.2977375984191895, + "learning_rate": 8.431733619281486e-06, + "loss": 0.6188, + "step": 6345 + }, + { + "epoch": 0.8262397500976181, + "grad_norm": 2.3864758014678955, + "learning_rate": 8.430213429433605e-06, + "loss": 0.653, + "step": 6348 + }, + { + "epoch": 0.8266302225693088, + "grad_norm": 3.494652509689331, + "learning_rate": 8.42869264033465e-06, + "loss": 0.6787, + "step": 6351 + }, + { + "epoch": 0.8270206950409996, + "grad_norm": 2.394007682800293, + "learning_rate": 8.427171252250308e-06, + "loss": 0.5673, + "step": 6354 + }, + { + "epoch": 0.8274111675126904, + "grad_norm": 2.576982259750366, + 
"learning_rate": 8.425649265446356e-06, + "loss": 0.7278, + "step": 6357 + }, + { + "epoch": 0.827801639984381, + "grad_norm": 2.5857975482940674, + "learning_rate": 8.424126680188684e-06, + "loss": 0.7381, + "step": 6360 + }, + { + "epoch": 0.8281921124560718, + "grad_norm": 2.553446054458618, + "learning_rate": 8.422603496743285e-06, + "loss": 0.5989, + "step": 6363 + }, + { + "epoch": 0.8285825849277626, + "grad_norm": 2.7090554237365723, + "learning_rate": 8.421079715376255e-06, + "loss": 0.6736, + "step": 6366 + }, + { + "epoch": 0.8289730573994534, + "grad_norm": 2.7122576236724854, + "learning_rate": 8.419555336353793e-06, + "loss": 0.6736, + "step": 6369 + }, + { + "epoch": 0.829363529871144, + "grad_norm": 2.37516713142395, + "learning_rate": 8.418030359942211e-06, + "loss": 0.6084, + "step": 6372 + }, + { + "epoch": 0.8297540023428348, + "grad_norm": 2.34924054145813, + "learning_rate": 8.416504786407913e-06, + "loss": 0.7132, + "step": 6375 + }, + { + "epoch": 0.8301444748145256, + "grad_norm": 2.3876399993896484, + "learning_rate": 8.414978616017418e-06, + "loss": 0.7109, + "step": 6378 + }, + { + "epoch": 0.8305349472862164, + "grad_norm": 2.995637893676758, + "learning_rate": 8.413451849037342e-06, + "loss": 0.6192, + "step": 6381 + }, + { + "epoch": 0.830925419757907, + "grad_norm": 2.5374836921691895, + "learning_rate": 8.41192448573441e-06, + "loss": 0.6034, + "step": 6384 + }, + { + "epoch": 0.8313158922295978, + "grad_norm": 2.576464891433716, + "learning_rate": 8.410396526375446e-06, + "loss": 0.6396, + "step": 6387 + }, + { + "epoch": 0.8317063647012886, + "grad_norm": 2.4232656955718994, + "learning_rate": 8.408867971227384e-06, + "loss": 0.6386, + "step": 6390 + }, + { + "epoch": 0.8320968371729793, + "grad_norm": 2.4770760536193848, + "learning_rate": 8.40733882055726e-06, + "loss": 0.6802, + "step": 6393 + }, + { + "epoch": 0.8324873096446701, + "grad_norm": 2.2709827423095703, + "learning_rate": 8.40580907463221e-06, + "loss": 0.6144, + "step": 6396 + }, + { + "epoch": 0.8328777821163608, + "grad_norm": 2.4368174076080322, + "learning_rate": 8.40427873371948e-06, + "loss": 0.6847, + "step": 6399 + }, + { + "epoch": 0.8332682545880515, + "grad_norm": 2.510909080505371, + "learning_rate": 8.402747798086417e-06, + "loss": 0.7264, + "step": 6402 + }, + { + "epoch": 0.8336587270597423, + "grad_norm": 3.945446491241455, + "learning_rate": 8.401216268000473e-06, + "loss": 0.7692, + "step": 6405 + }, + { + "epoch": 0.8340491995314331, + "grad_norm": 2.4416685104370117, + "learning_rate": 8.3996841437292e-06, + "loss": 0.6565, + "step": 6408 + }, + { + "epoch": 0.8344396720031237, + "grad_norm": 2.5184824466705322, + "learning_rate": 8.39815142554026e-06, + "loss": 0.6929, + "step": 6411 + }, + { + "epoch": 0.8348301444748145, + "grad_norm": 2.375833511352539, + "learning_rate": 8.396618113701416e-06, + "loss": 0.6166, + "step": 6414 + }, + { + "epoch": 0.8352206169465053, + "grad_norm": 2.406198024749756, + "learning_rate": 8.395084208480531e-06, + "loss": 0.5858, + "step": 6417 + }, + { + "epoch": 0.8356110894181961, + "grad_norm": 3.6379899978637695, + "learning_rate": 8.393549710145578e-06, + "loss": 0.5477, + "step": 6420 + }, + { + "epoch": 0.8360015618898867, + "grad_norm": 2.8601226806640625, + "learning_rate": 8.39201461896463e-06, + "loss": 0.7262, + "step": 6423 + }, + { + "epoch": 0.8363920343615775, + "grad_norm": 2.3782405853271484, + "learning_rate": 8.390478935205864e-06, + "loss": 0.672, + "step": 6426 + }, + { + "epoch": 0.8367825068332683, + "grad_norm": 
2.471163749694824, + "learning_rate": 8.388942659137558e-06, + "loss": 0.6127, + "step": 6429 + }, + { + "epoch": 0.837172979304959, + "grad_norm": 2.592690944671631, + "learning_rate": 8.3874057910281e-06, + "loss": 0.7282, + "step": 6432 + }, + { + "epoch": 0.8375634517766497, + "grad_norm": 2.696824073791504, + "learning_rate": 8.385868331145977e-06, + "loss": 0.6982, + "step": 6435 + }, + { + "epoch": 0.8379539242483405, + "grad_norm": 2.5590860843658447, + "learning_rate": 8.38433027975978e-06, + "loss": 0.6906, + "step": 6438 + }, + { + "epoch": 0.8383443967200312, + "grad_norm": 2.547417163848877, + "learning_rate": 8.3827916371382e-06, + "loss": 0.6392, + "step": 6441 + }, + { + "epoch": 0.838734869191722, + "grad_norm": 2.264625310897827, + "learning_rate": 8.381252403550043e-06, + "loss": 0.6396, + "step": 6444 + }, + { + "epoch": 0.8391253416634127, + "grad_norm": 2.935336112976074, + "learning_rate": 8.3797125792642e-06, + "loss": 0.6588, + "step": 6447 + }, + { + "epoch": 0.8395158141351035, + "grad_norm": 2.308274745941162, + "learning_rate": 8.378172164549678e-06, + "loss": 0.6231, + "step": 6450 + }, + { + "epoch": 0.8399062866067942, + "grad_norm": 2.976280927658081, + "learning_rate": 8.376631159675587e-06, + "loss": 0.6117, + "step": 6453 + }, + { + "epoch": 0.840296759078485, + "grad_norm": 2.33328914642334, + "learning_rate": 8.375089564911137e-06, + "loss": 0.7065, + "step": 6456 + }, + { + "epoch": 0.8406872315501757, + "grad_norm": 2.5370125770568848, + "learning_rate": 8.373547380525639e-06, + "loss": 0.671, + "step": 6459 + }, + { + "epoch": 0.8410777040218664, + "grad_norm": 3.066671133041382, + "learning_rate": 8.372004606788511e-06, + "loss": 0.5947, + "step": 6462 + }, + { + "epoch": 0.8414681764935572, + "grad_norm": 2.562814474105835, + "learning_rate": 8.370461243969272e-06, + "loss": 0.5773, + "step": 6465 + }, + { + "epoch": 0.841858648965248, + "grad_norm": 2.8281662464141846, + "learning_rate": 8.368917292337544e-06, + "loss": 0.6202, + "step": 6468 + }, + { + "epoch": 0.8422491214369386, + "grad_norm": 3.3414337635040283, + "learning_rate": 8.36737275216305e-06, + "loss": 0.6927, + "step": 6471 + }, + { + "epoch": 0.8426395939086294, + "grad_norm": 2.5855307579040527, + "learning_rate": 8.365827623715624e-06, + "loss": 0.6142, + "step": 6474 + }, + { + "epoch": 0.8430300663803202, + "grad_norm": 2.5391762256622314, + "learning_rate": 8.36428190726519e-06, + "loss": 0.6912, + "step": 6477 + }, + { + "epoch": 0.843420538852011, + "grad_norm": 2.4301860332489014, + "learning_rate": 8.362735603081784e-06, + "loss": 0.7405, + "step": 6480 + }, + { + "epoch": 0.8438110113237017, + "grad_norm": 2.3643648624420166, + "learning_rate": 8.361188711435543e-06, + "loss": 0.5884, + "step": 6483 + }, + { + "epoch": 0.8442014837953924, + "grad_norm": 2.4440126419067383, + "learning_rate": 8.359641232596707e-06, + "loss": 0.6901, + "step": 6486 + }, + { + "epoch": 0.8445919562670832, + "grad_norm": 2.4889235496520996, + "learning_rate": 8.358093166835614e-06, + "loss": 0.6792, + "step": 6489 + }, + { + "epoch": 0.8449824287387739, + "grad_norm": 2.9333462715148926, + "learning_rate": 8.356544514422708e-06, + "loss": 0.535, + "step": 6492 + }, + { + "epoch": 0.8453729012104647, + "grad_norm": 3.2094194889068604, + "learning_rate": 8.354995275628536e-06, + "loss": 0.713, + "step": 6495 + }, + { + "epoch": 0.8457633736821554, + "grad_norm": 2.7420010566711426, + "learning_rate": 8.35344545072375e-06, + "loss": 0.6427, + "step": 6498 + }, + { + "epoch": 0.8461538461538461, 
+ "grad_norm": 2.5620057582855225, + "learning_rate": 8.351895039979096e-06, + "loss": 0.7041, + "step": 6501 + }, + { + "epoch": 0.8465443186255369, + "grad_norm": 3.0430588722229004, + "learning_rate": 8.350344043665432e-06, + "loss": 0.6237, + "step": 6504 + }, + { + "epoch": 0.8469347910972277, + "grad_norm": 3.560917377471924, + "learning_rate": 8.34879246205371e-06, + "loss": 0.6937, + "step": 6507 + }, + { + "epoch": 0.8473252635689184, + "grad_norm": 2.730560064315796, + "learning_rate": 8.34724029541499e-06, + "loss": 0.6022, + "step": 6510 + }, + { + "epoch": 0.8477157360406091, + "grad_norm": 2.495201587677002, + "learning_rate": 8.345687544020432e-06, + "loss": 0.6019, + "step": 6513 + }, + { + "epoch": 0.8481062085122999, + "grad_norm": 2.6979150772094727, + "learning_rate": 8.344134208141298e-06, + "loss": 0.696, + "step": 6516 + }, + { + "epoch": 0.8484966809839907, + "grad_norm": 3.2949979305267334, + "learning_rate": 8.342580288048953e-06, + "loss": 0.6723, + "step": 6519 + }, + { + "epoch": 0.8488871534556813, + "grad_norm": 2.2897496223449707, + "learning_rate": 8.341025784014865e-06, + "loss": 0.6004, + "step": 6522 + }, + { + "epoch": 0.8492776259273721, + "grad_norm": 2.2679920196533203, + "learning_rate": 8.3394706963106e-06, + "loss": 0.6487, + "step": 6525 + }, + { + "epoch": 0.8496680983990629, + "grad_norm": 2.6986122131347656, + "learning_rate": 8.337915025207829e-06, + "loss": 0.6389, + "step": 6528 + }, + { + "epoch": 0.8500585708707536, + "grad_norm": 2.4359288215637207, + "learning_rate": 8.336358770978325e-06, + "loss": 0.5729, + "step": 6531 + }, + { + "epoch": 0.8504490433424443, + "grad_norm": 2.355321168899536, + "learning_rate": 8.334801933893963e-06, + "loss": 0.6444, + "step": 6534 + }, + { + "epoch": 0.8508395158141351, + "grad_norm": 2.818239212036133, + "learning_rate": 8.333244514226718e-06, + "loss": 0.6435, + "step": 6537 + }, + { + "epoch": 0.8512299882858259, + "grad_norm": 2.3462061882019043, + "learning_rate": 8.331686512248669e-06, + "loss": 0.7376, + "step": 6540 + }, + { + "epoch": 0.8516204607575166, + "grad_norm": 2.6086630821228027, + "learning_rate": 8.330127928231994e-06, + "loss": 0.6756, + "step": 6543 + }, + { + "epoch": 0.8520109332292073, + "grad_norm": 2.5126383304595947, + "learning_rate": 8.328568762448978e-06, + "loss": 0.6713, + "step": 6546 + }, + { + "epoch": 0.8524014057008981, + "grad_norm": 3.5347037315368652, + "learning_rate": 8.327009015172e-06, + "loss": 0.6268, + "step": 6549 + }, + { + "epoch": 0.8527918781725888, + "grad_norm": 2.4854538440704346, + "learning_rate": 8.325448686673545e-06, + "loss": 0.6253, + "step": 6552 + }, + { + "epoch": 0.8531823506442796, + "grad_norm": 2.2126731872558594, + "learning_rate": 8.323887777226204e-06, + "loss": 0.5835, + "step": 6555 + }, + { + "epoch": 0.8535728231159703, + "grad_norm": 3.192509889602661, + "learning_rate": 8.322326287102655e-06, + "loss": 0.5728, + "step": 6558 + }, + { + "epoch": 0.853963295587661, + "grad_norm": 2.4523346424102783, + "learning_rate": 8.320764216575696e-06, + "loss": 0.5734, + "step": 6561 + }, + { + "epoch": 0.8543537680593518, + "grad_norm": 2.1568102836608887, + "learning_rate": 8.319201565918214e-06, + "loss": 0.7255, + "step": 6564 + }, + { + "epoch": 0.8547442405310426, + "grad_norm": 2.56260347366333, + "learning_rate": 8.317638335403203e-06, + "loss": 0.6623, + "step": 6567 + }, + { + "epoch": 0.8551347130027334, + "grad_norm": 2.450410842895508, + "learning_rate": 8.31607452530375e-06, + "loss": 0.6081, + "step": 6570 + }, + { + 
"epoch": 0.855525185474424, + "grad_norm": 2.3223392963409424, + "learning_rate": 8.314510135893057e-06, + "loss": 0.6811, + "step": 6573 + }, + { + "epoch": 0.8559156579461148, + "grad_norm": 2.9705634117126465, + "learning_rate": 8.312945167444413e-06, + "loss": 0.7399, + "step": 6576 + }, + { + "epoch": 0.8563061304178056, + "grad_norm": 3.151143789291382, + "learning_rate": 8.31137962023122e-06, + "loss": 0.6165, + "step": 6579 + }, + { + "epoch": 0.8566966028894963, + "grad_norm": 2.2744345664978027, + "learning_rate": 8.309813494526973e-06, + "loss": 0.5956, + "step": 6582 + }, + { + "epoch": 0.857087075361187, + "grad_norm": 2.3356449604034424, + "learning_rate": 8.30824679060527e-06, + "loss": 0.6373, + "step": 6585 + }, + { + "epoch": 0.8574775478328778, + "grad_norm": 2.322843551635742, + "learning_rate": 8.306679508739813e-06, + "loss": 0.6431, + "step": 6588 + }, + { + "epoch": 0.8578680203045685, + "grad_norm": 2.799908399581909, + "learning_rate": 8.305111649204402e-06, + "loss": 0.6925, + "step": 6591 + }, + { + "epoch": 0.8582584927762593, + "grad_norm": 3.1793437004089355, + "learning_rate": 8.30354321227294e-06, + "loss": 0.6127, + "step": 6594 + }, + { + "epoch": 0.85864896524795, + "grad_norm": 2.3757338523864746, + "learning_rate": 8.301974198219427e-06, + "loss": 0.6473, + "step": 6597 + }, + { + "epoch": 0.8590394377196408, + "grad_norm": 2.6627843379974365, + "learning_rate": 8.300404607317968e-06, + "loss": 0.7966, + "step": 6600 + }, + { + "epoch": 0.8594299101913315, + "grad_norm": 2.2631454467773438, + "learning_rate": 8.298834439842768e-06, + "loss": 0.5955, + "step": 6603 + }, + { + "epoch": 0.8598203826630223, + "grad_norm": 3.50620174407959, + "learning_rate": 8.29726369606813e-06, + "loss": 0.6787, + "step": 6606 + }, + { + "epoch": 0.860210855134713, + "grad_norm": 2.180896520614624, + "learning_rate": 8.295692376268462e-06, + "loss": 0.561, + "step": 6609 + }, + { + "epoch": 0.8606013276064037, + "grad_norm": 2.7363123893737793, + "learning_rate": 8.29412048071827e-06, + "loss": 0.6337, + "step": 6612 + }, + { + "epoch": 0.8609918000780945, + "grad_norm": 2.419677495956421, + "learning_rate": 8.292548009692156e-06, + "loss": 0.6391, + "step": 6615 + }, + { + "epoch": 0.8613822725497853, + "grad_norm": 2.2987353801727295, + "learning_rate": 8.290974963464835e-06, + "loss": 0.5479, + "step": 6618 + }, + { + "epoch": 0.8617727450214759, + "grad_norm": 2.5347087383270264, + "learning_rate": 8.289401342311108e-06, + "loss": 0.6663, + "step": 6621 + }, + { + "epoch": 0.8621632174931667, + "grad_norm": 2.4396309852600098, + "learning_rate": 8.287827146505888e-06, + "loss": 0.5425, + "step": 6624 + }, + { + "epoch": 0.8625536899648575, + "grad_norm": 2.2343242168426514, + "learning_rate": 8.286252376324181e-06, + "loss": 0.5953, + "step": 6627 + }, + { + "epoch": 0.8629441624365483, + "grad_norm": 2.7455759048461914, + "learning_rate": 8.284677032041099e-06, + "loss": 0.6759, + "step": 6630 + }, + { + "epoch": 0.8633346349082389, + "grad_norm": 2.5142693519592285, + "learning_rate": 8.283101113931849e-06, + "loss": 0.6229, + "step": 6633 + }, + { + "epoch": 0.8637251073799297, + "grad_norm": 2.3203072547912598, + "learning_rate": 8.281524622271741e-06, + "loss": 0.6268, + "step": 6636 + }, + { + "epoch": 0.8641155798516205, + "grad_norm": 2.863619565963745, + "learning_rate": 8.279947557336184e-06, + "loss": 0.6373, + "step": 6639 + }, + { + "epoch": 0.8645060523233112, + "grad_norm": 2.654784679412842, + "learning_rate": 8.278369919400688e-06, + "loss": 0.6699, 
+ "step": 6642 + }, + { + "epoch": 0.8648965247950019, + "grad_norm": 2.5628163814544678, + "learning_rate": 8.276791708740865e-06, + "loss": 0.7191, + "step": 6645 + }, + { + "epoch": 0.8652869972666927, + "grad_norm": 2.7298977375030518, + "learning_rate": 8.275212925632424e-06, + "loss": 0.6759, + "step": 6648 + }, + { + "epoch": 0.8656774697383834, + "grad_norm": 2.9911298751831055, + "learning_rate": 8.273633570351175e-06, + "loss": 0.6965, + "step": 6651 + }, + { + "epoch": 0.8660679422100742, + "grad_norm": 2.741440534591675, + "learning_rate": 8.272053643173028e-06, + "loss": 0.5874, + "step": 6654 + }, + { + "epoch": 0.866458414681765, + "grad_norm": 2.3754491806030273, + "learning_rate": 8.270473144373992e-06, + "loss": 0.7394, + "step": 6657 + }, + { + "epoch": 0.8668488871534556, + "grad_norm": 2.6968066692352295, + "learning_rate": 8.268892074230179e-06, + "loss": 0.6542, + "step": 6660 + }, + { + "epoch": 0.8672393596251464, + "grad_norm": 2.947608709335327, + "learning_rate": 8.267310433017795e-06, + "loss": 0.6674, + "step": 6663 + }, + { + "epoch": 0.8676298320968372, + "grad_norm": 2.4192142486572266, + "learning_rate": 8.265728221013154e-06, + "loss": 0.6469, + "step": 6666 + }, + { + "epoch": 0.868020304568528, + "grad_norm": 2.530045986175537, + "learning_rate": 8.264145438492664e-06, + "loss": 0.735, + "step": 6669 + }, + { + "epoch": 0.8684107770402186, + "grad_norm": 3.38845157623291, + "learning_rate": 8.26256208573283e-06, + "loss": 0.6779, + "step": 6672 + }, + { + "epoch": 0.8688012495119094, + "grad_norm": 2.9682278633117676, + "learning_rate": 8.260978163010265e-06, + "loss": 0.6009, + "step": 6675 + }, + { + "epoch": 0.8691917219836002, + "grad_norm": 2.4228317737579346, + "learning_rate": 8.259393670601673e-06, + "loss": 0.6627, + "step": 6678 + }, + { + "epoch": 0.869582194455291, + "grad_norm": 2.3679723739624023, + "learning_rate": 8.257808608783864e-06, + "loss": 0.6578, + "step": 6681 + }, + { + "epoch": 0.8699726669269816, + "grad_norm": 2.9238123893737793, + "learning_rate": 8.256222977833746e-06, + "loss": 0.6094, + "step": 6684 + }, + { + "epoch": 0.8703631393986724, + "grad_norm": 3.277010440826416, + "learning_rate": 8.254636778028321e-06, + "loss": 0.5665, + "step": 6687 + }, + { + "epoch": 0.8707536118703632, + "grad_norm": 2.598862648010254, + "learning_rate": 8.2530500096447e-06, + "loss": 0.6638, + "step": 6690 + }, + { + "epoch": 0.8711440843420539, + "grad_norm": 2.5538406372070312, + "learning_rate": 8.251462672960087e-06, + "loss": 0.7227, + "step": 6693 + }, + { + "epoch": 0.8715345568137446, + "grad_norm": 2.3932902812957764, + "learning_rate": 8.249874768251783e-06, + "loss": 0.6756, + "step": 6696 + }, + { + "epoch": 0.8719250292854354, + "grad_norm": 2.3504838943481445, + "learning_rate": 8.248286295797194e-06, + "loss": 0.671, + "step": 6699 + }, + { + "epoch": 0.8723155017571261, + "grad_norm": 3.473229169845581, + "learning_rate": 8.246697255873822e-06, + "loss": 0.6147, + "step": 6702 + }, + { + "epoch": 0.8727059742288169, + "grad_norm": 2.742168426513672, + "learning_rate": 8.24510764875927e-06, + "loss": 0.6286, + "step": 6705 + }, + { + "epoch": 0.8730964467005076, + "grad_norm": 2.26391863822937, + "learning_rate": 8.243517474731238e-06, + "loss": 0.5674, + "step": 6708 + }, + { + "epoch": 0.8734869191721983, + "grad_norm": 2.6377511024475098, + "learning_rate": 8.241926734067528e-06, + "loss": 0.731, + "step": 6711 + }, + { + "epoch": 0.8738773916438891, + "grad_norm": 2.8868167400360107, + "learning_rate": 
8.240335427046037e-06, + "loss": 0.7232, + "step": 6714 + }, + { + "epoch": 0.8742678641155799, + "grad_norm": 2.5035762786865234, + "learning_rate": 8.238743553944762e-06, + "loss": 0.7436, + "step": 6717 + }, + { + "epoch": 0.8746583365872705, + "grad_norm": 4.231232643127441, + "learning_rate": 8.237151115041803e-06, + "loss": 0.6408, + "step": 6720 + }, + { + "epoch": 0.8750488090589613, + "grad_norm": 3.021878719329834, + "learning_rate": 8.235558110615354e-06, + "loss": 0.6811, + "step": 6723 + }, + { + "epoch": 0.8754392815306521, + "grad_norm": 3.348742961883545, + "learning_rate": 8.233964540943708e-06, + "loss": 0.7516, + "step": 6726 + }, + { + "epoch": 0.8758297540023429, + "grad_norm": 2.2513198852539062, + "learning_rate": 8.232370406305263e-06, + "loss": 0.6115, + "step": 6729 + }, + { + "epoch": 0.8762202264740335, + "grad_norm": 2.705314874649048, + "learning_rate": 8.230775706978507e-06, + "loss": 0.6867, + "step": 6732 + }, + { + "epoch": 0.8766106989457243, + "grad_norm": 3.001579523086548, + "learning_rate": 8.22918044324203e-06, + "loss": 0.6006, + "step": 6735 + }, + { + "epoch": 0.8770011714174151, + "grad_norm": 2.306036949157715, + "learning_rate": 8.227584615374524e-06, + "loss": 0.5314, + "step": 6738 + }, + { + "epoch": 0.8773916438891058, + "grad_norm": 2.6267998218536377, + "learning_rate": 8.225988223654775e-06, + "loss": 0.7156, + "step": 6741 + }, + { + "epoch": 0.8777821163607966, + "grad_norm": 2.350863456726074, + "learning_rate": 8.224391268361672e-06, + "loss": 0.5479, + "step": 6744 + }, + { + "epoch": 0.8781725888324873, + "grad_norm": 2.4865407943725586, + "learning_rate": 8.222793749774194e-06, + "loss": 0.5811, + "step": 6747 + }, + { + "epoch": 0.878563061304178, + "grad_norm": 3.3585386276245117, + "learning_rate": 8.221195668171429e-06, + "loss": 0.6603, + "step": 6750 + }, + { + "epoch": 0.8789535337758688, + "grad_norm": 4.48010778427124, + "learning_rate": 8.219597023832558e-06, + "loss": 0.6756, + "step": 6753 + }, + { + "epoch": 0.8793440062475596, + "grad_norm": 2.635272979736328, + "learning_rate": 8.21799781703686e-06, + "loss": 0.7552, + "step": 6756 + }, + { + "epoch": 0.8797344787192503, + "grad_norm": 2.7308220863342285, + "learning_rate": 8.216398048063712e-06, + "loss": 0.6473, + "step": 6759 + }, + { + "epoch": 0.880124951190941, + "grad_norm": 3.390993118286133, + "learning_rate": 8.214797717192591e-06, + "loss": 0.7474, + "step": 6762 + }, + { + "epoch": 0.8805154236626318, + "grad_norm": 2.1488442420959473, + "learning_rate": 8.213196824703074e-06, + "loss": 0.5481, + "step": 6765 + }, + { + "epoch": 0.8809058961343226, + "grad_norm": 2.6741676330566406, + "learning_rate": 8.21159537087483e-06, + "loss": 0.6344, + "step": 6768 + }, + { + "epoch": 0.8812963686060132, + "grad_norm": 2.426910877227783, + "learning_rate": 8.20999335598763e-06, + "loss": 0.6935, + "step": 6771 + }, + { + "epoch": 0.881686841077704, + "grad_norm": 2.539985179901123, + "learning_rate": 8.208390780321344e-06, + "loss": 0.6417, + "step": 6774 + }, + { + "epoch": 0.8820773135493948, + "grad_norm": 3.312417507171631, + "learning_rate": 8.20678764415594e-06, + "loss": 0.6397, + "step": 6777 + }, + { + "epoch": 0.8824677860210856, + "grad_norm": 2.3949477672576904, + "learning_rate": 8.205183947771478e-06, + "loss": 0.6372, + "step": 6780 + }, + { + "epoch": 0.8828582584927762, + "grad_norm": 2.6882550716400146, + "learning_rate": 8.203579691448124e-06, + "loss": 0.7021, + "step": 6783 + }, + { + "epoch": 0.883248730964467, + "grad_norm": 
2.244438886642456, + "learning_rate": 8.201974875466138e-06, + "loss": 0.6369, + "step": 6786 + }, + { + "epoch": 0.8836392034361578, + "grad_norm": 2.579512357711792, + "learning_rate": 8.200369500105876e-06, + "loss": 0.6198, + "step": 6789 + }, + { + "epoch": 0.8840296759078485, + "grad_norm": 2.45475172996521, + "learning_rate": 8.198763565647796e-06, + "loss": 0.6765, + "step": 6792 + }, + { + "epoch": 0.8844201483795392, + "grad_norm": 2.1796491146087646, + "learning_rate": 8.19715707237245e-06, + "loss": 0.5684, + "step": 6795 + }, + { + "epoch": 0.88481062085123, + "grad_norm": 3.6202688217163086, + "learning_rate": 8.195550020560488e-06, + "loss": 0.6061, + "step": 6798 + }, + { + "epoch": 0.8852010933229207, + "grad_norm": 2.273062229156494, + "learning_rate": 8.193942410492662e-06, + "loss": 0.5358, + "step": 6801 + }, + { + "epoch": 0.8855915657946115, + "grad_norm": 2.308729410171509, + "learning_rate": 8.192334242449816e-06, + "loss": 0.5674, + "step": 6804 + }, + { + "epoch": 0.8859820382663022, + "grad_norm": 2.949068784713745, + "learning_rate": 8.190725516712893e-06, + "loss": 0.6682, + "step": 6807 + }, + { + "epoch": 0.886372510737993, + "grad_norm": 2.661355972290039, + "learning_rate": 8.189116233562933e-06, + "loss": 0.7175, + "step": 6810 + }, + { + "epoch": 0.8867629832096837, + "grad_norm": 3.1999619007110596, + "learning_rate": 8.187506393281076e-06, + "loss": 0.6816, + "step": 6813 + }, + { + "epoch": 0.8871534556813745, + "grad_norm": 3.3736467361450195, + "learning_rate": 8.185895996148558e-06, + "loss": 0.6235, + "step": 6816 + }, + { + "epoch": 0.8875439281530652, + "grad_norm": 2.3668558597564697, + "learning_rate": 8.184285042446713e-06, + "loss": 0.7111, + "step": 6819 + }, + { + "epoch": 0.8879344006247559, + "grad_norm": 3.4571516513824463, + "learning_rate": 8.18267353245697e-06, + "loss": 0.6578, + "step": 6822 + }, + { + "epoch": 0.8883248730964467, + "grad_norm": 2.8542165756225586, + "learning_rate": 8.181061466460856e-06, + "loss": 0.6717, + "step": 6825 + }, + { + "epoch": 0.8887153455681375, + "grad_norm": 2.5274338722229004, + "learning_rate": 8.179448844739995e-06, + "loss": 0.7508, + "step": 6828 + }, + { + "epoch": 0.8891058180398282, + "grad_norm": 2.284775733947754, + "learning_rate": 8.177835667576108e-06, + "loss": 0.6448, + "step": 6831 + }, + { + "epoch": 0.8894962905115189, + "grad_norm": 2.375109910964966, + "learning_rate": 8.176221935251016e-06, + "loss": 0.6777, + "step": 6834 + }, + { + "epoch": 0.8898867629832097, + "grad_norm": 2.5060408115386963, + "learning_rate": 8.174607648046635e-06, + "loss": 0.7086, + "step": 6837 + }, + { + "epoch": 0.8902772354549005, + "grad_norm": 3.7444376945495605, + "learning_rate": 8.172992806244976e-06, + "loss": 0.6429, + "step": 6840 + }, + { + "epoch": 0.8906677079265912, + "grad_norm": 2.7690014839172363, + "learning_rate": 8.171377410128149e-06, + "loss": 0.6279, + "step": 6843 + }, + { + "epoch": 0.8910581803982819, + "grad_norm": 3.0757744312286377, + "learning_rate": 8.169761459978358e-06, + "loss": 0.7566, + "step": 6846 + }, + { + "epoch": 0.8914486528699727, + "grad_norm": 2.608480930328369, + "learning_rate": 8.16814495607791e-06, + "loss": 0.6398, + "step": 6849 + }, + { + "epoch": 0.8918391253416634, + "grad_norm": 2.6468796730041504, + "learning_rate": 8.166527898709202e-06, + "loss": 0.6788, + "step": 6852 + }, + { + "epoch": 0.8922295978133542, + "grad_norm": 2.874540328979492, + "learning_rate": 8.164910288154733e-06, + "loss": 0.6425, + "step": 6855 + }, + { + "epoch": 
0.8926200702850449, + "grad_norm": 2.4776840209960938, + "learning_rate": 8.163292124697094e-06, + "loss": 0.7579, + "step": 6858 + }, + { + "epoch": 0.8930105427567356, + "grad_norm": 2.2084155082702637, + "learning_rate": 8.161673408618975e-06, + "loss": 0.6798, + "step": 6861 + }, + { + "epoch": 0.8934010152284264, + "grad_norm": 2.4386301040649414, + "learning_rate": 8.160054140203163e-06, + "loss": 0.6953, + "step": 6864 + }, + { + "epoch": 0.8937914877001172, + "grad_norm": 2.9873063564300537, + "learning_rate": 8.15843431973254e-06, + "loss": 0.6849, + "step": 6867 + }, + { + "epoch": 0.8941819601718078, + "grad_norm": 2.3077900409698486, + "learning_rate": 8.156813947490086e-06, + "loss": 0.5833, + "step": 6870 + }, + { + "epoch": 0.8945724326434986, + "grad_norm": 3.516432046890259, + "learning_rate": 8.155193023758876e-06, + "loss": 0.6973, + "step": 6873 + }, + { + "epoch": 0.8949629051151894, + "grad_norm": 2.4575483798980713, + "learning_rate": 8.153571548822083e-06, + "loss": 0.6842, + "step": 6876 + }, + { + "epoch": 0.8953533775868802, + "grad_norm": 2.8039913177490234, + "learning_rate": 8.151949522962975e-06, + "loss": 0.6416, + "step": 6879 + }, + { + "epoch": 0.8957438500585708, + "grad_norm": 3.1583502292633057, + "learning_rate": 8.150326946464913e-06, + "loss": 0.6764, + "step": 6882 + }, + { + "epoch": 0.8961343225302616, + "grad_norm": 2.762176752090454, + "learning_rate": 8.148703819611364e-06, + "loss": 0.6455, + "step": 6885 + }, + { + "epoch": 0.8965247950019524, + "grad_norm": 3.457850694656372, + "learning_rate": 8.147080142685882e-06, + "loss": 0.6589, + "step": 6888 + }, + { + "epoch": 0.8969152674736431, + "grad_norm": 3.8411946296691895, + "learning_rate": 8.145455915972117e-06, + "loss": 0.7218, + "step": 6891 + }, + { + "epoch": 0.8973057399453338, + "grad_norm": 2.2801125049591064, + "learning_rate": 8.143831139753822e-06, + "loss": 0.6221, + "step": 6894 + }, + { + "epoch": 0.8976962124170246, + "grad_norm": 2.8186426162719727, + "learning_rate": 8.14220581431484e-06, + "loss": 0.7055, + "step": 6897 + }, + { + "epoch": 0.8980866848887153, + "grad_norm": 2.3961057662963867, + "learning_rate": 8.140579939939113e-06, + "loss": 0.6705, + "step": 6900 + }, + { + "epoch": 0.8984771573604061, + "grad_norm": 2.616764783859253, + "learning_rate": 8.138953516910676e-06, + "loss": 0.5349, + "step": 6903 + }, + { + "epoch": 0.8988676298320968, + "grad_norm": 2.015042543411255, + "learning_rate": 8.137326545513664e-06, + "loss": 0.535, + "step": 6906 + }, + { + "epoch": 0.8992581023037876, + "grad_norm": 2.598292112350464, + "learning_rate": 8.135699026032305e-06, + "loss": 0.7247, + "step": 6909 + }, + { + "epoch": 0.8996485747754783, + "grad_norm": 2.3289687633514404, + "learning_rate": 8.134070958750923e-06, + "loss": 0.6876, + "step": 6912 + }, + { + "epoch": 0.9000390472471691, + "grad_norm": 2.219310760498047, + "learning_rate": 8.132442343953937e-06, + "loss": 0.6288, + "step": 6915 + }, + { + "epoch": 0.9004295197188599, + "grad_norm": 2.5258898735046387, + "learning_rate": 8.130813181925862e-06, + "loss": 0.6711, + "step": 6918 + }, + { + "epoch": 0.9008199921905505, + "grad_norm": 2.4190750122070312, + "learning_rate": 8.129183472951312e-06, + "loss": 0.7272, + "step": 6921 + }, + { + "epoch": 0.9012104646622413, + "grad_norm": 2.566995620727539, + "learning_rate": 8.127553217314991e-06, + "loss": 0.735, + "step": 6924 + }, + { + "epoch": 0.9016009371339321, + "grad_norm": 2.446164608001709, + "learning_rate": 8.125922415301704e-06, + "loss": 0.6453, + 
"step": 6927 + }, + { + "epoch": 0.9019914096056229, + "grad_norm": 2.655388593673706, + "learning_rate": 8.124291067196347e-06, + "loss": 0.5551, + "step": 6930 + }, + { + "epoch": 0.9023818820773135, + "grad_norm": 2.4122257232666016, + "learning_rate": 8.12265917328391e-06, + "loss": 0.6901, + "step": 6933 + }, + { + "epoch": 0.9027723545490043, + "grad_norm": 2.494718551635742, + "learning_rate": 8.121026733849486e-06, + "loss": 0.6336, + "step": 6936 + }, + { + "epoch": 0.9031628270206951, + "grad_norm": 2.373518466949463, + "learning_rate": 8.119393749178258e-06, + "loss": 0.6479, + "step": 6939 + }, + { + "epoch": 0.9035532994923858, + "grad_norm": 2.2196645736694336, + "learning_rate": 8.117760219555505e-06, + "loss": 0.5689, + "step": 6942 + }, + { + "epoch": 0.9039437719640765, + "grad_norm": 3.160625457763672, + "learning_rate": 8.116126145266599e-06, + "loss": 0.627, + "step": 6945 + }, + { + "epoch": 0.9043342444357673, + "grad_norm": 2.553009271621704, + "learning_rate": 8.114491526597012e-06, + "loss": 0.5639, + "step": 6948 + }, + { + "epoch": 0.904724716907458, + "grad_norm": 2.1808202266693115, + "learning_rate": 8.112856363832307e-06, + "loss": 0.5542, + "step": 6951 + }, + { + "epoch": 0.9051151893791488, + "grad_norm": 2.4375030994415283, + "learning_rate": 8.111220657258144e-06, + "loss": 0.5841, + "step": 6954 + }, + { + "epoch": 0.9055056618508395, + "grad_norm": 2.3346710205078125, + "learning_rate": 8.109584407160277e-06, + "loss": 0.6731, + "step": 6957 + }, + { + "epoch": 0.9058961343225302, + "grad_norm": 2.4284756183624268, + "learning_rate": 8.107947613824554e-06, + "loss": 0.6966, + "step": 6960 + }, + { + "epoch": 0.906286606794221, + "grad_norm": 3.4861011505126953, + "learning_rate": 8.106310277536921e-06, + "loss": 0.6403, + "step": 6963 + }, + { + "epoch": 0.9066770792659118, + "grad_norm": 3.8682639598846436, + "learning_rate": 8.104672398583419e-06, + "loss": 0.6438, + "step": 6966 + }, + { + "epoch": 0.9070675517376025, + "grad_norm": 2.232022523880005, + "learning_rate": 8.10303397725018e-06, + "loss": 0.545, + "step": 6969 + }, + { + "epoch": 0.9074580242092932, + "grad_norm": 3.7557497024536133, + "learning_rate": 8.101395013823433e-06, + "loss": 0.7008, + "step": 6972 + }, + { + "epoch": 0.907848496680984, + "grad_norm": 2.4933996200561523, + "learning_rate": 8.099755508589502e-06, + "loss": 0.6323, + "step": 6975 + }, + { + "epoch": 0.9082389691526748, + "grad_norm": 2.4882712364196777, + "learning_rate": 8.098115461834803e-06, + "loss": 0.6243, + "step": 6978 + }, + { + "epoch": 0.9086294416243654, + "grad_norm": 2.3607735633850098, + "learning_rate": 8.096474873845851e-06, + "loss": 0.6152, + "step": 6981 + }, + { + "epoch": 0.9090199140960562, + "grad_norm": 3.5572595596313477, + "learning_rate": 8.094833744909252e-06, + "loss": 0.7645, + "step": 6984 + }, + { + "epoch": 0.909410386567747, + "grad_norm": 3.0269618034362793, + "learning_rate": 8.09319207531171e-06, + "loss": 0.6273, + "step": 6987 + }, + { + "epoch": 0.9098008590394377, + "grad_norm": 3.1948349475860596, + "learning_rate": 8.091549865340019e-06, + "loss": 0.6008, + "step": 6990 + }, + { + "epoch": 0.9101913315111285, + "grad_norm": 2.419280529022217, + "learning_rate": 8.08990711528107e-06, + "loss": 0.6304, + "step": 6993 + }, + { + "epoch": 0.9105818039828192, + "grad_norm": 2.2587809562683105, + "learning_rate": 8.088263825421847e-06, + "loss": 0.6211, + "step": 6996 + }, + { + "epoch": 0.91097227645451, + "grad_norm": 2.709536552429199, + "learning_rate": 
8.086619996049431e-06, + "loss": 0.6343, + "step": 6999 + }, + { + "epoch": 0.9113627489262007, + "grad_norm": 5.089230060577393, + "learning_rate": 8.084975627450995e-06, + "loss": 0.6053, + "step": 7002 + }, + { + "epoch": 0.9117532213978915, + "grad_norm": 2.689100742340088, + "learning_rate": 8.083330719913808e-06, + "loss": 0.668, + "step": 7005 + }, + { + "epoch": 0.9121436938695822, + "grad_norm": 2.3222506046295166, + "learning_rate": 8.08168527372523e-06, + "loss": 0.5972, + "step": 7008 + }, + { + "epoch": 0.9125341663412729, + "grad_norm": 2.3201560974121094, + "learning_rate": 8.080039289172717e-06, + "loss": 0.6853, + "step": 7011 + }, + { + "epoch": 0.9129246388129637, + "grad_norm": 2.8677849769592285, + "learning_rate": 8.078392766543821e-06, + "loss": 0.5319, + "step": 7014 + }, + { + "epoch": 0.9133151112846545, + "grad_norm": 2.3285553455352783, + "learning_rate": 8.076745706126184e-06, + "loss": 0.6937, + "step": 7017 + }, + { + "epoch": 0.9137055837563451, + "grad_norm": 2.447075366973877, + "learning_rate": 8.075098108207544e-06, + "loss": 0.7326, + "step": 7020 + }, + { + "epoch": 0.9140960562280359, + "grad_norm": 3.435882091522217, + "learning_rate": 8.073449973075733e-06, + "loss": 0.6119, + "step": 7023 + }, + { + "epoch": 0.9144865286997267, + "grad_norm": 2.431121587753296, + "learning_rate": 8.071801301018678e-06, + "loss": 0.6601, + "step": 7026 + }, + { + "epoch": 0.9148770011714175, + "grad_norm": 3.276074171066284, + "learning_rate": 8.070152092324399e-06, + "loss": 0.6196, + "step": 7029 + }, + { + "epoch": 0.9152674736431081, + "grad_norm": 2.349234104156494, + "learning_rate": 8.068502347281006e-06, + "loss": 0.5846, + "step": 7032 + }, + { + "epoch": 0.9156579461147989, + "grad_norm": 2.1926095485687256, + "learning_rate": 8.06685206617671e-06, + "loss": 0.7039, + "step": 7035 + }, + { + "epoch": 0.9160484185864897, + "grad_norm": 2.8905913829803467, + "learning_rate": 8.06520124929981e-06, + "loss": 0.6995, + "step": 7038 + }, + { + "epoch": 0.9164388910581804, + "grad_norm": 2.275421380996704, + "learning_rate": 8.063549896938698e-06, + "loss": 0.5875, + "step": 7041 + }, + { + "epoch": 0.9168293635298711, + "grad_norm": 2.2735865116119385, + "learning_rate": 8.061898009381865e-06, + "loss": 0.6204, + "step": 7044 + }, + { + "epoch": 0.9172198360015619, + "grad_norm": 2.518444299697876, + "learning_rate": 8.06024558691789e-06, + "loss": 0.6785, + "step": 7047 + }, + { + "epoch": 0.9176103084732526, + "grad_norm": 2.3565497398376465, + "learning_rate": 8.05859262983545e-06, + "loss": 0.6093, + "step": 7050 + }, + { + "epoch": 0.9180007809449434, + "grad_norm": 2.3102149963378906, + "learning_rate": 8.056939138423313e-06, + "loss": 0.701, + "step": 7053 + }, + { + "epoch": 0.9183912534166341, + "grad_norm": 3.3671510219573975, + "learning_rate": 8.055285112970337e-06, + "loss": 0.6835, + "step": 7056 + }, + { + "epoch": 0.9187817258883249, + "grad_norm": 2.4449825286865234, + "learning_rate": 8.05363055376548e-06, + "loss": 0.6697, + "step": 7059 + }, + { + "epoch": 0.9191721983600156, + "grad_norm": 2.696711540222168, + "learning_rate": 8.051975461097789e-06, + "loss": 0.6039, + "step": 7062 + }, + { + "epoch": 0.9195626708317064, + "grad_norm": 2.4236817359924316, + "learning_rate": 8.050319835256406e-06, + "loss": 0.6511, + "step": 7065 + }, + { + "epoch": 0.9199531433033971, + "grad_norm": 2.9492392539978027, + "learning_rate": 8.048663676530563e-06, + "loss": 0.6339, + "step": 7068 + }, + { + "epoch": 0.9203436157750878, + "grad_norm": 
2.4083123207092285, + "learning_rate": 8.04700698520959e-06, + "loss": 0.6088, + "step": 7071 + }, + { + "epoch": 0.9207340882467786, + "grad_norm": 2.4848201274871826, + "learning_rate": 8.045349761582908e-06, + "loss": 0.6274, + "step": 7074 + }, + { + "epoch": 0.9211245607184694, + "grad_norm": 6.317074298858643, + "learning_rate": 8.043692005940029e-06, + "loss": 0.6443, + "step": 7077 + }, + { + "epoch": 0.9215150331901601, + "grad_norm": 2.408294916152954, + "learning_rate": 8.042033718570559e-06, + "loss": 0.694, + "step": 7080 + }, + { + "epoch": 0.9219055056618508, + "grad_norm": 2.3996033668518066, + "learning_rate": 8.040374899764198e-06, + "loss": 0.6197, + "step": 7083 + }, + { + "epoch": 0.9222959781335416, + "grad_norm": 2.790170907974243, + "learning_rate": 8.038715549810737e-06, + "loss": 0.575, + "step": 7086 + }, + { + "epoch": 0.9226864506052324, + "grad_norm": 2.455018997192383, + "learning_rate": 8.037055669000062e-06, + "loss": 0.6055, + "step": 7089 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 2.327902317047119, + "learning_rate": 8.035395257622151e-06, + "loss": 0.5594, + "step": 7092 + }, + { + "epoch": 0.9234673955486138, + "grad_norm": 3.6916446685791016, + "learning_rate": 8.033734315967074e-06, + "loss": 0.6818, + "step": 7095 + }, + { + "epoch": 0.9238578680203046, + "grad_norm": 2.5879135131835938, + "learning_rate": 8.032072844324995e-06, + "loss": 0.66, + "step": 7098 + }, + { + "epoch": 0.9242483404919953, + "grad_norm": 2.4714932441711426, + "learning_rate": 8.030410842986169e-06, + "loss": 0.6118, + "step": 7101 + }, + { + "epoch": 0.9246388129636861, + "grad_norm": 2.5215554237365723, + "learning_rate": 8.028748312240942e-06, + "loss": 0.7066, + "step": 7104 + }, + { + "epoch": 0.9250292854353768, + "grad_norm": 2.2146830558776855, + "learning_rate": 8.027085252379755e-06, + "loss": 0.5609, + "step": 7107 + }, + { + "epoch": 0.9254197579070675, + "grad_norm": 2.9621617794036865, + "learning_rate": 8.025421663693147e-06, + "loss": 0.6719, + "step": 7110 + }, + { + "epoch": 0.9258102303787583, + "grad_norm": 2.4687397480010986, + "learning_rate": 8.023757546471737e-06, + "loss": 0.634, + "step": 7113 + }, + { + "epoch": 0.9262007028504491, + "grad_norm": 2.19496488571167, + "learning_rate": 8.022092901006245e-06, + "loss": 0.6325, + "step": 7116 + }, + { + "epoch": 0.9265911753221397, + "grad_norm": 3.014942169189453, + "learning_rate": 8.020427727587479e-06, + "loss": 0.6796, + "step": 7119 + }, + { + "epoch": 0.9269816477938305, + "grad_norm": 2.605684995651245, + "learning_rate": 8.018762026506344e-06, + "loss": 0.6616, + "step": 7122 + }, + { + "epoch": 0.9273721202655213, + "grad_norm": 2.285712957382202, + "learning_rate": 8.017095798053834e-06, + "loss": 0.5857, + "step": 7125 + }, + { + "epoch": 0.9277625927372121, + "grad_norm": 2.385016918182373, + "learning_rate": 8.015429042521034e-06, + "loss": 0.6805, + "step": 7128 + }, + { + "epoch": 0.9281530652089027, + "grad_norm": 2.6234757900238037, + "learning_rate": 8.013761760199125e-06, + "loss": 0.6721, + "step": 7131 + }, + { + "epoch": 0.9285435376805935, + "grad_norm": 2.4633302688598633, + "learning_rate": 8.012093951379376e-06, + "loss": 0.6305, + "step": 7134 + }, + { + "epoch": 0.9289340101522843, + "grad_norm": 2.8097805976867676, + "learning_rate": 8.01042561635315e-06, + "loss": 0.613, + "step": 7137 + }, + { + "epoch": 0.929324482623975, + "grad_norm": 2.5998694896698, + "learning_rate": 8.008756755411902e-06, + "loss": 0.619, + "step": 7140 + }, + { + "epoch": 
0.9297149550956657, + "grad_norm": 2.280726194381714, + "learning_rate": 8.007087368847178e-06, + "loss": 0.6404, + "step": 7143 + }, + { + "epoch": 0.9301054275673565, + "grad_norm": 2.546746015548706, + "learning_rate": 8.005417456950617e-06, + "loss": 0.6563, + "step": 7146 + }, + { + "epoch": 0.9304959000390473, + "grad_norm": 3.274017572402954, + "learning_rate": 8.003747020013948e-06, + "loss": 0.6148, + "step": 7149 + }, + { + "epoch": 0.930886372510738, + "grad_norm": 2.3492894172668457, + "learning_rate": 8.002076058328996e-06, + "loss": 0.5916, + "step": 7152 + }, + { + "epoch": 0.9312768449824287, + "grad_norm": 2.176417112350464, + "learning_rate": 8.00040457218767e-06, + "loss": 0.5756, + "step": 7155 + }, + { + "epoch": 0.9316673174541195, + "grad_norm": 2.4730896949768066, + "learning_rate": 7.998732561881976e-06, + "loss": 0.5959, + "step": 7158 + }, + { + "epoch": 0.9320577899258102, + "grad_norm": 2.558849573135376, + "learning_rate": 7.997060027704016e-06, + "loss": 0.6642, + "step": 7161 + }, + { + "epoch": 0.932448262397501, + "grad_norm": 2.7973668575286865, + "learning_rate": 7.99538696994597e-06, + "loss": 0.6965, + "step": 7164 + }, + { + "epoch": 0.9328387348691918, + "grad_norm": 2.7359659671783447, + "learning_rate": 7.993713388900124e-06, + "loss": 0.671, + "step": 7167 + }, + { + "epoch": 0.9332292073408824, + "grad_norm": 3.094665765762329, + "learning_rate": 7.992039284858846e-06, + "loss": 0.6593, + "step": 7170 + }, + { + "epoch": 0.9336196798125732, + "grad_norm": 2.6220576763153076, + "learning_rate": 7.990364658114599e-06, + "loss": 0.6067, + "step": 7173 + }, + { + "epoch": 0.934010152284264, + "grad_norm": 2.2661070823669434, + "learning_rate": 7.988689508959936e-06, + "loss": 0.6265, + "step": 7176 + }, + { + "epoch": 0.9344006247559548, + "grad_norm": 3.1826601028442383, + "learning_rate": 7.987013837687505e-06, + "loss": 0.6483, + "step": 7179 + }, + { + "epoch": 0.9347910972276454, + "grad_norm": 2.6422383785247803, + "learning_rate": 7.98533764459004e-06, + "loss": 0.6612, + "step": 7182 + }, + { + "epoch": 0.9351815696993362, + "grad_norm": 2.5566980838775635, + "learning_rate": 7.983660929960368e-06, + "loss": 0.6701, + "step": 7185 + }, + { + "epoch": 0.935572042171027, + "grad_norm": 2.446725368499756, + "learning_rate": 7.98198369409141e-06, + "loss": 0.62, + "step": 7188 + }, + { + "epoch": 0.9359625146427177, + "grad_norm": 2.3360819816589355, + "learning_rate": 7.980305937276172e-06, + "loss": 0.598, + "step": 7191 + }, + { + "epoch": 0.9363529871144084, + "grad_norm": 2.8677642345428467, + "learning_rate": 7.978627659807757e-06, + "loss": 0.6929, + "step": 7194 + }, + { + "epoch": 0.9367434595860992, + "grad_norm": 2.7148616313934326, + "learning_rate": 7.976948861979356e-06, + "loss": 0.626, + "step": 7197 + }, + { + "epoch": 0.9371339320577899, + "grad_norm": 2.4548072814941406, + "learning_rate": 7.975269544084251e-06, + "loss": 0.6934, + "step": 7200 + }, + { + "epoch": 0.9375244045294807, + "grad_norm": 3.422706365585327, + "learning_rate": 7.973589706415816e-06, + "loss": 0.666, + "step": 7203 + }, + { + "epoch": 0.9379148770011714, + "grad_norm": 2.493299722671509, + "learning_rate": 7.971909349267514e-06, + "loss": 0.7444, + "step": 7206 + }, + { + "epoch": 0.9383053494728621, + "grad_norm": 2.5340187549591064, + "learning_rate": 7.970228472932901e-06, + "loss": 0.6665, + "step": 7209 + }, + { + "epoch": 0.9386958219445529, + "grad_norm": 2.3306686878204346, + "learning_rate": 7.968547077705624e-06, + "loss": 0.6389, + "step": 
7212 + }, + { + "epoch": 0.9390862944162437, + "grad_norm": 2.3347361087799072, + "learning_rate": 7.966865163879416e-06, + "loss": 0.6495, + "step": 7215 + }, + { + "epoch": 0.9394767668879344, + "grad_norm": 2.574287176132202, + "learning_rate": 7.965182731748104e-06, + "loss": 0.6285, + "step": 7218 + }, + { + "epoch": 0.9398672393596251, + "grad_norm": 2.4919655323028564, + "learning_rate": 7.96349978160561e-06, + "loss": 0.6683, + "step": 7221 + }, + { + "epoch": 0.9402577118313159, + "grad_norm": 2.245663642883301, + "learning_rate": 7.961816313745936e-06, + "loss": 0.5926, + "step": 7224 + }, + { + "epoch": 0.9406481843030067, + "grad_norm": 2.493183135986328, + "learning_rate": 7.960132328463184e-06, + "loss": 0.6197, + "step": 7227 + }, + { + "epoch": 0.9410386567746973, + "grad_norm": 2.5422475337982178, + "learning_rate": 7.958447826051538e-06, + "loss": 0.6305, + "step": 7230 + }, + { + "epoch": 0.9414291292463881, + "grad_norm": 2.432318925857544, + "learning_rate": 7.956762806805285e-06, + "loss": 0.7437, + "step": 7233 + }, + { + "epoch": 0.9418196017180789, + "grad_norm": 2.516645669937134, + "learning_rate": 7.955077271018788e-06, + "loss": 0.6369, + "step": 7236 + }, + { + "epoch": 0.9422100741897697, + "grad_norm": 2.9639275074005127, + "learning_rate": 7.953391218986507e-06, + "loss": 0.5996, + "step": 7239 + }, + { + "epoch": 0.9426005466614603, + "grad_norm": 2.503748655319214, + "learning_rate": 7.951704651002994e-06, + "loss": 0.6876, + "step": 7242 + }, + { + "epoch": 0.9429910191331511, + "grad_norm": 2.3590073585510254, + "learning_rate": 7.950017567362888e-06, + "loss": 0.6315, + "step": 7245 + }, + { + "epoch": 0.9433814916048419, + "grad_norm": 2.718970775604248, + "learning_rate": 7.948329968360919e-06, + "loss": 0.7917, + "step": 7248 + }, + { + "epoch": 0.9437719640765326, + "grad_norm": 3.0263495445251465, + "learning_rate": 7.946641854291908e-06, + "loss": 0.62, + "step": 7251 + }, + { + "epoch": 0.9441624365482234, + "grad_norm": 2.2088472843170166, + "learning_rate": 7.944953225450762e-06, + "loss": 0.5835, + "step": 7254 + }, + { + "epoch": 0.9445529090199141, + "grad_norm": 2.943924903869629, + "learning_rate": 7.943264082132484e-06, + "loss": 0.6698, + "step": 7257 + }, + { + "epoch": 0.9449433814916048, + "grad_norm": 3.520203113555908, + "learning_rate": 7.941574424632161e-06, + "loss": 0.6166, + "step": 7260 + }, + { + "epoch": 0.9453338539632956, + "grad_norm": 2.5051796436309814, + "learning_rate": 7.939884253244977e-06, + "loss": 0.5959, + "step": 7263 + }, + { + "epoch": 0.9457243264349864, + "grad_norm": 2.657858371734619, + "learning_rate": 7.938193568266195e-06, + "loss": 0.6384, + "step": 7266 + }, + { + "epoch": 0.946114798906677, + "grad_norm": 2.747825860977173, + "learning_rate": 7.93650236999118e-06, + "loss": 0.6422, + "step": 7269 + }, + { + "epoch": 0.9465052713783678, + "grad_norm": 2.390871524810791, + "learning_rate": 7.93481065871538e-06, + "loss": 0.6069, + "step": 7272 + }, + { + "epoch": 0.9468957438500586, + "grad_norm": 4.321902275085449, + "learning_rate": 7.933118434734329e-06, + "loss": 0.6149, + "step": 7275 + }, + { + "epoch": 0.9472862163217494, + "grad_norm": 2.5380969047546387, + "learning_rate": 7.931425698343657e-06, + "loss": 0.6597, + "step": 7278 + }, + { + "epoch": 0.94767668879344, + "grad_norm": 2.3168387413024902, + "learning_rate": 7.929732449839085e-06, + "loss": 0.6158, + "step": 7281 + }, + { + "epoch": 0.9480671612651308, + "grad_norm": 2.2340736389160156, + "learning_rate": 7.928038689516417e-06, + 
"loss": 0.546, + "step": 7284 + }, + { + "epoch": 0.9484576337368216, + "grad_norm": 2.3683393001556396, + "learning_rate": 7.92634441767155e-06, + "loss": 0.5225, + "step": 7287 + }, + { + "epoch": 0.9488481062085123, + "grad_norm": 3.3821890354156494, + "learning_rate": 7.924649634600468e-06, + "loss": 0.7159, + "step": 7290 + }, + { + "epoch": 0.949238578680203, + "grad_norm": 2.016606569290161, + "learning_rate": 7.922954340599247e-06, + "loss": 0.578, + "step": 7293 + }, + { + "epoch": 0.9496290511518938, + "grad_norm": 3.1906471252441406, + "learning_rate": 7.921258535964051e-06, + "loss": 0.7206, + "step": 7296 + }, + { + "epoch": 0.9500195236235845, + "grad_norm": 2.839524507522583, + "learning_rate": 7.919562220991137e-06, + "loss": 0.6458, + "step": 7299 + }, + { + "epoch": 0.9504099960952753, + "grad_norm": 2.620429039001465, + "learning_rate": 7.917865395976844e-06, + "loss": 0.6736, + "step": 7302 + }, + { + "epoch": 0.950800468566966, + "grad_norm": 2.331636667251587, + "learning_rate": 7.916168061217603e-06, + "loss": 0.6511, + "step": 7305 + }, + { + "epoch": 0.9511909410386568, + "grad_norm": 3.166801929473877, + "learning_rate": 7.914470217009937e-06, + "loss": 0.7378, + "step": 7308 + }, + { + "epoch": 0.9515814135103475, + "grad_norm": 2.2052841186523438, + "learning_rate": 7.912771863650457e-06, + "loss": 0.6025, + "step": 7311 + }, + { + "epoch": 0.9519718859820383, + "grad_norm": 2.397726058959961, + "learning_rate": 7.911073001435859e-06, + "loss": 0.6423, + "step": 7314 + }, + { + "epoch": 0.952362358453729, + "grad_norm": 2.694310426712036, + "learning_rate": 7.909373630662931e-06, + "loss": 0.6696, + "step": 7317 + }, + { + "epoch": 0.9527528309254197, + "grad_norm": 2.3516201972961426, + "learning_rate": 7.90767375162855e-06, + "loss": 0.6661, + "step": 7320 + }, + { + "epoch": 0.9531433033971105, + "grad_norm": 3.105769395828247, + "learning_rate": 7.905973364629682e-06, + "loss": 0.643, + "step": 7323 + }, + { + "epoch": 0.9535337758688013, + "grad_norm": 2.4792470932006836, + "learning_rate": 7.904272469963381e-06, + "loss": 0.6866, + "step": 7326 + }, + { + "epoch": 0.9539242483404919, + "grad_norm": 2.354459285736084, + "learning_rate": 7.90257106792679e-06, + "loss": 0.6899, + "step": 7329 + }, + { + "epoch": 0.9543147208121827, + "grad_norm": 2.265612840652466, + "learning_rate": 7.900869158817137e-06, + "loss": 0.6038, + "step": 7332 + }, + { + "epoch": 0.9547051932838735, + "grad_norm": 2.663803815841675, + "learning_rate": 7.899166742931745e-06, + "loss": 0.6569, + "step": 7335 + }, + { + "epoch": 0.9550956657555643, + "grad_norm": 2.5542824268341064, + "learning_rate": 7.897463820568024e-06, + "loss": 0.6466, + "step": 7338 + }, + { + "epoch": 0.955486138227255, + "grad_norm": 2.5738067626953125, + "learning_rate": 7.895760392023467e-06, + "loss": 0.6287, + "step": 7341 + }, + { + "epoch": 0.9558766106989457, + "grad_norm": 2.7856462001800537, + "learning_rate": 7.894056457595661e-06, + "loss": 0.6974, + "step": 7344 + }, + { + "epoch": 0.9562670831706365, + "grad_norm": 2.3538174629211426, + "learning_rate": 7.892352017582281e-06, + "loss": 0.5836, + "step": 7347 + }, + { + "epoch": 0.9566575556423272, + "grad_norm": 3.389510154724121, + "learning_rate": 7.89064707228109e-06, + "loss": 0.6142, + "step": 7350 + }, + { + "epoch": 0.957048028114018, + "grad_norm": 2.4188413619995117, + "learning_rate": 7.888941621989934e-06, + "loss": 0.5828, + "step": 7353 + }, + { + "epoch": 0.9574385005857087, + "grad_norm": 2.8886420726776123, + "learning_rate": 
7.887235667006754e-06, + "loss": 0.5977, + "step": 7356 + }, + { + "epoch": 0.9578289730573994, + "grad_norm": 3.743659019470215, + "learning_rate": 7.885529207629578e-06, + "loss": 0.6807, + "step": 7359 + }, + { + "epoch": 0.9582194455290902, + "grad_norm": 3.0627243518829346, + "learning_rate": 7.883822244156518e-06, + "loss": 0.6187, + "step": 7362 + }, + { + "epoch": 0.958609918000781, + "grad_norm": 2.3821604251861572, + "learning_rate": 7.88211477688578e-06, + "loss": 0.6304, + "step": 7365 + }, + { + "epoch": 0.9590003904724717, + "grad_norm": 2.6071219444274902, + "learning_rate": 7.880406806115655e-06, + "loss": 0.6801, + "step": 7368 + }, + { + "epoch": 0.9593908629441624, + "grad_norm": 2.3850324153900146, + "learning_rate": 7.878698332144518e-06, + "loss": 0.7271, + "step": 7371 + }, + { + "epoch": 0.9597813354158532, + "grad_norm": 2.5399577617645264, + "learning_rate": 7.87698935527084e-06, + "loss": 0.6031, + "step": 7374 + }, + { + "epoch": 0.960171807887544, + "grad_norm": 2.3061318397521973, + "learning_rate": 7.875279875793173e-06, + "loss": 0.6618, + "step": 7377 + }, + { + "epoch": 0.9605622803592346, + "grad_norm": 2.355522632598877, + "learning_rate": 7.87356989401016e-06, + "loss": 0.5788, + "step": 7380 + }, + { + "epoch": 0.9609527528309254, + "grad_norm": 2.948932409286499, + "learning_rate": 7.871859410220531e-06, + "loss": 0.7404, + "step": 7383 + }, + { + "epoch": 0.9613432253026162, + "grad_norm": 2.565260648727417, + "learning_rate": 7.870148424723107e-06, + "loss": 0.6905, + "step": 7386 + }, + { + "epoch": 0.961733697774307, + "grad_norm": 3.4541046619415283, + "learning_rate": 7.86843693781679e-06, + "loss": 0.6341, + "step": 7389 + }, + { + "epoch": 0.9621241702459976, + "grad_norm": 2.6894516944885254, + "learning_rate": 7.866724949800574e-06, + "loss": 0.6463, + "step": 7392 + }, + { + "epoch": 0.9625146427176884, + "grad_norm": 3.081102132797241, + "learning_rate": 7.86501246097354e-06, + "loss": 0.6437, + "step": 7395 + }, + { + "epoch": 0.9629051151893792, + "grad_norm": 2.0764200687408447, + "learning_rate": 7.863299471634855e-06, + "loss": 0.5674, + "step": 7398 + }, + { + "epoch": 0.9632955876610699, + "grad_norm": 2.589940309524536, + "learning_rate": 7.861585982083777e-06, + "loss": 0.6612, + "step": 7401 + }, + { + "epoch": 0.9636860601327606, + "grad_norm": 2.2398934364318848, + "learning_rate": 7.859871992619647e-06, + "loss": 0.5479, + "step": 7404 + }, + { + "epoch": 0.9640765326044514, + "grad_norm": 2.3958423137664795, + "learning_rate": 7.8581575035419e-06, + "loss": 0.6572, + "step": 7407 + }, + { + "epoch": 0.9644670050761421, + "grad_norm": 2.667583465576172, + "learning_rate": 7.856442515150044e-06, + "loss": 0.6274, + "step": 7410 + }, + { + "epoch": 0.9648574775478329, + "grad_norm": 2.350374460220337, + "learning_rate": 7.854727027743693e-06, + "loss": 0.6201, + "step": 7413 + }, + { + "epoch": 0.9652479500195236, + "grad_norm": 2.8348028659820557, + "learning_rate": 7.853011041622536e-06, + "loss": 0.6498, + "step": 7416 + }, + { + "epoch": 0.9656384224912143, + "grad_norm": 2.5437428951263428, + "learning_rate": 7.85129455708635e-06, + "loss": 0.6588, + "step": 7419 + }, + { + "epoch": 0.9660288949629051, + "grad_norm": 3.723634958267212, + "learning_rate": 7.849577574435004e-06, + "loss": 0.6834, + "step": 7422 + }, + { + "epoch": 0.9664193674345959, + "grad_norm": 2.6529476642608643, + "learning_rate": 7.847860093968452e-06, + "loss": 0.6071, + "step": 7425 + }, + { + "epoch": 0.9668098399062867, + "grad_norm": 
2.431433916091919, + "learning_rate": 7.84614211598673e-06, + "loss": 0.6335, + "step": 7428 + }, + { + "epoch": 0.9672003123779773, + "grad_norm": 2.597510576248169, + "learning_rate": 7.84442364078997e-06, + "loss": 0.6908, + "step": 7431 + }, + { + "epoch": 0.9675907848496681, + "grad_norm": 3.476114273071289, + "learning_rate": 7.842704668678383e-06, + "loss": 0.6247, + "step": 7434 + }, + { + "epoch": 0.9679812573213589, + "grad_norm": 4.752762794494629, + "learning_rate": 7.84098519995227e-06, + "loss": 0.5914, + "step": 7437 + }, + { + "epoch": 0.9683717297930496, + "grad_norm": 2.079385995864868, + "learning_rate": 7.839265234912019e-06, + "loss": 0.6201, + "step": 7440 + }, + { + "epoch": 0.9687622022647403, + "grad_norm": 2.5645883083343506, + "learning_rate": 7.837544773858104e-06, + "loss": 0.6475, + "step": 7443 + }, + { + "epoch": 0.9691526747364311, + "grad_norm": 2.471785068511963, + "learning_rate": 7.835823817091088e-06, + "loss": 0.5883, + "step": 7446 + }, + { + "epoch": 0.9695431472081218, + "grad_norm": 2.9823968410491943, + "learning_rate": 7.834102364911615e-06, + "loss": 0.6943, + "step": 7449 + }, + { + "epoch": 0.9699336196798126, + "grad_norm": 3.442391872406006, + "learning_rate": 7.832380417620421e-06, + "loss": 0.7353, + "step": 7452 + }, + { + "epoch": 0.9703240921515033, + "grad_norm": 2.499734401702881, + "learning_rate": 7.83065797551833e-06, + "loss": 0.6728, + "step": 7455 + }, + { + "epoch": 0.970714564623194, + "grad_norm": 2.1826016902923584, + "learning_rate": 7.828935038906242e-06, + "loss": 0.5552, + "step": 7458 + }, + { + "epoch": 0.9711050370948848, + "grad_norm": 3.049651861190796, + "learning_rate": 7.827211608085156e-06, + "loss": 0.5896, + "step": 7461 + }, + { + "epoch": 0.9714955095665756, + "grad_norm": 3.0418035984039307, + "learning_rate": 7.825487683356147e-06, + "loss": 0.6457, + "step": 7464 + }, + { + "epoch": 0.9718859820382663, + "grad_norm": 2.458627700805664, + "learning_rate": 7.823763265020385e-06, + "loss": 0.5146, + "step": 7467 + }, + { + "epoch": 0.972276454509957, + "grad_norm": 2.405871629714966, + "learning_rate": 7.822038353379123e-06, + "loss": 0.6785, + "step": 7470 + }, + { + "epoch": 0.9726669269816478, + "grad_norm": 3.09692120552063, + "learning_rate": 7.820312948733694e-06, + "loss": 0.6946, + "step": 7473 + }, + { + "epoch": 0.9730573994533386, + "grad_norm": 2.938538074493408, + "learning_rate": 7.818587051385528e-06, + "loss": 0.5546, + "step": 7476 + }, + { + "epoch": 0.9734478719250292, + "grad_norm": 2.3744659423828125, + "learning_rate": 7.816860661636133e-06, + "loss": 0.6079, + "step": 7479 + }, + { + "epoch": 0.97383834439672, + "grad_norm": 2.385146379470825, + "learning_rate": 7.815133779787106e-06, + "loss": 0.6877, + "step": 7482 + }, + { + "epoch": 0.9742288168684108, + "grad_norm": 2.245079278945923, + "learning_rate": 7.81340640614013e-06, + "loss": 0.6019, + "step": 7485 + }, + { + "epoch": 0.9746192893401016, + "grad_norm": 2.719306707382202, + "learning_rate": 7.811678540996974e-06, + "loss": 0.7438, + "step": 7488 + }, + { + "epoch": 0.9750097618117922, + "grad_norm": 2.31829833984375, + "learning_rate": 7.80995018465949e-06, + "loss": 0.6019, + "step": 7491 + }, + { + "epoch": 0.975400234283483, + "grad_norm": 2.4517407417297363, + "learning_rate": 7.808221337429622e-06, + "loss": 0.706, + "step": 7494 + }, + { + "epoch": 0.9757907067551738, + "grad_norm": 2.333136558532715, + "learning_rate": 7.806491999609393e-06, + "loss": 0.5492, + "step": 7497 + }, + { + "epoch": 0.9761811792268645, + 
"grad_norm": 2.3452956676483154, + "learning_rate": 7.804762171500915e-06, + "loss": 0.6827, + "step": 7500 + }, + { + "epoch": 0.9765716516985552, + "grad_norm": 2.5150270462036133, + "learning_rate": 7.803031853406389e-06, + "loss": 0.6804, + "step": 7503 + }, + { + "epoch": 0.976962124170246, + "grad_norm": 2.3206984996795654, + "learning_rate": 7.801301045628091e-06, + "loss": 0.6799, + "step": 7506 + }, + { + "epoch": 0.9773525966419367, + "grad_norm": 2.463517904281616, + "learning_rate": 7.799569748468395e-06, + "loss": 0.6221, + "step": 7509 + }, + { + "epoch": 0.9777430691136275, + "grad_norm": 2.432781457901001, + "learning_rate": 7.797837962229755e-06, + "loss": 0.744, + "step": 7512 + }, + { + "epoch": 0.9781335415853183, + "grad_norm": 2.4999756813049316, + "learning_rate": 7.796105687214705e-06, + "loss": 0.6272, + "step": 7515 + }, + { + "epoch": 0.978524014057009, + "grad_norm": 2.6092498302459717, + "learning_rate": 7.794372923725876e-06, + "loss": 0.6399, + "step": 7518 + }, + { + "epoch": 0.9789144865286997, + "grad_norm": 2.057215452194214, + "learning_rate": 7.792639672065978e-06, + "loss": 0.5434, + "step": 7521 + }, + { + "epoch": 0.9793049590003905, + "grad_norm": 2.6579465866088867, + "learning_rate": 7.790905932537802e-06, + "loss": 0.6238, + "step": 7524 + }, + { + "epoch": 0.9796954314720813, + "grad_norm": 2.4319543838500977, + "learning_rate": 7.78917170544423e-06, + "loss": 0.6155, + "step": 7527 + }, + { + "epoch": 0.9800859039437719, + "grad_norm": 2.190973997116089, + "learning_rate": 7.787436991088228e-06, + "loss": 0.605, + "step": 7530 + }, + { + "epoch": 0.9804763764154627, + "grad_norm": 3.2778749465942383, + "learning_rate": 7.78570178977285e-06, + "loss": 0.6937, + "step": 7533 + }, + { + "epoch": 0.9808668488871535, + "grad_norm": 3.431509256362915, + "learning_rate": 7.783966101801228e-06, + "loss": 0.6452, + "step": 7536 + }, + { + "epoch": 0.9812573213588442, + "grad_norm": 2.6262214183807373, + "learning_rate": 7.782229927476585e-06, + "loss": 0.649, + "step": 7539 + }, + { + "epoch": 0.9816477938305349, + "grad_norm": 2.3871302604675293, + "learning_rate": 7.780493267102226e-06, + "loss": 0.6498, + "step": 7542 + }, + { + "epoch": 0.9820382663022257, + "grad_norm": 2.3250765800476074, + "learning_rate": 7.778756120981544e-06, + "loss": 0.608, + "step": 7545 + }, + { + "epoch": 0.9824287387739165, + "grad_norm": 2.921252489089966, + "learning_rate": 7.777018489418011e-06, + "loss": 0.6643, + "step": 7548 + }, + { + "epoch": 0.9828192112456072, + "grad_norm": 3.2305235862731934, + "learning_rate": 7.775280372715193e-06, + "loss": 0.6425, + "step": 7551 + }, + { + "epoch": 0.9832096837172979, + "grad_norm": 2.3904078006744385, + "learning_rate": 7.77354177117673e-06, + "loss": 0.6353, + "step": 7554 + }, + { + "epoch": 0.9836001561889887, + "grad_norm": 3.6194827556610107, + "learning_rate": 7.771802685106356e-06, + "loss": 0.6311, + "step": 7557 + }, + { + "epoch": 0.9839906286606794, + "grad_norm": 2.5549159049987793, + "learning_rate": 7.770063114807882e-06, + "loss": 0.6819, + "step": 7560 + }, + { + "epoch": 0.9843811011323702, + "grad_norm": 2.8810348510742188, + "learning_rate": 7.76832306058521e-06, + "loss": 0.6459, + "step": 7563 + }, + { + "epoch": 0.9847715736040609, + "grad_norm": 2.3030362129211426, + "learning_rate": 7.766582522742323e-06, + "loss": 0.6363, + "step": 7566 + }, + { + "epoch": 0.9851620460757516, + "grad_norm": 2.76395583152771, + "learning_rate": 7.764841501583288e-06, + "loss": 0.572, + "step": 7569 + }, + { + 
"epoch": 0.9855525185474424, + "grad_norm": 2.2686634063720703, + "learning_rate": 7.76309999741226e-06, + "loss": 0.5821, + "step": 7572 + }, + { + "epoch": 0.9859429910191332, + "grad_norm": 2.786400079727173, + "learning_rate": 7.761358010533478e-06, + "loss": 0.6529, + "step": 7575 + }, + { + "epoch": 0.9863334634908238, + "grad_norm": 2.2984442710876465, + "learning_rate": 7.759615541251257e-06, + "loss": 0.6813, + "step": 7578 + }, + { + "epoch": 0.9867239359625146, + "grad_norm": 2.821648359298706, + "learning_rate": 7.757872589870008e-06, + "loss": 0.6864, + "step": 7581 + }, + { + "epoch": 0.9871144084342054, + "grad_norm": 2.334958791732788, + "learning_rate": 7.756129156694219e-06, + "loss": 0.6827, + "step": 7584 + }, + { + "epoch": 0.9875048809058962, + "grad_norm": 2.1806464195251465, + "learning_rate": 7.754385242028464e-06, + "loss": 0.6831, + "step": 7587 + }, + { + "epoch": 0.9878953533775868, + "grad_norm": 2.2671940326690674, + "learning_rate": 7.7526408461774e-06, + "loss": 0.6335, + "step": 7590 + }, + { + "epoch": 0.9882858258492776, + "grad_norm": 2.351231098175049, + "learning_rate": 7.750895969445773e-06, + "loss": 0.6129, + "step": 7593 + }, + { + "epoch": 0.9886762983209684, + "grad_norm": 2.558773994445801, + "learning_rate": 7.749150612138407e-06, + "loss": 0.771, + "step": 7596 + }, + { + "epoch": 0.9890667707926591, + "grad_norm": 4.013977527618408, + "learning_rate": 7.747404774560213e-06, + "loss": 0.7569, + "step": 7599 + }, + { + "epoch": 0.9894572432643499, + "grad_norm": 2.6245102882385254, + "learning_rate": 7.745658457016182e-06, + "loss": 0.6937, + "step": 7602 + }, + { + "epoch": 0.9898477157360406, + "grad_norm": 2.6696815490722656, + "learning_rate": 7.743911659811399e-06, + "loss": 0.5789, + "step": 7605 + }, + { + "epoch": 0.9902381882077314, + "grad_norm": 2.585771322250366, + "learning_rate": 7.74216438325102e-06, + "loss": 0.7235, + "step": 7608 + }, + { + "epoch": 0.9906286606794221, + "grad_norm": 2.760922431945801, + "learning_rate": 7.740416627640287e-06, + "loss": 0.6648, + "step": 7611 + }, + { + "epoch": 0.9910191331511129, + "grad_norm": 2.4276010990142822, + "learning_rate": 7.738668393284538e-06, + "loss": 0.5827, + "step": 7614 + }, + { + "epoch": 0.9914096056228036, + "grad_norm": 2.4627444744110107, + "learning_rate": 7.736919680489183e-06, + "loss": 0.565, + "step": 7617 + }, + { + "epoch": 0.9918000780944943, + "grad_norm": 2.5422356128692627, + "learning_rate": 7.735170489559715e-06, + "loss": 0.6392, + "step": 7620 + }, + { + "epoch": 0.9921905505661851, + "grad_norm": 3.172285318374634, + "learning_rate": 7.733420820801718e-06, + "loss": 0.7205, + "step": 7623 + }, + { + "epoch": 0.9925810230378759, + "grad_norm": 2.1353375911712646, + "learning_rate": 7.731670674520854e-06, + "loss": 0.5064, + "step": 7626 + }, + { + "epoch": 0.9929714955095665, + "grad_norm": 2.693587303161621, + "learning_rate": 7.72992005102287e-06, + "loss": 0.6689, + "step": 7629 + }, + { + "epoch": 0.9933619679812573, + "grad_norm": 2.3884193897247314, + "learning_rate": 7.728168950613595e-06, + "loss": 0.5864, + "step": 7632 + }, + { + "epoch": 0.9937524404529481, + "grad_norm": 2.1766364574432373, + "learning_rate": 7.726417373598944e-06, + "loss": 0.726, + "step": 7635 + }, + { + "epoch": 0.9941429129246389, + "grad_norm": 3.2286782264709473, + "learning_rate": 7.724665320284913e-06, + "loss": 0.6333, + "step": 7638 + }, + { + "epoch": 0.9945333853963295, + "grad_norm": 2.1552257537841797, + "learning_rate": 7.722912790977582e-06, + "loss": 
0.5859, + "step": 7641 + }, + { + "epoch": 0.9949238578680203, + "grad_norm": 2.586132049560547, + "learning_rate": 7.721159785983116e-06, + "loss": 0.5727, + "step": 7644 + }, + { + "epoch": 0.9953143303397111, + "grad_norm": 2.324312686920166, + "learning_rate": 7.71940630560776e-06, + "loss": 0.6321, + "step": 7647 + }, + { + "epoch": 0.9957048028114018, + "grad_norm": 2.7428348064422607, + "learning_rate": 7.717652350157843e-06, + "loss": 0.7647, + "step": 7650 + }, + { + "epoch": 0.9960952752830925, + "grad_norm": 2.904000759124756, + "learning_rate": 7.715897919939775e-06, + "loss": 0.6042, + "step": 7653 + }, + { + "epoch": 0.9964857477547833, + "grad_norm": 2.2930192947387695, + "learning_rate": 7.714143015260056e-06, + "loss": 0.6164, + "step": 7656 + }, + { + "epoch": 0.996876220226474, + "grad_norm": 2.6954126358032227, + "learning_rate": 7.712387636425261e-06, + "loss": 0.6288, + "step": 7659 + }, + { + "epoch": 0.9972666926981648, + "grad_norm": 2.165433168411255, + "learning_rate": 7.710631783742053e-06, + "loss": 0.6372, + "step": 7662 + }, + { + "epoch": 0.9976571651698555, + "grad_norm": 2.2576329708099365, + "learning_rate": 7.708875457517176e-06, + "loss": 0.7483, + "step": 7665 + }, + { + "epoch": 0.9980476376415462, + "grad_norm": 2.5120785236358643, + "learning_rate": 7.707118658057453e-06, + "loss": 0.6769, + "step": 7668 + }, + { + "epoch": 0.998438110113237, + "grad_norm": 2.3859639167785645, + "learning_rate": 7.705361385669795e-06, + "loss": 0.6798, + "step": 7671 + }, + { + "epoch": 0.9988285825849278, + "grad_norm": 2.457106828689575, + "learning_rate": 7.703603640661195e-06, + "loss": 0.6851, + "step": 7674 + }, + { + "epoch": 0.9992190550566185, + "grad_norm": 2.6297943592071533, + "learning_rate": 7.701845423338727e-06, + "loss": 0.6308, + "step": 7677 + }, + { + "epoch": 0.9996095275283092, + "grad_norm": 2.5548126697540283, + "learning_rate": 7.700086734009546e-06, + "loss": 0.5697, + "step": 7680 + }, + { + "epoch": 1.0, + "grad_norm": 7.604892730712891, + "learning_rate": 7.698327572980893e-06, + "loss": 0.5724, + "step": 7683 + }, + { + "epoch": 1.0003904724716908, + "grad_norm": 2.4292237758636475, + "learning_rate": 7.69656794056009e-06, + "loss": 0.5014, + "step": 7686 + }, + { + "epoch": 1.0007809449433815, + "grad_norm": 2.254239320755005, + "learning_rate": 7.694807837054542e-06, + "loss": 0.5559, + "step": 7689 + }, + { + "epoch": 1.0011714174150723, + "grad_norm": 2.3204002380371094, + "learning_rate": 7.693047262771734e-06, + "loss": 0.4617, + "step": 7692 + }, + { + "epoch": 1.001561889886763, + "grad_norm": 2.1934375762939453, + "learning_rate": 7.691286218019232e-06, + "loss": 0.5337, + "step": 7695 + }, + { + "epoch": 1.0019523623584536, + "grad_norm": 3.9040029048919678, + "learning_rate": 7.689524703104691e-06, + "loss": 0.5133, + "step": 7698 + }, + { + "epoch": 1.0023428348301444, + "grad_norm": 2.4869489669799805, + "learning_rate": 7.687762718335841e-06, + "loss": 0.478, + "step": 7701 + }, + { + "epoch": 1.0027333073018352, + "grad_norm": 2.8475353717803955, + "learning_rate": 7.686000264020499e-06, + "loss": 0.5887, + "step": 7704 + }, + { + "epoch": 1.003123779773526, + "grad_norm": 2.571333408355713, + "learning_rate": 7.684237340466563e-06, + "loss": 0.5476, + "step": 7707 + }, + { + "epoch": 1.0035142522452167, + "grad_norm": 3.4672353267669678, + "learning_rate": 7.682473947982008e-06, + "loss": 0.5168, + "step": 7710 + }, + { + "epoch": 1.0039047247169075, + "grad_norm": 2.6517016887664795, + "learning_rate": 
7.680710086874899e-06, + "loss": 0.5201, + "step": 7713 + }, + { + "epoch": 1.0042951971885983, + "grad_norm": 3.378917694091797, + "learning_rate": 7.678945757453375e-06, + "loss": 0.5947, + "step": 7716 + }, + { + "epoch": 1.004685669660289, + "grad_norm": 3.3272864818573, + "learning_rate": 7.677180960025665e-06, + "loss": 0.5434, + "step": 7719 + }, + { + "epoch": 1.0050761421319796, + "grad_norm": 2.3880414962768555, + "learning_rate": 7.675415694900072e-06, + "loss": 0.489, + "step": 7722 + }, + { + "epoch": 1.0054666146036704, + "grad_norm": 2.6195037364959717, + "learning_rate": 7.673649962384985e-06, + "loss": 0.5526, + "step": 7725 + }, + { + "epoch": 1.0058570870753611, + "grad_norm": 2.506817102432251, + "learning_rate": 7.671883762788877e-06, + "loss": 0.4631, + "step": 7728 + }, + { + "epoch": 1.006247559547052, + "grad_norm": 3.4505410194396973, + "learning_rate": 7.670117096420294e-06, + "loss": 0.4856, + "step": 7731 + }, + { + "epoch": 1.0066380320187427, + "grad_norm": 2.148554801940918, + "learning_rate": 7.668349963587872e-06, + "loss": 0.4429, + "step": 7734 + }, + { + "epoch": 1.0070285044904335, + "grad_norm": 2.645296812057495, + "learning_rate": 7.666582364600324e-06, + "loss": 0.5216, + "step": 7737 + }, + { + "epoch": 1.0074189769621242, + "grad_norm": 3.129601001739502, + "learning_rate": 7.664814299766447e-06, + "loss": 0.5421, + "step": 7740 + }, + { + "epoch": 1.007809449433815, + "grad_norm": 2.280381917953491, + "learning_rate": 7.66304576939512e-06, + "loss": 0.5322, + "step": 7743 + }, + { + "epoch": 1.0081999219055056, + "grad_norm": 2.548676013946533, + "learning_rate": 7.661276773795297e-06, + "loss": 0.513, + "step": 7746 + }, + { + "epoch": 1.0085903943771963, + "grad_norm": 3.303908109664917, + "learning_rate": 7.65950731327602e-06, + "loss": 0.5974, + "step": 7749 + }, + { + "epoch": 1.008980866848887, + "grad_norm": 2.7848570346832275, + "learning_rate": 7.657737388146411e-06, + "loss": 0.5288, + "step": 7752 + }, + { + "epoch": 1.0093713393205779, + "grad_norm": 2.4875972270965576, + "learning_rate": 7.65596699871567e-06, + "loss": 0.5485, + "step": 7755 + }, + { + "epoch": 1.0097618117922686, + "grad_norm": 2.5876026153564453, + "learning_rate": 7.654196145293082e-06, + "loss": 0.4749, + "step": 7758 + }, + { + "epoch": 1.0101522842639594, + "grad_norm": 2.8364417552948, + "learning_rate": 7.652424828188011e-06, + "loss": 0.533, + "step": 7761 + }, + { + "epoch": 1.0105427567356502, + "grad_norm": 2.3310322761535645, + "learning_rate": 7.650653047709906e-06, + "loss": 0.6102, + "step": 7764 + }, + { + "epoch": 1.010933229207341, + "grad_norm": 4.061567306518555, + "learning_rate": 7.648880804168287e-06, + "loss": 0.4694, + "step": 7767 + }, + { + "epoch": 1.0113237016790317, + "grad_norm": 2.307548999786377, + "learning_rate": 7.647108097872763e-06, + "loss": 0.5576, + "step": 7770 + }, + { + "epoch": 1.0117141741507223, + "grad_norm": 2.360511541366577, + "learning_rate": 7.645334929133024e-06, + "loss": 0.6458, + "step": 7773 + }, + { + "epoch": 1.012104646622413, + "grad_norm": 2.422124147415161, + "learning_rate": 7.643561298258836e-06, + "loss": 0.439, + "step": 7776 + }, + { + "epoch": 1.0124951190941038, + "grad_norm": 2.6831116676330566, + "learning_rate": 7.641787205560051e-06, + "loss": 0.5588, + "step": 7779 + }, + { + "epoch": 1.0128855915657946, + "grad_norm": 2.6215929985046387, + "learning_rate": 7.6400126513466e-06, + "loss": 0.5985, + "step": 7782 + }, + { + "epoch": 1.0132760640374854, + "grad_norm": 3.8551838397979736, + 
"learning_rate": 7.638237635928493e-06, + "loss": 0.5659, + "step": 7785 + }, + { + "epoch": 1.0136665365091762, + "grad_norm": 2.633751392364502, + "learning_rate": 7.63646215961582e-06, + "loss": 0.5141, + "step": 7788 + }, + { + "epoch": 1.014057008980867, + "grad_norm": 2.5351755619049072, + "learning_rate": 7.634686222718757e-06, + "loss": 0.473, + "step": 7791 + }, + { + "epoch": 1.0144474814525577, + "grad_norm": 2.423412799835205, + "learning_rate": 7.632909825547549e-06, + "loss": 0.4506, + "step": 7794 + }, + { + "epoch": 1.0148379539242482, + "grad_norm": 2.8047244548797607, + "learning_rate": 7.631132968412536e-06, + "loss": 0.4848, + "step": 7797 + }, + { + "epoch": 1.015228426395939, + "grad_norm": 2.771131753921509, + "learning_rate": 7.629355651624126e-06, + "loss": 0.4393, + "step": 7800 + }, + { + "epoch": 1.0156188988676298, + "grad_norm": 3.0369348526000977, + "learning_rate": 7.627577875492817e-06, + "loss": 0.4429, + "step": 7803 + }, + { + "epoch": 1.0160093713393206, + "grad_norm": 2.7859838008880615, + "learning_rate": 7.625799640329181e-06, + "loss": 0.4625, + "step": 7806 + }, + { + "epoch": 1.0163998438110113, + "grad_norm": 2.7366514205932617, + "learning_rate": 7.62402094644387e-06, + "loss": 0.5831, + "step": 7809 + }, + { + "epoch": 1.0167903162827021, + "grad_norm": 2.6900017261505127, + "learning_rate": 7.622241794147622e-06, + "loss": 0.461, + "step": 7812 + }, + { + "epoch": 1.0171807887543929, + "grad_norm": 2.5590507984161377, + "learning_rate": 7.6204621837512495e-06, + "loss": 0.5923, + "step": 7815 + }, + { + "epoch": 1.0175712612260837, + "grad_norm": 2.411809206008911, + "learning_rate": 7.6186821155656435e-06, + "loss": 0.4458, + "step": 7818 + }, + { + "epoch": 1.0179617336977742, + "grad_norm": 3.017052412033081, + "learning_rate": 7.616901589901781e-06, + "loss": 0.5078, + "step": 7821 + }, + { + "epoch": 1.018352206169465, + "grad_norm": 2.686730146408081, + "learning_rate": 7.615120607070717e-06, + "loss": 0.5053, + "step": 7824 + }, + { + "epoch": 1.0187426786411558, + "grad_norm": 2.5899784564971924, + "learning_rate": 7.613339167383585e-06, + "loss": 0.5209, + "step": 7827 + }, + { + "epoch": 1.0191331511128465, + "grad_norm": 2.8320977687835693, + "learning_rate": 7.6115572711515975e-06, + "loss": 0.588, + "step": 7830 + }, + { + "epoch": 1.0195236235845373, + "grad_norm": 2.990689516067505, + "learning_rate": 7.609774918686048e-06, + "loss": 0.5531, + "step": 7833 + }, + { + "epoch": 1.019914096056228, + "grad_norm": 2.3511035442352295, + "learning_rate": 7.60799211029831e-06, + "loss": 0.4349, + "step": 7836 + }, + { + "epoch": 1.0203045685279188, + "grad_norm": 4.445713520050049, + "learning_rate": 7.606208846299839e-06, + "loss": 0.4621, + "step": 7839 + }, + { + "epoch": 1.0206950409996096, + "grad_norm": 2.555044651031494, + "learning_rate": 7.604425127002162e-06, + "loss": 0.5091, + "step": 7842 + }, + { + "epoch": 1.0210855134713002, + "grad_norm": 2.6413726806640625, + "learning_rate": 7.602640952716897e-06, + "loss": 0.5894, + "step": 7845 + }, + { + "epoch": 1.021475985942991, + "grad_norm": 3.004298210144043, + "learning_rate": 7.600856323755732e-06, + "loss": 0.467, + "step": 7848 + }, + { + "epoch": 1.0218664584146817, + "grad_norm": 3.5717835426330566, + "learning_rate": 7.599071240430438e-06, + "loss": 0.5908, + "step": 7851 + }, + { + "epoch": 1.0222569308863725, + "grad_norm": 4.032369613647461, + "learning_rate": 7.5972857030528654e-06, + "loss": 0.5001, + "step": 7854 + }, + { + "epoch": 1.0226474033580633, + 
"grad_norm": 2.679647922515869, + "learning_rate": 7.595499711934946e-06, + "loss": 0.549, + "step": 7857 + }, + { + "epoch": 1.023037875829754, + "grad_norm": 2.6583425998687744, + "learning_rate": 7.593713267388686e-06, + "loss": 0.4949, + "step": 7860 + }, + { + "epoch": 1.0234283483014448, + "grad_norm": 2.699441432952881, + "learning_rate": 7.591926369726174e-06, + "loss": 0.5405, + "step": 7863 + }, + { + "epoch": 1.0238188207731356, + "grad_norm": 2.4705007076263428, + "learning_rate": 7.590139019259579e-06, + "loss": 0.6412, + "step": 7866 + }, + { + "epoch": 1.0242092932448263, + "grad_norm": 2.6664178371429443, + "learning_rate": 7.588351216301147e-06, + "loss": 0.4991, + "step": 7869 + }, + { + "epoch": 1.024599765716517, + "grad_norm": 2.7439897060394287, + "learning_rate": 7.5865629611632005e-06, + "loss": 0.5448, + "step": 7872 + }, + { + "epoch": 1.0249902381882077, + "grad_norm": 2.5113844871520996, + "learning_rate": 7.584774254158147e-06, + "loss": 0.5012, + "step": 7875 + }, + { + "epoch": 1.0253807106598984, + "grad_norm": 2.640521287918091, + "learning_rate": 7.582985095598469e-06, + "loss": 0.5282, + "step": 7878 + }, + { + "epoch": 1.0257711831315892, + "grad_norm": 2.277463912963867, + "learning_rate": 7.5811954857967285e-06, + "loss": 0.4448, + "step": 7881 + }, + { + "epoch": 1.02616165560328, + "grad_norm": 2.85617733001709, + "learning_rate": 7.579405425065567e-06, + "loss": 0.4533, + "step": 7884 + }, + { + "epoch": 1.0265521280749708, + "grad_norm": 2.714686155319214, + "learning_rate": 7.577614913717703e-06, + "loss": 0.4766, + "step": 7887 + }, + { + "epoch": 1.0269426005466615, + "grad_norm": 4.346346378326416, + "learning_rate": 7.575823952065936e-06, + "loss": 0.4783, + "step": 7890 + }, + { + "epoch": 1.0273330730183523, + "grad_norm": 2.2822585105895996, + "learning_rate": 7.574032540423145e-06, + "loss": 0.4686, + "step": 7893 + }, + { + "epoch": 1.0277235454900429, + "grad_norm": 2.7824783325195312, + "learning_rate": 7.572240679102283e-06, + "loss": 0.5298, + "step": 7896 + }, + { + "epoch": 1.0281140179617336, + "grad_norm": 2.2325971126556396, + "learning_rate": 7.570448368416387e-06, + "loss": 0.4116, + "step": 7899 + }, + { + "epoch": 1.0285044904334244, + "grad_norm": 2.75234317779541, + "learning_rate": 7.568655608678566e-06, + "loss": 0.5276, + "step": 7902 + }, + { + "epoch": 1.0288949629051152, + "grad_norm": 2.7082951068878174, + "learning_rate": 7.566862400202015e-06, + "loss": 0.5465, + "step": 7905 + }, + { + "epoch": 1.029285435376806, + "grad_norm": 2.646604299545288, + "learning_rate": 7.5650687433000026e-06, + "loss": 0.5085, + "step": 7908 + }, + { + "epoch": 1.0296759078484967, + "grad_norm": 2.7950475215911865, + "learning_rate": 7.563274638285876e-06, + "loss": 0.4742, + "step": 7911 + }, + { + "epoch": 1.0300663803201875, + "grad_norm": 2.585192918777466, + "learning_rate": 7.5614800854730645e-06, + "loss": 0.4821, + "step": 7914 + }, + { + "epoch": 1.0304568527918783, + "grad_norm": 2.4464495182037354, + "learning_rate": 7.559685085175069e-06, + "loss": 0.5618, + "step": 7917 + }, + { + "epoch": 1.0308473252635688, + "grad_norm": 2.5040671825408936, + "learning_rate": 7.557889637705473e-06, + "loss": 0.4845, + "step": 7920 + }, + { + "epoch": 1.0312377977352596, + "grad_norm": 2.605333089828491, + "learning_rate": 7.556093743377941e-06, + "loss": 0.5349, + "step": 7923 + }, + { + "epoch": 1.0316282702069504, + "grad_norm": 2.8396682739257812, + "learning_rate": 7.5542974025062076e-06, + "loss": 0.5243, + "step": 7926 + }, + { + 
"epoch": 1.0320187426786411, + "grad_norm": 2.373145818710327, + "learning_rate": 7.552500615404093e-06, + "loss": 0.4422, + "step": 7929 + }, + { + "epoch": 1.032409215150332, + "grad_norm": 2.7025516033172607, + "learning_rate": 7.550703382385488e-06, + "loss": 0.5432, + "step": 7932 + }, + { + "epoch": 1.0327996876220227, + "grad_norm": 3.0393564701080322, + "learning_rate": 7.548905703764371e-06, + "loss": 0.4643, + "step": 7935 + }, + { + "epoch": 1.0331901600937134, + "grad_norm": 2.475292921066284, + "learning_rate": 7.5471075798547865e-06, + "loss": 0.4984, + "step": 7938 + }, + { + "epoch": 1.0335806325654042, + "grad_norm": 2.519763708114624, + "learning_rate": 7.545309010970867e-06, + "loss": 0.4986, + "step": 7941 + }, + { + "epoch": 1.0339711050370948, + "grad_norm": 2.4332497119903564, + "learning_rate": 7.54350999742682e-06, + "loss": 0.4935, + "step": 7944 + }, + { + "epoch": 1.0343615775087855, + "grad_norm": 3.0140507221221924, + "learning_rate": 7.5417105395369235e-06, + "loss": 0.4621, + "step": 7947 + }, + { + "epoch": 1.0347520499804763, + "grad_norm": 2.577070713043213, + "learning_rate": 7.539910637615546e-06, + "loss": 0.4794, + "step": 7950 + }, + { + "epoch": 1.035142522452167, + "grad_norm": 2.6139345169067383, + "learning_rate": 7.538110291977123e-06, + "loss": 0.476, + "step": 7953 + }, + { + "epoch": 1.0355329949238579, + "grad_norm": 2.6360697746276855, + "learning_rate": 7.53630950293617e-06, + "loss": 0.4843, + "step": 7956 + }, + { + "epoch": 1.0359234673955486, + "grad_norm": 2.5535480976104736, + "learning_rate": 7.5345082708072836e-06, + "loss": 0.4639, + "step": 7959 + }, + { + "epoch": 1.0363139398672394, + "grad_norm": 2.9951012134552, + "learning_rate": 7.532706595905133e-06, + "loss": 0.5363, + "step": 7962 + }, + { + "epoch": 1.0367044123389302, + "grad_norm": 2.2908568382263184, + "learning_rate": 7.53090447854447e-06, + "loss": 0.4262, + "step": 7965 + }, + { + "epoch": 1.037094884810621, + "grad_norm": 2.7684085369110107, + "learning_rate": 7.529101919040116e-06, + "loss": 0.5007, + "step": 7968 + }, + { + "epoch": 1.0374853572823115, + "grad_norm": 2.3997421264648438, + "learning_rate": 7.5272989177069795e-06, + "loss": 0.5509, + "step": 7971 + }, + { + "epoch": 1.0378758297540023, + "grad_norm": 2.433640241622925, + "learning_rate": 7.525495474860037e-06, + "loss": 0.5496, + "step": 7974 + }, + { + "epoch": 1.038266302225693, + "grad_norm": 3.6235389709472656, + "learning_rate": 7.52369159081435e-06, + "loss": 0.5144, + "step": 7977 + }, + { + "epoch": 1.0386567746973838, + "grad_norm": 2.715113401412964, + "learning_rate": 7.521887265885049e-06, + "loss": 0.4914, + "step": 7980 + }, + { + "epoch": 1.0390472471690746, + "grad_norm": 2.352916955947876, + "learning_rate": 7.520082500387349e-06, + "loss": 0.4605, + "step": 7983 + }, + { + "epoch": 1.0394377196407654, + "grad_norm": 2.825352668762207, + "learning_rate": 7.518277294636538e-06, + "loss": 0.5833, + "step": 7986 + }, + { + "epoch": 1.0398281921124561, + "grad_norm": 2.906801462173462, + "learning_rate": 7.51647164894798e-06, + "loss": 0.5857, + "step": 7989 + }, + { + "epoch": 1.040218664584147, + "grad_norm": 2.4375948905944824, + "learning_rate": 7.51466556363712e-06, + "loss": 0.4738, + "step": 7992 + }, + { + "epoch": 1.0406091370558375, + "grad_norm": 2.6101300716400146, + "learning_rate": 7.512859039019476e-06, + "loss": 0.4943, + "step": 7995 + }, + { + "epoch": 1.0409996095275282, + "grad_norm": 7.20686149597168, + "learning_rate": 7.511052075410644e-06, + "loss": 0.5433, + 
"step": 7998 + }, + { + "epoch": 1.041390081999219, + "grad_norm": 2.5502688884735107, + "learning_rate": 7.509244673126298e-06, + "loss": 0.5695, + "step": 8001 + }, + { + "epoch": 1.0417805544709098, + "grad_norm": 3.865182399749756, + "learning_rate": 7.507436832482185e-06, + "loss": 0.4969, + "step": 8004 + }, + { + "epoch": 1.0421710269426006, + "grad_norm": 2.659177541732788, + "learning_rate": 7.5056285537941335e-06, + "loss": 0.6332, + "step": 8007 + }, + { + "epoch": 1.0425614994142913, + "grad_norm": 2.6241230964660645, + "learning_rate": 7.503819837378042e-06, + "loss": 0.4546, + "step": 8010 + }, + { + "epoch": 1.042951971885982, + "grad_norm": 3.0326120853424072, + "learning_rate": 7.502010683549894e-06, + "loss": 0.5339, + "step": 8013 + }, + { + "epoch": 1.0433424443576729, + "grad_norm": 2.694370985031128, + "learning_rate": 7.500201092625743e-06, + "loss": 0.4685, + "step": 8016 + }, + { + "epoch": 1.0437329168293634, + "grad_norm": 2.683987855911255, + "learning_rate": 7.498391064921721e-06, + "loss": 0.5303, + "step": 8019 + }, + { + "epoch": 1.0441233893010542, + "grad_norm": 3.791745901107788, + "learning_rate": 7.496580600754036e-06, + "loss": 0.4798, + "step": 8022 + }, + { + "epoch": 1.044513861772745, + "grad_norm": 3.1122212409973145, + "learning_rate": 7.494769700438971e-06, + "loss": 0.5105, + "step": 8025 + }, + { + "epoch": 1.0449043342444357, + "grad_norm": 2.536529302597046, + "learning_rate": 7.492958364292888e-06, + "loss": 0.4569, + "step": 8028 + }, + { + "epoch": 1.0452948067161265, + "grad_norm": 2.741360902786255, + "learning_rate": 7.491146592632223e-06, + "loss": 0.5162, + "step": 8031 + }, + { + "epoch": 1.0456852791878173, + "grad_norm": 2.675175905227661, + "learning_rate": 7.48933438577349e-06, + "loss": 0.4978, + "step": 8034 + }, + { + "epoch": 1.046075751659508, + "grad_norm": 2.6189332008361816, + "learning_rate": 7.487521744033275e-06, + "loss": 0.4782, + "step": 8037 + }, + { + "epoch": 1.0464662241311988, + "grad_norm": 3.0063090324401855, + "learning_rate": 7.485708667728245e-06, + "loss": 0.4818, + "step": 8040 + }, + { + "epoch": 1.0468566966028896, + "grad_norm": 3.2774417400360107, + "learning_rate": 7.483895157175141e-06, + "loss": 0.4994, + "step": 8043 + }, + { + "epoch": 1.0472471690745802, + "grad_norm": 2.4329867362976074, + "learning_rate": 7.482081212690777e-06, + "loss": 0.478, + "step": 8046 + }, + { + "epoch": 1.047637641546271, + "grad_norm": 2.885873556137085, + "learning_rate": 7.480266834592047e-06, + "loss": 0.5012, + "step": 8049 + }, + { + "epoch": 1.0480281140179617, + "grad_norm": 2.5598886013031006, + "learning_rate": 7.478452023195918e-06, + "loss": 0.4845, + "step": 8052 + }, + { + "epoch": 1.0484185864896525, + "grad_norm": 3.7768354415893555, + "learning_rate": 7.476636778819435e-06, + "loss": 0.4945, + "step": 8055 + }, + { + "epoch": 1.0488090589613432, + "grad_norm": 3.060943365097046, + "learning_rate": 7.474821101779718e-06, + "loss": 0.5462, + "step": 8058 + }, + { + "epoch": 1.049199531433034, + "grad_norm": 2.9565443992614746, + "learning_rate": 7.47300499239396e-06, + "loss": 0.5928, + "step": 8061 + }, + { + "epoch": 1.0495900039047248, + "grad_norm": 2.9404168128967285, + "learning_rate": 7.471188450979432e-06, + "loss": 0.6085, + "step": 8064 + }, + { + "epoch": 1.0499804763764156, + "grad_norm": 2.50423264503479, + "learning_rate": 7.4693714778534795e-06, + "loss": 0.5282, + "step": 8067 + }, + { + "epoch": 1.0503709488481061, + "grad_norm": 2.8531126976013184, + "learning_rate": 
7.467554073333525e-06, + "loss": 0.5469, + "step": 8070 + }, + { + "epoch": 1.0507614213197969, + "grad_norm": 2.6135265827178955, + "learning_rate": 7.465736237737066e-06, + "loss": 0.5384, + "step": 8073 + }, + { + "epoch": 1.0511518937914877, + "grad_norm": 2.651714324951172, + "learning_rate": 7.463917971381672e-06, + "loss": 0.4965, + "step": 8076 + }, + { + "epoch": 1.0515423662631784, + "grad_norm": 2.368147611618042, + "learning_rate": 7.462099274584993e-06, + "loss": 0.5733, + "step": 8079 + }, + { + "epoch": 1.0519328387348692, + "grad_norm": 3.0812952518463135, + "learning_rate": 7.460280147664749e-06, + "loss": 0.4543, + "step": 8082 + }, + { + "epoch": 1.05232331120656, + "grad_norm": 2.6388537883758545, + "learning_rate": 7.458460590938741e-06, + "loss": 0.5554, + "step": 8085 + }, + { + "epoch": 1.0527137836782507, + "grad_norm": 2.5167076587677, + "learning_rate": 7.4566406047248385e-06, + "loss": 0.5311, + "step": 8088 + }, + { + "epoch": 1.0531042561499415, + "grad_norm": 3.7936503887176514, + "learning_rate": 7.454820189340989e-06, + "loss": 0.5205, + "step": 8091 + }, + { + "epoch": 1.053494728621632, + "grad_norm": 3.482969045639038, + "learning_rate": 7.452999345105218e-06, + "loss": 0.4703, + "step": 8094 + }, + { + "epoch": 1.0538852010933228, + "grad_norm": 3.16825270652771, + "learning_rate": 7.451178072335621e-06, + "loss": 0.4716, + "step": 8097 + }, + { + "epoch": 1.0542756735650136, + "grad_norm": 2.6491572856903076, + "learning_rate": 7.449356371350371e-06, + "loss": 0.5886, + "step": 8100 + }, + { + "epoch": 1.0546661460367044, + "grad_norm": 2.7450640201568604, + "learning_rate": 7.447534242467718e-06, + "loss": 0.4394, + "step": 8103 + }, + { + "epoch": 1.0550566185083952, + "grad_norm": 2.392484664916992, + "learning_rate": 7.445711686005978e-06, + "loss": 0.4185, + "step": 8106 + }, + { + "epoch": 1.055447090980086, + "grad_norm": 2.569303274154663, + "learning_rate": 7.443888702283555e-06, + "loss": 0.4916, + "step": 8109 + }, + { + "epoch": 1.0558375634517767, + "grad_norm": 2.565568447113037, + "learning_rate": 7.442065291618915e-06, + "loss": 0.4677, + "step": 8112 + }, + { + "epoch": 1.0562280359234675, + "grad_norm": 2.54575252532959, + "learning_rate": 7.440241454330606e-06, + "loss": 0.4087, + "step": 8115 + }, + { + "epoch": 1.0566185083951583, + "grad_norm": 3.055168390274048, + "learning_rate": 7.438417190737248e-06, + "loss": 0.442, + "step": 8118 + }, + { + "epoch": 1.0570089808668488, + "grad_norm": 2.5168895721435547, + "learning_rate": 7.436592501157538e-06, + "loss": 0.4346, + "step": 8121 + }, + { + "epoch": 1.0573994533385396, + "grad_norm": 2.5084915161132812, + "learning_rate": 7.434767385910243e-06, + "loss": 0.4508, + "step": 8124 + }, + { + "epoch": 1.0577899258102303, + "grad_norm": 3.041224479675293, + "learning_rate": 7.432941845314207e-06, + "loss": 0.4823, + "step": 8127 + }, + { + "epoch": 1.0581803982819211, + "grad_norm": 3.2237794399261475, + "learning_rate": 7.431115879688351e-06, + "loss": 0.5292, + "step": 8130 + }, + { + "epoch": 1.058570870753612, + "grad_norm": 3.038773775100708, + "learning_rate": 7.429289489351663e-06, + "loss": 0.5, + "step": 8133 + }, + { + "epoch": 1.0589613432253027, + "grad_norm": 3.018162488937378, + "learning_rate": 7.4274626746232125e-06, + "loss": 0.4816, + "step": 8136 + }, + { + "epoch": 1.0593518156969934, + "grad_norm": 2.773552179336548, + "learning_rate": 7.425635435822139e-06, + "loss": 0.4643, + "step": 8139 + }, + { + "epoch": 1.0597422881686842, + "grad_norm": 2.652228593826294, 
+ "learning_rate": 7.423807773267659e-06, + "loss": 0.5507, + "step": 8142 + }, + { + "epoch": 1.0601327606403748, + "grad_norm": 2.473726511001587, + "learning_rate": 7.421979687279058e-06, + "loss": 0.5871, + "step": 8145 + }, + { + "epoch": 1.0605232331120655, + "grad_norm": 2.8862738609313965, + "learning_rate": 7.420151178175702e-06, + "loss": 0.5979, + "step": 8148 + }, + { + "epoch": 1.0609137055837563, + "grad_norm": 3.1102709770202637, + "learning_rate": 7.4183222462770266e-06, + "loss": 0.4983, + "step": 8151 + }, + { + "epoch": 1.061304178055447, + "grad_norm": 2.4361696243286133, + "learning_rate": 7.416492891902541e-06, + "loss": 0.4916, + "step": 8154 + }, + { + "epoch": 1.0616946505271379, + "grad_norm": 2.7870354652404785, + "learning_rate": 7.414663115371832e-06, + "loss": 0.4924, + "step": 8157 + }, + { + "epoch": 1.0620851229988286, + "grad_norm": 2.7369415760040283, + "learning_rate": 7.412832917004556e-06, + "loss": 0.5325, + "step": 8160 + }, + { + "epoch": 1.0624755954705194, + "grad_norm": 2.71121883392334, + "learning_rate": 7.411002297120444e-06, + "loss": 0.5131, + "step": 8163 + }, + { + "epoch": 1.0628660679422102, + "grad_norm": 2.594827890396118, + "learning_rate": 7.409171256039305e-06, + "loss": 0.468, + "step": 8166 + }, + { + "epoch": 1.0632565404139007, + "grad_norm": 2.7998273372650146, + "learning_rate": 7.407339794081013e-06, + "loss": 0.4999, + "step": 8169 + }, + { + "epoch": 1.0636470128855915, + "grad_norm": 2.753424882888794, + "learning_rate": 7.405507911565526e-06, + "loss": 0.517, + "step": 8172 + }, + { + "epoch": 1.0640374853572823, + "grad_norm": 2.8335673809051514, + "learning_rate": 7.403675608812866e-06, + "loss": 0.4969, + "step": 8175 + }, + { + "epoch": 1.064427957828973, + "grad_norm": 2.668989419937134, + "learning_rate": 7.401842886143133e-06, + "loss": 0.4657, + "step": 8178 + }, + { + "epoch": 1.0648184303006638, + "grad_norm": 3.0732874870300293, + "learning_rate": 7.400009743876502e-06, + "loss": 0.4661, + "step": 8181 + }, + { + "epoch": 1.0652089027723546, + "grad_norm": 2.5212926864624023, + "learning_rate": 7.398176182333217e-06, + "loss": 0.4812, + "step": 8184 + }, + { + "epoch": 1.0655993752440454, + "grad_norm": 2.622633457183838, + "learning_rate": 7.396342201833597e-06, + "loss": 0.5319, + "step": 8187 + }, + { + "epoch": 1.0659898477157361, + "grad_norm": 3.5963311195373535, + "learning_rate": 7.394507802698037e-06, + "loss": 0.52, + "step": 8190 + }, + { + "epoch": 1.066380320187427, + "grad_norm": 2.81231951713562, + "learning_rate": 7.392672985247002e-06, + "loss": 0.5517, + "step": 8193 + }, + { + "epoch": 1.0667707926591175, + "grad_norm": 2.674590826034546, + "learning_rate": 7.390837749801027e-06, + "loss": 0.5542, + "step": 8196 + }, + { + "epoch": 1.0671612651308082, + "grad_norm": 2.3117876052856445, + "learning_rate": 7.389002096680729e-06, + "loss": 0.4823, + "step": 8199 + }, + { + "epoch": 1.067551737602499, + "grad_norm": 2.682879686355591, + "learning_rate": 7.387166026206789e-06, + "loss": 0.4202, + "step": 8202 + }, + { + "epoch": 1.0679422100741898, + "grad_norm": 3.0507800579071045, + "learning_rate": 7.3853295386999665e-06, + "loss": 0.5087, + "step": 8205 + }, + { + "epoch": 1.0683326825458805, + "grad_norm": 3.6044976711273193, + "learning_rate": 7.383492634481093e-06, + "loss": 0.5968, + "step": 8208 + }, + { + "epoch": 1.0687231550175713, + "grad_norm": 2.9628076553344727, + "learning_rate": 7.381655313871069e-06, + "loss": 0.5076, + "step": 8211 + }, + { + "epoch": 1.069113627489262, + 
"grad_norm": 2.616960287094116, + "learning_rate": 7.379817577190873e-06, + "loss": 0.5251, + "step": 8214 + }, + { + "epoch": 1.0695040999609526, + "grad_norm": 3.7109944820404053, + "learning_rate": 7.377979424761551e-06, + "loss": 0.5161, + "step": 8217 + }, + { + "epoch": 1.0698945724326434, + "grad_norm": 2.695524215698242, + "learning_rate": 7.376140856904227e-06, + "loss": 0.525, + "step": 8220 + }, + { + "epoch": 1.0702850449043342, + "grad_norm": 2.8099405765533447, + "learning_rate": 7.374301873940093e-06, + "loss": 0.5413, + "step": 8223 + }, + { + "epoch": 1.070675517376025, + "grad_norm": 2.4617536067962646, + "learning_rate": 7.372462476190417e-06, + "loss": 0.5188, + "step": 8226 + }, + { + "epoch": 1.0710659898477157, + "grad_norm": 2.8862340450286865, + "learning_rate": 7.370622663976539e-06, + "loss": 0.499, + "step": 8229 + }, + { + "epoch": 1.0714564623194065, + "grad_norm": 2.755232334136963, + "learning_rate": 7.3687824376198665e-06, + "loss": 0.4981, + "step": 8232 + }, + { + "epoch": 1.0718469347910973, + "grad_norm": 2.4629595279693604, + "learning_rate": 7.3669417974418865e-06, + "loss": 0.501, + "step": 8235 + }, + { + "epoch": 1.072237407262788, + "grad_norm": 2.6113266944885254, + "learning_rate": 7.365100743764153e-06, + "loss": 0.4459, + "step": 8238 + }, + { + "epoch": 1.0726278797344788, + "grad_norm": 2.6274919509887695, + "learning_rate": 7.363259276908294e-06, + "loss": 0.4977, + "step": 8241 + }, + { + "epoch": 1.0730183522061694, + "grad_norm": 2.575099468231201, + "learning_rate": 7.3614173971960134e-06, + "loss": 0.5597, + "step": 8244 + }, + { + "epoch": 1.0734088246778601, + "grad_norm": 2.7904398441314697, + "learning_rate": 7.35957510494908e-06, + "loss": 0.5409, + "step": 8247 + }, + { + "epoch": 1.073799297149551, + "grad_norm": 2.6250545978546143, + "learning_rate": 7.357732400489342e-06, + "loss": 0.4675, + "step": 8250 + }, + { + "epoch": 1.0741897696212417, + "grad_norm": 2.454293727874756, + "learning_rate": 7.35588928413871e-06, + "loss": 0.5445, + "step": 8253 + }, + { + "epoch": 1.0745802420929325, + "grad_norm": 2.9731876850128174, + "learning_rate": 7.354045756219177e-06, + "loss": 0.5563, + "step": 8256 + }, + { + "epoch": 1.0749707145646232, + "grad_norm": 2.4799342155456543, + "learning_rate": 7.352201817052804e-06, + "loss": 0.5015, + "step": 8259 + }, + { + "epoch": 1.075361187036314, + "grad_norm": 2.2865350246429443, + "learning_rate": 7.350357466961719e-06, + "loss": 0.4247, + "step": 8262 + }, + { + "epoch": 1.0757516595080048, + "grad_norm": 2.5727767944335938, + "learning_rate": 7.348512706268132e-06, + "loss": 0.5026, + "step": 8265 + }, + { + "epoch": 1.0761421319796955, + "grad_norm": 2.5520944595336914, + "learning_rate": 7.346667535294314e-06, + "loss": 0.4901, + "step": 8268 + }, + { + "epoch": 1.076532604451386, + "grad_norm": 2.7078912258148193, + "learning_rate": 7.344821954362615e-06, + "loss": 0.5435, + "step": 8271 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 2.717442512512207, + "learning_rate": 7.342975963795454e-06, + "loss": 0.4969, + "step": 8274 + }, + { + "epoch": 1.0773135493947676, + "grad_norm": 2.646892547607422, + "learning_rate": 7.341129563915319e-06, + "loss": 0.6138, + "step": 8277 + }, + { + "epoch": 1.0777040218664584, + "grad_norm": 3.4911608695983887, + "learning_rate": 7.339282755044776e-06, + "loss": 0.4708, + "step": 8280 + }, + { + "epoch": 1.0780944943381492, + "grad_norm": 2.682307243347168, + "learning_rate": 7.337435537506456e-06, + "loss": 0.4995, + "step": 8283 + }, + { + 
"epoch": 1.07848496680984, + "grad_norm": 2.9293227195739746, + "learning_rate": 7.335587911623065e-06, + "loss": 0.4823, + "step": 8286 + }, + { + "epoch": 1.0788754392815307, + "grad_norm": 2.717480182647705, + "learning_rate": 7.33373987771738e-06, + "loss": 0.4743, + "step": 8289 + }, + { + "epoch": 1.0792659117532213, + "grad_norm": 2.61503005027771, + "learning_rate": 7.331891436112246e-06, + "loss": 0.4466, + "step": 8292 + }, + { + "epoch": 1.079656384224912, + "grad_norm": 2.5344207286834717, + "learning_rate": 7.330042587130586e-06, + "loss": 0.4884, + "step": 8295 + }, + { + "epoch": 1.0800468566966028, + "grad_norm": 2.708646774291992, + "learning_rate": 7.328193331095387e-06, + "loss": 0.4836, + "step": 8298 + }, + { + "epoch": 1.0804373291682936, + "grad_norm": 2.689146041870117, + "learning_rate": 7.326343668329711e-06, + "loss": 0.5197, + "step": 8301 + }, + { + "epoch": 1.0808278016399844, + "grad_norm": 2.5231661796569824, + "learning_rate": 7.324493599156688e-06, + "loss": 0.5055, + "step": 8304 + }, + { + "epoch": 1.0812182741116751, + "grad_norm": 2.644646167755127, + "learning_rate": 7.322643123899525e-06, + "loss": 0.4866, + "step": 8307 + }, + { + "epoch": 1.081608746583366, + "grad_norm": 3.1305809020996094, + "learning_rate": 7.3207922428814935e-06, + "loss": 0.5115, + "step": 8310 + }, + { + "epoch": 1.0819992190550567, + "grad_norm": 2.661428928375244, + "learning_rate": 7.318940956425941e-06, + "loss": 0.5316, + "step": 8313 + }, + { + "epoch": 1.0823896915267475, + "grad_norm": 2.929919958114624, + "learning_rate": 7.317089264856281e-06, + "loss": 0.5167, + "step": 8316 + }, + { + "epoch": 1.082780163998438, + "grad_norm": 2.6820735931396484, + "learning_rate": 7.315237168496e-06, + "loss": 0.5193, + "step": 8319 + }, + { + "epoch": 1.0831706364701288, + "grad_norm": 2.9934213161468506, + "learning_rate": 7.313384667668657e-06, + "loss": 0.4755, + "step": 8322 + }, + { + "epoch": 1.0835611089418196, + "grad_norm": 2.795145273208618, + "learning_rate": 7.311531762697879e-06, + "loss": 0.4628, + "step": 8325 + }, + { + "epoch": 1.0839515814135103, + "grad_norm": 2.4267120361328125, + "learning_rate": 7.309678453907365e-06, + "loss": 0.4883, + "step": 8328 + }, + { + "epoch": 1.084342053885201, + "grad_norm": 2.76674485206604, + "learning_rate": 7.307824741620883e-06, + "loss": 0.4929, + "step": 8331 + }, + { + "epoch": 1.0847325263568919, + "grad_norm": 2.8393499851226807, + "learning_rate": 7.3059706261622745e-06, + "loss": 0.5926, + "step": 8334 + }, + { + "epoch": 1.0851229988285827, + "grad_norm": 2.8024489879608154, + "learning_rate": 7.304116107855449e-06, + "loss": 0.5291, + "step": 8337 + }, + { + "epoch": 1.0855134713002734, + "grad_norm": 2.2711691856384277, + "learning_rate": 7.302261187024386e-06, + "loss": 0.5723, + "step": 8340 + }, + { + "epoch": 1.085903943771964, + "grad_norm": 2.544989585876465, + "learning_rate": 7.300405863993136e-06, + "loss": 0.4604, + "step": 8343 + }, + { + "epoch": 1.0862944162436547, + "grad_norm": 4.123943328857422, + "learning_rate": 7.298550139085823e-06, + "loss": 0.4671, + "step": 8346 + }, + { + "epoch": 1.0866848887153455, + "grad_norm": 2.6443090438842773, + "learning_rate": 7.296694012626635e-06, + "loss": 0.4958, + "step": 8349 + }, + { + "epoch": 1.0870753611870363, + "grad_norm": 2.406564474105835, + "learning_rate": 7.294837484939835e-06, + "loss": 0.4587, + "step": 8352 + }, + { + "epoch": 1.087465833658727, + "grad_norm": 2.6871235370635986, + "learning_rate": 7.292980556349754e-06, + "loss": 0.5546, + 
"step": 8355 + }, + { + "epoch": 1.0878563061304178, + "grad_norm": 2.515164375305176, + "learning_rate": 7.291123227180793e-06, + "loss": 0.46, + "step": 8358 + }, + { + "epoch": 1.0882467786021086, + "grad_norm": 2.631863832473755, + "learning_rate": 7.289265497757424e-06, + "loss": 0.4499, + "step": 8361 + }, + { + "epoch": 1.0886372510737994, + "grad_norm": 3.6877024173736572, + "learning_rate": 7.287407368404189e-06, + "loss": 0.519, + "step": 8364 + }, + { + "epoch": 1.08902772354549, + "grad_norm": 2.77193021774292, + "learning_rate": 7.2855488394456975e-06, + "loss": 0.5345, + "step": 8367 + }, + { + "epoch": 1.0894181960171807, + "grad_norm": 2.9497268199920654, + "learning_rate": 7.283689911206633e-06, + "loss": 0.4815, + "step": 8370 + }, + { + "epoch": 1.0898086684888715, + "grad_norm": 2.750422477722168, + "learning_rate": 7.281830584011745e-06, + "loss": 0.4844, + "step": 8373 + }, + { + "epoch": 1.0901991409605623, + "grad_norm": 2.9934771060943604, + "learning_rate": 7.279970858185854e-06, + "loss": 0.4681, + "step": 8376 + }, + { + "epoch": 1.090589613432253, + "grad_norm": 2.8141846656799316, + "learning_rate": 7.278110734053852e-06, + "loss": 0.5763, + "step": 8379 + }, + { + "epoch": 1.0909800859039438, + "grad_norm": 2.7781667709350586, + "learning_rate": 7.276250211940695e-06, + "loss": 0.5492, + "step": 8382 + }, + { + "epoch": 1.0913705583756346, + "grad_norm": 2.797043800354004, + "learning_rate": 7.274389292171416e-06, + "loss": 0.4497, + "step": 8385 + }, + { + "epoch": 1.0917610308473253, + "grad_norm": 2.90187406539917, + "learning_rate": 7.272527975071111e-06, + "loss": 0.4726, + "step": 8388 + }, + { + "epoch": 1.0921515033190161, + "grad_norm": 2.9917635917663574, + "learning_rate": 7.270666260964949e-06, + "loss": 0.5161, + "step": 8391 + }, + { + "epoch": 1.0925419757907067, + "grad_norm": 2.316232919692993, + "learning_rate": 7.268804150178171e-06, + "loss": 0.4295, + "step": 8394 + }, + { + "epoch": 1.0929324482623974, + "grad_norm": 2.60370135307312, + "learning_rate": 7.266941643036077e-06, + "loss": 0.4582, + "step": 8397 + }, + { + "epoch": 1.0933229207340882, + "grad_norm": 2.546489953994751, + "learning_rate": 7.26507873986405e-06, + "loss": 0.4716, + "step": 8400 + }, + { + "epoch": 1.093713393205779, + "grad_norm": 2.750613212585449, + "learning_rate": 7.26321544098753e-06, + "loss": 0.5778, + "step": 8403 + }, + { + "epoch": 1.0941038656774698, + "grad_norm": 2.4992191791534424, + "learning_rate": 7.261351746732035e-06, + "loss": 0.4807, + "step": 8406 + }, + { + "epoch": 1.0944943381491605, + "grad_norm": 2.8816723823547363, + "learning_rate": 7.2594876574231465e-06, + "loss": 0.5364, + "step": 8409 + }, + { + "epoch": 1.0948848106208513, + "grad_norm": 2.80865478515625, + "learning_rate": 7.257623173386516e-06, + "loss": 0.546, + "step": 8412 + }, + { + "epoch": 1.095275283092542, + "grad_norm": 2.4753668308258057, + "learning_rate": 7.25575829494787e-06, + "loss": 0.426, + "step": 8415 + }, + { + "epoch": 1.0956657555642326, + "grad_norm": 2.413982629776001, + "learning_rate": 7.253893022432993e-06, + "loss": 0.4686, + "step": 8418 + }, + { + "epoch": 1.0960562280359234, + "grad_norm": 2.514122247695923, + "learning_rate": 7.2520273561677455e-06, + "loss": 0.5196, + "step": 8421 + }, + { + "epoch": 1.0964467005076142, + "grad_norm": 2.6076467037200928, + "learning_rate": 7.25016129647806e-06, + "loss": 0.5595, + "step": 8424 + }, + { + "epoch": 1.096837172979305, + "grad_norm": 3.611821174621582, + "learning_rate": 7.248294843689927e-06, + 
"loss": 0.5155, + "step": 8427 + }, + { + "epoch": 1.0972276454509957, + "grad_norm": 3.5970561504364014, + "learning_rate": 7.246427998129414e-06, + "loss": 0.5353, + "step": 8430 + }, + { + "epoch": 1.0976181179226865, + "grad_norm": 2.886139392852783, + "learning_rate": 7.244560760122656e-06, + "loss": 0.5134, + "step": 8433 + }, + { + "epoch": 1.0980085903943773, + "grad_norm": 2.6416497230529785, + "learning_rate": 7.242693129995857e-06, + "loss": 0.4561, + "step": 8436 + }, + { + "epoch": 1.098399062866068, + "grad_norm": 2.9228434562683105, + "learning_rate": 7.2408251080752845e-06, + "loss": 0.4602, + "step": 8439 + }, + { + "epoch": 1.0987895353377586, + "grad_norm": 2.7232162952423096, + "learning_rate": 7.2389566946872795e-06, + "loss": 0.4795, + "step": 8442 + }, + { + "epoch": 1.0991800078094494, + "grad_norm": 2.603848934173584, + "learning_rate": 7.23708789015825e-06, + "loss": 0.5097, + "step": 8445 + }, + { + "epoch": 1.0995704802811401, + "grad_norm": 3.2897467613220215, + "learning_rate": 7.235218694814673e-06, + "loss": 0.5706, + "step": 8448 + }, + { + "epoch": 1.099960952752831, + "grad_norm": 2.6159188747406006, + "learning_rate": 7.233349108983091e-06, + "loss": 0.5107, + "step": 8451 + }, + { + "epoch": 1.1003514252245217, + "grad_norm": 2.4405086040496826, + "learning_rate": 7.231479132990118e-06, + "loss": 0.5245, + "step": 8454 + }, + { + "epoch": 1.1007418976962124, + "grad_norm": 2.8570423126220703, + "learning_rate": 7.229608767162437e-06, + "loss": 0.4532, + "step": 8457 + }, + { + "epoch": 1.1011323701679032, + "grad_norm": 3.2515525817871094, + "learning_rate": 7.2277380118267924e-06, + "loss": 0.5231, + "step": 8460 + }, + { + "epoch": 1.101522842639594, + "grad_norm": 2.619053602218628, + "learning_rate": 7.2258668673100055e-06, + "loss": 0.4558, + "step": 8463 + }, + { + "epoch": 1.1019133151112848, + "grad_norm": 3.4879095554351807, + "learning_rate": 7.223995333938958e-06, + "loss": 0.5, + "step": 8466 + }, + { + "epoch": 1.1023037875829753, + "grad_norm": 2.678898334503174, + "learning_rate": 7.222123412040605e-06, + "loss": 0.5018, + "step": 8469 + }, + { + "epoch": 1.102694260054666, + "grad_norm": 2.831430196762085, + "learning_rate": 7.220251101941966e-06, + "loss": 0.5194, + "step": 8472 + }, + { + "epoch": 1.1030847325263569, + "grad_norm": 2.5068278312683105, + "learning_rate": 7.21837840397013e-06, + "loss": 0.5053, + "step": 8475 + }, + { + "epoch": 1.1034752049980476, + "grad_norm": 4.6476898193359375, + "learning_rate": 7.216505318452254e-06, + "loss": 0.4749, + "step": 8478 + }, + { + "epoch": 1.1038656774697384, + "grad_norm": 2.4245364665985107, + "learning_rate": 7.214631845715563e-06, + "loss": 0.6265, + "step": 8481 + }, + { + "epoch": 1.1042561499414292, + "grad_norm": 2.79682993888855, + "learning_rate": 7.212757986087346e-06, + "loss": 0.5395, + "step": 8484 + }, + { + "epoch": 1.10464662241312, + "grad_norm": 4.285904407501221, + "learning_rate": 7.210883739894963e-06, + "loss": 0.4866, + "step": 8487 + }, + { + "epoch": 1.1050370948848107, + "grad_norm": 2.6022517681121826, + "learning_rate": 7.209009107465843e-06, + "loss": 0.4493, + "step": 8490 + }, + { + "epoch": 1.1054275673565013, + "grad_norm": 2.655932664871216, + "learning_rate": 7.207134089127479e-06, + "loss": 0.4609, + "step": 8493 + }, + { + "epoch": 1.105818039828192, + "grad_norm": 2.3036811351776123, + "learning_rate": 7.205258685207433e-06, + "loss": 0.5362, + "step": 8496 + }, + { + "epoch": 1.1062085122998828, + "grad_norm": 3.0361623764038086, + 
"learning_rate": 7.203382896033332e-06, + "loss": 0.5391, + "step": 8499 + }, + { + "epoch": 1.1065989847715736, + "grad_norm": 3.3767175674438477, + "learning_rate": 7.201506721932876e-06, + "loss": 0.5089, + "step": 8502 + }, + { + "epoch": 1.1069894572432644, + "grad_norm": 2.5257315635681152, + "learning_rate": 7.199630163233828e-06, + "loss": 0.475, + "step": 8505 + }, + { + "epoch": 1.1073799297149551, + "grad_norm": 2.7609949111938477, + "learning_rate": 7.197753220264017e-06, + "loss": 0.6277, + "step": 8508 + }, + { + "epoch": 1.107770402186646, + "grad_norm": 2.56369686126709, + "learning_rate": 7.1958758933513405e-06, + "loss": 0.4942, + "step": 8511 + }, + { + "epoch": 1.1081608746583367, + "grad_norm": 2.836712121963501, + "learning_rate": 7.1939981828237646e-06, + "loss": 0.4874, + "step": 8514 + }, + { + "epoch": 1.1085513471300272, + "grad_norm": 2.9525368213653564, + "learning_rate": 7.192120089009322e-06, + "loss": 0.5098, + "step": 8517 + }, + { + "epoch": 1.108941819601718, + "grad_norm": 2.7775232791900635, + "learning_rate": 7.190241612236113e-06, + "loss": 0.5331, + "step": 8520 + }, + { + "epoch": 1.1093322920734088, + "grad_norm": 4.186012268066406, + "learning_rate": 7.188362752832302e-06, + "loss": 0.5345, + "step": 8523 + }, + { + "epoch": 1.1097227645450995, + "grad_norm": 2.920480728149414, + "learning_rate": 7.18648351112612e-06, + "loss": 0.5221, + "step": 8526 + }, + { + "epoch": 1.1101132370167903, + "grad_norm": 2.599139928817749, + "learning_rate": 7.184603887445869e-06, + "loss": 0.5322, + "step": 8529 + }, + { + "epoch": 1.110503709488481, + "grad_norm": 2.4106132984161377, + "learning_rate": 7.182723882119915e-06, + "loss": 0.4708, + "step": 8532 + }, + { + "epoch": 1.1108941819601719, + "grad_norm": 2.7855916023254395, + "learning_rate": 7.18084349547669e-06, + "loss": 0.5707, + "step": 8535 + }, + { + "epoch": 1.1112846544318626, + "grad_norm": 2.641810655593872, + "learning_rate": 7.178962727844694e-06, + "loss": 0.5432, + "step": 8538 + }, + { + "epoch": 1.1116751269035534, + "grad_norm": 2.4662587642669678, + "learning_rate": 7.177081579552494e-06, + "loss": 0.5, + "step": 8541 + }, + { + "epoch": 1.112065599375244, + "grad_norm": 3.3329203128814697, + "learning_rate": 7.175200050928722e-06, + "loss": 0.4517, + "step": 8544 + }, + { + "epoch": 1.1124560718469347, + "grad_norm": 2.6928961277008057, + "learning_rate": 7.173318142302077e-06, + "loss": 0.4792, + "step": 8547 + }, + { + "epoch": 1.1128465443186255, + "grad_norm": 3.24125075340271, + "learning_rate": 7.171435854001324e-06, + "loss": 0.5087, + "step": 8550 + }, + { + "epoch": 1.1132370167903163, + "grad_norm": 2.6062655448913574, + "learning_rate": 7.169553186355296e-06, + "loss": 0.5109, + "step": 8553 + }, + { + "epoch": 1.113627489262007, + "grad_norm": 3.467337131500244, + "learning_rate": 7.167670139692888e-06, + "loss": 0.5299, + "step": 8556 + }, + { + "epoch": 1.1140179617336978, + "grad_norm": 2.869731903076172, + "learning_rate": 7.16578671434307e-06, + "loss": 0.4565, + "step": 8559 + }, + { + "epoch": 1.1144084342053886, + "grad_norm": 2.4524435997009277, + "learning_rate": 7.163902910634868e-06, + "loss": 0.4392, + "step": 8562 + }, + { + "epoch": 1.1147989066770791, + "grad_norm": 4.4590067863464355, + "learning_rate": 7.16201872889738e-06, + "loss": 0.413, + "step": 8565 + }, + { + "epoch": 1.11518937914877, + "grad_norm": 2.6730213165283203, + "learning_rate": 7.160134169459768e-06, + "loss": 0.4561, + "step": 8568 + }, + { + "epoch": 1.1155798516204607, + "grad_norm": 
2.4888358116149902, + "learning_rate": 7.15824923265126e-06, + "loss": 0.5105, + "step": 8571 + }, + { + "epoch": 1.1159703240921515, + "grad_norm": 2.672909736633301, + "learning_rate": 7.156363918801152e-06, + "loss": 0.5565, + "step": 8574 + }, + { + "epoch": 1.1163607965638422, + "grad_norm": 2.708988666534424, + "learning_rate": 7.154478228238804e-06, + "loss": 0.556, + "step": 8577 + }, + { + "epoch": 1.116751269035533, + "grad_norm": 2.6112136840820312, + "learning_rate": 7.152592161293642e-06, + "loss": 0.5554, + "step": 8580 + }, + { + "epoch": 1.1171417415072238, + "grad_norm": 2.5211217403411865, + "learning_rate": 7.150705718295157e-06, + "loss": 0.4915, + "step": 8583 + }, + { + "epoch": 1.1175322139789146, + "grad_norm": 3.3289098739624023, + "learning_rate": 7.1488188995729095e-06, + "loss": 0.4083, + "step": 8586 + }, + { + "epoch": 1.1179226864506053, + "grad_norm": 2.5261623859405518, + "learning_rate": 7.146931705456518e-06, + "loss": 0.4479, + "step": 8589 + }, + { + "epoch": 1.1183131589222959, + "grad_norm": 2.5002522468566895, + "learning_rate": 7.145044136275675e-06, + "loss": 0.5424, + "step": 8592 + }, + { + "epoch": 1.1187036313939867, + "grad_norm": 2.2631165981292725, + "learning_rate": 7.143156192360135e-06, + "loss": 0.4406, + "step": 8595 + }, + { + "epoch": 1.1190941038656774, + "grad_norm": 2.8084895610809326, + "learning_rate": 7.141267874039715e-06, + "loss": 0.4845, + "step": 8598 + }, + { + "epoch": 1.1194845763373682, + "grad_norm": 2.751574993133545, + "learning_rate": 7.139379181644304e-06, + "loss": 0.4932, + "step": 8601 + }, + { + "epoch": 1.119875048809059, + "grad_norm": 2.6821987628936768, + "learning_rate": 7.137490115503848e-06, + "loss": 0.4911, + "step": 8604 + }, + { + "epoch": 1.1202655212807497, + "grad_norm": 2.521465301513672, + "learning_rate": 7.135600675948367e-06, + "loss": 0.5492, + "step": 8607 + }, + { + "epoch": 1.1206559937524405, + "grad_norm": 2.7268893718719482, + "learning_rate": 7.133710863307941e-06, + "loss": 0.5966, + "step": 8610 + }, + { + "epoch": 1.1210464662241313, + "grad_norm": 3.1640634536743164, + "learning_rate": 7.131820677912715e-06, + "loss": 0.5761, + "step": 8613 + }, + { + "epoch": 1.121436938695822, + "grad_norm": 2.8102428913116455, + "learning_rate": 7.129930120092902e-06, + "loss": 0.4964, + "step": 8616 + }, + { + "epoch": 1.1218274111675126, + "grad_norm": 2.8150181770324707, + "learning_rate": 7.128039190178776e-06, + "loss": 0.5629, + "step": 8619 + }, + { + "epoch": 1.1222178836392034, + "grad_norm": 2.691941976547241, + "learning_rate": 7.1261478885006815e-06, + "loss": 0.4933, + "step": 8622 + }, + { + "epoch": 1.1226083561108942, + "grad_norm": 3.071762800216675, + "learning_rate": 7.124256215389023e-06, + "loss": 0.5538, + "step": 8625 + }, + { + "epoch": 1.122998828582585, + "grad_norm": 3.8000946044921875, + "learning_rate": 7.122364171174273e-06, + "loss": 0.5261, + "step": 8628 + }, + { + "epoch": 1.1233893010542757, + "grad_norm": 2.708369493484497, + "learning_rate": 7.1204717561869684e-06, + "loss": 0.537, + "step": 8631 + }, + { + "epoch": 1.1237797735259665, + "grad_norm": 2.911895275115967, + "learning_rate": 7.118578970757707e-06, + "loss": 0.489, + "step": 8634 + }, + { + "epoch": 1.1241702459976572, + "grad_norm": 2.5384697914123535, + "learning_rate": 7.116685815217157e-06, + "loss": 0.4744, + "step": 8637 + }, + { + "epoch": 1.1245607184693478, + "grad_norm": 2.503662586212158, + "learning_rate": 7.114792289896046e-06, + "loss": 0.5045, + "step": 8640 + }, + { + "epoch": 
1.1249511909410386, + "grad_norm": 2.787290334701538, + "learning_rate": 7.112898395125174e-06, + "loss": 0.471, + "step": 8643 + }, + { + "epoch": 1.1253416634127293, + "grad_norm": 2.368600368499756, + "learning_rate": 7.111004131235396e-06, + "loss": 0.4786, + "step": 8646 + }, + { + "epoch": 1.1257321358844201, + "grad_norm": 2.707949638366699, + "learning_rate": 7.109109498557636e-06, + "loss": 0.5496, + "step": 8649 + }, + { + "epoch": 1.1261226083561109, + "grad_norm": 2.765071392059326, + "learning_rate": 7.107214497422885e-06, + "loss": 0.5355, + "step": 8652 + }, + { + "epoch": 1.1265130808278017, + "grad_norm": 3.1850650310516357, + "learning_rate": 7.105319128162194e-06, + "loss": 0.4455, + "step": 8655 + }, + { + "epoch": 1.1269035532994924, + "grad_norm": 2.8872485160827637, + "learning_rate": 7.10342339110668e-06, + "loss": 0.5017, + "step": 8658 + }, + { + "epoch": 1.1272940257711832, + "grad_norm": 3.188868761062622, + "learning_rate": 7.101527286587524e-06, + "loss": 0.52, + "step": 8661 + }, + { + "epoch": 1.127684498242874, + "grad_norm": 2.5735905170440674, + "learning_rate": 7.099630814935973e-06, + "loss": 0.513, + "step": 8664 + }, + { + "epoch": 1.1280749707145645, + "grad_norm": 2.725409507751465, + "learning_rate": 7.097733976483335e-06, + "loss": 0.5538, + "step": 8667 + }, + { + "epoch": 1.1284654431862553, + "grad_norm": 2.974015235900879, + "learning_rate": 7.095836771560984e-06, + "loss": 0.5158, + "step": 8670 + }, + { + "epoch": 1.128855915657946, + "grad_norm": 2.86106276512146, + "learning_rate": 7.093939200500359e-06, + "loss": 0.5367, + "step": 8673 + }, + { + "epoch": 1.1292463881296368, + "grad_norm": 3.0025503635406494, + "learning_rate": 7.092041263632961e-06, + "loss": 0.5411, + "step": 8676 + }, + { + "epoch": 1.1296368606013276, + "grad_norm": 3.5087170600891113, + "learning_rate": 7.090142961290354e-06, + "loss": 0.564, + "step": 8679 + }, + { + "epoch": 1.1300273330730184, + "grad_norm": 2.5758073329925537, + "learning_rate": 7.088244293804169e-06, + "loss": 0.4342, + "step": 8682 + }, + { + "epoch": 1.1304178055447092, + "grad_norm": 2.531656503677368, + "learning_rate": 7.086345261506098e-06, + "loss": 0.5625, + "step": 8685 + }, + { + "epoch": 1.1308082780164, + "grad_norm": 2.7261338233947754, + "learning_rate": 7.0844458647279e-06, + "loss": 0.4574, + "step": 8688 + }, + { + "epoch": 1.1311987504880907, + "grad_norm": 2.582547664642334, + "learning_rate": 7.082546103801394e-06, + "loss": 0.5195, + "step": 8691 + }, + { + "epoch": 1.1315892229597813, + "grad_norm": 2.5053184032440186, + "learning_rate": 7.080645979058466e-06, + "loss": 0.4558, + "step": 8694 + }, + { + "epoch": 1.131979695431472, + "grad_norm": 2.540330410003662, + "learning_rate": 7.0787454908310614e-06, + "loss": 0.5509, + "step": 8697 + }, + { + "epoch": 1.1323701679031628, + "grad_norm": 2.711557149887085, + "learning_rate": 7.076844639451193e-06, + "loss": 0.5265, + "step": 8700 + }, + { + "epoch": 1.1327606403748536, + "grad_norm": 2.436702013015747, + "learning_rate": 7.074943425250933e-06, + "loss": 0.4933, + "step": 8703 + }, + { + "epoch": 1.1331511128465444, + "grad_norm": 2.6545193195343018, + "learning_rate": 7.073041848562424e-06, + "loss": 0.4736, + "step": 8706 + }, + { + "epoch": 1.1335415853182351, + "grad_norm": 2.6401283740997314, + "learning_rate": 7.071139909717865e-06, + "loss": 0.5213, + "step": 8709 + }, + { + "epoch": 1.133932057789926, + "grad_norm": 2.585341453552246, + "learning_rate": 7.06923760904952e-06, + "loss": 0.5262, + "step": 8712 + }, 
+ { + "epoch": 1.1343225302616164, + "grad_norm": 2.6922566890716553, + "learning_rate": 7.067334946889718e-06, + "loss": 0.5317, + "step": 8715 + }, + { + "epoch": 1.1347130027333072, + "grad_norm": 2.583522081375122, + "learning_rate": 7.06543192357085e-06, + "loss": 0.4766, + "step": 8718 + }, + { + "epoch": 1.135103475204998, + "grad_norm": 2.4560515880584717, + "learning_rate": 7.06352853942537e-06, + "loss": 0.5026, + "step": 8721 + }, + { + "epoch": 1.1354939476766888, + "grad_norm": 2.9161624908447266, + "learning_rate": 7.061624794785795e-06, + "loss": 0.5901, + "step": 8724 + }, + { + "epoch": 1.1358844201483795, + "grad_norm": 2.5634748935699463, + "learning_rate": 7.059720689984705e-06, + "loss": 0.496, + "step": 8727 + }, + { + "epoch": 1.1362748926200703, + "grad_norm": 2.5747694969177246, + "learning_rate": 7.0578162253547445e-06, + "loss": 0.5148, + "step": 8730 + }, + { + "epoch": 1.136665365091761, + "grad_norm": 2.320279121398926, + "learning_rate": 7.055911401228618e-06, + "loss": 0.4465, + "step": 8733 + }, + { + "epoch": 1.1370558375634519, + "grad_norm": 2.7718331813812256, + "learning_rate": 7.054006217939093e-06, + "loss": 0.6083, + "step": 8736 + }, + { + "epoch": 1.1374463100351426, + "grad_norm": 2.2558786869049072, + "learning_rate": 7.052100675819006e-06, + "loss": 0.4439, + "step": 8739 + }, + { + "epoch": 1.1378367825068332, + "grad_norm": 2.8862807750701904, + "learning_rate": 7.050194775201246e-06, + "loss": 0.5045, + "step": 8742 + }, + { + "epoch": 1.138227254978524, + "grad_norm": 2.867145538330078, + "learning_rate": 7.048288516418772e-06, + "loss": 0.5259, + "step": 8745 + }, + { + "epoch": 1.1386177274502147, + "grad_norm": 2.575711965560913, + "learning_rate": 7.046381899804602e-06, + "loss": 0.562, + "step": 8748 + }, + { + "epoch": 1.1390081999219055, + "grad_norm": 2.9559128284454346, + "learning_rate": 7.044474925691821e-06, + "loss": 0.5471, + "step": 8751 + }, + { + "epoch": 1.1393986723935963, + "grad_norm": 2.526271343231201, + "learning_rate": 7.042567594413571e-06, + "loss": 0.5216, + "step": 8754 + }, + { + "epoch": 1.139789144865287, + "grad_norm": 2.697111129760742, + "learning_rate": 7.040659906303058e-06, + "loss": 0.4498, + "step": 8757 + }, + { + "epoch": 1.1401796173369778, + "grad_norm": 2.5852532386779785, + "learning_rate": 7.038751861693553e-06, + "loss": 0.4273, + "step": 8760 + }, + { + "epoch": 1.1405700898086686, + "grad_norm": 2.4026541709899902, + "learning_rate": 7.036843460918388e-06, + "loss": 0.4601, + "step": 8763 + }, + { + "epoch": 1.1409605622803594, + "grad_norm": 3.018265962600708, + "learning_rate": 7.034934704310954e-06, + "loss": 0.537, + "step": 8766 + }, + { + "epoch": 1.14135103475205, + "grad_norm": 2.6514596939086914, + "learning_rate": 7.0330255922047075e-06, + "loss": 0.4801, + "step": 8769 + }, + { + "epoch": 1.1417415072237407, + "grad_norm": 2.793213129043579, + "learning_rate": 7.031116124933167e-06, + "loss": 0.5271, + "step": 8772 + }, + { + "epoch": 1.1421319796954315, + "grad_norm": 3.0178632736206055, + "learning_rate": 7.029206302829914e-06, + "loss": 0.4884, + "step": 8775 + }, + { + "epoch": 1.1425224521671222, + "grad_norm": 2.857133388519287, + "learning_rate": 7.027296126228586e-06, + "loss": 0.5116, + "step": 8778 + }, + { + "epoch": 1.142912924638813, + "grad_norm": 2.7881019115448, + "learning_rate": 7.0253855954628925e-06, + "loss": 0.4901, + "step": 8781 + }, + { + "epoch": 1.1433033971105038, + "grad_norm": 2.7480220794677734, + "learning_rate": 7.023474710866595e-06, + "loss": 
0.4977, + "step": 8784 + }, + { + "epoch": 1.1436938695821945, + "grad_norm": 2.6863813400268555, + "learning_rate": 7.021563472773522e-06, + "loss": 0.476, + "step": 8787 + }, + { + "epoch": 1.144084342053885, + "grad_norm": 2.9496448040008545, + "learning_rate": 7.019651881517562e-06, + "loss": 0.4938, + "step": 8790 + }, + { + "epoch": 1.1444748145255759, + "grad_norm": 2.909560203552246, + "learning_rate": 7.017739937432668e-06, + "loss": 0.4599, + "step": 8793 + }, + { + "epoch": 1.1448652869972666, + "grad_norm": 3.5830419063568115, + "learning_rate": 7.015827640852852e-06, + "loss": 0.5422, + "step": 8796 + }, + { + "epoch": 1.1452557594689574, + "grad_norm": 3.0168848037719727, + "learning_rate": 7.013914992112187e-06, + "loss": 0.4376, + "step": 8799 + }, + { + "epoch": 1.1456462319406482, + "grad_norm": 2.4287290573120117, + "learning_rate": 7.0120019915448125e-06, + "loss": 0.4951, + "step": 8802 + }, + { + "epoch": 1.146036704412339, + "grad_norm": 2.5160810947418213, + "learning_rate": 7.01008863948492e-06, + "loss": 0.5767, + "step": 8805 + }, + { + "epoch": 1.1464271768840297, + "grad_norm": 2.722810745239258, + "learning_rate": 7.00817493626677e-06, + "loss": 0.5479, + "step": 8808 + }, + { + "epoch": 1.1468176493557205, + "grad_norm": 3.0669102668762207, + "learning_rate": 7.006260882224684e-06, + "loss": 0.6298, + "step": 8811 + }, + { + "epoch": 1.1472081218274113, + "grad_norm": 2.84295392036438, + "learning_rate": 7.004346477693042e-06, + "loss": 0.4986, + "step": 8814 + }, + { + "epoch": 1.1475985942991018, + "grad_norm": 2.5158627033233643, + "learning_rate": 7.0024317230062884e-06, + "loss": 0.5512, + "step": 8817 + }, + { + "epoch": 1.1479890667707926, + "grad_norm": 2.525624990463257, + "learning_rate": 7.0005166184989245e-06, + "loss": 0.4828, + "step": 8820 + }, + { + "epoch": 1.1483795392424834, + "grad_norm": 3.02347993850708, + "learning_rate": 6.9986011645055175e-06, + "loss": 0.483, + "step": 8823 + }, + { + "epoch": 1.1487700117141741, + "grad_norm": 3.2486398220062256, + "learning_rate": 6.99668536136069e-06, + "loss": 0.6375, + "step": 8826 + }, + { + "epoch": 1.149160484185865, + "grad_norm": 3.728907823562622, + "learning_rate": 6.9947692093991295e-06, + "loss": 0.5128, + "step": 8829 + }, + { + "epoch": 1.1495509566575557, + "grad_norm": 3.2733185291290283, + "learning_rate": 6.992852708955586e-06, + "loss": 0.5249, + "step": 8832 + }, + { + "epoch": 1.1499414291292465, + "grad_norm": 2.754006862640381, + "learning_rate": 6.990935860364865e-06, + "loss": 0.5313, + "step": 8835 + }, + { + "epoch": 1.150331901600937, + "grad_norm": 2.4702494144439697, + "learning_rate": 6.989018663961838e-06, + "loss": 0.4722, + "step": 8838 + }, + { + "epoch": 1.1507223740726278, + "grad_norm": 2.463707208633423, + "learning_rate": 6.987101120081436e-06, + "loss": 0.4612, + "step": 8841 + }, + { + "epoch": 1.1511128465443186, + "grad_norm": 2.601174831390381, + "learning_rate": 6.9851832290586465e-06, + "loss": 0.4494, + "step": 8844 + }, + { + "epoch": 1.1515033190160093, + "grad_norm": 2.667945623397827, + "learning_rate": 6.983264991228525e-06, + "loss": 0.4577, + "step": 8847 + }, + { + "epoch": 1.1518937914877, + "grad_norm": 2.5175373554229736, + "learning_rate": 6.981346406926179e-06, + "loss": 0.5165, + "step": 8850 + }, + { + "epoch": 1.1522842639593909, + "grad_norm": 2.911750078201294, + "learning_rate": 6.979427476486786e-06, + "loss": 0.5315, + "step": 8853 + }, + { + "epoch": 1.1526747364310816, + "grad_norm": 2.7814128398895264, + "learning_rate": 
6.9775082002455775e-06, + "loss": 0.4972, + "step": 8856 + }, + { + "epoch": 1.1530652089027724, + "grad_norm": 2.6590986251831055, + "learning_rate": 6.975588578537846e-06, + "loss": 0.553, + "step": 8859 + }, + { + "epoch": 1.1534556813744632, + "grad_norm": 2.5675268173217773, + "learning_rate": 6.973668611698945e-06, + "loss": 0.4934, + "step": 8862 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 2.761390447616577, + "learning_rate": 6.971748300064291e-06, + "loss": 0.6005, + "step": 8865 + }, + { + "epoch": 1.1542366263178445, + "grad_norm": 2.5827791690826416, + "learning_rate": 6.969827643969356e-06, + "loss": 0.4941, + "step": 8868 + }, + { + "epoch": 1.1546270987895353, + "grad_norm": 2.479886293411255, + "learning_rate": 6.9679066437496744e-06, + "loss": 0.4605, + "step": 8871 + }, + { + "epoch": 1.155017571261226, + "grad_norm": 2.5713136196136475, + "learning_rate": 6.965985299740844e-06, + "loss": 0.567, + "step": 8874 + }, + { + "epoch": 1.1554080437329168, + "grad_norm": 2.7975733280181885, + "learning_rate": 6.964063612278517e-06, + "loss": 0.5209, + "step": 8877 + }, + { + "epoch": 1.1557985162046076, + "grad_norm": 2.6016461849212646, + "learning_rate": 6.962141581698408e-06, + "loss": 0.4932, + "step": 8880 + }, + { + "epoch": 1.1561889886762984, + "grad_norm": 3.229020357131958, + "learning_rate": 6.960219208336293e-06, + "loss": 0.5739, + "step": 8883 + }, + { + "epoch": 1.1565794611479892, + "grad_norm": 2.7121598720550537, + "learning_rate": 6.958296492528005e-06, + "loss": 0.5604, + "step": 8886 + }, + { + "epoch": 1.15696993361968, + "grad_norm": 3.215475082397461, + "learning_rate": 6.956373434609441e-06, + "loss": 0.5234, + "step": 8889 + }, + { + "epoch": 1.1573604060913705, + "grad_norm": 2.619798421859741, + "learning_rate": 6.954450034916552e-06, + "loss": 0.4628, + "step": 8892 + }, + { + "epoch": 1.1577508785630612, + "grad_norm": 2.3444454669952393, + "learning_rate": 6.952526293785356e-06, + "loss": 0.4755, + "step": 8895 + }, + { + "epoch": 1.158141351034752, + "grad_norm": 2.6880340576171875, + "learning_rate": 6.950602211551921e-06, + "loss": 0.4455, + "step": 8898 + }, + { + "epoch": 1.1585318235064428, + "grad_norm": 2.406294345855713, + "learning_rate": 6.948677788552384e-06, + "loss": 0.495, + "step": 8901 + }, + { + "epoch": 1.1589222959781336, + "grad_norm": 3.347994327545166, + "learning_rate": 6.946753025122938e-06, + "loss": 0.4894, + "step": 8904 + }, + { + "epoch": 1.1593127684498243, + "grad_norm": 2.817830801010132, + "learning_rate": 6.944827921599832e-06, + "loss": 0.5655, + "step": 8907 + }, + { + "epoch": 1.159703240921515, + "grad_norm": 3.159911870956421, + "learning_rate": 6.942902478319382e-06, + "loss": 0.5178, + "step": 8910 + }, + { + "epoch": 1.1600937133932057, + "grad_norm": 2.5966691970825195, + "learning_rate": 6.940976695617954e-06, + "loss": 0.5147, + "step": 8913 + }, + { + "epoch": 1.1604841858648964, + "grad_norm": 2.7088546752929688, + "learning_rate": 6.9390505738319815e-06, + "loss": 0.472, + "step": 8916 + }, + { + "epoch": 1.1608746583365872, + "grad_norm": 2.583479404449463, + "learning_rate": 6.937124113297953e-06, + "loss": 0.4644, + "step": 8919 + }, + { + "epoch": 1.161265130808278, + "grad_norm": 3.7508015632629395, + "learning_rate": 6.935197314352415e-06, + "loss": 0.502, + "step": 8922 + }, + { + "epoch": 1.1616556032799688, + "grad_norm": 2.6737987995147705, + "learning_rate": 6.93327017733198e-06, + "loss": 0.5079, + "step": 8925 + }, + { + "epoch": 1.1620460757516595, + "grad_norm": 
2.8131909370422363, + "learning_rate": 6.93134270257331e-06, + "loss": 0.5143, + "step": 8928 + }, + { + "epoch": 1.1624365482233503, + "grad_norm": 3.131113290786743, + "learning_rate": 6.929414890413135e-06, + "loss": 0.4909, + "step": 8931 + }, + { + "epoch": 1.162827020695041, + "grad_norm": 2.632091522216797, + "learning_rate": 6.9274867411882355e-06, + "loss": 0.4803, + "step": 8934 + }, + { + "epoch": 1.1632174931667318, + "grad_norm": 3.4867727756500244, + "learning_rate": 6.925558255235458e-06, + "loss": 0.5363, + "step": 8937 + }, + { + "epoch": 1.1636079656384224, + "grad_norm": 2.5351741313934326, + "learning_rate": 6.923629432891704e-06, + "loss": 0.465, + "step": 8940 + }, + { + "epoch": 1.1639984381101132, + "grad_norm": 2.7857537269592285, + "learning_rate": 6.921700274493935e-06, + "loss": 0.5723, + "step": 8943 + }, + { + "epoch": 1.164388910581804, + "grad_norm": 2.703540802001953, + "learning_rate": 6.9197707803791714e-06, + "loss": 0.5018, + "step": 8946 + }, + { + "epoch": 1.1647793830534947, + "grad_norm": 2.7754833698272705, + "learning_rate": 6.917840950884489e-06, + "loss": 0.4933, + "step": 8949 + }, + { + "epoch": 1.1651698555251855, + "grad_norm": 2.64084792137146, + "learning_rate": 6.915910786347029e-06, + "loss": 0.5401, + "step": 8952 + }, + { + "epoch": 1.1655603279968763, + "grad_norm": 2.382725715637207, + "learning_rate": 6.913980287103984e-06, + "loss": 0.4424, + "step": 8955 + }, + { + "epoch": 1.165950800468567, + "grad_norm": 2.396775245666504, + "learning_rate": 6.912049453492609e-06, + "loss": 0.4835, + "step": 8958 + }, + { + "epoch": 1.1663412729402578, + "grad_norm": 2.7328484058380127, + "learning_rate": 6.910118285850218e-06, + "loss": 0.5132, + "step": 8961 + }, + { + "epoch": 1.1667317454119486, + "grad_norm": 2.620654821395874, + "learning_rate": 6.90818678451418e-06, + "loss": 0.4648, + "step": 8964 + }, + { + "epoch": 1.1671222178836391, + "grad_norm": 3.5922820568084717, + "learning_rate": 6.906254949821926e-06, + "loss": 0.5845, + "step": 8967 + }, + { + "epoch": 1.16751269035533, + "grad_norm": 2.563399314880371, + "learning_rate": 6.904322782110942e-06, + "loss": 0.595, + "step": 8970 + }, + { + "epoch": 1.1679031628270207, + "grad_norm": 3.2699246406555176, + "learning_rate": 6.9023902817187735e-06, + "loss": 0.4712, + "step": 8973 + }, + { + "epoch": 1.1682936352987114, + "grad_norm": 2.6587018966674805, + "learning_rate": 6.900457448983024e-06, + "loss": 0.5394, + "step": 8976 + }, + { + "epoch": 1.1686841077704022, + "grad_norm": 3.6553778648376465, + "learning_rate": 6.898524284241357e-06, + "loss": 0.5078, + "step": 8979 + }, + { + "epoch": 1.169074580242093, + "grad_norm": 2.561161994934082, + "learning_rate": 6.896590787831493e-06, + "loss": 0.5351, + "step": 8982 + }, + { + "epoch": 1.1694650527137838, + "grad_norm": 3.0311195850372314, + "learning_rate": 6.894656960091206e-06, + "loss": 0.5395, + "step": 8985 + }, + { + "epoch": 1.1698555251854743, + "grad_norm": 2.600670337677002, + "learning_rate": 6.892722801358336e-06, + "loss": 0.4744, + "step": 8988 + }, + { + "epoch": 1.170245997657165, + "grad_norm": 2.8165881633758545, + "learning_rate": 6.890788311970773e-06, + "loss": 0.5708, + "step": 8991 + }, + { + "epoch": 1.1706364701288559, + "grad_norm": 3.0961506366729736, + "learning_rate": 6.888853492266469e-06, + "loss": 0.4359, + "step": 8994 + }, + { + "epoch": 1.1710269426005466, + "grad_norm": 2.740213394165039, + "learning_rate": 6.886918342583433e-06, + "loss": 0.4675, + "step": 8997 + }, + { + "epoch": 
1.1714174150722374, + "grad_norm": 2.547696828842163, + "learning_rate": 6.884982863259734e-06, + "loss": 0.544, + "step": 9000 + }, + { + "epoch": 1.1718078875439282, + "grad_norm": 2.924173355102539, + "learning_rate": 6.883047054633494e-06, + "loss": 0.4527, + "step": 9003 + }, + { + "epoch": 1.172198360015619, + "grad_norm": 2.757521629333496, + "learning_rate": 6.8811109170428935e-06, + "loss": 0.4779, + "step": 9006 + }, + { + "epoch": 1.1725888324873097, + "grad_norm": 2.3954832553863525, + "learning_rate": 6.8791744508261735e-06, + "loss": 0.4461, + "step": 9009 + }, + { + "epoch": 1.1729793049590005, + "grad_norm": 2.38798189163208, + "learning_rate": 6.877237656321631e-06, + "loss": 0.5077, + "step": 9012 + }, + { + "epoch": 1.173369777430691, + "grad_norm": 2.7606191635131836, + "learning_rate": 6.875300533867619e-06, + "loss": 0.5737, + "step": 9015 + }, + { + "epoch": 1.1737602499023818, + "grad_norm": 2.400142192840576, + "learning_rate": 6.873363083802547e-06, + "loss": 0.5388, + "step": 9018 + }, + { + "epoch": 1.1741507223740726, + "grad_norm": 3.098991632461548, + "learning_rate": 6.8714253064648865e-06, + "loss": 0.5075, + "step": 9021 + }, + { + "epoch": 1.1745411948457634, + "grad_norm": 2.514508008956909, + "learning_rate": 6.8694872021931625e-06, + "loss": 0.5503, + "step": 9024 + }, + { + "epoch": 1.1749316673174541, + "grad_norm": 2.7697696685791016, + "learning_rate": 6.867548771325956e-06, + "loss": 0.5224, + "step": 9027 + }, + { + "epoch": 1.175322139789145, + "grad_norm": 2.679353952407837, + "learning_rate": 6.865610014201909e-06, + "loss": 0.5027, + "step": 9030 + }, + { + "epoch": 1.1757126122608357, + "grad_norm": 2.770353317260742, + "learning_rate": 6.863670931159716e-06, + "loss": 0.4884, + "step": 9033 + }, + { + "epoch": 1.1761030847325264, + "grad_norm": 2.543590784072876, + "learning_rate": 6.861731522538133e-06, + "loss": 0.4744, + "step": 9036 + }, + { + "epoch": 1.1764935572042172, + "grad_norm": 3.0317022800445557, + "learning_rate": 6.859791788675969e-06, + "loss": 0.5167, + "step": 9039 + }, + { + "epoch": 1.1768840296759078, + "grad_norm": 2.7118606567382812, + "learning_rate": 6.8578517299120916e-06, + "loss": 0.5217, + "step": 9042 + }, + { + "epoch": 1.1772745021475985, + "grad_norm": 2.709425449371338, + "learning_rate": 6.855911346585427e-06, + "loss": 0.5527, + "step": 9045 + }, + { + "epoch": 1.1776649746192893, + "grad_norm": 2.604459762573242, + "learning_rate": 6.853970639034953e-06, + "loss": 0.4463, + "step": 9048 + }, + { + "epoch": 1.17805544709098, + "grad_norm": 2.796820878982544, + "learning_rate": 6.852029607599707e-06, + "loss": 0.5053, + "step": 9051 + }, + { + "epoch": 1.1784459195626709, + "grad_norm": 3.0297458171844482, + "learning_rate": 6.850088252618787e-06, + "loss": 0.5075, + "step": 9054 + }, + { + "epoch": 1.1788363920343616, + "grad_norm": 2.483165979385376, + "learning_rate": 6.84814657443134e-06, + "loss": 0.4528, + "step": 9057 + }, + { + "epoch": 1.1792268645060524, + "grad_norm": 2.606029987335205, + "learning_rate": 6.846204573376576e-06, + "loss": 0.4552, + "step": 9060 + }, + { + "epoch": 1.179617336977743, + "grad_norm": 2.4798717498779297, + "learning_rate": 6.844262249793755e-06, + "loss": 0.514, + "step": 9063 + }, + { + "epoch": 1.1800078094494337, + "grad_norm": 2.535163164138794, + "learning_rate": 6.842319604022201e-06, + "loss": 0.5319, + "step": 9066 + }, + { + "epoch": 1.1803982819211245, + "grad_norm": 2.676571846008301, + "learning_rate": 6.840376636401285e-06, + "loss": 0.4704, + "step": 
9069 + }, + { + "epoch": 1.1807887543928153, + "grad_norm": 2.842242956161499, + "learning_rate": 6.838433347270444e-06, + "loss": 0.496, + "step": 9072 + }, + { + "epoch": 1.181179226864506, + "grad_norm": 2.5695507526397705, + "learning_rate": 6.8364897369691655e-06, + "loss": 0.5432, + "step": 9075 + }, + { + "epoch": 1.1815696993361968, + "grad_norm": 2.565473794937134, + "learning_rate": 6.834545805836992e-06, + "loss": 0.4648, + "step": 9078 + }, + { + "epoch": 1.1819601718078876, + "grad_norm": 2.2036924362182617, + "learning_rate": 6.832601554213525e-06, + "loss": 0.4596, + "step": 9081 + }, + { + "epoch": 1.1823506442795784, + "grad_norm": 2.521524429321289, + "learning_rate": 6.830656982438421e-06, + "loss": 0.4932, + "step": 9084 + }, + { + "epoch": 1.1827411167512691, + "grad_norm": 2.6289567947387695, + "learning_rate": 6.828712090851395e-06, + "loss": 0.5188, + "step": 9087 + }, + { + "epoch": 1.1831315892229597, + "grad_norm": 2.556784152984619, + "learning_rate": 6.826766879792215e-06, + "loss": 0.5411, + "step": 9090 + }, + { + "epoch": 1.1835220616946505, + "grad_norm": 3.470583915710449, + "learning_rate": 6.824821349600702e-06, + "loss": 0.5407, + "step": 9093 + }, + { + "epoch": 1.1839125341663412, + "grad_norm": 2.5393197536468506, + "learning_rate": 6.822875500616739e-06, + "loss": 0.4989, + "step": 9096 + }, + { + "epoch": 1.184303006638032, + "grad_norm": 2.5908868312835693, + "learning_rate": 6.82092933318026e-06, + "loss": 0.4926, + "step": 9099 + }, + { + "epoch": 1.1846934791097228, + "grad_norm": 3.1211929321289062, + "learning_rate": 6.818982847631258e-06, + "loss": 0.517, + "step": 9102 + }, + { + "epoch": 1.1850839515814136, + "grad_norm": 2.8722972869873047, + "learning_rate": 6.8170360443097794e-06, + "loss": 0.5064, + "step": 9105 + }, + { + "epoch": 1.1854744240531043, + "grad_norm": 2.619140386581421, + "learning_rate": 6.815088923555925e-06, + "loss": 0.4872, + "step": 9108 + }, + { + "epoch": 1.185864896524795, + "grad_norm": 2.4975740909576416, + "learning_rate": 6.813141485709856e-06, + "loss": 0.4782, + "step": 9111 + }, + { + "epoch": 1.1862553689964859, + "grad_norm": 2.6161115169525146, + "learning_rate": 6.811193731111782e-06, + "loss": 0.4789, + "step": 9114 + }, + { + "epoch": 1.1866458414681764, + "grad_norm": 3.224074602127075, + "learning_rate": 6.809245660101974e-06, + "loss": 0.5793, + "step": 9117 + }, + { + "epoch": 1.1870363139398672, + "grad_norm": 2.5327675342559814, + "learning_rate": 6.8072972730207555e-06, + "loss": 0.4145, + "step": 9120 + }, + { + "epoch": 1.187426786411558, + "grad_norm": 2.6852951049804688, + "learning_rate": 6.8053485702085045e-06, + "loss": 0.4999, + "step": 9123 + }, + { + "epoch": 1.1878172588832487, + "grad_norm": 2.3816871643066406, + "learning_rate": 6.8033995520056565e-06, + "loss": 0.5238, + "step": 9126 + }, + { + "epoch": 1.1882077313549395, + "grad_norm": 2.494142770767212, + "learning_rate": 6.801450218752701e-06, + "loss": 0.4763, + "step": 9129 + }, + { + "epoch": 1.1885982038266303, + "grad_norm": 2.643019676208496, + "learning_rate": 6.799500570790182e-06, + "loss": 0.5655, + "step": 9132 + }, + { + "epoch": 1.188988676298321, + "grad_norm": 2.7931339740753174, + "learning_rate": 6.797550608458698e-06, + "loss": 0.5176, + "step": 9135 + }, + { + "epoch": 1.1893791487700116, + "grad_norm": 2.6419882774353027, + "learning_rate": 6.795600332098905e-06, + "loss": 0.4891, + "step": 9138 + }, + { + "epoch": 1.1897696212417024, + "grad_norm": 2.681978940963745, + "learning_rate": 
6.793649742051511e-06, + "loss": 0.4745, + "step": 9141 + }, + { + "epoch": 1.1901600937133932, + "grad_norm": 2.605980396270752, + "learning_rate": 6.7916988386572806e-06, + "loss": 0.4831, + "step": 9144 + }, + { + "epoch": 1.190550566185084, + "grad_norm": 2.5457866191864014, + "learning_rate": 6.789747622257033e-06, + "loss": 0.4513, + "step": 9147 + }, + { + "epoch": 1.1909410386567747, + "grad_norm": 2.7378089427948, + "learning_rate": 6.787796093191638e-06, + "loss": 0.5176, + "step": 9150 + }, + { + "epoch": 1.1913315111284655, + "grad_norm": 2.543762683868408, + "learning_rate": 6.785844251802031e-06, + "loss": 0.5133, + "step": 9153 + }, + { + "epoch": 1.1917219836001562, + "grad_norm": 2.86997652053833, + "learning_rate": 6.783892098429187e-06, + "loss": 0.5483, + "step": 9156 + }, + { + "epoch": 1.192112456071847, + "grad_norm": 3.302907943725586, + "learning_rate": 6.781939633414146e-06, + "loss": 0.5072, + "step": 9159 + }, + { + "epoch": 1.1925029285435378, + "grad_norm": 2.7741358280181885, + "learning_rate": 6.779986857098002e-06, + "loss": 0.4742, + "step": 9162 + }, + { + "epoch": 1.1928934010152283, + "grad_norm": 2.5811760425567627, + "learning_rate": 6.778033769821896e-06, + "loss": 0.5102, + "step": 9165 + }, + { + "epoch": 1.1932838734869191, + "grad_norm": 4.686474800109863, + "learning_rate": 6.776080371927033e-06, + "loss": 0.4987, + "step": 9168 + }, + { + "epoch": 1.1936743459586099, + "grad_norm": 3.189173460006714, + "learning_rate": 6.774126663754663e-06, + "loss": 0.464, + "step": 9171 + }, + { + "epoch": 1.1940648184303007, + "grad_norm": 2.439980983734131, + "learning_rate": 6.7721726456461e-06, + "loss": 0.447, + "step": 9174 + }, + { + "epoch": 1.1944552909019914, + "grad_norm": 2.823490858078003, + "learning_rate": 6.770218317942701e-06, + "loss": 0.4427, + "step": 9177 + }, + { + "epoch": 1.1948457633736822, + "grad_norm": 2.5944924354553223, + "learning_rate": 6.768263680985888e-06, + "loss": 0.4201, + "step": 9180 + }, + { + "epoch": 1.195236235845373, + "grad_norm": 3.171487808227539, + "learning_rate": 6.766308735117129e-06, + "loss": 0.615, + "step": 9183 + }, + { + "epoch": 1.1956267083170635, + "grad_norm": 2.696826219558716, + "learning_rate": 6.764353480677949e-06, + "loss": 0.5554, + "step": 9186 + }, + { + "epoch": 1.1960171807887543, + "grad_norm": 2.919696569442749, + "learning_rate": 6.762397918009929e-06, + "loss": 0.5412, + "step": 9189 + }, + { + "epoch": 1.196407653260445, + "grad_norm": 2.540598154067993, + "learning_rate": 6.760442047454699e-06, + "loss": 0.5434, + "step": 9192 + }, + { + "epoch": 1.1967981257321358, + "grad_norm": 2.6347830295562744, + "learning_rate": 6.758485869353948e-06, + "loss": 0.4581, + "step": 9195 + }, + { + "epoch": 1.1971885982038266, + "grad_norm": 2.45042085647583, + "learning_rate": 6.756529384049415e-06, + "loss": 0.5198, + "step": 9198 + }, + { + "epoch": 1.1975790706755174, + "grad_norm": 3.13641357421875, + "learning_rate": 6.754572591882892e-06, + "loss": 0.46, + "step": 9201 + }, + { + "epoch": 1.1979695431472082, + "grad_norm": 3.144970417022705, + "learning_rate": 6.752615493196231e-06, + "loss": 0.4713, + "step": 9204 + }, + { + "epoch": 1.198360015618899, + "grad_norm": 2.6827385425567627, + "learning_rate": 6.750658088331326e-06, + "loss": 0.5368, + "step": 9207 + }, + { + "epoch": 1.1987504880905897, + "grad_norm": 2.579334259033203, + "learning_rate": 6.7487003776301394e-06, + "loss": 0.5117, + "step": 9210 + }, + { + "epoch": 1.1991409605622803, + "grad_norm": 2.422724962234497, + 
"learning_rate": 6.746742361434675e-06, + "loss": 0.3792, + "step": 9213 + }, + { + "epoch": 1.199531433033971, + "grad_norm": 2.511230230331421, + "learning_rate": 6.744784040086994e-06, + "loss": 0.5195, + "step": 9216 + }, + { + "epoch": 1.1999219055056618, + "grad_norm": 2.661550283432007, + "learning_rate": 6.742825413929213e-06, + "loss": 0.4506, + "step": 9219 + }, + { + "epoch": 1.2003123779773526, + "grad_norm": 2.8288450241088867, + "learning_rate": 6.740866483303497e-06, + "loss": 0.5301, + "step": 9222 + }, + { + "epoch": 1.2007028504490433, + "grad_norm": 2.494079351425171, + "learning_rate": 6.73890724855207e-06, + "loss": 0.4576, + "step": 9225 + }, + { + "epoch": 1.2010933229207341, + "grad_norm": 2.7465291023254395, + "learning_rate": 6.736947710017202e-06, + "loss": 0.4389, + "step": 9228 + }, + { + "epoch": 1.201483795392425, + "grad_norm": 3.119746208190918, + "learning_rate": 6.734987868041226e-06, + "loss": 0.519, + "step": 9231 + }, + { + "epoch": 1.2018742678641157, + "grad_norm": 2.939565896987915, + "learning_rate": 6.733027722966519e-06, + "loss": 0.5981, + "step": 9234 + }, + { + "epoch": 1.2022647403358064, + "grad_norm": 2.5819921493530273, + "learning_rate": 6.731067275135512e-06, + "loss": 0.5423, + "step": 9237 + }, + { + "epoch": 1.202655212807497, + "grad_norm": 2.769885301589966, + "learning_rate": 6.7291065248906975e-06, + "loss": 0.481, + "step": 9240 + }, + { + "epoch": 1.2030456852791878, + "grad_norm": 2.8047449588775635, + "learning_rate": 6.727145472574608e-06, + "loss": 0.4647, + "step": 9243 + }, + { + "epoch": 1.2034361577508785, + "grad_norm": 2.8334429264068604, + "learning_rate": 6.725184118529839e-06, + "loss": 0.525, + "step": 9246 + }, + { + "epoch": 1.2038266302225693, + "grad_norm": 2.9303195476531982, + "learning_rate": 6.723222463099033e-06, + "loss": 0.4845, + "step": 9249 + }, + { + "epoch": 1.20421710269426, + "grad_norm": 2.522890567779541, + "learning_rate": 6.721260506624888e-06, + "loss": 0.423, + "step": 9252 + }, + { + "epoch": 1.2046075751659509, + "grad_norm": 3.1758248805999756, + "learning_rate": 6.719298249450153e-06, + "loss": 0.4912, + "step": 9255 + }, + { + "epoch": 1.2049980476376416, + "grad_norm": 2.49548602104187, + "learning_rate": 6.7173356919176315e-06, + "loss": 0.4624, + "step": 9258 + }, + { + "epoch": 1.2053885201093322, + "grad_norm": 2.884762763977051, + "learning_rate": 6.7153728343701776e-06, + "loss": 0.4791, + "step": 9261 + }, + { + "epoch": 1.205778992581023, + "grad_norm": 2.4931530952453613, + "learning_rate": 6.7134096771506976e-06, + "loss": 0.542, + "step": 9264 + }, + { + "epoch": 1.2061694650527137, + "grad_norm": 2.408496618270874, + "learning_rate": 6.711446220602152e-06, + "loss": 0.4824, + "step": 9267 + }, + { + "epoch": 1.2065599375244045, + "grad_norm": 2.6510279178619385, + "learning_rate": 6.70948246506755e-06, + "loss": 0.5171, + "step": 9270 + }, + { + "epoch": 1.2069504099960953, + "grad_norm": 2.6602025032043457, + "learning_rate": 6.707518410889959e-06, + "loss": 0.5202, + "step": 9273 + }, + { + "epoch": 1.207340882467786, + "grad_norm": 3.2938411235809326, + "learning_rate": 6.7055540584124955e-06, + "loss": 0.4416, + "step": 9276 + }, + { + "epoch": 1.2077313549394768, + "grad_norm": 2.6444809436798096, + "learning_rate": 6.703589407978324e-06, + "loss": 0.5285, + "step": 9279 + }, + { + "epoch": 1.2081218274111676, + "grad_norm": 2.7918782234191895, + "learning_rate": 6.7016244599306675e-06, + "loss": 0.5213, + "step": 9282 + }, + { + "epoch": 1.2085122998828584, + 
"grad_norm": 2.3297805786132812, + "learning_rate": 6.699659214612797e-06, + "loss": 0.4901, + "step": 9285 + }, + { + "epoch": 1.208902772354549, + "grad_norm": 2.6607468128204346, + "learning_rate": 6.697693672368038e-06, + "loss": 0.518, + "step": 9288 + }, + { + "epoch": 1.2092932448262397, + "grad_norm": 2.5616703033447266, + "learning_rate": 6.695727833539765e-06, + "loss": 0.47, + "step": 9291 + }, + { + "epoch": 1.2096837172979304, + "grad_norm": 3.2793567180633545, + "learning_rate": 6.693761698471406e-06, + "loss": 0.552, + "step": 9294 + }, + { + "epoch": 1.2100741897696212, + "grad_norm": 2.4998676776885986, + "learning_rate": 6.6917952675064435e-06, + "loss": 0.5091, + "step": 9297 + }, + { + "epoch": 1.210464662241312, + "grad_norm": 2.6549830436706543, + "learning_rate": 6.689828540988406e-06, + "loss": 0.5316, + "step": 9300 + }, + { + "epoch": 1.2108551347130028, + "grad_norm": 2.714661121368408, + "learning_rate": 6.687861519260877e-06, + "loss": 0.5968, + "step": 9303 + }, + { + "epoch": 1.2112456071846935, + "grad_norm": 4.035742282867432, + "learning_rate": 6.685894202667491e-06, + "loss": 0.4923, + "step": 9306 + }, + { + "epoch": 1.2116360796563843, + "grad_norm": 2.6630351543426514, + "learning_rate": 6.683926591551934e-06, + "loss": 0.5069, + "step": 9309 + }, + { + "epoch": 1.212026552128075, + "grad_norm": 2.704087257385254, + "learning_rate": 6.681958686257945e-06, + "loss": 0.5276, + "step": 9312 + }, + { + "epoch": 1.2124170245997656, + "grad_norm": 4.812976837158203, + "learning_rate": 6.679990487129311e-06, + "loss": 0.5277, + "step": 9315 + }, + { + "epoch": 1.2128074970714564, + "grad_norm": 2.845241069793701, + "learning_rate": 6.678021994509874e-06, + "loss": 0.5935, + "step": 9318 + }, + { + "epoch": 1.2131979695431472, + "grad_norm": 2.6971890926361084, + "learning_rate": 6.676053208743525e-06, + "loss": 0.5589, + "step": 9321 + }, + { + "epoch": 1.213588442014838, + "grad_norm": 2.571746349334717, + "learning_rate": 6.674084130174204e-06, + "loss": 0.4472, + "step": 9324 + }, + { + "epoch": 1.2139789144865287, + "grad_norm": 2.5498881340026855, + "learning_rate": 6.67211475914591e-06, + "loss": 0.496, + "step": 9327 + }, + { + "epoch": 1.2143693869582195, + "grad_norm": 2.6578900814056396, + "learning_rate": 6.670145096002683e-06, + "loss": 0.5228, + "step": 9330 + }, + { + "epoch": 1.2147598594299103, + "grad_norm": 2.7154641151428223, + "learning_rate": 6.668175141088622e-06, + "loss": 0.5132, + "step": 9333 + }, + { + "epoch": 1.2151503319016008, + "grad_norm": 2.644035577774048, + "learning_rate": 6.666204894747874e-06, + "loss": 0.5248, + "step": 9336 + }, + { + "epoch": 1.2155408043732916, + "grad_norm": 2.640990734100342, + "learning_rate": 6.664234357324636e-06, + "loss": 0.4759, + "step": 9339 + }, + { + "epoch": 1.2159312768449824, + "grad_norm": 2.595244884490967, + "learning_rate": 6.662263529163155e-06, + "loss": 0.5262, + "step": 9342 + }, + { + "epoch": 1.2163217493166731, + "grad_norm": 2.70097279548645, + "learning_rate": 6.660292410607734e-06, + "loss": 0.5814, + "step": 9345 + }, + { + "epoch": 1.216712221788364, + "grad_norm": 3.5157313346862793, + "learning_rate": 6.658321002002722e-06, + "loss": 0.508, + "step": 9348 + }, + { + "epoch": 1.2171026942600547, + "grad_norm": 2.607250213623047, + "learning_rate": 6.656349303692519e-06, + "loss": 0.501, + "step": 9351 + }, + { + "epoch": 1.2174931667317455, + "grad_norm": 3.2209982872009277, + "learning_rate": 6.654377316021576e-06, + "loss": 0.4115, + "step": 9354 + }, + { + "epoch": 
1.2178836392034362, + "grad_norm": 2.4936866760253906, + "learning_rate": 6.652405039334396e-06, + "loss": 0.4835, + "step": 9357 + }, + { + "epoch": 1.218274111675127, + "grad_norm": 2.4629316329956055, + "learning_rate": 6.650432473975534e-06, + "loss": 0.4565, + "step": 9360 + }, + { + "epoch": 1.2186645841468176, + "grad_norm": 3.328362464904785, + "learning_rate": 6.648459620289589e-06, + "loss": 0.5637, + "step": 9363 + }, + { + "epoch": 1.2190550566185083, + "grad_norm": 2.8752493858337402, + "learning_rate": 6.646486478621217e-06, + "loss": 0.4952, + "step": 9366 + }, + { + "epoch": 1.219445529090199, + "grad_norm": 2.651597738265991, + "learning_rate": 6.644513049315121e-06, + "loss": 0.4809, + "step": 9369 + }, + { + "epoch": 1.2198360015618899, + "grad_norm": 2.642346143722534, + "learning_rate": 6.642539332716055e-06, + "loss": 0.4713, + "step": 9372 + }, + { + "epoch": 1.2202264740335806, + "grad_norm": 2.764996290206909, + "learning_rate": 6.6405653291688225e-06, + "loss": 0.5554, + "step": 9375 + }, + { + "epoch": 1.2206169465052714, + "grad_norm": 2.660748243331909, + "learning_rate": 6.638591039018277e-06, + "loss": 0.475, + "step": 9378 + }, + { + "epoch": 1.2210074189769622, + "grad_norm": 2.4974265098571777, + "learning_rate": 6.636616462609324e-06, + "loss": 0.5118, + "step": 9381 + }, + { + "epoch": 1.221397891448653, + "grad_norm": 3.5743143558502197, + "learning_rate": 6.634641600286921e-06, + "loss": 0.5091, + "step": 9384 + }, + { + "epoch": 1.2217883639203437, + "grad_norm": 2.744140625, + "learning_rate": 6.632666452396067e-06, + "loss": 0.4696, + "step": 9387 + }, + { + "epoch": 1.2221788363920343, + "grad_norm": 2.7472984790802, + "learning_rate": 6.630691019281819e-06, + "loss": 0.4901, + "step": 9390 + }, + { + "epoch": 1.222569308863725, + "grad_norm": 3.2367637157440186, + "learning_rate": 6.6287153012892805e-06, + "loss": 0.4851, + "step": 9393 + }, + { + "epoch": 1.2229597813354158, + "grad_norm": 2.966864824295044, + "learning_rate": 6.626739298763605e-06, + "loss": 0.4722, + "step": 9396 + }, + { + "epoch": 1.2233502538071066, + "grad_norm": 2.585268020629883, + "learning_rate": 6.624763012049995e-06, + "loss": 0.4988, + "step": 9399 + }, + { + "epoch": 1.2237407262787974, + "grad_norm": 2.772038221359253, + "learning_rate": 6.622786441493706e-06, + "loss": 0.5594, + "step": 9402 + }, + { + "epoch": 1.2241311987504881, + "grad_norm": 3.0204148292541504, + "learning_rate": 6.62080958744004e-06, + "loss": 0.4736, + "step": 9405 + }, + { + "epoch": 1.224521671222179, + "grad_norm": 2.651444673538208, + "learning_rate": 6.618832450234348e-06, + "loss": 0.473, + "step": 9408 + }, + { + "epoch": 1.2249121436938695, + "grad_norm": 3.069899559020996, + "learning_rate": 6.6168550302220334e-06, + "loss": 0.618, + "step": 9411 + }, + { + "epoch": 1.2253026161655602, + "grad_norm": 2.877786874771118, + "learning_rate": 6.6148773277485455e-06, + "loss": 0.5673, + "step": 9414 + }, + { + "epoch": 1.225693088637251, + "grad_norm": 2.4998676776885986, + "learning_rate": 6.612899343159385e-06, + "loss": 0.4381, + "step": 9417 + }, + { + "epoch": 1.2260835611089418, + "grad_norm": 2.4126298427581787, + "learning_rate": 6.610921076800103e-06, + "loss": 0.5247, + "step": 9420 + }, + { + "epoch": 1.2264740335806326, + "grad_norm": 2.9004220962524414, + "learning_rate": 6.608942529016298e-06, + "loss": 0.5114, + "step": 9423 + }, + { + "epoch": 1.2268645060523233, + "grad_norm": 3.1982264518737793, + "learning_rate": 6.606963700153618e-06, + "loss": 0.5308, + "step": 9426 + 
}, + { + "epoch": 1.227254978524014, + "grad_norm": 2.521232843399048, + "learning_rate": 6.604984590557759e-06, + "loss": 0.4787, + "step": 9429 + }, + { + "epoch": 1.2276454509957049, + "grad_norm": 3.711055040359497, + "learning_rate": 6.603005200574471e-06, + "loss": 0.4829, + "step": 9432 + }, + { + "epoch": 1.2280359234673957, + "grad_norm": 2.418956756591797, + "learning_rate": 6.601025530549544e-06, + "loss": 0.4596, + "step": 9435 + }, + { + "epoch": 1.2284263959390862, + "grad_norm": 2.617719888687134, + "learning_rate": 6.5990455808288256e-06, + "loss": 0.5762, + "step": 9438 + }, + { + "epoch": 1.228816868410777, + "grad_norm": 3.037282705307007, + "learning_rate": 6.597065351758207e-06, + "loss": 0.4857, + "step": 9441 + }, + { + "epoch": 1.2292073408824677, + "grad_norm": 2.377511739730835, + "learning_rate": 6.5950848436836335e-06, + "loss": 0.4933, + "step": 9444 + }, + { + "epoch": 1.2295978133541585, + "grad_norm": 2.6414594650268555, + "learning_rate": 6.5931040569510926e-06, + "loss": 0.5575, + "step": 9447 + }, + { + "epoch": 1.2299882858258493, + "grad_norm": 3.2469959259033203, + "learning_rate": 6.591122991906625e-06, + "loss": 0.537, + "step": 9450 + }, + { + "epoch": 1.23037875829754, + "grad_norm": 2.7081236839294434, + "learning_rate": 6.5891416488963155e-06, + "loss": 0.5333, + "step": 9453 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 3.2291836738586426, + "learning_rate": 6.587160028266306e-06, + "loss": 0.5007, + "step": 9456 + }, + { + "epoch": 1.2311597032409216, + "grad_norm": 3.501340389251709, + "learning_rate": 6.585178130362776e-06, + "loss": 0.4621, + "step": 9459 + }, + { + "epoch": 1.2315501757126124, + "grad_norm": 2.6834263801574707, + "learning_rate": 6.583195955531963e-06, + "loss": 0.472, + "step": 9462 + }, + { + "epoch": 1.231940648184303, + "grad_norm": 3.268996238708496, + "learning_rate": 6.581213504120146e-06, + "loss": 0.5295, + "step": 9465 + }, + { + "epoch": 1.2323311206559937, + "grad_norm": 2.7084834575653076, + "learning_rate": 6.579230776473658e-06, + "loss": 0.543, + "step": 9468 + }, + { + "epoch": 1.2327215931276845, + "grad_norm": 2.728600263595581, + "learning_rate": 6.577247772938874e-06, + "loss": 0.5473, + "step": 9471 + }, + { + "epoch": 1.2331120655993753, + "grad_norm": 3.1851062774658203, + "learning_rate": 6.575264493862221e-06, + "loss": 0.5181, + "step": 9474 + }, + { + "epoch": 1.233502538071066, + "grad_norm": 2.5818753242492676, + "learning_rate": 6.573280939590178e-06, + "loss": 0.433, + "step": 9477 + }, + { + "epoch": 1.2338930105427568, + "grad_norm": 2.6541337966918945, + "learning_rate": 6.571297110469261e-06, + "loss": 0.503, + "step": 9480 + }, + { + "epoch": 1.2342834830144476, + "grad_norm": 2.538254499435425, + "learning_rate": 6.569313006846048e-06, + "loss": 0.5565, + "step": 9483 + }, + { + "epoch": 1.2346739554861381, + "grad_norm": 2.7005538940429688, + "learning_rate": 6.567328629067151e-06, + "loss": 0.5625, + "step": 9486 + }, + { + "epoch": 1.235064427957829, + "grad_norm": 2.8296282291412354, + "learning_rate": 6.565343977479241e-06, + "loss": 0.5815, + "step": 9489 + }, + { + "epoch": 1.2354549004295197, + "grad_norm": 2.405651092529297, + "learning_rate": 6.563359052429031e-06, + "loss": 0.4166, + "step": 9492 + }, + { + "epoch": 1.2358453729012104, + "grad_norm": 2.3672516345977783, + "learning_rate": 6.561373854263283e-06, + "loss": 0.5841, + "step": 9495 + }, + { + "epoch": 1.2362358453729012, + "grad_norm": 2.7860894203186035, + "learning_rate": 6.559388383328808e-06, + 
"loss": 0.4879, + "step": 9498 + }, + { + "epoch": 1.236626317844592, + "grad_norm": 2.6302614212036133, + "learning_rate": 6.5574026399724625e-06, + "loss": 0.5354, + "step": 9501 + }, + { + "epoch": 1.2370167903162828, + "grad_norm": 2.8287155628204346, + "learning_rate": 6.5554166245411525e-06, + "loss": 0.4181, + "step": 9504 + }, + { + "epoch": 1.2374072627879735, + "grad_norm": 2.8489389419555664, + "learning_rate": 6.55343033738183e-06, + "loss": 0.4872, + "step": 9507 + }, + { + "epoch": 1.2377977352596643, + "grad_norm": 3.101821184158325, + "learning_rate": 6.551443778841495e-06, + "loss": 0.5089, + "step": 9510 + }, + { + "epoch": 1.2381882077313549, + "grad_norm": 3.262808084487915, + "learning_rate": 6.549456949267197e-06, + "loss": 0.5069, + "step": 9513 + }, + { + "epoch": 1.2385786802030456, + "grad_norm": 2.664828300476074, + "learning_rate": 6.547469849006027e-06, + "loss": 0.4743, + "step": 9516 + }, + { + "epoch": 1.2389691526747364, + "grad_norm": 2.8644232749938965, + "learning_rate": 6.545482478405133e-06, + "loss": 0.4491, + "step": 9519 + }, + { + "epoch": 1.2393596251464272, + "grad_norm": 2.658860921859741, + "learning_rate": 6.543494837811698e-06, + "loss": 0.5217, + "step": 9522 + }, + { + "epoch": 1.239750097618118, + "grad_norm": 2.8123228549957275, + "learning_rate": 6.541506927572965e-06, + "loss": 0.52, + "step": 9525 + }, + { + "epoch": 1.2401405700898087, + "grad_norm": 2.8045687675476074, + "learning_rate": 6.539518748036212e-06, + "loss": 0.463, + "step": 9528 + }, + { + "epoch": 1.2405310425614995, + "grad_norm": 2.54270076751709, + "learning_rate": 6.537530299548774e-06, + "loss": 0.5799, + "step": 9531 + }, + { + "epoch": 1.24092151503319, + "grad_norm": 2.9174578189849854, + "learning_rate": 6.535541582458027e-06, + "loss": 0.4923, + "step": 9534 + }, + { + "epoch": 1.2413119875048808, + "grad_norm": 3.1313931941986084, + "learning_rate": 6.533552597111395e-06, + "loss": 0.5614, + "step": 9537 + }, + { + "epoch": 1.2417024599765716, + "grad_norm": 2.6302151679992676, + "learning_rate": 6.531563343856352e-06, + "loss": 0.5368, + "step": 9540 + }, + { + "epoch": 1.2420929324482624, + "grad_norm": 2.6476080417633057, + "learning_rate": 6.5295738230404125e-06, + "loss": 0.5409, + "step": 9543 + }, + { + "epoch": 1.2424834049199531, + "grad_norm": 2.8190321922302246, + "learning_rate": 6.527584035011145e-06, + "loss": 0.4559, + "step": 9546 + }, + { + "epoch": 1.242873877391644, + "grad_norm": 3.6176981925964355, + "learning_rate": 6.525593980116161e-06, + "loss": 0.5597, + "step": 9549 + }, + { + "epoch": 1.2432643498633347, + "grad_norm": 2.844064474105835, + "learning_rate": 6.523603658703117e-06, + "loss": 0.575, + "step": 9552 + }, + { + "epoch": 1.2436548223350254, + "grad_norm": 3.310737371444702, + "learning_rate": 6.52161307111972e-06, + "loss": 0.5425, + "step": 9555 + }, + { + "epoch": 1.2440452948067162, + "grad_norm": 2.583648204803467, + "learning_rate": 6.519622217713719e-06, + "loss": 0.4643, + "step": 9558 + }, + { + "epoch": 1.2444357672784068, + "grad_norm": 2.5113677978515625, + "learning_rate": 6.517631098832914e-06, + "loss": 0.4568, + "step": 9561 + }, + { + "epoch": 1.2448262397500975, + "grad_norm": 4.185740947723389, + "learning_rate": 6.515639714825148e-06, + "loss": 0.4515, + "step": 9564 + }, + { + "epoch": 1.2452167122217883, + "grad_norm": 2.941195011138916, + "learning_rate": 6.513648066038314e-06, + "loss": 0.5402, + "step": 9567 + }, + { + "epoch": 1.245607184693479, + "grad_norm": 2.2286741733551025, + "learning_rate": 
6.511656152820347e-06, + "loss": 0.4221, + "step": 9570 + }, + { + "epoch": 1.2459976571651699, + "grad_norm": 2.8112363815307617, + "learning_rate": 6.509663975519228e-06, + "loss": 0.5133, + "step": 9573 + }, + { + "epoch": 1.2463881296368606, + "grad_norm": 2.6967437267303467, + "learning_rate": 6.507671534482991e-06, + "loss": 0.5319, + "step": 9576 + }, + { + "epoch": 1.2467786021085514, + "grad_norm": 2.501628875732422, + "learning_rate": 6.505678830059707e-06, + "loss": 0.4809, + "step": 9579 + }, + { + "epoch": 1.2471690745802422, + "grad_norm": 2.820998191833496, + "learning_rate": 6.5036858625974986e-06, + "loss": 0.5363, + "step": 9582 + }, + { + "epoch": 1.247559547051933, + "grad_norm": 2.792283296585083, + "learning_rate": 6.501692632444534e-06, + "loss": 0.4598, + "step": 9585 + }, + { + "epoch": 1.2479500195236235, + "grad_norm": 2.6301913261413574, + "learning_rate": 6.499699139949025e-06, + "loss": 0.4655, + "step": 9588 + }, + { + "epoch": 1.2483404919953143, + "grad_norm": 2.497735023498535, + "learning_rate": 6.497705385459232e-06, + "loss": 0.5135, + "step": 9591 + }, + { + "epoch": 1.248730964467005, + "grad_norm": 2.354994535446167, + "learning_rate": 6.4957113693234586e-06, + "loss": 0.4236, + "step": 9594 + }, + { + "epoch": 1.2491214369386958, + "grad_norm": 3.677664041519165, + "learning_rate": 6.493717091890056e-06, + "loss": 0.4943, + "step": 9597 + }, + { + "epoch": 1.2495119094103866, + "grad_norm": 2.9446117877960205, + "learning_rate": 6.491722553507419e-06, + "loss": 0.5157, + "step": 9600 + }, + { + "epoch": 1.2499023818820774, + "grad_norm": 3.580937623977661, + "learning_rate": 6.48972775452399e-06, + "loss": 0.5131, + "step": 9603 + }, + { + "epoch": 1.2502928543537681, + "grad_norm": 2.604628562927246, + "learning_rate": 6.487732695288256e-06, + "loss": 0.4847, + "step": 9606 + }, + { + "epoch": 1.2506833268254587, + "grad_norm": 2.9998080730438232, + "learning_rate": 6.4857373761487505e-06, + "loss": 0.5538, + "step": 9609 + }, + { + "epoch": 1.2510737992971497, + "grad_norm": 2.7411253452301025, + "learning_rate": 6.4837417974540505e-06, + "loss": 0.4721, + "step": 9612 + }, + { + "epoch": 1.2514642717688402, + "grad_norm": 2.709904432296753, + "learning_rate": 6.481745959552781e-06, + "loss": 0.446, + "step": 9615 + }, + { + "epoch": 1.251854744240531, + "grad_norm": 2.6614980697631836, + "learning_rate": 6.479749862793609e-06, + "loss": 0.5096, + "step": 9618 + }, + { + "epoch": 1.2522452167122218, + "grad_norm": 2.518862724304199, + "learning_rate": 6.477753507525249e-06, + "loss": 0.5016, + "step": 9621 + }, + { + "epoch": 1.2526356891839125, + "grad_norm": 2.293060302734375, + "learning_rate": 6.475756894096458e-06, + "loss": 0.5011, + "step": 9624 + }, + { + "epoch": 1.2530261616556033, + "grad_norm": 2.695930004119873, + "learning_rate": 6.4737600228560435e-06, + "loss": 0.5012, + "step": 9627 + }, + { + "epoch": 1.253416634127294, + "grad_norm": 2.469388246536255, + "learning_rate": 6.471762894152853e-06, + "loss": 0.489, + "step": 9630 + }, + { + "epoch": 1.2538071065989849, + "grad_norm": 3.3694796562194824, + "learning_rate": 6.469765508335783e-06, + "loss": 0.435, + "step": 9633 + }, + { + "epoch": 1.2541975790706754, + "grad_norm": 2.474310874938965, + "learning_rate": 6.467767865753768e-06, + "loss": 0.4652, + "step": 9636 + }, + { + "epoch": 1.2545880515423662, + "grad_norm": 3.2254626750946045, + "learning_rate": 6.465769966755795e-06, + "loss": 0.5496, + "step": 9639 + }, + { + "epoch": 1.254978524014057, + "grad_norm": 
2.502992630004883, + "learning_rate": 6.4637718116908945e-06, + "loss": 0.4993, + "step": 9642 + }, + { + "epoch": 1.2553689964857477, + "grad_norm": 2.6724181175231934, + "learning_rate": 6.461773400908136e-06, + "loss": 0.5079, + "step": 9645 + }, + { + "epoch": 1.2557594689574385, + "grad_norm": 2.582564353942871, + "learning_rate": 6.459774734756639e-06, + "loss": 0.546, + "step": 9648 + }, + { + "epoch": 1.2561499414291293, + "grad_norm": 2.8137545585632324, + "learning_rate": 6.457775813585567e-06, + "loss": 0.5178, + "step": 9651 + }, + { + "epoch": 1.25654041390082, + "grad_norm": 3.133225679397583, + "learning_rate": 6.4557766377441285e-06, + "loss": 0.5144, + "step": 9654 + }, + { + "epoch": 1.2569308863725106, + "grad_norm": 2.940943717956543, + "learning_rate": 6.453777207581573e-06, + "loss": 0.4967, + "step": 9657 + }, + { + "epoch": 1.2573213588442016, + "grad_norm": 2.1819236278533936, + "learning_rate": 6.451777523447197e-06, + "loss": 0.439, + "step": 9660 + }, + { + "epoch": 1.2577118313158921, + "grad_norm": 2.8423125743865967, + "learning_rate": 6.449777585690344e-06, + "loss": 0.5978, + "step": 9663 + }, + { + "epoch": 1.258102303787583, + "grad_norm": 2.4884462356567383, + "learning_rate": 6.447777394660394e-06, + "loss": 0.5501, + "step": 9666 + }, + { + "epoch": 1.2584927762592737, + "grad_norm": 2.5996439456939697, + "learning_rate": 6.445776950706779e-06, + "loss": 0.5206, + "step": 9669 + }, + { + "epoch": 1.2588832487309645, + "grad_norm": 2.7258780002593994, + "learning_rate": 6.4437762541789735e-06, + "loss": 0.4822, + "step": 9672 + }, + { + "epoch": 1.2592737212026552, + "grad_norm": 2.3117198944091797, + "learning_rate": 6.441775305426494e-06, + "loss": 0.4431, + "step": 9675 + }, + { + "epoch": 1.259664193674346, + "grad_norm": 2.780622959136963, + "learning_rate": 6.4397741047989e-06, + "loss": 0.382, + "step": 9678 + }, + { + "epoch": 1.2600546661460368, + "grad_norm": 2.5801424980163574, + "learning_rate": 6.4377726526458e-06, + "loss": 0.4101, + "step": 9681 + }, + { + "epoch": 1.2604451386177273, + "grad_norm": 2.5400123596191406, + "learning_rate": 6.435770949316843e-06, + "loss": 0.4303, + "step": 9684 + }, + { + "epoch": 1.2608356110894183, + "grad_norm": 2.6822738647460938, + "learning_rate": 6.43376899516172e-06, + "loss": 0.4843, + "step": 9687 + }, + { + "epoch": 1.2612260835611089, + "grad_norm": 2.613121747970581, + "learning_rate": 6.43176679053017e-06, + "loss": 0.4722, + "step": 9690 + }, + { + "epoch": 1.2616165560327997, + "grad_norm": 3.226043224334717, + "learning_rate": 6.429764335771973e-06, + "loss": 0.4816, + "step": 9693 + }, + { + "epoch": 1.2620070285044904, + "grad_norm": 2.7644383907318115, + "learning_rate": 6.427761631236955e-06, + "loss": 0.4896, + "step": 9696 + }, + { + "epoch": 1.2623975009761812, + "grad_norm": 2.632202625274658, + "learning_rate": 6.4257586772749845e-06, + "loss": 0.5031, + "step": 9699 + }, + { + "epoch": 1.262787973447872, + "grad_norm": 3.154134750366211, + "learning_rate": 6.423755474235972e-06, + "loss": 0.5501, + "step": 9702 + }, + { + "epoch": 1.2631784459195627, + "grad_norm": 2.597161054611206, + "learning_rate": 6.421752022469874e-06, + "loss": 0.592, + "step": 9705 + }, + { + "epoch": 1.2635689183912535, + "grad_norm": 2.492748975753784, + "learning_rate": 6.4197483223266865e-06, + "loss": 0.5851, + "step": 9708 + }, + { + "epoch": 1.263959390862944, + "grad_norm": 2.564765453338623, + "learning_rate": 6.417744374156455e-06, + "loss": 0.5529, + "step": 9711 + }, + { + "epoch": 
1.2643498633346348, + "grad_norm": 2.9283761978149414, + "learning_rate": 6.4157401783092645e-06, + "loss": 0.5119, + "step": 9714 + }, + { + "epoch": 1.2647403358063256, + "grad_norm": 2.577751874923706, + "learning_rate": 6.413735735135241e-06, + "loss": 0.4277, + "step": 9717 + }, + { + "epoch": 1.2651308082780164, + "grad_norm": 2.47666072845459, + "learning_rate": 6.411731044984562e-06, + "loss": 0.5617, + "step": 9720 + }, + { + "epoch": 1.2655212807497072, + "grad_norm": 4.063993453979492, + "learning_rate": 6.409726108207436e-06, + "loss": 0.4548, + "step": 9723 + }, + { + "epoch": 1.265911753221398, + "grad_norm": 2.558497428894043, + "learning_rate": 6.407720925154126e-06, + "loss": 0.5154, + "step": 9726 + }, + { + "epoch": 1.2663022256930887, + "grad_norm": 2.8390650749206543, + "learning_rate": 6.4057154961749324e-06, + "loss": 0.5305, + "step": 9729 + }, + { + "epoch": 1.2666926981647793, + "grad_norm": 2.5285894870758057, + "learning_rate": 6.403709821620198e-06, + "loss": 0.4643, + "step": 9732 + }, + { + "epoch": 1.2670831706364702, + "grad_norm": 2.475567579269409, + "learning_rate": 6.401703901840311e-06, + "loss": 0.52, + "step": 9735 + }, + { + "epoch": 1.2674736431081608, + "grad_norm": 2.560523271560669, + "learning_rate": 6.3996977371857e-06, + "loss": 0.5761, + "step": 9738 + }, + { + "epoch": 1.2678641155798516, + "grad_norm": 2.965449810028076, + "learning_rate": 6.397691328006839e-06, + "loss": 0.481, + "step": 9741 + }, + { + "epoch": 1.2682545880515423, + "grad_norm": 2.673515558242798, + "learning_rate": 6.395684674654245e-06, + "loss": 0.5183, + "step": 9744 + }, + { + "epoch": 1.2686450605232331, + "grad_norm": 2.865999221801758, + "learning_rate": 6.393677777478473e-06, + "loss": 0.5216, + "step": 9747 + }, + { + "epoch": 1.2690355329949239, + "grad_norm": 2.5293450355529785, + "learning_rate": 6.391670636830126e-06, + "loss": 0.496, + "step": 9750 + }, + { + "epoch": 1.2694260054666147, + "grad_norm": 2.695828914642334, + "learning_rate": 6.389663253059846e-06, + "loss": 0.5161, + "step": 9753 + }, + { + "epoch": 1.2698164779383054, + "grad_norm": 3.5010106563568115, + "learning_rate": 6.3876556265183185e-06, + "loss": 0.4878, + "step": 9756 + }, + { + "epoch": 1.270206950409996, + "grad_norm": 3.126572847366333, + "learning_rate": 6.3856477575562735e-06, + "loss": 0.5395, + "step": 9759 + }, + { + "epoch": 1.270597422881687, + "grad_norm": 3.661149024963379, + "learning_rate": 6.38363964652448e-06, + "loss": 0.5382, + "step": 9762 + }, + { + "epoch": 1.2709878953533775, + "grad_norm": 2.6320817470550537, + "learning_rate": 6.381631293773751e-06, + "loss": 0.5113, + "step": 9765 + }, + { + "epoch": 1.2713783678250683, + "grad_norm": 2.5496699810028076, + "learning_rate": 6.3796226996549404e-06, + "loss": 0.5602, + "step": 9768 + }, + { + "epoch": 1.271768840296759, + "grad_norm": 2.5300614833831787, + "learning_rate": 6.3776138645189475e-06, + "loss": 0.5367, + "step": 9771 + }, + { + "epoch": 1.2721593127684498, + "grad_norm": 2.5111279487609863, + "learning_rate": 6.37560478871671e-06, + "loss": 0.4835, + "step": 9774 + }, + { + "epoch": 1.2725497852401406, + "grad_norm": 2.6758015155792236, + "learning_rate": 6.37359547259921e-06, + "loss": 0.5407, + "step": 9777 + }, + { + "epoch": 1.2729402577118314, + "grad_norm": 2.8629322052001953, + "learning_rate": 6.371585916517471e-06, + "loss": 0.4738, + "step": 9780 + }, + { + "epoch": 1.2733307301835222, + "grad_norm": 2.585297107696533, + "learning_rate": 6.3695761208225585e-06, + "loss": 0.496, + "step": 
9783 + }, + { + "epoch": 1.2737212026552127, + "grad_norm": 3.2529196739196777, + "learning_rate": 6.3675660858655765e-06, + "loss": 0.4647, + "step": 9786 + }, + { + "epoch": 1.2741116751269035, + "grad_norm": 2.405118703842163, + "learning_rate": 6.3655558119976765e-06, + "loss": 0.5462, + "step": 9789 + }, + { + "epoch": 1.2745021475985943, + "grad_norm": 2.579508066177368, + "learning_rate": 6.363545299570051e-06, + "loss": 0.4761, + "step": 9792 + }, + { + "epoch": 1.274892620070285, + "grad_norm": 2.9745607376098633, + "learning_rate": 6.361534548933928e-06, + "loss": 0.5342, + "step": 9795 + }, + { + "epoch": 1.2752830925419758, + "grad_norm": 2.931020975112915, + "learning_rate": 6.359523560440585e-06, + "loss": 0.4915, + "step": 9798 + }, + { + "epoch": 1.2756735650136666, + "grad_norm": 2.9338982105255127, + "learning_rate": 6.357512334441336e-06, + "loss": 0.6454, + "step": 9801 + }, + { + "epoch": 1.2760640374853573, + "grad_norm": 2.669160842895508, + "learning_rate": 6.355500871287538e-06, + "loss": 0.4672, + "step": 9804 + }, + { + "epoch": 1.276454509957048, + "grad_norm": 2.584463119506836, + "learning_rate": 6.353489171330588e-06, + "loss": 0.5323, + "step": 9807 + }, + { + "epoch": 1.276844982428739, + "grad_norm": 2.854555606842041, + "learning_rate": 6.351477234921928e-06, + "loss": 0.4001, + "step": 9810 + }, + { + "epoch": 1.2772354549004294, + "grad_norm": 3.679852247238159, + "learning_rate": 6.349465062413038e-06, + "loss": 0.5706, + "step": 9813 + }, + { + "epoch": 1.2776259273721202, + "grad_norm": 2.8256568908691406, + "learning_rate": 6.34745265415544e-06, + "loss": 0.4925, + "step": 9816 + }, + { + "epoch": 1.278016399843811, + "grad_norm": 3.215463876724243, + "learning_rate": 6.3454400105006985e-06, + "loss": 0.5522, + "step": 9819 + }, + { + "epoch": 1.2784068723155018, + "grad_norm": 2.2167320251464844, + "learning_rate": 6.343427131800417e-06, + "loss": 0.4712, + "step": 9822 + }, + { + "epoch": 1.2787973447871925, + "grad_norm": 2.5765607357025146, + "learning_rate": 6.341414018406242e-06, + "loss": 0.5017, + "step": 9825 + }, + { + "epoch": 1.2791878172588833, + "grad_norm": 2.5398025512695312, + "learning_rate": 6.3394006706698615e-06, + "loss": 0.4501, + "step": 9828 + }, + { + "epoch": 1.279578289730574, + "grad_norm": 2.628095865249634, + "learning_rate": 6.337387088943e-06, + "loss": 0.5367, + "step": 9831 + }, + { + "epoch": 1.2799687622022646, + "grad_norm": 2.486548662185669, + "learning_rate": 6.335373273577429e-06, + "loss": 0.5093, + "step": 9834 + }, + { + "epoch": 1.2803592346739554, + "grad_norm": 2.8743481636047363, + "learning_rate": 6.333359224924955e-06, + "loss": 0.4291, + "step": 9837 + }, + { + "epoch": 1.2807497071456462, + "grad_norm": 2.8362040519714355, + "learning_rate": 6.331344943337428e-06, + "loss": 0.4317, + "step": 9840 + }, + { + "epoch": 1.281140179617337, + "grad_norm": 2.6773505210876465, + "learning_rate": 6.329330429166741e-06, + "loss": 0.4869, + "step": 9843 + }, + { + "epoch": 1.2815306520890277, + "grad_norm": 2.4184865951538086, + "learning_rate": 6.327315682764825e-06, + "loss": 0.5656, + "step": 9846 + }, + { + "epoch": 1.2819211245607185, + "grad_norm": 2.5900416374206543, + "learning_rate": 6.325300704483653e-06, + "loss": 0.4823, + "step": 9849 + }, + { + "epoch": 1.2823115970324093, + "grad_norm": 2.582472085952759, + "learning_rate": 6.3232854946752345e-06, + "loss": 0.4939, + "step": 9852 + }, + { + "epoch": 1.2827020695041, + "grad_norm": 2.6832892894744873, + "learning_rate": 6.321270053691624e-06, + 
"loss": 0.499, + "step": 9855 + }, + { + "epoch": 1.2830925419757908, + "grad_norm": 2.630359649658203, + "learning_rate": 6.319254381884914e-06, + "loss": 0.4966, + "step": 9858 + }, + { + "epoch": 1.2834830144474814, + "grad_norm": 2.7337570190429688, + "learning_rate": 6.317238479607239e-06, + "loss": 0.4762, + "step": 9861 + }, + { + "epoch": 1.2838734869191721, + "grad_norm": 2.502153158187866, + "learning_rate": 6.315222347210773e-06, + "loss": 0.5532, + "step": 9864 + }, + { + "epoch": 1.284263959390863, + "grad_norm": 2.532254934310913, + "learning_rate": 6.31320598504773e-06, + "loss": 0.5093, + "step": 9867 + }, + { + "epoch": 1.2846544318625537, + "grad_norm": 2.7918901443481445, + "learning_rate": 6.311189393470364e-06, + "loss": 0.5242, + "step": 9870 + }, + { + "epoch": 1.2850449043342445, + "grad_norm": 2.472677230834961, + "learning_rate": 6.309172572830969e-06, + "loss": 0.4492, + "step": 9873 + }, + { + "epoch": 1.2854353768059352, + "grad_norm": 2.5617218017578125, + "learning_rate": 6.30715552348188e-06, + "loss": 0.4832, + "step": 9876 + }, + { + "epoch": 1.285825849277626, + "grad_norm": 2.8872265815734863, + "learning_rate": 6.30513824577547e-06, + "loss": 0.5347, + "step": 9879 + }, + { + "epoch": 1.2862163217493165, + "grad_norm": 2.543060302734375, + "learning_rate": 6.3031207400641535e-06, + "loss": 0.553, + "step": 9882 + }, + { + "epoch": 1.2866067942210075, + "grad_norm": 2.5379858016967773, + "learning_rate": 6.301103006700388e-06, + "loss": 0.4914, + "step": 9885 + }, + { + "epoch": 1.286997266692698, + "grad_norm": 2.7542781829833984, + "learning_rate": 6.299085046036662e-06, + "loss": 0.4767, + "step": 9888 + }, + { + "epoch": 1.2873877391643889, + "grad_norm": 2.86118745803833, + "learning_rate": 6.297066858425512e-06, + "loss": 0.4647, + "step": 9891 + }, + { + "epoch": 1.2877782116360796, + "grad_norm": 4.252162456512451, + "learning_rate": 6.29504844421951e-06, + "loss": 0.4904, + "step": 9894 + }, + { + "epoch": 1.2881686841077704, + "grad_norm": 3.537123441696167, + "learning_rate": 6.2930298037712704e-06, + "loss": 0.5455, + "step": 9897 + }, + { + "epoch": 1.2885591565794612, + "grad_norm": 3.0261664390563965, + "learning_rate": 6.2910109374334434e-06, + "loss": 0.5058, + "step": 9900 + }, + { + "epoch": 1.288949629051152, + "grad_norm": 2.73046612739563, + "learning_rate": 6.288991845558721e-06, + "loss": 0.5526, + "step": 9903 + }, + { + "epoch": 1.2893401015228427, + "grad_norm": 2.695645332336426, + "learning_rate": 6.286972528499835e-06, + "loss": 0.6358, + "step": 9906 + }, + { + "epoch": 1.2897305739945333, + "grad_norm": 2.4671976566314697, + "learning_rate": 6.284952986609556e-06, + "loss": 0.5519, + "step": 9909 + }, + { + "epoch": 1.290121046466224, + "grad_norm": 2.7098228931427, + "learning_rate": 6.282933220240695e-06, + "loss": 0.4461, + "step": 9912 + }, + { + "epoch": 1.2905115189379148, + "grad_norm": 2.5738272666931152, + "learning_rate": 6.280913229746096e-06, + "loss": 0.4398, + "step": 9915 + }, + { + "epoch": 1.2909019914096056, + "grad_norm": 2.765180826187134, + "learning_rate": 6.278893015478652e-06, + "loss": 0.6561, + "step": 9918 + }, + { + "epoch": 1.2912924638812964, + "grad_norm": 2.4514658451080322, + "learning_rate": 6.27687257779129e-06, + "loss": 0.4876, + "step": 9921 + }, + { + "epoch": 1.2916829363529871, + "grad_norm": 2.56715726852417, + "learning_rate": 6.274851917036971e-06, + "loss": 0.459, + "step": 9924 + }, + { + "epoch": 1.292073408824678, + "grad_norm": 2.630765438079834, + "learning_rate": 
6.272831033568708e-06, + "loss": 0.4325, + "step": 9927 + }, + { + "epoch": 1.2924638812963687, + "grad_norm": 2.567124843597412, + "learning_rate": 6.27080992773954e-06, + "loss": 0.4577, + "step": 9930 + }, + { + "epoch": 1.2928543537680595, + "grad_norm": 2.8632030487060547, + "learning_rate": 6.26878859990255e-06, + "loss": 0.5061, + "step": 9933 + }, + { + "epoch": 1.29324482623975, + "grad_norm": 3.047332525253296, + "learning_rate": 6.266767050410862e-06, + "loss": 0.4394, + "step": 9936 + }, + { + "epoch": 1.2936352987114408, + "grad_norm": 2.710806131362915, + "learning_rate": 6.264745279617634e-06, + "loss": 0.4752, + "step": 9939 + }, + { + "epoch": 1.2940257711831316, + "grad_norm": 2.510077714920044, + "learning_rate": 6.262723287876068e-06, + "loss": 0.4945, + "step": 9942 + }, + { + "epoch": 1.2944162436548223, + "grad_norm": 3.1183907985687256, + "learning_rate": 6.260701075539397e-06, + "loss": 0.4816, + "step": 9945 + }, + { + "epoch": 1.294806716126513, + "grad_norm": 2.8329877853393555, + "learning_rate": 6.258678642960902e-06, + "loss": 0.4994, + "step": 9948 + }, + { + "epoch": 1.2951971885982039, + "grad_norm": 2.953737735748291, + "learning_rate": 6.256655990493896e-06, + "loss": 0.4451, + "step": 9951 + }, + { + "epoch": 1.2955876610698946, + "grad_norm": 2.770921468734741, + "learning_rate": 6.254633118491732e-06, + "loss": 0.5525, + "step": 9954 + }, + { + "epoch": 1.2959781335415852, + "grad_norm": 2.6168859004974365, + "learning_rate": 6.252610027307803e-06, + "loss": 0.4346, + "step": 9957 + }, + { + "epoch": 1.2963686060132762, + "grad_norm": 2.5253684520721436, + "learning_rate": 6.250586717295535e-06, + "loss": 0.4968, + "step": 9960 + }, + { + "epoch": 1.2967590784849667, + "grad_norm": 2.5443472862243652, + "learning_rate": 6.248563188808401e-06, + "loss": 0.4767, + "step": 9963 + }, + { + "epoch": 1.2971495509566575, + "grad_norm": 2.66750431060791, + "learning_rate": 6.246539442199901e-06, + "loss": 0.5305, + "step": 9966 + }, + { + "epoch": 1.2975400234283483, + "grad_norm": 3.3093185424804688, + "learning_rate": 6.244515477823585e-06, + "loss": 0.5501, + "step": 9969 + }, + { + "epoch": 1.297930495900039, + "grad_norm": 2.5666654109954834, + "learning_rate": 6.242491296033033e-06, + "loss": 0.5458, + "step": 9972 + }, + { + "epoch": 1.2983209683717298, + "grad_norm": 2.224759340286255, + "learning_rate": 6.240466897181865e-06, + "loss": 0.4821, + "step": 9975 + }, + { + "epoch": 1.2987114408434206, + "grad_norm": 2.936877727508545, + "learning_rate": 6.23844228162374e-06, + "loss": 0.4621, + "step": 9978 + }, + { + "epoch": 1.2991019133151114, + "grad_norm": 2.833364248275757, + "learning_rate": 6.236417449712353e-06, + "loss": 0.4913, + "step": 9981 + }, + { + "epoch": 1.299492385786802, + "grad_norm": 2.554704189300537, + "learning_rate": 6.23439240180144e-06, + "loss": 0.4368, + "step": 9984 + }, + { + "epoch": 1.2998828582584927, + "grad_norm": 2.432718276977539, + "learning_rate": 6.232367138244768e-06, + "loss": 0.5568, + "step": 9987 + }, + { + "epoch": 1.3002733307301835, + "grad_norm": 2.5396578311920166, + "learning_rate": 6.230341659396152e-06, + "loss": 0.5137, + "step": 9990 + }, + { + "epoch": 1.3006638032018742, + "grad_norm": 2.720970392227173, + "learning_rate": 6.228315965609437e-06, + "loss": 0.474, + "step": 9993 + }, + { + "epoch": 1.301054275673565, + "grad_norm": 3.310605525970459, + "learning_rate": 6.226290057238506e-06, + "loss": 0.4991, + "step": 9996 + }, + { + "epoch": 1.3014447481452558, + "grad_norm": 2.8268284797668457, 
+ "learning_rate": 6.224263934637281e-06, + "loss": 0.3968, + "step": 9999 + }, + { + "epoch": 1.3018352206169466, + "grad_norm": 2.8904576301574707, + "learning_rate": 6.222237598159723e-06, + "loss": 0.5225, + "step": 10002 + }, + { + "epoch": 1.3022256930886371, + "grad_norm": 2.540018081665039, + "learning_rate": 6.220211048159826e-06, + "loss": 0.4189, + "step": 10005 + }, + { + "epoch": 1.302616165560328, + "grad_norm": 2.706920862197876, + "learning_rate": 6.2181842849916284e-06, + "loss": 0.5126, + "step": 10008 + }, + { + "epoch": 1.3030066380320187, + "grad_norm": 3.605525016784668, + "learning_rate": 6.216157309009198e-06, + "loss": 0.4911, + "step": 10011 + }, + { + "epoch": 1.3033971105037094, + "grad_norm": 2.4646642208099365, + "learning_rate": 6.214130120566643e-06, + "loss": 0.4261, + "step": 10014 + }, + { + "epoch": 1.3037875829754002, + "grad_norm": 2.7980430126190186, + "learning_rate": 6.212102720018112e-06, + "loss": 0.483, + "step": 10017 + }, + { + "epoch": 1.304178055447091, + "grad_norm": 2.9838016033172607, + "learning_rate": 6.210075107717785e-06, + "loss": 0.4596, + "step": 10020 + }, + { + "epoch": 1.3045685279187818, + "grad_norm": 2.551286220550537, + "learning_rate": 6.208047284019881e-06, + "loss": 0.4226, + "step": 10023 + }, + { + "epoch": 1.3049590003904725, + "grad_norm": 2.640700340270996, + "learning_rate": 6.20601924927866e-06, + "loss": 0.4905, + "step": 10026 + }, + { + "epoch": 1.3053494728621633, + "grad_norm": 2.759453296661377, + "learning_rate": 6.203991003848411e-06, + "loss": 0.467, + "step": 10029 + }, + { + "epoch": 1.3057399453338538, + "grad_norm": 2.907013177871704, + "learning_rate": 6.201962548083468e-06, + "loss": 0.487, + "step": 10032 + }, + { + "epoch": 1.3061304178055448, + "grad_norm": 2.470888376235962, + "learning_rate": 6.199933882338196e-06, + "loss": 0.4687, + "step": 10035 + }, + { + "epoch": 1.3065208902772354, + "grad_norm": 2.576772451400757, + "learning_rate": 6.197905006966999e-06, + "loss": 0.425, + "step": 10038 + }, + { + "epoch": 1.3069113627489262, + "grad_norm": 2.713982582092285, + "learning_rate": 6.195875922324318e-06, + "loss": 0.4787, + "step": 10041 + }, + { + "epoch": 1.307301835220617, + "grad_norm": 2.516087532043457, + "learning_rate": 6.1938466287646285e-06, + "loss": 0.4707, + "step": 10044 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 2.569885492324829, + "learning_rate": 6.191817126642444e-06, + "loss": 0.458, + "step": 10047 + }, + { + "epoch": 1.3080827801639985, + "grad_norm": 3.064462184906006, + "learning_rate": 6.189787416312315e-06, + "loss": 0.5624, + "step": 10050 + }, + { + "epoch": 1.3084732526356893, + "grad_norm": 2.9090516567230225, + "learning_rate": 6.187757498128827e-06, + "loss": 0.5414, + "step": 10053 + }, + { + "epoch": 1.30886372510738, + "grad_norm": 2.532334566116333, + "learning_rate": 6.185727372446604e-06, + "loss": 0.5469, + "step": 10056 + }, + { + "epoch": 1.3092541975790706, + "grad_norm": 2.665684223175049, + "learning_rate": 6.183697039620302e-06, + "loss": 0.5266, + "step": 10059 + }, + { + "epoch": 1.3096446700507614, + "grad_norm": 2.6369564533233643, + "learning_rate": 6.181666500004617e-06, + "loss": 0.4774, + "step": 10062 + }, + { + "epoch": 1.3100351425224521, + "grad_norm": 2.548985481262207, + "learning_rate": 6.179635753954283e-06, + "loss": 0.4342, + "step": 10065 + }, + { + "epoch": 1.310425614994143, + "grad_norm": 2.7811813354492188, + "learning_rate": 6.177604801824062e-06, + "loss": 0.4539, + "step": 10068 + }, + { + "epoch": 
1.3108160874658337, + "grad_norm": 2.4228222370147705, + "learning_rate": 6.17557364396876e-06, + "loss": 0.446, + "step": 10071 + }, + { + "epoch": 1.3112065599375244, + "grad_norm": 2.639486789703369, + "learning_rate": 6.173542280743214e-06, + "loss": 0.5583, + "step": 10074 + }, + { + "epoch": 1.3115970324092152, + "grad_norm": 3.5657029151916504, + "learning_rate": 6.171510712502303e-06, + "loss": 0.5388, + "step": 10077 + }, + { + "epoch": 1.3119875048809058, + "grad_norm": 2.815169095993042, + "learning_rate": 6.169478939600933e-06, + "loss": 0.5293, + "step": 10080 + }, + { + "epoch": 1.3123779773525968, + "grad_norm": 2.9462995529174805, + "learning_rate": 6.167446962394054e-06, + "loss": 0.5047, + "step": 10083 + }, + { + "epoch": 1.3127684498242873, + "grad_norm": 2.7291252613067627, + "learning_rate": 6.165414781236647e-06, + "loss": 0.4032, + "step": 10086 + }, + { + "epoch": 1.313158922295978, + "grad_norm": 2.6978507041931152, + "learning_rate": 6.163382396483728e-06, + "loss": 0.5027, + "step": 10089 + }, + { + "epoch": 1.3135493947676689, + "grad_norm": 2.74324369430542, + "learning_rate": 6.161349808490351e-06, + "loss": 0.4742, + "step": 10092 + }, + { + "epoch": 1.3139398672393596, + "grad_norm": 2.520897388458252, + "learning_rate": 6.159317017611607e-06, + "loss": 0.4695, + "step": 10095 + }, + { + "epoch": 1.3143303397110504, + "grad_norm": 2.367314338684082, + "learning_rate": 6.157284024202619e-06, + "loss": 0.4521, + "step": 10098 + }, + { + "epoch": 1.3147208121827412, + "grad_norm": 2.571027994155884, + "learning_rate": 6.155250828618547e-06, + "loss": 0.4327, + "step": 10101 + }, + { + "epoch": 1.315111284654432, + "grad_norm": 2.6807379722595215, + "learning_rate": 6.153217431214583e-06, + "loss": 0.5297, + "step": 10104 + }, + { + "epoch": 1.3155017571261225, + "grad_norm": 2.974998950958252, + "learning_rate": 6.1511838323459624e-06, + "loss": 0.5418, + "step": 10107 + }, + { + "epoch": 1.3158922295978135, + "grad_norm": 2.9916090965270996, + "learning_rate": 6.149150032367946e-06, + "loss": 0.4774, + "step": 10110 + }, + { + "epoch": 1.316282702069504, + "grad_norm": 3.1814825534820557, + "learning_rate": 6.147116031635838e-06, + "loss": 0.5896, + "step": 10113 + }, + { + "epoch": 1.3166731745411948, + "grad_norm": 2.669076681137085, + "learning_rate": 6.145081830504971e-06, + "loss": 0.4869, + "step": 10116 + }, + { + "epoch": 1.3170636470128856, + "grad_norm": 2.4291157722473145, + "learning_rate": 6.1430474293307175e-06, + "loss": 0.4442, + "step": 10119 + }, + { + "epoch": 1.3174541194845764, + "grad_norm": 2.7454960346221924, + "learning_rate": 6.141012828468484e-06, + "loss": 0.506, + "step": 10122 + }, + { + "epoch": 1.3178445919562671, + "grad_norm": 2.744112730026245, + "learning_rate": 6.138978028273709e-06, + "loss": 0.5368, + "step": 10125 + }, + { + "epoch": 1.318235064427958, + "grad_norm": 2.635687828063965, + "learning_rate": 6.13694302910187e-06, + "loss": 0.4753, + "step": 10128 + }, + { + "epoch": 1.3186255368996487, + "grad_norm": 2.746645212173462, + "learning_rate": 6.134907831308473e-06, + "loss": 0.4922, + "step": 10131 + }, + { + "epoch": 1.3190160093713392, + "grad_norm": 2.9933664798736572, + "learning_rate": 6.132872435249067e-06, + "loss": 0.5043, + "step": 10134 + }, + { + "epoch": 1.31940648184303, + "grad_norm": 2.8414406776428223, + "learning_rate": 6.130836841279228e-06, + "loss": 0.4989, + "step": 10137 + }, + { + "epoch": 1.3197969543147208, + "grad_norm": 2.6284008026123047, + "learning_rate": 6.128801049754572e-06, + 
"loss": 0.5045, + "step": 10140 + }, + { + "epoch": 1.3201874267864115, + "grad_norm": 2.5800864696502686, + "learning_rate": 6.1267650610307496e-06, + "loss": 0.4773, + "step": 10143 + }, + { + "epoch": 1.3205778992581023, + "grad_norm": 3.2710795402526855, + "learning_rate": 6.1247288754634395e-06, + "loss": 0.4755, + "step": 10146 + }, + { + "epoch": 1.320968371729793, + "grad_norm": 2.9163713455200195, + "learning_rate": 6.122692493408362e-06, + "loss": 0.5315, + "step": 10149 + }, + { + "epoch": 1.3213588442014839, + "grad_norm": 3.6550145149230957, + "learning_rate": 6.120655915221268e-06, + "loss": 0.5406, + "step": 10152 + }, + { + "epoch": 1.3217493166731744, + "grad_norm": 2.446315050125122, + "learning_rate": 6.118619141257941e-06, + "loss": 0.4987, + "step": 10155 + }, + { + "epoch": 1.3221397891448654, + "grad_norm": 2.639047622680664, + "learning_rate": 6.116582171874204e-06, + "loss": 0.497, + "step": 10158 + }, + { + "epoch": 1.322530261616556, + "grad_norm": 3.0936222076416016, + "learning_rate": 6.11454500742591e-06, + "loss": 0.451, + "step": 10161 + }, + { + "epoch": 1.3229207340882467, + "grad_norm": 2.499833106994629, + "learning_rate": 6.112507648268951e-06, + "loss": 0.5365, + "step": 10164 + }, + { + "epoch": 1.3233112065599375, + "grad_norm": 2.466047763824463, + "learning_rate": 6.110470094759243e-06, + "loss": 0.4519, + "step": 10167 + }, + { + "epoch": 1.3237016790316283, + "grad_norm": 2.9250972270965576, + "learning_rate": 6.1084323472527465e-06, + "loss": 0.4602, + "step": 10170 + }, + { + "epoch": 1.324092151503319, + "grad_norm": 2.504556894302368, + "learning_rate": 6.106394406105451e-06, + "loss": 0.4688, + "step": 10173 + }, + { + "epoch": 1.3244826239750098, + "grad_norm": 3.0200345516204834, + "learning_rate": 6.104356271673379e-06, + "loss": 0.4318, + "step": 10176 + }, + { + "epoch": 1.3248730964467006, + "grad_norm": 2.429672956466675, + "learning_rate": 6.102317944312592e-06, + "loss": 0.4749, + "step": 10179 + }, + { + "epoch": 1.3252635689183911, + "grad_norm": 2.6296794414520264, + "learning_rate": 6.1002794243791774e-06, + "loss": 0.4971, + "step": 10182 + }, + { + "epoch": 1.325654041390082, + "grad_norm": 2.859323024749756, + "learning_rate": 6.098240712229263e-06, + "loss": 0.553, + "step": 10185 + }, + { + "epoch": 1.3260445138617727, + "grad_norm": 2.4780797958374023, + "learning_rate": 6.096201808219005e-06, + "loss": 0.4585, + "step": 10188 + }, + { + "epoch": 1.3264349863334635, + "grad_norm": 2.843698263168335, + "learning_rate": 6.094162712704599e-06, + "loss": 0.5146, + "step": 10191 + }, + { + "epoch": 1.3268254588051542, + "grad_norm": 2.588787317276001, + "learning_rate": 6.0921234260422675e-06, + "loss": 0.5512, + "step": 10194 + }, + { + "epoch": 1.327215931276845, + "grad_norm": 2.7030463218688965, + "learning_rate": 6.090083948588271e-06, + "loss": 0.4705, + "step": 10197 + }, + { + "epoch": 1.3276064037485358, + "grad_norm": 2.659209966659546, + "learning_rate": 6.088044280698903e-06, + "loss": 0.5182, + "step": 10200 + }, + { + "epoch": 1.3279968762202266, + "grad_norm": 2.3089091777801514, + "learning_rate": 6.086004422730487e-06, + "loss": 0.5118, + "step": 10203 + }, + { + "epoch": 1.3283873486919173, + "grad_norm": 3.2967066764831543, + "learning_rate": 6.083964375039384e-06, + "loss": 0.4932, + "step": 10206 + }, + { + "epoch": 1.3287778211636079, + "grad_norm": 3.2022552490234375, + "learning_rate": 6.081924137981984e-06, + "loss": 0.5659, + "step": 10209 + }, + { + "epoch": 1.3291682936352986, + "grad_norm": 
2.5667223930358887, + "learning_rate": 6.079883711914713e-06, + "loss": 0.4576, + "step": 10212 + }, + { + "epoch": 1.3295587661069894, + "grad_norm": 2.711458444595337, + "learning_rate": 6.07784309719403e-06, + "loss": 0.4464, + "step": 10215 + }, + { + "epoch": 1.3299492385786802, + "grad_norm": 2.9583401679992676, + "learning_rate": 6.075802294176425e-06, + "loss": 0.5373, + "step": 10218 + }, + { + "epoch": 1.330339711050371, + "grad_norm": 4.091579914093018, + "learning_rate": 6.073761303218423e-06, + "loss": 0.5137, + "step": 10221 + }, + { + "epoch": 1.3307301835220617, + "grad_norm": 2.6844708919525146, + "learning_rate": 6.071720124676579e-06, + "loss": 0.6416, + "step": 10224 + }, + { + "epoch": 1.3311206559937525, + "grad_norm": 2.5595126152038574, + "learning_rate": 6.069678758907486e-06, + "loss": 0.55, + "step": 10227 + }, + { + "epoch": 1.331511128465443, + "grad_norm": 2.9795103073120117, + "learning_rate": 6.067637206267761e-06, + "loss": 0.4621, + "step": 10230 + }, + { + "epoch": 1.331901600937134, + "grad_norm": 2.5117387771606445, + "learning_rate": 6.065595467114064e-06, + "loss": 0.4778, + "step": 10233 + }, + { + "epoch": 1.3322920734088246, + "grad_norm": 2.6409218311309814, + "learning_rate": 6.063553541803081e-06, + "loss": 0.4728, + "step": 10236 + }, + { + "epoch": 1.3326825458805154, + "grad_norm": 2.658996105194092, + "learning_rate": 6.061511430691529e-06, + "loss": 0.4789, + "step": 10239 + }, + { + "epoch": 1.3330730183522062, + "grad_norm": 2.801865339279175, + "learning_rate": 6.059469134136167e-06, + "loss": 0.4593, + "step": 10242 + }, + { + "epoch": 1.333463490823897, + "grad_norm": 2.8045008182525635, + "learning_rate": 6.057426652493773e-06, + "loss": 0.5459, + "step": 10245 + }, + { + "epoch": 1.3338539632955877, + "grad_norm": 2.777100086212158, + "learning_rate": 6.055383986121169e-06, + "loss": 0.578, + "step": 10248 + }, + { + "epoch": 1.3342444357672785, + "grad_norm": 2.5679752826690674, + "learning_rate": 6.053341135375202e-06, + "loss": 0.4892, + "step": 10251 + }, + { + "epoch": 1.3346349082389692, + "grad_norm": 2.5535635948181152, + "learning_rate": 6.051298100612755e-06, + "loss": 0.5193, + "step": 10254 + }, + { + "epoch": 1.3350253807106598, + "grad_norm": 2.5816335678100586, + "learning_rate": 6.049254882190742e-06, + "loss": 0.5621, + "step": 10257 + }, + { + "epoch": 1.3354158531823506, + "grad_norm": 2.74711275100708, + "learning_rate": 6.047211480466105e-06, + "loss": 0.5361, + "step": 10260 + }, + { + "epoch": 1.3358063256540413, + "grad_norm": 2.556450605392456, + "learning_rate": 6.045167895795829e-06, + "loss": 0.4354, + "step": 10263 + }, + { + "epoch": 1.336196798125732, + "grad_norm": 2.905630588531494, + "learning_rate": 6.043124128536919e-06, + "loss": 0.4784, + "step": 10266 + }, + { + "epoch": 1.3365872705974229, + "grad_norm": 3.139017343521118, + "learning_rate": 6.041080179046418e-06, + "loss": 0.5801, + "step": 10269 + }, + { + "epoch": 1.3369777430691137, + "grad_norm": 2.844486713409424, + "learning_rate": 6.039036047681402e-06, + "loss": 0.5141, + "step": 10272 + }, + { + "epoch": 1.3373682155408044, + "grad_norm": 2.940715789794922, + "learning_rate": 6.036991734798971e-06, + "loss": 0.5246, + "step": 10275 + }, + { + "epoch": 1.3377586880124952, + "grad_norm": 2.516834020614624, + "learning_rate": 6.034947240756268e-06, + "loss": 0.4542, + "step": 10278 + }, + { + "epoch": 1.338149160484186, + "grad_norm": 2.7946300506591797, + "learning_rate": 6.032902565910456e-06, + "loss": 0.5008, + "step": 10281 + }, + { 
+ "epoch": 1.3385396329558765, + "grad_norm": 2.5001931190490723, + "learning_rate": 6.030857710618743e-06, + "loss": 0.4844, + "step": 10284 + }, + { + "epoch": 1.3389301054275673, + "grad_norm": 2.5661776065826416, + "learning_rate": 6.0288126752383535e-06, + "loss": 0.4938, + "step": 10287 + }, + { + "epoch": 1.339320577899258, + "grad_norm": 2.576228618621826, + "learning_rate": 6.026767460126555e-06, + "loss": 0.5228, + "step": 10290 + }, + { + "epoch": 1.3397110503709488, + "grad_norm": 2.4661436080932617, + "learning_rate": 6.0247220656406415e-06, + "loss": 0.4381, + "step": 10293 + }, + { + "epoch": 1.3401015228426396, + "grad_norm": 2.5111136436462402, + "learning_rate": 6.022676492137939e-06, + "loss": 0.5015, + "step": 10296 + }, + { + "epoch": 1.3404919953143304, + "grad_norm": 2.6229360103607178, + "learning_rate": 6.020630739975803e-06, + "loss": 0.4764, + "step": 10299 + }, + { + "epoch": 1.3408824677860212, + "grad_norm": 2.8185629844665527, + "learning_rate": 6.018584809511625e-06, + "loss": 0.5273, + "step": 10302 + }, + { + "epoch": 1.3412729402577117, + "grad_norm": 2.5472960472106934, + "learning_rate": 6.0165387011028235e-06, + "loss": 0.4626, + "step": 10305 + }, + { + "epoch": 1.3416634127294027, + "grad_norm": 2.914748430252075, + "learning_rate": 6.01449241510685e-06, + "loss": 0.5434, + "step": 10308 + }, + { + "epoch": 1.3420538852010933, + "grad_norm": 2.704376220703125, + "learning_rate": 6.012445951881185e-06, + "loss": 0.5164, + "step": 10311 + }, + { + "epoch": 1.342444357672784, + "grad_norm": 2.3576114177703857, + "learning_rate": 6.010399311783343e-06, + "loss": 0.4096, + "step": 10314 + }, + { + "epoch": 1.3428348301444748, + "grad_norm": 2.456395387649536, + "learning_rate": 6.008352495170866e-06, + "loss": 0.4971, + "step": 10317 + }, + { + "epoch": 1.3432253026161656, + "grad_norm": 2.8089687824249268, + "learning_rate": 6.006305502401329e-06, + "loss": 0.4943, + "step": 10320 + }, + { + "epoch": 1.3436157750878563, + "grad_norm": 2.7782235145568848, + "learning_rate": 6.0042583338323376e-06, + "loss": 0.5548, + "step": 10323 + }, + { + "epoch": 1.3440062475595471, + "grad_norm": 2.7142183780670166, + "learning_rate": 6.002210989821528e-06, + "loss": 0.5179, + "step": 10326 + }, + { + "epoch": 1.344396720031238, + "grad_norm": 2.6159982681274414, + "learning_rate": 6.0001634707265675e-06, + "loss": 0.4984, + "step": 10329 + }, + { + "epoch": 1.3447871925029284, + "grad_norm": 2.9001638889312744, + "learning_rate": 5.998115776905152e-06, + "loss": 0.5373, + "step": 10332 + }, + { + "epoch": 1.3451776649746192, + "grad_norm": 2.7461464405059814, + "learning_rate": 5.996067908715012e-06, + "loss": 0.5562, + "step": 10335 + }, + { + "epoch": 1.34556813744631, + "grad_norm": 2.5092601776123047, + "learning_rate": 5.994019866513901e-06, + "loss": 0.5171, + "step": 10338 + }, + { + "epoch": 1.3459586099180008, + "grad_norm": 2.727618455886841, + "learning_rate": 5.991971650659612e-06, + "loss": 0.4846, + "step": 10341 + }, + { + "epoch": 1.3463490823896915, + "grad_norm": 2.589345932006836, + "learning_rate": 5.9899232615099626e-06, + "loss": 0.5663, + "step": 10344 + }, + { + "epoch": 1.3467395548613823, + "grad_norm": 3.144259452819824, + "learning_rate": 5.9878746994227996e-06, + "loss": 0.6201, + "step": 10347 + }, + { + "epoch": 1.347130027333073, + "grad_norm": 2.4682085514068604, + "learning_rate": 5.985825964756008e-06, + "loss": 0.4205, + "step": 10350 + }, + { + "epoch": 1.3475204998047636, + "grad_norm": 3.090812921524048, + "learning_rate": 
5.9837770578674925e-06, + "loss": 0.4667, + "step": 10353 + }, + { + "epoch": 1.3479109722764546, + "grad_norm": 2.575993299484253, + "learning_rate": 5.981727979115195e-06, + "loss": 0.4956, + "step": 10356 + }, + { + "epoch": 1.3483014447481452, + "grad_norm": 2.3932414054870605, + "learning_rate": 5.979678728857086e-06, + "loss": 0.4554, + "step": 10359 + }, + { + "epoch": 1.348691917219836, + "grad_norm": 2.7043232917785645, + "learning_rate": 5.977629307451162e-06, + "loss": 0.5633, + "step": 10362 + }, + { + "epoch": 1.3490823896915267, + "grad_norm": 2.7044363021850586, + "learning_rate": 5.975579715255455e-06, + "loss": 0.431, + "step": 10365 + }, + { + "epoch": 1.3494728621632175, + "grad_norm": 2.512634038925171, + "learning_rate": 5.973529952628023e-06, + "loss": 0.4995, + "step": 10368 + }, + { + "epoch": 1.3498633346349083, + "grad_norm": 3.501483917236328, + "learning_rate": 5.97148001992696e-06, + "loss": 0.4722, + "step": 10371 + }, + { + "epoch": 1.350253807106599, + "grad_norm": 2.4441893100738525, + "learning_rate": 5.969429917510378e-06, + "loss": 0.5325, + "step": 10374 + }, + { + "epoch": 1.3506442795782898, + "grad_norm": 2.3490185737609863, + "learning_rate": 5.9673796457364295e-06, + "loss": 0.456, + "step": 10377 + }, + { + "epoch": 1.3510347520499804, + "grad_norm": 3.762827157974243, + "learning_rate": 5.965329204963292e-06, + "loss": 0.5797, + "step": 10380 + }, + { + "epoch": 1.3514252245216714, + "grad_norm": 3.0791232585906982, + "learning_rate": 5.9632785955491735e-06, + "loss": 0.4798, + "step": 10383 + }, + { + "epoch": 1.351815696993362, + "grad_norm": 2.7821664810180664, + "learning_rate": 5.961227817852311e-06, + "loss": 0.5121, + "step": 10386 + }, + { + "epoch": 1.3522061694650527, + "grad_norm": 2.4701087474823, + "learning_rate": 5.959176872230969e-06, + "loss": 0.4682, + "step": 10389 + }, + { + "epoch": 1.3525966419367434, + "grad_norm": 2.5216877460479736, + "learning_rate": 5.957125759043449e-06, + "loss": 0.4486, + "step": 10392 + }, + { + "epoch": 1.3529871144084342, + "grad_norm": 2.7570204734802246, + "learning_rate": 5.955074478648068e-06, + "loss": 0.5457, + "step": 10395 + }, + { + "epoch": 1.353377586880125, + "grad_norm": 2.721740484237671, + "learning_rate": 5.9530230314031875e-06, + "loss": 0.4864, + "step": 10398 + }, + { + "epoch": 1.3537680593518158, + "grad_norm": 2.7100112438201904, + "learning_rate": 5.950971417667189e-06, + "loss": 0.4506, + "step": 10401 + }, + { + "epoch": 1.3541585318235065, + "grad_norm": 3.179159641265869, + "learning_rate": 5.948919637798482e-06, + "loss": 0.5391, + "step": 10404 + }, + { + "epoch": 1.354549004295197, + "grad_norm": 2.8454229831695557, + "learning_rate": 5.946867692155511e-06, + "loss": 0.507, + "step": 10407 + }, + { + "epoch": 1.3549394767668879, + "grad_norm": 2.766425609588623, + "learning_rate": 5.944815581096746e-06, + "loss": 0.5161, + "step": 10410 + }, + { + "epoch": 1.3553299492385786, + "grad_norm": 2.6787941455841064, + "learning_rate": 5.942763304980689e-06, + "loss": 0.4643, + "step": 10413 + }, + { + "epoch": 1.3557204217102694, + "grad_norm": 2.679385185241699, + "learning_rate": 5.940710864165863e-06, + "loss": 0.5447, + "step": 10416 + }, + { + "epoch": 1.3561108941819602, + "grad_norm": 2.693063497543335, + "learning_rate": 5.938658259010829e-06, + "loss": 0.5279, + "step": 10419 + }, + { + "epoch": 1.356501366653651, + "grad_norm": 2.5846731662750244, + "learning_rate": 5.936605489874172e-06, + "loss": 0.5074, + "step": 10422 + }, + { + "epoch": 1.3568918391253417, + 
"grad_norm": 2.7756001949310303, + "learning_rate": 5.934552557114507e-06, + "loss": 0.4502, + "step": 10425 + }, + { + "epoch": 1.3572823115970323, + "grad_norm": 2.7776410579681396, + "learning_rate": 5.932499461090475e-06, + "loss": 0.5334, + "step": 10428 + }, + { + "epoch": 1.3576727840687233, + "grad_norm": 2.835615396499634, + "learning_rate": 5.930446202160749e-06, + "loss": 0.5619, + "step": 10431 + }, + { + "epoch": 1.3580632565404138, + "grad_norm": 2.890580177307129, + "learning_rate": 5.928392780684028e-06, + "loss": 0.561, + "step": 10434 + }, + { + "epoch": 1.3584537290121046, + "grad_norm": 4.082695960998535, + "learning_rate": 5.926339197019043e-06, + "loss": 0.5714, + "step": 10437 + }, + { + "epoch": 1.3588442014837954, + "grad_norm": 2.4185142517089844, + "learning_rate": 5.924285451524549e-06, + "loss": 0.5067, + "step": 10440 + }, + { + "epoch": 1.3592346739554861, + "grad_norm": 2.5827300548553467, + "learning_rate": 5.922231544559331e-06, + "loss": 0.4823, + "step": 10443 + }, + { + "epoch": 1.359625146427177, + "grad_norm": 2.5972402095794678, + "learning_rate": 5.9201774764822e-06, + "loss": 0.5133, + "step": 10446 + }, + { + "epoch": 1.3600156188988677, + "grad_norm": 2.655304431915283, + "learning_rate": 5.918123247652002e-06, + "loss": 0.4895, + "step": 10449 + }, + { + "epoch": 1.3604060913705585, + "grad_norm": 3.1130921840667725, + "learning_rate": 5.916068858427604e-06, + "loss": 0.5255, + "step": 10452 + }, + { + "epoch": 1.360796563842249, + "grad_norm": 2.6465952396392822, + "learning_rate": 5.914014309167901e-06, + "loss": 0.5161, + "step": 10455 + }, + { + "epoch": 1.36118703631394, + "grad_norm": 2.8651537895202637, + "learning_rate": 5.911959600231825e-06, + "loss": 0.4893, + "step": 10458 + }, + { + "epoch": 1.3615775087856306, + "grad_norm": 2.6356594562530518, + "learning_rate": 5.909904731978323e-06, + "loss": 0.4806, + "step": 10461 + }, + { + "epoch": 1.3619679812573213, + "grad_norm": 2.7509796619415283, + "learning_rate": 5.90784970476638e-06, + "loss": 0.4904, + "step": 10464 + }, + { + "epoch": 1.362358453729012, + "grad_norm": 2.8054521083831787, + "learning_rate": 5.905794518955002e-06, + "loss": 0.5197, + "step": 10467 + }, + { + "epoch": 1.3627489262007029, + "grad_norm": 2.970839262008667, + "learning_rate": 5.903739174903226e-06, + "loss": 0.4888, + "step": 10470 + }, + { + "epoch": 1.3631393986723936, + "grad_norm": 2.5401625633239746, + "learning_rate": 5.901683672970118e-06, + "loss": 0.6049, + "step": 10473 + }, + { + "epoch": 1.3635298711440844, + "grad_norm": 2.5715386867523193, + "learning_rate": 5.89962801351477e-06, + "loss": 0.4562, + "step": 10476 + }, + { + "epoch": 1.3639203436157752, + "grad_norm": 2.8467347621917725, + "learning_rate": 5.897572196896301e-06, + "loss": 0.4964, + "step": 10479 + }, + { + "epoch": 1.3643108160874657, + "grad_norm": 3.585697650909424, + "learning_rate": 5.895516223473856e-06, + "loss": 0.5151, + "step": 10482 + }, + { + "epoch": 1.3647012885591565, + "grad_norm": 2.776897668838501, + "learning_rate": 5.8934600936066115e-06, + "loss": 0.5529, + "step": 10485 + }, + { + "epoch": 1.3650917610308473, + "grad_norm": 2.829629421234131, + "learning_rate": 5.891403807653768e-06, + "loss": 0.5388, + "step": 10488 + }, + { + "epoch": 1.365482233502538, + "grad_norm": 2.7404351234436035, + "learning_rate": 5.889347365974554e-06, + "loss": 0.4966, + "step": 10491 + }, + { + "epoch": 1.3658727059742288, + "grad_norm": 2.4462528228759766, + "learning_rate": 5.887290768928228e-06, + "loss": 0.5372, + 
"step": 10494 + }, + { + "epoch": 1.3662631784459196, + "grad_norm": 2.7154576778411865, + "learning_rate": 5.88523401687407e-06, + "loss": 0.586, + "step": 10497 + }, + { + "epoch": 1.3666536509176104, + "grad_norm": 3.031604766845703, + "learning_rate": 5.883177110171392e-06, + "loss": 0.5067, + "step": 10500 + }, + { + "epoch": 1.367044123389301, + "grad_norm": 4.211544513702393, + "learning_rate": 5.881120049179529e-06, + "loss": 0.5154, + "step": 10503 + }, + { + "epoch": 1.367434595860992, + "grad_norm": 2.9091553688049316, + "learning_rate": 5.8790628342578485e-06, + "loss": 0.4859, + "step": 10506 + }, + { + "epoch": 1.3678250683326825, + "grad_norm": 2.4611270427703857, + "learning_rate": 5.87700546576574e-06, + "loss": 0.4909, + "step": 10509 + }, + { + "epoch": 1.3682155408043732, + "grad_norm": 2.56895112991333, + "learning_rate": 5.874947944062621e-06, + "loss": 0.491, + "step": 10512 + }, + { + "epoch": 1.368606013276064, + "grad_norm": 2.8200464248657227, + "learning_rate": 5.872890269507938e-06, + "loss": 0.4582, + "step": 10515 + }, + { + "epoch": 1.3689964857477548, + "grad_norm": 2.5796611309051514, + "learning_rate": 5.870832442461161e-06, + "loss": 0.432, + "step": 10518 + }, + { + "epoch": 1.3693869582194456, + "grad_norm": 2.872462034225464, + "learning_rate": 5.868774463281788e-06, + "loss": 0.5382, + "step": 10521 + }, + { + "epoch": 1.3697774306911363, + "grad_norm": 2.5397450923919678, + "learning_rate": 5.866716332329343e-06, + "loss": 0.4294, + "step": 10524 + }, + { + "epoch": 1.370167903162827, + "grad_norm": 2.746539831161499, + "learning_rate": 5.8646580499633786e-06, + "loss": 0.5046, + "step": 10527 + }, + { + "epoch": 1.3705583756345177, + "grad_norm": 2.5908896923065186, + "learning_rate": 5.862599616543473e-06, + "loss": 0.5438, + "step": 10530 + }, + { + "epoch": 1.3709488481062087, + "grad_norm": 3.0889134407043457, + "learning_rate": 5.860541032429227e-06, + "loss": 0.5459, + "step": 10533 + }, + { + "epoch": 1.3713393205778992, + "grad_norm": 2.655442953109741, + "learning_rate": 5.858482297980275e-06, + "loss": 0.4929, + "step": 10536 + }, + { + "epoch": 1.37172979304959, + "grad_norm": 2.756154775619507, + "learning_rate": 5.856423413556269e-06, + "loss": 0.4623, + "step": 10539 + }, + { + "epoch": 1.3721202655212807, + "grad_norm": 2.576265573501587, + "learning_rate": 5.854364379516896e-06, + "loss": 0.5525, + "step": 10542 + }, + { + "epoch": 1.3725107379929715, + "grad_norm": 2.8419313430786133, + "learning_rate": 5.852305196221864e-06, + "loss": 0.5179, + "step": 10545 + }, + { + "epoch": 1.3729012104646623, + "grad_norm": 2.7825493812561035, + "learning_rate": 5.8502458640309055e-06, + "loss": 0.4992, + "step": 10548 + }, + { + "epoch": 1.373291682936353, + "grad_norm": 2.590987205505371, + "learning_rate": 5.8481863833037846e-06, + "loss": 0.489, + "step": 10551 + }, + { + "epoch": 1.3736821554080438, + "grad_norm": 2.5935583114624023, + "learning_rate": 5.846126754400285e-06, + "loss": 0.5009, + "step": 10554 + }, + { + "epoch": 1.3740726278797344, + "grad_norm": 3.053290605545044, + "learning_rate": 5.844066977680223e-06, + "loss": 0.4684, + "step": 10557 + }, + { + "epoch": 1.3744631003514252, + "grad_norm": 3.1412622928619385, + "learning_rate": 5.842007053503436e-06, + "loss": 0.5036, + "step": 10560 + }, + { + "epoch": 1.374853572823116, + "grad_norm": 2.869248390197754, + "learning_rate": 5.839946982229786e-06, + "loss": 0.432, + "step": 10563 + }, + { + "epoch": 1.3752440452948067, + "grad_norm": 2.681100606918335, + 
"learning_rate": 5.8378867642191675e-06, + "loss": 0.5079, + "step": 10566 + }, + { + "epoch": 1.3756345177664975, + "grad_norm": 2.4988043308258057, + "learning_rate": 5.835826399831492e-06, + "loss": 0.4901, + "step": 10569 + }, + { + "epoch": 1.3760249902381883, + "grad_norm": 2.498563766479492, + "learning_rate": 5.833765889426706e-06, + "loss": 0.4638, + "step": 10572 + }, + { + "epoch": 1.376415462709879, + "grad_norm": 2.5112462043762207, + "learning_rate": 5.831705233364768e-06, + "loss": 0.4716, + "step": 10575 + }, + { + "epoch": 1.3768059351815696, + "grad_norm": 2.4168717861175537, + "learning_rate": 5.82964443200568e-06, + "loss": 0.476, + "step": 10578 + }, + { + "epoch": 1.3771964076532606, + "grad_norm": 2.5891149044036865, + "learning_rate": 5.827583485709453e-06, + "loss": 0.5093, + "step": 10581 + }, + { + "epoch": 1.3775868801249511, + "grad_norm": 2.5921523571014404, + "learning_rate": 5.825522394836132e-06, + "loss": 0.5532, + "step": 10584 + }, + { + "epoch": 1.377977352596642, + "grad_norm": 2.529782295227051, + "learning_rate": 5.823461159745786e-06, + "loss": 0.4955, + "step": 10587 + }, + { + "epoch": 1.3783678250683327, + "grad_norm": 2.8079707622528076, + "learning_rate": 5.821399780798507e-06, + "loss": 0.5489, + "step": 10590 + }, + { + "epoch": 1.3787582975400234, + "grad_norm": 3.1426773071289062, + "learning_rate": 5.8193382583544155e-06, + "loss": 0.478, + "step": 10593 + }, + { + "epoch": 1.3791487700117142, + "grad_norm": 2.480963706970215, + "learning_rate": 5.817276592773651e-06, + "loss": 0.4766, + "step": 10596 + }, + { + "epoch": 1.379539242483405, + "grad_norm": 2.4679062366485596, + "learning_rate": 5.815214784416386e-06, + "loss": 0.5077, + "step": 10599 + }, + { + "epoch": 1.3799297149550958, + "grad_norm": 2.7252042293548584, + "learning_rate": 5.813152833642816e-06, + "loss": 0.518, + "step": 10602 + }, + { + "epoch": 1.3803201874267863, + "grad_norm": 2.4545648097991943, + "learning_rate": 5.811090740813154e-06, + "loss": 0.4882, + "step": 10605 + }, + { + "epoch": 1.380710659898477, + "grad_norm": 2.6716935634613037, + "learning_rate": 5.809028506287647e-06, + "loss": 0.5061, + "step": 10608 + }, + { + "epoch": 1.3811011323701678, + "grad_norm": 2.5040860176086426, + "learning_rate": 5.806966130426561e-06, + "loss": 0.4739, + "step": 10611 + }, + { + "epoch": 1.3814916048418586, + "grad_norm": 2.5503172874450684, + "learning_rate": 5.80490361359019e-06, + "loss": 0.5359, + "step": 10614 + }, + { + "epoch": 1.3818820773135494, + "grad_norm": 2.5785112380981445, + "learning_rate": 5.802840956138851e-06, + "loss": 0.4595, + "step": 10617 + }, + { + "epoch": 1.3822725497852402, + "grad_norm": 3.1451265811920166, + "learning_rate": 5.800778158432886e-06, + "loss": 0.4806, + "step": 10620 + }, + { + "epoch": 1.382663022256931, + "grad_norm": 2.8773903846740723, + "learning_rate": 5.798715220832661e-06, + "loss": 0.5106, + "step": 10623 + }, + { + "epoch": 1.3830534947286217, + "grad_norm": 2.5292470455169678, + "learning_rate": 5.796652143698568e-06, + "loss": 0.4749, + "step": 10626 + }, + { + "epoch": 1.3834439672003125, + "grad_norm": 2.4957058429718018, + "learning_rate": 5.7945889273910215e-06, + "loss": 0.5352, + "step": 10629 + }, + { + "epoch": 1.383834439672003, + "grad_norm": 2.4294910430908203, + "learning_rate": 5.79252557227046e-06, + "loss": 0.5267, + "step": 10632 + }, + { + "epoch": 1.3842249121436938, + "grad_norm": 3.0004422664642334, + "learning_rate": 5.7904620786973476e-06, + "loss": 0.4788, + "step": 10635 + }, + { + 
"epoch": 1.3846153846153846, + "grad_norm": 2.777956962585449, + "learning_rate": 5.788398447032174e-06, + "loss": 0.5147, + "step": 10638 + }, + { + "epoch": 1.3850058570870754, + "grad_norm": 3.005110263824463, + "learning_rate": 5.78633467763545e-06, + "loss": 0.5815, + "step": 10641 + }, + { + "epoch": 1.3853963295587661, + "grad_norm": 2.7683937549591064, + "learning_rate": 5.784270770867713e-06, + "loss": 0.5067, + "step": 10644 + }, + { + "epoch": 1.385786802030457, + "grad_norm": 2.3799400329589844, + "learning_rate": 5.782206727089521e-06, + "loss": 0.4352, + "step": 10647 + }, + { + "epoch": 1.3861772745021477, + "grad_norm": 2.8313100337982178, + "learning_rate": 5.780142546661461e-06, + "loss": 0.4987, + "step": 10650 + }, + { + "epoch": 1.3865677469738382, + "grad_norm": 2.4416072368621826, + "learning_rate": 5.778078229944137e-06, + "loss": 0.5242, + "step": 10653 + }, + { + "epoch": 1.3869582194455292, + "grad_norm": 4.677280426025391, + "learning_rate": 5.776013777298184e-06, + "loss": 0.5669, + "step": 10656 + }, + { + "epoch": 1.3873486919172198, + "grad_norm": 2.549098014831543, + "learning_rate": 5.7739491890842556e-06, + "loss": 0.4694, + "step": 10659 + }, + { + "epoch": 1.3877391643889105, + "grad_norm": 2.7043087482452393, + "learning_rate": 5.771884465663033e-06, + "loss": 0.5064, + "step": 10662 + }, + { + "epoch": 1.3881296368606013, + "grad_norm": 2.652681350708008, + "learning_rate": 5.7698196073952195e-06, + "loss": 0.4974, + "step": 10665 + }, + { + "epoch": 1.388520109332292, + "grad_norm": 2.560001850128174, + "learning_rate": 5.767754614641538e-06, + "loss": 0.5313, + "step": 10668 + }, + { + "epoch": 1.3889105818039829, + "grad_norm": 2.911827325820923, + "learning_rate": 5.76568948776274e-06, + "loss": 0.5815, + "step": 10671 + }, + { + "epoch": 1.3893010542756736, + "grad_norm": 3.264103889465332, + "learning_rate": 5.763624227119602e-06, + "loss": 0.5138, + "step": 10674 + }, + { + "epoch": 1.3896915267473644, + "grad_norm": 2.7543458938598633, + "learning_rate": 5.761558833072915e-06, + "loss": 0.5643, + "step": 10677 + }, + { + "epoch": 1.390081999219055, + "grad_norm": 2.639909505844116, + "learning_rate": 5.759493305983504e-06, + "loss": 0.5443, + "step": 10680 + }, + { + "epoch": 1.3904724716907457, + "grad_norm": 3.0217959880828857, + "learning_rate": 5.757427646212208e-06, + "loss": 0.5419, + "step": 10683 + }, + { + "epoch": 1.3908629441624365, + "grad_norm": 2.6086668968200684, + "learning_rate": 5.755361854119898e-06, + "loss": 0.5186, + "step": 10686 + }, + { + "epoch": 1.3912534166341273, + "grad_norm": 2.5692412853240967, + "learning_rate": 5.753295930067461e-06, + "loss": 0.439, + "step": 10689 + }, + { + "epoch": 1.391643889105818, + "grad_norm": 2.9494736194610596, + "learning_rate": 5.751229874415808e-06, + "loss": 0.4788, + "step": 10692 + }, + { + "epoch": 1.3920343615775088, + "grad_norm": 4.0618896484375, + "learning_rate": 5.749163687525878e-06, + "loss": 0.4985, + "step": 10695 + }, + { + "epoch": 1.3924248340491996, + "grad_norm": 2.8040335178375244, + "learning_rate": 5.747097369758626e-06, + "loss": 0.5112, + "step": 10698 + }, + { + "epoch": 1.3928153065208901, + "grad_norm": 3.7752537727355957, + "learning_rate": 5.745030921475036e-06, + "loss": 0.5142, + "step": 10701 + }, + { + "epoch": 1.3932057789925811, + "grad_norm": 2.545715093612671, + "learning_rate": 5.742964343036111e-06, + "loss": 0.4986, + "step": 10704 + }, + { + "epoch": 1.3935962514642717, + "grad_norm": 3.0850815773010254, + "learning_rate": 
5.74089763480288e-06, + "loss": 0.4749, + "step": 10707 + }, + { + "epoch": 1.3939867239359625, + "grad_norm": 2.574460029602051, + "learning_rate": 5.738830797136389e-06, + "loss": 0.5006, + "step": 10710 + }, + { + "epoch": 1.3943771964076532, + "grad_norm": 2.6434524059295654, + "learning_rate": 5.736763830397713e-06, + "loss": 0.4904, + "step": 10713 + }, + { + "epoch": 1.394767668879344, + "grad_norm": 2.6245415210723877, + "learning_rate": 5.734696734947946e-06, + "loss": 0.4879, + "step": 10716 + }, + { + "epoch": 1.3951581413510348, + "grad_norm": 2.7214295864105225, + "learning_rate": 5.732629511148204e-06, + "loss": 0.5052, + "step": 10719 + }, + { + "epoch": 1.3955486138227255, + "grad_norm": 2.8227062225341797, + "learning_rate": 5.730562159359628e-06, + "loss": 0.5524, + "step": 10722 + }, + { + "epoch": 1.3959390862944163, + "grad_norm": 4.676625728607178, + "learning_rate": 5.728494679943378e-06, + "loss": 0.5505, + "step": 10725 + }, + { + "epoch": 1.3963295587661069, + "grad_norm": 3.0134570598602295, + "learning_rate": 5.726427073260641e-06, + "loss": 0.4246, + "step": 10728 + }, + { + "epoch": 1.3967200312377979, + "grad_norm": 2.913252353668213, + "learning_rate": 5.7243593396726235e-06, + "loss": 0.5162, + "step": 10731 + }, + { + "epoch": 1.3971105037094884, + "grad_norm": 2.4458465576171875, + "learning_rate": 5.722291479540552e-06, + "loss": 0.496, + "step": 10734 + }, + { + "epoch": 1.3975009761811792, + "grad_norm": 2.5969252586364746, + "learning_rate": 5.72022349322568e-06, + "loss": 0.5335, + "step": 10737 + }, + { + "epoch": 1.39789144865287, + "grad_norm": 2.6486926078796387, + "learning_rate": 5.7181553810892785e-06, + "loss": 0.4731, + "step": 10740 + }, + { + "epoch": 1.3982819211245607, + "grad_norm": 2.3636739253997803, + "learning_rate": 5.716087143492643e-06, + "loss": 0.4643, + "step": 10743 + }, + { + "epoch": 1.3986723935962515, + "grad_norm": 2.5083768367767334, + "learning_rate": 5.714018780797088e-06, + "loss": 0.4983, + "step": 10746 + }, + { + "epoch": 1.3990628660679423, + "grad_norm": 2.5721659660339355, + "learning_rate": 5.7119502933639545e-06, + "loss": 0.5938, + "step": 10749 + }, + { + "epoch": 1.399453338539633, + "grad_norm": 2.918208360671997, + "learning_rate": 5.709881681554604e-06, + "loss": 0.4612, + "step": 10752 + }, + { + "epoch": 1.3998438110113236, + "grad_norm": 2.538756847381592, + "learning_rate": 5.7078129457304165e-06, + "loss": 0.4997, + "step": 10755 + }, + { + "epoch": 1.4002342834830144, + "grad_norm": 2.635209798812866, + "learning_rate": 5.7057440862527965e-06, + "loss": 0.4774, + "step": 10758 + }, + { + "epoch": 1.4006247559547051, + "grad_norm": 2.62276554107666, + "learning_rate": 5.70367510348317e-06, + "loss": 0.4467, + "step": 10761 + }, + { + "epoch": 1.401015228426396, + "grad_norm": 2.6300337314605713, + "learning_rate": 5.7016059977829805e-06, + "loss": 0.4499, + "step": 10764 + }, + { + "epoch": 1.4014057008980867, + "grad_norm": 2.4392457008361816, + "learning_rate": 5.6995367695137e-06, + "loss": 0.5159, + "step": 10767 + }, + { + "epoch": 1.4017961733697775, + "grad_norm": 2.7532896995544434, + "learning_rate": 5.697467419036819e-06, + "loss": 0.5695, + "step": 10770 + }, + { + "epoch": 1.4021866458414682, + "grad_norm": 3.3758609294891357, + "learning_rate": 5.695397946713848e-06, + "loss": 0.5062, + "step": 10773 + }, + { + "epoch": 1.4025771183131588, + "grad_norm": 2.9240944385528564, + "learning_rate": 5.6933283529063165e-06, + "loss": 0.4811, + "step": 10776 + }, + { + "epoch": 
1.4029675907848498, + "grad_norm": 3.1382761001586914, + "learning_rate": 5.691258637975781e-06, + "loss": 0.506, + "step": 10779 + }, + { + "epoch": 1.4033580632565403, + "grad_norm": 2.694122314453125, + "learning_rate": 5.689188802283816e-06, + "loss": 0.5352, + "step": 10782 + }, + { + "epoch": 1.403748535728231, + "grad_norm": 2.632483720779419, + "learning_rate": 5.687118846192015e-06, + "loss": 0.4797, + "step": 10785 + }, + { + "epoch": 1.4041390081999219, + "grad_norm": 2.6875948905944824, + "learning_rate": 5.685048770062e-06, + "loss": 0.5539, + "step": 10788 + }, + { + "epoch": 1.4045294806716127, + "grad_norm": 2.9431047439575195, + "learning_rate": 5.682978574255404e-06, + "loss": 0.5159, + "step": 10791 + }, + { + "epoch": 1.4049199531433034, + "grad_norm": 2.7504560947418213, + "learning_rate": 5.680908259133889e-06, + "loss": 0.4741, + "step": 10794 + }, + { + "epoch": 1.4053104256149942, + "grad_norm": 2.588517189025879, + "learning_rate": 5.678837825059134e-06, + "loss": 0.5162, + "step": 10797 + }, + { + "epoch": 1.405700898086685, + "grad_norm": 2.9769070148468018, + "learning_rate": 5.676767272392837e-06, + "loss": 0.5516, + "step": 10800 + }, + { + "epoch": 1.4060913705583755, + "grad_norm": 3.6146578788757324, + "learning_rate": 5.674696601496723e-06, + "loss": 0.4143, + "step": 10803 + }, + { + "epoch": 1.4064818430300665, + "grad_norm": 2.6829850673675537, + "learning_rate": 5.672625812732531e-06, + "loss": 0.4618, + "step": 10806 + }, + { + "epoch": 1.406872315501757, + "grad_norm": 2.5348923206329346, + "learning_rate": 5.670554906462024e-06, + "loss": 0.4398, + "step": 10809 + }, + { + "epoch": 1.4072627879734478, + "grad_norm": 2.8809609413146973, + "learning_rate": 5.668483883046987e-06, + "loss": 0.438, + "step": 10812 + }, + { + "epoch": 1.4076532604451386, + "grad_norm": 2.6564998626708984, + "learning_rate": 5.666412742849222e-06, + "loss": 0.4728, + "step": 10815 + }, + { + "epoch": 1.4080437329168294, + "grad_norm": 2.5094847679138184, + "learning_rate": 5.664341486230552e-06, + "loss": 0.5797, + "step": 10818 + }, + { + "epoch": 1.4084342053885202, + "grad_norm": 4.3344268798828125, + "learning_rate": 5.662270113552822e-06, + "loss": 0.4779, + "step": 10821 + }, + { + "epoch": 1.408824677860211, + "grad_norm": 2.705709218978882, + "learning_rate": 5.660198625177897e-06, + "loss": 0.5035, + "step": 10824 + }, + { + "epoch": 1.4092151503319017, + "grad_norm": 2.7563042640686035, + "learning_rate": 5.65812702146766e-06, + "loss": 0.4518, + "step": 10827 + }, + { + "epoch": 1.4096056228035923, + "grad_norm": 2.515812635421753, + "learning_rate": 5.656055302784017e-06, + "loss": 0.4553, + "step": 10830 + }, + { + "epoch": 1.409996095275283, + "grad_norm": 2.5328338146209717, + "learning_rate": 5.653983469488893e-06, + "loss": 0.467, + "step": 10833 + }, + { + "epoch": 1.4103865677469738, + "grad_norm": 2.820492744445801, + "learning_rate": 5.651911521944233e-06, + "loss": 0.5257, + "step": 10836 + }, + { + "epoch": 1.4107770402186646, + "grad_norm": 3.2355074882507324, + "learning_rate": 5.649839460512001e-06, + "loss": 0.4985, + "step": 10839 + }, + { + "epoch": 1.4111675126903553, + "grad_norm": 3.2596559524536133, + "learning_rate": 5.647767285554183e-06, + "loss": 0.5677, + "step": 10842 + }, + { + "epoch": 1.4115579851620461, + "grad_norm": 2.754706382751465, + "learning_rate": 5.645694997432783e-06, + "loss": 0.4839, + "step": 10845 + }, + { + "epoch": 1.4119484576337369, + "grad_norm": 2.3893485069274902, + "learning_rate": 5.643622596509823e-06, + 
"loss": 0.4494, + "step": 10848 + }, + { + "epoch": 1.4123389301054274, + "grad_norm": 2.934878349304199, + "learning_rate": 5.641550083147352e-06, + "loss": 0.4586, + "step": 10851 + }, + { + "epoch": 1.4127294025771184, + "grad_norm": 2.4555704593658447, + "learning_rate": 5.639477457707431e-06, + "loss": 0.5323, + "step": 10854 + }, + { + "epoch": 1.413119875048809, + "grad_norm": 2.666161298751831, + "learning_rate": 5.6374047205521424e-06, + "loss": 0.4899, + "step": 10857 + }, + { + "epoch": 1.4135103475204998, + "grad_norm": 2.8357090950012207, + "learning_rate": 5.6353318720435905e-06, + "loss": 0.5637, + "step": 10860 + }, + { + "epoch": 1.4139008199921905, + "grad_norm": 2.9021785259246826, + "learning_rate": 5.633258912543897e-06, + "loss": 0.5518, + "step": 10863 + }, + { + "epoch": 1.4142912924638813, + "grad_norm": 2.5638427734375, + "learning_rate": 5.631185842415203e-06, + "loss": 0.448, + "step": 10866 + }, + { + "epoch": 1.414681764935572, + "grad_norm": 2.585672378540039, + "learning_rate": 5.62911266201967e-06, + "loss": 0.5891, + "step": 10869 + }, + { + "epoch": 1.4150722374072628, + "grad_norm": 2.641378164291382, + "learning_rate": 5.62703937171948e-06, + "loss": 0.4652, + "step": 10872 + }, + { + "epoch": 1.4154627098789536, + "grad_norm": 2.626474618911743, + "learning_rate": 5.62496597187683e-06, + "loss": 0.5193, + "step": 10875 + }, + { + "epoch": 1.4158531823506442, + "grad_norm": 2.8795952796936035, + "learning_rate": 5.622892462853938e-06, + "loss": 0.4486, + "step": 10878 + }, + { + "epoch": 1.4162436548223352, + "grad_norm": 3.0229554176330566, + "learning_rate": 5.620818845013046e-06, + "loss": 0.501, + "step": 10881 + }, + { + "epoch": 1.4166341272940257, + "grad_norm": 2.5767412185668945, + "learning_rate": 5.618745118716406e-06, + "loss": 0.462, + "step": 10884 + }, + { + "epoch": 1.4170245997657165, + "grad_norm": 2.8493659496307373, + "learning_rate": 5.616671284326296e-06, + "loss": 0.5408, + "step": 10887 + }, + { + "epoch": 1.4174150722374073, + "grad_norm": 2.6280994415283203, + "learning_rate": 5.6145973422050085e-06, + "loss": 0.5258, + "step": 10890 + }, + { + "epoch": 1.417805544709098, + "grad_norm": 2.548902988433838, + "learning_rate": 5.61252329271486e-06, + "loss": 0.4983, + "step": 10893 + }, + { + "epoch": 1.4181960171807888, + "grad_norm": 2.673133373260498, + "learning_rate": 5.6104491362181805e-06, + "loss": 0.6353, + "step": 10896 + }, + { + "epoch": 1.4185864896524796, + "grad_norm": 2.571382999420166, + "learning_rate": 5.60837487307732e-06, + "loss": 0.4516, + "step": 10899 + }, + { + "epoch": 1.4189769621241703, + "grad_norm": 3.19085693359375, + "learning_rate": 5.60630050365465e-06, + "loss": 0.5573, + "step": 10902 + }, + { + "epoch": 1.419367434595861, + "grad_norm": 2.6122026443481445, + "learning_rate": 5.604226028312558e-06, + "loss": 0.534, + "step": 10905 + }, + { + "epoch": 1.4197579070675517, + "grad_norm": 4.08894157409668, + "learning_rate": 5.602151447413449e-06, + "loss": 0.5676, + "step": 10908 + }, + { + "epoch": 1.4201483795392424, + "grad_norm": 2.857332229614258, + "learning_rate": 5.600076761319748e-06, + "loss": 0.5727, + "step": 10911 + }, + { + "epoch": 1.4205388520109332, + "grad_norm": 5.386052131652832, + "learning_rate": 5.5980019703939006e-06, + "loss": 0.4778, + "step": 10914 + }, + { + "epoch": 1.420929324482624, + "grad_norm": 2.5507993698120117, + "learning_rate": 5.595927074998367e-06, + "loss": 0.5343, + "step": 10917 + }, + { + "epoch": 1.4213197969543148, + "grad_norm": 3.1667394638061523, + 
"learning_rate": 5.593852075495627e-06, + "loss": 0.4644, + "step": 10920 + }, + { + "epoch": 1.4217102694260055, + "grad_norm": 2.6564674377441406, + "learning_rate": 5.591776972248178e-06, + "loss": 0.465, + "step": 10923 + }, + { + "epoch": 1.422100741897696, + "grad_norm": 2.5531013011932373, + "learning_rate": 5.589701765618539e-06, + "loss": 0.4859, + "step": 10926 + }, + { + "epoch": 1.422491214369387, + "grad_norm": 2.634110689163208, + "learning_rate": 5.587626455969238e-06, + "loss": 0.5084, + "step": 10929 + }, + { + "epoch": 1.4228816868410776, + "grad_norm": 2.458155632019043, + "learning_rate": 5.5855510436628345e-06, + "loss": 0.4223, + "step": 10932 + }, + { + "epoch": 1.4232721593127684, + "grad_norm": 2.6002464294433594, + "learning_rate": 5.583475529061895e-06, + "loss": 0.607, + "step": 10935 + }, + { + "epoch": 1.4236626317844592, + "grad_norm": 2.449547290802002, + "learning_rate": 5.581399912529008e-06, + "loss": 0.4332, + "step": 10938 + }, + { + "epoch": 1.42405310425615, + "grad_norm": 2.828279972076416, + "learning_rate": 5.579324194426779e-06, + "loss": 0.5483, + "step": 10941 + }, + { + "epoch": 1.4244435767278407, + "grad_norm": 2.5671331882476807, + "learning_rate": 5.577248375117832e-06, + "loss": 0.5026, + "step": 10944 + }, + { + "epoch": 1.4248340491995315, + "grad_norm": 2.9858782291412354, + "learning_rate": 5.575172454964808e-06, + "loss": 0.5081, + "step": 10947 + }, + { + "epoch": 1.4252245216712223, + "grad_norm": 2.493577003479004, + "learning_rate": 5.573096434330366e-06, + "loss": 0.4517, + "step": 10950 + }, + { + "epoch": 1.4256149941429128, + "grad_norm": 2.619349718093872, + "learning_rate": 5.571020313577183e-06, + "loss": 0.5067, + "step": 10953 + }, + { + "epoch": 1.4260054666146036, + "grad_norm": 2.6694607734680176, + "learning_rate": 5.5689440930679514e-06, + "loss": 0.5137, + "step": 10956 + }, + { + "epoch": 1.4263959390862944, + "grad_norm": 2.9713809490203857, + "learning_rate": 5.566867773165386e-06, + "loss": 0.5501, + "step": 10959 + }, + { + "epoch": 1.4267864115579851, + "grad_norm": 2.9507107734680176, + "learning_rate": 5.564791354232211e-06, + "loss": 0.5056, + "step": 10962 + }, + { + "epoch": 1.427176884029676, + "grad_norm": 3.161423683166504, + "learning_rate": 5.562714836631175e-06, + "loss": 0.461, + "step": 10965 + }, + { + "epoch": 1.4275673565013667, + "grad_norm": 2.630687713623047, + "learning_rate": 5.560638220725042e-06, + "loss": 0.4621, + "step": 10968 + }, + { + "epoch": 1.4279578289730575, + "grad_norm": 2.7884042263031006, + "learning_rate": 5.55856150687659e-06, + "loss": 0.5301, + "step": 10971 + }, + { + "epoch": 1.4283483014447482, + "grad_norm": 2.516737222671509, + "learning_rate": 5.5564846954486184e-06, + "loss": 0.4985, + "step": 10974 + }, + { + "epoch": 1.428738773916439, + "grad_norm": 2.5410304069519043, + "learning_rate": 5.55440778680394e-06, + "loss": 0.3917, + "step": 10977 + }, + { + "epoch": 1.4291292463881295, + "grad_norm": 2.4755969047546387, + "learning_rate": 5.552330781305389e-06, + "loss": 0.4808, + "step": 10980 + }, + { + "epoch": 1.4295197188598203, + "grad_norm": 3.0844016075134277, + "learning_rate": 5.550253679315812e-06, + "loss": 0.4758, + "step": 10983 + }, + { + "epoch": 1.429910191331511, + "grad_norm": 2.555314779281616, + "learning_rate": 5.548176481198075e-06, + "loss": 0.4533, + "step": 10986 + }, + { + "epoch": 1.4303006638032019, + "grad_norm": 3.199594736099243, + "learning_rate": 5.5460991873150605e-06, + "loss": 0.5575, + "step": 10989 + }, + { + "epoch": 
1.4306911362748926, + "grad_norm": 2.5074615478515625, + "learning_rate": 5.544021798029665e-06, + "loss": 0.4556, + "step": 10992 + }, + { + "epoch": 1.4310816087465834, + "grad_norm": 2.9451918601989746, + "learning_rate": 5.541944313704807e-06, + "loss": 0.5797, + "step": 10995 + }, + { + "epoch": 1.4314720812182742, + "grad_norm": 2.5799622535705566, + "learning_rate": 5.539866734703416e-06, + "loss": 0.4921, + "step": 10998 + }, + { + "epoch": 1.4318625536899647, + "grad_norm": 2.503661870956421, + "learning_rate": 5.537789061388445e-06, + "loss": 0.5223, + "step": 11001 + }, + { + "epoch": 1.4322530261616557, + "grad_norm": 2.756873846054077, + "learning_rate": 5.535711294122854e-06, + "loss": 0.5098, + "step": 11004 + }, + { + "epoch": 1.4326434986333463, + "grad_norm": 2.3765101432800293, + "learning_rate": 5.533633433269627e-06, + "loss": 0.5223, + "step": 11007 + }, + { + "epoch": 1.433033971105037, + "grad_norm": 2.8661093711853027, + "learning_rate": 5.531555479191764e-06, + "loss": 0.5067, + "step": 11010 + }, + { + "epoch": 1.4334244435767278, + "grad_norm": 2.547419309616089, + "learning_rate": 5.529477432252275e-06, + "loss": 0.4815, + "step": 11013 + }, + { + "epoch": 1.4338149160484186, + "grad_norm": 2.4830684661865234, + "learning_rate": 5.527399292814193e-06, + "loss": 0.501, + "step": 11016 + }, + { + "epoch": 1.4342053885201094, + "grad_norm": 2.8555281162261963, + "learning_rate": 5.525321061240563e-06, + "loss": 0.4946, + "step": 11019 + }, + { + "epoch": 1.4345958609918001, + "grad_norm": 2.6124064922332764, + "learning_rate": 5.523242737894451e-06, + "loss": 0.4766, + "step": 11022 + }, + { + "epoch": 1.434986333463491, + "grad_norm": 2.4766440391540527, + "learning_rate": 5.521164323138931e-06, + "loss": 0.4742, + "step": 11025 + }, + { + "epoch": 1.4353768059351815, + "grad_norm": 2.639648199081421, + "learning_rate": 5.519085817337101e-06, + "loss": 0.4725, + "step": 11028 + }, + { + "epoch": 1.4357672784068722, + "grad_norm": 2.593116521835327, + "learning_rate": 5.517007220852072e-06, + "loss": 0.4893, + "step": 11031 + }, + { + "epoch": 1.436157750878563, + "grad_norm": 2.823061466217041, + "learning_rate": 5.514928534046968e-06, + "loss": 0.5143, + "step": 11034 + }, + { + "epoch": 1.4365482233502538, + "grad_norm": 2.7042064666748047, + "learning_rate": 5.512849757284932e-06, + "loss": 0.5257, + "step": 11037 + }, + { + "epoch": 1.4369386958219446, + "grad_norm": 2.685746431350708, + "learning_rate": 5.510770890929122e-06, + "loss": 0.5436, + "step": 11040 + }, + { + "epoch": 1.4373291682936353, + "grad_norm": 2.78102970123291, + "learning_rate": 5.5086919353427124e-06, + "loss": 0.5199, + "step": 11043 + }, + { + "epoch": 1.437719640765326, + "grad_norm": 2.8201215267181396, + "learning_rate": 5.506612890888892e-06, + "loss": 0.6273, + "step": 11046 + }, + { + "epoch": 1.4381101132370167, + "grad_norm": 2.8764889240264893, + "learning_rate": 5.5045337579308654e-06, + "loss": 0.4641, + "step": 11049 + }, + { + "epoch": 1.4385005857087076, + "grad_norm": 4.958584308624268, + "learning_rate": 5.502454536831854e-06, + "loss": 0.484, + "step": 11052 + }, + { + "epoch": 1.4388910581803982, + "grad_norm": 2.74894380569458, + "learning_rate": 5.5003752279550905e-06, + "loss": 0.5001, + "step": 11055 + }, + { + "epoch": 1.439281530652089, + "grad_norm": 3.1531951427459717, + "learning_rate": 5.498295831663827e-06, + "loss": 0.4416, + "step": 11058 + }, + { + "epoch": 1.4396720031237797, + "grad_norm": 2.578901767730713, + "learning_rate": 5.496216348321329e-06, + 
"loss": 0.4324, + "step": 11061 + }, + { + "epoch": 1.4400624755954705, + "grad_norm": 2.5892860889434814, + "learning_rate": 5.49413677829088e-06, + "loss": 0.4925, + "step": 11064 + }, + { + "epoch": 1.4404529480671613, + "grad_norm": 2.979255199432373, + "learning_rate": 5.492057121935777e-06, + "loss": 0.5162, + "step": 11067 + }, + { + "epoch": 1.440843420538852, + "grad_norm": 2.781018018722534, + "learning_rate": 5.489977379619328e-06, + "loss": 0.5983, + "step": 11070 + }, + { + "epoch": 1.4412338930105428, + "grad_norm": 2.6435060501098633, + "learning_rate": 5.487897551704862e-06, + "loss": 0.4696, + "step": 11073 + }, + { + "epoch": 1.4416243654822334, + "grad_norm": 2.8207993507385254, + "learning_rate": 5.48581763855572e-06, + "loss": 0.6021, + "step": 11076 + }, + { + "epoch": 1.4420148379539244, + "grad_norm": 2.7664260864257812, + "learning_rate": 5.4837376405352595e-06, + "loss": 0.5321, + "step": 11079 + }, + { + "epoch": 1.442405310425615, + "grad_norm": 2.488525390625, + "learning_rate": 5.481657558006849e-06, + "loss": 0.4147, + "step": 11082 + }, + { + "epoch": 1.4427957828973057, + "grad_norm": 2.4613826274871826, + "learning_rate": 5.4795773913338765e-06, + "loss": 0.5012, + "step": 11085 + }, + { + "epoch": 1.4431862553689965, + "grad_norm": 3.2609658241271973, + "learning_rate": 5.477497140879745e-06, + "loss": 0.5489, + "step": 11088 + }, + { + "epoch": 1.4435767278406872, + "grad_norm": 2.521787405014038, + "learning_rate": 5.475416807007866e-06, + "loss": 0.5193, + "step": 11091 + }, + { + "epoch": 1.443967200312378, + "grad_norm": 3.109292984008789, + "learning_rate": 5.473336390081671e-06, + "loss": 0.515, + "step": 11094 + }, + { + "epoch": 1.4443576727840688, + "grad_norm": 3.495357036590576, + "learning_rate": 5.471255890464604e-06, + "loss": 0.5132, + "step": 11097 + }, + { + "epoch": 1.4447481452557596, + "grad_norm": 3.0955452919006348, + "learning_rate": 5.469175308520124e-06, + "loss": 0.4092, + "step": 11100 + }, + { + "epoch": 1.4451386177274501, + "grad_norm": 2.3907532691955566, + "learning_rate": 5.467094644611705e-06, + "loss": 0.4515, + "step": 11103 + }, + { + "epoch": 1.4455290901991409, + "grad_norm": 2.64176869392395, + "learning_rate": 5.465013899102836e-06, + "loss": 0.5404, + "step": 11106 + }, + { + "epoch": 1.4459195626708317, + "grad_norm": 2.4482946395874023, + "learning_rate": 5.4629330723570154e-06, + "loss": 0.4778, + "step": 11109 + }, + { + "epoch": 1.4463100351425224, + "grad_norm": 2.8428847789764404, + "learning_rate": 5.460852164737761e-06, + "loss": 0.5936, + "step": 11112 + }, + { + "epoch": 1.4467005076142132, + "grad_norm": 2.7340033054351807, + "learning_rate": 5.458771176608602e-06, + "loss": 0.497, + "step": 11115 + }, + { + "epoch": 1.447090980085904, + "grad_norm": 2.5455846786499023, + "learning_rate": 5.456690108333086e-06, + "loss": 0.452, + "step": 11118 + }, + { + "epoch": 1.4474814525575947, + "grad_norm": 2.627605676651001, + "learning_rate": 5.454608960274765e-06, + "loss": 0.5079, + "step": 11121 + }, + { + "epoch": 1.4478719250292853, + "grad_norm": 2.4849627017974854, + "learning_rate": 5.452527732797219e-06, + "loss": 0.46, + "step": 11124 + }, + { + "epoch": 1.4482623975009763, + "grad_norm": 3.0936851501464844, + "learning_rate": 5.450446426264028e-06, + "loss": 0.5866, + "step": 11127 + }, + { + "epoch": 1.4486528699726668, + "grad_norm": 2.7293624877929688, + "learning_rate": 5.448365041038796e-06, + "loss": 0.5474, + "step": 11130 + }, + { + "epoch": 1.4490433424443576, + "grad_norm": 
2.670182704925537, + "learning_rate": 5.446283577485132e-06, + "loss": 0.5431, + "step": 11133 + }, + { + "epoch": 1.4494338149160484, + "grad_norm": 2.502770185470581, + "learning_rate": 5.4442020359666655e-06, + "loss": 0.4375, + "step": 11136 + }, + { + "epoch": 1.4498242873877392, + "grad_norm": 2.5619125366210938, + "learning_rate": 5.442120416847041e-06, + "loss": 0.533, + "step": 11139 + }, + { + "epoch": 1.45021475985943, + "grad_norm": 2.700932025909424, + "learning_rate": 5.440038720489906e-06, + "loss": 0.4754, + "step": 11142 + }, + { + "epoch": 1.4506052323311207, + "grad_norm": 2.834325075149536, + "learning_rate": 5.437956947258935e-06, + "loss": 0.4661, + "step": 11145 + }, + { + "epoch": 1.4509957048028115, + "grad_norm": 2.6361968517303467, + "learning_rate": 5.435875097517805e-06, + "loss": 0.5139, + "step": 11148 + }, + { + "epoch": 1.451386177274502, + "grad_norm": 2.8682637214660645, + "learning_rate": 5.433793171630213e-06, + "loss": 0.4919, + "step": 11151 + }, + { + "epoch": 1.451776649746193, + "grad_norm": 2.7310025691986084, + "learning_rate": 5.431711169959866e-06, + "loss": 0.4384, + "step": 11154 + }, + { + "epoch": 1.4521671222178836, + "grad_norm": 3.184248208999634, + "learning_rate": 5.429629092870488e-06, + "loss": 0.4457, + "step": 11157 + }, + { + "epoch": 1.4525575946895743, + "grad_norm": 2.8572938442230225, + "learning_rate": 5.4275469407258096e-06, + "loss": 0.5435, + "step": 11160 + }, + { + "epoch": 1.4529480671612651, + "grad_norm": 2.845240354537964, + "learning_rate": 5.425464713889579e-06, + "loss": 0.5269, + "step": 11163 + }, + { + "epoch": 1.453338539632956, + "grad_norm": 2.744442939758301, + "learning_rate": 5.42338241272556e-06, + "loss": 0.5117, + "step": 11166 + }, + { + "epoch": 1.4537290121046467, + "grad_norm": 3.74177622795105, + "learning_rate": 5.4213000375975226e-06, + "loss": 0.4822, + "step": 11169 + }, + { + "epoch": 1.4541194845763374, + "grad_norm": 2.628488779067993, + "learning_rate": 5.419217588869255e-06, + "loss": 0.4378, + "step": 11172 + }, + { + "epoch": 1.4545099570480282, + "grad_norm": 2.955030918121338, + "learning_rate": 5.4171350669045585e-06, + "loss": 0.536, + "step": 11175 + }, + { + "epoch": 1.4549004295197188, + "grad_norm": 2.7912299633026123, + "learning_rate": 5.415052472067241e-06, + "loss": 0.5115, + "step": 11178 + }, + { + "epoch": 1.4552909019914095, + "grad_norm": 2.868971824645996, + "learning_rate": 5.412969804721132e-06, + "loss": 0.4951, + "step": 11181 + }, + { + "epoch": 1.4556813744631003, + "grad_norm": 2.6621592044830322, + "learning_rate": 5.410887065230064e-06, + "loss": 0.5373, + "step": 11184 + }, + { + "epoch": 1.456071846934791, + "grad_norm": 2.808011770248413, + "learning_rate": 5.4088042539578925e-06, + "loss": 0.5717, + "step": 11187 + }, + { + "epoch": 1.4564623194064819, + "grad_norm": 2.3780086040496826, + "learning_rate": 5.406721371268476e-06, + "loss": 0.4945, + "step": 11190 + }, + { + "epoch": 1.4568527918781726, + "grad_norm": 2.6770284175872803, + "learning_rate": 5.404638417525693e-06, + "loss": 0.4986, + "step": 11193 + }, + { + "epoch": 1.4572432643498634, + "grad_norm": 2.5734894275665283, + "learning_rate": 5.4025553930934295e-06, + "loss": 0.4697, + "step": 11196 + }, + { + "epoch": 1.457633736821554, + "grad_norm": 2.9276740550994873, + "learning_rate": 5.4004722983355854e-06, + "loss": 0.5275, + "step": 11199 + }, + { + "epoch": 1.458024209293245, + "grad_norm": 2.488164186477661, + "learning_rate": 5.398389133616074e-06, + "loss": 0.4691, + "step": 11202 + 
}, + { + "epoch": 1.4584146817649355, + "grad_norm": 2.68459153175354, + "learning_rate": 5.396305899298817e-06, + "loss": 0.545, + "step": 11205 + }, + { + "epoch": 1.4588051542366263, + "grad_norm": 2.5225677490234375, + "learning_rate": 5.394222595747755e-06, + "loss": 0.4992, + "step": 11208 + }, + { + "epoch": 1.459195626708317, + "grad_norm": 2.5488972663879395, + "learning_rate": 5.3921392233268345e-06, + "loss": 0.3998, + "step": 11211 + }, + { + "epoch": 1.4595860991800078, + "grad_norm": 2.9433960914611816, + "learning_rate": 5.390055782400016e-06, + "loss": 0.5108, + "step": 11214 + }, + { + "epoch": 1.4599765716516986, + "grad_norm": 2.6426749229431152, + "learning_rate": 5.387972273331273e-06, + "loss": 0.4687, + "step": 11217 + }, + { + "epoch": 1.4603670441233894, + "grad_norm": 2.4266717433929443, + "learning_rate": 5.38588869648459e-06, + "loss": 0.4355, + "step": 11220 + }, + { + "epoch": 1.4607575165950801, + "grad_norm": 2.7653090953826904, + "learning_rate": 5.383805052223964e-06, + "loss": 0.5161, + "step": 11223 + }, + { + "epoch": 1.4611479890667707, + "grad_norm": 2.5222866535186768, + "learning_rate": 5.381721340913403e-06, + "loss": 0.5055, + "step": 11226 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 2.504310369491577, + "learning_rate": 5.379637562916925e-06, + "loss": 0.5179, + "step": 11229 + }, + { + "epoch": 1.4619289340101522, + "grad_norm": 3.081963300704956, + "learning_rate": 5.377553718598566e-06, + "loss": 0.5027, + "step": 11232 + }, + { + "epoch": 1.462319406481843, + "grad_norm": 2.8216118812561035, + "learning_rate": 5.375469808322364e-06, + "loss": 0.5212, + "step": 11235 + }, + { + "epoch": 1.4627098789535338, + "grad_norm": 2.547189474105835, + "learning_rate": 5.3733858324523795e-06, + "loss": 0.4809, + "step": 11238 + }, + { + "epoch": 1.4631003514252245, + "grad_norm": 2.744575023651123, + "learning_rate": 5.371301791352673e-06, + "loss": 0.4945, + "step": 11241 + }, + { + "epoch": 1.4634908238969153, + "grad_norm": 2.8713624477386475, + "learning_rate": 5.369217685387326e-06, + "loss": 0.5239, + "step": 11244 + }, + { + "epoch": 1.463881296368606, + "grad_norm": 2.840270519256592, + "learning_rate": 5.367133514920425e-06, + "loss": 0.4806, + "step": 11247 + }, + { + "epoch": 1.4642717688402969, + "grad_norm": 3.1532773971557617, + "learning_rate": 5.3650492803160715e-06, + "loss": 0.5324, + "step": 11250 + }, + { + "epoch": 1.4646622413119874, + "grad_norm": 2.547405958175659, + "learning_rate": 5.362964981938378e-06, + "loss": 0.5092, + "step": 11253 + }, + { + "epoch": 1.4650527137836782, + "grad_norm": 2.8435122966766357, + "learning_rate": 5.360880620151464e-06, + "loss": 0.5244, + "step": 11256 + }, + { + "epoch": 1.465443186255369, + "grad_norm": 2.7665226459503174, + "learning_rate": 5.358796195319467e-06, + "loss": 0.5135, + "step": 11259 + }, + { + "epoch": 1.4658336587270597, + "grad_norm": 2.7496416568756104, + "learning_rate": 5.356711707806527e-06, + "loss": 0.5243, + "step": 11262 + }, + { + "epoch": 1.4662241311987505, + "grad_norm": 2.4985382556915283, + "learning_rate": 5.354627157976803e-06, + "loss": 0.4796, + "step": 11265 + }, + { + "epoch": 1.4666146036704413, + "grad_norm": 3.4736976623535156, + "learning_rate": 5.35254254619446e-06, + "loss": 0.4909, + "step": 11268 + }, + { + "epoch": 1.467005076142132, + "grad_norm": 2.8436739444732666, + "learning_rate": 5.3504578728236755e-06, + "loss": 0.5168, + "step": 11271 + }, + { + "epoch": 1.4673955486138226, + "grad_norm": 3.0882272720336914, + "learning_rate": 
5.34837313822864e-06, + "loss": 0.5582, + "step": 11274 + }, + { + "epoch": 1.4677860210855136, + "grad_norm": 5.404419898986816, + "learning_rate": 5.346288342773549e-06, + "loss": 0.511, + "step": 11277 + }, + { + "epoch": 1.4681764935572041, + "grad_norm": 2.911956548690796, + "learning_rate": 5.344203486822612e-06, + "loss": 0.4474, + "step": 11280 + }, + { + "epoch": 1.468566966028895, + "grad_norm": 3.251417398452759, + "learning_rate": 5.342118570740052e-06, + "loss": 0.565, + "step": 11283 + }, + { + "epoch": 1.4689574385005857, + "grad_norm": 2.6077330112457275, + "learning_rate": 5.340033594890096e-06, + "loss": 0.5114, + "step": 11286 + }, + { + "epoch": 1.4693479109722765, + "grad_norm": 2.613679885864258, + "learning_rate": 5.337948559636986e-06, + "loss": 0.4709, + "step": 11289 + }, + { + "epoch": 1.4697383834439672, + "grad_norm": 3.1710855960845947, + "learning_rate": 5.335863465344974e-06, + "loss": 0.4774, + "step": 11292 + }, + { + "epoch": 1.470128855915658, + "grad_norm": 2.876929521560669, + "learning_rate": 5.333778312378323e-06, + "loss": 0.5028, + "step": 11295 + }, + { + "epoch": 1.4705193283873488, + "grad_norm": 2.627826690673828, + "learning_rate": 5.3316931011013005e-06, + "loss": 0.4586, + "step": 11298 + }, + { + "epoch": 1.4709098008590393, + "grad_norm": 3.5629796981811523, + "learning_rate": 5.329607831878191e-06, + "loss": 0.5185, + "step": 11301 + }, + { + "epoch": 1.47130027333073, + "grad_norm": 2.592860221862793, + "learning_rate": 5.327522505073288e-06, + "loss": 0.508, + "step": 11304 + }, + { + "epoch": 1.4716907458024209, + "grad_norm": 2.701939344406128, + "learning_rate": 5.325437121050892e-06, + "loss": 0.5413, + "step": 11307 + }, + { + "epoch": 1.4720812182741116, + "grad_norm": 2.25862717628479, + "learning_rate": 5.323351680175315e-06, + "loss": 0.4955, + "step": 11310 + }, + { + "epoch": 1.4724716907458024, + "grad_norm": 2.581425666809082, + "learning_rate": 5.3212661828108804e-06, + "loss": 0.5118, + "step": 11313 + }, + { + "epoch": 1.4728621632174932, + "grad_norm": 2.7208588123321533, + "learning_rate": 5.31918062932192e-06, + "loss": 0.5058, + "step": 11316 + }, + { + "epoch": 1.473252635689184, + "grad_norm": 2.554607391357422, + "learning_rate": 5.317095020072773e-06, + "loss": 0.4651, + "step": 11319 + }, + { + "epoch": 1.4736431081608747, + "grad_norm": 2.7195706367492676, + "learning_rate": 5.315009355427795e-06, + "loss": 0.5694, + "step": 11322 + }, + { + "epoch": 1.4740335806325655, + "grad_norm": 2.790142774581909, + "learning_rate": 5.312923635751344e-06, + "loss": 0.4539, + "step": 11325 + }, + { + "epoch": 1.474424053104256, + "grad_norm": 2.372218132019043, + "learning_rate": 5.310837861407794e-06, + "loss": 0.4228, + "step": 11328 + }, + { + "epoch": 1.4748145255759468, + "grad_norm": 2.9723665714263916, + "learning_rate": 5.308752032761522e-06, + "loss": 0.469, + "step": 11331 + }, + { + "epoch": 1.4752049980476376, + "grad_norm": 2.6167614459991455, + "learning_rate": 5.306666150176919e-06, + "loss": 0.5017, + "step": 11334 + }, + { + "epoch": 1.4755954705193284, + "grad_norm": 2.730600357055664, + "learning_rate": 5.304580214018385e-06, + "loss": 0.5289, + "step": 11337 + }, + { + "epoch": 1.4759859429910192, + "grad_norm": 2.659367799758911, + "learning_rate": 5.30249422465033e-06, + "loss": 0.425, + "step": 11340 + }, + { + "epoch": 1.47637641546271, + "grad_norm": 2.761075019836426, + "learning_rate": 5.300408182437169e-06, + "loss": 0.5373, + "step": 11343 + }, + { + "epoch": 1.4767668879344007, + "grad_norm": 
2.6184074878692627, + "learning_rate": 5.298322087743331e-06, + "loss": 0.5185, + "step": 11346 + }, + { + "epoch": 1.4771573604060912, + "grad_norm": 2.6529762744903564, + "learning_rate": 5.296235940933251e-06, + "loss": 0.4562, + "step": 11349 + }, + { + "epoch": 1.4775478328777822, + "grad_norm": 2.837994337081909, + "learning_rate": 5.294149742371375e-06, + "loss": 0.535, + "step": 11352 + }, + { + "epoch": 1.4779383053494728, + "grad_norm": 2.4222748279571533, + "learning_rate": 5.292063492422159e-06, + "loss": 0.4687, + "step": 11355 + }, + { + "epoch": 1.4783287778211636, + "grad_norm": 2.741462230682373, + "learning_rate": 5.289977191450064e-06, + "loss": 0.5317, + "step": 11358 + }, + { + "epoch": 1.4787192502928543, + "grad_norm": 2.7143518924713135, + "learning_rate": 5.287890839819566e-06, + "loss": 0.5798, + "step": 11361 + }, + { + "epoch": 1.479109722764545, + "grad_norm": 2.4520294666290283, + "learning_rate": 5.285804437895141e-06, + "loss": 0.4746, + "step": 11364 + }, + { + "epoch": 1.4795001952362359, + "grad_norm": 3.155306577682495, + "learning_rate": 5.283717986041285e-06, + "loss": 0.5228, + "step": 11367 + }, + { + "epoch": 1.4798906677079267, + "grad_norm": 2.5893876552581787, + "learning_rate": 5.281631484622491e-06, + "loss": 0.4359, + "step": 11370 + }, + { + "epoch": 1.4802811401796174, + "grad_norm": 2.689603090286255, + "learning_rate": 5.27954493400327e-06, + "loss": 0.5414, + "step": 11373 + }, + { + "epoch": 1.480671612651308, + "grad_norm": 2.8122377395629883, + "learning_rate": 5.277458334548138e-06, + "loss": 0.4752, + "step": 11376 + }, + { + "epoch": 1.4810620851229988, + "grad_norm": 2.8746378421783447, + "learning_rate": 5.2753716866216175e-06, + "loss": 0.5519, + "step": 11379 + }, + { + "epoch": 1.4814525575946895, + "grad_norm": 2.670448064804077, + "learning_rate": 5.273284990588243e-06, + "loss": 0.4566, + "step": 11382 + }, + { + "epoch": 1.4818430300663803, + "grad_norm": 2.758208751678467, + "learning_rate": 5.2711982468125556e-06, + "loss": 0.5005, + "step": 11385 + }, + { + "epoch": 1.482233502538071, + "grad_norm": 2.7960431575775146, + "learning_rate": 5.269111455659105e-06, + "loss": 0.4609, + "step": 11388 + }, + { + "epoch": 1.4826239750097618, + "grad_norm": 3.621656656265259, + "learning_rate": 5.26702461749245e-06, + "loss": 0.5131, + "step": 11391 + }, + { + "epoch": 1.4830144474814526, + "grad_norm": 3.8719372749328613, + "learning_rate": 5.264937732677153e-06, + "loss": 0.4925, + "step": 11394 + }, + { + "epoch": 1.4834049199531434, + "grad_norm": 2.7081613540649414, + "learning_rate": 5.262850801577796e-06, + "loss": 0.5013, + "step": 11397 + }, + { + "epoch": 1.4837953924248342, + "grad_norm": 2.990955114364624, + "learning_rate": 5.260763824558954e-06, + "loss": 0.5611, + "step": 11400 + }, + { + "epoch": 1.4841858648965247, + "grad_norm": 2.5260677337646484, + "learning_rate": 5.258676801985222e-06, + "loss": 0.5057, + "step": 11403 + }, + { + "epoch": 1.4845763373682155, + "grad_norm": 2.441223382949829, + "learning_rate": 5.256589734221196e-06, + "loss": 0.4837, + "step": 11406 + }, + { + "epoch": 1.4849668098399063, + "grad_norm": 2.5809214115142822, + "learning_rate": 5.254502621631482e-06, + "loss": 0.4651, + "step": 11409 + }, + { + "epoch": 1.485357282311597, + "grad_norm": 2.6332476139068604, + "learning_rate": 5.252415464580698e-06, + "loss": 0.5485, + "step": 11412 + }, + { + "epoch": 1.4857477547832878, + "grad_norm": 2.675795316696167, + "learning_rate": 5.25032826343346e-06, + "loss": 0.5142, + "step": 11415 + 
}, + { + "epoch": 1.4861382272549786, + "grad_norm": 2.489439010620117, + "learning_rate": 5.248241018554404e-06, + "loss": 0.4575, + "step": 11418 + }, + { + "epoch": 1.4865286997266693, + "grad_norm": 2.5916011333465576, + "learning_rate": 5.246153730308162e-06, + "loss": 0.51, + "step": 11421 + }, + { + "epoch": 1.48691917219836, + "grad_norm": 2.8045613765716553, + "learning_rate": 5.244066399059385e-06, + "loss": 0.4623, + "step": 11424 + }, + { + "epoch": 1.487309644670051, + "grad_norm": 2.699486255645752, + "learning_rate": 5.241979025172717e-06, + "loss": 0.5239, + "step": 11427 + }, + { + "epoch": 1.4877001171417414, + "grad_norm": 2.300581693649292, + "learning_rate": 5.239891609012824e-06, + "loss": 0.4958, + "step": 11430 + }, + { + "epoch": 1.4880905896134322, + "grad_norm": 2.496739625930786, + "learning_rate": 5.237804150944373e-06, + "loss": 0.5159, + "step": 11433 + }, + { + "epoch": 1.488481062085123, + "grad_norm": 2.7403767108917236, + "learning_rate": 5.2357166513320344e-06, + "loss": 0.5758, + "step": 11436 + }, + { + "epoch": 1.4888715345568138, + "grad_norm": 3.07275128364563, + "learning_rate": 5.233629110540494e-06, + "loss": 0.4913, + "step": 11439 + }, + { + "epoch": 1.4892620070285045, + "grad_norm": 2.733052968978882, + "learning_rate": 5.2315415289344405e-06, + "loss": 0.5081, + "step": 11442 + }, + { + "epoch": 1.4896524795001953, + "grad_norm": 2.944462299346924, + "learning_rate": 5.229453906878569e-06, + "loss": 0.5173, + "step": 11445 + }, + { + "epoch": 1.490042951971886, + "grad_norm": 2.523071527481079, + "learning_rate": 5.227366244737582e-06, + "loss": 0.5012, + "step": 11448 + }, + { + "epoch": 1.4904334244435766, + "grad_norm": 2.42546010017395, + "learning_rate": 5.225278542876189e-06, + "loss": 0.5246, + "step": 11451 + }, + { + "epoch": 1.4908238969152674, + "grad_norm": 2.5009751319885254, + "learning_rate": 5.2231908016591104e-06, + "loss": 0.4843, + "step": 11454 + }, + { + "epoch": 1.4912143693869582, + "grad_norm": 3.1024861335754395, + "learning_rate": 5.221103021451066e-06, + "loss": 0.4763, + "step": 11457 + }, + { + "epoch": 1.491604841858649, + "grad_norm": 2.727797269821167, + "learning_rate": 5.219015202616792e-06, + "loss": 0.5251, + "step": 11460 + }, + { + "epoch": 1.4919953143303397, + "grad_norm": 2.642549753189087, + "learning_rate": 5.2169273455210205e-06, + "loss": 0.4429, + "step": 11463 + }, + { + "epoch": 1.4923857868020305, + "grad_norm": 2.5647194385528564, + "learning_rate": 5.214839450528498e-06, + "loss": 0.5479, + "step": 11466 + }, + { + "epoch": 1.4927762592737213, + "grad_norm": 2.818277597427368, + "learning_rate": 5.212751518003977e-06, + "loss": 0.4881, + "step": 11469 + }, + { + "epoch": 1.4931667317454118, + "grad_norm": 2.8775992393493652, + "learning_rate": 5.210663548312212e-06, + "loss": 0.536, + "step": 11472 + }, + { + "epoch": 1.4935572042171028, + "grad_norm": 3.7240641117095947, + "learning_rate": 5.208575541817971e-06, + "loss": 0.4804, + "step": 11475 + }, + { + "epoch": 1.4939476766887934, + "grad_norm": 2.5948472023010254, + "learning_rate": 5.206487498886017e-06, + "loss": 0.6184, + "step": 11478 + }, + { + "epoch": 1.4943381491604841, + "grad_norm": 3.2808594703674316, + "learning_rate": 5.2043994198811356e-06, + "loss": 0.6083, + "step": 11481 + }, + { + "epoch": 1.494728621632175, + "grad_norm": 2.4906272888183594, + "learning_rate": 5.202311305168103e-06, + "loss": 0.4931, + "step": 11484 + }, + { + "epoch": 1.4951190941038657, + "grad_norm": 2.869908571243286, + "learning_rate": 
5.200223155111711e-06, + "loss": 0.5207, + "step": 11487 + }, + { + "epoch": 1.4955095665755564, + "grad_norm": 2.8915436267852783, + "learning_rate": 5.198134970076757e-06, + "loss": 0.5586, + "step": 11490 + }, + { + "epoch": 1.4959000390472472, + "grad_norm": 3.477677345275879, + "learning_rate": 5.196046750428039e-06, + "loss": 0.5211, + "step": 11493 + }, + { + "epoch": 1.496290511518938, + "grad_norm": 2.8467986583709717, + "learning_rate": 5.193958496530367e-06, + "loss": 0.4828, + "step": 11496 + }, + { + "epoch": 1.4966809839906285, + "grad_norm": 2.7901194095611572, + "learning_rate": 5.1918702087485515e-06, + "loss": 0.5162, + "step": 11499 + }, + { + "epoch": 1.4970714564623195, + "grad_norm": 2.9377663135528564, + "learning_rate": 5.189781887447417e-06, + "loss": 0.4016, + "step": 11502 + }, + { + "epoch": 1.49746192893401, + "grad_norm": 2.7478699684143066, + "learning_rate": 5.187693532991784e-06, + "loss": 0.4597, + "step": 11505 + }, + { + "epoch": 1.4978524014057009, + "grad_norm": 2.764273166656494, + "learning_rate": 5.185605145746487e-06, + "loss": 0.4994, + "step": 11508 + }, + { + "epoch": 1.4982428738773916, + "grad_norm": 2.593223810195923, + "learning_rate": 5.183516726076362e-06, + "loss": 0.5124, + "step": 11511 + }, + { + "epoch": 1.4986333463490824, + "grad_norm": 2.9273481369018555, + "learning_rate": 5.181428274346249e-06, + "loss": 0.4994, + "step": 11514 + }, + { + "epoch": 1.4990238188207732, + "grad_norm": 2.4240915775299072, + "learning_rate": 5.179339790920999e-06, + "loss": 0.485, + "step": 11517 + }, + { + "epoch": 1.499414291292464, + "grad_norm": 2.7207272052764893, + "learning_rate": 5.177251276165465e-06, + "loss": 0.538, + "step": 11520 + }, + { + "epoch": 1.4998047637641547, + "grad_norm": 2.5581884384155273, + "learning_rate": 5.175162730444505e-06, + "loss": 0.4982, + "step": 11523 + }, + { + "epoch": 1.5001952362358453, + "grad_norm": 2.6691441535949707, + "learning_rate": 5.173074154122986e-06, + "loss": 0.5418, + "step": 11526 + }, + { + "epoch": 1.5005857087075363, + "grad_norm": 2.560971736907959, + "learning_rate": 5.170985547565775e-06, + "loss": 0.473, + "step": 11529 + }, + { + "epoch": 1.5009761811792268, + "grad_norm": 2.5895869731903076, + "learning_rate": 5.1688969111377505e-06, + "loss": 0.4541, + "step": 11532 + }, + { + "epoch": 1.5013666536509176, + "grad_norm": 2.396502733230591, + "learning_rate": 5.166808245203789e-06, + "loss": 0.51, + "step": 11535 + }, + { + "epoch": 1.5017571261226084, + "grad_norm": 2.532872200012207, + "learning_rate": 5.164719550128779e-06, + "loss": 0.5406, + "step": 11538 + }, + { + "epoch": 1.5021475985942991, + "grad_norm": 2.419482946395874, + "learning_rate": 5.162630826277609e-06, + "loss": 0.4598, + "step": 11541 + }, + { + "epoch": 1.50253807106599, + "grad_norm": 2.596123218536377, + "learning_rate": 5.160542074015177e-06, + "loss": 0.4867, + "step": 11544 + }, + { + "epoch": 1.5029285435376805, + "grad_norm": 2.9170663356781006, + "learning_rate": 5.158453293706383e-06, + "loss": 0.4896, + "step": 11547 + }, + { + "epoch": 1.5033190160093715, + "grad_norm": 2.6187119483947754, + "learning_rate": 5.15636448571613e-06, + "loss": 0.4578, + "step": 11550 + }, + { + "epoch": 1.503709488481062, + "grad_norm": 2.4606587886810303, + "learning_rate": 5.1542756504093315e-06, + "loss": 0.4461, + "step": 11553 + }, + { + "epoch": 1.5040999609527528, + "grad_norm": 2.533982038497925, + "learning_rate": 5.152186788150901e-06, + "loss": 0.499, + "step": 11556 + }, + { + "epoch": 1.5044904334244436, + 
"grad_norm": 2.852947473526001, + "learning_rate": 5.1500978993057596e-06, + "loss": 0.5007, + "step": 11559 + }, + { + "epoch": 1.5048809058961343, + "grad_norm": 3.0088095664978027, + "learning_rate": 5.1480089842388295e-06, + "loss": 0.5055, + "step": 11562 + }, + { + "epoch": 1.505271378367825, + "grad_norm": 2.5373263359069824, + "learning_rate": 5.145920043315041e-06, + "loss": 0.4603, + "step": 11565 + }, + { + "epoch": 1.5056618508395159, + "grad_norm": 2.940673828125, + "learning_rate": 5.143831076899329e-06, + "loss": 0.6183, + "step": 11568 + }, + { + "epoch": 1.5060523233112066, + "grad_norm": 2.945150375366211, + "learning_rate": 5.14174208535663e-06, + "loss": 0.55, + "step": 11571 + }, + { + "epoch": 1.5064427957828972, + "grad_norm": 2.7196662425994873, + "learning_rate": 5.1396530690518876e-06, + "loss": 0.5235, + "step": 11574 + }, + { + "epoch": 1.5068332682545882, + "grad_norm": 2.892042875289917, + "learning_rate": 5.137564028350048e-06, + "loss": 0.5291, + "step": 11577 + }, + { + "epoch": 1.5072237407262787, + "grad_norm": 2.4638333320617676, + "learning_rate": 5.135474963616062e-06, + "loss": 0.4451, + "step": 11580 + }, + { + "epoch": 1.5076142131979695, + "grad_norm": 2.493852138519287, + "learning_rate": 5.133385875214883e-06, + "loss": 0.5536, + "step": 11583 + }, + { + "epoch": 1.5080046856696603, + "grad_norm": 2.891443967819214, + "learning_rate": 5.131296763511473e-06, + "loss": 0.4479, + "step": 11586 + }, + { + "epoch": 1.508395158141351, + "grad_norm": 2.568324327468872, + "learning_rate": 5.129207628870796e-06, + "loss": 0.4524, + "step": 11589 + }, + { + "epoch": 1.5087856306130418, + "grad_norm": 3.3236899375915527, + "learning_rate": 5.127118471657817e-06, + "loss": 0.4885, + "step": 11592 + }, + { + "epoch": 1.5091761030847324, + "grad_norm": 2.7332510948181152, + "learning_rate": 5.125029292237508e-06, + "loss": 0.5649, + "step": 11595 + }, + { + "epoch": 1.5095665755564234, + "grad_norm": 2.6280899047851562, + "learning_rate": 5.122940090974846e-06, + "loss": 0.4571, + "step": 11598 + }, + { + "epoch": 1.509957048028114, + "grad_norm": 2.642153024673462, + "learning_rate": 5.120850868234808e-06, + "loss": 0.5028, + "step": 11601 + }, + { + "epoch": 1.510347520499805, + "grad_norm": 2.4956655502319336, + "learning_rate": 5.118761624382377e-06, + "loss": 0.5421, + "step": 11604 + }, + { + "epoch": 1.5107379929714955, + "grad_norm": 3.0363759994506836, + "learning_rate": 5.11667235978254e-06, + "loss": 0.5359, + "step": 11607 + }, + { + "epoch": 1.5111284654431862, + "grad_norm": 3.446927070617676, + "learning_rate": 5.114583074800289e-06, + "loss": 0.5317, + "step": 11610 + }, + { + "epoch": 1.511518937914877, + "grad_norm": 2.6204493045806885, + "learning_rate": 5.112493769800614e-06, + "loss": 0.4863, + "step": 11613 + }, + { + "epoch": 1.5119094103865678, + "grad_norm": 2.5534026622772217, + "learning_rate": 5.110404445148515e-06, + "loss": 0.5407, + "step": 11616 + }, + { + "epoch": 1.5122998828582586, + "grad_norm": 2.5729386806488037, + "learning_rate": 5.108315101208991e-06, + "loss": 0.538, + "step": 11619 + }, + { + "epoch": 1.512690355329949, + "grad_norm": 2.6536967754364014, + "learning_rate": 5.106225738347047e-06, + "loss": 0.4916, + "step": 11622 + }, + { + "epoch": 1.51308082780164, + "grad_norm": 2.6462488174438477, + "learning_rate": 5.10413635692769e-06, + "loss": 0.4578, + "step": 11625 + }, + { + "epoch": 1.5134713002733307, + "grad_norm": 2.5776121616363525, + "learning_rate": 5.10204695731593e-06, + "loss": 0.5028, + "step": 
11628 + }, + { + "epoch": 1.5138617727450214, + "grad_norm": 2.8399088382720947, + "learning_rate": 5.099957539876783e-06, + "loss": 0.5134, + "step": 11631 + }, + { + "epoch": 1.5142522452167122, + "grad_norm": 2.53810453414917, + "learning_rate": 5.097868104975262e-06, + "loss": 0.5759, + "step": 11634 + }, + { + "epoch": 1.514642717688403, + "grad_norm": 2.6173861026763916, + "learning_rate": 5.09577865297639e-06, + "loss": 0.522, + "step": 11637 + }, + { + "epoch": 1.5150331901600937, + "grad_norm": 2.800180673599243, + "learning_rate": 5.0936891842451895e-06, + "loss": 0.6223, + "step": 11640 + }, + { + "epoch": 1.5154236626317843, + "grad_norm": 2.824063539505005, + "learning_rate": 5.0915996991466845e-06, + "loss": 0.4638, + "step": 11643 + }, + { + "epoch": 1.5158141351034753, + "grad_norm": 2.6472010612487793, + "learning_rate": 5.089510198045904e-06, + "loss": 0.4483, + "step": 11646 + }, + { + "epoch": 1.5162046075751658, + "grad_norm": 2.6510941982269287, + "learning_rate": 5.087420681307881e-06, + "loss": 0.5283, + "step": 11649 + }, + { + "epoch": 1.5165950800468568, + "grad_norm": 2.5269935131073, + "learning_rate": 5.085331149297649e-06, + "loss": 0.4922, + "step": 11652 + }, + { + "epoch": 1.5169855525185474, + "grad_norm": 2.427276134490967, + "learning_rate": 5.083241602380246e-06, + "loss": 0.4429, + "step": 11655 + }, + { + "epoch": 1.5173760249902382, + "grad_norm": 2.5796139240264893, + "learning_rate": 5.081152040920708e-06, + "loss": 0.5555, + "step": 11658 + }, + { + "epoch": 1.517766497461929, + "grad_norm": 3.097916603088379, + "learning_rate": 5.079062465284081e-06, + "loss": 0.5131, + "step": 11661 + }, + { + "epoch": 1.5181569699336197, + "grad_norm": 2.6202802658081055, + "learning_rate": 5.076972875835406e-06, + "loss": 0.5081, + "step": 11664 + }, + { + "epoch": 1.5185474424053105, + "grad_norm": 2.4596011638641357, + "learning_rate": 5.074883272939732e-06, + "loss": 0.4617, + "step": 11667 + }, + { + "epoch": 1.518937914877001, + "grad_norm": 3.6132545471191406, + "learning_rate": 5.072793656962108e-06, + "loss": 0.4359, + "step": 11670 + }, + { + "epoch": 1.519328387348692, + "grad_norm": 2.3989944458007812, + "learning_rate": 5.0707040282675855e-06, + "loss": 0.4565, + "step": 11673 + }, + { + "epoch": 1.5197188598203826, + "grad_norm": 3.203303813934326, + "learning_rate": 5.068614387221218e-06, + "loss": 0.4667, + "step": 11676 + }, + { + "epoch": 1.5201093322920736, + "grad_norm": 2.8090553283691406, + "learning_rate": 5.066524734188061e-06, + "loss": 0.5105, + "step": 11679 + }, + { + "epoch": 1.5204998047637641, + "grad_norm": 2.759253740310669, + "learning_rate": 5.064435069533174e-06, + "loss": 0.5128, + "step": 11682 + }, + { + "epoch": 1.520890277235455, + "grad_norm": 2.5702476501464844, + "learning_rate": 5.062345393621615e-06, + "loss": 0.4793, + "step": 11685 + }, + { + "epoch": 1.5212807497071457, + "grad_norm": 2.932741403579712, + "learning_rate": 5.060255706818447e-06, + "loss": 0.4439, + "step": 11688 + }, + { + "epoch": 1.5216712221788364, + "grad_norm": 2.6461048126220703, + "learning_rate": 5.058166009488733e-06, + "loss": 0.5125, + "step": 11691 + }, + { + "epoch": 1.5220616946505272, + "grad_norm": 2.705612897872925, + "learning_rate": 5.05607630199754e-06, + "loss": 0.533, + "step": 11694 + }, + { + "epoch": 1.5224521671222178, + "grad_norm": 2.7387025356292725, + "learning_rate": 5.0539865847099354e-06, + "loss": 0.4833, + "step": 11697 + }, + { + "epoch": 1.5228426395939088, + "grad_norm": 2.522993803024292, + "learning_rate": 
5.051896857990988e-06, + "loss": 0.5356, + "step": 11700 + }, + { + "epoch": 1.5232331120655993, + "grad_norm": 2.885016441345215, + "learning_rate": 5.049807122205768e-06, + "loss": 0.5451, + "step": 11703 + }, + { + "epoch": 1.52362358453729, + "grad_norm": 2.7136266231536865, + "learning_rate": 5.04771737771935e-06, + "loss": 0.4738, + "step": 11706 + }, + { + "epoch": 1.5240140570089808, + "grad_norm": 2.6322624683380127, + "learning_rate": 5.045627624896804e-06, + "loss": 0.5244, + "step": 11709 + }, + { + "epoch": 1.5244045294806716, + "grad_norm": 3.288853406906128, + "learning_rate": 5.0435378641032095e-06, + "loss": 0.5564, + "step": 11712 + }, + { + "epoch": 1.5247950019523624, + "grad_norm": 2.5992119312286377, + "learning_rate": 5.0414480957036415e-06, + "loss": 0.5101, + "step": 11715 + }, + { + "epoch": 1.525185474424053, + "grad_norm": 2.7537200450897217, + "learning_rate": 5.039358320063179e-06, + "loss": 0.5499, + "step": 11718 + }, + { + "epoch": 1.525575946895744, + "grad_norm": 2.5986032485961914, + "learning_rate": 5.037268537546901e-06, + "loss": 0.4637, + "step": 11721 + }, + { + "epoch": 1.5259664193674345, + "grad_norm": 2.795783042907715, + "learning_rate": 5.035178748519887e-06, + "loss": 0.5092, + "step": 11724 + }, + { + "epoch": 1.5263568918391255, + "grad_norm": 2.8021063804626465, + "learning_rate": 5.03308895334722e-06, + "loss": 0.5189, + "step": 11727 + }, + { + "epoch": 1.526747364310816, + "grad_norm": 2.8685810565948486, + "learning_rate": 5.0309991523939805e-06, + "loss": 0.5911, + "step": 11730 + }, + { + "epoch": 1.5271378367825068, + "grad_norm": 2.5764682292938232, + "learning_rate": 5.028909346025257e-06, + "loss": 0.4561, + "step": 11733 + }, + { + "epoch": 1.5275283092541976, + "grad_norm": 2.9899494647979736, + "learning_rate": 5.026819534606131e-06, + "loss": 0.4998, + "step": 11736 + }, + { + "epoch": 1.5279187817258884, + "grad_norm": 2.4830875396728516, + "learning_rate": 5.024729718501688e-06, + "loss": 0.4199, + "step": 11739 + }, + { + "epoch": 1.5283092541975791, + "grad_norm": 2.9966747760772705, + "learning_rate": 5.022639898077016e-06, + "loss": 0.4864, + "step": 11742 + }, + { + "epoch": 1.5286997266692697, + "grad_norm": 2.7699429988861084, + "learning_rate": 5.020550073697202e-06, + "loss": 0.5197, + "step": 11745 + }, + { + "epoch": 1.5290901991409607, + "grad_norm": 3.0958468914031982, + "learning_rate": 5.018460245727333e-06, + "loss": 0.4696, + "step": 11748 + }, + { + "epoch": 1.5294806716126512, + "grad_norm": 2.7975351810455322, + "learning_rate": 5.016370414532495e-06, + "loss": 0.563, + "step": 11751 + }, + { + "epoch": 1.5298711440843422, + "grad_norm": 3.140310525894165, + "learning_rate": 5.014280580477782e-06, + "loss": 0.5284, + "step": 11754 + }, + { + "epoch": 1.5302616165560328, + "grad_norm": 2.853102684020996, + "learning_rate": 5.012190743928282e-06, + "loss": 0.4683, + "step": 11757 + }, + { + "epoch": 1.5306520890277235, + "grad_norm": 2.716357946395874, + "learning_rate": 5.010100905249084e-06, + "loss": 0.4811, + "step": 11760 + }, + { + "epoch": 1.5310425614994143, + "grad_norm": 3.0148074626922607, + "learning_rate": 5.0080110648052815e-06, + "loss": 0.5135, + "step": 11763 + }, + { + "epoch": 1.531433033971105, + "grad_norm": 2.7584571838378906, + "learning_rate": 5.00592122296196e-06, + "loss": 0.4751, + "step": 11766 + }, + { + "epoch": 1.5318235064427959, + "grad_norm": 2.487861156463623, + "learning_rate": 5.003831380084216e-06, + "loss": 0.5302, + "step": 11769 + }, + { + "epoch": 
1.5322139789144864, + "grad_norm": 2.4335525035858154, + "learning_rate": 5.001741536537135e-06, + "loss": 0.4591, + "step": 11772 + }, + { + "epoch": 1.5326044513861774, + "grad_norm": 2.387814521789551, + "learning_rate": 4.999651692685813e-06, + "loss": 0.4683, + "step": 11775 + }, + { + "epoch": 1.532994923857868, + "grad_norm": 2.30008602142334, + "learning_rate": 4.997561848895338e-06, + "loss": 0.4718, + "step": 11778 + }, + { + "epoch": 1.5333853963295587, + "grad_norm": 2.351167678833008, + "learning_rate": 4.995472005530804e-06, + "loss": 0.4853, + "step": 11781 + }, + { + "epoch": 1.5337758688012495, + "grad_norm": 2.77471923828125, + "learning_rate": 4.993382162957302e-06, + "loss": 0.4924, + "step": 11784 + }, + { + "epoch": 1.5341663412729403, + "grad_norm": 2.6090359687805176, + "learning_rate": 4.991292321539921e-06, + "loss": 0.5111, + "step": 11787 + }, + { + "epoch": 1.534556813744631, + "grad_norm": 2.52298903465271, + "learning_rate": 4.989202481643755e-06, + "loss": 0.4663, + "step": 11790 + }, + { + "epoch": 1.5349472862163216, + "grad_norm": 3.639498472213745, + "learning_rate": 4.98711264363389e-06, + "loss": 0.4876, + "step": 11793 + }, + { + "epoch": 1.5353377586880126, + "grad_norm": 2.813267946243286, + "learning_rate": 4.98502280787542e-06, + "loss": 0.5949, + "step": 11796 + }, + { + "epoch": 1.5357282311597031, + "grad_norm": 2.9353835582733154, + "learning_rate": 4.9829329747334345e-06, + "loss": 0.4813, + "step": 11799 + }, + { + "epoch": 1.5361187036313941, + "grad_norm": 2.37113618850708, + "learning_rate": 4.9808431445730225e-06, + "loss": 0.4632, + "step": 11802 + }, + { + "epoch": 1.5365091761030847, + "grad_norm": 3.059115409851074, + "learning_rate": 4.978753317759271e-06, + "loss": 0.5102, + "step": 11805 + }, + { + "epoch": 1.5368996485747755, + "grad_norm": 2.4613428115844727, + "learning_rate": 4.976663494657271e-06, + "loss": 0.4513, + "step": 11808 + }, + { + "epoch": 1.5372901210464662, + "grad_norm": 2.785877227783203, + "learning_rate": 4.97457367563211e-06, + "loss": 0.5541, + "step": 11811 + }, + { + "epoch": 1.537680593518157, + "grad_norm": 2.452495574951172, + "learning_rate": 4.972483861048875e-06, + "loss": 0.4388, + "step": 11814 + }, + { + "epoch": 1.5380710659898478, + "grad_norm": 2.4876022338867188, + "learning_rate": 4.970394051272651e-06, + "loss": 0.5395, + "step": 11817 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 2.76699161529541, + "learning_rate": 4.968304246668524e-06, + "loss": 0.5129, + "step": 11820 + }, + { + "epoch": 1.5388520109332293, + "grad_norm": 2.546515703201294, + "learning_rate": 4.9662144476015785e-06, + "loss": 0.4904, + "step": 11823 + }, + { + "epoch": 1.5392424834049199, + "grad_norm": 2.5809149742126465, + "learning_rate": 4.964124654436898e-06, + "loss": 0.4912, + "step": 11826 + }, + { + "epoch": 1.5396329558766109, + "grad_norm": 3.259404182434082, + "learning_rate": 4.962034867539565e-06, + "loss": 0.5205, + "step": 11829 + }, + { + "epoch": 1.5400234283483014, + "grad_norm": 2.743436098098755, + "learning_rate": 4.959945087274659e-06, + "loss": 0.4914, + "step": 11832 + }, + { + "epoch": 1.5404139008199922, + "grad_norm": 2.653174638748169, + "learning_rate": 4.957855314007265e-06, + "loss": 0.5208, + "step": 11835 + }, + { + "epoch": 1.540804373291683, + "grad_norm": 2.54440975189209, + "learning_rate": 4.955765548102457e-06, + "loss": 0.4773, + "step": 11838 + }, + { + "epoch": 1.5411948457633737, + "grad_norm": 3.731177806854248, + "learning_rate": 4.9536757899253156e-06, + "loss": 
0.4968, + "step": 11841 + }, + { + "epoch": 1.5415853182350645, + "grad_norm": 2.7818851470947266, + "learning_rate": 4.951586039840916e-06, + "loss": 0.4352, + "step": 11844 + }, + { + "epoch": 1.541975790706755, + "grad_norm": 2.795779228210449, + "learning_rate": 4.949496298214331e-06, + "loss": 0.4873, + "step": 11847 + }, + { + "epoch": 1.542366263178446, + "grad_norm": 2.714663028717041, + "learning_rate": 4.9474065654106384e-06, + "loss": 0.6013, + "step": 11850 + }, + { + "epoch": 1.5427567356501366, + "grad_norm": 2.7950053215026855, + "learning_rate": 4.945316841794909e-06, + "loss": 0.4848, + "step": 11853 + }, + { + "epoch": 1.5431472081218274, + "grad_norm": 2.954528570175171, + "learning_rate": 4.943227127732212e-06, + "loss": 0.6347, + "step": 11856 + }, + { + "epoch": 1.5435376805935181, + "grad_norm": 2.76151442527771, + "learning_rate": 4.9411374235876155e-06, + "loss": 0.5226, + "step": 11859 + }, + { + "epoch": 1.543928153065209, + "grad_norm": 4.05312967300415, + "learning_rate": 4.939047729726189e-06, + "loss": 0.5568, + "step": 11862 + }, + { + "epoch": 1.5443186255368997, + "grad_norm": 2.8274898529052734, + "learning_rate": 4.936958046512994e-06, + "loss": 0.5498, + "step": 11865 + }, + { + "epoch": 1.5447090980085902, + "grad_norm": 2.768404483795166, + "learning_rate": 4.934868374313097e-06, + "loss": 0.5259, + "step": 11868 + }, + { + "epoch": 1.5450995704802812, + "grad_norm": 2.711665153503418, + "learning_rate": 4.9327787134915576e-06, + "loss": 0.5094, + "step": 11871 + }, + { + "epoch": 1.5454900429519718, + "grad_norm": 2.3921022415161133, + "learning_rate": 4.9306890644134345e-06, + "loss": 0.4367, + "step": 11874 + }, + { + "epoch": 1.5458805154236628, + "grad_norm": 2.8708338737487793, + "learning_rate": 4.928599427443788e-06, + "loss": 0.5638, + "step": 11877 + }, + { + "epoch": 1.5462709878953533, + "grad_norm": 2.8968679904937744, + "learning_rate": 4.92650980294767e-06, + "loss": 0.524, + "step": 11880 + }, + { + "epoch": 1.546661460367044, + "grad_norm": 3.2613213062286377, + "learning_rate": 4.924420191290134e-06, + "loss": 0.4995, + "step": 11883 + }, + { + "epoch": 1.5470519328387349, + "grad_norm": 2.6707489490509033, + "learning_rate": 4.922330592836234e-06, + "loss": 0.4213, + "step": 11886 + }, + { + "epoch": 1.5474424053104257, + "grad_norm": 2.552860736846924, + "learning_rate": 4.920241007951012e-06, + "loss": 0.4674, + "step": 11889 + }, + { + "epoch": 1.5478328777821164, + "grad_norm": 3.741816282272339, + "learning_rate": 4.918151436999521e-06, + "loss": 0.4466, + "step": 11892 + }, + { + "epoch": 1.548223350253807, + "grad_norm": 3.1363577842712402, + "learning_rate": 4.916061880346802e-06, + "loss": 0.4513, + "step": 11895 + }, + { + "epoch": 1.548613822725498, + "grad_norm": 3.1754071712493896, + "learning_rate": 4.913972338357895e-06, + "loss": 0.4901, + "step": 11898 + }, + { + "epoch": 1.5490042951971885, + "grad_norm": 2.3715884685516357, + "learning_rate": 4.91188281139784e-06, + "loss": 0.5472, + "step": 11901 + }, + { + "epoch": 1.5493947676688793, + "grad_norm": 2.7456414699554443, + "learning_rate": 4.909793299831673e-06, + "loss": 0.5158, + "step": 11904 + }, + { + "epoch": 1.54978524014057, + "grad_norm": 2.7107367515563965, + "learning_rate": 4.907703804024425e-06, + "loss": 0.4408, + "step": 11907 + }, + { + "epoch": 1.5501757126122608, + "grad_norm": 2.649536609649658, + "learning_rate": 4.90561432434113e-06, + "loss": 0.534, + "step": 11910 + }, + { + "epoch": 1.5505661850839516, + "grad_norm": 2.4240305423736572, + 
"learning_rate": 4.903524861146814e-06, + "loss": 0.49, + "step": 11913 + }, + { + "epoch": 1.5509566575556424, + "grad_norm": 2.6543915271759033, + "learning_rate": 4.901435414806502e-06, + "loss": 0.4731, + "step": 11916 + }, + { + "epoch": 1.5513471300273332, + "grad_norm": 2.658057928085327, + "learning_rate": 4.899345985685215e-06, + "loss": 0.4885, + "step": 11919 + }, + { + "epoch": 1.5517376024990237, + "grad_norm": 2.6774919033050537, + "learning_rate": 4.8972565741479715e-06, + "loss": 0.4726, + "step": 11922 + }, + { + "epoch": 1.5521280749707147, + "grad_norm": 2.612046718597412, + "learning_rate": 4.89516718055979e-06, + "loss": 0.4846, + "step": 11925 + }, + { + "epoch": 1.5525185474424053, + "grad_norm": 2.682743549346924, + "learning_rate": 4.893077805285681e-06, + "loss": 0.4907, + "step": 11928 + }, + { + "epoch": 1.552909019914096, + "grad_norm": 2.6726317405700684, + "learning_rate": 4.890988448690653e-06, + "loss": 0.4273, + "step": 11931 + }, + { + "epoch": 1.5532994923857868, + "grad_norm": 3.1792590618133545, + "learning_rate": 4.8888991111397144e-06, + "loss": 0.5008, + "step": 11934 + }, + { + "epoch": 1.5536899648574776, + "grad_norm": 2.972460985183716, + "learning_rate": 4.886809792997868e-06, + "loss": 0.534, + "step": 11937 + }, + { + "epoch": 1.5540804373291683, + "grad_norm": 2.578676223754883, + "learning_rate": 4.884720494630113e-06, + "loss": 0.5175, + "step": 11940 + }, + { + "epoch": 1.554470909800859, + "grad_norm": 2.6860499382019043, + "learning_rate": 4.882631216401445e-06, + "loss": 0.4532, + "step": 11943 + }, + { + "epoch": 1.5548613822725499, + "grad_norm": 2.5615968704223633, + "learning_rate": 4.880541958676856e-06, + "loss": 0.4637, + "step": 11946 + }, + { + "epoch": 1.5552518547442404, + "grad_norm": 2.9076266288757324, + "learning_rate": 4.878452721821336e-06, + "loss": 0.5384, + "step": 11949 + }, + { + "epoch": 1.5556423272159314, + "grad_norm": 2.601215124130249, + "learning_rate": 4.876363506199869e-06, + "loss": 0.4748, + "step": 11952 + }, + { + "epoch": 1.556032799687622, + "grad_norm": 2.8393771648406982, + "learning_rate": 4.874274312177439e-06, + "loss": 0.5763, + "step": 11955 + }, + { + "epoch": 1.5564232721593128, + "grad_norm": 2.3092920780181885, + "learning_rate": 4.87218514011902e-06, + "loss": 0.4322, + "step": 11958 + }, + { + "epoch": 1.5568137446310035, + "grad_norm": 2.9448838233947754, + "learning_rate": 4.87009599038959e-06, + "loss": 0.5118, + "step": 11961 + }, + { + "epoch": 1.5572042171026943, + "grad_norm": 2.6871285438537598, + "learning_rate": 4.868006863354117e-06, + "loss": 0.5206, + "step": 11964 + }, + { + "epoch": 1.557594689574385, + "grad_norm": 2.516818046569824, + "learning_rate": 4.865917759377567e-06, + "loss": 0.4144, + "step": 11967 + }, + { + "epoch": 1.5579851620460756, + "grad_norm": 3.410189151763916, + "learning_rate": 4.8638286788249025e-06, + "loss": 0.4504, + "step": 11970 + }, + { + "epoch": 1.5583756345177666, + "grad_norm": 3.0182759761810303, + "learning_rate": 4.861739622061078e-06, + "loss": 0.4726, + "step": 11973 + }, + { + "epoch": 1.5587661069894572, + "grad_norm": 2.5583255290985107, + "learning_rate": 4.859650589451055e-06, + "loss": 0.457, + "step": 11976 + }, + { + "epoch": 1.559156579461148, + "grad_norm": 2.401313304901123, + "learning_rate": 4.857561581359777e-06, + "loss": 0.5042, + "step": 11979 + }, + { + "epoch": 1.5595470519328387, + "grad_norm": 2.8148505687713623, + "learning_rate": 4.855472598152193e-06, + "loss": 0.4324, + "step": 11982 + }, + { + "epoch": 
1.5599375244045295, + "grad_norm": 3.2659788131713867, + "learning_rate": 4.8533836401932395e-06, + "loss": 0.4854, + "step": 11985 + }, + { + "epoch": 1.5603279968762203, + "grad_norm": 2.5836613178253174, + "learning_rate": 4.851294707847856e-06, + "loss": 0.5318, + "step": 11988 + }, + { + "epoch": 1.5607184693479108, + "grad_norm": 2.268071413040161, + "learning_rate": 4.849205801480976e-06, + "loss": 0.4897, + "step": 11991 + }, + { + "epoch": 1.5611089418196018, + "grad_norm": 2.9751930236816406, + "learning_rate": 4.847116921457524e-06, + "loss": 0.4713, + "step": 11994 + }, + { + "epoch": 1.5614994142912924, + "grad_norm": 3.3204164505004883, + "learning_rate": 4.8450280681424235e-06, + "loss": 0.4727, + "step": 11997 + }, + { + "epoch": 1.5618898867629833, + "grad_norm": 2.7896077632904053, + "learning_rate": 4.842939241900595e-06, + "loss": 0.5829, + "step": 12000 + }, + { + "epoch": 1.562280359234674, + "grad_norm": 2.7530436515808105, + "learning_rate": 4.84085044309695e-06, + "loss": 0.4501, + "step": 12003 + }, + { + "epoch": 1.5626708317063647, + "grad_norm": 2.5019564628601074, + "learning_rate": 4.838761672096398e-06, + "loss": 0.4916, + "step": 12006 + }, + { + "epoch": 1.5630613041780554, + "grad_norm": 2.6398847103118896, + "learning_rate": 4.8366729292638425e-06, + "loss": 0.5605, + "step": 12009 + }, + { + "epoch": 1.5634517766497462, + "grad_norm": 3.4240760803222656, + "learning_rate": 4.834584214964182e-06, + "loss": 0.5437, + "step": 12012 + }, + { + "epoch": 1.563842249121437, + "grad_norm": 2.8203554153442383, + "learning_rate": 4.8324955295623105e-06, + "loss": 0.5711, + "step": 12015 + }, + { + "epoch": 1.5642327215931275, + "grad_norm": 2.9343690872192383, + "learning_rate": 4.8304068734231194e-06, + "loss": 0.4177, + "step": 12018 + }, + { + "epoch": 1.5646231940648185, + "grad_norm": 2.7858777046203613, + "learning_rate": 4.82831824691149e-06, + "loss": 0.4601, + "step": 12021 + }, + { + "epoch": 1.565013666536509, + "grad_norm": 2.6615116596221924, + "learning_rate": 4.826229650392301e-06, + "loss": 0.5292, + "step": 12024 + }, + { + "epoch": 1.5654041390082, + "grad_norm": 2.6517207622528076, + "learning_rate": 4.824141084230429e-06, + "loss": 0.5592, + "step": 12027 + }, + { + "epoch": 1.5657946114798906, + "grad_norm": 2.7702314853668213, + "learning_rate": 4.822052548790737e-06, + "loss": 0.5166, + "step": 12030 + }, + { + "epoch": 1.5661850839515814, + "grad_norm": 2.714916706085205, + "learning_rate": 4.819964044438092e-06, + "loss": 0.4259, + "step": 12033 + }, + { + "epoch": 1.5665755564232722, + "grad_norm": 2.4574501514434814, + "learning_rate": 4.81787557153735e-06, + "loss": 0.4761, + "step": 12036 + }, + { + "epoch": 1.566966028894963, + "grad_norm": 2.5279035568237305, + "learning_rate": 4.81578713045336e-06, + "loss": 0.4604, + "step": 12039 + }, + { + "epoch": 1.5673565013666537, + "grad_norm": 2.7598211765289307, + "learning_rate": 4.813698721550973e-06, + "loss": 0.4823, + "step": 12042 + }, + { + "epoch": 1.5677469738383443, + "grad_norm": 2.556950330734253, + "learning_rate": 4.811610345195027e-06, + "loss": 0.4942, + "step": 12045 + }, + { + "epoch": 1.5681374463100353, + "grad_norm": 3.494272470474243, + "learning_rate": 4.809522001750358e-06, + "loss": 0.505, + "step": 12048 + }, + { + "epoch": 1.5685279187817258, + "grad_norm": 2.75669002532959, + "learning_rate": 4.807433691581793e-06, + "loss": 0.486, + "step": 12051 + }, + { + "epoch": 1.5689183912534166, + "grad_norm": 2.629504680633545, + "learning_rate": 4.805345415054158e-06, 
+ "loss": 0.4452, + "step": 12054 + }, + { + "epoch": 1.5693088637251074, + "grad_norm": 2.816953659057617, + "learning_rate": 4.803257172532267e-06, + "loss": 0.4767, + "step": 12057 + }, + { + "epoch": 1.5696993361967981, + "grad_norm": 2.689758777618408, + "learning_rate": 4.801168964380938e-06, + "loss": 0.5014, + "step": 12060 + }, + { + "epoch": 1.570089808668489, + "grad_norm": 2.534708261489868, + "learning_rate": 4.79908079096497e-06, + "loss": 0.4778, + "step": 12063 + }, + { + "epoch": 1.5704802811401795, + "grad_norm": 2.8629817962646484, + "learning_rate": 4.796992652649166e-06, + "loss": 0.5156, + "step": 12066 + }, + { + "epoch": 1.5708707536118705, + "grad_norm": 3.1996259689331055, + "learning_rate": 4.794904549798319e-06, + "loss": 0.4859, + "step": 12069 + }, + { + "epoch": 1.571261226083561, + "grad_norm": 2.5046546459198, + "learning_rate": 4.792816482777216e-06, + "loss": 0.5298, + "step": 12072 + }, + { + "epoch": 1.571651698555252, + "grad_norm": 2.7203421592712402, + "learning_rate": 4.790728451950636e-06, + "loss": 0.5006, + "step": 12075 + }, + { + "epoch": 1.5720421710269425, + "grad_norm": 2.6201863288879395, + "learning_rate": 4.7886404576833564e-06, + "loss": 0.5196, + "step": 12078 + }, + { + "epoch": 1.5724326434986333, + "grad_norm": 2.636479377746582, + "learning_rate": 4.786552500340144e-06, + "loss": 0.4408, + "step": 12081 + }, + { + "epoch": 1.572823115970324, + "grad_norm": 2.937016487121582, + "learning_rate": 4.784464580285761e-06, + "loss": 0.5735, + "step": 12084 + }, + { + "epoch": 1.5732135884420149, + "grad_norm": 2.6083309650421143, + "learning_rate": 4.782376697884962e-06, + "loss": 0.5457, + "step": 12087 + }, + { + "epoch": 1.5736040609137056, + "grad_norm": 2.875025987625122, + "learning_rate": 4.780288853502496e-06, + "loss": 0.5335, + "step": 12090 + }, + { + "epoch": 1.5739945333853962, + "grad_norm": 2.6229593753814697, + "learning_rate": 4.778201047503106e-06, + "loss": 0.5258, + "step": 12093 + }, + { + "epoch": 1.5743850058570872, + "grad_norm": 2.5524308681488037, + "learning_rate": 4.776113280251525e-06, + "loss": 0.4433, + "step": 12096 + }, + { + "epoch": 1.5747754783287777, + "grad_norm": 2.6454172134399414, + "learning_rate": 4.7740255521124825e-06, + "loss": 0.5138, + "step": 12099 + }, + { + "epoch": 1.5751659508004687, + "grad_norm": 2.7520358562469482, + "learning_rate": 4.771937863450701e-06, + "loss": 0.5128, + "step": 12102 + }, + { + "epoch": 1.5755564232721593, + "grad_norm": 2.627397060394287, + "learning_rate": 4.769850214630897e-06, + "loss": 0.5503, + "step": 12105 + }, + { + "epoch": 1.57594689574385, + "grad_norm": 2.6487503051757812, + "learning_rate": 4.767762606017775e-06, + "loss": 0.4488, + "step": 12108 + }, + { + "epoch": 1.5763373682155408, + "grad_norm": 2.3210136890411377, + "learning_rate": 4.765675037976038e-06, + "loss": 0.4593, + "step": 12111 + }, + { + "epoch": 1.5767278406872316, + "grad_norm": 2.7287368774414062, + "learning_rate": 4.763587510870377e-06, + "loss": 0.5627, + "step": 12114 + }, + { + "epoch": 1.5771183131589224, + "grad_norm": 2.7396085262298584, + "learning_rate": 4.761500025065482e-06, + "loss": 0.4773, + "step": 12117 + }, + { + "epoch": 1.577508785630613, + "grad_norm": 2.732445478439331, + "learning_rate": 4.7594125809260315e-06, + "loss": 0.4791, + "step": 12120 + }, + { + "epoch": 1.577899258102304, + "grad_norm": 2.82453989982605, + "learning_rate": 4.7573251788166954e-06, + "loss": 0.5459, + "step": 12123 + }, + { + "epoch": 1.5782897305739945, + "grad_norm": 
2.6506190299987793, + "learning_rate": 4.755237819102141e-06, + "loss": 0.4923, + "step": 12126 + }, + { + "epoch": 1.5786802030456852, + "grad_norm": 2.771843910217285, + "learning_rate": 4.7531505021470245e-06, + "loss": 0.5828, + "step": 12129 + }, + { + "epoch": 1.579070675517376, + "grad_norm": 2.5433382987976074, + "learning_rate": 4.751063228315996e-06, + "loss": 0.5142, + "step": 12132 + }, + { + "epoch": 1.5794611479890668, + "grad_norm": 2.8748183250427246, + "learning_rate": 4.748975997973698e-06, + "loss": 0.5367, + "step": 12135 + }, + { + "epoch": 1.5798516204607576, + "grad_norm": 2.996832847595215, + "learning_rate": 4.746888811484765e-06, + "loss": 0.4731, + "step": 12138 + }, + { + "epoch": 1.580242092932448, + "grad_norm": 2.554917335510254, + "learning_rate": 4.744801669213822e-06, + "loss": 0.4605, + "step": 12141 + }, + { + "epoch": 1.580632565404139, + "grad_norm": 2.709306240081787, + "learning_rate": 4.742714571525492e-06, + "loss": 0.5183, + "step": 12144 + }, + { + "epoch": 1.5810230378758297, + "grad_norm": 3.325532913208008, + "learning_rate": 4.740627518784387e-06, + "loss": 0.4707, + "step": 12147 + }, + { + "epoch": 1.5814135103475206, + "grad_norm": 2.7227187156677246, + "learning_rate": 4.738540511355107e-06, + "loss": 0.4803, + "step": 12150 + }, + { + "epoch": 1.5818039828192112, + "grad_norm": 2.8616886138916016, + "learning_rate": 4.736453549602249e-06, + "loss": 0.5472, + "step": 12153 + }, + { + "epoch": 1.582194455290902, + "grad_norm": 2.477766513824463, + "learning_rate": 4.734366633890404e-06, + "loss": 0.5774, + "step": 12156 + }, + { + "epoch": 1.5825849277625927, + "grad_norm": 2.6389472484588623, + "learning_rate": 4.732279764584148e-06, + "loss": 0.4967, + "step": 12159 + }, + { + "epoch": 1.5829754002342835, + "grad_norm": 3.282442569732666, + "learning_rate": 4.730192942048054e-06, + "loss": 0.4934, + "step": 12162 + }, + { + "epoch": 1.5833658727059743, + "grad_norm": 2.8340752124786377, + "learning_rate": 4.7281061666466845e-06, + "loss": 0.5129, + "step": 12165 + }, + { + "epoch": 1.5837563451776648, + "grad_norm": 2.659324884414673, + "learning_rate": 4.726019438744596e-06, + "loss": 0.4756, + "step": 12168 + }, + { + "epoch": 1.5841468176493558, + "grad_norm": 2.673013925552368, + "learning_rate": 4.723932758706337e-06, + "loss": 0.4839, + "step": 12171 + }, + { + "epoch": 1.5845372901210464, + "grad_norm": 2.854623556137085, + "learning_rate": 4.721846126896442e-06, + "loss": 0.5232, + "step": 12174 + }, + { + "epoch": 1.5849277625927374, + "grad_norm": 2.7628884315490723, + "learning_rate": 4.7197595436794445e-06, + "loss": 0.4909, + "step": 12177 + }, + { + "epoch": 1.585318235064428, + "grad_norm": 2.850160598754883, + "learning_rate": 4.717673009419865e-06, + "loss": 0.4582, + "step": 12180 + }, + { + "epoch": 1.5857087075361187, + "grad_norm": 3.7766642570495605, + "learning_rate": 4.715586524482216e-06, + "loss": 0.4465, + "step": 12183 + }, + { + "epoch": 1.5860991800078095, + "grad_norm": 2.8595075607299805, + "learning_rate": 4.7135000892310025e-06, + "loss": 0.4837, + "step": 12186 + }, + { + "epoch": 1.5864896524795002, + "grad_norm": 2.8177318572998047, + "learning_rate": 4.711413704030722e-06, + "loss": 0.4759, + "step": 12189 + }, + { + "epoch": 1.586880124951191, + "grad_norm": 2.634204149246216, + "learning_rate": 4.709327369245861e-06, + "loss": 0.4633, + "step": 12192 + }, + { + "epoch": 1.5872705974228816, + "grad_norm": 2.40295147895813, + "learning_rate": 4.7072410852408965e-06, + "loss": 0.499, + "step": 12195 + 
}, + { + "epoch": 1.5876610698945726, + "grad_norm": 3.038196325302124, + "learning_rate": 4.705154852380299e-06, + "loss": 0.4878, + "step": 12198 + }, + { + "epoch": 1.5880515423662631, + "grad_norm": 2.4882776737213135, + "learning_rate": 4.7030686710285275e-06, + "loss": 0.4522, + "step": 12201 + }, + { + "epoch": 1.5884420148379539, + "grad_norm": 3.963737964630127, + "learning_rate": 4.700982541550034e-06, + "loss": 0.5776, + "step": 12204 + }, + { + "epoch": 1.5888324873096447, + "grad_norm": 2.4971764087677, + "learning_rate": 4.6988964643092635e-06, + "loss": 0.4437, + "step": 12207 + }, + { + "epoch": 1.5892229597813354, + "grad_norm": 2.5991008281707764, + "learning_rate": 4.696810439670645e-06, + "loss": 0.4736, + "step": 12210 + }, + { + "epoch": 1.5896134322530262, + "grad_norm": 3.21665620803833, + "learning_rate": 4.694724467998607e-06, + "loss": 0.4886, + "step": 12213 + }, + { + "epoch": 1.5900039047247168, + "grad_norm": 2.903533458709717, + "learning_rate": 4.692638549657561e-06, + "loss": 0.4829, + "step": 12216 + }, + { + "epoch": 1.5903943771964077, + "grad_norm": 2.9863076210021973, + "learning_rate": 4.690552685011913e-06, + "loss": 0.5482, + "step": 12219 + }, + { + "epoch": 1.5907848496680983, + "grad_norm": 2.6374809741973877, + "learning_rate": 4.688466874426062e-06, + "loss": 0.5568, + "step": 12222 + }, + { + "epoch": 1.5911753221397893, + "grad_norm": 3.212864398956299, + "learning_rate": 4.68638111826439e-06, + "loss": 0.4724, + "step": 12225 + }, + { + "epoch": 1.5915657946114798, + "grad_norm": 2.7329304218292236, + "learning_rate": 4.684295416891278e-06, + "loss": 0.5167, + "step": 12228 + }, + { + "epoch": 1.5919562670831706, + "grad_norm": 2.6516621112823486, + "learning_rate": 4.6822097706710935e-06, + "loss": 0.519, + "step": 12231 + }, + { + "epoch": 1.5923467395548614, + "grad_norm": 2.7666425704956055, + "learning_rate": 4.680124179968193e-06, + "loss": 0.5548, + "step": 12234 + }, + { + "epoch": 1.5927372120265522, + "grad_norm": 2.8573389053344727, + "learning_rate": 4.678038645146926e-06, + "loss": 0.5459, + "step": 12237 + }, + { + "epoch": 1.593127684498243, + "grad_norm": 2.5972604751586914, + "learning_rate": 4.67595316657163e-06, + "loss": 0.4768, + "step": 12240 + }, + { + "epoch": 1.5935181569699335, + "grad_norm": 3.3133761882781982, + "learning_rate": 4.673867744606633e-06, + "loss": 0.4992, + "step": 12243 + }, + { + "epoch": 1.5939086294416245, + "grad_norm": 2.886195659637451, + "learning_rate": 4.671782379616256e-06, + "loss": 0.5065, + "step": 12246 + }, + { + "epoch": 1.594299101913315, + "grad_norm": 2.5426437854766846, + "learning_rate": 4.669697071964807e-06, + "loss": 0.4161, + "step": 12249 + }, + { + "epoch": 1.5946895743850058, + "grad_norm": 3.0730109214782715, + "learning_rate": 4.667611822016584e-06, + "loss": 0.6377, + "step": 12252 + }, + { + "epoch": 1.5950800468566966, + "grad_norm": 2.649171829223633, + "learning_rate": 4.665526630135877e-06, + "loss": 0.5299, + "step": 12255 + }, + { + "epoch": 1.5954705193283873, + "grad_norm": 2.3111774921417236, + "learning_rate": 4.663441496686964e-06, + "loss": 0.4358, + "step": 12258 + }, + { + "epoch": 1.5958609918000781, + "grad_norm": 3.4514715671539307, + "learning_rate": 4.661356422034113e-06, + "loss": 0.572, + "step": 12261 + }, + { + "epoch": 1.596251464271769, + "grad_norm": 2.392629623413086, + "learning_rate": 4.659271406541584e-06, + "loss": 0.5273, + "step": 12264 + }, + { + "epoch": 1.5966419367434597, + "grad_norm": 2.7930562496185303, + "learning_rate": 
4.65718645057362e-06, + "loss": 0.483, + "step": 12267 + }, + { + "epoch": 1.5970324092151502, + "grad_norm": 2.5676727294921875, + "learning_rate": 4.655101554494465e-06, + "loss": 0.4614, + "step": 12270 + }, + { + "epoch": 1.5974228816868412, + "grad_norm": 2.318683385848999, + "learning_rate": 4.653016718668342e-06, + "loss": 0.4482, + "step": 12273 + }, + { + "epoch": 1.5978133541585318, + "grad_norm": 2.654034376144409, + "learning_rate": 4.650931943459469e-06, + "loss": 0.4685, + "step": 12276 + }, + { + "epoch": 1.5982038266302225, + "grad_norm": 2.6656546592712402, + "learning_rate": 4.64884722923205e-06, + "loss": 0.5102, + "step": 12279 + }, + { + "epoch": 1.5985942991019133, + "grad_norm": 2.5751311779022217, + "learning_rate": 4.646762576350282e-06, + "loss": 0.4802, + "step": 12282 + }, + { + "epoch": 1.598984771573604, + "grad_norm": 2.3519175052642822, + "learning_rate": 4.644677985178349e-06, + "loss": 0.468, + "step": 12285 + }, + { + "epoch": 1.5993752440452949, + "grad_norm": 2.790675401687622, + "learning_rate": 4.642593456080425e-06, + "loss": 0.6174, + "step": 12288 + }, + { + "epoch": 1.5997657165169854, + "grad_norm": 2.661125898361206, + "learning_rate": 4.640508989420672e-06, + "loss": 0.4254, + "step": 12291 + }, + { + "epoch": 1.6001561889886764, + "grad_norm": 2.642672300338745, + "learning_rate": 4.638424585563241e-06, + "loss": 0.4429, + "step": 12294 + }, + { + "epoch": 1.600546661460367, + "grad_norm": 2.7480437755584717, + "learning_rate": 4.636340244872275e-06, + "loss": 0.5078, + "step": 12297 + }, + { + "epoch": 1.600937133932058, + "grad_norm": 3.16561222076416, + "learning_rate": 4.634255967711905e-06, + "loss": 0.5383, + "step": 12300 + }, + { + "epoch": 1.6013276064037485, + "grad_norm": 2.9352433681488037, + "learning_rate": 4.632171754446246e-06, + "loss": 0.4734, + "step": 12303 + }, + { + "epoch": 1.6017180788754393, + "grad_norm": 2.6060163974761963, + "learning_rate": 4.630087605439407e-06, + "loss": 0.466, + "step": 12306 + }, + { + "epoch": 1.60210855134713, + "grad_norm": 2.827497720718384, + "learning_rate": 4.628003521055486e-06, + "loss": 0.4221, + "step": 12309 + }, + { + "epoch": 1.6024990238188208, + "grad_norm": 2.8147799968719482, + "learning_rate": 4.625919501658568e-06, + "loss": 0.4873, + "step": 12312 + }, + { + "epoch": 1.6028894962905116, + "grad_norm": 3.2619431018829346, + "learning_rate": 4.623835547612726e-06, + "loss": 0.4946, + "step": 12315 + }, + { + "epoch": 1.6032799687622021, + "grad_norm": 3.4461402893066406, + "learning_rate": 4.621751659282021e-06, + "loss": 0.5536, + "step": 12318 + }, + { + "epoch": 1.6036704412338931, + "grad_norm": 2.9235973358154297, + "learning_rate": 4.619667837030508e-06, + "loss": 0.5901, + "step": 12321 + }, + { + "epoch": 1.6040609137055837, + "grad_norm": 2.7743449211120605, + "learning_rate": 4.6175840812222214e-06, + "loss": 0.4504, + "step": 12324 + }, + { + "epoch": 1.6044513861772745, + "grad_norm": 3.0675461292266846, + "learning_rate": 4.615500392221193e-06, + "loss": 0.5658, + "step": 12327 + }, + { + "epoch": 1.6048418586489652, + "grad_norm": 2.25282883644104, + "learning_rate": 4.613416770391437e-06, + "loss": 0.4366, + "step": 12330 + }, + { + "epoch": 1.605232331120656, + "grad_norm": 2.5219714641571045, + "learning_rate": 4.611333216096957e-06, + "loss": 0.574, + "step": 12333 + }, + { + "epoch": 1.6056228035923468, + "grad_norm": 2.765669345855713, + "learning_rate": 4.6092497297017475e-06, + "loss": 0.5322, + "step": 12336 + }, + { + "epoch": 1.6060132760640373, + 
"grad_norm": 2.6073596477508545, + "learning_rate": 4.607166311569787e-06, + "loss": 0.4499, + "step": 12339 + }, + { + "epoch": 1.6064037485357283, + "grad_norm": 2.747248411178589, + "learning_rate": 4.605082962065047e-06, + "loss": 0.4734, + "step": 12342 + }, + { + "epoch": 1.6067942210074189, + "grad_norm": 3.6643686294555664, + "learning_rate": 4.602999681551482e-06, + "loss": 0.4507, + "step": 12345 + }, + { + "epoch": 1.6071846934791099, + "grad_norm": 2.712887763977051, + "learning_rate": 4.600916470393037e-06, + "loss": 0.5074, + "step": 12348 + }, + { + "epoch": 1.6075751659508004, + "grad_norm": 2.606999635696411, + "learning_rate": 4.5988333289536444e-06, + "loss": 0.4703, + "step": 12351 + }, + { + "epoch": 1.6079656384224912, + "grad_norm": 3.723257064819336, + "learning_rate": 4.596750257597227e-06, + "loss": 0.5086, + "step": 12354 + }, + { + "epoch": 1.608356110894182, + "grad_norm": 3.431612253189087, + "learning_rate": 4.59466725668769e-06, + "loss": 0.5268, + "step": 12357 + }, + { + "epoch": 1.6087465833658727, + "grad_norm": 2.8018877506256104, + "learning_rate": 4.592584326588931e-06, + "loss": 0.5083, + "step": 12360 + }, + { + "epoch": 1.6091370558375635, + "grad_norm": 2.578345775604248, + "learning_rate": 4.590501467664834e-06, + "loss": 0.4861, + "step": 12363 + }, + { + "epoch": 1.609527528309254, + "grad_norm": 2.7130467891693115, + "learning_rate": 4.588418680279268e-06, + "loss": 0.5426, + "step": 12366 + }, + { + "epoch": 1.609918000780945, + "grad_norm": 2.725135564804077, + "learning_rate": 4.5863359647960924e-06, + "loss": 0.4927, + "step": 12369 + }, + { + "epoch": 1.6103084732526356, + "grad_norm": 2.6991970539093018, + "learning_rate": 4.584253321579155e-06, + "loss": 0.5302, + "step": 12372 + }, + { + "epoch": 1.6106989457243266, + "grad_norm": 2.4623939990997314, + "learning_rate": 4.582170750992287e-06, + "loss": 0.3889, + "step": 12375 + }, + { + "epoch": 1.6110894181960171, + "grad_norm": 2.5120255947113037, + "learning_rate": 4.580088253399311e-06, + "loss": 0.427, + "step": 12378 + }, + { + "epoch": 1.611479890667708, + "grad_norm": 2.3307392597198486, + "learning_rate": 4.578005829164032e-06, + "loss": 0.5037, + "step": 12381 + }, + { + "epoch": 1.6118703631393987, + "grad_norm": 2.7387290000915527, + "learning_rate": 4.575923478650246e-06, + "loss": 0.4973, + "step": 12384 + }, + { + "epoch": 1.6122608356110895, + "grad_norm": 2.8055717945098877, + "learning_rate": 4.573841202221739e-06, + "loss": 0.5121, + "step": 12387 + }, + { + "epoch": 1.6126513080827802, + "grad_norm": 3.864004373550415, + "learning_rate": 4.5717590002422755e-06, + "loss": 0.5525, + "step": 12390 + }, + { + "epoch": 1.6130417805544708, + "grad_norm": 2.713977336883545, + "learning_rate": 4.569676873075613e-06, + "loss": 0.5923, + "step": 12393 + }, + { + "epoch": 1.6134322530261618, + "grad_norm": 2.7594332695007324, + "learning_rate": 4.567594821085497e-06, + "loss": 0.5122, + "step": 12396 + }, + { + "epoch": 1.6138227254978523, + "grad_norm": 2.3967032432556152, + "learning_rate": 4.565512844635657e-06, + "loss": 0.4932, + "step": 12399 + }, + { + "epoch": 1.614213197969543, + "grad_norm": 2.6698012351989746, + "learning_rate": 4.563430944089807e-06, + "loss": 0.457, + "step": 12402 + }, + { + "epoch": 1.6146036704412339, + "grad_norm": 2.7440128326416016, + "learning_rate": 4.561349119811655e-06, + "loss": 0.4945, + "step": 12405 + }, + { + "epoch": 1.6149941429129246, + "grad_norm": 2.666889190673828, + "learning_rate": 4.559267372164886e-06, + "loss": 0.5029, + 
"step": 12408 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 2.826246738433838, + "learning_rate": 4.557185701513182e-06, + "loss": 0.4937, + "step": 12411 + }, + { + "epoch": 1.615775087856306, + "grad_norm": 2.5194458961486816, + "learning_rate": 4.555104108220205e-06, + "loss": 0.4776, + "step": 12414 + }, + { + "epoch": 1.616165560327997, + "grad_norm": 2.725764274597168, + "learning_rate": 4.553022592649602e-06, + "loss": 0.5434, + "step": 12417 + }, + { + "epoch": 1.6165560327996875, + "grad_norm": 2.692974805831909, + "learning_rate": 4.550941155165015e-06, + "loss": 0.518, + "step": 12420 + }, + { + "epoch": 1.6169465052713785, + "grad_norm": 2.79667329788208, + "learning_rate": 4.548859796130061e-06, + "loss": 0.5053, + "step": 12423 + }, + { + "epoch": 1.617336977743069, + "grad_norm": 2.6578612327575684, + "learning_rate": 4.546778515908352e-06, + "loss": 0.4535, + "step": 12426 + }, + { + "epoch": 1.6177274502147598, + "grad_norm": 2.6429173946380615, + "learning_rate": 4.544697314863483e-06, + "loss": 0.4714, + "step": 12429 + }, + { + "epoch": 1.6181179226864506, + "grad_norm": 2.6964876651763916, + "learning_rate": 4.542616193359035e-06, + "loss": 0.5593, + "step": 12432 + }, + { + "epoch": 1.6185083951581414, + "grad_norm": 2.86681866645813, + "learning_rate": 4.540535151758575e-06, + "loss": 0.5782, + "step": 12435 + }, + { + "epoch": 1.6188988676298322, + "grad_norm": 4.218873977661133, + "learning_rate": 4.538454190425658e-06, + "loss": 0.5248, + "step": 12438 + }, + { + "epoch": 1.6192893401015227, + "grad_norm": 4.104538917541504, + "learning_rate": 4.5363733097238235e-06, + "loss": 0.5671, + "step": 12441 + }, + { + "epoch": 1.6196798125732137, + "grad_norm": 2.703684091567993, + "learning_rate": 4.534292510016597e-06, + "loss": 0.522, + "step": 12444 + }, + { + "epoch": 1.6200702850449042, + "grad_norm": 3.4295432567596436, + "learning_rate": 4.532211791667488e-06, + "loss": 0.5239, + "step": 12447 + }, + { + "epoch": 1.6204607575165952, + "grad_norm": 2.69512939453125, + "learning_rate": 4.5301311550399966e-06, + "loss": 0.4816, + "step": 12450 + }, + { + "epoch": 1.6208512299882858, + "grad_norm": 2.700019121170044, + "learning_rate": 4.5280506004976025e-06, + "loss": 0.4808, + "step": 12453 + }, + { + "epoch": 1.6212417024599766, + "grad_norm": 2.682190179824829, + "learning_rate": 4.525970128403777e-06, + "loss": 0.5554, + "step": 12456 + }, + { + "epoch": 1.6216321749316673, + "grad_norm": 2.4642701148986816, + "learning_rate": 4.523889739121971e-06, + "loss": 0.4808, + "step": 12459 + }, + { + "epoch": 1.622022647403358, + "grad_norm": 2.6819307804107666, + "learning_rate": 4.521809433015627e-06, + "loss": 0.5932, + "step": 12462 + }, + { + "epoch": 1.6224131198750489, + "grad_norm": 2.5704028606414795, + "learning_rate": 4.51972921044817e-06, + "loss": 0.5527, + "step": 12465 + }, + { + "epoch": 1.6228035923467394, + "grad_norm": 2.388164758682251, + "learning_rate": 4.517649071783008e-06, + "loss": 0.5643, + "step": 12468 + }, + { + "epoch": 1.6231940648184304, + "grad_norm": 2.3138227462768555, + "learning_rate": 4.51556901738354e-06, + "loss": 0.4487, + "step": 12471 + }, + { + "epoch": 1.623584537290121, + "grad_norm": 2.4271912574768066, + "learning_rate": 4.513489047613144e-06, + "loss": 0.465, + "step": 12474 + }, + { + "epoch": 1.6239750097618117, + "grad_norm": 2.532554864883423, + "learning_rate": 4.511409162835188e-06, + "loss": 0.5073, + "step": 12477 + }, + { + "epoch": 1.6243654822335025, + "grad_norm": 2.848936080932617, + 
"learning_rate": 4.509329363413023e-06, + "loss": 0.4336, + "step": 12480 + }, + { + "epoch": 1.6247559547051933, + "grad_norm": 2.899810791015625, + "learning_rate": 4.507249649709988e-06, + "loss": 0.5284, + "step": 12483 + }, + { + "epoch": 1.625146427176884, + "grad_norm": 2.649650812149048, + "learning_rate": 4.505170022089401e-06, + "loss": 0.506, + "step": 12486 + }, + { + "epoch": 1.6255368996485746, + "grad_norm": 2.40224552154541, + "learning_rate": 4.503090480914571e-06, + "loss": 0.4498, + "step": 12489 + }, + { + "epoch": 1.6259273721202656, + "grad_norm": 2.5746147632598877, + "learning_rate": 4.501011026548789e-06, + "loss": 0.5017, + "step": 12492 + }, + { + "epoch": 1.6263178445919562, + "grad_norm": 2.6533167362213135, + "learning_rate": 4.49893165935533e-06, + "loss": 0.5273, + "step": 12495 + }, + { + "epoch": 1.6267083170636472, + "grad_norm": 4.123754501342773, + "learning_rate": 4.496852379697456e-06, + "loss": 0.5213, + "step": 12498 + }, + { + "epoch": 1.6270987895353377, + "grad_norm": 3.875533103942871, + "learning_rate": 4.494773187938413e-06, + "loss": 0.5422, + "step": 12501 + }, + { + "epoch": 1.6274892620070285, + "grad_norm": 3.554651975631714, + "learning_rate": 4.4926940844414316e-06, + "loss": 0.4757, + "step": 12504 + }, + { + "epoch": 1.6278797344787193, + "grad_norm": 2.468736171722412, + "learning_rate": 4.490615069569727e-06, + "loss": 0.5176, + "step": 12507 + }, + { + "epoch": 1.62827020695041, + "grad_norm": 3.26057767868042, + "learning_rate": 4.488536143686497e-06, + "loss": 0.522, + "step": 12510 + }, + { + "epoch": 1.6286606794221008, + "grad_norm": 2.696108102798462, + "learning_rate": 4.486457307154927e-06, + "loss": 0.5466, + "step": 12513 + }, + { + "epoch": 1.6290511518937913, + "grad_norm": 2.7262580394744873, + "learning_rate": 4.4843785603381855e-06, + "loss": 0.5753, + "step": 12516 + }, + { + "epoch": 1.6294416243654823, + "grad_norm": 2.864877462387085, + "learning_rate": 4.482299903599424e-06, + "loss": 0.457, + "step": 12519 + }, + { + "epoch": 1.629832096837173, + "grad_norm": 2.8265585899353027, + "learning_rate": 4.48022133730178e-06, + "loss": 0.5091, + "step": 12522 + }, + { + "epoch": 1.630222569308864, + "grad_norm": 3.3130998611450195, + "learning_rate": 4.478142861808375e-06, + "loss": 0.4634, + "step": 12525 + }, + { + "epoch": 1.6306130417805544, + "grad_norm": 2.6155717372894287, + "learning_rate": 4.476064477482316e-06, + "loss": 0.4637, + "step": 12528 + }, + { + "epoch": 1.6310035142522452, + "grad_norm": 2.8541154861450195, + "learning_rate": 4.4739861846866885e-06, + "loss": 0.5217, + "step": 12531 + }, + { + "epoch": 1.631393986723936, + "grad_norm": 2.5578043460845947, + "learning_rate": 4.47190798378457e-06, + "loss": 0.5005, + "step": 12534 + }, + { + "epoch": 1.6317844591956268, + "grad_norm": 3.508741617202759, + "learning_rate": 4.469829875139014e-06, + "loss": 0.512, + "step": 12537 + }, + { + "epoch": 1.6321749316673175, + "grad_norm": 2.5680432319641113, + "learning_rate": 4.467751859113064e-06, + "loss": 0.5627, + "step": 12540 + }, + { + "epoch": 1.632565404139008, + "grad_norm": 2.6496567726135254, + "learning_rate": 4.465673936069746e-06, + "loss": 0.4628, + "step": 12543 + }, + { + "epoch": 1.632955876610699, + "grad_norm": 2.5644404888153076, + "learning_rate": 4.463596106372066e-06, + "loss": 0.5009, + "step": 12546 + }, + { + "epoch": 1.6333463490823896, + "grad_norm": 2.2369980812072754, + "learning_rate": 4.461518370383017e-06, + "loss": 0.4671, + "step": 12549 + }, + { + "epoch": 
1.6337368215540804, + "grad_norm": 2.7351465225219727, + "learning_rate": 4.459440728465578e-06, + "loss": 0.5614, + "step": 12552 + }, + { + "epoch": 1.6341272940257712, + "grad_norm": 2.928203821182251, + "learning_rate": 4.4573631809827045e-06, + "loss": 0.5391, + "step": 12555 + }, + { + "epoch": 1.634517766497462, + "grad_norm": 2.912785291671753, + "learning_rate": 4.4552857282973435e-06, + "loss": 0.4928, + "step": 12558 + }, + { + "epoch": 1.6349082389691527, + "grad_norm": 3.1030282974243164, + "learning_rate": 4.453208370772417e-06, + "loss": 0.5712, + "step": 12561 + }, + { + "epoch": 1.6352987114408433, + "grad_norm": 2.6347246170043945, + "learning_rate": 4.45113110877084e-06, + "loss": 0.5575, + "step": 12564 + }, + { + "epoch": 1.6356891839125343, + "grad_norm": 2.7265217304229736, + "learning_rate": 4.449053942655503e-06, + "loss": 0.5683, + "step": 12567 + }, + { + "epoch": 1.6360796563842248, + "grad_norm": 2.5423009395599365, + "learning_rate": 4.446976872789284e-06, + "loss": 0.4451, + "step": 12570 + }, + { + "epoch": 1.6364701288559158, + "grad_norm": 2.4595041275024414, + "learning_rate": 4.444899899535042e-06, + "loss": 0.4713, + "step": 12573 + }, + { + "epoch": 1.6368606013276064, + "grad_norm": 2.64188814163208, + "learning_rate": 4.442823023255619e-06, + "loss": 0.4481, + "step": 12576 + }, + { + "epoch": 1.6372510737992971, + "grad_norm": 2.5329110622406006, + "learning_rate": 4.440746244313842e-06, + "loss": 0.5019, + "step": 12579 + }, + { + "epoch": 1.637641546270988, + "grad_norm": 2.4973838329315186, + "learning_rate": 4.43866956307252e-06, + "loss": 0.4662, + "step": 12582 + }, + { + "epoch": 1.6380320187426787, + "grad_norm": 2.651660203933716, + "learning_rate": 4.436592979894445e-06, + "loss": 0.4375, + "step": 12585 + }, + { + "epoch": 1.6384224912143694, + "grad_norm": 2.4576289653778076, + "learning_rate": 4.4345164951423895e-06, + "loss": 0.4207, + "step": 12588 + }, + { + "epoch": 1.63881296368606, + "grad_norm": 3.1014697551727295, + "learning_rate": 4.432440109179113e-06, + "loss": 0.5241, + "step": 12591 + }, + { + "epoch": 1.639203436157751, + "grad_norm": 2.6351943016052246, + "learning_rate": 4.430363822367357e-06, + "loss": 0.5933, + "step": 12594 + }, + { + "epoch": 1.6395939086294415, + "grad_norm": 2.748256206512451, + "learning_rate": 4.428287635069841e-06, + "loss": 0.5146, + "step": 12597 + }, + { + "epoch": 1.6399843811011323, + "grad_norm": 2.712522506713867, + "learning_rate": 4.426211547649274e-06, + "loss": 0.5062, + "step": 12600 + }, + { + "epoch": 1.640374853572823, + "grad_norm": 2.814810276031494, + "learning_rate": 4.42413556046834e-06, + "loss": 0.5229, + "step": 12603 + }, + { + "epoch": 1.6407653260445139, + "grad_norm": 2.7721574306488037, + "learning_rate": 4.422059673889714e-06, + "loss": 0.4604, + "step": 12606 + }, + { + "epoch": 1.6411557985162046, + "grad_norm": 2.700059175491333, + "learning_rate": 4.419983888276047e-06, + "loss": 0.5371, + "step": 12609 + }, + { + "epoch": 1.6415462709878954, + "grad_norm": 2.785081624984741, + "learning_rate": 4.417908203989975e-06, + "loss": 0.5373, + "step": 12612 + }, + { + "epoch": 1.6419367434595862, + "grad_norm": 2.8077304363250732, + "learning_rate": 4.415832621394116e-06, + "loss": 0.5162, + "step": 12615 + }, + { + "epoch": 1.6423272159312767, + "grad_norm": 2.6027612686157227, + "learning_rate": 4.413757140851067e-06, + "loss": 0.5698, + "step": 12618 + }, + { + "epoch": 1.6427176884029677, + "grad_norm": 2.704460620880127, + "learning_rate": 4.411681762723415e-06, + 
"loss": 0.476, + "step": 12621 + }, + { + "epoch": 1.6431081608746583, + "grad_norm": 2.906745672225952, + "learning_rate": 4.409606487373718e-06, + "loss": 0.4695, + "step": 12624 + }, + { + "epoch": 1.643498633346349, + "grad_norm": 2.890692710876465, + "learning_rate": 4.407531315164527e-06, + "loss": 0.5786, + "step": 12627 + }, + { + "epoch": 1.6438891058180398, + "grad_norm": 2.488058567047119, + "learning_rate": 4.4054562464583705e-06, + "loss": 0.4332, + "step": 12630 + }, + { + "epoch": 1.6442795782897306, + "grad_norm": 2.7957355976104736, + "learning_rate": 4.403381281617755e-06, + "loss": 0.4859, + "step": 12633 + }, + { + "epoch": 1.6446700507614214, + "grad_norm": 2.960820436477661, + "learning_rate": 4.401306421005176e-06, + "loss": 0.4931, + "step": 12636 + }, + { + "epoch": 1.645060523233112, + "grad_norm": 2.780045986175537, + "learning_rate": 4.399231664983104e-06, + "loss": 0.5052, + "step": 12639 + }, + { + "epoch": 1.645450995704803, + "grad_norm": 2.800703525543213, + "learning_rate": 4.3971570139139975e-06, + "loss": 0.57, + "step": 12642 + }, + { + "epoch": 1.6458414681764935, + "grad_norm": 3.8522229194641113, + "learning_rate": 4.395082468160291e-06, + "loss": 0.5197, + "step": 12645 + }, + { + "epoch": 1.6462319406481845, + "grad_norm": 2.728604793548584, + "learning_rate": 4.393008028084407e-06, + "loss": 0.4438, + "step": 12648 + }, + { + "epoch": 1.646622413119875, + "grad_norm": 2.7166476249694824, + "learning_rate": 4.390933694048742e-06, + "loss": 0.5547, + "step": 12651 + }, + { + "epoch": 1.6470128855915658, + "grad_norm": 2.5482025146484375, + "learning_rate": 4.3888594664156795e-06, + "loss": 0.5274, + "step": 12654 + }, + { + "epoch": 1.6474033580632566, + "grad_norm": 3.0686612129211426, + "learning_rate": 4.386785345547584e-06, + "loss": 0.5536, + "step": 12657 + }, + { + "epoch": 1.6477938305349473, + "grad_norm": 2.8095195293426514, + "learning_rate": 4.384711331806797e-06, + "loss": 0.4901, + "step": 12660 + }, + { + "epoch": 1.648184303006638, + "grad_norm": 2.81028151512146, + "learning_rate": 4.3826374255556476e-06, + "loss": 0.515, + "step": 12663 + }, + { + "epoch": 1.6485747754783286, + "grad_norm": 2.4701220989227295, + "learning_rate": 4.38056362715644e-06, + "loss": 0.5399, + "step": 12666 + }, + { + "epoch": 1.6489652479500196, + "grad_norm": 2.5714948177337646, + "learning_rate": 4.378489936971463e-06, + "loss": 0.5045, + "step": 12669 + }, + { + "epoch": 1.6493557204217102, + "grad_norm": 2.7451059818267822, + "learning_rate": 4.376416355362989e-06, + "loss": 0.4809, + "step": 12672 + }, + { + "epoch": 1.649746192893401, + "grad_norm": 2.625004529953003, + "learning_rate": 4.3743428826932635e-06, + "loss": 0.5552, + "step": 12675 + }, + { + "epoch": 1.6501366653650917, + "grad_norm": 2.387716770172119, + "learning_rate": 4.37226951932452e-06, + "loss": 0.4466, + "step": 12678 + }, + { + "epoch": 1.6505271378367825, + "grad_norm": 2.606682777404785, + "learning_rate": 4.370196265618973e-06, + "loss": 0.5303, + "step": 12681 + }, + { + "epoch": 1.6509176103084733, + "grad_norm": 2.3871326446533203, + "learning_rate": 4.368123121938812e-06, + "loss": 0.4816, + "step": 12684 + }, + { + "epoch": 1.6513080827801638, + "grad_norm": 3.2255170345306396, + "learning_rate": 4.3660500886462105e-06, + "loss": 0.5146, + "step": 12687 + }, + { + "epoch": 1.6516985552518548, + "grad_norm": 2.7184829711914062, + "learning_rate": 4.3639771661033275e-06, + "loss": 0.5417, + "step": 12690 + }, + { + "epoch": 1.6520890277235454, + "grad_norm": 
3.5075788497924805, + "learning_rate": 4.361904354672296e-06, + "loss": 0.4598, + "step": 12693 + }, + { + "epoch": 1.6524795001952364, + "grad_norm": 2.910029649734497, + "learning_rate": 4.3598316547152295e-06, + "loss": 0.5129, + "step": 12696 + }, + { + "epoch": 1.652869972666927, + "grad_norm": 2.9071853160858154, + "learning_rate": 4.357759066594228e-06, + "loss": 0.5485, + "step": 12699 + }, + { + "epoch": 1.6532604451386177, + "grad_norm": 2.824410915374756, + "learning_rate": 4.3556865906713654e-06, + "loss": 0.5358, + "step": 12702 + }, + { + "epoch": 1.6536509176103085, + "grad_norm": 2.680872678756714, + "learning_rate": 4.3536142273087005e-06, + "loss": 0.5531, + "step": 12705 + }, + { + "epoch": 1.6540413900819992, + "grad_norm": 2.904576063156128, + "learning_rate": 4.351541976868271e-06, + "loss": 0.523, + "step": 12708 + }, + { + "epoch": 1.65443186255369, + "grad_norm": 2.6715736389160156, + "learning_rate": 4.349469839712093e-06, + "loss": 0.4883, + "step": 12711 + }, + { + "epoch": 1.6548223350253806, + "grad_norm": 2.5224480628967285, + "learning_rate": 4.347397816202165e-06, + "loss": 0.509, + "step": 12714 + }, + { + "epoch": 1.6552128074970716, + "grad_norm": 3.416038990020752, + "learning_rate": 4.345325906700467e-06, + "loss": 0.4453, + "step": 12717 + }, + { + "epoch": 1.655603279968762, + "grad_norm": 2.9145963191986084, + "learning_rate": 4.343254111568954e-06, + "loss": 0.5879, + "step": 12720 + }, + { + "epoch": 1.655993752440453, + "grad_norm": 3.9935879707336426, + "learning_rate": 4.341182431169568e-06, + "loss": 0.5139, + "step": 12723 + }, + { + "epoch": 1.6563842249121437, + "grad_norm": 4.118599891662598, + "learning_rate": 4.339110865864225e-06, + "loss": 0.5033, + "step": 12726 + }, + { + "epoch": 1.6567746973838344, + "grad_norm": 2.687406301498413, + "learning_rate": 4.337039416014821e-06, + "loss": 0.5637, + "step": 12729 + }, + { + "epoch": 1.6571651698555252, + "grad_norm": 2.5384199619293213, + "learning_rate": 4.334968081983238e-06, + "loss": 0.5492, + "step": 12732 + }, + { + "epoch": 1.657555642327216, + "grad_norm": 2.816277265548706, + "learning_rate": 4.3328968641313326e-06, + "loss": 0.4586, + "step": 12735 + }, + { + "epoch": 1.6579461147989067, + "grad_norm": 2.6040713787078857, + "learning_rate": 4.330825762820942e-06, + "loss": 0.4576, + "step": 12738 + }, + { + "epoch": 1.6583365872705973, + "grad_norm": 2.7248170375823975, + "learning_rate": 4.3287547784138815e-06, + "loss": 0.4663, + "step": 12741 + }, + { + "epoch": 1.6587270597422883, + "grad_norm": 2.8028006553649902, + "learning_rate": 4.32668391127195e-06, + "loss": 0.5485, + "step": 12744 + }, + { + "epoch": 1.6591175322139788, + "grad_norm": 2.470500946044922, + "learning_rate": 4.324613161756923e-06, + "loss": 0.4919, + "step": 12747 + }, + { + "epoch": 1.6595080046856696, + "grad_norm": 2.61730694770813, + "learning_rate": 4.322542530230556e-06, + "loss": 0.4849, + "step": 12750 + }, + { + "epoch": 1.6598984771573604, + "grad_norm": 2.686258316040039, + "learning_rate": 4.320472017054584e-06, + "loss": 0.4206, + "step": 12753 + }, + { + "epoch": 1.6602889496290512, + "grad_norm": 2.6718552112579346, + "learning_rate": 4.318401622590719e-06, + "loss": 0.5302, + "step": 12756 + }, + { + "epoch": 1.660679422100742, + "grad_norm": 2.5929765701293945, + "learning_rate": 4.316331347200659e-06, + "loss": 0.4858, + "step": 12759 + }, + { + "epoch": 1.6610698945724325, + "grad_norm": 2.6052663326263428, + "learning_rate": 4.314261191246073e-06, + "loss": 0.4703, + "step": 12762 + 
}, + { + "epoch": 1.6614603670441235, + "grad_norm": 2.5814874172210693, + "learning_rate": 4.312191155088616e-06, + "loss": 0.4815, + "step": 12765 + }, + { + "epoch": 1.661850839515814, + "grad_norm": 2.9272522926330566, + "learning_rate": 4.310121239089915e-06, + "loss": 0.5128, + "step": 12768 + }, + { + "epoch": 1.662241311987505, + "grad_norm": 3.3689401149749756, + "learning_rate": 4.308051443611582e-06, + "loss": 0.4717, + "step": 12771 + }, + { + "epoch": 1.6626317844591956, + "grad_norm": 2.8045191764831543, + "learning_rate": 4.305981769015207e-06, + "loss": 0.4954, + "step": 12774 + }, + { + "epoch": 1.6630222569308863, + "grad_norm": 2.869739294052124, + "learning_rate": 4.303912215662359e-06, + "loss": 0.4864, + "step": 12777 + }, + { + "epoch": 1.6634127294025771, + "grad_norm": 2.9662373065948486, + "learning_rate": 4.30184278391458e-06, + "loss": 0.5554, + "step": 12780 + }, + { + "epoch": 1.663803201874268, + "grad_norm": 2.603210926055908, + "learning_rate": 4.299773474133398e-06, + "loss": 0.467, + "step": 12783 + }, + { + "epoch": 1.6641936743459587, + "grad_norm": 2.6853482723236084, + "learning_rate": 4.297704286680319e-06, + "loss": 0.5312, + "step": 12786 + }, + { + "epoch": 1.6645841468176492, + "grad_norm": 2.606921672821045, + "learning_rate": 4.295635221916823e-06, + "loss": 0.5665, + "step": 12789 + }, + { + "epoch": 1.6649746192893402, + "grad_norm": 2.5418643951416016, + "learning_rate": 4.293566280204371e-06, + "loss": 0.4628, + "step": 12792 + }, + { + "epoch": 1.6653650917610308, + "grad_norm": 3.40142822265625, + "learning_rate": 4.2914974619044045e-06, + "loss": 0.487, + "step": 12795 + }, + { + "epoch": 1.6657555642327218, + "grad_norm": 2.476050853729248, + "learning_rate": 4.289428767378341e-06, + "loss": 0.5231, + "step": 12798 + }, + { + "epoch": 1.6661460367044123, + "grad_norm": 2.9447269439697266, + "learning_rate": 4.287360196987578e-06, + "loss": 0.5296, + "step": 12801 + }, + { + "epoch": 1.666536509176103, + "grad_norm": 3.3223366737365723, + "learning_rate": 4.2852917510934876e-06, + "loss": 0.4443, + "step": 12804 + }, + { + "epoch": 1.6669269816477938, + "grad_norm": 2.9464173316955566, + "learning_rate": 4.283223430057425e-06, + "loss": 0.5343, + "step": 12807 + }, + { + "epoch": 1.6673174541194846, + "grad_norm": 3.260638475418091, + "learning_rate": 4.281155234240722e-06, + "loss": 0.4831, + "step": 12810 + }, + { + "epoch": 1.6677079265911754, + "grad_norm": 2.604071617126465, + "learning_rate": 4.279087164004686e-06, + "loss": 0.5149, + "step": 12813 + }, + { + "epoch": 1.668098399062866, + "grad_norm": 2.497521162033081, + "learning_rate": 4.277019219710607e-06, + "loss": 0.4481, + "step": 12816 + }, + { + "epoch": 1.668488871534557, + "grad_norm": 2.61600399017334, + "learning_rate": 4.274951401719748e-06, + "loss": 0.5209, + "step": 12819 + }, + { + "epoch": 1.6688793440062475, + "grad_norm": 2.627828359603882, + "learning_rate": 4.272883710393356e-06, + "loss": 0.538, + "step": 12822 + }, + { + "epoch": 1.6692698164779383, + "grad_norm": 2.7958621978759766, + "learning_rate": 4.270816146092649e-06, + "loss": 0.4669, + "step": 12825 + }, + { + "epoch": 1.669660288949629, + "grad_norm": 2.8905887603759766, + "learning_rate": 4.268748709178828e-06, + "loss": 0.554, + "step": 12828 + }, + { + "epoch": 1.6700507614213198, + "grad_norm": 2.683310031890869, + "learning_rate": 4.2666814000130685e-06, + "loss": 0.518, + "step": 12831 + }, + { + "epoch": 1.6704412338930106, + "grad_norm": 3.2548582553863525, + "learning_rate": 
4.264614218956525e-06, + "loss": 0.514, + "step": 12834 + }, + { + "epoch": 1.6708317063647011, + "grad_norm": 2.7682833671569824, + "learning_rate": 4.262547166370333e-06, + "loss": 0.5013, + "step": 12837 + }, + { + "epoch": 1.6712221788363921, + "grad_norm": 2.7718143463134766, + "learning_rate": 4.2604802426155975e-06, + "loss": 0.5049, + "step": 12840 + }, + { + "epoch": 1.6716126513080827, + "grad_norm": 2.5910987854003906, + "learning_rate": 4.258413448053409e-06, + "loss": 0.5122, + "step": 12843 + }, + { + "epoch": 1.6720031237797737, + "grad_norm": 2.675654172897339, + "learning_rate": 4.25634678304483e-06, + "loss": 0.5298, + "step": 12846 + }, + { + "epoch": 1.6723935962514642, + "grad_norm": 2.951941728591919, + "learning_rate": 4.254280247950904e-06, + "loss": 0.5185, + "step": 12849 + }, + { + "epoch": 1.672784068723155, + "grad_norm": 3.080920696258545, + "learning_rate": 4.252213843132651e-06, + "loss": 0.4948, + "step": 12852 + }, + { + "epoch": 1.6731745411948458, + "grad_norm": 2.5052084922790527, + "learning_rate": 4.250147568951062e-06, + "loss": 0.4856, + "step": 12855 + }, + { + "epoch": 1.6735650136665365, + "grad_norm": 2.8432629108428955, + "learning_rate": 4.2480814257671195e-06, + "loss": 0.4367, + "step": 12858 + }, + { + "epoch": 1.6739554861382273, + "grad_norm": 2.4311513900756836, + "learning_rate": 4.24601541394177e-06, + "loss": 0.5279, + "step": 12861 + }, + { + "epoch": 1.6743459586099179, + "grad_norm": 2.772369861602783, + "learning_rate": 4.243949533835941e-06, + "loss": 0.5058, + "step": 12864 + }, + { + "epoch": 1.6747364310816089, + "grad_norm": 2.680986166000366, + "learning_rate": 4.241883785810538e-06, + "loss": 0.5034, + "step": 12867 + }, + { + "epoch": 1.6751269035532994, + "grad_norm": 2.6189749240875244, + "learning_rate": 4.239818170226442e-06, + "loss": 0.4952, + "step": 12870 + }, + { + "epoch": 1.6755173760249904, + "grad_norm": 2.844604253768921, + "learning_rate": 4.237752687444514e-06, + "loss": 0.5086, + "step": 12873 + }, + { + "epoch": 1.675907848496681, + "grad_norm": 2.81457781791687, + "learning_rate": 4.235687337825586e-06, + "loss": 0.4829, + "step": 12876 + }, + { + "epoch": 1.6762983209683717, + "grad_norm": 3.1727747917175293, + "learning_rate": 4.233622121730474e-06, + "loss": 0.5165, + "step": 12879 + }, + { + "epoch": 1.6766887934400625, + "grad_norm": 2.7962770462036133, + "learning_rate": 4.231557039519965e-06, + "loss": 0.4056, + "step": 12882 + }, + { + "epoch": 1.6770792659117533, + "grad_norm": 2.9042179584503174, + "learning_rate": 4.229492091554823e-06, + "loss": 0.6229, + "step": 12885 + }, + { + "epoch": 1.677469738383444, + "grad_norm": 2.709332227706909, + "learning_rate": 4.227427278195794e-06, + "loss": 0.4915, + "step": 12888 + }, + { + "epoch": 1.6778602108551346, + "grad_norm": 2.407127857208252, + "learning_rate": 4.225362599803592e-06, + "loss": 0.445, + "step": 12891 + }, + { + "epoch": 1.6782506833268256, + "grad_norm": 2.5972049236297607, + "learning_rate": 4.2232980567389156e-06, + "loss": 0.4591, + "step": 12894 + }, + { + "epoch": 1.6786411557985161, + "grad_norm": 3.5093605518341064, + "learning_rate": 4.221233649362432e-06, + "loss": 0.5018, + "step": 12897 + }, + { + "epoch": 1.679031628270207, + "grad_norm": 2.601149320602417, + "learning_rate": 4.219169378034795e-06, + "loss": 0.4992, + "step": 12900 + }, + { + "epoch": 1.6794221007418977, + "grad_norm": 2.5034475326538086, + "learning_rate": 4.217105243116623e-06, + "loss": 0.4433, + "step": 12903 + }, + { + "epoch": 1.6798125732135885, 
+ "grad_norm": 2.8355612754821777, + "learning_rate": 4.21504124496852e-06, + "loss": 0.4554, + "step": 12906 + }, + { + "epoch": 1.6802030456852792, + "grad_norm": 2.9951822757720947, + "learning_rate": 4.212977383951059e-06, + "loss": 0.5543, + "step": 12909 + }, + { + "epoch": 1.6805935181569698, + "grad_norm": 3.2354860305786133, + "learning_rate": 4.210913660424793e-06, + "loss": 0.4783, + "step": 12912 + }, + { + "epoch": 1.6809839906286608, + "grad_norm": 2.7228434085845947, + "learning_rate": 4.208850074750251e-06, + "loss": 0.4661, + "step": 12915 + }, + { + "epoch": 1.6813744631003513, + "grad_norm": 3.545387029647827, + "learning_rate": 4.206786627287936e-06, + "loss": 0.5935, + "step": 12918 + }, + { + "epoch": 1.6817649355720423, + "grad_norm": 2.7536470890045166, + "learning_rate": 4.20472331839833e-06, + "loss": 0.4677, + "step": 12921 + }, + { + "epoch": 1.6821554080437329, + "grad_norm": 2.6740031242370605, + "learning_rate": 4.202660148441886e-06, + "loss": 0.5313, + "step": 12924 + }, + { + "epoch": 1.6825458805154236, + "grad_norm": 2.6446104049682617, + "learning_rate": 4.200597117779038e-06, + "loss": 0.4866, + "step": 12927 + }, + { + "epoch": 1.6829363529871144, + "grad_norm": 2.780635118484497, + "learning_rate": 4.198534226770191e-06, + "loss": 0.5019, + "step": 12930 + }, + { + "epoch": 1.6833268254588052, + "grad_norm": 3.038558006286621, + "learning_rate": 4.196471475775728e-06, + "loss": 0.4894, + "step": 12933 + }, + { + "epoch": 1.683717297930496, + "grad_norm": 3.2223899364471436, + "learning_rate": 4.1944088651560085e-06, + "loss": 0.5071, + "step": 12936 + }, + { + "epoch": 1.6841077704021865, + "grad_norm": 3.295980930328369, + "learning_rate": 4.192346395271364e-06, + "loss": 0.538, + "step": 12939 + }, + { + "epoch": 1.6844982428738775, + "grad_norm": 2.9584147930145264, + "learning_rate": 4.190284066482107e-06, + "loss": 0.551, + "step": 12942 + }, + { + "epoch": 1.684888715345568, + "grad_norm": 2.542959451675415, + "learning_rate": 4.18822187914852e-06, + "loss": 0.4231, + "step": 12945 + }, + { + "epoch": 1.6852791878172588, + "grad_norm": 2.58151912689209, + "learning_rate": 4.186159833630862e-06, + "loss": 0.5433, + "step": 12948 + }, + { + "epoch": 1.6856696602889496, + "grad_norm": 2.462104082107544, + "learning_rate": 4.18409793028937e-06, + "loss": 0.4551, + "step": 12951 + }, + { + "epoch": 1.6860601327606404, + "grad_norm": 2.931121587753296, + "learning_rate": 4.182036169484252e-06, + "loss": 0.5095, + "step": 12954 + }, + { + "epoch": 1.6864506052323311, + "grad_norm": 2.6130144596099854, + "learning_rate": 4.1799745515756964e-06, + "loss": 0.4584, + "step": 12957 + }, + { + "epoch": 1.686841077704022, + "grad_norm": 2.993987798690796, + "learning_rate": 4.17791307692386e-06, + "loss": 0.4698, + "step": 12960 + }, + { + "epoch": 1.6872315501757127, + "grad_norm": 2.883889675140381, + "learning_rate": 4.1758517458888805e-06, + "loss": 0.5164, + "step": 12963 + }, + { + "epoch": 1.6876220226474032, + "grad_norm": 2.5255353450775146, + "learning_rate": 4.173790558830868e-06, + "loss": 0.4782, + "step": 12966 + }, + { + "epoch": 1.6880124951190942, + "grad_norm": 2.975459575653076, + "learning_rate": 4.171729516109904e-06, + "loss": 0.5081, + "step": 12969 + }, + { + "epoch": 1.6884029675907848, + "grad_norm": 2.7381391525268555, + "learning_rate": 4.169668618086054e-06, + "loss": 0.5112, + "step": 12972 + }, + { + "epoch": 1.6887934400624756, + "grad_norm": 2.576137065887451, + "learning_rate": 4.167607865119348e-06, + "loss": 0.5115, + 
"step": 12975 + }, + { + "epoch": 1.6891839125341663, + "grad_norm": 2.776411533355713, + "learning_rate": 4.165547257569797e-06, + "loss": 0.4519, + "step": 12978 + }, + { + "epoch": 1.689574385005857, + "grad_norm": 2.5057878494262695, + "learning_rate": 4.163486795797384e-06, + "loss": 0.4683, + "step": 12981 + }, + { + "epoch": 1.6899648574775479, + "grad_norm": 2.751605272293091, + "learning_rate": 4.161426480162069e-06, + "loss": 0.4207, + "step": 12984 + }, + { + "epoch": 1.6903553299492384, + "grad_norm": 2.7747738361358643, + "learning_rate": 4.1593663110237845e-06, + "loss": 0.5169, + "step": 12987 + }, + { + "epoch": 1.6907458024209294, + "grad_norm": 2.7248165607452393, + "learning_rate": 4.157306288742435e-06, + "loss": 0.4886, + "step": 12990 + }, + { + "epoch": 1.69113627489262, + "grad_norm": 4.524834156036377, + "learning_rate": 4.155246413677907e-06, + "loss": 0.4705, + "step": 12993 + }, + { + "epoch": 1.691526747364311, + "grad_norm": 3.088681936264038, + "learning_rate": 4.153186686190051e-06, + "loss": 0.4504, + "step": 12996 + }, + { + "epoch": 1.6919172198360015, + "grad_norm": 2.302208662033081, + "learning_rate": 4.151127106638701e-06, + "loss": 0.4188, + "step": 12999 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 2.612056255340576, + "learning_rate": 4.149067675383659e-06, + "loss": 0.5164, + "step": 13002 + }, + { + "epoch": 1.692698164779383, + "grad_norm": 2.487496852874756, + "learning_rate": 4.147008392784703e-06, + "loss": 0.4577, + "step": 13005 + }, + { + "epoch": 1.6930886372510738, + "grad_norm": 2.817664861679077, + "learning_rate": 4.144949259201589e-06, + "loss": 0.4666, + "step": 13008 + }, + { + "epoch": 1.6934791097227646, + "grad_norm": 3.112154006958008, + "learning_rate": 4.142890274994038e-06, + "loss": 0.5034, + "step": 13011 + }, + { + "epoch": 1.6938695821944552, + "grad_norm": 2.818704605102539, + "learning_rate": 4.1408314405217544e-06, + "loss": 0.5956, + "step": 13014 + }, + { + "epoch": 1.6942600546661462, + "grad_norm": 2.6162893772125244, + "learning_rate": 4.138772756144411e-06, + "loss": 0.5144, + "step": 13017 + }, + { + "epoch": 1.6946505271378367, + "grad_norm": 3.4802184104919434, + "learning_rate": 4.136714222221654e-06, + "loss": 0.375, + "step": 13020 + }, + { + "epoch": 1.6950409996095275, + "grad_norm": 2.7583959102630615, + "learning_rate": 4.134655839113105e-06, + "loss": 0.4669, + "step": 13023 + }, + { + "epoch": 1.6954314720812182, + "grad_norm": 3.098271369934082, + "learning_rate": 4.132597607178362e-06, + "loss": 0.5267, + "step": 13026 + }, + { + "epoch": 1.695821944552909, + "grad_norm": 2.7036032676696777, + "learning_rate": 4.130539526776994e-06, + "loss": 0.5162, + "step": 13029 + }, + { + "epoch": 1.6962124170245998, + "grad_norm": 2.6259026527404785, + "learning_rate": 4.12848159826854e-06, + "loss": 0.4926, + "step": 13032 + }, + { + "epoch": 1.6966028894962903, + "grad_norm": 3.8111186027526855, + "learning_rate": 4.126423822012516e-06, + "loss": 0.4105, + "step": 13035 + }, + { + "epoch": 1.6969933619679813, + "grad_norm": 2.448268175125122, + "learning_rate": 4.1243661983684155e-06, + "loss": 0.5241, + "step": 13038 + }, + { + "epoch": 1.697383834439672, + "grad_norm": 2.864579200744629, + "learning_rate": 4.1223087276956964e-06, + "loss": 0.5952, + "step": 13041 + }, + { + "epoch": 1.6977743069113629, + "grad_norm": 2.495445728302002, + "learning_rate": 4.120251410353797e-06, + "loss": 0.4932, + "step": 13044 + }, + { + "epoch": 1.6981647793830534, + "grad_norm": 2.7435507774353027, + 
"learning_rate": 4.1181942467021246e-06, + "loss": 0.4825, + "step": 13047 + }, + { + "epoch": 1.6985552518547442, + "grad_norm": 2.677366018295288, + "learning_rate": 4.116137237100062e-06, + "loss": 0.4762, + "step": 13050 + }, + { + "epoch": 1.698945724326435, + "grad_norm": 2.592958688735962, + "learning_rate": 4.1140803819069665e-06, + "loss": 0.4575, + "step": 13053 + }, + { + "epoch": 1.6993361967981258, + "grad_norm": 2.5551681518554688, + "learning_rate": 4.112023681482163e-06, + "loss": 0.5381, + "step": 13056 + }, + { + "epoch": 1.6997266692698165, + "grad_norm": 2.487131357192993, + "learning_rate": 4.109967136184955e-06, + "loss": 0.4652, + "step": 13059 + }, + { + "epoch": 1.700117141741507, + "grad_norm": 2.729330539703369, + "learning_rate": 4.107910746374616e-06, + "loss": 0.4711, + "step": 13062 + }, + { + "epoch": 1.700507614213198, + "grad_norm": 2.5914466381073, + "learning_rate": 4.105854512410391e-06, + "loss": 0.4262, + "step": 13065 + }, + { + "epoch": 1.7008980866848886, + "grad_norm": 2.8775665760040283, + "learning_rate": 4.1037984346515035e-06, + "loss": 0.4693, + "step": 13068 + }, + { + "epoch": 1.7012885591565796, + "grad_norm": 4.019662380218506, + "learning_rate": 4.101742513457144e-06, + "loss": 0.5601, + "step": 13071 + }, + { + "epoch": 1.7016790316282702, + "grad_norm": 2.315446615219116, + "learning_rate": 4.099686749186478e-06, + "loss": 0.4316, + "step": 13074 + }, + { + "epoch": 1.702069504099961, + "grad_norm": 2.7482759952545166, + "learning_rate": 4.097631142198641e-06, + "loss": 0.4958, + "step": 13077 + }, + { + "epoch": 1.7024599765716517, + "grad_norm": 2.536959409713745, + "learning_rate": 4.0955756928527484e-06, + "loss": 0.4941, + "step": 13080 + }, + { + "epoch": 1.7028504490433425, + "grad_norm": 2.7001805305480957, + "learning_rate": 4.093520401507878e-06, + "loss": 0.4224, + "step": 13083 + }, + { + "epoch": 1.7032409215150333, + "grad_norm": 2.7134954929351807, + "learning_rate": 4.091465268523086e-06, + "loss": 0.5029, + "step": 13086 + }, + { + "epoch": 1.7036313939867238, + "grad_norm": 2.511518716812134, + "learning_rate": 4.089410294257401e-06, + "loss": 0.4307, + "step": 13089 + }, + { + "epoch": 1.7040218664584148, + "grad_norm": 2.8347418308258057, + "learning_rate": 4.087355479069822e-06, + "loss": 0.4964, + "step": 13092 + }, + { + "epoch": 1.7044123389301054, + "grad_norm": 2.555575132369995, + "learning_rate": 4.085300823319321e-06, + "loss": 0.4809, + "step": 13095 + }, + { + "epoch": 1.7048028114017961, + "grad_norm": 2.7679803371429443, + "learning_rate": 4.083246327364842e-06, + "loss": 0.4417, + "step": 13098 + }, + { + "epoch": 1.705193283873487, + "grad_norm": 2.7733895778656006, + "learning_rate": 4.0811919915653e-06, + "loss": 0.5306, + "step": 13101 + }, + { + "epoch": 1.7055837563451777, + "grad_norm": 2.7801830768585205, + "learning_rate": 4.079137816279586e-06, + "loss": 0.5075, + "step": 13104 + }, + { + "epoch": 1.7059742288168684, + "grad_norm": 3.2246150970458984, + "learning_rate": 4.077083801866555e-06, + "loss": 0.5001, + "step": 13107 + }, + { + "epoch": 1.706364701288559, + "grad_norm": 2.7091383934020996, + "learning_rate": 4.0750299486850436e-06, + "loss": 0.4837, + "step": 13110 + }, + { + "epoch": 1.70675517376025, + "grad_norm": 3.151174783706665, + "learning_rate": 4.072976257093855e-06, + "loss": 0.5339, + "step": 13113 + }, + { + "epoch": 1.7071456462319405, + "grad_norm": 2.550407886505127, + "learning_rate": 4.070922727451765e-06, + "loss": 0.4892, + "step": 13116 + }, + { + "epoch": 
1.7075361187036315, + "grad_norm": 2.9066054821014404, + "learning_rate": 4.068869360117519e-06, + "loss": 0.4448, + "step": 13119 + }, + { + "epoch": 1.707926591175322, + "grad_norm": 2.571066379547119, + "learning_rate": 4.066816155449837e-06, + "loss": 0.6094, + "step": 13122 + }, + { + "epoch": 1.7083170636470129, + "grad_norm": 2.580421209335327, + "learning_rate": 4.06476311380741e-06, + "loss": 0.4401, + "step": 13125 + }, + { + "epoch": 1.7087075361187036, + "grad_norm": 2.7733590602874756, + "learning_rate": 4.0627102355488986e-06, + "loss": 0.5215, + "step": 13128 + }, + { + "epoch": 1.7090980085903944, + "grad_norm": 2.7099995613098145, + "learning_rate": 4.060657521032939e-06, + "loss": 0.5325, + "step": 13131 + }, + { + "epoch": 1.7094884810620852, + "grad_norm": 2.444837808609009, + "learning_rate": 4.058604970618133e-06, + "loss": 0.4751, + "step": 13134 + }, + { + "epoch": 1.7098789535337757, + "grad_norm": 2.5356292724609375, + "learning_rate": 4.056552584663059e-06, + "loss": 0.4456, + "step": 13137 + }, + { + "epoch": 1.7102694260054667, + "grad_norm": 2.789571762084961, + "learning_rate": 4.054500363526264e-06, + "loss": 0.4897, + "step": 13140 + }, + { + "epoch": 1.7106598984771573, + "grad_norm": 2.5640852451324463, + "learning_rate": 4.052448307566265e-06, + "loss": 0.4146, + "step": 13143 + }, + { + "epoch": 1.7110503709488483, + "grad_norm": 3.146996259689331, + "learning_rate": 4.050396417141555e-06, + "loss": 0.4841, + "step": 13146 + }, + { + "epoch": 1.7114408434205388, + "grad_norm": 2.4805715084075928, + "learning_rate": 4.048344692610591e-06, + "loss": 0.4652, + "step": 13149 + }, + { + "epoch": 1.7118313158922296, + "grad_norm": 2.782636880874634, + "learning_rate": 4.046293134331808e-06, + "loss": 0.5592, + "step": 13152 + }, + { + "epoch": 1.7122217883639204, + "grad_norm": 2.8912787437438965, + "learning_rate": 4.044241742663608e-06, + "loss": 0.5252, + "step": 13155 + }, + { + "epoch": 1.7126122608356111, + "grad_norm": 2.741149425506592, + "learning_rate": 4.042190517964366e-06, + "loss": 0.5303, + "step": 13158 + }, + { + "epoch": 1.713002733307302, + "grad_norm": 2.541614294052124, + "learning_rate": 4.040139460592425e-06, + "loss": 0.4626, + "step": 13161 + }, + { + "epoch": 1.7133932057789925, + "grad_norm": 2.6299006938934326, + "learning_rate": 4.038088570906101e-06, + "loss": 0.485, + "step": 13164 + }, + { + "epoch": 1.7137836782506835, + "grad_norm": 2.4726014137268066, + "learning_rate": 4.036037849263681e-06, + "loss": 0.4611, + "step": 13167 + }, + { + "epoch": 1.714174150722374, + "grad_norm": 2.893159866333008, + "learning_rate": 4.03398729602342e-06, + "loss": 0.5536, + "step": 13170 + }, + { + "epoch": 1.7145646231940648, + "grad_norm": 2.6248669624328613, + "learning_rate": 4.031936911543547e-06, + "loss": 0.461, + "step": 13173 + }, + { + "epoch": 1.7149550956657555, + "grad_norm": 2.4585137367248535, + "learning_rate": 4.029886696182258e-06, + "loss": 0.486, + "step": 13176 + }, + { + "epoch": 1.7153455681374463, + "grad_norm": 3.081102132797241, + "learning_rate": 4.027836650297722e-06, + "loss": 0.4455, + "step": 13179 + }, + { + "epoch": 1.715736040609137, + "grad_norm": 2.7504281997680664, + "learning_rate": 4.025786774248079e-06, + "loss": 0.4965, + "step": 13182 + }, + { + "epoch": 1.7161265130808276, + "grad_norm": 2.750417709350586, + "learning_rate": 4.023737068391437e-06, + "loss": 0.4974, + "step": 13185 + }, + { + "epoch": 1.7165169855525186, + "grad_norm": 2.768533229827881, + "learning_rate": 4.021687533085876e-06, + 
"loss": 0.5187, + "step": 13188 + }, + { + "epoch": 1.7169074580242092, + "grad_norm": 2.4056451320648193, + "learning_rate": 4.019638168689442e-06, + "loss": 0.4074, + "step": 13191 + }, + { + "epoch": 1.7172979304959002, + "grad_norm": 2.582237482070923, + "learning_rate": 4.0175889755601605e-06, + "loss": 0.5469, + "step": 13194 + }, + { + "epoch": 1.7176884029675907, + "grad_norm": 2.816178560256958, + "learning_rate": 4.015539954056017e-06, + "loss": 0.4812, + "step": 13197 + }, + { + "epoch": 1.7180788754392815, + "grad_norm": 2.4552249908447266, + "learning_rate": 4.013491104534973e-06, + "loss": 0.4056, + "step": 13200 + }, + { + "epoch": 1.7184693479109723, + "grad_norm": 2.762195110321045, + "learning_rate": 4.011442427354958e-06, + "loss": 0.5348, + "step": 13203 + }, + { + "epoch": 1.718859820382663, + "grad_norm": 2.9049007892608643, + "learning_rate": 4.009393922873871e-06, + "loss": 0.4404, + "step": 13206 + }, + { + "epoch": 1.7192502928543538, + "grad_norm": 2.658714532852173, + "learning_rate": 4.007345591449583e-06, + "loss": 0.5044, + "step": 13209 + }, + { + "epoch": 1.7196407653260444, + "grad_norm": 3.3005869388580322, + "learning_rate": 4.005297433439929e-06, + "loss": 0.4398, + "step": 13212 + }, + { + "epoch": 1.7200312377977354, + "grad_norm": 2.9398393630981445, + "learning_rate": 4.003249449202723e-06, + "loss": 0.4689, + "step": 13215 + }, + { + "epoch": 1.720421710269426, + "grad_norm": 2.541372060775757, + "learning_rate": 4.0012016390957414e-06, + "loss": 0.5121, + "step": 13218 + }, + { + "epoch": 1.720812182741117, + "grad_norm": 2.7919490337371826, + "learning_rate": 3.999154003476732e-06, + "loss": 0.4993, + "step": 13221 + }, + { + "epoch": 1.7212026552128075, + "grad_norm": 3.1717112064361572, + "learning_rate": 3.997106542703413e-06, + "loss": 0.5892, + "step": 13224 + }, + { + "epoch": 1.7215931276844982, + "grad_norm": 2.6251416206359863, + "learning_rate": 3.99505925713347e-06, + "loss": 0.47, + "step": 13227 + }, + { + "epoch": 1.721983600156189, + "grad_norm": 3.013826608657837, + "learning_rate": 3.993012147124561e-06, + "loss": 0.5313, + "step": 13230 + }, + { + "epoch": 1.7223740726278798, + "grad_norm": 2.7360174655914307, + "learning_rate": 3.990965213034311e-06, + "loss": 0.6029, + "step": 13233 + }, + { + "epoch": 1.7227645450995706, + "grad_norm": 2.6772170066833496, + "learning_rate": 3.988918455220317e-06, + "loss": 0.4684, + "step": 13236 + }, + { + "epoch": 1.723155017571261, + "grad_norm": 2.4262611865997314, + "learning_rate": 3.986871874040141e-06, + "loss": 0.5268, + "step": 13239 + }, + { + "epoch": 1.723545490042952, + "grad_norm": 2.6841237545013428, + "learning_rate": 3.9848254698513176e-06, + "loss": 0.5358, + "step": 13242 + }, + { + "epoch": 1.7239359625146427, + "grad_norm": 2.6044881343841553, + "learning_rate": 3.98277924301135e-06, + "loss": 0.6248, + "step": 13245 + }, + { + "epoch": 1.7243264349863334, + "grad_norm": 3.0996434688568115, + "learning_rate": 3.980733193877707e-06, + "loss": 0.504, + "step": 13248 + }, + { + "epoch": 1.7247169074580242, + "grad_norm": 2.843388795852661, + "learning_rate": 3.978687322807832e-06, + "loss": 0.4786, + "step": 13251 + }, + { + "epoch": 1.725107379929715, + "grad_norm": 3.045637607574463, + "learning_rate": 3.9766416301591336e-06, + "loss": 0.4879, + "step": 13254 + }, + { + "epoch": 1.7254978524014057, + "grad_norm": 2.5478973388671875, + "learning_rate": 3.974596116288988e-06, + "loss": 0.4843, + "step": 13257 + }, + { + "epoch": 1.7258883248730963, + "grad_norm": 
2.8354203701019287, + "learning_rate": 3.972550781554745e-06, + "loss": 0.5909, + "step": 13260 + }, + { + "epoch": 1.7262787973447873, + "grad_norm": 2.7125988006591797, + "learning_rate": 3.970505626313718e-06, + "loss": 0.4752, + "step": 13263 + }, + { + "epoch": 1.7266692698164778, + "grad_norm": 2.5882513523101807, + "learning_rate": 3.9684606509231935e-06, + "loss": 0.4578, + "step": 13266 + }, + { + "epoch": 1.7270597422881688, + "grad_norm": 2.7400310039520264, + "learning_rate": 3.966415855740423e-06, + "loss": 0.5583, + "step": 13269 + }, + { + "epoch": 1.7274502147598594, + "grad_norm": 2.917891025543213, + "learning_rate": 3.964371241122627e-06, + "loss": 0.4712, + "step": 13272 + }, + { + "epoch": 1.7278406872315502, + "grad_norm": 2.7035434246063232, + "learning_rate": 3.962326807426996e-06, + "loss": 0.5, + "step": 13275 + }, + { + "epoch": 1.728231159703241, + "grad_norm": 2.7468101978302, + "learning_rate": 3.960282555010691e-06, + "loss": 0.5003, + "step": 13278 + }, + { + "epoch": 1.7286216321749317, + "grad_norm": 2.587204694747925, + "learning_rate": 3.958238484230835e-06, + "loss": 0.5208, + "step": 13281 + }, + { + "epoch": 1.7290121046466225, + "grad_norm": 2.676217555999756, + "learning_rate": 3.956194595444525e-06, + "loss": 0.5368, + "step": 13284 + }, + { + "epoch": 1.729402577118313, + "grad_norm": 2.5884201526641846, + "learning_rate": 3.954150889008823e-06, + "loss": 0.5131, + "step": 13287 + }, + { + "epoch": 1.729793049590004, + "grad_norm": 2.455409049987793, + "learning_rate": 3.95210736528076e-06, + "loss": 0.4353, + "step": 13290 + }, + { + "epoch": 1.7301835220616946, + "grad_norm": 2.7997970581054688, + "learning_rate": 3.9500640246173376e-06, + "loss": 0.5197, + "step": 13293 + }, + { + "epoch": 1.7305739945333856, + "grad_norm": 2.450458288192749, + "learning_rate": 3.948020867375521e-06, + "loss": 0.5324, + "step": 13296 + }, + { + "epoch": 1.7309644670050761, + "grad_norm": 2.428149461746216, + "learning_rate": 3.945977893912244e-06, + "loss": 0.508, + "step": 13299 + }, + { + "epoch": 1.7313549394767669, + "grad_norm": 4.35640811920166, + "learning_rate": 3.943935104584413e-06, + "loss": 0.525, + "step": 13302 + }, + { + "epoch": 1.7317454119484577, + "grad_norm": 2.549283266067505, + "learning_rate": 3.941892499748897e-06, + "loss": 0.4751, + "step": 13305 + }, + { + "epoch": 1.7321358844201484, + "grad_norm": 3.114838123321533, + "learning_rate": 3.9398500797625355e-06, + "loss": 0.5122, + "step": 13308 + }, + { + "epoch": 1.7325263568918392, + "grad_norm": 3.187574863433838, + "learning_rate": 3.937807844982136e-06, + "loss": 0.4961, + "step": 13311 + }, + { + "epoch": 1.7329168293635298, + "grad_norm": 2.5373189449310303, + "learning_rate": 3.93576579576447e-06, + "loss": 0.4976, + "step": 13314 + }, + { + "epoch": 1.7333073018352207, + "grad_norm": 2.5374085903167725, + "learning_rate": 3.93372393246628e-06, + "loss": 0.4775, + "step": 13317 + }, + { + "epoch": 1.7336977743069113, + "grad_norm": 2.4994728565216064, + "learning_rate": 3.931682255444276e-06, + "loss": 0.4706, + "step": 13320 + }, + { + "epoch": 1.734088246778602, + "grad_norm": 4.567479610443115, + "learning_rate": 3.929640765055137e-06, + "loss": 0.5715, + "step": 13323 + }, + { + "epoch": 1.7344787192502928, + "grad_norm": 2.689119815826416, + "learning_rate": 3.927599461655503e-06, + "loss": 0.4771, + "step": 13326 + }, + { + "epoch": 1.7348691917219836, + "grad_norm": 2.8153414726257324, + "learning_rate": 3.925558345601987e-06, + "loss": 0.4448, + "step": 13329 + }, + { + 
"epoch": 1.7352596641936744, + "grad_norm": 2.966712236404419, + "learning_rate": 3.923517417251168e-06, + "loss": 0.5059, + "step": 13332 + }, + { + "epoch": 1.735650136665365, + "grad_norm": 2.778205156326294, + "learning_rate": 3.921476676959591e-06, + "loss": 0.4661, + "step": 13335 + }, + { + "epoch": 1.736040609137056, + "grad_norm": 3.179136276245117, + "learning_rate": 3.919436125083771e-06, + "loss": 0.5206, + "step": 13338 + }, + { + "epoch": 1.7364310816087465, + "grad_norm": 2.8156237602233887, + "learning_rate": 3.917395761980186e-06, + "loss": 0.5265, + "step": 13341 + }, + { + "epoch": 1.7368215540804375, + "grad_norm": 2.721435546875, + "learning_rate": 3.915355588005283e-06, + "loss": 0.5218, + "step": 13344 + }, + { + "epoch": 1.737212026552128, + "grad_norm": 3.0874862670898438, + "learning_rate": 3.913315603515479e-06, + "loss": 0.4019, + "step": 13347 + }, + { + "epoch": 1.7376024990238188, + "grad_norm": 3.176990032196045, + "learning_rate": 3.911275808867151e-06, + "loss": 0.5453, + "step": 13350 + }, + { + "epoch": 1.7379929714955096, + "grad_norm": 2.735309600830078, + "learning_rate": 3.909236204416651e-06, + "loss": 0.4105, + "step": 13353 + }, + { + "epoch": 1.7383834439672003, + "grad_norm": 2.8676302433013916, + "learning_rate": 3.90719679052029e-06, + "loss": 0.4664, + "step": 13356 + }, + { + "epoch": 1.7387739164388911, + "grad_norm": 3.0311317443847656, + "learning_rate": 3.905157567534349e-06, + "loss": 0.509, + "step": 13359 + }, + { + "epoch": 1.7391643889105817, + "grad_norm": 2.751351833343506, + "learning_rate": 3.9031185358150794e-06, + "loss": 0.5447, + "step": 13362 + }, + { + "epoch": 1.7395548613822727, + "grad_norm": 2.6076507568359375, + "learning_rate": 3.901079695718696e-06, + "loss": 0.4675, + "step": 13365 + }, + { + "epoch": 1.7399453338539632, + "grad_norm": 2.406752109527588, + "learning_rate": 3.899041047601375e-06, + "loss": 0.5027, + "step": 13368 + }, + { + "epoch": 1.740335806325654, + "grad_norm": 2.558464527130127, + "learning_rate": 3.897002591819269e-06, + "loss": 0.4831, + "step": 13371 + }, + { + "epoch": 1.7407262787973448, + "grad_norm": 2.5584099292755127, + "learning_rate": 3.894964328728489e-06, + "loss": 0.4087, + "step": 13374 + }, + { + "epoch": 1.7411167512690355, + "grad_norm": 2.5314037799835205, + "learning_rate": 3.8929262586851164e-06, + "loss": 0.4534, + "step": 13377 + }, + { + "epoch": 1.7415072237407263, + "grad_norm": 2.593812942504883, + "learning_rate": 3.890888382045198e-06, + "loss": 0.454, + "step": 13380 + }, + { + "epoch": 1.741897696212417, + "grad_norm": 2.6239094734191895, + "learning_rate": 3.8888506991647455e-06, + "loss": 0.4661, + "step": 13383 + }, + { + "epoch": 1.7422881686841079, + "grad_norm": 2.606416940689087, + "learning_rate": 3.886813210399738e-06, + "loss": 0.5233, + "step": 13386 + }, + { + "epoch": 1.7426786411557984, + "grad_norm": 2.7739343643188477, + "learning_rate": 3.884775916106121e-06, + "loss": 0.4525, + "step": 13389 + }, + { + "epoch": 1.7430691136274894, + "grad_norm": 2.44478440284729, + "learning_rate": 3.882738816639806e-06, + "loss": 0.452, + "step": 13392 + }, + { + "epoch": 1.74345958609918, + "grad_norm": 2.652439832687378, + "learning_rate": 3.880701912356668e-06, + "loss": 0.4519, + "step": 13395 + }, + { + "epoch": 1.7438500585708707, + "grad_norm": 2.414402484893799, + "learning_rate": 3.878665203612553e-06, + "loss": 0.4507, + "step": 13398 + }, + { + "epoch": 1.7442405310425615, + "grad_norm": 2.590601682662964, + "learning_rate": 3.876628690763265e-06, + 
"loss": 0.4609, + "step": 13401 + }, + { + "epoch": 1.7446310035142523, + "grad_norm": 2.637747049331665, + "learning_rate": 3.874592374164583e-06, + "loss": 0.6, + "step": 13404 + }, + { + "epoch": 1.745021475985943, + "grad_norm": 2.618107795715332, + "learning_rate": 3.872556254172246e-06, + "loss": 0.5137, + "step": 13407 + }, + { + "epoch": 1.7454119484576336, + "grad_norm": 2.5348377227783203, + "learning_rate": 3.870520331141961e-06, + "loss": 0.5094, + "step": 13410 + }, + { + "epoch": 1.7458024209293246, + "grad_norm": 2.787950277328491, + "learning_rate": 3.868484605429396e-06, + "loss": 0.5294, + "step": 13413 + }, + { + "epoch": 1.7461928934010151, + "grad_norm": 2.4089295864105225, + "learning_rate": 3.866449077390192e-06, + "loss": 0.439, + "step": 13416 + }, + { + "epoch": 1.7465833658727061, + "grad_norm": 2.6536457538604736, + "learning_rate": 3.864413747379948e-06, + "loss": 0.4402, + "step": 13419 + }, + { + "epoch": 1.7469738383443967, + "grad_norm": 2.460862398147583, + "learning_rate": 3.862378615754233e-06, + "loss": 0.4751, + "step": 13422 + }, + { + "epoch": 1.7473643108160875, + "grad_norm": 2.9238407611846924, + "learning_rate": 3.860343682868583e-06, + "loss": 0.4904, + "step": 13425 + }, + { + "epoch": 1.7477547832877782, + "grad_norm": 2.762789249420166, + "learning_rate": 3.858308949078492e-06, + "loss": 0.5113, + "step": 13428 + }, + { + "epoch": 1.748145255759469, + "grad_norm": 2.810913324356079, + "learning_rate": 3.856274414739428e-06, + "loss": 0.4988, + "step": 13431 + }, + { + "epoch": 1.7485357282311598, + "grad_norm": 2.568662643432617, + "learning_rate": 3.854240080206815e-06, + "loss": 0.5601, + "step": 13434 + }, + { + "epoch": 1.7489262007028503, + "grad_norm": 2.7099461555480957, + "learning_rate": 3.852205945836051e-06, + "loss": 0.4212, + "step": 13437 + }, + { + "epoch": 1.7493166731745413, + "grad_norm": 2.6403145790100098, + "learning_rate": 3.850172011982494e-06, + "loss": 0.4719, + "step": 13440 + }, + { + "epoch": 1.7497071456462319, + "grad_norm": 2.7137389183044434, + "learning_rate": 3.848138279001466e-06, + "loss": 0.4694, + "step": 13443 + }, + { + "epoch": 1.7500976181179226, + "grad_norm": 2.654871702194214, + "learning_rate": 3.8461047472482584e-06, + "loss": 0.4603, + "step": 13446 + }, + { + "epoch": 1.7504880905896134, + "grad_norm": 2.468869209289551, + "learning_rate": 3.844071417078124e-06, + "loss": 0.4592, + "step": 13449 + }, + { + "epoch": 1.7508785630613042, + "grad_norm": 3.2677621841430664, + "learning_rate": 3.842038288846282e-06, + "loss": 0.5041, + "step": 13452 + }, + { + "epoch": 1.751269035532995, + "grad_norm": 2.530336380004883, + "learning_rate": 3.8400053629079145e-06, + "loss": 0.5458, + "step": 13455 + }, + { + "epoch": 1.7516595080046855, + "grad_norm": 2.7045438289642334, + "learning_rate": 3.8379726396181705e-06, + "loss": 0.4496, + "step": 13458 + }, + { + "epoch": 1.7520499804763765, + "grad_norm": 2.640897035598755, + "learning_rate": 3.83594011933216e-06, + "loss": 0.4873, + "step": 13461 + }, + { + "epoch": 1.752440452948067, + "grad_norm": 2.674314022064209, + "learning_rate": 3.833907802404963e-06, + "loss": 0.5766, + "step": 13464 + }, + { + "epoch": 1.752830925419758, + "grad_norm": 3.62727689743042, + "learning_rate": 3.83187568919162e-06, + "loss": 0.4801, + "step": 13467 + }, + { + "epoch": 1.7532213978914486, + "grad_norm": 2.658531904220581, + "learning_rate": 3.829843780047137e-06, + "loss": 0.5359, + "step": 13470 + }, + { + "epoch": 1.7536118703631394, + "grad_norm": 
3.2108259201049805, + "learning_rate": 3.827812075326483e-06, + "loss": 0.542, + "step": 13473 + }, + { + "epoch": 1.7540023428348301, + "grad_norm": 2.5640816688537598, + "learning_rate": 3.825780575384595e-06, + "loss": 0.5172, + "step": 13476 + }, + { + "epoch": 1.754392815306521, + "grad_norm": 4.314723491668701, + "learning_rate": 3.823749280576369e-06, + "loss": 0.4687, + "step": 13479 + }, + { + "epoch": 1.7547832877782117, + "grad_norm": 2.8595173358917236, + "learning_rate": 3.821718191256669e-06, + "loss": 0.5252, + "step": 13482 + }, + { + "epoch": 1.7551737602499022, + "grad_norm": 2.5360357761383057, + "learning_rate": 3.819687307780321e-06, + "loss": 0.4925, + "step": 13485 + }, + { + "epoch": 1.7555642327215932, + "grad_norm": 2.3102357387542725, + "learning_rate": 3.81765663050212e-06, + "loss": 0.516, + "step": 13488 + }, + { + "epoch": 1.7559547051932838, + "grad_norm": 2.8565187454223633, + "learning_rate": 3.8156261597768165e-06, + "loss": 0.5065, + "step": 13491 + }, + { + "epoch": 1.7563451776649748, + "grad_norm": 3.1690919399261475, + "learning_rate": 3.8135958959591334e-06, + "loss": 0.506, + "step": 13494 + }, + { + "epoch": 1.7567356501366653, + "grad_norm": 2.7095978260040283, + "learning_rate": 3.8115658394037496e-06, + "loss": 0.5125, + "step": 13497 + }, + { + "epoch": 1.757126122608356, + "grad_norm": 2.614197254180908, + "learning_rate": 3.809535990465314e-06, + "loss": 0.5541, + "step": 13500 + }, + { + "epoch": 1.7575165950800469, + "grad_norm": 3.0147502422332764, + "learning_rate": 3.807506349498438e-06, + "loss": 0.4678, + "step": 13503 + }, + { + "epoch": 1.7579070675517376, + "grad_norm": 2.663944721221924, + "learning_rate": 3.8054769168576924e-06, + "loss": 0.4942, + "step": 13506 + }, + { + "epoch": 1.7582975400234284, + "grad_norm": 2.467390775680542, + "learning_rate": 3.803447692897617e-06, + "loss": 0.4927, + "step": 13509 + }, + { + "epoch": 1.758688012495119, + "grad_norm": 3.1490232944488525, + "learning_rate": 3.8014186779727123e-06, + "loss": 0.5778, + "step": 13512 + }, + { + "epoch": 1.75907848496681, + "grad_norm": 2.497331142425537, + "learning_rate": 3.7993898724374435e-06, + "loss": 0.4939, + "step": 13515 + }, + { + "epoch": 1.7594689574385005, + "grad_norm": 2.6486501693725586, + "learning_rate": 3.7973612766462387e-06, + "loss": 0.4405, + "step": 13518 + }, + { + "epoch": 1.7598594299101913, + "grad_norm": 3.311718702316284, + "learning_rate": 3.7953328909534876e-06, + "loss": 0.5304, + "step": 13521 + }, + { + "epoch": 1.760249902381882, + "grad_norm": 3.001080274581909, + "learning_rate": 3.7933047157135465e-06, + "loss": 0.6345, + "step": 13524 + }, + { + "epoch": 1.7606403748535728, + "grad_norm": 2.920088529586792, + "learning_rate": 3.7912767512807318e-06, + "loss": 0.5114, + "step": 13527 + }, + { + "epoch": 1.7610308473252636, + "grad_norm": 2.604182720184326, + "learning_rate": 3.7892489980093285e-06, + "loss": 0.5102, + "step": 13530 + }, + { + "epoch": 1.7614213197969542, + "grad_norm": 3.248330593109131, + "learning_rate": 3.7872214562535765e-06, + "loss": 0.4517, + "step": 13533 + }, + { + "epoch": 1.7618117922686451, + "grad_norm": 3.4100341796875, + "learning_rate": 3.785194126367685e-06, + "loss": 0.4723, + "step": 13536 + }, + { + "epoch": 1.7622022647403357, + "grad_norm": 3.491152286529541, + "learning_rate": 3.783167008705825e-06, + "loss": 0.5118, + "step": 13539 + }, + { + "epoch": 1.7625927372120267, + "grad_norm": 2.765254259109497, + "learning_rate": 3.7811401036221283e-06, + "loss": 0.4562, + "step": 
13542 + }, + { + "epoch": 1.7629832096837172, + "grad_norm": 2.513669967651367, + "learning_rate": 3.779113411470692e-06, + "loss": 0.5244, + "step": 13545 + }, + { + "epoch": 1.763373682155408, + "grad_norm": 2.982708215713501, + "learning_rate": 3.7770869326055733e-06, + "loss": 0.5286, + "step": 13548 + }, + { + "epoch": 1.7637641546270988, + "grad_norm": 3.210015058517456, + "learning_rate": 3.7750606673807945e-06, + "loss": 0.5115, + "step": 13551 + }, + { + "epoch": 1.7641546270987896, + "grad_norm": 3.011054277420044, + "learning_rate": 3.773034616150342e-06, + "loss": 0.5402, + "step": 13554 + }, + { + "epoch": 1.7645450995704803, + "grad_norm": 4.221112251281738, + "learning_rate": 3.7710087792681594e-06, + "loss": 0.6668, + "step": 13557 + }, + { + "epoch": 1.7649355720421709, + "grad_norm": 2.5336077213287354, + "learning_rate": 3.7689831570881584e-06, + "loss": 0.4922, + "step": 13560 + }, + { + "epoch": 1.7653260445138619, + "grad_norm": 2.933840751647949, + "learning_rate": 3.7669577499642094e-06, + "loss": 0.5391, + "step": 13563 + }, + { + "epoch": 1.7657165169855524, + "grad_norm": 2.6583011150360107, + "learning_rate": 3.7649325582501478e-06, + "loss": 0.5105, + "step": 13566 + }, + { + "epoch": 1.7661069894572434, + "grad_norm": 2.7706074714660645, + "learning_rate": 3.7629075822997685e-06, + "loss": 0.5597, + "step": 13569 + }, + { + "epoch": 1.766497461928934, + "grad_norm": 2.6521201133728027, + "learning_rate": 3.7608828224668346e-06, + "loss": 0.4072, + "step": 13572 + }, + { + "epoch": 1.7668879344006247, + "grad_norm": 2.8983657360076904, + "learning_rate": 3.7588582791050644e-06, + "loss": 0.5915, + "step": 13575 + }, + { + "epoch": 1.7672784068723155, + "grad_norm": 2.609997510910034, + "learning_rate": 3.7568339525681407e-06, + "loss": 0.4157, + "step": 13578 + }, + { + "epoch": 1.7676688793440063, + "grad_norm": 3.424751043319702, + "learning_rate": 3.754809843209712e-06, + "loss": 0.5333, + "step": 13581 + }, + { + "epoch": 1.768059351815697, + "grad_norm": 4.254384994506836, + "learning_rate": 3.752785951383383e-06, + "loss": 0.5174, + "step": 13584 + }, + { + "epoch": 1.7684498242873876, + "grad_norm": 3.230611562728882, + "learning_rate": 3.7507622774427242e-06, + "loss": 0.602, + "step": 13587 + }, + { + "epoch": 1.7688402967590786, + "grad_norm": 3.1927285194396973, + "learning_rate": 3.748738821741269e-06, + "loss": 0.523, + "step": 13590 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 3.054600238800049, + "learning_rate": 3.7467155846325086e-06, + "loss": 0.4559, + "step": 13593 + }, + { + "epoch": 1.76962124170246, + "grad_norm": 2.9370806217193604, + "learning_rate": 3.7446925664699e-06, + "loss": 0.5063, + "step": 13596 + }, + { + "epoch": 1.7700117141741507, + "grad_norm": 2.851138114929199, + "learning_rate": 3.7426697676068575e-06, + "loss": 0.4684, + "step": 13599 + }, + { + "epoch": 1.7704021866458415, + "grad_norm": 2.6864585876464844, + "learning_rate": 3.740647188396762e-06, + "loss": 0.4323, + "step": 13602 + }, + { + "epoch": 1.7707926591175323, + "grad_norm": 3.145974636077881, + "learning_rate": 3.7386248291929544e-06, + "loss": 0.4725, + "step": 13605 + }, + { + "epoch": 1.7711831315892228, + "grad_norm": 4.056112766265869, + "learning_rate": 3.7366026903487346e-06, + "loss": 0.6875, + "step": 13608 + }, + { + "epoch": 1.7715736040609138, + "grad_norm": 2.8808858394622803, + "learning_rate": 3.7345807722173655e-06, + "loss": 0.6169, + "step": 13611 + }, + { + "epoch": 1.7719640765326043, + "grad_norm": 3.3072636127471924, + 
"learning_rate": 3.732559075152075e-06, + "loss": 0.5757, + "step": 13614 + }, + { + "epoch": 1.7723545490042953, + "grad_norm": 2.6923253536224365, + "learning_rate": 3.730537599506049e-06, + "loss": 0.5674, + "step": 13617 + }, + { + "epoch": 1.772745021475986, + "grad_norm": 2.857133150100708, + "learning_rate": 3.7285163456324323e-06, + "loss": 0.5753, + "step": 13620 + }, + { + "epoch": 1.7731354939476767, + "grad_norm": 2.6387181282043457, + "learning_rate": 3.7264953138843363e-06, + "loss": 0.525, + "step": 13623 + }, + { + "epoch": 1.7735259664193674, + "grad_norm": 2.5029585361480713, + "learning_rate": 3.724474504614829e-06, + "loss": 0.4137, + "step": 13626 + }, + { + "epoch": 1.7739164388910582, + "grad_norm": 2.632925510406494, + "learning_rate": 3.7224539181769425e-06, + "loss": 0.5328, + "step": 13629 + }, + { + "epoch": 1.774306911362749, + "grad_norm": 2.7073307037353516, + "learning_rate": 3.7204335549236703e-06, + "loss": 0.4625, + "step": 13632 + }, + { + "epoch": 1.7746973838344395, + "grad_norm": 2.770639181137085, + "learning_rate": 3.718413415207962e-06, + "loss": 0.4865, + "step": 13635 + }, + { + "epoch": 1.7750878563061305, + "grad_norm": 3.0762181282043457, + "learning_rate": 3.7163934993827364e-06, + "loss": 0.522, + "step": 13638 + }, + { + "epoch": 1.775478328777821, + "grad_norm": 2.7162609100341797, + "learning_rate": 3.714373807800864e-06, + "loss": 0.4633, + "step": 13641 + }, + { + "epoch": 1.775868801249512, + "grad_norm": 2.4745163917541504, + "learning_rate": 3.7123543408151843e-06, + "loss": 0.4783, + "step": 13644 + }, + { + "epoch": 1.7762592737212026, + "grad_norm": 2.7672512531280518, + "learning_rate": 3.710335098778492e-06, + "loss": 0.548, + "step": 13647 + }, + { + "epoch": 1.7766497461928934, + "grad_norm": 2.50407075881958, + "learning_rate": 3.7083160820435445e-06, + "loss": 0.4029, + "step": 13650 + }, + { + "epoch": 1.7770402186645842, + "grad_norm": 2.7135939598083496, + "learning_rate": 3.706297290963059e-06, + "loss": 0.4438, + "step": 13653 + }, + { + "epoch": 1.777430691136275, + "grad_norm": 2.827840805053711, + "learning_rate": 3.7042787258897163e-06, + "loss": 0.5588, + "step": 13656 + }, + { + "epoch": 1.7778211636079657, + "grad_norm": 2.7864699363708496, + "learning_rate": 3.7022603871761554e-06, + "loss": 0.4841, + "step": 13659 + }, + { + "epoch": 1.7782116360796563, + "grad_norm": 3.1784870624542236, + "learning_rate": 3.700242275174973e-06, + "loss": 0.5815, + "step": 13662 + }, + { + "epoch": 1.7786021085513473, + "grad_norm": 2.666109323501587, + "learning_rate": 3.698224390238732e-06, + "loss": 0.5251, + "step": 13665 + }, + { + "epoch": 1.7789925810230378, + "grad_norm": 2.4882564544677734, + "learning_rate": 3.6962067327199523e-06, + "loss": 0.4551, + "step": 13668 + }, + { + "epoch": 1.7793830534947286, + "grad_norm": 2.7891669273376465, + "learning_rate": 3.6941893029711123e-06, + "loss": 0.4774, + "step": 13671 + }, + { + "epoch": 1.7797735259664194, + "grad_norm": 2.7892396450042725, + "learning_rate": 3.6921721013446555e-06, + "loss": 0.512, + "step": 13674 + }, + { + "epoch": 1.7801639984381101, + "grad_norm": 2.7493927478790283, + "learning_rate": 3.690155128192979e-06, + "loss": 0.3913, + "step": 13677 + }, + { + "epoch": 1.780554470909801, + "grad_norm": 2.922779083251953, + "learning_rate": 3.6881383838684475e-06, + "loss": 0.5359, + "step": 13680 + }, + { + "epoch": 1.7809449433814915, + "grad_norm": 3.476430892944336, + "learning_rate": 3.6861218687233813e-06, + "loss": 0.4305, + "step": 13683 + }, + { + 
"epoch": 1.7813354158531824, + "grad_norm": 2.7361629009246826, + "learning_rate": 3.6841055831100593e-06, + "loss": 0.5422, + "step": 13686 + }, + { + "epoch": 1.781725888324873, + "grad_norm": 2.4145750999450684, + "learning_rate": 3.6820895273807257e-06, + "loss": 0.5088, + "step": 13689 + }, + { + "epoch": 1.782116360796564, + "grad_norm": 2.698687791824341, + "learning_rate": 3.6800737018875765e-06, + "loss": 0.5185, + "step": 13692 + }, + { + "epoch": 1.7825068332682545, + "grad_norm": 3.8540420532226562, + "learning_rate": 3.678058106982775e-06, + "loss": 0.4911, + "step": 13695 + }, + { + "epoch": 1.7828973057399453, + "grad_norm": 2.5574920177459717, + "learning_rate": 3.676042743018442e-06, + "loss": 0.5211, + "step": 13698 + }, + { + "epoch": 1.783287778211636, + "grad_norm": 2.5759153366088867, + "learning_rate": 3.674027610346658e-06, + "loss": 0.455, + "step": 13701 + }, + { + "epoch": 1.7836782506833269, + "grad_norm": 2.5695457458496094, + "learning_rate": 3.672012709319459e-06, + "loss": 0.4909, + "step": 13704 + }, + { + "epoch": 1.7840687231550176, + "grad_norm": 2.9643442630767822, + "learning_rate": 3.669998040288847e-06, + "loss": 0.4506, + "step": 13707 + }, + { + "epoch": 1.7844591956267082, + "grad_norm": 2.8210248947143555, + "learning_rate": 3.66798360360678e-06, + "loss": 0.4668, + "step": 13710 + }, + { + "epoch": 1.7848496680983992, + "grad_norm": 2.568918228149414, + "learning_rate": 3.6659693996251745e-06, + "loss": 0.4581, + "step": 13713 + }, + { + "epoch": 1.7852401405700897, + "grad_norm": 2.8247551918029785, + "learning_rate": 3.663955428695908e-06, + "loss": 0.4848, + "step": 13716 + }, + { + "epoch": 1.7856306130417805, + "grad_norm": 2.960132598876953, + "learning_rate": 3.6619416911708196e-06, + "loss": 0.497, + "step": 13719 + }, + { + "epoch": 1.7860210855134713, + "grad_norm": 2.265368938446045, + "learning_rate": 3.6599281874017005e-06, + "loss": 0.4379, + "step": 13722 + }, + { + "epoch": 1.786411557985162, + "grad_norm": 2.88246750831604, + "learning_rate": 3.6579149177403093e-06, + "loss": 0.61, + "step": 13725 + }, + { + "epoch": 1.7868020304568528, + "grad_norm": 2.980820655822754, + "learning_rate": 3.6559018825383587e-06, + "loss": 0.5359, + "step": 13728 + }, + { + "epoch": 1.7871925029285436, + "grad_norm": 2.5079283714294434, + "learning_rate": 3.6538890821475204e-06, + "loss": 0.5089, + "step": 13731 + }, + { + "epoch": 1.7875829754002344, + "grad_norm": 2.517543077468872, + "learning_rate": 3.6518765169194294e-06, + "loss": 0.4798, + "step": 13734 + }, + { + "epoch": 1.787973447871925, + "grad_norm": 3.10083270072937, + "learning_rate": 3.649864187205672e-06, + "loss": 0.4349, + "step": 13737 + }, + { + "epoch": 1.788363920343616, + "grad_norm": 2.687077522277832, + "learning_rate": 3.647852093357803e-06, + "loss": 0.4688, + "step": 13740 + }, + { + "epoch": 1.7887543928153065, + "grad_norm": 2.802783966064453, + "learning_rate": 3.645840235727328e-06, + "loss": 0.4867, + "step": 13743 + }, + { + "epoch": 1.7891448652869972, + "grad_norm": 2.6640493869781494, + "learning_rate": 3.6438286146657166e-06, + "loss": 0.5281, + "step": 13746 + }, + { + "epoch": 1.789535337758688, + "grad_norm": 2.8820464611053467, + "learning_rate": 3.6418172305243914e-06, + "loss": 0.4845, + "step": 13749 + }, + { + "epoch": 1.7899258102303788, + "grad_norm": 2.6128082275390625, + "learning_rate": 3.6398060836547404e-06, + "loss": 0.4531, + "step": 13752 + }, + { + "epoch": 1.7903162827020696, + "grad_norm": 2.659701108932495, + "learning_rate": 
3.637795174408104e-06, + "loss": 0.5323, + "step": 13755 + }, + { + "epoch": 1.79070675517376, + "grad_norm": 2.875169277191162, + "learning_rate": 3.635784503135785e-06, + "loss": 0.5001, + "step": 13758 + }, + { + "epoch": 1.791097227645451, + "grad_norm": 3.0239455699920654, + "learning_rate": 3.6337740701890446e-06, + "loss": 0.4817, + "step": 13761 + }, + { + "epoch": 1.7914877001171416, + "grad_norm": 2.8980631828308105, + "learning_rate": 3.6317638759190985e-06, + "loss": 0.4957, + "step": 13764 + }, + { + "epoch": 1.7918781725888326, + "grad_norm": 2.978820562362671, + "learning_rate": 3.629753920677126e-06, + "loss": 0.5658, + "step": 13767 + }, + { + "epoch": 1.7922686450605232, + "grad_norm": 2.6764168739318848, + "learning_rate": 3.6277442048142615e-06, + "loss": 0.5507, + "step": 13770 + }, + { + "epoch": 1.792659117532214, + "grad_norm": 2.452681064605713, + "learning_rate": 3.6257347286815956e-06, + "loss": 0.4449, + "step": 13773 + }, + { + "epoch": 1.7930495900039047, + "grad_norm": 2.4262232780456543, + "learning_rate": 3.623725492630184e-06, + "loss": 0.4299, + "step": 13776 + }, + { + "epoch": 1.7934400624755955, + "grad_norm": 2.8017055988311768, + "learning_rate": 3.6217164970110296e-06, + "loss": 0.4115, + "step": 13779 + }, + { + "epoch": 1.7938305349472863, + "grad_norm": 2.725071430206299, + "learning_rate": 3.6197077421751077e-06, + "loss": 0.5487, + "step": 13782 + }, + { + "epoch": 1.7942210074189768, + "grad_norm": 3.4036777019500732, + "learning_rate": 3.6176992284733375e-06, + "loss": 0.4692, + "step": 13785 + }, + { + "epoch": 1.7946114798906678, + "grad_norm": 2.6664416790008545, + "learning_rate": 3.6156909562566054e-06, + "loss": 0.5513, + "step": 13788 + }, + { + "epoch": 1.7950019523623584, + "grad_norm": 3.166633129119873, + "learning_rate": 3.6136829258757503e-06, + "loss": 0.4979, + "step": 13791 + }, + { + "epoch": 1.7953924248340491, + "grad_norm": 2.793639898300171, + "learning_rate": 3.611675137681572e-06, + "loss": 0.5232, + "step": 13794 + }, + { + "epoch": 1.79578289730574, + "grad_norm": 2.91501784324646, + "learning_rate": 3.609667592024827e-06, + "loss": 0.516, + "step": 13797 + }, + { + "epoch": 1.7961733697774307, + "grad_norm": 2.355177164077759, + "learning_rate": 3.607660289256228e-06, + "loss": 0.5682, + "step": 13800 + }, + { + "epoch": 1.7965638422491215, + "grad_norm": 2.7519314289093018, + "learning_rate": 3.6056532297264486e-06, + "loss": 0.5224, + "step": 13803 + }, + { + "epoch": 1.796954314720812, + "grad_norm": 3.09912109375, + "learning_rate": 3.603646413786115e-06, + "loss": 0.502, + "step": 13806 + }, + { + "epoch": 1.797344787192503, + "grad_norm": 2.3987009525299072, + "learning_rate": 3.601639841785816e-06, + "loss": 0.4405, + "step": 13809 + }, + { + "epoch": 1.7977352596641936, + "grad_norm": 2.6418380737304688, + "learning_rate": 3.599633514076096e-06, + "loss": 0.4318, + "step": 13812 + }, + { + "epoch": 1.7981257321358846, + "grad_norm": 2.3674392700195312, + "learning_rate": 3.5976274310074536e-06, + "loss": 0.453, + "step": 13815 + }, + { + "epoch": 1.798516204607575, + "grad_norm": 2.514341354370117, + "learning_rate": 3.595621592930351e-06, + "loss": 0.4702, + "step": 13818 + }, + { + "epoch": 1.7989066770792659, + "grad_norm": 2.4524552822113037, + "learning_rate": 3.5936160001951977e-06, + "loss": 0.4447, + "step": 13821 + }, + { + "epoch": 1.7992971495509567, + "grad_norm": 2.8817389011383057, + "learning_rate": 3.5916106531523737e-06, + "loss": 0.5262, + "step": 13824 + }, + { + "epoch": 
1.7996876220226474, + "grad_norm": 2.756680965423584, + "learning_rate": 3.5896055521522043e-06, + "loss": 0.5041, + "step": 13827 + }, + { + "epoch": 1.8000780944943382, + "grad_norm": 2.7653608322143555, + "learning_rate": 3.587600697544979e-06, + "loss": 0.5412, + "step": 13830 + }, + { + "epoch": 1.8004685669660287, + "grad_norm": 2.692890167236328, + "learning_rate": 3.585596089680941e-06, + "loss": 0.5134, + "step": 13833 + }, + { + "epoch": 1.8008590394377197, + "grad_norm": 2.555551767349243, + "learning_rate": 3.5835917289102893e-06, + "loss": 0.4865, + "step": 13836 + }, + { + "epoch": 1.8012495119094103, + "grad_norm": 2.5611484050750732, + "learning_rate": 3.5815876155831845e-06, + "loss": 0.4808, + "step": 13839 + }, + { + "epoch": 1.8016399843811013, + "grad_norm": 3.0679988861083984, + "learning_rate": 3.5795837500497388e-06, + "loss": 0.5976, + "step": 13842 + }, + { + "epoch": 1.8020304568527918, + "grad_norm": 3.132232427597046, + "learning_rate": 3.577580132660023e-06, + "loss": 0.4778, + "step": 13845 + }, + { + "epoch": 1.8024209293244826, + "grad_norm": 3.60653018951416, + "learning_rate": 3.575576763764067e-06, + "loss": 0.4976, + "step": 13848 + }, + { + "epoch": 1.8028114017961734, + "grad_norm": 2.553593873977661, + "learning_rate": 3.573573643711852e-06, + "loss": 0.4926, + "step": 13851 + }, + { + "epoch": 1.8032018742678642, + "grad_norm": 2.7795355319976807, + "learning_rate": 3.5715707728533227e-06, + "loss": 0.4787, + "step": 13854 + }, + { + "epoch": 1.803592346739555, + "grad_norm": 2.8556997776031494, + "learning_rate": 3.5695681515383727e-06, + "loss": 0.4926, + "step": 13857 + }, + { + "epoch": 1.8039828192112455, + "grad_norm": 2.585430383682251, + "learning_rate": 3.5675657801168583e-06, + "loss": 0.4774, + "step": 13860 + }, + { + "epoch": 1.8043732916829365, + "grad_norm": 3.1514101028442383, + "learning_rate": 3.5655636589385874e-06, + "loss": 0.6497, + "step": 13863 + }, + { + "epoch": 1.804763764154627, + "grad_norm": 2.7750942707061768, + "learning_rate": 3.563561788353329e-06, + "loss": 0.5409, + "step": 13866 + }, + { + "epoch": 1.8051542366263178, + "grad_norm": 2.840725898742676, + "learning_rate": 3.561560168710804e-06, + "loss": 0.6219, + "step": 13869 + }, + { + "epoch": 1.8055447090980086, + "grad_norm": 2.4959402084350586, + "learning_rate": 3.559558800360692e-06, + "loss": 0.475, + "step": 13872 + }, + { + "epoch": 1.8059351815696993, + "grad_norm": 2.7684502601623535, + "learning_rate": 3.557557683652627e-06, + "loss": 0.4395, + "step": 13875 + }, + { + "epoch": 1.8063256540413901, + "grad_norm": 3.1551716327667236, + "learning_rate": 3.5555568189362e-06, + "loss": 0.4665, + "step": 13878 + }, + { + "epoch": 1.8067161265130807, + "grad_norm": 3.5174217224121094, + "learning_rate": 3.5535562065609598e-06, + "loss": 0.4556, + "step": 13881 + }, + { + "epoch": 1.8071065989847717, + "grad_norm": 2.683762311935425, + "learning_rate": 3.551555846876405e-06, + "loss": 0.4735, + "step": 13884 + }, + { + "epoch": 1.8074970714564622, + "grad_norm": 2.77344012260437, + "learning_rate": 3.5495557402319975e-06, + "loss": 0.5262, + "step": 13887 + }, + { + "epoch": 1.8078875439281532, + "grad_norm": 3.604360342025757, + "learning_rate": 3.5475558869771516e-06, + "loss": 0.5683, + "step": 13890 + }, + { + "epoch": 1.8082780163998438, + "grad_norm": 2.6248981952667236, + "learning_rate": 3.545556287461236e-06, + "loss": 0.4836, + "step": 13893 + }, + { + "epoch": 1.8086684888715345, + "grad_norm": 3.0263054370880127, + "learning_rate": 
3.543556942033577e-06, + "loss": 0.4698, + "step": 13896 + }, + { + "epoch": 1.8090589613432253, + "grad_norm": 2.9796009063720703, + "learning_rate": 3.5415578510434572e-06, + "loss": 0.4816, + "step": 13899 + }, + { + "epoch": 1.809449433814916, + "grad_norm": 2.5007407665252686, + "learning_rate": 3.539559014840112e-06, + "loss": 0.4536, + "step": 13902 + }, + { + "epoch": 1.8098399062866068, + "grad_norm": 4.0354323387146, + "learning_rate": 3.537560433772733e-06, + "loss": 0.5184, + "step": 13905 + }, + { + "epoch": 1.8102303787582974, + "grad_norm": 2.4867868423461914, + "learning_rate": 3.5355621081904717e-06, + "loss": 0.4836, + "step": 13908 + }, + { + "epoch": 1.8106208512299884, + "grad_norm": 2.469088315963745, + "learning_rate": 3.5335640384424296e-06, + "loss": 0.5388, + "step": 13911 + }, + { + "epoch": 1.811011323701679, + "grad_norm": 2.7055606842041016, + "learning_rate": 3.531566224877665e-06, + "loss": 0.4555, + "step": 13914 + }, + { + "epoch": 1.81140179617337, + "grad_norm": 2.7885193824768066, + "learning_rate": 3.529568667845192e-06, + "loss": 0.5323, + "step": 13917 + }, + { + "epoch": 1.8117922686450605, + "grad_norm": 2.801211357116699, + "learning_rate": 3.5275713676939782e-06, + "loss": 0.4274, + "step": 13920 + }, + { + "epoch": 1.8121827411167513, + "grad_norm": 2.6563968658447266, + "learning_rate": 3.525574324772949e-06, + "loss": 0.4206, + "step": 13923 + }, + { + "epoch": 1.812573213588442, + "grad_norm": 3.055948495864868, + "learning_rate": 3.523577539430985e-06, + "loss": 0.4404, + "step": 13926 + }, + { + "epoch": 1.8129636860601328, + "grad_norm": 2.6117143630981445, + "learning_rate": 3.5215810120169182e-06, + "loss": 0.4857, + "step": 13929 + }, + { + "epoch": 1.8133541585318236, + "grad_norm": 2.4713497161865234, + "learning_rate": 3.5195847428795388e-06, + "loss": 0.4886, + "step": 13932 + }, + { + "epoch": 1.8137446310035141, + "grad_norm": 2.4422478675842285, + "learning_rate": 3.5175887323675896e-06, + "loss": 0.5087, + "step": 13935 + }, + { + "epoch": 1.8141351034752051, + "grad_norm": 2.721740484237671, + "learning_rate": 3.5155929808297706e-06, + "loss": 0.4973, + "step": 13938 + }, + { + "epoch": 1.8145255759468957, + "grad_norm": 2.5684258937835693, + "learning_rate": 3.5135974886147358e-06, + "loss": 0.4052, + "step": 13941 + }, + { + "epoch": 1.8149160484185864, + "grad_norm": 2.683708667755127, + "learning_rate": 3.5116022560710916e-06, + "loss": 0.4971, + "step": 13944 + }, + { + "epoch": 1.8153065208902772, + "grad_norm": 2.9392552375793457, + "learning_rate": 3.5096072835474015e-06, + "loss": 0.4841, + "step": 13947 + }, + { + "epoch": 1.815696993361968, + "grad_norm": 2.6948726177215576, + "learning_rate": 3.5076125713921844e-06, + "loss": 0.5777, + "step": 13950 + }, + { + "epoch": 1.8160874658336588, + "grad_norm": 2.9752037525177, + "learning_rate": 3.505618119953913e-06, + "loss": 0.4687, + "step": 13953 + }, + { + "epoch": 1.8164779383053493, + "grad_norm": 2.543264865875244, + "learning_rate": 3.5036239295810105e-06, + "loss": 0.4773, + "step": 13956 + }, + { + "epoch": 1.8168684107770403, + "grad_norm": 2.5448291301727295, + "learning_rate": 3.5016300006218607e-06, + "loss": 0.5217, + "step": 13959 + }, + { + "epoch": 1.8172588832487309, + "grad_norm": 2.9891393184661865, + "learning_rate": 3.4996363334247975e-06, + "loss": 0.4791, + "step": 13962 + }, + { + "epoch": 1.8176493557204219, + "grad_norm": 2.651190757751465, + "learning_rate": 3.49764292833811e-06, + "loss": 0.5361, + "step": 13965 + }, + { + "epoch": 
1.8180398281921124, + "grad_norm": 2.7708990573883057, + "learning_rate": 3.4956497857100437e-06, + "loss": 0.4645, + "step": 13968 + }, + { + "epoch": 1.8184303006638032, + "grad_norm": 2.510695457458496, + "learning_rate": 3.493656905888794e-06, + "loss": 0.4663, + "step": 13971 + }, + { + "epoch": 1.818820773135494, + "grad_norm": 3.075878381729126, + "learning_rate": 3.4916642892225138e-06, + "loss": 0.5056, + "step": 13974 + }, + { + "epoch": 1.8192112456071847, + "grad_norm": 2.6439952850341797, + "learning_rate": 3.4896719360593106e-06, + "loss": 0.533, + "step": 13977 + }, + { + "epoch": 1.8196017180788755, + "grad_norm": 2.594844102859497, + "learning_rate": 3.4876798467472415e-06, + "loss": 0.4981, + "step": 13980 + }, + { + "epoch": 1.819992190550566, + "grad_norm": 3.148606538772583, + "learning_rate": 3.4856880216343235e-06, + "loss": 0.5199, + "step": 13983 + }, + { + "epoch": 1.820382663022257, + "grad_norm": 2.5869855880737305, + "learning_rate": 3.4836964610685207e-06, + "loss": 0.437, + "step": 13986 + }, + { + "epoch": 1.8207731354939476, + "grad_norm": 2.792637586593628, + "learning_rate": 3.4817051653977553e-06, + "loss": 0.5815, + "step": 13989 + }, + { + "epoch": 1.8211636079656386, + "grad_norm": 2.534087896347046, + "learning_rate": 3.479714134969905e-06, + "loss": 0.4854, + "step": 13992 + }, + { + "epoch": 1.8215540804373291, + "grad_norm": 4.054605960845947, + "learning_rate": 3.4777233701327974e-06, + "loss": 0.5168, + "step": 13995 + }, + { + "epoch": 1.82194455290902, + "grad_norm": 2.612586498260498, + "learning_rate": 3.4757328712342143e-06, + "loss": 0.4172, + "step": 13998 + }, + { + "epoch": 1.8223350253807107, + "grad_norm": 2.6771693229675293, + "learning_rate": 3.4737426386218913e-06, + "loss": 0.4638, + "step": 14001 + }, + { + "epoch": 1.8227254978524015, + "grad_norm": 2.4215550422668457, + "learning_rate": 3.4717526726435204e-06, + "loss": 0.5155, + "step": 14004 + }, + { + "epoch": 1.8231159703240922, + "grad_norm": 2.9423484802246094, + "learning_rate": 3.469762973646741e-06, + "loss": 0.4784, + "step": 14007 + }, + { + "epoch": 1.8235064427957828, + "grad_norm": 3.212175130844116, + "learning_rate": 3.4677735419791507e-06, + "loss": 0.4752, + "step": 14010 + }, + { + "epoch": 1.8238969152674738, + "grad_norm": 2.640934944152832, + "learning_rate": 3.465784377988301e-06, + "loss": 0.5054, + "step": 14013 + }, + { + "epoch": 1.8242873877391643, + "grad_norm": 2.4397079944610596, + "learning_rate": 3.4637954820216914e-06, + "loss": 0.4716, + "step": 14016 + }, + { + "epoch": 1.824677860210855, + "grad_norm": 2.737032413482666, + "learning_rate": 3.4618068544267806e-06, + "loss": 0.547, + "step": 14019 + }, + { + "epoch": 1.8250683326825459, + "grad_norm": 2.630606174468994, + "learning_rate": 3.459818495550976e-06, + "loss": 0.5541, + "step": 14022 + }, + { + "epoch": 1.8254588051542366, + "grad_norm": 2.5901596546173096, + "learning_rate": 3.4578304057416394e-06, + "loss": 0.4804, + "step": 14025 + }, + { + "epoch": 1.8258492776259274, + "grad_norm": 2.6611945629119873, + "learning_rate": 3.455842585346088e-06, + "loss": 0.5694, + "step": 14028 + }, + { + "epoch": 1.826239750097618, + "grad_norm": 2.6162936687469482, + "learning_rate": 3.4538550347115863e-06, + "loss": 0.5321, + "step": 14031 + }, + { + "epoch": 1.826630222569309, + "grad_norm": 2.7024919986724854, + "learning_rate": 3.4518677541853584e-06, + "loss": 0.4661, + "step": 14034 + }, + { + "epoch": 1.8270206950409995, + "grad_norm": 2.514699697494507, + "learning_rate": 
3.4498807441145775e-06, + "loss": 0.5, + "step": 14037 + }, + { + "epoch": 1.8274111675126905, + "grad_norm": 2.608577013015747, + "learning_rate": 3.4478940048463705e-06, + "loss": 0.4691, + "step": 14040 + }, + { + "epoch": 1.827801639984381, + "grad_norm": 2.8621628284454346, + "learning_rate": 3.445907536727814e-06, + "loss": 0.4189, + "step": 14043 + }, + { + "epoch": 1.8281921124560718, + "grad_norm": 2.61087703704834, + "learning_rate": 3.4439213401059436e-06, + "loss": 0.4572, + "step": 14046 + }, + { + "epoch": 1.8285825849277626, + "grad_norm": 2.482490062713623, + "learning_rate": 3.4419354153277398e-06, + "loss": 0.4796, + "step": 14049 + }, + { + "epoch": 1.8289730573994534, + "grad_norm": 2.7227578163146973, + "learning_rate": 3.4399497627401414e-06, + "loss": 0.4849, + "step": 14052 + }, + { + "epoch": 1.8293635298711441, + "grad_norm": 2.800520181655884, + "learning_rate": 3.437964382690039e-06, + "loss": 0.4679, + "step": 14055 + }, + { + "epoch": 1.8297540023428347, + "grad_norm": 2.624976396560669, + "learning_rate": 3.4359792755242716e-06, + "loss": 0.5038, + "step": 14058 + }, + { + "epoch": 1.8301444748145257, + "grad_norm": 2.559096097946167, + "learning_rate": 3.4339944415896354e-06, + "loss": 0.4533, + "step": 14061 + }, + { + "epoch": 1.8305349472862162, + "grad_norm": 2.5498814582824707, + "learning_rate": 3.432009881232875e-06, + "loss": 0.4878, + "step": 14064 + }, + { + "epoch": 1.830925419757907, + "grad_norm": 2.93689227104187, + "learning_rate": 3.4300255948006893e-06, + "loss": 0.4776, + "step": 14067 + }, + { + "epoch": 1.8313158922295978, + "grad_norm": 2.3573851585388184, + "learning_rate": 3.4280415826397304e-06, + "loss": 0.4692, + "step": 14070 + }, + { + "epoch": 1.8317063647012886, + "grad_norm": 2.303317070007324, + "learning_rate": 3.426057845096598e-06, + "loss": 0.4765, + "step": 14073 + }, + { + "epoch": 1.8320968371729793, + "grad_norm": 2.6520111560821533, + "learning_rate": 3.4240743825178514e-06, + "loss": 0.4867, + "step": 14076 + }, + { + "epoch": 1.83248730964467, + "grad_norm": 2.8947081565856934, + "learning_rate": 3.4220911952499943e-06, + "loss": 0.5239, + "step": 14079 + }, + { + "epoch": 1.8328777821163609, + "grad_norm": 2.4253430366516113, + "learning_rate": 3.4201082836394868e-06, + "loss": 0.5101, + "step": 14082 + }, + { + "epoch": 1.8332682545880514, + "grad_norm": 3.378649950027466, + "learning_rate": 3.418125648032737e-06, + "loss": 0.4679, + "step": 14085 + }, + { + "epoch": 1.8336587270597424, + "grad_norm": 2.7288544178009033, + "learning_rate": 3.4161432887761093e-06, + "loss": 0.4794, + "step": 14088 + }, + { + "epoch": 1.834049199531433, + "grad_norm": 2.832486152648926, + "learning_rate": 3.414161206215918e-06, + "loss": 0.5284, + "step": 14091 + }, + { + "epoch": 1.8344396720031237, + "grad_norm": 2.27494215965271, + "learning_rate": 3.4121794006984265e-06, + "loss": 0.4875, + "step": 14094 + }, + { + "epoch": 1.8348301444748145, + "grad_norm": 2.7032313346862793, + "learning_rate": 3.4101978725698553e-06, + "loss": 0.4654, + "step": 14097 + }, + { + "epoch": 1.8352206169465053, + "grad_norm": 2.672342300415039, + "learning_rate": 3.40821662217637e-06, + "loss": 0.4787, + "step": 14100 + }, + { + "epoch": 1.835611089418196, + "grad_norm": 2.9763712882995605, + "learning_rate": 3.4062356498640915e-06, + "loss": 0.4966, + "step": 14103 + }, + { + "epoch": 1.8360015618898866, + "grad_norm": 2.8209915161132812, + "learning_rate": 3.4042549559790938e-06, + "loss": 0.5311, + "step": 14106 + }, + { + "epoch": 
1.8363920343615776, + "grad_norm": 3.304166078567505, + "learning_rate": 3.4022745408673973e-06, + "loss": 0.4676, + "step": 14109 + }, + { + "epoch": 1.8367825068332682, + "grad_norm": 2.846513271331787, + "learning_rate": 3.400294404874978e-06, + "loss": 0.4849, + "step": 14112 + }, + { + "epoch": 1.8371729793049592, + "grad_norm": 2.843594789505005, + "learning_rate": 3.3983145483477582e-06, + "loss": 0.5584, + "step": 14115 + }, + { + "epoch": 1.8375634517766497, + "grad_norm": 2.5627312660217285, + "learning_rate": 3.39633497163162e-06, + "loss": 0.4594, + "step": 14118 + }, + { + "epoch": 1.8379539242483405, + "grad_norm": 2.5003201961517334, + "learning_rate": 3.394355675072388e-06, + "loss": 0.425, + "step": 14121 + }, + { + "epoch": 1.8383443967200312, + "grad_norm": 2.563779354095459, + "learning_rate": 3.3923766590158425e-06, + "loss": 0.5306, + "step": 14124 + }, + { + "epoch": 1.838734869191722, + "grad_norm": 2.529606819152832, + "learning_rate": 3.3903979238077124e-06, + "loss": 0.5286, + "step": 14127 + }, + { + "epoch": 1.8391253416634128, + "grad_norm": 2.8574764728546143, + "learning_rate": 3.3884194697936777e-06, + "loss": 0.5582, + "step": 14130 + }, + { + "epoch": 1.8395158141351033, + "grad_norm": 2.485995292663574, + "learning_rate": 3.3864412973193734e-06, + "loss": 0.4713, + "step": 14133 + }, + { + "epoch": 1.8399062866067943, + "grad_norm": 3.2460153102874756, + "learning_rate": 3.3844634067303783e-06, + "loss": 0.4945, + "step": 14136 + }, + { + "epoch": 1.840296759078485, + "grad_norm": 3.2644894123077393, + "learning_rate": 3.382485798372228e-06, + "loss": 0.524, + "step": 14139 + }, + { + "epoch": 1.8406872315501757, + "grad_norm": 2.7396457195281982, + "learning_rate": 3.380508472590407e-06, + "loss": 0.4508, + "step": 14142 + }, + { + "epoch": 1.8410777040218664, + "grad_norm": 2.678882598876953, + "learning_rate": 3.3785314297303477e-06, + "loss": 0.4521, + "step": 14145 + }, + { + "epoch": 1.8414681764935572, + "grad_norm": 2.4532623291015625, + "learning_rate": 3.3765546701374375e-06, + "loss": 0.4508, + "step": 14148 + }, + { + "epoch": 1.841858648965248, + "grad_norm": 2.8880367279052734, + "learning_rate": 3.3745781941570104e-06, + "loss": 0.4395, + "step": 14151 + }, + { + "epoch": 1.8422491214369385, + "grad_norm": 2.6723203659057617, + "learning_rate": 3.372602002134353e-06, + "loss": 0.4853, + "step": 14154 + }, + { + "epoch": 1.8426395939086295, + "grad_norm": 2.651268243789673, + "learning_rate": 3.370626094414702e-06, + "loss": 0.4853, + "step": 14157 + }, + { + "epoch": 1.84303006638032, + "grad_norm": 2.4674911499023438, + "learning_rate": 3.368650471343246e-06, + "loss": 0.5133, + "step": 14160 + }, + { + "epoch": 1.843420538852011, + "grad_norm": 2.827803373336792, + "learning_rate": 3.36667513326512e-06, + "loss": 0.4601, + "step": 14163 + }, + { + "epoch": 1.8438110113237016, + "grad_norm": 3.614109754562378, + "learning_rate": 3.364700080525412e-06, + "loss": 0.5692, + "step": 14166 + }, + { + "epoch": 1.8442014837953924, + "grad_norm": 3.3453476428985596, + "learning_rate": 3.362725313469161e-06, + "loss": 0.512, + "step": 14169 + }, + { + "epoch": 1.8445919562670832, + "grad_norm": 2.600367546081543, + "learning_rate": 3.3607508324413525e-06, + "loss": 0.5574, + "step": 14172 + }, + { + "epoch": 1.844982428738774, + "grad_norm": 2.761702299118042, + "learning_rate": 3.3587766377869256e-06, + "loss": 0.4362, + "step": 14175 + }, + { + "epoch": 1.8453729012104647, + "grad_norm": 2.6490187644958496, + "learning_rate": 
3.3568027298507673e-06, + "loss": 0.5518, + "step": 14178 + }, + { + "epoch": 1.8457633736821553, + "grad_norm": 2.837568521499634, + "learning_rate": 3.3548291089777146e-06, + "loss": 0.4928, + "step": 14181 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 2.8118865489959717, + "learning_rate": 3.352855775512557e-06, + "loss": 0.5401, + "step": 14184 + }, + { + "epoch": 1.8465443186255368, + "grad_norm": 2.6508400440216064, + "learning_rate": 3.350882729800029e-06, + "loss": 0.5607, + "step": 14187 + }, + { + "epoch": 1.8469347910972278, + "grad_norm": 3.0374929904937744, + "learning_rate": 3.3489099721848188e-06, + "loss": 0.4713, + "step": 14190 + }, + { + "epoch": 1.8473252635689184, + "grad_norm": 2.809018135070801, + "learning_rate": 3.346937503011565e-06, + "loss": 0.5143, + "step": 14193 + }, + { + "epoch": 1.8477157360406091, + "grad_norm": 2.843602418899536, + "learning_rate": 3.34496532262485e-06, + "loss": 0.4967, + "step": 14196 + }, + { + "epoch": 1.8481062085123, + "grad_norm": 2.7359254360198975, + "learning_rate": 3.3429934313692102e-06, + "loss": 0.527, + "step": 14199 + }, + { + "epoch": 1.8484966809839907, + "grad_norm": 3.020136594772339, + "learning_rate": 3.341021829589134e-06, + "loss": 0.5108, + "step": 14202 + }, + { + "epoch": 1.8488871534556814, + "grad_norm": 2.610989809036255, + "learning_rate": 3.3390505176290544e-06, + "loss": 0.56, + "step": 14205 + }, + { + "epoch": 1.849277625927372, + "grad_norm": 3.2436866760253906, + "learning_rate": 3.3370794958333547e-06, + "loss": 0.4896, + "step": 14208 + }, + { + "epoch": 1.849668098399063, + "grad_norm": 2.493400812149048, + "learning_rate": 3.33510876454637e-06, + "loss": 0.463, + "step": 14211 + }, + { + "epoch": 1.8500585708707535, + "grad_norm": 2.6635570526123047, + "learning_rate": 3.33313832411238e-06, + "loss": 0.4602, + "step": 14214 + }, + { + "epoch": 1.8504490433424443, + "grad_norm": 2.576613664627075, + "learning_rate": 3.3311681748756185e-06, + "loss": 0.4683, + "step": 14217 + }, + { + "epoch": 1.850839515814135, + "grad_norm": 2.853515148162842, + "learning_rate": 3.3291983171802676e-06, + "loss": 0.5186, + "step": 14220 + }, + { + "epoch": 1.8512299882858259, + "grad_norm": 2.5207786560058594, + "learning_rate": 3.3272287513704544e-06, + "loss": 0.4683, + "step": 14223 + }, + { + "epoch": 1.8516204607575166, + "grad_norm": 2.8509247303009033, + "learning_rate": 3.32525947779026e-06, + "loss": 0.4833, + "step": 14226 + }, + { + "epoch": 1.8520109332292072, + "grad_norm": 4.261683464050293, + "learning_rate": 3.3232904967837116e-06, + "loss": 0.5181, + "step": 14229 + }, + { + "epoch": 1.8524014057008982, + "grad_norm": 2.9350645542144775, + "learning_rate": 3.3213218086947857e-06, + "loss": 0.4807, + "step": 14232 + }, + { + "epoch": 1.8527918781725887, + "grad_norm": 2.8778061866760254, + "learning_rate": 3.3193534138674094e-06, + "loss": 0.4726, + "step": 14235 + }, + { + "epoch": 1.8531823506442797, + "grad_norm": 3.9157052040100098, + "learning_rate": 3.3173853126454546e-06, + "loss": 0.5698, + "step": 14238 + }, + { + "epoch": 1.8535728231159703, + "grad_norm": 2.5443649291992188, + "learning_rate": 3.3154175053727478e-06, + "loss": 0.5034, + "step": 14241 + }, + { + "epoch": 1.853963295587661, + "grad_norm": 2.9121346473693848, + "learning_rate": 3.313449992393055e-06, + "loss": 0.5932, + "step": 14244 + }, + { + "epoch": 1.8543537680593518, + "grad_norm": 2.4990174770355225, + "learning_rate": 3.311482774050105e-06, + "loss": 0.4869, + "step": 14247 + }, + { + "epoch": 
1.8547442405310426, + "grad_norm": 3.1738932132720947, + "learning_rate": 3.3095158506875603e-06, + "loss": 0.6015, + "step": 14250 + }, + { + "epoch": 1.8551347130027334, + "grad_norm": 2.808356285095215, + "learning_rate": 3.3075492226490404e-06, + "loss": 0.4492, + "step": 14253 + }, + { + "epoch": 1.855525185474424, + "grad_norm": 2.824268102645874, + "learning_rate": 3.3055828902781116e-06, + "loss": 0.5146, + "step": 14256 + }, + { + "epoch": 1.855915657946115, + "grad_norm": 2.653419017791748, + "learning_rate": 3.3036168539182867e-06, + "loss": 0.5002, + "step": 14259 + }, + { + "epoch": 1.8563061304178055, + "grad_norm": 2.6380562782287598, + "learning_rate": 3.3016511139130296e-06, + "loss": 0.4467, + "step": 14262 + }, + { + "epoch": 1.8566966028894965, + "grad_norm": 3.018332004547119, + "learning_rate": 3.299685670605749e-06, + "loss": 0.5476, + "step": 14265 + }, + { + "epoch": 1.857087075361187, + "grad_norm": 2.474071979522705, + "learning_rate": 3.297720524339805e-06, + "loss": 0.5168, + "step": 14268 + }, + { + "epoch": 1.8574775478328778, + "grad_norm": 2.407060384750366, + "learning_rate": 3.295755675458505e-06, + "loss": 0.44, + "step": 14271 + }, + { + "epoch": 1.8578680203045685, + "grad_norm": 2.6494884490966797, + "learning_rate": 3.2937911243051035e-06, + "loss": 0.56, + "step": 14274 + }, + { + "epoch": 1.8582584927762593, + "grad_norm": 2.5988290309906006, + "learning_rate": 3.291826871222803e-06, + "loss": 0.458, + "step": 14277 + }, + { + "epoch": 1.85864896524795, + "grad_norm": 3.0355000495910645, + "learning_rate": 3.2898629165547534e-06, + "loss": 0.497, + "step": 14280 + }, + { + "epoch": 1.8590394377196406, + "grad_norm": 2.6298789978027344, + "learning_rate": 3.2878992606440557e-06, + "loss": 0.537, + "step": 14283 + }, + { + "epoch": 1.8594299101913316, + "grad_norm": 3.61897349357605, + "learning_rate": 3.2859359038337537e-06, + "loss": 0.5495, + "step": 14286 + }, + { + "epoch": 1.8598203826630222, + "grad_norm": 2.8695971965789795, + "learning_rate": 3.283972846466846e-06, + "loss": 0.4939, + "step": 14289 + }, + { + "epoch": 1.860210855134713, + "grad_norm": 2.84236741065979, + "learning_rate": 3.2820100888862703e-06, + "loss": 0.4501, + "step": 14292 + }, + { + "epoch": 1.8606013276064037, + "grad_norm": 2.9210715293884277, + "learning_rate": 3.2800476314349184e-06, + "loss": 0.5261, + "step": 14295 + }, + { + "epoch": 1.8609918000780945, + "grad_norm": 2.3659050464630127, + "learning_rate": 3.2780854744556284e-06, + "loss": 0.4849, + "step": 14298 + }, + { + "epoch": 1.8613822725497853, + "grad_norm": 2.6904006004333496, + "learning_rate": 3.276123618291182e-06, + "loss": 0.4635, + "step": 14301 + }, + { + "epoch": 1.8617727450214758, + "grad_norm": 3.254288911819458, + "learning_rate": 3.274162063284314e-06, + "loss": 0.4929, + "step": 14304 + }, + { + "epoch": 1.8621632174931668, + "grad_norm": 2.973808526992798, + "learning_rate": 3.2722008097777025e-06, + "loss": 0.4474, + "step": 14307 + }, + { + "epoch": 1.8625536899648574, + "grad_norm": 2.9174914360046387, + "learning_rate": 3.2702398581139742e-06, + "loss": 0.5224, + "step": 14310 + }, + { + "epoch": 1.8629441624365484, + "grad_norm": 2.607135057449341, + "learning_rate": 3.268279208635705e-06, + "loss": 0.4667, + "step": 14313 + }, + { + "epoch": 1.863334634908239, + "grad_norm": 2.792543649673462, + "learning_rate": 3.266318861685414e-06, + "loss": 0.4632, + "step": 14316 + }, + { + "epoch": 1.8637251073799297, + "grad_norm": 2.533592462539673, + "learning_rate": 
3.2643588176055706e-06, + "loss": 0.4745, + "step": 14319 + }, + { + "epoch": 1.8641155798516205, + "grad_norm": 2.8035097122192383, + "learning_rate": 3.2623990767385923e-06, + "loss": 0.4732, + "step": 14322 + }, + { + "epoch": 1.8645060523233112, + "grad_norm": 2.933763265609741, + "learning_rate": 3.2604396394268377e-06, + "loss": 0.567, + "step": 14325 + }, + { + "epoch": 1.864896524795002, + "grad_norm": 2.4261467456817627, + "learning_rate": 3.2584805060126183e-06, + "loss": 0.429, + "step": 14328 + }, + { + "epoch": 1.8652869972666926, + "grad_norm": 2.9292244911193848, + "learning_rate": 3.2565216768381924e-06, + "loss": 0.4519, + "step": 14331 + }, + { + "epoch": 1.8656774697383836, + "grad_norm": 2.7582828998565674, + "learning_rate": 3.2545631522457623e-06, + "loss": 0.5999, + "step": 14334 + }, + { + "epoch": 1.866067942210074, + "grad_norm": 2.7902820110321045, + "learning_rate": 3.2526049325774762e-06, + "loss": 0.4989, + "step": 14337 + }, + { + "epoch": 1.866458414681765, + "grad_norm": 2.306330442428589, + "learning_rate": 3.2506470181754336e-06, + "loss": 0.529, + "step": 14340 + }, + { + "epoch": 1.8668488871534556, + "grad_norm": 2.968900442123413, + "learning_rate": 3.2486894093816755e-06, + "loss": 0.5525, + "step": 14343 + }, + { + "epoch": 1.8672393596251464, + "grad_norm": 2.6159310340881348, + "learning_rate": 3.246732106538194e-06, + "loss": 0.4139, + "step": 14346 + }, + { + "epoch": 1.8676298320968372, + "grad_norm": 2.9599854946136475, + "learning_rate": 3.2447751099869264e-06, + "loss": 0.5086, + "step": 14349 + }, + { + "epoch": 1.868020304568528, + "grad_norm": 2.811408519744873, + "learning_rate": 3.242818420069753e-06, + "loss": 0.4619, + "step": 14352 + }, + { + "epoch": 1.8684107770402187, + "grad_norm": 3.143258810043335, + "learning_rate": 3.240862037128506e-06, + "loss": 0.5827, + "step": 14355 + }, + { + "epoch": 1.8688012495119093, + "grad_norm": 2.813084840774536, + "learning_rate": 3.23890596150496e-06, + "loss": 0.497, + "step": 14358 + }, + { + "epoch": 1.8691917219836003, + "grad_norm": 2.9719557762145996, + "learning_rate": 3.2369501935408375e-06, + "loss": 0.5179, + "step": 14361 + }, + { + "epoch": 1.8695821944552908, + "grad_norm": 3.1469063758850098, + "learning_rate": 3.234994733577808e-06, + "loss": 0.4145, + "step": 14364 + }, + { + "epoch": 1.8699726669269816, + "grad_norm": 2.816202402114868, + "learning_rate": 3.2330395819574845e-06, + "loss": 0.4828, + "step": 14367 + }, + { + "epoch": 1.8703631393986724, + "grad_norm": 2.6076595783233643, + "learning_rate": 3.2310847390214283e-06, + "loss": 0.5034, + "step": 14370 + }, + { + "epoch": 1.8707536118703632, + "grad_norm": 3.1273179054260254, + "learning_rate": 3.229130205111147e-06, + "loss": 0.4748, + "step": 14373 + }, + { + "epoch": 1.871144084342054, + "grad_norm": 2.559382438659668, + "learning_rate": 3.2271759805680956e-06, + "loss": 0.4771, + "step": 14376 + }, + { + "epoch": 1.8715345568137445, + "grad_norm": 2.720404624938965, + "learning_rate": 3.22522206573367e-06, + "loss": 0.4631, + "step": 14379 + }, + { + "epoch": 1.8719250292854355, + "grad_norm": 3.042336940765381, + "learning_rate": 3.223268460949215e-06, + "loss": 0.5467, + "step": 14382 + }, + { + "epoch": 1.872315501757126, + "grad_norm": 2.677138090133667, + "learning_rate": 3.221315166556024e-06, + "loss": 0.4852, + "step": 14385 + }, + { + "epoch": 1.872705974228817, + "grad_norm": 3.1034047603607178, + "learning_rate": 3.21936218289533e-06, + "loss": 0.5535, + "step": 14388 + }, + { + "epoch": 
1.8730964467005076, + "grad_norm": 3.5317320823669434, + "learning_rate": 3.217409510308318e-06, + "loss": 0.4424, + "step": 14391 + }, + { + "epoch": 1.8734869191721983, + "grad_norm": 3.066141366958618, + "learning_rate": 3.215457149136114e-06, + "loss": 0.454, + "step": 14394 + }, + { + "epoch": 1.8738773916438891, + "grad_norm": 2.683347702026367, + "learning_rate": 3.213505099719791e-06, + "loss": 0.4301, + "step": 14397 + }, + { + "epoch": 1.8742678641155799, + "grad_norm": 2.6969237327575684, + "learning_rate": 3.2115533624003703e-06, + "loss": 0.5306, + "step": 14400 + }, + { + "epoch": 1.8746583365872707, + "grad_norm": 2.678342580795288, + "learning_rate": 3.2096019375188135e-06, + "loss": 0.4622, + "step": 14403 + }, + { + "epoch": 1.8750488090589612, + "grad_norm": 2.750361919403076, + "learning_rate": 3.2076508254160334e-06, + "loss": 0.4639, + "step": 14406 + }, + { + "epoch": 1.8754392815306522, + "grad_norm": 2.4361095428466797, + "learning_rate": 3.205700026432882e-06, + "loss": 0.4648, + "step": 14409 + }, + { + "epoch": 1.8758297540023428, + "grad_norm": 2.7799482345581055, + "learning_rate": 3.2037495409101603e-06, + "loss": 0.5366, + "step": 14412 + }, + { + "epoch": 1.8762202264740335, + "grad_norm": 2.5228137969970703, + "learning_rate": 3.201799369188616e-06, + "loss": 0.5324, + "step": 14415 + }, + { + "epoch": 1.8766106989457243, + "grad_norm": 2.955665349960327, + "learning_rate": 3.1998495116089413e-06, + "loss": 0.528, + "step": 14418 + }, + { + "epoch": 1.877001171417415, + "grad_norm": 3.2559332847595215, + "learning_rate": 3.1978999685117674e-06, + "loss": 0.4378, + "step": 14421 + }, + { + "epoch": 1.8773916438891058, + "grad_norm": 3.0776727199554443, + "learning_rate": 3.1959507402376787e-06, + "loss": 0.4957, + "step": 14424 + }, + { + "epoch": 1.8777821163607966, + "grad_norm": 2.5709543228149414, + "learning_rate": 3.1940018271272013e-06, + "loss": 0.5957, + "step": 14427 + }, + { + "epoch": 1.8781725888324874, + "grad_norm": 2.643850564956665, + "learning_rate": 3.1920532295208045e-06, + "loss": 0.5714, + "step": 14430 + }, + { + "epoch": 1.878563061304178, + "grad_norm": 2.7562592029571533, + "learning_rate": 3.190104947758905e-06, + "loss": 0.527, + "step": 14433 + }, + { + "epoch": 1.878953533775869, + "grad_norm": 2.548182964324951, + "learning_rate": 3.1881569821818646e-06, + "loss": 0.4998, + "step": 14436 + }, + { + "epoch": 1.8793440062475595, + "grad_norm": 2.421619176864624, + "learning_rate": 3.186209333129987e-06, + "loss": 0.4503, + "step": 14439 + }, + { + "epoch": 1.8797344787192503, + "grad_norm": 2.976463794708252, + "learning_rate": 3.1842620009435244e-06, + "loss": 0.5083, + "step": 14442 + }, + { + "epoch": 1.880124951190941, + "grad_norm": 3.011476516723633, + "learning_rate": 3.1823149859626695e-06, + "loss": 0.4516, + "step": 14445 + }, + { + "epoch": 1.8805154236626318, + "grad_norm": 2.777480125427246, + "learning_rate": 3.180368288527563e-06, + "loss": 0.5437, + "step": 14448 + }, + { + "epoch": 1.8809058961343226, + "grad_norm": 2.6614065170288086, + "learning_rate": 3.1784219089782885e-06, + "loss": 0.532, + "step": 14451 + }, + { + "epoch": 1.8812963686060131, + "grad_norm": 3.332582473754883, + "learning_rate": 3.176475847654873e-06, + "loss": 0.5112, + "step": 14454 + }, + { + "epoch": 1.8816868410777041, + "grad_norm": 3.011148452758789, + "learning_rate": 3.1745301048972923e-06, + "loss": 0.5642, + "step": 14457 + }, + { + "epoch": 1.8820773135493947, + "grad_norm": 3.288445472717285, + "learning_rate": 
3.1725846810454612e-06, + "loss": 0.4546, + "step": 14460 + }, + { + "epoch": 1.8824677860210857, + "grad_norm": 2.9788262844085693, + "learning_rate": 3.170639576439244e-06, + "loss": 0.5007, + "step": 14463 + }, + { + "epoch": 1.8828582584927762, + "grad_norm": 2.3742592334747314, + "learning_rate": 3.1686947914184424e-06, + "loss": 0.4228, + "step": 14466 + }, + { + "epoch": 1.883248730964467, + "grad_norm": 2.842820405960083, + "learning_rate": 3.1667503263228093e-06, + "loss": 0.5175, + "step": 14469 + }, + { + "epoch": 1.8836392034361578, + "grad_norm": 3.063159942626953, + "learning_rate": 3.1648061814920372e-06, + "loss": 0.476, + "step": 14472 + }, + { + "epoch": 1.8840296759078485, + "grad_norm": 2.8648762702941895, + "learning_rate": 3.1628623572657646e-06, + "loss": 0.4767, + "step": 14475 + }, + { + "epoch": 1.8844201483795393, + "grad_norm": 2.5164406299591064, + "learning_rate": 3.160918853983574e-06, + "loss": 0.5384, + "step": 14478 + }, + { + "epoch": 1.8848106208512299, + "grad_norm": 2.923412322998047, + "learning_rate": 3.1589756719849897e-06, + "loss": 0.4499, + "step": 14481 + }, + { + "epoch": 1.8852010933229209, + "grad_norm": 2.5238845348358154, + "learning_rate": 3.1570328116094835e-06, + "loss": 0.4332, + "step": 14484 + }, + { + "epoch": 1.8855915657946114, + "grad_norm": 2.5288538932800293, + "learning_rate": 3.155090273196467e-06, + "loss": 0.4952, + "step": 14487 + }, + { + "epoch": 1.8859820382663022, + "grad_norm": 2.2970383167266846, + "learning_rate": 3.153148057085299e-06, + "loss": 0.4388, + "step": 14490 + }, + { + "epoch": 1.886372510737993, + "grad_norm": 4.423757553100586, + "learning_rate": 3.1512061636152814e-06, + "loss": 0.4552, + "step": 14493 + }, + { + "epoch": 1.8867629832096837, + "grad_norm": 3.0830297470092773, + "learning_rate": 3.149264593125655e-06, + "loss": 0.5075, + "step": 14496 + }, + { + "epoch": 1.8871534556813745, + "grad_norm": 2.8348443508148193, + "learning_rate": 3.147323345955612e-06, + "loss": 0.5471, + "step": 14499 + }, + { + "epoch": 1.887543928153065, + "grad_norm": 2.6750619411468506, + "learning_rate": 3.1453824224442836e-06, + "loss": 0.458, + "step": 14502 + }, + { + "epoch": 1.887934400624756, + "grad_norm": 2.684641122817993, + "learning_rate": 3.143441822930745e-06, + "loss": 0.4285, + "step": 14505 + }, + { + "epoch": 1.8883248730964466, + "grad_norm": 2.603320598602295, + "learning_rate": 3.1415015477540136e-06, + "loss": 0.5014, + "step": 14508 + }, + { + "epoch": 1.8887153455681376, + "grad_norm": 2.6286399364471436, + "learning_rate": 3.1395615972530514e-06, + "loss": 0.4378, + "step": 14511 + }, + { + "epoch": 1.8891058180398281, + "grad_norm": 2.7393951416015625, + "learning_rate": 3.137621971766766e-06, + "loss": 0.5872, + "step": 14514 + }, + { + "epoch": 1.889496290511519, + "grad_norm": 2.693389415740967, + "learning_rate": 3.1356826716340027e-06, + "loss": 0.5285, + "step": 14517 + }, + { + "epoch": 1.8898867629832097, + "grad_norm": 2.694274663925171, + "learning_rate": 3.1337436971935565e-06, + "loss": 0.4483, + "step": 14520 + }, + { + "epoch": 1.8902772354549005, + "grad_norm": 2.6547138690948486, + "learning_rate": 3.131805048784159e-06, + "loss": 0.4728, + "step": 14523 + }, + { + "epoch": 1.8906677079265912, + "grad_norm": 2.5658202171325684, + "learning_rate": 3.129866726744489e-06, + "loss": 0.5587, + "step": 14526 + }, + { + "epoch": 1.8910581803982818, + "grad_norm": 2.6531288623809814, + "learning_rate": 3.1279287314131694e-06, + "loss": 0.5347, + "step": 14529 + }, + { + "epoch": 
1.8914486528699728, + "grad_norm": 2.8203883171081543, + "learning_rate": 3.1259910631287605e-06, + "loss": 0.4708, + "step": 14532 + }, + { + "epoch": 1.8918391253416633, + "grad_norm": 2.5969855785369873, + "learning_rate": 3.1240537222297716e-06, + "loss": 0.5771, + "step": 14535 + }, + { + "epoch": 1.8922295978133543, + "grad_norm": 3.149728775024414, + "learning_rate": 3.122116709054649e-06, + "loss": 0.5279, + "step": 14538 + }, + { + "epoch": 1.8926200702850449, + "grad_norm": 2.6601550579071045, + "learning_rate": 3.120180023941789e-06, + "loss": 0.4675, + "step": 14541 + }, + { + "epoch": 1.8930105427567356, + "grad_norm": 2.72967529296875, + "learning_rate": 3.118243667229523e-06, + "loss": 0.4924, + "step": 14544 + }, + { + "epoch": 1.8934010152284264, + "grad_norm": 3.1012916564941406, + "learning_rate": 3.116307639256131e-06, + "loss": 0.4926, + "step": 14547 + }, + { + "epoch": 1.8937914877001172, + "grad_norm": 2.993875026702881, + "learning_rate": 3.1143719403598307e-06, + "loss": 0.4754, + "step": 14550 + }, + { + "epoch": 1.894181960171808, + "grad_norm": 4.121201515197754, + "learning_rate": 3.1124365708787856e-06, + "loss": 0.5015, + "step": 14553 + }, + { + "epoch": 1.8945724326434985, + "grad_norm": 2.526926040649414, + "learning_rate": 3.110501531151102e-06, + "loss": 0.4477, + "step": 14556 + }, + { + "epoch": 1.8949629051151895, + "grad_norm": 3.1489815711975098, + "learning_rate": 3.1085668215148245e-06, + "loss": 0.444, + "step": 14559 + }, + { + "epoch": 1.89535337758688, + "grad_norm": 2.5161519050598145, + "learning_rate": 3.1066324423079445e-06, + "loss": 0.5054, + "step": 14562 + }, + { + "epoch": 1.8957438500585708, + "grad_norm": 2.526221990585327, + "learning_rate": 3.104698393868395e-06, + "loss": 0.489, + "step": 14565 + }, + { + "epoch": 1.8961343225302616, + "grad_norm": 2.54593825340271, + "learning_rate": 3.102764676534048e-06, + "loss": 0.49, + "step": 14568 + }, + { + "epoch": 1.8965247950019524, + "grad_norm": 3.2360286712646484, + "learning_rate": 3.1008312906427212e-06, + "loss": 0.4462, + "step": 14571 + }, + { + "epoch": 1.8969152674736431, + "grad_norm": 3.160997152328491, + "learning_rate": 3.098898236532172e-06, + "loss": 0.4875, + "step": 14574 + }, + { + "epoch": 1.8973057399453337, + "grad_norm": 4.4345808029174805, + "learning_rate": 3.096965514540102e-06, + "loss": 0.5209, + "step": 14577 + }, + { + "epoch": 1.8976962124170247, + "grad_norm": 2.559445858001709, + "learning_rate": 3.0950331250041515e-06, + "loss": 0.5191, + "step": 14580 + }, + { + "epoch": 1.8980866848887152, + "grad_norm": 2.8593218326568604, + "learning_rate": 3.093101068261909e-06, + "loss": 0.4575, + "step": 14583 + }, + { + "epoch": 1.8984771573604062, + "grad_norm": 2.4053094387054443, + "learning_rate": 3.0911693446508973e-06, + "loss": 0.4613, + "step": 14586 + }, + { + "epoch": 1.8988676298320968, + "grad_norm": 2.4780166149139404, + "learning_rate": 3.089237954508585e-06, + "loss": 0.4018, + "step": 14589 + }, + { + "epoch": 1.8992581023037876, + "grad_norm": 2.995244264602661, + "learning_rate": 3.0873068981723842e-06, + "loss": 0.5322, + "step": 14592 + }, + { + "epoch": 1.8996485747754783, + "grad_norm": 2.625032663345337, + "learning_rate": 3.085376175979643e-06, + "loss": 0.4439, + "step": 14595 + }, + { + "epoch": 1.900039047247169, + "grad_norm": 2.8063700199127197, + "learning_rate": 3.083445788267657e-06, + "loss": 0.4898, + "step": 14598 + }, + { + "epoch": 1.9004295197188599, + "grad_norm": 2.738100051879883, + "learning_rate": 
3.081515735373659e-06, + "loss": 0.5017, + "step": 14601 + }, + { + "epoch": 1.9008199921905504, + "grad_norm": 2.68695330619812, + "learning_rate": 3.0795860176348267e-06, + "loss": 0.4812, + "step": 14604 + }, + { + "epoch": 1.9012104646622414, + "grad_norm": 3.1662650108337402, + "learning_rate": 3.0776566353882775e-06, + "loss": 0.4975, + "step": 14607 + }, + { + "epoch": 1.901600937133932, + "grad_norm": 2.6604361534118652, + "learning_rate": 3.0757275889710697e-06, + "loss": 0.4858, + "step": 14610 + }, + { + "epoch": 1.901991409605623, + "grad_norm": 2.82057523727417, + "learning_rate": 3.0737988787202034e-06, + "loss": 0.4714, + "step": 14613 + }, + { + "epoch": 1.9023818820773135, + "grad_norm": 2.7449402809143066, + "learning_rate": 3.071870504972623e-06, + "loss": 0.5916, + "step": 14616 + }, + { + "epoch": 1.9027723545490043, + "grad_norm": 2.767760753631592, + "learning_rate": 3.0699424680652066e-06, + "loss": 0.5381, + "step": 14619 + }, + { + "epoch": 1.903162827020695, + "grad_norm": 2.4877452850341797, + "learning_rate": 3.0680147683347803e-06, + "loss": 0.4768, + "step": 14622 + }, + { + "epoch": 1.9035532994923858, + "grad_norm": 2.712571382522583, + "learning_rate": 3.066087406118111e-06, + "loss": 0.4785, + "step": 14625 + }, + { + "epoch": 1.9039437719640766, + "grad_norm": 2.5379676818847656, + "learning_rate": 3.064160381751905e-06, + "loss": 0.4833, + "step": 14628 + }, + { + "epoch": 1.9043342444357672, + "grad_norm": 2.888591766357422, + "learning_rate": 3.062233695572806e-06, + "loss": 0.5205, + "step": 14631 + }, + { + "epoch": 1.9047247169074581, + "grad_norm": 2.4471936225891113, + "learning_rate": 3.060307347917405e-06, + "loss": 0.4541, + "step": 14634 + }, + { + "epoch": 1.9051151893791487, + "grad_norm": 2.5203559398651123, + "learning_rate": 3.0583813391222294e-06, + "loss": 0.4671, + "step": 14637 + }, + { + "epoch": 1.9055056618508395, + "grad_norm": 2.5161707401275635, + "learning_rate": 3.056455669523749e-06, + "loss": 0.4245, + "step": 14640 + }, + { + "epoch": 1.9058961343225302, + "grad_norm": 2.660309076309204, + "learning_rate": 3.0545303394583755e-06, + "loss": 0.4664, + "step": 14643 + }, + { + "epoch": 1.906286606794221, + "grad_norm": 2.8389623165130615, + "learning_rate": 3.0526053492624574e-06, + "loss": 0.5387, + "step": 14646 + }, + { + "epoch": 1.9066770792659118, + "grad_norm": 2.4667160511016846, + "learning_rate": 3.05068069927229e-06, + "loss": 0.4764, + "step": 14649 + }, + { + "epoch": 1.9070675517376023, + "grad_norm": 3.343127727508545, + "learning_rate": 3.0487563898241025e-06, + "loss": 0.5026, + "step": 14652 + }, + { + "epoch": 1.9074580242092933, + "grad_norm": 2.870053291320801, + "learning_rate": 3.046832421254068e-06, + "loss": 0.4699, + "step": 14655 + }, + { + "epoch": 1.9078484966809839, + "grad_norm": 2.796257495880127, + "learning_rate": 3.0449087938983025e-06, + "loss": 0.5665, + "step": 14658 + }, + { + "epoch": 1.9082389691526749, + "grad_norm": 2.6486213207244873, + "learning_rate": 3.0429855080928567e-06, + "loss": 0.4896, + "step": 14661 + }, + { + "epoch": 1.9086294416243654, + "grad_norm": 2.7578749656677246, + "learning_rate": 3.0410625641737245e-06, + "loss": 0.4799, + "step": 14664 + }, + { + "epoch": 1.9090199140960562, + "grad_norm": 2.627326726913452, + "learning_rate": 3.0391399624768424e-06, + "loss": 0.4241, + "step": 14667 + }, + { + "epoch": 1.909410386567747, + "grad_norm": 2.7783567905426025, + "learning_rate": 3.0372177033380846e-06, + "loss": 0.4997, + "step": 14670 + }, + { + "epoch": 
1.9098008590394377, + "grad_norm": 2.5225064754486084, + "learning_rate": 3.0352957870932643e-06, + "loss": 0.4955, + "step": 14673 + }, + { + "epoch": 1.9101913315111285, + "grad_norm": 4.349731922149658, + "learning_rate": 3.0333742140781374e-06, + "loss": 0.4517, + "step": 14676 + }, + { + "epoch": 1.910581803982819, + "grad_norm": 2.4601027965545654, + "learning_rate": 3.031452984628398e-06, + "loss": 0.5083, + "step": 14679 + }, + { + "epoch": 1.91097227645451, + "grad_norm": 2.8652219772338867, + "learning_rate": 3.02953209907968e-06, + "loss": 0.4887, + "step": 14682 + }, + { + "epoch": 1.9113627489262006, + "grad_norm": 2.7341558933258057, + "learning_rate": 3.0276115577675604e-06, + "loss": 0.5125, + "step": 14685 + }, + { + "epoch": 1.9117532213978916, + "grad_norm": 3.915335178375244, + "learning_rate": 3.025691361027552e-06, + "loss": 0.531, + "step": 14688 + }, + { + "epoch": 1.9121436938695822, + "grad_norm": 3.0976195335388184, + "learning_rate": 3.023771509195108e-06, + "loss": 0.4759, + "step": 14691 + }, + { + "epoch": 1.912534166341273, + "grad_norm": 2.6159005165100098, + "learning_rate": 3.0218520026056264e-06, + "loss": 0.4972, + "step": 14694 + }, + { + "epoch": 1.9129246388129637, + "grad_norm": 2.7680933475494385, + "learning_rate": 3.019932841594437e-06, + "loss": 0.5076, + "step": 14697 + }, + { + "epoch": 1.9133151112846545, + "grad_norm": 3.1167070865631104, + "learning_rate": 3.0180140264968153e-06, + "loss": 0.6106, + "step": 14700 + }, + { + "epoch": 1.9137055837563453, + "grad_norm": 2.826345682144165, + "learning_rate": 3.0160955576479735e-06, + "loss": 0.4774, + "step": 14703 + }, + { + "epoch": 1.9140960562280358, + "grad_norm": 2.6846492290496826, + "learning_rate": 3.014177435383063e-06, + "loss": 0.4761, + "step": 14706 + }, + { + "epoch": 1.9144865286997268, + "grad_norm": 2.6631195545196533, + "learning_rate": 3.0122596600371777e-06, + "loss": 0.4419, + "step": 14709 + }, + { + "epoch": 1.9148770011714173, + "grad_norm": 2.869974374771118, + "learning_rate": 3.010342231945349e-06, + "loss": 0.4628, + "step": 14712 + }, + { + "epoch": 1.9152674736431081, + "grad_norm": 2.954019546508789, + "learning_rate": 3.008425151442546e-06, + "loss": 0.4112, + "step": 14715 + }, + { + "epoch": 1.915657946114799, + "grad_norm": 3.6011927127838135, + "learning_rate": 3.0065084188636794e-06, + "loss": 0.5629, + "step": 14718 + }, + { + "epoch": 1.9160484185864897, + "grad_norm": 2.377660036087036, + "learning_rate": 3.0045920345435996e-06, + "loss": 0.4984, + "step": 14721 + }, + { + "epoch": 1.9164388910581804, + "grad_norm": 2.4976119995117188, + "learning_rate": 3.002675998817093e-06, + "loss": 0.5637, + "step": 14724 + }, + { + "epoch": 1.916829363529871, + "grad_norm": 2.8942108154296875, + "learning_rate": 3.0007603120188877e-06, + "loss": 0.4453, + "step": 14727 + }, + { + "epoch": 1.917219836001562, + "grad_norm": 2.675811290740967, + "learning_rate": 2.998844974483649e-06, + "loss": 0.4768, + "step": 14730 + }, + { + "epoch": 1.9176103084732525, + "grad_norm": 2.6159439086914062, + "learning_rate": 2.9969299865459845e-06, + "loss": 0.4118, + "step": 14733 + }, + { + "epoch": 1.9180007809449435, + "grad_norm": 2.5294883251190186, + "learning_rate": 2.995015348540438e-06, + "loss": 0.4178, + "step": 14736 + }, + { + "epoch": 1.918391253416634, + "grad_norm": 2.5951461791992188, + "learning_rate": 2.993101060801491e-06, + "loss": 0.5095, + "step": 14739 + }, + { + "epoch": 1.9187817258883249, + "grad_norm": 2.7234127521514893, + "learning_rate": 
2.991187123663567e-06, + "loss": 0.4648, + "step": 14742 + }, + { + "epoch": 1.9191721983600156, + "grad_norm": 2.6454546451568604, + "learning_rate": 2.9892735374610273e-06, + "loss": 0.4701, + "step": 14745 + }, + { + "epoch": 1.9195626708317064, + "grad_norm": 2.923607349395752, + "learning_rate": 2.987360302528169e-06, + "loss": 0.4573, + "step": 14748 + }, + { + "epoch": 1.9199531433033972, + "grad_norm": 2.9072718620300293, + "learning_rate": 2.9854474191992323e-06, + "loss": 0.457, + "step": 14751 + }, + { + "epoch": 1.9203436157750877, + "grad_norm": 2.4134531021118164, + "learning_rate": 2.983534887808394e-06, + "loss": 0.4027, + "step": 14754 + }, + { + "epoch": 1.9207340882467787, + "grad_norm": 2.700012683868408, + "learning_rate": 2.9816227086897696e-06, + "loss": 0.4745, + "step": 14757 + }, + { + "epoch": 1.9211245607184693, + "grad_norm": 2.7360305786132812, + "learning_rate": 2.9797108821774114e-06, + "loss": 0.4525, + "step": 14760 + }, + { + "epoch": 1.9215150331901603, + "grad_norm": 2.5662286281585693, + "learning_rate": 2.9777994086053123e-06, + "loss": 0.5103, + "step": 14763 + }, + { + "epoch": 1.9219055056618508, + "grad_norm": 2.873728036880493, + "learning_rate": 2.975888288307402e-06, + "loss": 0.5333, + "step": 14766 + }, + { + "epoch": 1.9222959781335416, + "grad_norm": 3.446164846420288, + "learning_rate": 2.973977521617549e-06, + "loss": 0.4311, + "step": 14769 + }, + { + "epoch": 1.9226864506052324, + "grad_norm": 2.5216710567474365, + "learning_rate": 2.9720671088695628e-06, + "loss": 0.5313, + "step": 14772 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 2.530691385269165, + "learning_rate": 2.9701570503971846e-06, + "loss": 0.497, + "step": 14775 + }, + { + "epoch": 1.923467395548614, + "grad_norm": 2.462676763534546, + "learning_rate": 2.968247346534101e-06, + "loss": 0.4939, + "step": 14778 + }, + { + "epoch": 1.9238578680203045, + "grad_norm": 3.943150043487549, + "learning_rate": 2.9663379976139307e-06, + "loss": 0.5034, + "step": 14781 + }, + { + "epoch": 1.9242483404919954, + "grad_norm": 2.7674999237060547, + "learning_rate": 2.964429003970234e-06, + "loss": 0.4747, + "step": 14784 + }, + { + "epoch": 1.924638812963686, + "grad_norm": 2.529186725616455, + "learning_rate": 2.9625203659365094e-06, + "loss": 0.4283, + "step": 14787 + }, + { + "epoch": 1.9250292854353768, + "grad_norm": 2.8938591480255127, + "learning_rate": 2.9606120838461884e-06, + "loss": 0.4594, + "step": 14790 + }, + { + "epoch": 1.9254197579070675, + "grad_norm": 2.950148820877075, + "learning_rate": 2.958704158032647e-06, + "loss": 0.5019, + "step": 14793 + }, + { + "epoch": 1.9258102303787583, + "grad_norm": 3.631539821624756, + "learning_rate": 2.956796588829195e-06, + "loss": 0.4972, + "step": 14796 + }, + { + "epoch": 1.926200702850449, + "grad_norm": 2.9207494258880615, + "learning_rate": 2.954889376569081e-06, + "loss": 0.5222, + "step": 14799 + }, + { + "epoch": 1.9265911753221396, + "grad_norm": 2.5649592876434326, + "learning_rate": 2.9529825215854907e-06, + "loss": 0.5079, + "step": 14802 + }, + { + "epoch": 1.9269816477938306, + "grad_norm": 2.7472033500671387, + "learning_rate": 2.951076024211547e-06, + "loss": 0.4459, + "step": 14805 + }, + { + "epoch": 1.9273721202655212, + "grad_norm": 2.761784315109253, + "learning_rate": 2.949169884780313e-06, + "loss": 0.4845, + "step": 14808 + }, + { + "epoch": 1.9277625927372122, + "grad_norm": 2.7410855293273926, + "learning_rate": 2.947264103624784e-06, + "loss": 0.5347, + "step": 14811 + }, + { + "epoch": 
1.9281530652089027, + "grad_norm": 3.519693613052368, + "learning_rate": 2.9453586810778996e-06, + "loss": 0.512, + "step": 14814 + }, + { + "epoch": 1.9285435376805935, + "grad_norm": 2.6707022190093994, + "learning_rate": 2.9434536174725305e-06, + "loss": 0.4977, + "step": 14817 + }, + { + "epoch": 1.9289340101522843, + "grad_norm": 3.4989326000213623, + "learning_rate": 2.941548913141487e-06, + "loss": 0.4988, + "step": 14820 + }, + { + "epoch": 1.929324482623975, + "grad_norm": 2.744438886642456, + "learning_rate": 2.9396445684175196e-06, + "loss": 0.4738, + "step": 14823 + }, + { + "epoch": 1.9297149550956658, + "grad_norm": 3.4008469581604004, + "learning_rate": 2.9377405836333106e-06, + "loss": 0.4601, + "step": 14826 + }, + { + "epoch": 1.9301054275673564, + "grad_norm": 3.585939407348633, + "learning_rate": 2.9358369591214847e-06, + "loss": 0.4585, + "step": 14829 + }, + { + "epoch": 1.9304959000390474, + "grad_norm": 2.683812379837036, + "learning_rate": 2.9339336952145957e-06, + "loss": 0.4817, + "step": 14832 + }, + { + "epoch": 1.930886372510738, + "grad_norm": 2.6055679321289062, + "learning_rate": 2.932030792245148e-06, + "loss": 0.5232, + "step": 14835 + }, + { + "epoch": 1.9312768449824287, + "grad_norm": 2.5260632038116455, + "learning_rate": 2.9301282505455687e-06, + "loss": 0.4211, + "step": 14838 + }, + { + "epoch": 1.9316673174541195, + "grad_norm": 2.553103446960449, + "learning_rate": 2.9282260704482313e-06, + "loss": 0.4662, + "step": 14841 + }, + { + "epoch": 1.9320577899258102, + "grad_norm": 2.7395856380462646, + "learning_rate": 2.9263242522854397e-06, + "loss": 0.4706, + "step": 14844 + }, + { + "epoch": 1.932448262397501, + "grad_norm": 2.5782995223999023, + "learning_rate": 2.924422796389439e-06, + "loss": 0.4471, + "step": 14847 + }, + { + "epoch": 1.9328387348691918, + "grad_norm": 2.5767011642456055, + "learning_rate": 2.92252170309241e-06, + "loss": 0.4797, + "step": 14850 + }, + { + "epoch": 1.9332292073408825, + "grad_norm": 2.6651394367218018, + "learning_rate": 2.920620972726468e-06, + "loss": 0.5292, + "step": 14853 + }, + { + "epoch": 1.933619679812573, + "grad_norm": 3.3890509605407715, + "learning_rate": 2.9187206056236693e-06, + "loss": 0.4272, + "step": 14856 + }, + { + "epoch": 1.934010152284264, + "grad_norm": 3.1942598819732666, + "learning_rate": 2.916820602116e-06, + "loss": 0.5652, + "step": 14859 + }, + { + "epoch": 1.9344006247559546, + "grad_norm": 2.847402572631836, + "learning_rate": 2.914920962535391e-06, + "loss": 0.467, + "step": 14862 + }, + { + "epoch": 1.9347910972276454, + "grad_norm": 2.6904537677764893, + "learning_rate": 2.913021687213704e-06, + "loss": 0.5246, + "step": 14865 + }, + { + "epoch": 1.9351815696993362, + "grad_norm": 2.631735324859619, + "learning_rate": 2.9111227764827376e-06, + "loss": 0.4657, + "step": 14868 + }, + { + "epoch": 1.935572042171027, + "grad_norm": 2.6257803440093994, + "learning_rate": 2.9092242306742256e-06, + "loss": 0.5195, + "step": 14871 + }, + { + "epoch": 1.9359625146427177, + "grad_norm": 2.7459394931793213, + "learning_rate": 2.9073260501198424e-06, + "loss": 0.4976, + "step": 14874 + }, + { + "epoch": 1.9363529871144083, + "grad_norm": 2.597118854522705, + "learning_rate": 2.905428235151198e-06, + "loss": 0.4381, + "step": 14877 + }, + { + "epoch": 1.9367434595860993, + "grad_norm": 2.7893166542053223, + "learning_rate": 2.9035307860998346e-06, + "loss": 0.4861, + "step": 14880 + }, + { + "epoch": 1.9371339320577898, + "grad_norm": 2.5637564659118652, + "learning_rate": 
2.90163370329723e-06, + "loss": 0.5199, + "step": 14883 + }, + { + "epoch": 1.9375244045294808, + "grad_norm": 2.49005389213562, + "learning_rate": 2.899736987074806e-06, + "loss": 0.5072, + "step": 14886 + }, + { + "epoch": 1.9379148770011714, + "grad_norm": 2.6753792762756348, + "learning_rate": 2.8978406377639114e-06, + "loss": 0.483, + "step": 14889 + }, + { + "epoch": 1.9383053494728621, + "grad_norm": 2.4277470111846924, + "learning_rate": 2.8959446556958333e-06, + "loss": 0.5358, + "step": 14892 + }, + { + "epoch": 1.938695821944553, + "grad_norm": 2.332995653152466, + "learning_rate": 2.8940490412017997e-06, + "loss": 0.4988, + "step": 14895 + }, + { + "epoch": 1.9390862944162437, + "grad_norm": 2.639230728149414, + "learning_rate": 2.892153794612968e-06, + "loss": 0.4237, + "step": 14898 + }, + { + "epoch": 1.9394767668879345, + "grad_norm": 2.7841386795043945, + "learning_rate": 2.8902589162604323e-06, + "loss": 0.4862, + "step": 14901 + }, + { + "epoch": 1.939867239359625, + "grad_norm": 2.618056535720825, + "learning_rate": 2.8883644064752274e-06, + "loss": 0.4738, + "step": 14904 + }, + { + "epoch": 1.940257711831316, + "grad_norm": 2.2609994411468506, + "learning_rate": 2.886470265588319e-06, + "loss": 0.4599, + "step": 14907 + }, + { + "epoch": 1.9406481843030066, + "grad_norm": 2.6283135414123535, + "learning_rate": 2.8845764939306063e-06, + "loss": 0.5095, + "step": 14910 + }, + { + "epoch": 1.9410386567746973, + "grad_norm": 2.641413688659668, + "learning_rate": 2.8826830918329325e-06, + "loss": 0.4114, + "step": 14913 + }, + { + "epoch": 1.941429129246388, + "grad_norm": 2.5801405906677246, + "learning_rate": 2.8807900596260663e-06, + "loss": 0.5321, + "step": 14916 + }, + { + "epoch": 1.9418196017180789, + "grad_norm": 2.513852834701538, + "learning_rate": 2.87889739764072e-06, + "loss": 0.5193, + "step": 14919 + }, + { + "epoch": 1.9422100741897697, + "grad_norm": 2.852112054824829, + "learning_rate": 2.8770051062075343e-06, + "loss": 0.4681, + "step": 14922 + }, + { + "epoch": 1.9426005466614602, + "grad_norm": 2.5019607543945312, + "learning_rate": 2.8751131856570935e-06, + "loss": 0.5315, + "step": 14925 + }, + { + "epoch": 1.9429910191331512, + "grad_norm": 2.95793080329895, + "learning_rate": 2.873221636319908e-06, + "loss": 0.5047, + "step": 14928 + }, + { + "epoch": 1.9433814916048417, + "grad_norm": 2.9171652793884277, + "learning_rate": 2.871330458526429e-06, + "loss": 0.5912, + "step": 14931 + }, + { + "epoch": 1.9437719640765327, + "grad_norm": 3.1407294273376465, + "learning_rate": 2.8694396526070383e-06, + "loss": 0.4123, + "step": 14934 + }, + { + "epoch": 1.9441624365482233, + "grad_norm": 2.9281039237976074, + "learning_rate": 2.8675492188920605e-06, + "loss": 0.6034, + "step": 14937 + }, + { + "epoch": 1.944552909019914, + "grad_norm": 3.462409257888794, + "learning_rate": 2.865659157711748e-06, + "loss": 0.4745, + "step": 14940 + }, + { + "epoch": 1.9449433814916048, + "grad_norm": 3.067357301712036, + "learning_rate": 2.863769469396289e-06, + "loss": 0.4869, + "step": 14943 + }, + { + "epoch": 1.9453338539632956, + "grad_norm": 2.8917782306671143, + "learning_rate": 2.8618801542758116e-06, + "loss": 0.513, + "step": 14946 + }, + { + "epoch": 1.9457243264349864, + "grad_norm": 2.6178336143493652, + "learning_rate": 2.859991212680373e-06, + "loss": 0.5033, + "step": 14949 + }, + { + "epoch": 1.946114798906677, + "grad_norm": 2.847200870513916, + "learning_rate": 2.858102644939966e-06, + "loss": 0.4524, + "step": 14952 + }, + { + "epoch": 
1.946505271378368, + "grad_norm": 2.400238275527954, + "learning_rate": 2.8562144513845236e-06, + "loss": 0.412, + "step": 14955 + }, + { + "epoch": 1.9468957438500585, + "grad_norm": 2.936591386795044, + "learning_rate": 2.8543266323439034e-06, + "loss": 0.4973, + "step": 14958 + }, + { + "epoch": 1.9472862163217495, + "grad_norm": 2.7296829223632812, + "learning_rate": 2.8524391881479096e-06, + "loss": 0.4715, + "step": 14961 + }, + { + "epoch": 1.94767668879344, + "grad_norm": 2.435722589492798, + "learning_rate": 2.8505521191262697e-06, + "loss": 0.4847, + "step": 14964 + }, + { + "epoch": 1.9480671612651308, + "grad_norm": 2.6124179363250732, + "learning_rate": 2.8486654256086543e-06, + "loss": 0.5464, + "step": 14967 + }, + { + "epoch": 1.9484576337368216, + "grad_norm": 3.493014335632324, + "learning_rate": 2.8467791079246636e-06, + "loss": 0.5372, + "step": 14970 + }, + { + "epoch": 1.9488481062085123, + "grad_norm": 3.0467758178710938, + "learning_rate": 2.8448931664038315e-06, + "loss": 0.551, + "step": 14973 + }, + { + "epoch": 1.9492385786802031, + "grad_norm": 2.6954057216644287, + "learning_rate": 2.843007601375631e-06, + "loss": 0.5891, + "step": 14976 + }, + { + "epoch": 1.9496290511518937, + "grad_norm": 3.6716325283050537, + "learning_rate": 2.8411224131694647e-06, + "loss": 0.4633, + "step": 14979 + }, + { + "epoch": 1.9500195236235847, + "grad_norm": 2.4117796421051025, + "learning_rate": 2.839237602114672e-06, + "loss": 0.5026, + "step": 14982 + }, + { + "epoch": 1.9504099960952752, + "grad_norm": 3.0987799167633057, + "learning_rate": 2.837353168540522e-06, + "loss": 0.5195, + "step": 14985 + }, + { + "epoch": 1.950800468566966, + "grad_norm": 2.7744829654693604, + "learning_rate": 2.8354691127762256e-06, + "loss": 0.4605, + "step": 14988 + }, + { + "epoch": 1.9511909410386568, + "grad_norm": 2.757351875305176, + "learning_rate": 2.8335854351509223e-06, + "loss": 0.506, + "step": 14991 + }, + { + "epoch": 1.9515814135103475, + "grad_norm": 2.551328659057617, + "learning_rate": 2.8317021359936837e-06, + "loss": 0.5723, + "step": 14994 + }, + { + "epoch": 1.9519718859820383, + "grad_norm": 2.6989192962646484, + "learning_rate": 2.829819215633523e-06, + "loss": 0.4556, + "step": 14997 + }, + { + "epoch": 1.9523623584537289, + "grad_norm": 2.8726086616516113, + "learning_rate": 2.8279366743993776e-06, + "loss": 0.4711, + "step": 15000 + }, + { + "epoch": 1.9527528309254198, + "grad_norm": 2.742509603500366, + "learning_rate": 2.8260545126201277e-06, + "loss": 0.4994, + "step": 15003 + }, + { + "epoch": 1.9531433033971104, + "grad_norm": 2.504814624786377, + "learning_rate": 2.824172730624579e-06, + "loss": 0.5377, + "step": 15006 + }, + { + "epoch": 1.9535337758688014, + "grad_norm": 2.9309048652648926, + "learning_rate": 2.82229132874148e-06, + "loss": 0.5133, + "step": 15009 + }, + { + "epoch": 1.953924248340492, + "grad_norm": 4.551779270172119, + "learning_rate": 2.8204103072995036e-06, + "loss": 0.4612, + "step": 15012 + }, + { + "epoch": 1.9543147208121827, + "grad_norm": 3.107567548751831, + "learning_rate": 2.81852966662726e-06, + "loss": 0.5182, + "step": 15015 + }, + { + "epoch": 1.9547051932838735, + "grad_norm": 2.6413309574127197, + "learning_rate": 2.8166494070532958e-06, + "loss": 0.4866, + "step": 15018 + }, + { + "epoch": 1.9550956657555643, + "grad_norm": 2.4750940799713135, + "learning_rate": 2.8147695289060874e-06, + "loss": 0.4426, + "step": 15021 + }, + { + "epoch": 1.955486138227255, + "grad_norm": 3.1785085201263428, + "learning_rate": 
2.8128900325140427e-06, + "loss": 0.473, + "step": 15024 + }, + { + "epoch": 1.9558766106989456, + "grad_norm": 2.49540114402771, + "learning_rate": 2.8110109182055112e-06, + "loss": 0.4889, + "step": 15027 + }, + { + "epoch": 1.9562670831706366, + "grad_norm": 2.723663330078125, + "learning_rate": 2.8091321863087672e-06, + "loss": 0.541, + "step": 15030 + }, + { + "epoch": 1.9566575556423271, + "grad_norm": 3.729959011077881, + "learning_rate": 2.8072538371520208e-06, + "loss": 0.4935, + "step": 15033 + }, + { + "epoch": 1.9570480281140181, + "grad_norm": 2.766430139541626, + "learning_rate": 2.805375871063415e-06, + "loss": 0.5315, + "step": 15036 + }, + { + "epoch": 1.9574385005857087, + "grad_norm": 2.1455156803131104, + "learning_rate": 2.8034982883710293e-06, + "loss": 0.4341, + "step": 15039 + }, + { + "epoch": 1.9578289730573994, + "grad_norm": 2.6431660652160645, + "learning_rate": 2.8016210894028694e-06, + "loss": 0.4833, + "step": 15042 + }, + { + "epoch": 1.9582194455290902, + "grad_norm": 2.4709603786468506, + "learning_rate": 2.799744274486883e-06, + "loss": 0.4393, + "step": 15045 + }, + { + "epoch": 1.958609918000781, + "grad_norm": 2.707580804824829, + "learning_rate": 2.797867843950941e-06, + "loss": 0.4678, + "step": 15048 + }, + { + "epoch": 1.9590003904724718, + "grad_norm": 2.598074197769165, + "learning_rate": 2.795991798122856e-06, + "loss": 0.5941, + "step": 15051 + }, + { + "epoch": 1.9593908629441623, + "grad_norm": 2.8426082134246826, + "learning_rate": 2.794116137330367e-06, + "loss": 0.5295, + "step": 15054 + }, + { + "epoch": 1.9597813354158533, + "grad_norm": 2.351702928543091, + "learning_rate": 2.792240861901147e-06, + "loss": 0.4504, + "step": 15057 + }, + { + "epoch": 1.9601718078875439, + "grad_norm": 2.866417646408081, + "learning_rate": 2.7903659721628063e-06, + "loss": 0.4783, + "step": 15060 + }, + { + "epoch": 1.9605622803592346, + "grad_norm": 2.6855812072753906, + "learning_rate": 2.788491468442881e-06, + "loss": 0.4801, + "step": 15063 + }, + { + "epoch": 1.9609527528309254, + "grad_norm": 2.5768182277679443, + "learning_rate": 2.7866173510688423e-06, + "loss": 0.4532, + "step": 15066 + }, + { + "epoch": 1.9613432253026162, + "grad_norm": 2.8649215698242188, + "learning_rate": 2.7847436203680977e-06, + "loss": 0.5368, + "step": 15069 + }, + { + "epoch": 1.961733697774307, + "grad_norm": 3.515437602996826, + "learning_rate": 2.7828702766679827e-06, + "loss": 0.5493, + "step": 15072 + }, + { + "epoch": 1.9621241702459975, + "grad_norm": 2.8081016540527344, + "learning_rate": 2.780997320295764e-06, + "loss": 0.5247, + "step": 15075 + }, + { + "epoch": 1.9625146427176885, + "grad_norm": 2.8509280681610107, + "learning_rate": 2.7791247515786475e-06, + "loss": 0.5239, + "step": 15078 + }, + { + "epoch": 1.962905115189379, + "grad_norm": 2.9370276927948, + "learning_rate": 2.777252570843765e-06, + "loss": 0.4763, + "step": 15081 + }, + { + "epoch": 1.96329558766107, + "grad_norm": 3.092346429824829, + "learning_rate": 2.77538077841818e-06, + "loss": 0.4743, + "step": 15084 + }, + { + "epoch": 1.9636860601327606, + "grad_norm": 2.7640228271484375, + "learning_rate": 2.7735093746288933e-06, + "loss": 0.5355, + "step": 15087 + }, + { + "epoch": 1.9640765326044514, + "grad_norm": 2.5823822021484375, + "learning_rate": 2.7716383598028367e-06, + "loss": 0.4613, + "step": 15090 + }, + { + "epoch": 1.9644670050761421, + "grad_norm": 2.547419786453247, + "learning_rate": 2.769767734266871e-06, + "loss": 0.5324, + "step": 15093 + }, + { + "epoch": 
1.964857477547833, + "grad_norm": 2.7568259239196777, + "learning_rate": 2.7678974983477907e-06, + "loss": 0.5248, + "step": 15096 + }, + { + "epoch": 1.9652479500195237, + "grad_norm": 2.7360951900482178, + "learning_rate": 2.7660276523723195e-06, + "loss": 0.4737, + "step": 15099 + }, + { + "epoch": 1.9656384224912142, + "grad_norm": 2.4766454696655273, + "learning_rate": 2.7641581966671203e-06, + "loss": 0.5236, + "step": 15102 + }, + { + "epoch": 1.9660288949629052, + "grad_norm": 3.0755808353424072, + "learning_rate": 2.7622891315587803e-06, + "loss": 0.4267, + "step": 15105 + }, + { + "epoch": 1.9664193674345958, + "grad_norm": 2.7649264335632324, + "learning_rate": 2.760420457373819e-06, + "loss": 0.4645, + "step": 15108 + }, + { + "epoch": 1.9668098399062868, + "grad_norm": 2.694270372390747, + "learning_rate": 2.7585521744386954e-06, + "loss": 0.5818, + "step": 15111 + }, + { + "epoch": 1.9672003123779773, + "grad_norm": 2.4713313579559326, + "learning_rate": 2.7566842830797914e-06, + "loss": 0.4033, + "step": 15114 + }, + { + "epoch": 1.967590784849668, + "grad_norm": 2.431579351425171, + "learning_rate": 2.754816783623421e-06, + "loss": 0.3948, + "step": 15117 + }, + { + "epoch": 1.9679812573213589, + "grad_norm": 3.099750518798828, + "learning_rate": 2.7529496763958385e-06, + "loss": 0.4857, + "step": 15120 + }, + { + "epoch": 1.9683717297930496, + "grad_norm": 2.6645519733428955, + "learning_rate": 2.7510829617232197e-06, + "loss": 0.5021, + "step": 15123 + }, + { + "epoch": 1.9687622022647404, + "grad_norm": 3.6451914310455322, + "learning_rate": 2.7492166399316746e-06, + "loss": 0.495, + "step": 15126 + }, + { + "epoch": 1.969152674736431, + "grad_norm": 2.5093352794647217, + "learning_rate": 2.7473507113472477e-06, + "loss": 0.4996, + "step": 15129 + }, + { + "epoch": 1.969543147208122, + "grad_norm": 2.9083104133605957, + "learning_rate": 2.7454851762959146e-06, + "loss": 0.5105, + "step": 15132 + }, + { + "epoch": 1.9699336196798125, + "grad_norm": 2.977738618850708, + "learning_rate": 2.7436200351035784e-06, + "loss": 0.5711, + "step": 15135 + }, + { + "epoch": 1.9703240921515033, + "grad_norm": 2.227489948272705, + "learning_rate": 2.7417552880960736e-06, + "loss": 0.4285, + "step": 15138 + }, + { + "epoch": 1.970714564623194, + "grad_norm": 2.565983295440674, + "learning_rate": 2.739890935599171e-06, + "loss": 0.4802, + "step": 15141 + }, + { + "epoch": 1.9711050370948848, + "grad_norm": 2.6615631580352783, + "learning_rate": 2.738026977938567e-06, + "loss": 0.4581, + "step": 15144 + }, + { + "epoch": 1.9714955095665756, + "grad_norm": 2.5697174072265625, + "learning_rate": 2.736163415439892e-06, + "loss": 0.4529, + "step": 15147 + }, + { + "epoch": 1.9718859820382661, + "grad_norm": 2.6330325603485107, + "learning_rate": 2.734300248428704e-06, + "loss": 0.5237, + "step": 15150 + }, + { + "epoch": 1.9722764545099571, + "grad_norm": 2.766741991043091, + "learning_rate": 2.7324374772304978e-06, + "loss": 0.4946, + "step": 15153 + }, + { + "epoch": 1.9726669269816477, + "grad_norm": 2.515528917312622, + "learning_rate": 2.7305751021706943e-06, + "loss": 0.5103, + "step": 15156 + }, + { + "epoch": 1.9730573994533387, + "grad_norm": 2.4971110820770264, + "learning_rate": 2.7287131235746446e-06, + "loss": 0.4888, + "step": 15159 + }, + { + "epoch": 1.9734478719250292, + "grad_norm": 2.7696313858032227, + "learning_rate": 2.7268515417676354e-06, + "loss": 0.4618, + "step": 15162 + }, + { + "epoch": 1.97383834439672, + "grad_norm": 2.6422767639160156, + "learning_rate": 
2.7249903570748805e-06, + "loss": 0.5001, + "step": 15165 + }, + { + "epoch": 1.9742288168684108, + "grad_norm": 2.5587644577026367, + "learning_rate": 2.7231295698215223e-06, + "loss": 0.4142, + "step": 15168 + }, + { + "epoch": 1.9746192893401016, + "grad_norm": 2.5920650959014893, + "learning_rate": 2.721269180332638e-06, + "loss": 0.4981, + "step": 15171 + }, + { + "epoch": 1.9750097618117923, + "grad_norm": 2.7538814544677734, + "learning_rate": 2.7194091889332364e-06, + "loss": 0.5542, + "step": 15174 + }, + { + "epoch": 1.9754002342834829, + "grad_norm": 2.732511281967163, + "learning_rate": 2.717549595948251e-06, + "loss": 0.4629, + "step": 15177 + }, + { + "epoch": 1.9757907067551739, + "grad_norm": 3.137880325317383, + "learning_rate": 2.7156904017025485e-06, + "loss": 0.4911, + "step": 15180 + }, + { + "epoch": 1.9761811792268644, + "grad_norm": 3.1518633365631104, + "learning_rate": 2.7138316065209298e-06, + "loss": 0.5212, + "step": 15183 + }, + { + "epoch": 1.9765716516985552, + "grad_norm": 2.9920685291290283, + "learning_rate": 2.7119732107281193e-06, + "loss": 0.4845, + "step": 15186 + }, + { + "epoch": 1.976962124170246, + "grad_norm": 2.8355932235717773, + "learning_rate": 2.710115214648775e-06, + "loss": 0.5633, + "step": 15189 + }, + { + "epoch": 1.9773525966419367, + "grad_norm": 3.028092861175537, + "learning_rate": 2.708257618607485e-06, + "loss": 0.5472, + "step": 15192 + }, + { + "epoch": 1.9777430691136275, + "grad_norm": 2.849092483520508, + "learning_rate": 2.7064004229287688e-06, + "loss": 0.5292, + "step": 15195 + }, + { + "epoch": 1.9781335415853183, + "grad_norm": 2.641608238220215, + "learning_rate": 2.704543627937074e-06, + "loss": 0.5607, + "step": 15198 + }, + { + "epoch": 1.978524014057009, + "grad_norm": 2.6991071701049805, + "learning_rate": 2.702687233956777e-06, + "loss": 0.4641, + "step": 15201 + }, + { + "epoch": 1.9789144865286996, + "grad_norm": 2.600306272506714, + "learning_rate": 2.7008312413121886e-06, + "loss": 0.5011, + "step": 15204 + }, + { + "epoch": 1.9793049590003906, + "grad_norm": 2.6777749061584473, + "learning_rate": 2.6989756503275454e-06, + "loss": 0.488, + "step": 15207 + }, + { + "epoch": 1.9796954314720812, + "grad_norm": 2.550184965133667, + "learning_rate": 2.697120461327014e-06, + "loss": 0.5401, + "step": 15210 + }, + { + "epoch": 1.980085903943772, + "grad_norm": 2.9522573947906494, + "learning_rate": 2.6952656746346937e-06, + "loss": 0.5706, + "step": 15213 + }, + { + "epoch": 1.9804763764154627, + "grad_norm": 3.306687355041504, + "learning_rate": 2.6934112905746136e-06, + "loss": 0.5365, + "step": 15216 + }, + { + "epoch": 1.9808668488871535, + "grad_norm": 3.0320816040039062, + "learning_rate": 2.6915573094707282e-06, + "loss": 0.4609, + "step": 15219 + }, + { + "epoch": 1.9812573213588442, + "grad_norm": 2.547999620437622, + "learning_rate": 2.689703731646922e-06, + "loss": 0.5136, + "step": 15222 + }, + { + "epoch": 1.9816477938305348, + "grad_norm": 3.0180954933166504, + "learning_rate": 2.687850557427017e-06, + "loss": 0.4992, + "step": 15225 + }, + { + "epoch": 1.9820382663022258, + "grad_norm": 3.133898973464966, + "learning_rate": 2.685997787134755e-06, + "loss": 0.51, + "step": 15228 + }, + { + "epoch": 1.9824287387739163, + "grad_norm": 2.631911039352417, + "learning_rate": 2.6841454210938095e-06, + "loss": 0.4903, + "step": 15231 + }, + { + "epoch": 1.9828192112456073, + "grad_norm": 4.11314582824707, + "learning_rate": 2.6822934596277893e-06, + "loss": 0.5155, + "step": 15234 + }, + { + "epoch": 
1.9832096837172979, + "grad_norm": 2.362086772918701, + "learning_rate": 2.6804419030602256e-06, + "loss": 0.461, + "step": 15237 + }, + { + "epoch": 1.9836001561889887, + "grad_norm": 2.590529203414917, + "learning_rate": 2.6785907517145825e-06, + "loss": 0.4995, + "step": 15240 + }, + { + "epoch": 1.9839906286606794, + "grad_norm": 2.6236050128936768, + "learning_rate": 2.676740005914249e-06, + "loss": 0.5005, + "step": 15243 + }, + { + "epoch": 1.9843811011323702, + "grad_norm": 2.9158525466918945, + "learning_rate": 2.6748896659825507e-06, + "loss": 0.5008, + "step": 15246 + }, + { + "epoch": 1.984771573604061, + "grad_norm": 2.9477591514587402, + "learning_rate": 2.673039732242737e-06, + "loss": 0.4673, + "step": 15249 + }, + { + "epoch": 1.9851620460757515, + "grad_norm": 4.2003397941589355, + "learning_rate": 2.671190205017985e-06, + "loss": 0.4733, + "step": 15252 + }, + { + "epoch": 1.9855525185474425, + "grad_norm": 2.9860401153564453, + "learning_rate": 2.669341084631405e-06, + "loss": 0.4566, + "step": 15255 + }, + { + "epoch": 1.985942991019133, + "grad_norm": 2.4780070781707764, + "learning_rate": 2.6674923714060365e-06, + "loss": 0.476, + "step": 15258 + }, + { + "epoch": 1.9863334634908238, + "grad_norm": 2.6294336318969727, + "learning_rate": 2.6656440656648434e-06, + "loss": 0.4586, + "step": 15261 + }, + { + "epoch": 1.9867239359625146, + "grad_norm": 2.643993616104126, + "learning_rate": 2.66379616773072e-06, + "loss": 0.4864, + "step": 15264 + }, + { + "epoch": 1.9871144084342054, + "grad_norm": 2.825512647628784, + "learning_rate": 2.6619486779264924e-06, + "loss": 0.5533, + "step": 15267 + }, + { + "epoch": 1.9875048809058962, + "grad_norm": 2.878120183944702, + "learning_rate": 2.6601015965749135e-06, + "loss": 0.4189, + "step": 15270 + }, + { + "epoch": 1.9878953533775867, + "grad_norm": 2.5639851093292236, + "learning_rate": 2.65825492399866e-06, + "loss": 0.5051, + "step": 15273 + }, + { + "epoch": 1.9882858258492777, + "grad_norm": 2.7018072605133057, + "learning_rate": 2.6564086605203478e-06, + "loss": 0.4696, + "step": 15276 + }, + { + "epoch": 1.9886762983209683, + "grad_norm": 2.7019267082214355, + "learning_rate": 2.654562806462512e-06, + "loss": 0.4619, + "step": 15279 + }, + { + "epoch": 1.9890667707926593, + "grad_norm": 2.6082382202148438, + "learning_rate": 2.652717362147618e-06, + "loss": 0.5409, + "step": 15282 + }, + { + "epoch": 1.9894572432643498, + "grad_norm": 2.8006742000579834, + "learning_rate": 2.6508723278980654e-06, + "loss": 0.4565, + "step": 15285 + }, + { + "epoch": 1.9898477157360406, + "grad_norm": 2.544560194015503, + "learning_rate": 2.6490277040361743e-06, + "loss": 0.3934, + "step": 15288 + }, + { + "epoch": 1.9902381882077314, + "grad_norm": 2.632777214050293, + "learning_rate": 2.647183490884198e-06, + "loss": 0.5214, + "step": 15291 + }, + { + "epoch": 1.9906286606794221, + "grad_norm": 2.4380311965942383, + "learning_rate": 2.6453396887643124e-06, + "loss": 0.4517, + "step": 15294 + }, + { + "epoch": 1.991019133151113, + "grad_norm": 2.867617607116699, + "learning_rate": 2.6434962979986334e-06, + "loss": 0.4878, + "step": 15297 + }, + { + "epoch": 1.9914096056228034, + "grad_norm": 2.7441964149475098, + "learning_rate": 2.641653318909194e-06, + "loss": 0.5228, + "step": 15300 + }, + { + "epoch": 1.9918000780944944, + "grad_norm": 2.700723171234131, + "learning_rate": 2.6398107518179584e-06, + "loss": 0.4856, + "step": 15303 + }, + { + "epoch": 1.992190550566185, + "grad_norm": 2.6741185188293457, + "learning_rate": 
2.637968597046818e-06, + "loss": 0.5286, + "step": 15306 + }, + { + "epoch": 1.992581023037876, + "grad_norm": 2.861276388168335, + "learning_rate": 2.6361268549175957e-06, + "loss": 0.5157, + "step": 15309 + }, + { + "epoch": 1.9929714955095665, + "grad_norm": 2.8468337059020996, + "learning_rate": 2.6342855257520393e-06, + "loss": 0.4486, + "step": 15312 + }, + { + "epoch": 1.9933619679812573, + "grad_norm": 2.4954781532287598, + "learning_rate": 2.632444609871824e-06, + "loss": 0.4334, + "step": 15315 + }, + { + "epoch": 1.993752440452948, + "grad_norm": 2.777869939804077, + "learning_rate": 2.630604107598555e-06, + "loss": 0.4058, + "step": 15318 + }, + { + "epoch": 1.9941429129246389, + "grad_norm": 3.0267093181610107, + "learning_rate": 2.6287640192537645e-06, + "loss": 0.5133, + "step": 15321 + }, + { + "epoch": 1.9945333853963296, + "grad_norm": 2.5115318298339844, + "learning_rate": 2.62692434515891e-06, + "loss": 0.4361, + "step": 15324 + }, + { + "epoch": 1.9949238578680202, + "grad_norm": 2.824373245239258, + "learning_rate": 2.6250850856353815e-06, + "loss": 0.4602, + "step": 15327 + }, + { + "epoch": 1.9953143303397112, + "grad_norm": 2.420016050338745, + "learning_rate": 2.6232462410044927e-06, + "loss": 0.4136, + "step": 15330 + }, + { + "epoch": 1.9957048028114017, + "grad_norm": 2.629142999649048, + "learning_rate": 2.6214078115874843e-06, + "loss": 0.5071, + "step": 15333 + }, + { + "epoch": 1.9960952752830925, + "grad_norm": 2.6193673610687256, + "learning_rate": 2.6195697977055262e-06, + "loss": 0.4901, + "step": 15336 + }, + { + "epoch": 1.9964857477547833, + "grad_norm": 3.3041958808898926, + "learning_rate": 2.6177321996797193e-06, + "loss": 0.4341, + "step": 15339 + }, + { + "epoch": 1.996876220226474, + "grad_norm": 2.496450424194336, + "learning_rate": 2.615895017831086e-06, + "loss": 0.4542, + "step": 15342 + }, + { + "epoch": 1.9972666926981648, + "grad_norm": 2.4021811485290527, + "learning_rate": 2.6140582524805746e-06, + "loss": 0.4821, + "step": 15345 + }, + { + "epoch": 1.9976571651698554, + "grad_norm": 2.5634875297546387, + "learning_rate": 2.6122219039490704e-06, + "loss": 0.4816, + "step": 15348 + }, + { + "epoch": 1.9980476376415464, + "grad_norm": 2.7742042541503906, + "learning_rate": 2.6103859725573756e-06, + "loss": 0.5172, + "step": 15351 + }, + { + "epoch": 1.998438110113237, + "grad_norm": 2.705730676651001, + "learning_rate": 2.6085504586262245e-06, + "loss": 0.4998, + "step": 15354 + }, + { + "epoch": 1.998828582584928, + "grad_norm": 2.538831949234009, + "learning_rate": 2.606715362476275e-06, + "loss": 0.474, + "step": 15357 + }, + { + "epoch": 1.9992190550566185, + "grad_norm": 2.5744338035583496, + "learning_rate": 2.6048806844281206e-06, + "loss": 0.5268, + "step": 15360 + }, + { + "epoch": 1.9996095275283092, + "grad_norm": 2.8102900981903076, + "learning_rate": 2.6030464248022704e-06, + "loss": 0.5102, + "step": 15363 + }, + { + "epoch": 2.0, + "grad_norm": 8.22767162322998, + "learning_rate": 2.601212583919166e-06, + "loss": 0.5386, + "step": 15366 + }, + { + "epoch": 2.0003904724716906, + "grad_norm": 2.527303695678711, + "learning_rate": 2.5993791620991783e-06, + "loss": 0.3502, + "step": 15369 + }, + { + "epoch": 2.0007809449433815, + "grad_norm": 2.931882858276367, + "learning_rate": 2.5975461596626016e-06, + "loss": 0.3747, + "step": 15372 + }, + { + "epoch": 2.001171417415072, + "grad_norm": 2.3010025024414062, + "learning_rate": 2.5957135769296543e-06, + "loss": 0.3607, + "step": 15375 + }, + { + "epoch": 2.001561889886763, + 
"grad_norm": 2.2463386058807373, + "learning_rate": 2.5938814142204873e-06, + "loss": 0.3475, + "step": 15378 + }, + { + "epoch": 2.0019523623584536, + "grad_norm": 2.4900364875793457, + "learning_rate": 2.592049671855178e-06, + "loss": 0.3501, + "step": 15381 + }, + { + "epoch": 2.0023428348301446, + "grad_norm": 2.3296518325805664, + "learning_rate": 2.5902183501537247e-06, + "loss": 0.4047, + "step": 15384 + }, + { + "epoch": 2.002733307301835, + "grad_norm": 2.306422710418701, + "learning_rate": 2.5883874494360544e-06, + "loss": 0.372, + "step": 15387 + }, + { + "epoch": 2.003123779773526, + "grad_norm": 2.479424238204956, + "learning_rate": 2.5865569700220257e-06, + "loss": 0.3311, + "step": 15390 + }, + { + "epoch": 2.0035142522452167, + "grad_norm": 2.6242549419403076, + "learning_rate": 2.584726912231417e-06, + "loss": 0.3669, + "step": 15393 + }, + { + "epoch": 2.0039047247169073, + "grad_norm": 2.8145828247070312, + "learning_rate": 2.582897276383933e-06, + "loss": 0.411, + "step": 15396 + }, + { + "epoch": 2.0042951971885983, + "grad_norm": 2.4189159870147705, + "learning_rate": 2.5810680627992134e-06, + "loss": 0.3667, + "step": 15399 + }, + { + "epoch": 2.004685669660289, + "grad_norm": 2.6705586910247803, + "learning_rate": 2.579239271796814e-06, + "loss": 0.3744, + "step": 15402 + }, + { + "epoch": 2.00507614213198, + "grad_norm": 2.6565775871276855, + "learning_rate": 2.5774109036962208e-06, + "loss": 0.3976, + "step": 15405 + }, + { + "epoch": 2.0054666146036704, + "grad_norm": 2.923213005065918, + "learning_rate": 2.5755829588168444e-06, + "loss": 0.3592, + "step": 15408 + }, + { + "epoch": 2.0058570870753614, + "grad_norm": 2.6897075176239014, + "learning_rate": 2.573755437478027e-06, + "loss": 0.3244, + "step": 15411 + }, + { + "epoch": 2.006247559547052, + "grad_norm": 2.722161293029785, + "learning_rate": 2.571928339999031e-06, + "loss": 0.2853, + "step": 15414 + }, + { + "epoch": 2.0066380320187425, + "grad_norm": 2.8695123195648193, + "learning_rate": 2.570101666699044e-06, + "loss": 0.3987, + "step": 15417 + }, + { + "epoch": 2.0070285044904335, + "grad_norm": 2.4492764472961426, + "learning_rate": 2.5682754178971838e-06, + "loss": 0.3593, + "step": 15420 + }, + { + "epoch": 2.007418976962124, + "grad_norm": 2.7777905464172363, + "learning_rate": 2.5664495939124945e-06, + "loss": 0.3572, + "step": 15423 + }, + { + "epoch": 2.007809449433815, + "grad_norm": 2.52078914642334, + "learning_rate": 2.564624195063942e-06, + "loss": 0.346, + "step": 15426 + }, + { + "epoch": 2.0081999219055056, + "grad_norm": 2.5213940143585205, + "learning_rate": 2.5627992216704167e-06, + "loss": 0.3278, + "step": 15429 + }, + { + "epoch": 2.0085903943771966, + "grad_norm": 2.6475985050201416, + "learning_rate": 2.560974674050743e-06, + "loss": 0.3902, + "step": 15432 + }, + { + "epoch": 2.008980866848887, + "grad_norm": 2.8680548667907715, + "learning_rate": 2.5591505525236626e-06, + "loss": 0.3529, + "step": 15435 + }, + { + "epoch": 2.009371339320578, + "grad_norm": 2.852402687072754, + "learning_rate": 2.557326857407844e-06, + "loss": 0.3346, + "step": 15438 + }, + { + "epoch": 2.0097618117922686, + "grad_norm": 2.724487066268921, + "learning_rate": 2.555503589021886e-06, + "loss": 0.3578, + "step": 15441 + }, + { + "epoch": 2.010152284263959, + "grad_norm": 3.298837900161743, + "learning_rate": 2.553680747684309e-06, + "loss": 0.3784, + "step": 15444 + }, + { + "epoch": 2.01054275673565, + "grad_norm": 2.7460174560546875, + "learning_rate": 2.551858333713557e-06, + "loss": 0.3613, + 
"step": 15447 + }, + { + "epoch": 2.0109332292073407, + "grad_norm": 2.6980228424072266, + "learning_rate": 2.5500363474280066e-06, + "loss": 0.3708, + "step": 15450 + }, + { + "epoch": 2.0113237016790317, + "grad_norm": 2.433493137359619, + "learning_rate": 2.548214789145951e-06, + "loss": 0.3527, + "step": 15453 + }, + { + "epoch": 2.0117141741507223, + "grad_norm": 2.826143741607666, + "learning_rate": 2.5463936591856153e-06, + "loss": 0.3262, + "step": 15456 + }, + { + "epoch": 2.0121046466224133, + "grad_norm": 2.4797494411468506, + "learning_rate": 2.5445729578651427e-06, + "loss": 0.3859, + "step": 15459 + }, + { + "epoch": 2.012495119094104, + "grad_norm": 2.5826714038848877, + "learning_rate": 2.5427526855026097e-06, + "loss": 0.3374, + "step": 15462 + }, + { + "epoch": 2.012885591565795, + "grad_norm": 2.6993298530578613, + "learning_rate": 2.540932842416015e-06, + "loss": 0.3859, + "step": 15465 + }, + { + "epoch": 2.0132760640374854, + "grad_norm": 2.468688488006592, + "learning_rate": 2.5391134289232794e-06, + "loss": 0.3522, + "step": 15468 + }, + { + "epoch": 2.013666536509176, + "grad_norm": 3.0440926551818848, + "learning_rate": 2.5372944453422486e-06, + "loss": 0.3925, + "step": 15471 + }, + { + "epoch": 2.014057008980867, + "grad_norm": 2.1184885501861572, + "learning_rate": 2.5354758919906995e-06, + "loss": 0.2867, + "step": 15474 + }, + { + "epoch": 2.0144474814525575, + "grad_norm": 2.9124364852905273, + "learning_rate": 2.5336577691863286e-06, + "loss": 0.3881, + "step": 15477 + }, + { + "epoch": 2.0148379539242485, + "grad_norm": 2.6721181869506836, + "learning_rate": 2.531840077246754e-06, + "loss": 0.3463, + "step": 15480 + }, + { + "epoch": 2.015228426395939, + "grad_norm": 2.512578248977661, + "learning_rate": 2.5300228164895275e-06, + "loss": 0.3355, + "step": 15483 + }, + { + "epoch": 2.01561889886763, + "grad_norm": 2.8622634410858154, + "learning_rate": 2.5282059872321192e-06, + "loss": 0.4173, + "step": 15486 + }, + { + "epoch": 2.0160093713393206, + "grad_norm": 2.445178985595703, + "learning_rate": 2.526389589791923e-06, + "loss": 0.3202, + "step": 15489 + }, + { + "epoch": 2.016399843811011, + "grad_norm": 2.7001988887786865, + "learning_rate": 2.524573624486264e-06, + "loss": 0.3657, + "step": 15492 + }, + { + "epoch": 2.016790316282702, + "grad_norm": 2.5869991779327393, + "learning_rate": 2.5227580916323846e-06, + "loss": 0.4093, + "step": 15495 + }, + { + "epoch": 2.0171807887543927, + "grad_norm": 2.6643974781036377, + "learning_rate": 2.5209429915474536e-06, + "loss": 0.3307, + "step": 15498 + }, + { + "epoch": 2.0175712612260837, + "grad_norm": 2.8480355739593506, + "learning_rate": 2.5191283245485686e-06, + "loss": 0.3815, + "step": 15501 + }, + { + "epoch": 2.017961733697774, + "grad_norm": 2.674461603164673, + "learning_rate": 2.517314090952745e-06, + "loss": 0.3721, + "step": 15504 + }, + { + "epoch": 2.018352206169465, + "grad_norm": 2.5194647312164307, + "learning_rate": 2.515500291076928e-06, + "loss": 0.303, + "step": 15507 + }, + { + "epoch": 2.0187426786411558, + "grad_norm": 2.625236749649048, + "learning_rate": 2.5136869252379825e-06, + "loss": 0.4136, + "step": 15510 + }, + { + "epoch": 2.0191331511128467, + "grad_norm": 2.7869434356689453, + "learning_rate": 2.511873993752702e-06, + "loss": 0.3184, + "step": 15513 + }, + { + "epoch": 2.0195236235845373, + "grad_norm": 2.7091245651245117, + "learning_rate": 2.5100614969378006e-06, + "loss": 0.3589, + "step": 15516 + }, + { + "epoch": 2.019914096056228, + "grad_norm": 
3.2992379665374756, + "learning_rate": 2.508249435109918e-06, + "loss": 0.3377, + "step": 15519 + }, + { + "epoch": 2.020304568527919, + "grad_norm": 2.686725616455078, + "learning_rate": 2.5064378085856146e-06, + "loss": 0.3125, + "step": 15522 + }, + { + "epoch": 2.0206950409996094, + "grad_norm": 2.7225749492645264, + "learning_rate": 2.5046266176813825e-06, + "loss": 0.3388, + "step": 15525 + }, + { + "epoch": 2.0210855134713004, + "grad_norm": 2.5445516109466553, + "learning_rate": 2.5028158627136313e-06, + "loss": 0.3304, + "step": 15528 + }, + { + "epoch": 2.021475985942991, + "grad_norm": 2.5298032760620117, + "learning_rate": 2.5010055439986935e-06, + "loss": 0.359, + "step": 15531 + }, + { + "epoch": 2.021866458414682, + "grad_norm": 2.7906692028045654, + "learning_rate": 2.4991956618528317e-06, + "loss": 0.3837, + "step": 15534 + }, + { + "epoch": 2.0222569308863725, + "grad_norm": 2.5515687465667725, + "learning_rate": 2.4973862165922268e-06, + "loss": 0.3741, + "step": 15537 + }, + { + "epoch": 2.0226474033580635, + "grad_norm": 2.7983710765838623, + "learning_rate": 2.495577208532984e-06, + "loss": 0.4018, + "step": 15540 + }, + { + "epoch": 2.023037875829754, + "grad_norm": 2.931863784790039, + "learning_rate": 2.493768637991135e-06, + "loss": 0.3165, + "step": 15543 + }, + { + "epoch": 2.0234283483014446, + "grad_norm": 2.761305809020996, + "learning_rate": 2.491960505282632e-06, + "loss": 0.3567, + "step": 15546 + }, + { + "epoch": 2.0238188207731356, + "grad_norm": 2.8153398036956787, + "learning_rate": 2.4901528107233535e-06, + "loss": 0.3898, + "step": 15549 + }, + { + "epoch": 2.024209293244826, + "grad_norm": 2.77380633354187, + "learning_rate": 2.4883455546290975e-06, + "loss": 0.3286, + "step": 15552 + }, + { + "epoch": 2.024599765716517, + "grad_norm": 2.8505361080169678, + "learning_rate": 2.486538737315591e-06, + "loss": 0.3429, + "step": 15555 + }, + { + "epoch": 2.0249902381882077, + "grad_norm": 2.722792863845825, + "learning_rate": 2.4847323590984797e-06, + "loss": 0.4518, + "step": 15558 + }, + { + "epoch": 2.0253807106598987, + "grad_norm": 2.7249341011047363, + "learning_rate": 2.482926420293332e-06, + "loss": 0.3427, + "step": 15561 + }, + { + "epoch": 2.025771183131589, + "grad_norm": 2.7627315521240234, + "learning_rate": 2.4811209212156455e-06, + "loss": 0.3595, + "step": 15564 + }, + { + "epoch": 2.0261616556032798, + "grad_norm": 2.6366686820983887, + "learning_rate": 2.479315862180835e-06, + "loss": 0.3352, + "step": 15567 + }, + { + "epoch": 2.0265521280749708, + "grad_norm": 2.850191354751587, + "learning_rate": 2.477511243504241e-06, + "loss": 0.3398, + "step": 15570 + }, + { + "epoch": 2.0269426005466613, + "grad_norm": 3.085858106613159, + "learning_rate": 2.475707065501124e-06, + "loss": 0.3551, + "step": 15573 + }, + { + "epoch": 2.0273330730183523, + "grad_norm": 2.575150728225708, + "learning_rate": 2.473903328486674e-06, + "loss": 0.3393, + "step": 15576 + }, + { + "epoch": 2.027723545490043, + "grad_norm": 2.692265748977661, + "learning_rate": 2.4721000327759988e-06, + "loss": 0.4056, + "step": 15579 + }, + { + "epoch": 2.028114017961734, + "grad_norm": 2.6966402530670166, + "learning_rate": 2.4702971786841278e-06, + "loss": 0.3393, + "step": 15582 + }, + { + "epoch": 2.0285044904334244, + "grad_norm": 2.7873032093048096, + "learning_rate": 2.46849476652602e-06, + "loss": 0.3656, + "step": 15585 + }, + { + "epoch": 2.0288949629051154, + "grad_norm": 2.6634371280670166, + "learning_rate": 2.4666927966165487e-06, + "loss": 0.3544, + "step": 
15588 + }, + { + "epoch": 2.029285435376806, + "grad_norm": 2.635133981704712, + "learning_rate": 2.464891269270519e-06, + "loss": 0.4251, + "step": 15591 + }, + { + "epoch": 2.0296759078484965, + "grad_norm": 2.715653657913208, + "learning_rate": 2.4630901848026494e-06, + "loss": 0.3665, + "step": 15594 + }, + { + "epoch": 2.0300663803201875, + "grad_norm": 4.321242332458496, + "learning_rate": 2.4612895435275896e-06, + "loss": 0.3857, + "step": 15597 + }, + { + "epoch": 2.030456852791878, + "grad_norm": 2.4996755123138428, + "learning_rate": 2.4594893457599056e-06, + "loss": 0.3667, + "step": 15600 + }, + { + "epoch": 2.030847325263569, + "grad_norm": 2.7729170322418213, + "learning_rate": 2.4576895918140866e-06, + "loss": 0.3656, + "step": 15603 + }, + { + "epoch": 2.0312377977352596, + "grad_norm": 2.7078745365142822, + "learning_rate": 2.45589028200455e-06, + "loss": 0.3683, + "step": 15606 + }, + { + "epoch": 2.0316282702069506, + "grad_norm": 2.8833117485046387, + "learning_rate": 2.4540914166456286e-06, + "loss": 0.353, + "step": 15609 + }, + { + "epoch": 2.032018742678641, + "grad_norm": 3.260715961456299, + "learning_rate": 2.452292996051581e-06, + "loss": 0.4125, + "step": 15612 + }, + { + "epoch": 2.0324092151503317, + "grad_norm": 2.7390503883361816, + "learning_rate": 2.450495020536586e-06, + "loss": 0.4213, + "step": 15615 + }, + { + "epoch": 2.0327996876220227, + "grad_norm": 2.833608865737915, + "learning_rate": 2.4486974904147488e-06, + "loss": 0.3744, + "step": 15618 + }, + { + "epoch": 2.0331901600937132, + "grad_norm": 3.231580972671509, + "learning_rate": 2.446900406000093e-06, + "loss": 0.3103, + "step": 15621 + }, + { + "epoch": 2.0335806325654042, + "grad_norm": 2.8441002368927, + "learning_rate": 2.445103767606563e-06, + "loss": 0.3917, + "step": 15624 + }, + { + "epoch": 2.0339711050370948, + "grad_norm": 2.8229074478149414, + "learning_rate": 2.443307575548033e-06, + "loss": 0.345, + "step": 15627 + }, + { + "epoch": 2.0343615775087858, + "grad_norm": 2.594104766845703, + "learning_rate": 2.4415118301382885e-06, + "loss": 0.3214, + "step": 15630 + }, + { + "epoch": 2.0347520499804763, + "grad_norm": 3.496530532836914, + "learning_rate": 2.4397165316910472e-06, + "loss": 0.4241, + "step": 15633 + }, + { + "epoch": 2.0351425224521673, + "grad_norm": 2.7391481399536133, + "learning_rate": 2.4379216805199396e-06, + "loss": 0.3217, + "step": 15636 + }, + { + "epoch": 2.035532994923858, + "grad_norm": 2.622236728668213, + "learning_rate": 2.436127276938526e-06, + "loss": 0.4088, + "step": 15639 + }, + { + "epoch": 2.0359234673955484, + "grad_norm": 2.8432962894439697, + "learning_rate": 2.434333321260285e-06, + "loss": 0.3536, + "step": 15642 + }, + { + "epoch": 2.0363139398672394, + "grad_norm": 2.6063525676727295, + "learning_rate": 2.432539813798612e-06, + "loss": 0.378, + "step": 15645 + }, + { + "epoch": 2.03670441233893, + "grad_norm": 2.529179334640503, + "learning_rate": 2.430746754866835e-06, + "loss": 0.3053, + "step": 15648 + }, + { + "epoch": 2.037094884810621, + "grad_norm": 2.843583583831787, + "learning_rate": 2.428954144778195e-06, + "loss": 0.4077, + "step": 15651 + }, + { + "epoch": 2.0374853572823115, + "grad_norm": 2.8022096157073975, + "learning_rate": 2.4271619838458552e-06, + "loss": 0.3926, + "step": 15654 + }, + { + "epoch": 2.0378758297540025, + "grad_norm": 2.6679294109344482, + "learning_rate": 2.4253702723829066e-06, + "loss": 0.3386, + "step": 15657 + }, + { + "epoch": 2.038266302225693, + "grad_norm": 2.6503212451934814, + 
"learning_rate": 2.423579010702355e-06, + "loss": 0.3481, + "step": 15660 + }, + { + "epoch": 2.038656774697384, + "grad_norm": 2.8369200229644775, + "learning_rate": 2.4217881991171297e-06, + "loss": 0.4236, + "step": 15663 + }, + { + "epoch": 2.0390472471690746, + "grad_norm": 2.750505208969116, + "learning_rate": 2.4199978379400806e-06, + "loss": 0.4346, + "step": 15666 + }, + { + "epoch": 2.039437719640765, + "grad_norm": 2.6629180908203125, + "learning_rate": 2.418207927483984e-06, + "loss": 0.3667, + "step": 15669 + }, + { + "epoch": 2.039828192112456, + "grad_norm": 2.9319400787353516, + "learning_rate": 2.416418468061529e-06, + "loss": 0.4105, + "step": 15672 + }, + { + "epoch": 2.0402186645841467, + "grad_norm": 3.028381109237671, + "learning_rate": 2.4146294599853348e-06, + "loss": 0.3763, + "step": 15675 + }, + { + "epoch": 2.0406091370558377, + "grad_norm": 2.613823890686035, + "learning_rate": 2.412840903567933e-06, + "loss": 0.3848, + "step": 15678 + }, + { + "epoch": 2.0409996095275282, + "grad_norm": 2.842607021331787, + "learning_rate": 2.411052799121784e-06, + "loss": 0.3756, + "step": 15681 + }, + { + "epoch": 2.0413900819992192, + "grad_norm": 2.8494303226470947, + "learning_rate": 2.409265146959265e-06, + "loss": 0.382, + "step": 15684 + }, + { + "epoch": 2.04178055447091, + "grad_norm": 3.151146650314331, + "learning_rate": 2.4074779473926734e-06, + "loss": 0.349, + "step": 15687 + }, + { + "epoch": 2.0421710269426003, + "grad_norm": 2.663558006286621, + "learning_rate": 2.405691200734232e-06, + "loss": 0.4037, + "step": 15690 + }, + { + "epoch": 2.0425614994142913, + "grad_norm": 2.7715821266174316, + "learning_rate": 2.40390490729608e-06, + "loss": 0.333, + "step": 15693 + }, + { + "epoch": 2.042951971885982, + "grad_norm": 2.496654748916626, + "learning_rate": 2.4021190673902777e-06, + "loss": 0.3529, + "step": 15696 + }, + { + "epoch": 2.043342444357673, + "grad_norm": 2.606013059616089, + "learning_rate": 2.4003336813288112e-06, + "loss": 0.3328, + "step": 15699 + }, + { + "epoch": 2.0437329168293634, + "grad_norm": 2.736743211746216, + "learning_rate": 2.3985487494235814e-06, + "loss": 0.3804, + "step": 15702 + }, + { + "epoch": 2.0441233893010544, + "grad_norm": 2.691873550415039, + "learning_rate": 2.396764271986411e-06, + "loss": 0.3932, + "step": 15705 + }, + { + "epoch": 2.044513861772745, + "grad_norm": 3.0015580654144287, + "learning_rate": 2.3949802493290475e-06, + "loss": 0.3425, + "step": 15708 + }, + { + "epoch": 2.044904334244436, + "grad_norm": 3.084564208984375, + "learning_rate": 2.393196681763154e-06, + "loss": 0.3792, + "step": 15711 + }, + { + "epoch": 2.0452948067161265, + "grad_norm": 2.7556662559509277, + "learning_rate": 2.3914135696003144e-06, + "loss": 0.3621, + "step": 15714 + }, + { + "epoch": 2.045685279187817, + "grad_norm": 2.8170785903930664, + "learning_rate": 2.3896309131520367e-06, + "loss": 0.336, + "step": 15717 + }, + { + "epoch": 2.046075751659508, + "grad_norm": 2.6950416564941406, + "learning_rate": 2.387848712729749e-06, + "loss": 0.3685, + "step": 15720 + }, + { + "epoch": 2.0464662241311986, + "grad_norm": 3.019321918487549, + "learning_rate": 2.386066968644796e-06, + "loss": 0.3972, + "step": 15723 + }, + { + "epoch": 2.0468566966028896, + "grad_norm": 2.6395628452301025, + "learning_rate": 2.384285681208445e-06, + "loss": 0.3128, + "step": 15726 + }, + { + "epoch": 2.04724716907458, + "grad_norm": 2.728309392929077, + "learning_rate": 2.3825048507318806e-06, + "loss": 0.3894, + "step": 15729 + }, + { + "epoch": 
2.047637641546271, + "grad_norm": 2.9258289337158203, + "learning_rate": 2.380724477526214e-06, + "loss": 0.4404, + "step": 15732 + }, + { + "epoch": 2.0480281140179617, + "grad_norm": 3.706683397293091, + "learning_rate": 2.3789445619024716e-06, + "loss": 0.3687, + "step": 15735 + }, + { + "epoch": 2.0484185864896527, + "grad_norm": 2.782982349395752, + "learning_rate": 2.3771651041715978e-06, + "loss": 0.3007, + "step": 15738 + }, + { + "epoch": 2.0488090589613432, + "grad_norm": 2.7452354431152344, + "learning_rate": 2.3753861046444647e-06, + "loss": 0.3442, + "step": 15741 + }, + { + "epoch": 2.049199531433034, + "grad_norm": 2.8988213539123535, + "learning_rate": 2.373607563631858e-06, + "loss": 0.3732, + "step": 15744 + }, + { + "epoch": 2.049590003904725, + "grad_norm": 3.2119078636169434, + "learning_rate": 2.371829481444483e-06, + "loss": 0.4084, + "step": 15747 + }, + { + "epoch": 2.0499804763764153, + "grad_norm": 2.665807008743286, + "learning_rate": 2.3700518583929704e-06, + "loss": 0.3438, + "step": 15750 + }, + { + "epoch": 2.0503709488481063, + "grad_norm": 3.3220880031585693, + "learning_rate": 2.3682746947878653e-06, + "loss": 0.3147, + "step": 15753 + }, + { + "epoch": 2.050761421319797, + "grad_norm": 2.919126510620117, + "learning_rate": 2.3664979909396334e-06, + "loss": 0.3434, + "step": 15756 + }, + { + "epoch": 2.051151893791488, + "grad_norm": 2.8159265518188477, + "learning_rate": 2.364721747158662e-06, + "loss": 0.37, + "step": 15759 + }, + { + "epoch": 2.0515423662631784, + "grad_norm": 2.8563246726989746, + "learning_rate": 2.3629459637552593e-06, + "loss": 0.3405, + "step": 15762 + }, + { + "epoch": 2.051932838734869, + "grad_norm": 2.744643449783325, + "learning_rate": 2.3611706410396497e-06, + "loss": 0.4583, + "step": 15765 + }, + { + "epoch": 2.05232331120656, + "grad_norm": 2.9006803035736084, + "learning_rate": 2.3593957793219757e-06, + "loss": 0.4063, + "step": 15768 + }, + { + "epoch": 2.0527137836782505, + "grad_norm": 2.7164108753204346, + "learning_rate": 2.357621378912306e-06, + "loss": 0.3646, + "step": 15771 + }, + { + "epoch": 2.0531042561499415, + "grad_norm": 2.580453395843506, + "learning_rate": 2.3558474401206222e-06, + "loss": 0.3469, + "step": 15774 + }, + { + "epoch": 2.053494728621632, + "grad_norm": 3.763662099838257, + "learning_rate": 2.354073963256829e-06, + "loss": 0.3854, + "step": 15777 + }, + { + "epoch": 2.053885201093323, + "grad_norm": 2.7697255611419678, + "learning_rate": 2.352300948630745e-06, + "loss": 0.294, + "step": 15780 + }, + { + "epoch": 2.0542756735650136, + "grad_norm": 2.634119749069214, + "learning_rate": 2.350528396552118e-06, + "loss": 0.3524, + "step": 15783 + }, + { + "epoch": 2.0546661460367046, + "grad_norm": 2.6129114627838135, + "learning_rate": 2.348756307330607e-06, + "loss": 0.3661, + "step": 15786 + }, + { + "epoch": 2.055056618508395, + "grad_norm": 3.03295636177063, + "learning_rate": 2.3469846812757892e-06, + "loss": 0.3947, + "step": 15789 + }, + { + "epoch": 2.0554470909800857, + "grad_norm": 2.5799500942230225, + "learning_rate": 2.345213518697168e-06, + "loss": 0.3157, + "step": 15792 + }, + { + "epoch": 2.0558375634517767, + "grad_norm": 2.8251993656158447, + "learning_rate": 2.343442819904161e-06, + "loss": 0.3364, + "step": 15795 + }, + { + "epoch": 2.0562280359234673, + "grad_norm": 3.4497342109680176, + "learning_rate": 2.341672585206102e-06, + "loss": 0.3281, + "step": 15798 + }, + { + "epoch": 2.0566185083951583, + "grad_norm": 2.605712652206421, + "learning_rate": 
2.339902814912251e-06, + "loss": 0.4153, + "step": 15801 + }, + { + "epoch": 2.057008980866849, + "grad_norm": 2.881829261779785, + "learning_rate": 2.3381335093317837e-06, + "loss": 0.3554, + "step": 15804 + }, + { + "epoch": 2.05739945333854, + "grad_norm": 2.6509134769439697, + "learning_rate": 2.3363646687737925e-06, + "loss": 0.3419, + "step": 15807 + }, + { + "epoch": 2.0577899258102303, + "grad_norm": 2.6324145793914795, + "learning_rate": 2.3345962935472884e-06, + "loss": 0.3845, + "step": 15810 + }, + { + "epoch": 2.0581803982819213, + "grad_norm": 3.0626988410949707, + "learning_rate": 2.3328283839612063e-06, + "loss": 0.3581, + "step": 15813 + }, + { + "epoch": 2.058570870753612, + "grad_norm": 2.609157085418701, + "learning_rate": 2.331060940324395e-06, + "loss": 0.3613, + "step": 15816 + }, + { + "epoch": 2.0589613432253024, + "grad_norm": 2.7834722995758057, + "learning_rate": 2.3292939629456206e-06, + "loss": 0.418, + "step": 15819 + }, + { + "epoch": 2.0593518156969934, + "grad_norm": 2.971318006515503, + "learning_rate": 2.3275274521335743e-06, + "loss": 0.3108, + "step": 15822 + }, + { + "epoch": 2.059742288168684, + "grad_norm": 3.0066676139831543, + "learning_rate": 2.3257614081968606e-06, + "loss": 0.3441, + "step": 15825 + }, + { + "epoch": 2.060132760640375, + "grad_norm": 2.56901216506958, + "learning_rate": 2.3239958314440027e-06, + "loss": 0.3672, + "step": 15828 + }, + { + "epoch": 2.0605232331120655, + "grad_norm": 2.7101621627807617, + "learning_rate": 2.3222307221834417e-06, + "loss": 0.3722, + "step": 15831 + }, + { + "epoch": 2.0609137055837565, + "grad_norm": 2.768842935562134, + "learning_rate": 2.3204660807235426e-06, + "loss": 0.4064, + "step": 15834 + }, + { + "epoch": 2.061304178055447, + "grad_norm": 2.6292800903320312, + "learning_rate": 2.3187019073725816e-06, + "loss": 0.3984, + "step": 15837 + }, + { + "epoch": 2.0616946505271376, + "grad_norm": 2.5181849002838135, + "learning_rate": 2.3169382024387547e-06, + "loss": 0.3555, + "step": 15840 + }, + { + "epoch": 2.0620851229988286, + "grad_norm": 2.9010989665985107, + "learning_rate": 2.3151749662301803e-06, + "loss": 0.3347, + "step": 15843 + }, + { + "epoch": 2.062475595470519, + "grad_norm": 2.7419636249542236, + "learning_rate": 2.313412199054893e-06, + "loss": 0.3382, + "step": 15846 + }, + { + "epoch": 2.06286606794221, + "grad_norm": 2.6093106269836426, + "learning_rate": 2.3116499012208428e-06, + "loss": 0.3602, + "step": 15849 + }, + { + "epoch": 2.0632565404139007, + "grad_norm": 2.8060364723205566, + "learning_rate": 2.3098880730358968e-06, + "loss": 0.3394, + "step": 15852 + }, + { + "epoch": 2.0636470128855917, + "grad_norm": 2.528196334838867, + "learning_rate": 2.308126714807848e-06, + "loss": 0.3496, + "step": 15855 + }, + { + "epoch": 2.0640374853572823, + "grad_norm": 2.6221911907196045, + "learning_rate": 2.306365826844399e-06, + "loss": 0.3142, + "step": 15858 + }, + { + "epoch": 2.0644279578289733, + "grad_norm": 2.5942671298980713, + "learning_rate": 2.3046054094531715e-06, + "loss": 0.3163, + "step": 15861 + }, + { + "epoch": 2.064818430300664, + "grad_norm": 2.879770040512085, + "learning_rate": 2.3028454629417106e-06, + "loss": 0.3983, + "step": 15864 + }, + { + "epoch": 2.0652089027723544, + "grad_norm": 3.0213136672973633, + "learning_rate": 2.3010859876174734e-06, + "loss": 0.3028, + "step": 15867 + }, + { + "epoch": 2.0655993752440454, + "grad_norm": 2.5107553005218506, + "learning_rate": 2.2993269837878346e-06, + "loss": 0.355, + "step": 15870 + }, + { + "epoch": 
2.065989847715736, + "grad_norm": 2.631068229675293, + "learning_rate": 2.297568451760092e-06, + "loss": 0.387, + "step": 15873 + }, + { + "epoch": 2.066380320187427, + "grad_norm": 3.205779552459717, + "learning_rate": 2.295810391841456e-06, + "loss": 0.3629, + "step": 15876 + }, + { + "epoch": 2.0667707926591175, + "grad_norm": 2.532114028930664, + "learning_rate": 2.294052804339056e-06, + "loss": 0.4009, + "step": 15879 + }, + { + "epoch": 2.0671612651308084, + "grad_norm": 2.673632860183716, + "learning_rate": 2.292295689559934e-06, + "loss": 0.3439, + "step": 15882 + }, + { + "epoch": 2.067551737602499, + "grad_norm": 2.8151535987854004, + "learning_rate": 2.2905390478110635e-06, + "loss": 0.3562, + "step": 15885 + }, + { + "epoch": 2.0679422100741895, + "grad_norm": 2.834294557571411, + "learning_rate": 2.2887828793993212e-06, + "loss": 0.3683, + "step": 15888 + }, + { + "epoch": 2.0683326825458805, + "grad_norm": 2.7955880165100098, + "learning_rate": 2.287027184631506e-06, + "loss": 0.4139, + "step": 15891 + }, + { + "epoch": 2.068723155017571, + "grad_norm": 3.368584394454956, + "learning_rate": 2.285271963814333e-06, + "loss": 0.3226, + "step": 15894 + }, + { + "epoch": 2.069113627489262, + "grad_norm": 2.835724353790283, + "learning_rate": 2.2835172172544384e-06, + "loss": 0.3797, + "step": 15897 + }, + { + "epoch": 2.0695040999609526, + "grad_norm": 3.315255880355835, + "learning_rate": 2.281762945258372e-06, + "loss": 0.379, + "step": 15900 + }, + { + "epoch": 2.0698945724326436, + "grad_norm": 2.753709316253662, + "learning_rate": 2.2800091481325983e-06, + "loss": 0.3426, + "step": 15903 + }, + { + "epoch": 2.070285044904334, + "grad_norm": 3.0424439907073975, + "learning_rate": 2.278255826183506e-06, + "loss": 0.4311, + "step": 15906 + }, + { + "epoch": 2.070675517376025, + "grad_norm": 2.6219685077667236, + "learning_rate": 2.2765029797173954e-06, + "loss": 0.3532, + "step": 15909 + }, + { + "epoch": 2.0710659898477157, + "grad_norm": 3.344362735748291, + "learning_rate": 2.274750609040483e-06, + "loss": 0.3858, + "step": 15912 + }, + { + "epoch": 2.0714564623194063, + "grad_norm": 3.3275110721588135, + "learning_rate": 2.2729987144589083e-06, + "loss": 0.3672, + "step": 15915 + }, + { + "epoch": 2.0718469347910973, + "grad_norm": 2.963124990463257, + "learning_rate": 2.271247296278721e-06, + "loss": 0.3988, + "step": 15918 + }, + { + "epoch": 2.072237407262788, + "grad_norm": 2.8209590911865234, + "learning_rate": 2.2694963548058885e-06, + "loss": 0.3627, + "step": 15921 + }, + { + "epoch": 2.072627879734479, + "grad_norm": 2.65281081199646, + "learning_rate": 2.2677458903462994e-06, + "loss": 0.4011, + "step": 15924 + }, + { + "epoch": 2.0730183522061694, + "grad_norm": 2.652015209197998, + "learning_rate": 2.2659959032057566e-06, + "loss": 0.3114, + "step": 15927 + }, + { + "epoch": 2.0734088246778604, + "grad_norm": 2.65907883644104, + "learning_rate": 2.2642463936899785e-06, + "loss": 0.3698, + "step": 15930 + }, + { + "epoch": 2.073799297149551, + "grad_norm": 2.8742198944091797, + "learning_rate": 2.2624973621045983e-06, + "loss": 0.3786, + "step": 15933 + }, + { + "epoch": 2.074189769621242, + "grad_norm": 2.9052553176879883, + "learning_rate": 2.2607488087551716e-06, + "loss": 0.3868, + "step": 15936 + }, + { + "epoch": 2.0745802420929325, + "grad_norm": 2.704512596130371, + "learning_rate": 2.2590007339471657e-06, + "loss": 0.3433, + "step": 15939 + }, + { + "epoch": 2.074970714564623, + "grad_norm": 2.6483607292175293, + "learning_rate": 2.257253137985966e-06, + 
"loss": 0.3306, + "step": 15942 + }, + { + "epoch": 2.075361187036314, + "grad_norm": 2.474104642868042, + "learning_rate": 2.25550602117687e-06, + "loss": 0.3005, + "step": 15945 + }, + { + "epoch": 2.0757516595080046, + "grad_norm": 2.596907138824463, + "learning_rate": 2.2537593838251016e-06, + "loss": 0.3277, + "step": 15948 + }, + { + "epoch": 2.0761421319796955, + "grad_norm": 2.7480621337890625, + "learning_rate": 2.252013226235791e-06, + "loss": 0.4159, + "step": 15951 + }, + { + "epoch": 2.076532604451386, + "grad_norm": 2.6667375564575195, + "learning_rate": 2.250267548713987e-06, + "loss": 0.3156, + "step": 15954 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 3.392796516418457, + "learning_rate": 2.2485223515646597e-06, + "loss": 0.3844, + "step": 15957 + }, + { + "epoch": 2.0773135493947676, + "grad_norm": 2.91853404045105, + "learning_rate": 2.246777635092689e-06, + "loss": 0.4067, + "step": 15960 + }, + { + "epoch": 2.0777040218664586, + "grad_norm": 2.7192370891571045, + "learning_rate": 2.245033399602872e-06, + "loss": 0.3416, + "step": 15963 + }, + { + "epoch": 2.078094494338149, + "grad_norm": 2.7866413593292236, + "learning_rate": 2.2432896453999243e-06, + "loss": 0.3529, + "step": 15966 + }, + { + "epoch": 2.0784849668098397, + "grad_norm": 2.8930845260620117, + "learning_rate": 2.2415463727884785e-06, + "loss": 0.3483, + "step": 15969 + }, + { + "epoch": 2.0788754392815307, + "grad_norm": 2.707551956176758, + "learning_rate": 2.239803582073078e-06, + "loss": 0.3398, + "step": 15972 + }, + { + "epoch": 2.0792659117532213, + "grad_norm": 2.8100128173828125, + "learning_rate": 2.2380612735581835e-06, + "loss": 0.3944, + "step": 15975 + }, + { + "epoch": 2.0796563842249123, + "grad_norm": 2.5131068229675293, + "learning_rate": 2.236319447548176e-06, + "loss": 0.3561, + "step": 15978 + }, + { + "epoch": 2.080046856696603, + "grad_norm": 2.6995739936828613, + "learning_rate": 2.234578104347347e-06, + "loss": 0.3422, + "step": 15981 + }, + { + "epoch": 2.080437329168294, + "grad_norm": 4.326542854309082, + "learning_rate": 2.2328372442599057e-06, + "loss": 0.3936, + "step": 15984 + }, + { + "epoch": 2.0808278016399844, + "grad_norm": 2.6090240478515625, + "learning_rate": 2.231096867589975e-06, + "loss": 0.3217, + "step": 15987 + }, + { + "epoch": 2.081218274111675, + "grad_norm": 3.206176519393921, + "learning_rate": 2.2293569746415976e-06, + "loss": 0.3001, + "step": 15990 + }, + { + "epoch": 2.081608746583366, + "grad_norm": 2.488536834716797, + "learning_rate": 2.2276175657187288e-06, + "loss": 0.2904, + "step": 15993 + }, + { + "epoch": 2.0819992190550565, + "grad_norm": 2.5979604721069336, + "learning_rate": 2.225878641125237e-06, + "loss": 0.3798, + "step": 15996 + }, + { + "epoch": 2.0823896915267475, + "grad_norm": 2.6461970806121826, + "learning_rate": 2.2241402011649127e-06, + "loss": 0.3123, + "step": 15999 + }, + { + "epoch": 2.082780163998438, + "grad_norm": 2.82248592376709, + "learning_rate": 2.2224022461414553e-06, + "loss": 0.374, + "step": 16002 + }, + { + "epoch": 2.083170636470129, + "grad_norm": 2.730311632156372, + "learning_rate": 2.220664776358481e-06, + "loss": 0.44, + "step": 16005 + }, + { + "epoch": 2.0835611089418196, + "grad_norm": 2.5103518962860107, + "learning_rate": 2.2189277921195228e-06, + "loss": 0.3888, + "step": 16008 + }, + { + "epoch": 2.0839515814135106, + "grad_norm": 2.6272377967834473, + "learning_rate": 2.217191293728031e-06, + "loss": 0.3516, + "step": 16011 + }, + { + "epoch": 2.084342053885201, + "grad_norm": 
2.5898125171661377, + "learning_rate": 2.2154552814873663e-06, + "loss": 0.3519, + "step": 16014 + }, + { + "epoch": 2.0847325263568917, + "grad_norm": 2.7031917572021484, + "learning_rate": 2.213719755700804e-06, + "loss": 0.3611, + "step": 16017 + }, + { + "epoch": 2.0851229988285827, + "grad_norm": 2.8753886222839355, + "learning_rate": 2.21198471667154e-06, + "loss": 0.4155, + "step": 16020 + }, + { + "epoch": 2.085513471300273, + "grad_norm": 2.8421316146850586, + "learning_rate": 2.210250164702682e-06, + "loss": 0.3379, + "step": 16023 + }, + { + "epoch": 2.085903943771964, + "grad_norm": 2.7893972396850586, + "learning_rate": 2.208516100097249e-06, + "loss": 0.3434, + "step": 16026 + }, + { + "epoch": 2.0862944162436547, + "grad_norm": 2.8135573863983154, + "learning_rate": 2.206782523158183e-06, + "loss": 0.3668, + "step": 16029 + }, + { + "epoch": 2.0866848887153457, + "grad_norm": 2.7129807472229004, + "learning_rate": 2.2050494341883344e-06, + "loss": 0.3845, + "step": 16032 + }, + { + "epoch": 2.0870753611870363, + "grad_norm": 3.1642825603485107, + "learning_rate": 2.203316833490469e-06, + "loss": 0.3545, + "step": 16035 + }, + { + "epoch": 2.087465833658727, + "grad_norm": 2.634582757949829, + "learning_rate": 2.2015847213672686e-06, + "loss": 0.3136, + "step": 16038 + }, + { + "epoch": 2.087856306130418, + "grad_norm": 3.1089882850646973, + "learning_rate": 2.1998530981213318e-06, + "loss": 0.3507, + "step": 16041 + }, + { + "epoch": 2.0882467786021084, + "grad_norm": 3.0786855220794678, + "learning_rate": 2.1981219640551683e-06, + "loss": 0.3322, + "step": 16044 + }, + { + "epoch": 2.0886372510737994, + "grad_norm": 2.9572317600250244, + "learning_rate": 2.1963913194712013e-06, + "loss": 0.4171, + "step": 16047 + }, + { + "epoch": 2.08902772354549, + "grad_norm": 3.005173921585083, + "learning_rate": 2.1946611646717726e-06, + "loss": 0.3304, + "step": 16050 + }, + { + "epoch": 2.089418196017181, + "grad_norm": 2.7299466133117676, + "learning_rate": 2.192931499959139e-06, + "loss": 0.3869, + "step": 16053 + }, + { + "epoch": 2.0898086684888715, + "grad_norm": 3.0412538051605225, + "learning_rate": 2.191202325635467e-06, + "loss": 0.3458, + "step": 16056 + }, + { + "epoch": 2.0901991409605625, + "grad_norm": 3.1715352535247803, + "learning_rate": 2.1894736420028383e-06, + "loss": 0.3498, + "step": 16059 + }, + { + "epoch": 2.090589613432253, + "grad_norm": 2.733077049255371, + "learning_rate": 2.1877454493632533e-06, + "loss": 0.3784, + "step": 16062 + }, + { + "epoch": 2.0909800859039436, + "grad_norm": 2.767333745956421, + "learning_rate": 2.1860177480186224e-06, + "loss": 0.3984, + "step": 16065 + }, + { + "epoch": 2.0913705583756346, + "grad_norm": 2.9046363830566406, + "learning_rate": 2.1842905382707695e-06, + "loss": 0.3301, + "step": 16068 + }, + { + "epoch": 2.091761030847325, + "grad_norm": 2.561779737472534, + "learning_rate": 2.182563820421438e-06, + "loss": 0.3179, + "step": 16071 + }, + { + "epoch": 2.092151503319016, + "grad_norm": 2.805349111557007, + "learning_rate": 2.18083759477228e-06, + "loss": 0.4154, + "step": 16074 + }, + { + "epoch": 2.0925419757907067, + "grad_norm": 2.7377660274505615, + "learning_rate": 2.1791118616248615e-06, + "loss": 0.3782, + "step": 16077 + }, + { + "epoch": 2.0929324482623977, + "grad_norm": 2.9340991973876953, + "learning_rate": 2.1773866212806684e-06, + "loss": 0.3573, + "step": 16080 + }, + { + "epoch": 2.093322920734088, + "grad_norm": 3.0035879611968994, + "learning_rate": 2.1756618740410944e-06, + "loss": 0.3522, + 
"step": 16083 + }, + { + "epoch": 2.093713393205779, + "grad_norm": 2.919433832168579, + "learning_rate": 2.1739376202074504e-06, + "loss": 0.4147, + "step": 16086 + }, + { + "epoch": 2.0941038656774698, + "grad_norm": 2.8312833309173584, + "learning_rate": 2.172213860080956e-06, + "loss": 0.3854, + "step": 16089 + }, + { + "epoch": 2.0944943381491603, + "grad_norm": 2.7150704860687256, + "learning_rate": 2.1704905939627523e-06, + "loss": 0.3951, + "step": 16092 + }, + { + "epoch": 2.0948848106208513, + "grad_norm": 2.916109323501587, + "learning_rate": 2.168767822153891e-06, + "loss": 0.3698, + "step": 16095 + }, + { + "epoch": 2.095275283092542, + "grad_norm": 2.8838951587677, + "learning_rate": 2.1670455449553352e-06, + "loss": 0.3452, + "step": 16098 + }, + { + "epoch": 2.095665755564233, + "grad_norm": 2.8266618251800537, + "learning_rate": 2.1653237626679607e-06, + "loss": 0.3314, + "step": 16101 + }, + { + "epoch": 2.0960562280359234, + "grad_norm": 2.667581796646118, + "learning_rate": 2.163602475592564e-06, + "loss": 0.3685, + "step": 16104 + }, + { + "epoch": 2.0964467005076144, + "grad_norm": 2.537261724472046, + "learning_rate": 2.1618816840298474e-06, + "loss": 0.2771, + "step": 16107 + }, + { + "epoch": 2.096837172979305, + "grad_norm": 2.6051278114318848, + "learning_rate": 2.1601613882804283e-06, + "loss": 0.346, + "step": 16110 + }, + { + "epoch": 2.0972276454509955, + "grad_norm": 3.2211477756500244, + "learning_rate": 2.158441588644843e-06, + "loss": 0.3435, + "step": 16113 + }, + { + "epoch": 2.0976181179226865, + "grad_norm": 2.916541576385498, + "learning_rate": 2.1567222854235337e-06, + "loss": 0.3735, + "step": 16116 + }, + { + "epoch": 2.098008590394377, + "grad_norm": 2.6063573360443115, + "learning_rate": 2.1550034789168584e-06, + "loss": 0.3075, + "step": 16119 + }, + { + "epoch": 2.098399062866068, + "grad_norm": 2.746978759765625, + "learning_rate": 2.1532851694250916e-06, + "loss": 0.3371, + "step": 16122 + }, + { + "epoch": 2.0987895353377586, + "grad_norm": 2.748178243637085, + "learning_rate": 2.1515673572484173e-06, + "loss": 0.3474, + "step": 16125 + }, + { + "epoch": 2.0991800078094496, + "grad_norm": 2.8788228034973145, + "learning_rate": 2.1498500426869325e-06, + "loss": 0.3808, + "step": 16128 + }, + { + "epoch": 2.09957048028114, + "grad_norm": 3.1142728328704834, + "learning_rate": 2.1481332260406502e-06, + "loss": 0.4338, + "step": 16131 + }, + { + "epoch": 2.099960952752831, + "grad_norm": 2.57977032661438, + "learning_rate": 2.1464169076094922e-06, + "loss": 0.3976, + "step": 16134 + }, + { + "epoch": 2.1003514252245217, + "grad_norm": 2.79011869430542, + "learning_rate": 2.1447010876932992e-06, + "loss": 0.3907, + "step": 16137 + }, + { + "epoch": 2.1007418976962122, + "grad_norm": 2.7332963943481445, + "learning_rate": 2.142985766591818e-06, + "loss": 0.3246, + "step": 16140 + }, + { + "epoch": 2.101132370167903, + "grad_norm": 3.274784803390503, + "learning_rate": 2.141270944604715e-06, + "loss": 0.3906, + "step": 16143 + }, + { + "epoch": 2.1015228426395938, + "grad_norm": 2.857478618621826, + "learning_rate": 2.139556622031564e-06, + "loss": 0.5127, + "step": 16146 + }, + { + "epoch": 2.1019133151112848, + "grad_norm": 2.596496105194092, + "learning_rate": 2.1378427991718533e-06, + "loss": 0.3371, + "step": 16149 + }, + { + "epoch": 2.1023037875829753, + "grad_norm": 2.747124671936035, + "learning_rate": 2.1361294763249828e-06, + "loss": 0.3566, + "step": 16152 + }, + { + "epoch": 2.1026942600546663, + "grad_norm": 2.936936140060425, + 
"learning_rate": 2.13441665379027e-06, + "loss": 0.3858, + "step": 16155 + }, + { + "epoch": 2.103084732526357, + "grad_norm": 2.8474388122558594, + "learning_rate": 2.1327043318669396e-06, + "loss": 0.3285, + "step": 16158 + }, + { + "epoch": 2.1034752049980474, + "grad_norm": 2.6837317943573, + "learning_rate": 2.130992510854128e-06, + "loss": 0.3799, + "step": 16161 + }, + { + "epoch": 2.1038656774697384, + "grad_norm": 3.10606050491333, + "learning_rate": 2.1292811910508916e-06, + "loss": 0.3795, + "step": 16164 + }, + { + "epoch": 2.104256149941429, + "grad_norm": 2.5549697875976562, + "learning_rate": 2.127570372756192e-06, + "loss": 0.3574, + "step": 16167 + }, + { + "epoch": 2.10464662241312, + "grad_norm": 2.6415367126464844, + "learning_rate": 2.1258600562689035e-06, + "loss": 0.3661, + "step": 16170 + }, + { + "epoch": 2.1050370948848105, + "grad_norm": 2.8995654582977295, + "learning_rate": 2.124150241887819e-06, + "loss": 0.3972, + "step": 16173 + }, + { + "epoch": 2.1054275673565015, + "grad_norm": 2.8920884132385254, + "learning_rate": 2.1224409299116356e-06, + "loss": 0.3544, + "step": 16176 + }, + { + "epoch": 2.105818039828192, + "grad_norm": 2.7235703468322754, + "learning_rate": 2.1207321206389702e-06, + "loss": 0.3626, + "step": 16179 + }, + { + "epoch": 2.106208512299883, + "grad_norm": 2.957944869995117, + "learning_rate": 2.119023814368344e-06, + "loss": 0.3354, + "step": 16182 + }, + { + "epoch": 2.1065989847715736, + "grad_norm": 2.9017412662506104, + "learning_rate": 2.117316011398199e-06, + "loss": 0.4035, + "step": 16185 + }, + { + "epoch": 2.106989457243264, + "grad_norm": 2.5684914588928223, + "learning_rate": 2.115608712026882e-06, + "loss": 0.313, + "step": 16188 + }, + { + "epoch": 2.107379929714955, + "grad_norm": 2.6348636150360107, + "learning_rate": 2.113901916552653e-06, + "loss": 0.3506, + "step": 16191 + }, + { + "epoch": 2.1077704021866457, + "grad_norm": 2.8889946937561035, + "learning_rate": 2.1121956252736903e-06, + "loss": 0.3517, + "step": 16194 + }, + { + "epoch": 2.1081608746583367, + "grad_norm": 3.1232848167419434, + "learning_rate": 2.1104898384880766e-06, + "loss": 0.3806, + "step": 16197 + }, + { + "epoch": 2.1085513471300272, + "grad_norm": 2.5369369983673096, + "learning_rate": 2.10878455649381e-06, + "loss": 0.3079, + "step": 16200 + }, + { + "epoch": 2.1089418196017182, + "grad_norm": 2.765028953552246, + "learning_rate": 2.1070797795887965e-06, + "loss": 0.3596, + "step": 16203 + }, + { + "epoch": 2.1093322920734088, + "grad_norm": 2.585745096206665, + "learning_rate": 2.1053755080708614e-06, + "loss": 0.357, + "step": 16206 + }, + { + "epoch": 2.1097227645450998, + "grad_norm": 3.108854055404663, + "learning_rate": 2.1036717422377364e-06, + "loss": 0.3649, + "step": 16209 + }, + { + "epoch": 2.1101132370167903, + "grad_norm": 2.787053108215332, + "learning_rate": 2.101968482387063e-06, + "loss": 0.378, + "step": 16212 + }, + { + "epoch": 2.110503709488481, + "grad_norm": 2.599722146987915, + "learning_rate": 2.1002657288164002e-06, + "loss": 0.3513, + "step": 16215 + }, + { + "epoch": 2.110894181960172, + "grad_norm": 2.803863763809204, + "learning_rate": 2.0985634818232136e-06, + "loss": 0.3057, + "step": 16218 + }, + { + "epoch": 2.1112846544318624, + "grad_norm": 2.749025344848633, + "learning_rate": 2.096861741704884e-06, + "loss": 0.4025, + "step": 16221 + }, + { + "epoch": 2.1116751269035534, + "grad_norm": 2.9662301540374756, + "learning_rate": 2.0951605087586994e-06, + "loss": 0.4046, + "step": 16224 + }, + { + "epoch": 
2.112065599375244, + "grad_norm": 2.85591721534729, + "learning_rate": 2.0934597832818653e-06, + "loss": 0.3853, + "step": 16227 + }, + { + "epoch": 2.112456071846935, + "grad_norm": 3.045888900756836, + "learning_rate": 2.0917595655714925e-06, + "loss": 0.3448, + "step": 16230 + }, + { + "epoch": 2.1128465443186255, + "grad_norm": 2.8176419734954834, + "learning_rate": 2.0900598559246032e-06, + "loss": 0.312, + "step": 16233 + }, + { + "epoch": 2.1132370167903165, + "grad_norm": 2.6675214767456055, + "learning_rate": 2.0883606546381372e-06, + "loss": 0.3868, + "step": 16236 + }, + { + "epoch": 2.113627489262007, + "grad_norm": 2.7344720363616943, + "learning_rate": 2.08666196200894e-06, + "loss": 0.3555, + "step": 16239 + }, + { + "epoch": 2.1140179617336976, + "grad_norm": 2.40110445022583, + "learning_rate": 2.084963778333768e-06, + "loss": 0.3102, + "step": 16242 + }, + { + "epoch": 2.1144084342053886, + "grad_norm": 2.716338634490967, + "learning_rate": 2.083266103909292e-06, + "loss": 0.3574, + "step": 16245 + }, + { + "epoch": 2.114798906677079, + "grad_norm": 2.801283121109009, + "learning_rate": 2.081568939032093e-06, + "loss": 0.3879, + "step": 16248 + }, + { + "epoch": 2.11518937914877, + "grad_norm": 2.7228639125823975, + "learning_rate": 2.07987228399866e-06, + "loss": 0.3327, + "step": 16251 + }, + { + "epoch": 2.1155798516204607, + "grad_norm": 3.10695743560791, + "learning_rate": 2.0781761391053944e-06, + "loss": 0.4271, + "step": 16254 + }, + { + "epoch": 2.1159703240921517, + "grad_norm": 2.5058023929595947, + "learning_rate": 2.076480504648613e-06, + "loss": 0.3314, + "step": 16257 + }, + { + "epoch": 2.1163607965638422, + "grad_norm": 2.8873941898345947, + "learning_rate": 2.074785380924535e-06, + "loss": 0.4765, + "step": 16260 + }, + { + "epoch": 2.116751269035533, + "grad_norm": 2.7840540409088135, + "learning_rate": 2.073090768229299e-06, + "loss": 0.4192, + "step": 16263 + }, + { + "epoch": 2.117141741507224, + "grad_norm": 2.253389596939087, + "learning_rate": 2.071396666858947e-06, + "loss": 0.3191, + "step": 16266 + }, + { + "epoch": 2.1175322139789143, + "grad_norm": 2.8525044918060303, + "learning_rate": 2.069703077109438e-06, + "loss": 0.3817, + "step": 16269 + }, + { + "epoch": 2.1179226864506053, + "grad_norm": 2.5376346111297607, + "learning_rate": 2.0680099992766366e-06, + "loss": 0.276, + "step": 16272 + }, + { + "epoch": 2.118313158922296, + "grad_norm": 2.743023633956909, + "learning_rate": 2.0663174336563193e-06, + "loss": 0.3222, + "step": 16275 + }, + { + "epoch": 2.118703631393987, + "grad_norm": 2.61458158493042, + "learning_rate": 2.0646253805441757e-06, + "loss": 0.2982, + "step": 16278 + }, + { + "epoch": 2.1190941038656774, + "grad_norm": 3.0593173503875732, + "learning_rate": 2.0629338402358035e-06, + "loss": 0.3798, + "step": 16281 + }, + { + "epoch": 2.1194845763373684, + "grad_norm": 2.7648446559906006, + "learning_rate": 2.0612428130267087e-06, + "loss": 0.4024, + "step": 16284 + }, + { + "epoch": 2.119875048809059, + "grad_norm": 3.026467800140381, + "learning_rate": 2.0595522992123148e-06, + "loss": 0.3453, + "step": 16287 + }, + { + "epoch": 2.1202655212807495, + "grad_norm": 2.49055814743042, + "learning_rate": 2.057862299087947e-06, + "loss": 0.3484, + "step": 16290 + }, + { + "epoch": 2.1206559937524405, + "grad_norm": 2.641812324523926, + "learning_rate": 2.056172812948846e-06, + "loss": 0.351, + "step": 16293 + }, + { + "epoch": 2.121046466224131, + "grad_norm": 2.7587594985961914, + "learning_rate": 2.0544838410901625e-06, + 
"loss": 0.3781, + "step": 16296 + }, + { + "epoch": 2.121436938695822, + "grad_norm": 2.873347759246826, + "learning_rate": 2.052795383806955e-06, + "loss": 0.3831, + "step": 16299 + }, + { + "epoch": 2.1218274111675126, + "grad_norm": 2.9109420776367188, + "learning_rate": 2.0511074413941934e-06, + "loss": 0.4032, + "step": 16302 + }, + { + "epoch": 2.1222178836392036, + "grad_norm": 2.9024267196655273, + "learning_rate": 2.0494200141467576e-06, + "loss": 0.3648, + "step": 16305 + }, + { + "epoch": 2.122608356110894, + "grad_norm": 2.8061764240264893, + "learning_rate": 2.04773310235944e-06, + "loss": 0.3407, + "step": 16308 + }, + { + "epoch": 2.1229988285825847, + "grad_norm": 2.516742706298828, + "learning_rate": 2.0460467063269384e-06, + "loss": 0.3815, + "step": 16311 + }, + { + "epoch": 2.1233893010542757, + "grad_norm": 2.9556503295898438, + "learning_rate": 2.0443608263438635e-06, + "loss": 0.3183, + "step": 16314 + }, + { + "epoch": 2.1237797735259663, + "grad_norm": 2.734612464904785, + "learning_rate": 2.0426754627047328e-06, + "loss": 0.3251, + "step": 16317 + }, + { + "epoch": 2.1241702459976572, + "grad_norm": 2.9752840995788574, + "learning_rate": 2.040990615703979e-06, + "loss": 0.3447, + "step": 16320 + }, + { + "epoch": 2.124560718469348, + "grad_norm": 2.810737371444702, + "learning_rate": 2.0393062856359396e-06, + "loss": 0.3616, + "step": 16323 + }, + { + "epoch": 2.124951190941039, + "grad_norm": 2.919602870941162, + "learning_rate": 2.0376224727948625e-06, + "loss": 0.3873, + "step": 16326 + }, + { + "epoch": 2.1253416634127293, + "grad_norm": 2.8167080879211426, + "learning_rate": 2.035939177474909e-06, + "loss": 0.347, + "step": 16329 + }, + { + "epoch": 2.1257321358844203, + "grad_norm": 3.348229169845581, + "learning_rate": 2.0342563999701454e-06, + "loss": 0.3292, + "step": 16332 + }, + { + "epoch": 2.126122608356111, + "grad_norm": 2.9174301624298096, + "learning_rate": 2.032574140574548e-06, + "loss": 0.338, + "step": 16335 + }, + { + "epoch": 2.1265130808278014, + "grad_norm": 3.1886136531829834, + "learning_rate": 2.0308923995820077e-06, + "loss": 0.3348, + "step": 16338 + }, + { + "epoch": 2.1269035532994924, + "grad_norm": 2.831287145614624, + "learning_rate": 2.0292111772863193e-06, + "loss": 0.419, + "step": 16341 + }, + { + "epoch": 2.127294025771183, + "grad_norm": 2.922455310821533, + "learning_rate": 2.0275304739811864e-06, + "loss": 0.3707, + "step": 16344 + }, + { + "epoch": 2.127684498242874, + "grad_norm": 2.8592848777770996, + "learning_rate": 2.0258502899602266e-06, + "loss": 0.3573, + "step": 16347 + }, + { + "epoch": 2.1280749707145645, + "grad_norm": 3.3974833488464355, + "learning_rate": 2.0241706255169663e-06, + "loss": 0.3741, + "step": 16350 + }, + { + "epoch": 2.1284654431862555, + "grad_norm": 2.918564558029175, + "learning_rate": 2.0224914809448374e-06, + "loss": 0.3557, + "step": 16353 + }, + { + "epoch": 2.128855915657946, + "grad_norm": 2.8875043392181396, + "learning_rate": 2.0208128565371813e-06, + "loss": 0.4248, + "step": 16356 + }, + { + "epoch": 2.129246388129637, + "grad_norm": 2.7835309505462646, + "learning_rate": 2.019134752587254e-06, + "loss": 0.4035, + "step": 16359 + }, + { + "epoch": 2.1296368606013276, + "grad_norm": 2.815490245819092, + "learning_rate": 2.017457169388214e-06, + "loss": 0.3947, + "step": 16362 + }, + { + "epoch": 2.130027333073018, + "grad_norm": 3.1463191509246826, + "learning_rate": 2.0157801072331325e-06, + "loss": 0.3536, + "step": 16365 + }, + { + "epoch": 2.130417805544709, + "grad_norm": 
2.900385618209839, + "learning_rate": 2.0141035664149868e-06, + "loss": 0.3911, + "step": 16368 + }, + { + "epoch": 2.1308082780163997, + "grad_norm": 3.139719009399414, + "learning_rate": 2.0124275472266678e-06, + "loss": 0.3523, + "step": 16371 + }, + { + "epoch": 2.1311987504880907, + "grad_norm": 2.8128786087036133, + "learning_rate": 2.010752049960972e-06, + "loss": 0.4084, + "step": 16374 + }, + { + "epoch": 2.1315892229597813, + "grad_norm": 2.783865213394165, + "learning_rate": 2.0090770749106024e-06, + "loss": 0.3048, + "step": 16377 + }, + { + "epoch": 2.1319796954314723, + "grad_norm": 2.7417304515838623, + "learning_rate": 2.007402622368178e-06, + "loss": 0.3362, + "step": 16380 + }, + { + "epoch": 2.132370167903163, + "grad_norm": 2.974012613296509, + "learning_rate": 2.00572869262622e-06, + "loss": 0.3618, + "step": 16383 + }, + { + "epoch": 2.132760640374854, + "grad_norm": 2.9564473628997803, + "learning_rate": 2.004055285977158e-06, + "loss": 0.3369, + "step": 16386 + }, + { + "epoch": 2.1331511128465444, + "grad_norm": 2.8104441165924072, + "learning_rate": 2.0023824027133356e-06, + "loss": 0.3538, + "step": 16389 + }, + { + "epoch": 2.133541585318235, + "grad_norm": 3.0070455074310303, + "learning_rate": 2.0007100431270027e-06, + "loss": 0.3255, + "step": 16392 + }, + { + "epoch": 2.133932057789926, + "grad_norm": 2.8861727714538574, + "learning_rate": 1.999038207510316e-06, + "loss": 0.3306, + "step": 16395 + }, + { + "epoch": 2.1343225302616164, + "grad_norm": 2.9251303672790527, + "learning_rate": 1.9973668961553394e-06, + "loss": 0.3543, + "step": 16398 + }, + { + "epoch": 2.1347130027333074, + "grad_norm": 2.2964024543762207, + "learning_rate": 1.9956961093540513e-06, + "loss": 0.3386, + "step": 16401 + }, + { + "epoch": 2.135103475204998, + "grad_norm": 2.716637134552002, + "learning_rate": 1.9940258473983326e-06, + "loss": 0.3383, + "step": 16404 + }, + { + "epoch": 2.135493947676689, + "grad_norm": 2.994713068008423, + "learning_rate": 1.992356110579975e-06, + "loss": 0.3323, + "step": 16407 + }, + { + "epoch": 2.1358844201483795, + "grad_norm": 2.87947940826416, + "learning_rate": 1.9906868991906754e-06, + "loss": 0.4167, + "step": 16410 + }, + { + "epoch": 2.13627489262007, + "grad_norm": 2.95758318901062, + "learning_rate": 1.989018213522046e-06, + "loss": 0.3449, + "step": 16413 + }, + { + "epoch": 2.136665365091761, + "grad_norm": 2.653963565826416, + "learning_rate": 1.9873500538656005e-06, + "loss": 0.3985, + "step": 16416 + }, + { + "epoch": 2.1370558375634516, + "grad_norm": 2.6448793411254883, + "learning_rate": 1.985682420512761e-06, + "loss": 0.3981, + "step": 16419 + }, + { + "epoch": 2.1374463100351426, + "grad_norm": 2.5381240844726562, + "learning_rate": 1.9840153137548634e-06, + "loss": 0.2987, + "step": 16422 + }, + { + "epoch": 2.137836782506833, + "grad_norm": 2.6718051433563232, + "learning_rate": 1.982348733883146e-06, + "loss": 0.3729, + "step": 16425 + }, + { + "epoch": 2.138227254978524, + "grad_norm": 3.0153956413269043, + "learning_rate": 1.980682681188754e-06, + "loss": 0.416, + "step": 16428 + }, + { + "epoch": 2.1386177274502147, + "grad_norm": 3.0083117485046387, + "learning_rate": 1.979017155962747e-06, + "loss": 0.382, + "step": 16431 + }, + { + "epoch": 2.1390081999219053, + "grad_norm": 2.849533796310425, + "learning_rate": 1.9773521584960888e-06, + "loss": 0.3712, + "step": 16434 + }, + { + "epoch": 2.1393986723935963, + "grad_norm": 2.806645154953003, + "learning_rate": 1.9756876890796496e-06, + "loss": 0.3822, + "step": 16437 
+ }, + { + "epoch": 2.139789144865287, + "grad_norm": 2.8463988304138184, + "learning_rate": 1.9740237480042075e-06, + "loss": 0.357, + "step": 16440 + }, + { + "epoch": 2.140179617336978, + "grad_norm": 3.0570833683013916, + "learning_rate": 1.9723603355604526e-06, + "loss": 0.3629, + "step": 16443 + }, + { + "epoch": 2.1405700898086684, + "grad_norm": 2.503617286682129, + "learning_rate": 1.9706974520389776e-06, + "loss": 0.3127, + "step": 16446 + }, + { + "epoch": 2.1409605622803594, + "grad_norm": 3.2526721954345703, + "learning_rate": 1.9690350977302837e-06, + "loss": 0.3756, + "step": 16449 + }, + { + "epoch": 2.14135103475205, + "grad_norm": 2.4304678440093994, + "learning_rate": 1.967373272924783e-06, + "loss": 0.2854, + "step": 16452 + }, + { + "epoch": 2.141741507223741, + "grad_norm": 2.5894525051116943, + "learning_rate": 1.9657119779127926e-06, + "loss": 0.3817, + "step": 16455 + }, + { + "epoch": 2.1421319796954315, + "grad_norm": 2.8062849044799805, + "learning_rate": 1.9640512129845365e-06, + "loss": 0.3606, + "step": 16458 + }, + { + "epoch": 2.142522452167122, + "grad_norm": 2.921401262283325, + "learning_rate": 1.9623909784301442e-06, + "loss": 0.4018, + "step": 16461 + }, + { + "epoch": 2.142912924638813, + "grad_norm": 2.643878698348999, + "learning_rate": 1.9607312745396602e-06, + "loss": 0.3517, + "step": 16464 + }, + { + "epoch": 2.1433033971105035, + "grad_norm": 2.543466806411743, + "learning_rate": 1.9590721016030285e-06, + "loss": 0.3569, + "step": 16467 + }, + { + "epoch": 2.1436938695821945, + "grad_norm": 2.8661959171295166, + "learning_rate": 1.957413459910102e-06, + "loss": 0.3746, + "step": 16470 + }, + { + "epoch": 2.144084342053885, + "grad_norm": 2.730772018432617, + "learning_rate": 1.9557553497506432e-06, + "loss": 0.2907, + "step": 16473 + }, + { + "epoch": 2.144474814525576, + "grad_norm": 2.7955386638641357, + "learning_rate": 1.954097771414322e-06, + "loss": 0.3574, + "step": 16476 + }, + { + "epoch": 2.1448652869972666, + "grad_norm": 2.8942129611968994, + "learning_rate": 1.952440725190713e-06, + "loss": 0.3566, + "step": 16479 + }, + { + "epoch": 2.1452557594689576, + "grad_norm": 2.645179510116577, + "learning_rate": 1.9507842113692967e-06, + "loss": 0.3757, + "step": 16482 + }, + { + "epoch": 2.145646231940648, + "grad_norm": 2.6048803329467773, + "learning_rate": 1.9491282302394653e-06, + "loss": 0.3349, + "step": 16485 + }, + { + "epoch": 2.1460367044123387, + "grad_norm": 3.326573371887207, + "learning_rate": 1.947472782090514e-06, + "loss": 0.3798, + "step": 16488 + }, + { + "epoch": 2.1464271768840297, + "grad_norm": 2.8624017238616943, + "learning_rate": 1.9458178672116445e-06, + "loss": 0.3991, + "step": 16491 + }, + { + "epoch": 2.1468176493557203, + "grad_norm": 2.8629748821258545, + "learning_rate": 1.9441634858919705e-06, + "loss": 0.3984, + "step": 16494 + }, + { + "epoch": 2.1472081218274113, + "grad_norm": 2.9364376068115234, + "learning_rate": 1.9425096384205066e-06, + "loss": 0.3873, + "step": 16497 + }, + { + "epoch": 2.147598594299102, + "grad_norm": 2.7493503093719482, + "learning_rate": 1.9408563250861756e-06, + "loss": 0.3023, + "step": 16500 + }, + { + "epoch": 2.147989066770793, + "grad_norm": 2.7298777103424072, + "learning_rate": 1.9392035461778104e-06, + "loss": 0.3675, + "step": 16503 + }, + { + "epoch": 2.1483795392424834, + "grad_norm": 3.0014255046844482, + "learning_rate": 1.937551301984147e-06, + "loss": 0.3866, + "step": 16506 + }, + { + "epoch": 2.1487700117141744, + "grad_norm": 2.657407522201538, + 
"learning_rate": 1.9358995927938284e-06, + "loss": 0.4154, + "step": 16509 + }, + { + "epoch": 2.149160484185865, + "grad_norm": 2.729097843170166, + "learning_rate": 1.934248418895401e-06, + "loss": 0.3707, + "step": 16512 + }, + { + "epoch": 2.1495509566575555, + "grad_norm": 3.432372808456421, + "learning_rate": 1.93259778057733e-06, + "loss": 0.3662, + "step": 16515 + }, + { + "epoch": 2.1499414291292465, + "grad_norm": 2.9015185832977295, + "learning_rate": 1.9309476781279735e-06, + "loss": 0.3547, + "step": 16518 + }, + { + "epoch": 2.150331901600937, + "grad_norm": 2.588021993637085, + "learning_rate": 1.9292981118356013e-06, + "loss": 0.3424, + "step": 16521 + }, + { + "epoch": 2.150722374072628, + "grad_norm": 2.5547730922698975, + "learning_rate": 1.927649081988387e-06, + "loss": 0.357, + "step": 16524 + }, + { + "epoch": 2.1511128465443186, + "grad_norm": 2.5534119606018066, + "learning_rate": 1.926000588874417e-06, + "loss": 0.3456, + "step": 16527 + }, + { + "epoch": 2.1515033190160096, + "grad_norm": 3.0660245418548584, + "learning_rate": 1.924352632781677e-06, + "loss": 0.3396, + "step": 16530 + }, + { + "epoch": 2.1518937914877, + "grad_norm": 2.4878122806549072, + "learning_rate": 1.9227052139980606e-06, + "loss": 0.3132, + "step": 16533 + }, + { + "epoch": 2.152284263959391, + "grad_norm": 2.8623673915863037, + "learning_rate": 1.921058332811371e-06, + "loss": 0.3601, + "step": 16536 + }, + { + "epoch": 2.1526747364310816, + "grad_norm": 2.5879554748535156, + "learning_rate": 1.9194119895093137e-06, + "loss": 0.3428, + "step": 16539 + }, + { + "epoch": 2.153065208902772, + "grad_norm": 2.7629878520965576, + "learning_rate": 1.9177661843794994e-06, + "loss": 0.367, + "step": 16542 + }, + { + "epoch": 2.153455681374463, + "grad_norm": 2.715153694152832, + "learning_rate": 1.9161209177094504e-06, + "loss": 0.3238, + "step": 16545 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 3.1860060691833496, + "learning_rate": 1.9144761897865895e-06, + "loss": 0.3534, + "step": 16548 + }, + { + "epoch": 2.1542366263178447, + "grad_norm": 2.8940200805664062, + "learning_rate": 1.9128320008982456e-06, + "loss": 0.3401, + "step": 16551 + }, + { + "epoch": 2.1546270987895353, + "grad_norm": 2.862811326980591, + "learning_rate": 1.9111883513316564e-06, + "loss": 0.3994, + "step": 16554 + }, + { + "epoch": 2.1550175712612263, + "grad_norm": 2.693480968475342, + "learning_rate": 1.909545241373966e-06, + "loss": 0.3594, + "step": 16557 + }, + { + "epoch": 2.155408043732917, + "grad_norm": 2.9128196239471436, + "learning_rate": 1.9079026713122206e-06, + "loss": 0.3923, + "step": 16560 + }, + { + "epoch": 2.1557985162046074, + "grad_norm": 2.5476996898651123, + "learning_rate": 1.9062606414333723e-06, + "loss": 0.3357, + "step": 16563 + }, + { + "epoch": 2.1561889886762984, + "grad_norm": 2.8649590015411377, + "learning_rate": 1.9046191520242835e-06, + "loss": 0.3318, + "step": 16566 + }, + { + "epoch": 2.156579461147989, + "grad_norm": 3.019973039627075, + "learning_rate": 1.902978203371717e-06, + "loss": 0.3477, + "step": 16569 + }, + { + "epoch": 2.15696993361968, + "grad_norm": 2.824190855026245, + "learning_rate": 1.901337795762343e-06, + "loss": 0.3407, + "step": 16572 + }, + { + "epoch": 2.1573604060913705, + "grad_norm": 2.4870200157165527, + "learning_rate": 1.8996979294827355e-06, + "loss": 0.3377, + "step": 16575 + }, + { + "epoch": 2.1577508785630615, + "grad_norm": 2.9998185634613037, + "learning_rate": 1.89805860481938e-06, + "loss": 0.3986, + "step": 16578 + }, + { + 
"epoch": 2.158141351034752, + "grad_norm": 2.575596809387207, + "learning_rate": 1.8964198220586599e-06, + "loss": 0.3309, + "step": 16581 + }, + { + "epoch": 2.1585318235064426, + "grad_norm": 3.0732362270355225, + "learning_rate": 1.894781581486867e-06, + "loss": 0.3105, + "step": 16584 + }, + { + "epoch": 2.1589222959781336, + "grad_norm": 2.591250419616699, + "learning_rate": 1.8931438833902005e-06, + "loss": 0.3741, + "step": 16587 + }, + { + "epoch": 2.159312768449824, + "grad_norm": 2.6407053470611572, + "learning_rate": 1.8915067280547622e-06, + "loss": 0.3275, + "step": 16590 + }, + { + "epoch": 2.159703240921515, + "grad_norm": 3.031137704849243, + "learning_rate": 1.8898701157665572e-06, + "loss": 0.36, + "step": 16593 + }, + { + "epoch": 2.1600937133932057, + "grad_norm": 2.810857057571411, + "learning_rate": 1.8882340468115002e-06, + "loss": 0.3636, + "step": 16596 + }, + { + "epoch": 2.1604841858648967, + "grad_norm": 3.037799119949341, + "learning_rate": 1.8865985214754107e-06, + "loss": 0.3521, + "step": 16599 + }, + { + "epoch": 2.160874658336587, + "grad_norm": 3.4607412815093994, + "learning_rate": 1.8849635400440098e-06, + "loss": 0.3723, + "step": 16602 + }, + { + "epoch": 2.161265130808278, + "grad_norm": 2.7808799743652344, + "learning_rate": 1.8833291028029239e-06, + "loss": 0.3518, + "step": 16605 + }, + { + "epoch": 2.1616556032799688, + "grad_norm": 2.7576420307159424, + "learning_rate": 1.881695210037689e-06, + "loss": 0.3491, + "step": 16608 + }, + { + "epoch": 2.1620460757516593, + "grad_norm": 2.637984037399292, + "learning_rate": 1.8800618620337407e-06, + "loss": 0.3452, + "step": 16611 + }, + { + "epoch": 2.1624365482233503, + "grad_norm": 2.8887691497802734, + "learning_rate": 1.8784290590764199e-06, + "loss": 0.3166, + "step": 16614 + }, + { + "epoch": 2.162827020695041, + "grad_norm": 2.620800495147705, + "learning_rate": 1.8767968014509774e-06, + "loss": 0.3097, + "step": 16617 + }, + { + "epoch": 2.163217493166732, + "grad_norm": 2.542872190475464, + "learning_rate": 1.8751650894425639e-06, + "loss": 0.326, + "step": 16620 + }, + { + "epoch": 2.1636079656384224, + "grad_norm": 3.511993169784546, + "learning_rate": 1.8735339233362355e-06, + "loss": 0.39, + "step": 16623 + }, + { + "epoch": 2.1639984381101134, + "grad_norm": 2.7915451526641846, + "learning_rate": 1.8719033034169514e-06, + "loss": 0.4285, + "step": 16626 + }, + { + "epoch": 2.164388910581804, + "grad_norm": 2.7394583225250244, + "learning_rate": 1.8702732299695813e-06, + "loss": 0.3401, + "step": 16629 + }, + { + "epoch": 2.164779383053495, + "grad_norm": 2.7059719562530518, + "learning_rate": 1.8686437032788945e-06, + "loss": 0.3685, + "step": 16632 + }, + { + "epoch": 2.1651698555251855, + "grad_norm": 2.651163339614868, + "learning_rate": 1.8670147236295632e-06, + "loss": 0.3437, + "step": 16635 + }, + { + "epoch": 2.165560327996876, + "grad_norm": 2.559772253036499, + "learning_rate": 1.8653862913061693e-06, + "loss": 0.2954, + "step": 16638 + }, + { + "epoch": 2.165950800468567, + "grad_norm": 2.9166457653045654, + "learning_rate": 1.8637584065931974e-06, + "loss": 0.2884, + "step": 16641 + }, + { + "epoch": 2.1663412729402576, + "grad_norm": 2.951847791671753, + "learning_rate": 1.862131069775034e-06, + "loss": 0.4385, + "step": 16644 + }, + { + "epoch": 2.1667317454119486, + "grad_norm": 2.7294256687164307, + "learning_rate": 1.8605042811359702e-06, + "loss": 0.3622, + "step": 16647 + }, + { + "epoch": 2.167122217883639, + "grad_norm": 2.931398630142212, + "learning_rate": 
1.8588780409602053e-06, + "loss": 0.3915, + "step": 16650 + }, + { + "epoch": 2.16751269035533, + "grad_norm": 3.169494152069092, + "learning_rate": 1.8572523495318389e-06, + "loss": 0.3643, + "step": 16653 + }, + { + "epoch": 2.1679031628270207, + "grad_norm": 3.00234055519104, + "learning_rate": 1.855627207134874e-06, + "loss": 0.3221, + "step": 16656 + }, + { + "epoch": 2.1682936352987117, + "grad_norm": 2.8568825721740723, + "learning_rate": 1.854002614053223e-06, + "loss": 0.4029, + "step": 16659 + }, + { + "epoch": 2.168684107770402, + "grad_norm": 2.6517200469970703, + "learning_rate": 1.852378570570697e-06, + "loss": 0.3926, + "step": 16662 + }, + { + "epoch": 2.1690745802420928, + "grad_norm": 2.9220314025878906, + "learning_rate": 1.8507550769710115e-06, + "loss": 0.3665, + "step": 16665 + }, + { + "epoch": 2.1694650527137838, + "grad_norm": 2.624634265899658, + "learning_rate": 1.849132133537791e-06, + "loss": 0.3212, + "step": 16668 + }, + { + "epoch": 2.1698555251854743, + "grad_norm": 2.7517733573913574, + "learning_rate": 1.8475097405545578e-06, + "loss": 0.3628, + "step": 16671 + }, + { + "epoch": 2.1702459976571653, + "grad_norm": 2.7567718029022217, + "learning_rate": 1.8458878983047412e-06, + "loss": 0.3733, + "step": 16674 + }, + { + "epoch": 2.170636470128856, + "grad_norm": 2.902865171432495, + "learning_rate": 1.8442666070716719e-06, + "loss": 0.3259, + "step": 16677 + }, + { + "epoch": 2.171026942600547, + "grad_norm": 3.230361223220825, + "learning_rate": 1.842645867138587e-06, + "loss": 0.3797, + "step": 16680 + }, + { + "epoch": 2.1714174150722374, + "grad_norm": 3.068331003189087, + "learning_rate": 1.8410256787886298e-06, + "loss": 0.4157, + "step": 16683 + }, + { + "epoch": 2.171807887543928, + "grad_norm": 3.102295160293579, + "learning_rate": 1.8394060423048404e-06, + "loss": 0.3912, + "step": 16686 + }, + { + "epoch": 2.172198360015619, + "grad_norm": 2.4631145000457764, + "learning_rate": 1.8377869579701647e-06, + "loss": 0.3233, + "step": 16689 + }, + { + "epoch": 2.1725888324873095, + "grad_norm": 3.171217679977417, + "learning_rate": 1.8361684260674572e-06, + "loss": 0.3941, + "step": 16692 + }, + { + "epoch": 2.1729793049590005, + "grad_norm": 3.546006679534912, + "learning_rate": 1.8345504468794694e-06, + "loss": 0.3908, + "step": 16695 + }, + { + "epoch": 2.173369777430691, + "grad_norm": 2.745622158050537, + "learning_rate": 1.832933020688858e-06, + "loss": 0.3229, + "step": 16698 + }, + { + "epoch": 2.173760249902382, + "grad_norm": 2.7853684425354004, + "learning_rate": 1.8313161477781871e-06, + "loss": 0.3425, + "step": 16701 + }, + { + "epoch": 2.1741507223740726, + "grad_norm": 2.7178430557250977, + "learning_rate": 1.8296998284299195e-06, + "loss": 0.4037, + "step": 16704 + }, + { + "epoch": 2.1745411948457636, + "grad_norm": 2.7298033237457275, + "learning_rate": 1.8280840629264202e-06, + "loss": 0.3505, + "step": 16707 + }, + { + "epoch": 2.174931667317454, + "grad_norm": 2.750066041946411, + "learning_rate": 1.8264688515499645e-06, + "loss": 0.3359, + "step": 16710 + }, + { + "epoch": 2.1753221397891447, + "grad_norm": 3.090998888015747, + "learning_rate": 1.824854194582724e-06, + "loss": 0.3278, + "step": 16713 + }, + { + "epoch": 2.1757126122608357, + "grad_norm": 2.6128833293914795, + "learning_rate": 1.823240092306775e-06, + "loss": 0.3463, + "step": 16716 + }, + { + "epoch": 2.1761030847325262, + "grad_norm": 2.647390604019165, + "learning_rate": 1.8216265450041004e-06, + "loss": 0.3686, + "step": 16719 + }, + { + "epoch": 
2.176493557204217, + "grad_norm": 2.6248843669891357, + "learning_rate": 1.8200135529565805e-06, + "loss": 0.3933, + "step": 16722 + }, + { + "epoch": 2.1768840296759078, + "grad_norm": 2.8739254474639893, + "learning_rate": 1.8184011164460046e-06, + "loss": 0.394, + "step": 16725 + }, + { + "epoch": 2.1772745021475988, + "grad_norm": 2.8517467975616455, + "learning_rate": 1.816789235754059e-06, + "loss": 0.3486, + "step": 16728 + }, + { + "epoch": 2.1776649746192893, + "grad_norm": 2.5461232662200928, + "learning_rate": 1.8151779111623392e-06, + "loss": 0.3518, + "step": 16731 + }, + { + "epoch": 2.17805544709098, + "grad_norm": 2.520355463027954, + "learning_rate": 1.8135671429523377e-06, + "loss": 0.3742, + "step": 16734 + }, + { + "epoch": 2.178445919562671, + "grad_norm": 2.9696972370147705, + "learning_rate": 1.811956931405454e-06, + "loss": 0.3798, + "step": 16737 + }, + { + "epoch": 2.1788363920343614, + "grad_norm": 2.75524640083313, + "learning_rate": 1.8103472768029856e-06, + "loss": 0.329, + "step": 16740 + }, + { + "epoch": 2.1792268645060524, + "grad_norm": 2.857969284057617, + "learning_rate": 1.8087381794261394e-06, + "loss": 0.3514, + "step": 16743 + }, + { + "epoch": 2.179617336977743, + "grad_norm": 3.1860451698303223, + "learning_rate": 1.80712963955602e-06, + "loss": 0.3085, + "step": 16746 + }, + { + "epoch": 2.180007809449434, + "grad_norm": 3.36942458152771, + "learning_rate": 1.8055216574736346e-06, + "loss": 0.3323, + "step": 16749 + }, + { + "epoch": 2.1803982819211245, + "grad_norm": 3.2723255157470703, + "learning_rate": 1.8039142334598964e-06, + "loss": 0.4076, + "step": 16752 + }, + { + "epoch": 2.1807887543928155, + "grad_norm": 2.8995213508605957, + "learning_rate": 1.8023073677956183e-06, + "loss": 0.3193, + "step": 16755 + }, + { + "epoch": 2.181179226864506, + "grad_norm": 3.000784397125244, + "learning_rate": 1.8007010607615144e-06, + "loss": 0.4389, + "step": 16758 + }, + { + "epoch": 2.1815696993361966, + "grad_norm": 2.861037492752075, + "learning_rate": 1.7990953126382065e-06, + "loss": 0.3881, + "step": 16761 + }, + { + "epoch": 2.1819601718078876, + "grad_norm": 2.6603360176086426, + "learning_rate": 1.797490123706212e-06, + "loss": 0.3744, + "step": 16764 + }, + { + "epoch": 2.182350644279578, + "grad_norm": 2.7340359687805176, + "learning_rate": 1.795885494245958e-06, + "loss": 0.3644, + "step": 16767 + }, + { + "epoch": 2.182741116751269, + "grad_norm": 2.9198856353759766, + "learning_rate": 1.7942814245377654e-06, + "loss": 0.3499, + "step": 16770 + }, + { + "epoch": 2.1831315892229597, + "grad_norm": 3.0503599643707275, + "learning_rate": 1.7926779148618661e-06, + "loss": 0.3224, + "step": 16773 + }, + { + "epoch": 2.1835220616946507, + "grad_norm": 2.8313817977905273, + "learning_rate": 1.7910749654983879e-06, + "loss": 0.3579, + "step": 16776 + }, + { + "epoch": 2.1839125341663412, + "grad_norm": 2.788543224334717, + "learning_rate": 1.7894725767273601e-06, + "loss": 0.3443, + "step": 16779 + }, + { + "epoch": 2.1843030066380322, + "grad_norm": 2.825108289718628, + "learning_rate": 1.7878707488287216e-06, + "loss": 0.3615, + "step": 16782 + }, + { + "epoch": 2.184693479109723, + "grad_norm": 2.4772067070007324, + "learning_rate": 1.7862694820823062e-06, + "loss": 0.3303, + "step": 16785 + }, + { + "epoch": 2.1850839515814133, + "grad_norm": 2.8775320053100586, + "learning_rate": 1.784668776767851e-06, + "loss": 0.3626, + "step": 16788 + }, + { + "epoch": 2.1854744240531043, + "grad_norm": 2.8523988723754883, + "learning_rate": 
1.783068633164995e-06, + "loss": 0.2922, + "step": 16791 + }, + { + "epoch": 2.185864896524795, + "grad_norm": 2.919550657272339, + "learning_rate": 1.7814690515532828e-06, + "loss": 0.3725, + "step": 16794 + }, + { + "epoch": 2.186255368996486, + "grad_norm": 2.9476890563964844, + "learning_rate": 1.779870032212157e-06, + "loss": 0.3546, + "step": 16797 + }, + { + "epoch": 2.1866458414681764, + "grad_norm": 2.7363481521606445, + "learning_rate": 1.7782715754209607e-06, + "loss": 0.4004, + "step": 16800 + }, + { + "epoch": 2.1870363139398674, + "grad_norm": 2.9156899452209473, + "learning_rate": 1.776673681458944e-06, + "loss": 0.3733, + "step": 16803 + }, + { + "epoch": 2.187426786411558, + "grad_norm": 2.7484629154205322, + "learning_rate": 1.7750763506052526e-06, + "loss": 0.3773, + "step": 16806 + }, + { + "epoch": 2.187817258883249, + "grad_norm": 2.7555909156799316, + "learning_rate": 1.77347958313894e-06, + "loss": 0.3394, + "step": 16809 + }, + { + "epoch": 2.1882077313549395, + "grad_norm": 2.8585703372955322, + "learning_rate": 1.7718833793389556e-06, + "loss": 0.349, + "step": 16812 + }, + { + "epoch": 2.18859820382663, + "grad_norm": 2.7602007389068604, + "learning_rate": 1.770287739484155e-06, + "loss": 0.3731, + "step": 16815 + }, + { + "epoch": 2.188988676298321, + "grad_norm": 2.4553885459899902, + "learning_rate": 1.768692663853292e-06, + "loss": 0.3372, + "step": 16818 + }, + { + "epoch": 2.1893791487700116, + "grad_norm": 2.777724504470825, + "learning_rate": 1.7670981527250213e-06, + "loss": 0.4093, + "step": 16821 + }, + { + "epoch": 2.1897696212417026, + "grad_norm": 2.8050687313079834, + "learning_rate": 1.7655042063779043e-06, + "loss": 0.3752, + "step": 16824 + }, + { + "epoch": 2.190160093713393, + "grad_norm": 3.285545587539673, + "learning_rate": 1.7639108250903974e-06, + "loss": 0.4672, + "step": 16827 + }, + { + "epoch": 2.190550566185084, + "grad_norm": 3.2588181495666504, + "learning_rate": 1.762318009140862e-06, + "loss": 0.4154, + "step": 16830 + }, + { + "epoch": 2.1909410386567747, + "grad_norm": 2.734658718109131, + "learning_rate": 1.7607257588075582e-06, + "loss": 0.4109, + "step": 16833 + }, + { + "epoch": 2.1913315111284652, + "grad_norm": 2.7743442058563232, + "learning_rate": 1.7591340743686507e-06, + "loss": 0.3764, + "step": 16836 + }, + { + "epoch": 2.1917219836001562, + "grad_norm": 2.652029514312744, + "learning_rate": 1.7575429561022029e-06, + "loss": 0.4011, + "step": 16839 + }, + { + "epoch": 2.192112456071847, + "grad_norm": 2.7200498580932617, + "learning_rate": 1.755952404286178e-06, + "loss": 0.3842, + "step": 16842 + }, + { + "epoch": 2.192502928543538, + "grad_norm": 2.506895065307617, + "learning_rate": 1.7543624191984455e-06, + "loss": 0.3566, + "step": 16845 + }, + { + "epoch": 2.1928934010152283, + "grad_norm": 3.026963233947754, + "learning_rate": 1.7527730011167681e-06, + "loss": 0.3718, + "step": 16848 + }, + { + "epoch": 2.1932838734869193, + "grad_norm": 2.6555018424987793, + "learning_rate": 1.7511841503188187e-06, + "loss": 0.3382, + "step": 16851 + }, + { + "epoch": 2.19367434595861, + "grad_norm": 2.6906399726867676, + "learning_rate": 1.7495958670821617e-06, + "loss": 0.3304, + "step": 16854 + }, + { + "epoch": 2.1940648184303004, + "grad_norm": 3.381065607070923, + "learning_rate": 1.7480081516842705e-06, + "loss": 0.3892, + "step": 16857 + }, + { + "epoch": 2.1944552909019914, + "grad_norm": 2.733503580093384, + "learning_rate": 1.7464210044025144e-06, + "loss": 0.3378, + "step": 16860 + }, + { + "epoch": 
2.194845763373682, + "grad_norm": 2.755075693130493, + "learning_rate": 1.744834425514162e-06, + "loss": 0.3439, + "step": 16863 + }, + { + "epoch": 2.195236235845373, + "grad_norm": 2.853973388671875, + "learning_rate": 1.7432484152963896e-06, + "loss": 0.3667, + "step": 16866 + }, + { + "epoch": 2.1956267083170635, + "grad_norm": 2.4795007705688477, + "learning_rate": 1.7416629740262681e-06, + "loss": 0.3341, + "step": 16869 + }, + { + "epoch": 2.1960171807887545, + "grad_norm": 2.5564217567443848, + "learning_rate": 1.7400781019807684e-06, + "loss": 0.3706, + "step": 16872 + }, + { + "epoch": 2.196407653260445, + "grad_norm": 2.830836057662964, + "learning_rate": 1.738493799436768e-06, + "loss": 0.354, + "step": 16875 + }, + { + "epoch": 2.196798125732136, + "grad_norm": 2.753403425216675, + "learning_rate": 1.7369100666710398e-06, + "loss": 0.3636, + "step": 16878 + }, + { + "epoch": 2.1971885982038266, + "grad_norm": 2.559079170227051, + "learning_rate": 1.7353269039602588e-06, + "loss": 0.3772, + "step": 16881 + }, + { + "epoch": 2.197579070675517, + "grad_norm": 2.9409003257751465, + "learning_rate": 1.7337443115809976e-06, + "loss": 0.3509, + "step": 16884 + }, + { + "epoch": 2.197969543147208, + "grad_norm": 2.7231342792510986, + "learning_rate": 1.7321622898097362e-06, + "loss": 0.4125, + "step": 16887 + }, + { + "epoch": 2.1983600156188987, + "grad_norm": 2.7584779262542725, + "learning_rate": 1.7305808389228462e-06, + "loss": 0.3734, + "step": 16890 + }, + { + "epoch": 2.1987504880905897, + "grad_norm": 2.9606661796569824, + "learning_rate": 1.7289999591966072e-06, + "loss": 0.3863, + "step": 16893 + }, + { + "epoch": 2.1991409605622803, + "grad_norm": 2.588604688644409, + "learning_rate": 1.7274196509071927e-06, + "loss": 0.3034, + "step": 16896 + }, + { + "epoch": 2.1995314330339713, + "grad_norm": 2.7741434574127197, + "learning_rate": 1.7258399143306825e-06, + "loss": 0.3722, + "step": 16899 + }, + { + "epoch": 2.199921905505662, + "grad_norm": 2.741807699203491, + "learning_rate": 1.7242607497430514e-06, + "loss": 0.3266, + "step": 16902 + }, + { + "epoch": 2.200312377977353, + "grad_norm": 2.6828136444091797, + "learning_rate": 1.7226821574201747e-06, + "loss": 0.4204, + "step": 16905 + }, + { + "epoch": 2.2007028504490433, + "grad_norm": 2.492523431777954, + "learning_rate": 1.721104137637832e-06, + "loss": 0.3388, + "step": 16908 + }, + { + "epoch": 2.201093322920734, + "grad_norm": 2.79616117477417, + "learning_rate": 1.7195266906716985e-06, + "loss": 0.3348, + "step": 16911 + }, + { + "epoch": 2.201483795392425, + "grad_norm": 2.682907819747925, + "learning_rate": 1.7179498167973496e-06, + "loss": 0.3421, + "step": 16914 + }, + { + "epoch": 2.2018742678641154, + "grad_norm": 3.588071823120117, + "learning_rate": 1.7163735162902651e-06, + "loss": 0.374, + "step": 16917 + }, + { + "epoch": 2.2022647403358064, + "grad_norm": 3.160140037536621, + "learning_rate": 1.7147977894258193e-06, + "loss": 0.4139, + "step": 16920 + }, + { + "epoch": 2.202655212807497, + "grad_norm": 2.9492440223693848, + "learning_rate": 1.713222636479287e-06, + "loss": 0.4267, + "step": 16923 + }, + { + "epoch": 2.203045685279188, + "grad_norm": 2.716421365737915, + "learning_rate": 1.7116480577258477e-06, + "loss": 0.3267, + "step": 16926 + }, + { + "epoch": 2.2034361577508785, + "grad_norm": 3.1146531105041504, + "learning_rate": 1.7100740534405746e-06, + "loss": 0.3581, + "step": 16929 + }, + { + "epoch": 2.2038266302225695, + "grad_norm": 3.101052761077881, + "learning_rate": 
1.708500623898442e-06, + "loss": 0.3646, + "step": 16932 + }, + { + "epoch": 2.20421710269426, + "grad_norm": 2.7972285747528076, + "learning_rate": 1.7069277693743258e-06, + "loss": 0.3556, + "step": 16935 + }, + { + "epoch": 2.2046075751659506, + "grad_norm": 3.1059136390686035, + "learning_rate": 1.705355490143003e-06, + "loss": 0.3617, + "step": 16938 + }, + { + "epoch": 2.2049980476376416, + "grad_norm": 2.703691244125366, + "learning_rate": 1.7037837864791445e-06, + "loss": 0.3557, + "step": 16941 + }, + { + "epoch": 2.205388520109332, + "grad_norm": 2.6937639713287354, + "learning_rate": 1.7022126586573246e-06, + "loss": 0.3241, + "step": 16944 + }, + { + "epoch": 2.205778992581023, + "grad_norm": 3.200873613357544, + "learning_rate": 1.7006421069520141e-06, + "loss": 0.3686, + "step": 16947 + }, + { + "epoch": 2.2061694650527137, + "grad_norm": 2.610318183898926, + "learning_rate": 1.699072131637588e-06, + "loss": 0.339, + "step": 16950 + }, + { + "epoch": 2.2065599375244047, + "grad_norm": 3.0442748069763184, + "learning_rate": 1.6975027329883166e-06, + "loss": 0.4444, + "step": 16953 + }, + { + "epoch": 2.2069504099960953, + "grad_norm": 2.716014862060547, + "learning_rate": 1.6959339112783685e-06, + "loss": 0.3636, + "step": 16956 + }, + { + "epoch": 2.207340882467786, + "grad_norm": 2.61544132232666, + "learning_rate": 1.694365666781817e-06, + "loss": 0.3378, + "step": 16959 + }, + { + "epoch": 2.207731354939477, + "grad_norm": 2.998279094696045, + "learning_rate": 1.6927979997726295e-06, + "loss": 0.2909, + "step": 16962 + }, + { + "epoch": 2.2081218274111674, + "grad_norm": 2.755953311920166, + "learning_rate": 1.6912309105246726e-06, + "loss": 0.3346, + "step": 16965 + }, + { + "epoch": 2.2085122998828584, + "grad_norm": 2.780045986175537, + "learning_rate": 1.6896643993117168e-06, + "loss": 0.3338, + "step": 16968 + }, + { + "epoch": 2.208902772354549, + "grad_norm": 2.9842076301574707, + "learning_rate": 1.6880984664074262e-06, + "loss": 0.3819, + "step": 16971 + }, + { + "epoch": 2.20929324482624, + "grad_norm": 2.786750078201294, + "learning_rate": 1.6865331120853645e-06, + "loss": 0.3506, + "step": 16974 + }, + { + "epoch": 2.2096837172979304, + "grad_norm": 2.8136911392211914, + "learning_rate": 1.6849683366189978e-06, + "loss": 0.3695, + "step": 16977 + }, + { + "epoch": 2.2100741897696214, + "grad_norm": 2.6133673191070557, + "learning_rate": 1.6834041402816908e-06, + "loss": 0.3035, + "step": 16980 + }, + { + "epoch": 2.210464662241312, + "grad_norm": 2.6975417137145996, + "learning_rate": 1.6818405233467034e-06, + "loss": 0.314, + "step": 16983 + }, + { + "epoch": 2.2108551347130025, + "grad_norm": 2.7998573780059814, + "learning_rate": 1.6802774860871939e-06, + "loss": 0.3707, + "step": 16986 + }, + { + "epoch": 2.2112456071846935, + "grad_norm": 2.9110021591186523, + "learning_rate": 1.678715028776226e-06, + "loss": 0.4747, + "step": 16989 + }, + { + "epoch": 2.211636079656384, + "grad_norm": 2.649843215942383, + "learning_rate": 1.6771531516867557e-06, + "loss": 0.3751, + "step": 16992 + }, + { + "epoch": 2.212026552128075, + "grad_norm": 3.525862693786621, + "learning_rate": 1.6755918550916395e-06, + "loss": 0.3078, + "step": 16995 + }, + { + "epoch": 2.2124170245997656, + "grad_norm": 2.670433759689331, + "learning_rate": 1.6740311392636311e-06, + "loss": 0.3935, + "step": 16998 + }, + { + "epoch": 2.2128074970714566, + "grad_norm": 2.793560266494751, + "learning_rate": 1.6724710044753872e-06, + "loss": 0.3365, + "step": 17001 + }, + { + "epoch": 
2.213197969543147, + "grad_norm": 3.4935295581817627, + "learning_rate": 1.6709114509994588e-06, + "loss": 0.3708, + "step": 17004 + }, + { + "epoch": 2.2135884420148377, + "grad_norm": 2.874403238296509, + "learning_rate": 1.6693524791082948e-06, + "loss": 0.3428, + "step": 17007 + }, + { + "epoch": 2.2139789144865287, + "grad_norm": 2.7574665546417236, + "learning_rate": 1.6677940890742484e-06, + "loss": 0.3819, + "step": 17010 + }, + { + "epoch": 2.2143693869582193, + "grad_norm": 2.5127577781677246, + "learning_rate": 1.6662362811695637e-06, + "loss": 0.3112, + "step": 17013 + }, + { + "epoch": 2.2147598594299103, + "grad_norm": 2.6434106826782227, + "learning_rate": 1.6646790556663867e-06, + "loss": 0.3468, + "step": 17016 + }, + { + "epoch": 2.215150331901601, + "grad_norm": 2.921630859375, + "learning_rate": 1.6631224128367612e-06, + "loss": 0.3818, + "step": 17019 + }, + { + "epoch": 2.215540804373292, + "grad_norm": 3.2600367069244385, + "learning_rate": 1.6615663529526328e-06, + "loss": 0.407, + "step": 17022 + }, + { + "epoch": 2.2159312768449824, + "grad_norm": 2.5993316173553467, + "learning_rate": 1.6600108762858392e-06, + "loss": 0.3508, + "step": 17025 + }, + { + "epoch": 2.2163217493166734, + "grad_norm": 3.402862310409546, + "learning_rate": 1.6584559831081176e-06, + "loss": 0.3205, + "step": 17028 + }, + { + "epoch": 2.216712221788364, + "grad_norm": 2.991856813430786, + "learning_rate": 1.6569016736911082e-06, + "loss": 0.3868, + "step": 17031 + }, + { + "epoch": 2.2171026942600545, + "grad_norm": 2.9160122871398926, + "learning_rate": 1.6553479483063434e-06, + "loss": 0.3622, + "step": 17034 + }, + { + "epoch": 2.2174931667317455, + "grad_norm": 2.687974691390991, + "learning_rate": 1.653794807225254e-06, + "loss": 0.3137, + "step": 17037 + }, + { + "epoch": 2.217883639203436, + "grad_norm": 2.7909791469573975, + "learning_rate": 1.6522422507191744e-06, + "loss": 0.3004, + "step": 17040 + }, + { + "epoch": 2.218274111675127, + "grad_norm": 2.7353012561798096, + "learning_rate": 1.6506902790593303e-06, + "loss": 0.3487, + "step": 17043 + }, + { + "epoch": 2.2186645841468176, + "grad_norm": 2.6116976737976074, + "learning_rate": 1.649138892516849e-06, + "loss": 0.3245, + "step": 17046 + }, + { + "epoch": 2.2190550566185085, + "grad_norm": 2.7857844829559326, + "learning_rate": 1.6475880913627522e-06, + "loss": 0.3987, + "step": 17049 + }, + { + "epoch": 2.219445529090199, + "grad_norm": 2.628438711166382, + "learning_rate": 1.646037875867965e-06, + "loss": 0.3248, + "step": 17052 + }, + { + "epoch": 2.21983600156189, + "grad_norm": 2.738862991333008, + "learning_rate": 1.6444882463033058e-06, + "loss": 0.3295, + "step": 17055 + }, + { + "epoch": 2.2202264740335806, + "grad_norm": 2.46667742729187, + "learning_rate": 1.6429392029394886e-06, + "loss": 0.3429, + "step": 17058 + }, + { + "epoch": 2.220616946505271, + "grad_norm": 2.5244455337524414, + "learning_rate": 1.6413907460471306e-06, + "loss": 0.3478, + "step": 17061 + }, + { + "epoch": 2.221007418976962, + "grad_norm": 2.5417511463165283, + "learning_rate": 1.6398428758967455e-06, + "loss": 0.3285, + "step": 17064 + }, + { + "epoch": 2.2213978914486527, + "grad_norm": 2.653949499130249, + "learning_rate": 1.6382955927587414e-06, + "loss": 0.3263, + "step": 17067 + }, + { + "epoch": 2.2217883639203437, + "grad_norm": 2.884857416152954, + "learning_rate": 1.6367488969034234e-06, + "loss": 0.3631, + "step": 17070 + }, + { + "epoch": 2.2221788363920343, + "grad_norm": 2.860215663909912, + "learning_rate": 
1.6352027886009997e-06, + "loss": 0.3883, + "step": 17073 + }, + { + "epoch": 2.2225693088637253, + "grad_norm": 2.657209873199463, + "learning_rate": 1.6336572681215701e-06, + "loss": 0.3437, + "step": 17076 + }, + { + "epoch": 2.222959781335416, + "grad_norm": 2.648395538330078, + "learning_rate": 1.6321123357351327e-06, + "loss": 0.3321, + "step": 17079 + }, + { + "epoch": 2.223350253807107, + "grad_norm": 2.7004330158233643, + "learning_rate": 1.6305679917115864e-06, + "loss": 0.3254, + "step": 17082 + }, + { + "epoch": 2.2237407262787974, + "grad_norm": 2.858809232711792, + "learning_rate": 1.6290242363207238e-06, + "loss": 0.3445, + "step": 17085 + }, + { + "epoch": 2.224131198750488, + "grad_norm": 2.9426462650299072, + "learning_rate": 1.6274810698322341e-06, + "loss": 0.3974, + "step": 17088 + }, + { + "epoch": 2.224521671222179, + "grad_norm": 2.777961015701294, + "learning_rate": 1.6259384925157084e-06, + "loss": 0.4052, + "step": 17091 + }, + { + "epoch": 2.2249121436938695, + "grad_norm": 2.6763064861297607, + "learning_rate": 1.6243965046406302e-06, + "loss": 0.3834, + "step": 17094 + }, + { + "epoch": 2.2253026161655605, + "grad_norm": 2.739316463470459, + "learning_rate": 1.6228551064763814e-06, + "loss": 0.3819, + "step": 17097 + }, + { + "epoch": 2.225693088637251, + "grad_norm": 2.7301228046417236, + "learning_rate": 1.6213142982922376e-06, + "loss": 0.2822, + "step": 17100 + }, + { + "epoch": 2.226083561108942, + "grad_norm": 3.0507218837738037, + "learning_rate": 1.6197740803573813e-06, + "loss": 0.3252, + "step": 17103 + }, + { + "epoch": 2.2264740335806326, + "grad_norm": 2.8858330249786377, + "learning_rate": 1.6182344529408828e-06, + "loss": 0.3756, + "step": 17106 + }, + { + "epoch": 2.226864506052323, + "grad_norm": 2.5173003673553467, + "learning_rate": 1.6166954163117105e-06, + "loss": 0.3249, + "step": 17109 + }, + { + "epoch": 2.227254978524014, + "grad_norm": 2.6058590412139893, + "learning_rate": 1.6151569707387305e-06, + "loss": 0.3337, + "step": 17112 + }, + { + "epoch": 2.2276454509957047, + "grad_norm": 3.0819878578186035, + "learning_rate": 1.6136191164907084e-06, + "loss": 0.4114, + "step": 17115 + }, + { + "epoch": 2.2280359234673957, + "grad_norm": 3.0594425201416016, + "learning_rate": 1.6120818538363026e-06, + "loss": 0.4065, + "step": 17118 + }, + { + "epoch": 2.228426395939086, + "grad_norm": 3.019721746444702, + "learning_rate": 1.6105451830440683e-06, + "loss": 0.333, + "step": 17121 + }, + { + "epoch": 2.228816868410777, + "grad_norm": 2.7887985706329346, + "learning_rate": 1.6090091043824618e-06, + "loss": 0.37, + "step": 17124 + }, + { + "epoch": 2.2292073408824677, + "grad_norm": 2.4664344787597656, + "learning_rate": 1.6074736181198309e-06, + "loss": 0.3093, + "step": 17127 + }, + { + "epoch": 2.2295978133541583, + "grad_norm": 3.16357684135437, + "learning_rate": 1.6059387245244208e-06, + "loss": 0.4148, + "step": 17130 + }, + { + "epoch": 2.2299882858258493, + "grad_norm": 2.7439839839935303, + "learning_rate": 1.604404423864377e-06, + "loss": 0.3784, + "step": 17133 + }, + { + "epoch": 2.23037875829754, + "grad_norm": 3.130523443222046, + "learning_rate": 1.6028707164077367e-06, + "loss": 0.4011, + "step": 17136 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 2.8755338191986084, + "learning_rate": 1.6013376024224363e-06, + "loss": 0.3792, + "step": 17139 + }, + { + "epoch": 2.2311597032409214, + "grad_norm": 2.6047468185424805, + "learning_rate": 1.5998050821763033e-06, + "loss": 0.3807, + "step": 17142 + }, + { + "epoch": 
2.2315501757126124, + "grad_norm": 2.492357015609741, + "learning_rate": 1.598273155937073e-06, + "loss": 0.3403, + "step": 17145 + }, + { + "epoch": 2.231940648184303, + "grad_norm": 3.0767757892608643, + "learning_rate": 1.5967418239723664e-06, + "loss": 0.3441, + "step": 17148 + }, + { + "epoch": 2.232331120655994, + "grad_norm": 2.8667490482330322, + "learning_rate": 1.5952110865497017e-06, + "loss": 0.3956, + "step": 17151 + }, + { + "epoch": 2.2327215931276845, + "grad_norm": 2.9534928798675537, + "learning_rate": 1.5936809439364992e-06, + "loss": 0.3098, + "step": 17154 + }, + { + "epoch": 2.233112065599375, + "grad_norm": 2.540276288986206, + "learning_rate": 1.59215139640007e-06, + "loss": 0.3845, + "step": 17157 + }, + { + "epoch": 2.233502538071066, + "grad_norm": 2.741945743560791, + "learning_rate": 1.590622444207623e-06, + "loss": 0.3631, + "step": 17160 + }, + { + "epoch": 2.2338930105427566, + "grad_norm": 2.531273603439331, + "learning_rate": 1.5890940876262612e-06, + "loss": 0.3323, + "step": 17163 + }, + { + "epoch": 2.2342834830144476, + "grad_norm": 3.036818027496338, + "learning_rate": 1.587566326922988e-06, + "loss": 0.3745, + "step": 17166 + }, + { + "epoch": 2.234673955486138, + "grad_norm": 2.696261167526245, + "learning_rate": 1.5860391623646986e-06, + "loss": 0.3351, + "step": 17169 + }, + { + "epoch": 2.235064427957829, + "grad_norm": 2.7615208625793457, + "learning_rate": 1.5845125942181844e-06, + "loss": 0.3886, + "step": 17172 + }, + { + "epoch": 2.2354549004295197, + "grad_norm": 3.1380362510681152, + "learning_rate": 1.5829866227501367e-06, + "loss": 0.3734, + "step": 17175 + }, + { + "epoch": 2.2358453729012107, + "grad_norm": 2.6224167346954346, + "learning_rate": 1.5814612482271368e-06, + "loss": 0.3422, + "step": 17178 + }, + { + "epoch": 2.236235845372901, + "grad_norm": 2.8184878826141357, + "learning_rate": 1.5799364709156645e-06, + "loss": 0.3194, + "step": 17181 + }, + { + "epoch": 2.2366263178445918, + "grad_norm": 2.749946355819702, + "learning_rate": 1.5784122910820976e-06, + "loss": 0.3587, + "step": 17184 + }, + { + "epoch": 2.2370167903162828, + "grad_norm": 3.1600887775421143, + "learning_rate": 1.5768887089927031e-06, + "loss": 0.3815, + "step": 17187 + }, + { + "epoch": 2.2374072627879733, + "grad_norm": 2.7639830112457275, + "learning_rate": 1.5753657249136523e-06, + "loss": 0.3757, + "step": 17190 + }, + { + "epoch": 2.2377977352596643, + "grad_norm": 2.792595624923706, + "learning_rate": 1.573843339111003e-06, + "loss": 0.3124, + "step": 17193 + }, + { + "epoch": 2.238188207731355, + "grad_norm": 3.1691551208496094, + "learning_rate": 1.5723215518507168e-06, + "loss": 0.3515, + "step": 17196 + }, + { + "epoch": 2.238578680203046, + "grad_norm": 2.5455732345581055, + "learning_rate": 1.570800363398644e-06, + "loss": 0.3376, + "step": 17199 + }, + { + "epoch": 2.2389691526747364, + "grad_norm": 2.700528383255005, + "learning_rate": 1.5692797740205345e-06, + "loss": 0.3563, + "step": 17202 + }, + { + "epoch": 2.2393596251464274, + "grad_norm": 3.191154718399048, + "learning_rate": 1.5677597839820292e-06, + "loss": 0.402, + "step": 17205 + }, + { + "epoch": 2.239750097618118, + "grad_norm": 2.647813320159912, + "learning_rate": 1.566240393548671e-06, + "loss": 0.3639, + "step": 17208 + }, + { + "epoch": 2.2401405700898085, + "grad_norm": 2.85390043258667, + "learning_rate": 1.5647216029858924e-06, + "loss": 0.3551, + "step": 17211 + }, + { + "epoch": 2.2405310425614995, + "grad_norm": 2.7630879878997803, + "learning_rate": 
1.5632034125590212e-06, + "loss": 0.3158, + "step": 17214 + }, + { + "epoch": 2.24092151503319, + "grad_norm": 2.9064273834228516, + "learning_rate": 1.5616858225332858e-06, + "loss": 0.3789, + "step": 17217 + }, + { + "epoch": 2.241311987504881, + "grad_norm": 2.9090540409088135, + "learning_rate": 1.560168833173804e-06, + "loss": 0.3402, + "step": 17220 + }, + { + "epoch": 2.2417024599765716, + "grad_norm": 3.207202911376953, + "learning_rate": 1.5586524447455892e-06, + "loss": 0.4046, + "step": 17223 + }, + { + "epoch": 2.2420929324482626, + "grad_norm": 2.5313596725463867, + "learning_rate": 1.5571366575135544e-06, + "loss": 0.3498, + "step": 17226 + }, + { + "epoch": 2.242483404919953, + "grad_norm": 3.0708365440368652, + "learning_rate": 1.555621471742501e-06, + "loss": 0.3484, + "step": 17229 + }, + { + "epoch": 2.242873877391644, + "grad_norm": 3.1129088401794434, + "learning_rate": 1.5541068876971322e-06, + "loss": 0.4346, + "step": 17232 + }, + { + "epoch": 2.2432643498633347, + "grad_norm": 2.6781063079833984, + "learning_rate": 1.552592905642039e-06, + "loss": 0.3724, + "step": 17235 + }, + { + "epoch": 2.2436548223350252, + "grad_norm": 3.136171817779541, + "learning_rate": 1.5510795258417149e-06, + "loss": 0.4596, + "step": 17238 + }, + { + "epoch": 2.244045294806716, + "grad_norm": 2.706580877304077, + "learning_rate": 1.5495667485605425e-06, + "loss": 0.3578, + "step": 17241 + }, + { + "epoch": 2.2444357672784068, + "grad_norm": 2.8641068935394287, + "learning_rate": 1.5480545740627984e-06, + "loss": 0.3521, + "step": 17244 + }, + { + "epoch": 2.2448262397500978, + "grad_norm": 2.81913161277771, + "learning_rate": 1.5465430026126605e-06, + "loss": 0.385, + "step": 17247 + }, + { + "epoch": 2.2452167122217883, + "grad_norm": 2.8231277465820312, + "learning_rate": 1.5450320344741942e-06, + "loss": 0.3675, + "step": 17250 + }, + { + "epoch": 2.2456071846934793, + "grad_norm": 2.623539686203003, + "learning_rate": 1.5435216699113641e-06, + "loss": 0.3081, + "step": 17253 + }, + { + "epoch": 2.24599765716517, + "grad_norm": 3.059145927429199, + "learning_rate": 1.5420119091880247e-06, + "loss": 0.4299, + "step": 17256 + }, + { + "epoch": 2.2463881296368604, + "grad_norm": 2.7407338619232178, + "learning_rate": 1.5405027525679323e-06, + "loss": 0.329, + "step": 17259 + }, + { + "epoch": 2.2467786021085514, + "grad_norm": 3.146299123764038, + "learning_rate": 1.5389942003147313e-06, + "loss": 0.3063, + "step": 17262 + }, + { + "epoch": 2.247169074580242, + "grad_norm": 2.4751954078674316, + "learning_rate": 1.5374862526919605e-06, + "loss": 0.3154, + "step": 17265 + }, + { + "epoch": 2.247559547051933, + "grad_norm": 2.857436180114746, + "learning_rate": 1.5359789099630596e-06, + "loss": 0.4555, + "step": 17268 + }, + { + "epoch": 2.2479500195236235, + "grad_norm": 3.6831679344177246, + "learning_rate": 1.5344721723913535e-06, + "loss": 0.3972, + "step": 17271 + }, + { + "epoch": 2.2483404919953145, + "grad_norm": 2.737274646759033, + "learning_rate": 1.5329660402400703e-06, + "loss": 0.3751, + "step": 17274 + }, + { + "epoch": 2.248730964467005, + "grad_norm": 3.0916683673858643, + "learning_rate": 1.5314605137723254e-06, + "loss": 0.4125, + "step": 17277 + }, + { + "epoch": 2.2491214369386956, + "grad_norm": 2.5740551948547363, + "learning_rate": 1.529955593251133e-06, + "loss": 0.3866, + "step": 17280 + }, + { + "epoch": 2.2495119094103866, + "grad_norm": 2.7472755908966064, + "learning_rate": 1.5284512789393984e-06, + "loss": 0.3552, + "step": 17283 + }, + { + "epoch": 
2.249902381882077, + "grad_norm": 2.672215461730957, + "learning_rate": 1.526947571099921e-06, + "loss": 0.3339, + "step": 17286 + }, + { + "epoch": 2.250292854353768, + "grad_norm": 2.8021116256713867, + "learning_rate": 1.525444469995398e-06, + "loss": 0.3135, + "step": 17289 + }, + { + "epoch": 2.2506833268254587, + "grad_norm": 2.7901899814605713, + "learning_rate": 1.5239419758884171e-06, + "loss": 0.2812, + "step": 17292 + }, + { + "epoch": 2.2510737992971497, + "grad_norm": 2.8856074810028076, + "learning_rate": 1.5224400890414587e-06, + "loss": 0.4185, + "step": 17295 + }, + { + "epoch": 2.2514642717688402, + "grad_norm": 2.6049904823303223, + "learning_rate": 1.5209388097169026e-06, + "loss": 0.3618, + "step": 17298 + }, + { + "epoch": 2.2518547442405312, + "grad_norm": 2.7991714477539062, + "learning_rate": 1.5194381381770173e-06, + "loss": 0.3726, + "step": 17301 + }, + { + "epoch": 2.2522452167122218, + "grad_norm": 3.187734603881836, + "learning_rate": 1.5179380746839678e-06, + "loss": 0.4094, + "step": 17304 + }, + { + "epoch": 2.2526356891839123, + "grad_norm": 2.6936240196228027, + "learning_rate": 1.5164386194998094e-06, + "loss": 0.3345, + "step": 17307 + }, + { + "epoch": 2.2530261616556033, + "grad_norm": 2.7314226627349854, + "learning_rate": 1.5149397728864979e-06, + "loss": 0.3792, + "step": 17310 + }, + { + "epoch": 2.253416634127294, + "grad_norm": 2.7893855571746826, + "learning_rate": 1.5134415351058744e-06, + "loss": 0.3403, + "step": 17313 + }, + { + "epoch": 2.253807106598985, + "grad_norm": 2.6449899673461914, + "learning_rate": 1.5119439064196823e-06, + "loss": 0.3472, + "step": 17316 + }, + { + "epoch": 2.2541975790706754, + "grad_norm": 3.2970783710479736, + "learning_rate": 1.5104468870895495e-06, + "loss": 0.4036, + "step": 17319 + }, + { + "epoch": 2.2545880515423664, + "grad_norm": 3.0442137718200684, + "learning_rate": 1.5089504773770069e-06, + "loss": 0.324, + "step": 17322 + }, + { + "epoch": 2.254978524014057, + "grad_norm": 2.8000681400299072, + "learning_rate": 1.5074546775434718e-06, + "loss": 0.3458, + "step": 17325 + }, + { + "epoch": 2.255368996485748, + "grad_norm": 2.9774391651153564, + "learning_rate": 1.5059594878502554e-06, + "loss": 0.3931, + "step": 17328 + }, + { + "epoch": 2.2557594689574385, + "grad_norm": 2.576894521713257, + "learning_rate": 1.5044649085585678e-06, + "loss": 0.3472, + "step": 17331 + }, + { + "epoch": 2.256149941429129, + "grad_norm": 2.661025285720825, + "learning_rate": 1.5029709399295066e-06, + "loss": 0.3077, + "step": 17334 + }, + { + "epoch": 2.25654041390082, + "grad_norm": 2.5842506885528564, + "learning_rate": 1.5014775822240645e-06, + "loss": 0.4093, + "step": 17337 + }, + { + "epoch": 2.2569308863725106, + "grad_norm": 3.0119876861572266, + "learning_rate": 1.4999848357031305e-06, + "loss": 0.3561, + "step": 17340 + }, + { + "epoch": 2.2573213588442016, + "grad_norm": 2.854466438293457, + "learning_rate": 1.498492700627483e-06, + "loss": 0.3519, + "step": 17343 + }, + { + "epoch": 2.257711831315892, + "grad_norm": 3.8666045665740967, + "learning_rate": 1.4970011772577925e-06, + "loss": 0.3475, + "step": 17346 + }, + { + "epoch": 2.258102303787583, + "grad_norm": 2.4900221824645996, + "learning_rate": 1.495510265854629e-06, + "loss": 0.3278, + "step": 17349 + }, + { + "epoch": 2.2584927762592737, + "grad_norm": 2.904358386993408, + "learning_rate": 1.4940199666784495e-06, + "loss": 0.3085, + "step": 17352 + }, + { + "epoch": 2.2588832487309647, + "grad_norm": 2.5294814109802246, + "learning_rate": 
1.4925302799896053e-06, + "loss": 0.3618, + "step": 17355 + }, + { + "epoch": 2.2592737212026552, + "grad_norm": 2.6248788833618164, + "learning_rate": 1.491041206048342e-06, + "loss": 0.355, + "step": 17358 + }, + { + "epoch": 2.259664193674346, + "grad_norm": 2.460315465927124, + "learning_rate": 1.4895527451147995e-06, + "loss": 0.3503, + "step": 17361 + }, + { + "epoch": 2.260054666146037, + "grad_norm": 2.8616855144500732, + "learning_rate": 1.488064897449008e-06, + "loss": 0.4144, + "step": 17364 + }, + { + "epoch": 2.2604451386177273, + "grad_norm": 2.8192522525787354, + "learning_rate": 1.4865776633108908e-06, + "loss": 0.372, + "step": 17367 + }, + { + "epoch": 2.2608356110894183, + "grad_norm": 2.8044068813323975, + "learning_rate": 1.4850910429602633e-06, + "loss": 0.3388, + "step": 17370 + }, + { + "epoch": 2.261226083561109, + "grad_norm": 2.620159864425659, + "learning_rate": 1.4836050366568378e-06, + "loss": 0.356, + "step": 17373 + }, + { + "epoch": 2.2616165560328, + "grad_norm": 2.9459471702575684, + "learning_rate": 1.4821196446602148e-06, + "loss": 0.4086, + "step": 17376 + }, + { + "epoch": 2.2620070285044904, + "grad_norm": 2.8624160289764404, + "learning_rate": 1.4806348672298875e-06, + "loss": 0.4079, + "step": 17379 + }, + { + "epoch": 2.2623975009761814, + "grad_norm": 2.7950611114501953, + "learning_rate": 1.4791507046252474e-06, + "loss": 0.3334, + "step": 17382 + }, + { + "epoch": 2.262787973447872, + "grad_norm": 2.842926502227783, + "learning_rate": 1.4776671571055723e-06, + "loss": 0.3641, + "step": 17385 + }, + { + "epoch": 2.2631784459195625, + "grad_norm": 2.3689680099487305, + "learning_rate": 1.476184224930033e-06, + "loss": 0.3775, + "step": 17388 + }, + { + "epoch": 2.2635689183912535, + "grad_norm": 2.761807918548584, + "learning_rate": 1.4747019083576986e-06, + "loss": 0.3717, + "step": 17391 + }, + { + "epoch": 2.263959390862944, + "grad_norm": 2.9553542137145996, + "learning_rate": 1.4732202076475244e-06, + "loss": 0.3061, + "step": 17394 + }, + { + "epoch": 2.264349863334635, + "grad_norm": 2.7715632915496826, + "learning_rate": 1.4717391230583595e-06, + "loss": 0.3667, + "step": 17397 + }, + { + "epoch": 2.2647403358063256, + "grad_norm": 2.5581488609313965, + "learning_rate": 1.4702586548489467e-06, + "loss": 0.3156, + "step": 17400 + }, + { + "epoch": 2.265130808278016, + "grad_norm": 2.83203387260437, + "learning_rate": 1.4687788032779233e-06, + "loss": 0.3902, + "step": 17403 + }, + { + "epoch": 2.265521280749707, + "grad_norm": 3.0823655128479004, + "learning_rate": 1.4672995686038145e-06, + "loss": 0.4572, + "step": 17406 + }, + { + "epoch": 2.2659117532213977, + "grad_norm": 2.8829867839813232, + "learning_rate": 1.4658209510850373e-06, + "loss": 0.3649, + "step": 17409 + }, + { + "epoch": 2.2663022256930887, + "grad_norm": 2.9928228855133057, + "learning_rate": 1.4643429509799073e-06, + "loss": 0.3896, + "step": 17412 + }, + { + "epoch": 2.2666926981647793, + "grad_norm": 2.664480686187744, + "learning_rate": 1.4628655685466258e-06, + "loss": 0.3732, + "step": 17415 + }, + { + "epoch": 2.2670831706364702, + "grad_norm": 2.658304452896118, + "learning_rate": 1.4613888040432884e-06, + "loss": 0.3118, + "step": 17418 + }, + { + "epoch": 2.267473643108161, + "grad_norm": 2.6738052368164062, + "learning_rate": 1.459912657727881e-06, + "loss": 0.3854, + "step": 17421 + }, + { + "epoch": 2.267864115579852, + "grad_norm": 3.1468684673309326, + "learning_rate": 1.4584371298582866e-06, + "loss": 0.3526, + "step": 17424 + }, + { + "epoch": 
2.2682545880515423, + "grad_norm": 2.8303110599517822, + "learning_rate": 1.456962220692275e-06, + "loss": 0.3028, + "step": 17427 + }, + { + "epoch": 2.268645060523233, + "grad_norm": 2.5083484649658203, + "learning_rate": 1.455487930487509e-06, + "loss": 0.3114, + "step": 17430 + }, + { + "epoch": 2.269035532994924, + "grad_norm": 2.9310407638549805, + "learning_rate": 1.4540142595015461e-06, + "loss": 0.3595, + "step": 17433 + }, + { + "epoch": 2.2694260054666144, + "grad_norm": 3.0042102336883545, + "learning_rate": 1.4525412079918327e-06, + "loss": 0.3652, + "step": 17436 + }, + { + "epoch": 2.2698164779383054, + "grad_norm": 2.961367130279541, + "learning_rate": 1.451068776215706e-06, + "loss": 0.4294, + "step": 17439 + }, + { + "epoch": 2.270206950409996, + "grad_norm": 2.6684181690216064, + "learning_rate": 1.449596964430398e-06, + "loss": 0.3593, + "step": 17442 + }, + { + "epoch": 2.270597422881687, + "grad_norm": 3.18241548538208, + "learning_rate": 1.448125772893033e-06, + "loss": 0.3256, + "step": 17445 + }, + { + "epoch": 2.2709878953533775, + "grad_norm": 2.6874358654022217, + "learning_rate": 1.4466552018606235e-06, + "loss": 0.3528, + "step": 17448 + }, + { + "epoch": 2.2713783678250685, + "grad_norm": 2.966919422149658, + "learning_rate": 1.4451852515900733e-06, + "loss": 0.4124, + "step": 17451 + }, + { + "epoch": 2.271768840296759, + "grad_norm": 2.7124547958374023, + "learning_rate": 1.443715922338183e-06, + "loss": 0.3237, + "step": 17454 + }, + { + "epoch": 2.2721593127684496, + "grad_norm": 2.423863172531128, + "learning_rate": 1.44224721436164e-06, + "loss": 0.3078, + "step": 17457 + }, + { + "epoch": 2.2725497852401406, + "grad_norm": 2.627487897872925, + "learning_rate": 1.4407791279170225e-06, + "loss": 0.3764, + "step": 17460 + }, + { + "epoch": 2.272940257711831, + "grad_norm": 2.7886862754821777, + "learning_rate": 1.439311663260805e-06, + "loss": 0.3625, + "step": 17463 + }, + { + "epoch": 2.273330730183522, + "grad_norm": 2.791762113571167, + "learning_rate": 1.43784482064935e-06, + "loss": 0.3917, + "step": 17466 + }, + { + "epoch": 2.2737212026552127, + "grad_norm": 3.137033700942993, + "learning_rate": 1.4363786003389108e-06, + "loss": 0.3928, + "step": 17469 + }, + { + "epoch": 2.2741116751269037, + "grad_norm": 2.5043063163757324, + "learning_rate": 1.4349130025856322e-06, + "loss": 0.3395, + "step": 17472 + }, + { + "epoch": 2.2745021475985943, + "grad_norm": 2.988461494445801, + "learning_rate": 1.4334480276455532e-06, + "loss": 0.3501, + "step": 17475 + }, + { + "epoch": 2.2748926200702853, + "grad_norm": 3.358941078186035, + "learning_rate": 1.4319836757746014e-06, + "loss": 0.3746, + "step": 17478 + }, + { + "epoch": 2.275283092541976, + "grad_norm": 2.6883740425109863, + "learning_rate": 1.4305199472285936e-06, + "loss": 0.3567, + "step": 17481 + }, + { + "epoch": 2.2756735650136664, + "grad_norm": 2.756044864654541, + "learning_rate": 1.4290568422632417e-06, + "loss": 0.3877, + "step": 17484 + }, + { + "epoch": 2.2760640374853573, + "grad_norm": 2.9537763595581055, + "learning_rate": 1.4275943611341491e-06, + "loss": 0.3604, + "step": 17487 + }, + { + "epoch": 2.276454509957048, + "grad_norm": 3.069265365600586, + "learning_rate": 1.4261325040968065e-06, + "loss": 0.3909, + "step": 17490 + }, + { + "epoch": 2.276844982428739, + "grad_norm": 2.7966959476470947, + "learning_rate": 1.4246712714065953e-06, + "loss": 0.3825, + "step": 17493 + }, + { + "epoch": 2.2772354549004294, + "grad_norm": 2.7966370582580566, + "learning_rate": 
1.4232106633187932e-06, + "loss": 0.3359, + "step": 17496 + }, + { + "epoch": 2.2776259273721204, + "grad_norm": 2.9485177993774414, + "learning_rate": 1.4217506800885638e-06, + "loss": 0.3706, + "step": 17499 + }, + { + "epoch": 2.278016399843811, + "grad_norm": 2.909208297729492, + "learning_rate": 1.4202913219709612e-06, + "loss": 0.3484, + "step": 17502 + }, + { + "epoch": 2.278406872315502, + "grad_norm": 2.8797898292541504, + "learning_rate": 1.4188325892209359e-06, + "loss": 0.4045, + "step": 17505 + }, + { + "epoch": 2.2787973447871925, + "grad_norm": 2.849977970123291, + "learning_rate": 1.417374482093324e-06, + "loss": 0.3927, + "step": 17508 + }, + { + "epoch": 2.279187817258883, + "grad_norm": 2.766188383102417, + "learning_rate": 1.4159170008428513e-06, + "loss": 0.3263, + "step": 17511 + }, + { + "epoch": 2.279578289730574, + "grad_norm": 2.4690568447113037, + "learning_rate": 1.4144601457241402e-06, + "loss": 0.3001, + "step": 17514 + }, + { + "epoch": 2.2799687622022646, + "grad_norm": 2.894524335861206, + "learning_rate": 1.4130039169916986e-06, + "loss": 0.3283, + "step": 17517 + }, + { + "epoch": 2.2803592346739556, + "grad_norm": 2.6616241931915283, + "learning_rate": 1.4115483148999277e-06, + "loss": 0.3245, + "step": 17520 + }, + { + "epoch": 2.280749707145646, + "grad_norm": 2.9152584075927734, + "learning_rate": 1.410093339703113e-06, + "loss": 0.3555, + "step": 17523 + }, + { + "epoch": 2.281140179617337, + "grad_norm": 2.7940874099731445, + "learning_rate": 1.4086389916554439e-06, + "loss": 0.3349, + "step": 17526 + }, + { + "epoch": 2.2815306520890277, + "grad_norm": 2.692840814590454, + "learning_rate": 1.4071852710109867e-06, + "loss": 0.3424, + "step": 17529 + }, + { + "epoch": 2.2819211245607187, + "grad_norm": 2.8541390895843506, + "learning_rate": 1.4057321780237055e-06, + "loss": 0.4011, + "step": 17532 + }, + { + "epoch": 2.2823115970324093, + "grad_norm": 2.5435984134674072, + "learning_rate": 1.4042797129474495e-06, + "loss": 0.2974, + "step": 17535 + }, + { + "epoch": 2.2827020695041, + "grad_norm": 2.908129930496216, + "learning_rate": 1.4028278760359649e-06, + "loss": 0.3302, + "step": 17538 + }, + { + "epoch": 2.283092541975791, + "grad_norm": 2.641526222229004, + "learning_rate": 1.4013766675428831e-06, + "loss": 0.332, + "step": 17541 + }, + { + "epoch": 2.2834830144474814, + "grad_norm": 2.8526651859283447, + "learning_rate": 1.3999260877217259e-06, + "loss": 0.3843, + "step": 17544 + }, + { + "epoch": 2.2838734869191724, + "grad_norm": 2.7316031455993652, + "learning_rate": 1.3984761368259087e-06, + "loss": 0.3498, + "step": 17547 + }, + { + "epoch": 2.284263959390863, + "grad_norm": 2.5377614498138428, + "learning_rate": 1.3970268151087341e-06, + "loss": 0.3165, + "step": 17550 + }, + { + "epoch": 2.2846544318625535, + "grad_norm": 2.78267502784729, + "learning_rate": 1.3955781228233938e-06, + "loss": 0.3007, + "step": 17553 + }, + { + "epoch": 2.2850449043342445, + "grad_norm": 2.560595989227295, + "learning_rate": 1.3941300602229746e-06, + "loss": 0.3325, + "step": 17556 + }, + { + "epoch": 2.285435376805935, + "grad_norm": 2.923177480697632, + "learning_rate": 1.3926826275604476e-06, + "loss": 0.4159, + "step": 17559 + }, + { + "epoch": 2.285825849277626, + "grad_norm": 2.99003267288208, + "learning_rate": 1.3912358250886775e-06, + "loss": 0.3853, + "step": 17562 + }, + { + "epoch": 2.2862163217493165, + "grad_norm": 2.788792610168457, + "learning_rate": 1.3897896530604138e-06, + "loss": 0.3513, + "step": 17565 + }, + { + "epoch": 
2.2866067942210075, + "grad_norm": 3.1519463062286377, + "learning_rate": 1.3883441117283058e-06, + "loss": 0.4351, + "step": 17568 + }, + { + "epoch": 2.286997266692698, + "grad_norm": 2.7040798664093018, + "learning_rate": 1.386899201344884e-06, + "loss": 0.3028, + "step": 17571 + }, + { + "epoch": 2.287387739164389, + "grad_norm": 3.0929603576660156, + "learning_rate": 1.3854549221625696e-06, + "loss": 0.3445, + "step": 17574 + }, + { + "epoch": 2.2877782116360796, + "grad_norm": 2.985767364501953, + "learning_rate": 1.3840112744336775e-06, + "loss": 0.3272, + "step": 17577 + }, + { + "epoch": 2.28816868410777, + "grad_norm": 2.829904794692993, + "learning_rate": 1.3825682584104088e-06, + "loss": 0.3655, + "step": 17580 + }, + { + "epoch": 2.288559156579461, + "grad_norm": 3.0158586502075195, + "learning_rate": 1.3811258743448553e-06, + "loss": 0.3236, + "step": 17583 + }, + { + "epoch": 2.2889496290511517, + "grad_norm": 2.844355344772339, + "learning_rate": 1.3796841224889973e-06, + "loss": 0.3448, + "step": 17586 + }, + { + "epoch": 2.2893401015228427, + "grad_norm": 2.8782236576080322, + "learning_rate": 1.3782430030947087e-06, + "loss": 0.4191, + "step": 17589 + }, + { + "epoch": 2.2897305739945333, + "grad_norm": 2.368623733520508, + "learning_rate": 1.3768025164137478e-06, + "loss": 0.3288, + "step": 17592 + }, + { + "epoch": 2.2901210464662243, + "grad_norm": 2.9353768825531006, + "learning_rate": 1.375362662697764e-06, + "loss": 0.3488, + "step": 17595 + }, + { + "epoch": 2.290511518937915, + "grad_norm": 2.769230604171753, + "learning_rate": 1.3739234421982995e-06, + "loss": 0.3142, + "step": 17598 + }, + { + "epoch": 2.290901991409606, + "grad_norm": 2.760913133621216, + "learning_rate": 1.3724848551667812e-06, + "loss": 0.3319, + "step": 17601 + }, + { + "epoch": 2.2912924638812964, + "grad_norm": 2.777470350265503, + "learning_rate": 1.3710469018545263e-06, + "loss": 0.3467, + "step": 17604 + }, + { + "epoch": 2.291682936352987, + "grad_norm": 2.680386543273926, + "learning_rate": 1.3696095825127436e-06, + "loss": 0.3023, + "step": 17607 + }, + { + "epoch": 2.292073408824678, + "grad_norm": 2.7917463779449463, + "learning_rate": 1.3681728973925313e-06, + "loss": 0.393, + "step": 17610 + }, + { + "epoch": 2.2924638812963685, + "grad_norm": 3.1995327472686768, + "learning_rate": 1.3667368467448734e-06, + "loss": 0.3997, + "step": 17613 + }, + { + "epoch": 2.2928543537680595, + "grad_norm": 2.46683406829834, + "learning_rate": 1.365301430820643e-06, + "loss": 0.3356, + "step": 17616 + }, + { + "epoch": 2.29324482623975, + "grad_norm": 3.129554271697998, + "learning_rate": 1.3638666498706082e-06, + "loss": 0.3385, + "step": 17619 + }, + { + "epoch": 2.293635298711441, + "grad_norm": 2.8934786319732666, + "learning_rate": 1.3624325041454206e-06, + "loss": 0.3568, + "step": 17622 + }, + { + "epoch": 2.2940257711831316, + "grad_norm": 2.703233242034912, + "learning_rate": 1.360998993895622e-06, + "loss": 0.3981, + "step": 17625 + }, + { + "epoch": 2.2944162436548226, + "grad_norm": 2.704498529434204, + "learning_rate": 1.3595661193716426e-06, + "loss": 0.3571, + "step": 17628 + }, + { + "epoch": 2.294806716126513, + "grad_norm": 2.624086856842041, + "learning_rate": 1.3581338808238048e-06, + "loss": 0.3518, + "step": 17631 + }, + { + "epoch": 2.2951971885982037, + "grad_norm": 2.516305685043335, + "learning_rate": 1.356702278502317e-06, + "loss": 0.3229, + "step": 17634 + }, + { + "epoch": 2.2955876610698946, + "grad_norm": 2.7615859508514404, + "learning_rate": 
1.3552713126572752e-06, + "loss": 0.3107, + "step": 17637 + }, + { + "epoch": 2.295978133541585, + "grad_norm": 2.727630138397217, + "learning_rate": 1.353840983538669e-06, + "loss": 0.367, + "step": 17640 + }, + { + "epoch": 2.296368606013276, + "grad_norm": 2.636464834213257, + "learning_rate": 1.3524112913963728e-06, + "loss": 0.3203, + "step": 17643 + }, + { + "epoch": 2.2967590784849667, + "grad_norm": 2.5415730476379395, + "learning_rate": 1.3509822364801489e-06, + "loss": 0.3375, + "step": 17646 + }, + { + "epoch": 2.2971495509566577, + "grad_norm": 3.3716161251068115, + "learning_rate": 1.3495538190396524e-06, + "loss": 0.3362, + "step": 17649 + }, + { + "epoch": 2.2975400234283483, + "grad_norm": 2.9915719032287598, + "learning_rate": 1.348126039324425e-06, + "loss": 0.3083, + "step": 17652 + }, + { + "epoch": 2.2979304959000393, + "grad_norm": 3.0488333702087402, + "learning_rate": 1.3466988975838967e-06, + "loss": 0.3817, + "step": 17655 + }, + { + "epoch": 2.29832096837173, + "grad_norm": 3.0410192012786865, + "learning_rate": 1.3452723940673839e-06, + "loss": 0.367, + "step": 17658 + }, + { + "epoch": 2.2987114408434204, + "grad_norm": 3.0488669872283936, + "learning_rate": 1.343846529024097e-06, + "loss": 0.3775, + "step": 17661 + }, + { + "epoch": 2.2991019133151114, + "grad_norm": 2.820769786834717, + "learning_rate": 1.3424213027031297e-06, + "loss": 0.3809, + "step": 17664 + }, + { + "epoch": 2.299492385786802, + "grad_norm": 2.7791459560394287, + "learning_rate": 1.3409967153534654e-06, + "loss": 0.3571, + "step": 17667 + }, + { + "epoch": 2.299882858258493, + "grad_norm": 3.0136008262634277, + "learning_rate": 1.3395727672239789e-06, + "loss": 0.3502, + "step": 17670 + }, + { + "epoch": 2.3002733307301835, + "grad_norm": 2.868488073348999, + "learning_rate": 1.3381494585634292e-06, + "loss": 0.3603, + "step": 17673 + }, + { + "epoch": 2.300663803201874, + "grad_norm": 2.7699248790740967, + "learning_rate": 1.3367267896204662e-06, + "loss": 0.3638, + "step": 17676 + }, + { + "epoch": 2.301054275673565, + "grad_norm": 2.7452714443206787, + "learning_rate": 1.3353047606436248e-06, + "loss": 0.3532, + "step": 17679 + }, + { + "epoch": 2.3014447481452556, + "grad_norm": 2.6756961345672607, + "learning_rate": 1.3338833718813337e-06, + "loss": 0.3326, + "step": 17682 + }, + { + "epoch": 2.3018352206169466, + "grad_norm": 2.506359100341797, + "learning_rate": 1.3324626235819055e-06, + "loss": 0.3736, + "step": 17685 + }, + { + "epoch": 2.302225693088637, + "grad_norm": 2.6358907222747803, + "learning_rate": 1.3310425159935398e-06, + "loss": 0.3283, + "step": 17688 + }, + { + "epoch": 2.302616165560328, + "grad_norm": 2.714524745941162, + "learning_rate": 1.3296230493643282e-06, + "loss": 0.3764, + "step": 17691 + }, + { + "epoch": 2.3030066380320187, + "grad_norm": 2.8931455612182617, + "learning_rate": 1.3282042239422505e-06, + "loss": 0.3603, + "step": 17694 + }, + { + "epoch": 2.3033971105037097, + "grad_norm": 3.029452323913574, + "learning_rate": 1.3267860399751698e-06, + "loss": 0.3331, + "step": 17697 + }, + { + "epoch": 2.3037875829754, + "grad_norm": 2.932361125946045, + "learning_rate": 1.3253684977108394e-06, + "loss": 0.3672, + "step": 17700 + }, + { + "epoch": 2.3041780554470908, + "grad_norm": 3.243929624557495, + "learning_rate": 1.3239515973969042e-06, + "loss": 0.3559, + "step": 17703 + }, + { + "epoch": 2.3045685279187818, + "grad_norm": 2.824859142303467, + "learning_rate": 1.322535339280891e-06, + "loss": 0.4494, + "step": 17706 + }, + { + "epoch": 
2.3049590003904723, + "grad_norm": 2.740846633911133, + "learning_rate": 1.3211197236102163e-06, + "loss": 0.3501, + "step": 17709 + }, + { + "epoch": 2.3053494728621633, + "grad_norm": 2.9181246757507324, + "learning_rate": 1.3197047506321887e-06, + "loss": 0.3736, + "step": 17712 + }, + { + "epoch": 2.305739945333854, + "grad_norm": 2.6927707195281982, + "learning_rate": 1.3182904205939983e-06, + "loss": 0.348, + "step": 17715 + }, + { + "epoch": 2.306130417805545, + "grad_norm": 2.88547945022583, + "learning_rate": 1.3168767337427251e-06, + "loss": 0.4355, + "step": 17718 + }, + { + "epoch": 2.3065208902772354, + "grad_norm": 2.848472833633423, + "learning_rate": 1.3154636903253398e-06, + "loss": 0.3914, + "step": 17721 + }, + { + "epoch": 2.3069113627489264, + "grad_norm": 2.7487449645996094, + "learning_rate": 1.3140512905886965e-06, + "loss": 0.3643, + "step": 17724 + }, + { + "epoch": 2.307301835220617, + "grad_norm": 2.6262028217315674, + "learning_rate": 1.312639534779539e-06, + "loss": 0.3678, + "step": 17727 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 2.6408939361572266, + "learning_rate": 1.3112284231444961e-06, + "loss": 0.3232, + "step": 17730 + }, + { + "epoch": 2.3080827801639985, + "grad_norm": 2.451420307159424, + "learning_rate": 1.3098179559300877e-06, + "loss": 0.3868, + "step": 17733 + }, + { + "epoch": 2.308473252635689, + "grad_norm": 2.4723563194274902, + "learning_rate": 1.3084081333827204e-06, + "loss": 0.3023, + "step": 17736 + }, + { + "epoch": 2.30886372510738, + "grad_norm": 2.9598135948181152, + "learning_rate": 1.3069989557486868e-06, + "loss": 0.3302, + "step": 17739 + }, + { + "epoch": 2.3092541975790706, + "grad_norm": 2.8125874996185303, + "learning_rate": 1.305590423274165e-06, + "loss": 0.3879, + "step": 17742 + }, + { + "epoch": 2.3096446700507616, + "grad_norm": 2.6237545013427734, + "learning_rate": 1.3041825362052258e-06, + "loss": 0.4228, + "step": 17745 + }, + { + "epoch": 2.310035142522452, + "grad_norm": 2.6856138706207275, + "learning_rate": 1.3027752947878226e-06, + "loss": 0.3903, + "step": 17748 + }, + { + "epoch": 2.310425614994143, + "grad_norm": 2.5263748168945312, + "learning_rate": 1.301368699267796e-06, + "loss": 0.328, + "step": 17751 + }, + { + "epoch": 2.3108160874658337, + "grad_norm": 2.7629663944244385, + "learning_rate": 1.2999627498908785e-06, + "loss": 0.345, + "step": 17754 + }, + { + "epoch": 2.311206559937524, + "grad_norm": 2.620549201965332, + "learning_rate": 1.2985574469026847e-06, + "loss": 0.3989, + "step": 17757 + }, + { + "epoch": 2.311597032409215, + "grad_norm": 3.176088333129883, + "learning_rate": 1.297152790548717e-06, + "loss": 0.3533, + "step": 17760 + }, + { + "epoch": 2.3119875048809058, + "grad_norm": 2.64750075340271, + "learning_rate": 1.2957487810743686e-06, + "loss": 0.3045, + "step": 17763 + }, + { + "epoch": 2.3123779773525968, + "grad_norm": 2.6987838745117188, + "learning_rate": 1.2943454187249154e-06, + "loss": 0.3716, + "step": 17766 + }, + { + "epoch": 2.3127684498242873, + "grad_norm": 2.43686580657959, + "learning_rate": 1.2929427037455206e-06, + "loss": 0.328, + "step": 17769 + }, + { + "epoch": 2.3131589222959783, + "grad_norm": 2.7529258728027344, + "learning_rate": 1.2915406363812384e-06, + "loss": 0.3639, + "step": 17772 + }, + { + "epoch": 2.313549394767669, + "grad_norm": 2.7594821453094482, + "learning_rate": 1.2901392168770038e-06, + "loss": 0.3543, + "step": 17775 + }, + { + "epoch": 2.31393986723936, + "grad_norm": 2.7785696983337402, + "learning_rate": 
1.2887384454776453e-06, + "loss": 0.3617, + "step": 17778 + }, + { + "epoch": 2.3143303397110504, + "grad_norm": 2.8949971199035645, + "learning_rate": 1.2873383224278717e-06, + "loss": 0.3065, + "step": 17781 + }, + { + "epoch": 2.314720812182741, + "grad_norm": 2.9128878116607666, + "learning_rate": 1.2859388479722846e-06, + "loss": 0.3951, + "step": 17784 + }, + { + "epoch": 2.315111284654432, + "grad_norm": 3.0318446159362793, + "learning_rate": 1.2845400223553666e-06, + "loss": 0.4716, + "step": 17787 + }, + { + "epoch": 2.3155017571261225, + "grad_norm": 3.099668264389038, + "learning_rate": 1.2831418458214912e-06, + "loss": 0.3485, + "step": 17790 + }, + { + "epoch": 2.3158922295978135, + "grad_norm": 2.8386688232421875, + "learning_rate": 1.2817443186149148e-06, + "loss": 0.3256, + "step": 17793 + }, + { + "epoch": 2.316282702069504, + "grad_norm": 2.8773865699768066, + "learning_rate": 1.280347440979785e-06, + "loss": 0.377, + "step": 17796 + }, + { + "epoch": 2.316673174541195, + "grad_norm": 2.7028415203094482, + "learning_rate": 1.2789512131601323e-06, + "loss": 0.3315, + "step": 17799 + }, + { + "epoch": 2.3170636470128856, + "grad_norm": 2.7776808738708496, + "learning_rate": 1.2775556353998736e-06, + "loss": 0.3006, + "step": 17802 + }, + { + "epoch": 2.3174541194845766, + "grad_norm": 3.288285732269287, + "learning_rate": 1.2761607079428157e-06, + "loss": 0.4505, + "step": 17805 + }, + { + "epoch": 2.317844591956267, + "grad_norm": 2.7252891063690186, + "learning_rate": 1.2747664310326486e-06, + "loss": 0.3552, + "step": 17808 + }, + { + "epoch": 2.3182350644279577, + "grad_norm": 2.534891128540039, + "learning_rate": 1.2733728049129473e-06, + "loss": 0.3113, + "step": 17811 + }, + { + "epoch": 2.3186255368996487, + "grad_norm": 2.530785083770752, + "learning_rate": 1.271979829827179e-06, + "loss": 0.3388, + "step": 17814 + }, + { + "epoch": 2.3190160093713392, + "grad_norm": 2.7788338661193848, + "learning_rate": 1.2705875060186902e-06, + "loss": 0.3419, + "step": 17817 + }, + { + "epoch": 2.31940648184303, + "grad_norm": 2.8521158695220947, + "learning_rate": 1.2691958337307204e-06, + "loss": 0.3486, + "step": 17820 + }, + { + "epoch": 2.3197969543147208, + "grad_norm": 3.0471034049987793, + "learning_rate": 1.267804813206388e-06, + "loss": 0.3333, + "step": 17823 + }, + { + "epoch": 2.3201874267864113, + "grad_norm": 2.7590341567993164, + "learning_rate": 1.266414444688705e-06, + "loss": 0.3046, + "step": 17826 + }, + { + "epoch": 2.3205778992581023, + "grad_norm": 2.6539900302886963, + "learning_rate": 1.2650247284205646e-06, + "loss": 0.3385, + "step": 17829 + }, + { + "epoch": 2.320968371729793, + "grad_norm": 2.5913002490997314, + "learning_rate": 1.263635664644745e-06, + "loss": 0.3055, + "step": 17832 + }, + { + "epoch": 2.321358844201484, + "grad_norm": 2.7210516929626465, + "learning_rate": 1.2622472536039164e-06, + "loss": 0.3651, + "step": 17835 + }, + { + "epoch": 2.3217493166731744, + "grad_norm": 2.820171594619751, + "learning_rate": 1.2608594955406296e-06, + "loss": 0.3353, + "step": 17838 + }, + { + "epoch": 2.3221397891448654, + "grad_norm": 2.9673337936401367, + "learning_rate": 1.2594723906973222e-06, + "loss": 0.401, + "step": 17841 + }, + { + "epoch": 2.322530261616556, + "grad_norm": 2.832606792449951, + "learning_rate": 1.2580859393163185e-06, + "loss": 0.4028, + "step": 17844 + }, + { + "epoch": 2.322920734088247, + "grad_norm": 2.623936653137207, + "learning_rate": 1.2567001416398306e-06, + "loss": 0.3175, + "step": 17847 + }, + { + "epoch": 
2.3233112065599375, + "grad_norm": 2.713925361633301, + "learning_rate": 1.2553149979099533e-06, + "loss": 0.3578, + "step": 17850 + }, + { + "epoch": 2.323701679031628, + "grad_norm": 2.729015827178955, + "learning_rate": 1.2539305083686665e-06, + "loss": 0.3648, + "step": 17853 + }, + { + "epoch": 2.324092151503319, + "grad_norm": 2.9586987495422363, + "learning_rate": 1.2525466732578407e-06, + "loss": 0.3634, + "step": 17856 + }, + { + "epoch": 2.3244826239750096, + "grad_norm": 2.6526246070861816, + "learning_rate": 1.2511634928192262e-06, + "loss": 0.3579, + "step": 17859 + }, + { + "epoch": 2.3248730964467006, + "grad_norm": 2.8939921855926514, + "learning_rate": 1.249780967294465e-06, + "loss": 0.3976, + "step": 17862 + }, + { + "epoch": 2.325263568918391, + "grad_norm": 2.95613169670105, + "learning_rate": 1.2483990969250776e-06, + "loss": 0.3352, + "step": 17865 + }, + { + "epoch": 2.325654041390082, + "grad_norm": 2.9669291973114014, + "learning_rate": 1.247017881952477e-06, + "loss": 0.3695, + "step": 17868 + }, + { + "epoch": 2.3260445138617727, + "grad_norm": 2.924299478530884, + "learning_rate": 1.245637322617958e-06, + "loss": 0.4432, + "step": 17871 + }, + { + "epoch": 2.3264349863334637, + "grad_norm": 2.776327610015869, + "learning_rate": 1.2442574191626988e-06, + "loss": 0.4004, + "step": 17874 + }, + { + "epoch": 2.3268254588051542, + "grad_norm": 2.6248810291290283, + "learning_rate": 1.2428781718277688e-06, + "loss": 0.3616, + "step": 17877 + }, + { + "epoch": 2.327215931276845, + "grad_norm": 2.729130983352661, + "learning_rate": 1.2414995808541186e-06, + "loss": 0.3167, + "step": 17880 + }, + { + "epoch": 2.327606403748536, + "grad_norm": 2.6671786308288574, + "learning_rate": 1.2401216464825838e-06, + "loss": 0.3463, + "step": 17883 + }, + { + "epoch": 2.3279968762202263, + "grad_norm": 2.7406859397888184, + "learning_rate": 1.2387443689538886e-06, + "loss": 0.4256, + "step": 17886 + }, + { + "epoch": 2.3283873486919173, + "grad_norm": 2.8111987113952637, + "learning_rate": 1.23736774850864e-06, + "loss": 0.3992, + "step": 17889 + }, + { + "epoch": 2.328777821163608, + "grad_norm": 2.890775680541992, + "learning_rate": 1.2359917853873304e-06, + "loss": 0.3903, + "step": 17892 + }, + { + "epoch": 2.329168293635299, + "grad_norm": 2.70546817779541, + "learning_rate": 1.2346164798303356e-06, + "loss": 0.3601, + "step": 17895 + }, + { + "epoch": 2.3295587661069894, + "grad_norm": 2.588521957397461, + "learning_rate": 1.2332418320779226e-06, + "loss": 0.3838, + "step": 17898 + }, + { + "epoch": 2.3299492385786804, + "grad_norm": 2.766190767288208, + "learning_rate": 1.2318678423702358e-06, + "loss": 0.362, + "step": 17901 + }, + { + "epoch": 2.330339711050371, + "grad_norm": 2.680659055709839, + "learning_rate": 1.230494510947311e-06, + "loss": 0.4384, + "step": 17904 + }, + { + "epoch": 2.3307301835220615, + "grad_norm": 2.6411266326904297, + "learning_rate": 1.2291218380490644e-06, + "loss": 0.3412, + "step": 17907 + }, + { + "epoch": 2.3311206559937525, + "grad_norm": 2.884023666381836, + "learning_rate": 1.2277498239153007e-06, + "loss": 0.3798, + "step": 17910 + }, + { + "epoch": 2.331511128465443, + "grad_norm": 2.8152637481689453, + "learning_rate": 1.2263784687857078e-06, + "loss": 0.4039, + "step": 17913 + }, + { + "epoch": 2.331901600937134, + "grad_norm": 3.315389394760132, + "learning_rate": 1.225007772899856e-06, + "loss": 0.3791, + "step": 17916 + }, + { + "epoch": 2.3322920734088246, + "grad_norm": 3.0322203636169434, + "learning_rate": 
1.2236377364972063e-06, + "loss": 0.3572, + "step": 17919 + }, + { + "epoch": 2.3326825458805156, + "grad_norm": 2.6254141330718994, + "learning_rate": 1.2222683598171003e-06, + "loss": 0.3524, + "step": 17922 + }, + { + "epoch": 2.333073018352206, + "grad_norm": 2.877248525619507, + "learning_rate": 1.2208996430987625e-06, + "loss": 0.3472, + "step": 17925 + }, + { + "epoch": 2.333463490823897, + "grad_norm": 2.834516763687134, + "learning_rate": 1.2195315865813085e-06, + "loss": 0.4036, + "step": 17928 + }, + { + "epoch": 2.3338539632955877, + "grad_norm": 2.6819605827331543, + "learning_rate": 1.2181641905037338e-06, + "loss": 0.3461, + "step": 17931 + }, + { + "epoch": 2.3342444357672782, + "grad_norm": 2.962890148162842, + "learning_rate": 1.2167974551049177e-06, + "loss": 0.403, + "step": 17934 + }, + { + "epoch": 2.3346349082389692, + "grad_norm": 3.020080089569092, + "learning_rate": 1.2154313806236284e-06, + "loss": 0.4079, + "step": 17937 + }, + { + "epoch": 2.33502538071066, + "grad_norm": 2.56555438041687, + "learning_rate": 1.214065967298516e-06, + "loss": 0.3579, + "step": 17940 + }, + { + "epoch": 2.335415853182351, + "grad_norm": 3.1184959411621094, + "learning_rate": 1.2127012153681128e-06, + "loss": 0.4087, + "step": 17943 + }, + { + "epoch": 2.3358063256540413, + "grad_norm": 2.3602640628814697, + "learning_rate": 1.21133712507084e-06, + "loss": 0.3074, + "step": 17946 + }, + { + "epoch": 2.336196798125732, + "grad_norm": 3.22487735748291, + "learning_rate": 1.2099736966450026e-06, + "loss": 0.3818, + "step": 17949 + }, + { + "epoch": 2.336587270597423, + "grad_norm": 2.820575475692749, + "learning_rate": 1.208610930328788e-06, + "loss": 0.3451, + "step": 17952 + }, + { + "epoch": 2.3369777430691134, + "grad_norm": 2.8845815658569336, + "learning_rate": 1.2072488263602672e-06, + "loss": 0.3954, + "step": 17955 + }, + { + "epoch": 2.3373682155408044, + "grad_norm": 2.554292917251587, + "learning_rate": 1.2058873849773966e-06, + "loss": 0.3808, + "step": 17958 + }, + { + "epoch": 2.337758688012495, + "grad_norm": 2.7079639434814453, + "learning_rate": 1.2045266064180195e-06, + "loss": 0.4351, + "step": 17961 + }, + { + "epoch": 2.338149160484186, + "grad_norm": 2.7546820640563965, + "learning_rate": 1.2031664909198597e-06, + "loss": 0.4072, + "step": 17964 + }, + { + "epoch": 2.3385396329558765, + "grad_norm": 2.8127317428588867, + "learning_rate": 1.2018070387205256e-06, + "loss": 0.3719, + "step": 17967 + }, + { + "epoch": 2.3389301054275675, + "grad_norm": 2.782156229019165, + "learning_rate": 1.2004482500575126e-06, + "loss": 0.3663, + "step": 17970 + }, + { + "epoch": 2.339320577899258, + "grad_norm": 3.034802198410034, + "learning_rate": 1.1990901251681974e-06, + "loss": 0.3969, + "step": 17973 + }, + { + "epoch": 2.3397110503709486, + "grad_norm": 2.7988011837005615, + "learning_rate": 1.1977326642898395e-06, + "loss": 0.3998, + "step": 17976 + }, + { + "epoch": 2.3401015228426396, + "grad_norm": 2.6385715007781982, + "learning_rate": 1.1963758676595883e-06, + "loss": 0.3839, + "step": 17979 + }, + { + "epoch": 2.34049199531433, + "grad_norm": 2.976228713989258, + "learning_rate": 1.195019735514471e-06, + "loss": 0.3575, + "step": 17982 + }, + { + "epoch": 2.340882467786021, + "grad_norm": 2.7459535598754883, + "learning_rate": 1.1936642680914007e-06, + "loss": 0.3541, + "step": 17985 + }, + { + "epoch": 2.3412729402577117, + "grad_norm": 2.916914939880371, + "learning_rate": 1.1923094656271745e-06, + "loss": 0.395, + "step": 17988 + }, + { + "epoch": 
2.3416634127294027, + "grad_norm": 2.652353286743164, + "learning_rate": 1.1909553283584763e-06, + "loss": 0.3298, + "step": 17991 + }, + { + "epoch": 2.3420538852010933, + "grad_norm": 2.817486524581909, + "learning_rate": 1.1896018565218692e-06, + "loss": 0.3436, + "step": 17994 + }, + { + "epoch": 2.3424443576727842, + "grad_norm": 2.9051315784454346, + "learning_rate": 1.1882490503538003e-06, + "loss": 0.3559, + "step": 17997 + }, + { + "epoch": 2.342834830144475, + "grad_norm": 2.805586099624634, + "learning_rate": 1.1868969100906052e-06, + "loss": 0.3098, + "step": 18000 + }, + { + "epoch": 2.3432253026161654, + "grad_norm": 2.8245179653167725, + "learning_rate": 1.1855454359684982e-06, + "loss": 0.3487, + "step": 18003 + }, + { + "epoch": 2.3436157750878563, + "grad_norm": 2.939572811126709, + "learning_rate": 1.1841946282235788e-06, + "loss": 0.3594, + "step": 18006 + }, + { + "epoch": 2.344006247559547, + "grad_norm": 2.8865296840667725, + "learning_rate": 1.1828444870918292e-06, + "loss": 0.3613, + "step": 18009 + }, + { + "epoch": 2.344396720031238, + "grad_norm": 2.665978193283081, + "learning_rate": 1.1814950128091197e-06, + "loss": 0.3966, + "step": 18012 + }, + { + "epoch": 2.3447871925029284, + "grad_norm": 2.8467841148376465, + "learning_rate": 1.180146205611198e-06, + "loss": 0.354, + "step": 18015 + }, + { + "epoch": 2.3451776649746194, + "grad_norm": 2.763833999633789, + "learning_rate": 1.1787980657336967e-06, + "loss": 0.3064, + "step": 18018 + }, + { + "epoch": 2.34556813744631, + "grad_norm": 2.5303876399993896, + "learning_rate": 1.1774505934121361e-06, + "loss": 0.3062, + "step": 18021 + }, + { + "epoch": 2.345958609918001, + "grad_norm": 2.5517313480377197, + "learning_rate": 1.176103788881916e-06, + "loss": 0.3531, + "step": 18024 + }, + { + "epoch": 2.3463490823896915, + "grad_norm": 2.689030885696411, + "learning_rate": 1.1747576523783178e-06, + "loss": 0.3974, + "step": 18027 + }, + { + "epoch": 2.346739554861382, + "grad_norm": 2.534513235092163, + "learning_rate": 1.1734121841365104e-06, + "loss": 0.335, + "step": 18030 + }, + { + "epoch": 2.347130027333073, + "grad_norm": 2.8086178302764893, + "learning_rate": 1.1720673843915465e-06, + "loss": 0.3382, + "step": 18033 + }, + { + "epoch": 2.3475204998047636, + "grad_norm": 2.9617831707000732, + "learning_rate": 1.1707232533783574e-06, + "loss": 0.3986, + "step": 18036 + }, + { + "epoch": 2.3479109722764546, + "grad_norm": 2.74955415725708, + "learning_rate": 1.169379791331759e-06, + "loss": 0.3935, + "step": 18039 + }, + { + "epoch": 2.348301444748145, + "grad_norm": 2.898005962371826, + "learning_rate": 1.1680369984864536e-06, + "loss": 0.5264, + "step": 18042 + }, + { + "epoch": 2.348691917219836, + "grad_norm": 3.2025671005249023, + "learning_rate": 1.1666948750770236e-06, + "loss": 0.3295, + "step": 18045 + }, + { + "epoch": 2.3490823896915267, + "grad_norm": 2.760274887084961, + "learning_rate": 1.1653534213379348e-06, + "loss": 0.4275, + "step": 18048 + }, + { + "epoch": 2.3494728621632177, + "grad_norm": 2.571065664291382, + "learning_rate": 1.1640126375035348e-06, + "loss": 0.4015, + "step": 18051 + }, + { + "epoch": 2.3498633346349083, + "grad_norm": 2.9767799377441406, + "learning_rate": 1.1626725238080593e-06, + "loss": 0.3862, + "step": 18054 + }, + { + "epoch": 2.350253807106599, + "grad_norm": 2.5151243209838867, + "learning_rate": 1.161333080485621e-06, + "loss": 0.3542, + "step": 18057 + }, + { + "epoch": 2.35064427957829, + "grad_norm": 3.4934723377227783, + "learning_rate": 
1.1599943077702163e-06, + "loss": 0.3933, + "step": 18060 + }, + { + "epoch": 2.3510347520499804, + "grad_norm": 2.7796919345855713, + "learning_rate": 1.1586562058957302e-06, + "loss": 0.3271, + "step": 18063 + }, + { + "epoch": 2.3514252245216714, + "grad_norm": 2.8203134536743164, + "learning_rate": 1.157318775095923e-06, + "loss": 0.3522, + "step": 18066 + }, + { + "epoch": 2.351815696993362, + "grad_norm": 3.154359817504883, + "learning_rate": 1.155982015604441e-06, + "loss": 0.2913, + "step": 18069 + }, + { + "epoch": 2.352206169465053, + "grad_norm": 2.685695171356201, + "learning_rate": 1.1546459276548145e-06, + "loss": 0.3054, + "step": 18072 + }, + { + "epoch": 2.3525966419367434, + "grad_norm": 3.0085608959198, + "learning_rate": 1.153310511480456e-06, + "loss": 0.4078, + "step": 18075 + }, + { + "epoch": 2.3529871144084344, + "grad_norm": 3.074324607849121, + "learning_rate": 1.1519757673146586e-06, + "loss": 0.3455, + "step": 18078 + }, + { + "epoch": 2.353377586880125, + "grad_norm": 3.065185546875, + "learning_rate": 1.1506416953905986e-06, + "loss": 0.3692, + "step": 18081 + }, + { + "epoch": 2.3537680593518155, + "grad_norm": 2.8767306804656982, + "learning_rate": 1.1493082959413383e-06, + "loss": 0.3868, + "step": 18084 + }, + { + "epoch": 2.3541585318235065, + "grad_norm": 3.119493007659912, + "learning_rate": 1.1479755691998172e-06, + "loss": 0.3715, + "step": 18087 + }, + { + "epoch": 2.354549004295197, + "grad_norm": 2.8516952991485596, + "learning_rate": 1.1466435153988597e-06, + "loss": 0.3386, + "step": 18090 + }, + { + "epoch": 2.354939476766888, + "grad_norm": 2.712786912918091, + "learning_rate": 1.1453121347711755e-06, + "loss": 0.3865, + "step": 18093 + }, + { + "epoch": 2.3553299492385786, + "grad_norm": 3.0909574031829834, + "learning_rate": 1.1439814275493522e-06, + "loss": 0.3825, + "step": 18096 + }, + { + "epoch": 2.355720421710269, + "grad_norm": 2.726844549179077, + "learning_rate": 1.1426513939658611e-06, + "loss": 0.3344, + "step": 18099 + }, + { + "epoch": 2.35611089418196, + "grad_norm": 2.7704012393951416, + "learning_rate": 1.1413220342530556e-06, + "loss": 0.4041, + "step": 18102 + }, + { + "epoch": 2.3565013666536507, + "grad_norm": 2.8426220417022705, + "learning_rate": 1.1399933486431747e-06, + "loss": 0.3427, + "step": 18105 + }, + { + "epoch": 2.3568918391253417, + "grad_norm": 2.8885271549224854, + "learning_rate": 1.138665337368336e-06, + "loss": 0.4052, + "step": 18108 + }, + { + "epoch": 2.3572823115970323, + "grad_norm": 2.9842803478240967, + "learning_rate": 1.1373380006605378e-06, + "loss": 0.3765, + "step": 18111 + }, + { + "epoch": 2.3576727840687233, + "grad_norm": 2.573024272918701, + "learning_rate": 1.1360113387516654e-06, + "loss": 0.2962, + "step": 18114 + }, + { + "epoch": 2.358063256540414, + "grad_norm": 2.5947093963623047, + "learning_rate": 1.1346853518734845e-06, + "loss": 0.322, + "step": 18117 + }, + { + "epoch": 2.358453729012105, + "grad_norm": 3.0403120517730713, + "learning_rate": 1.133360040257641e-06, + "loss": 0.4033, + "step": 18120 + }, + { + "epoch": 2.3588442014837954, + "grad_norm": 2.958102226257324, + "learning_rate": 1.1320354041356636e-06, + "loss": 0.3175, + "step": 18123 + }, + { + "epoch": 2.359234673955486, + "grad_norm": 2.5290329456329346, + "learning_rate": 1.1307114437389648e-06, + "loss": 0.3399, + "step": 18126 + }, + { + "epoch": 2.359625146427177, + "grad_norm": 2.7605209350585938, + "learning_rate": 1.1293881592988375e-06, + "loss": 0.3571, + "step": 18129 + }, + { + "epoch": 
2.3600156188988675, + "grad_norm": 3.629070997238159, + "learning_rate": 1.128065551046455e-06, + "loss": 0.3737, + "step": 18132 + }, + { + "epoch": 2.3604060913705585, + "grad_norm": 3.1546218395233154, + "learning_rate": 1.1267436192128762e-06, + "loss": 0.3364, + "step": 18135 + }, + { + "epoch": 2.360796563842249, + "grad_norm": 2.7438509464263916, + "learning_rate": 1.1254223640290391e-06, + "loss": 0.322, + "step": 18138 + }, + { + "epoch": 2.36118703631394, + "grad_norm": 2.711299419403076, + "learning_rate": 1.1241017857257624e-06, + "loss": 0.3045, + "step": 18141 + }, + { + "epoch": 2.3615775087856306, + "grad_norm": 3.051964282989502, + "learning_rate": 1.1227818845337523e-06, + "loss": 0.3652, + "step": 18144 + }, + { + "epoch": 2.3619679812573215, + "grad_norm": 2.6929006576538086, + "learning_rate": 1.12146266068359e-06, + "loss": 0.3793, + "step": 18147 + }, + { + "epoch": 2.362358453729012, + "grad_norm": 2.7894413471221924, + "learning_rate": 1.1201441144057413e-06, + "loss": 0.3304, + "step": 18150 + }, + { + "epoch": 2.3627489262007026, + "grad_norm": 2.5840792655944824, + "learning_rate": 1.1188262459305515e-06, + "loss": 0.3482, + "step": 18153 + }, + { + "epoch": 2.3631393986723936, + "grad_norm": 3.0223450660705566, + "learning_rate": 1.117509055488254e-06, + "loss": 0.4019, + "step": 18156 + }, + { + "epoch": 2.363529871144084, + "grad_norm": 2.9108870029449463, + "learning_rate": 1.1161925433089578e-06, + "loss": 0.3457, + "step": 18159 + }, + { + "epoch": 2.363920343615775, + "grad_norm": 2.797194480895996, + "learning_rate": 1.114876709622653e-06, + "loss": 0.3378, + "step": 18162 + }, + { + "epoch": 2.3643108160874657, + "grad_norm": 2.627267599105835, + "learning_rate": 1.1135615546592132e-06, + "loss": 0.3834, + "step": 18165 + }, + { + "epoch": 2.3647012885591567, + "grad_norm": 2.9231019020080566, + "learning_rate": 1.1122470786483946e-06, + "loss": 0.3934, + "step": 18168 + }, + { + "epoch": 2.3650917610308473, + "grad_norm": 2.57883358001709, + "learning_rate": 1.1109332818198338e-06, + "loss": 0.4163, + "step": 18171 + }, + { + "epoch": 2.3654822335025383, + "grad_norm": 3.0240585803985596, + "learning_rate": 1.1096201644030446e-06, + "loss": 0.371, + "step": 18174 + }, + { + "epoch": 2.365872705974229, + "grad_norm": 2.887322425842285, + "learning_rate": 1.108307726627431e-06, + "loss": 0.3658, + "step": 18177 + }, + { + "epoch": 2.3662631784459194, + "grad_norm": 2.607095956802368, + "learning_rate": 1.1069959687222704e-06, + "loss": 0.3982, + "step": 18180 + }, + { + "epoch": 2.3666536509176104, + "grad_norm": 3.5808451175689697, + "learning_rate": 1.1056848909167223e-06, + "loss": 0.3628, + "step": 18183 + }, + { + "epoch": 2.367044123389301, + "grad_norm": 3.329209566116333, + "learning_rate": 1.1043744934398332e-06, + "loss": 0.3591, + "step": 18186 + }, + { + "epoch": 2.367434595860992, + "grad_norm": 2.737271308898926, + "learning_rate": 1.1030647765205248e-06, + "loss": 0.3535, + "step": 18189 + }, + { + "epoch": 2.3678250683326825, + "grad_norm": 2.989947557449341, + "learning_rate": 1.1017557403876012e-06, + "loss": 0.3633, + "step": 18192 + }, + { + "epoch": 2.3682155408043735, + "grad_norm": 2.9825870990753174, + "learning_rate": 1.1004473852697484e-06, + "loss": 0.3497, + "step": 18195 + }, + { + "epoch": 2.368606013276064, + "grad_norm": 3.08147931098938, + "learning_rate": 1.0991397113955355e-06, + "loss": 0.371, + "step": 18198 + }, + { + "epoch": 2.368996485747755, + "grad_norm": 2.5311295986175537, + "learning_rate": 
1.0978327189934085e-06, + "loss": 0.2979, + "step": 18201 + }, + { + "epoch": 2.3693869582194456, + "grad_norm": 2.984403371810913, + "learning_rate": 1.0965264082916954e-06, + "loss": 0.388, + "step": 18204 + }, + { + "epoch": 2.369777430691136, + "grad_norm": 2.8916494846343994, + "learning_rate": 1.0952207795186086e-06, + "loss": 0.3466, + "step": 18207 + }, + { + "epoch": 2.370167903162827, + "grad_norm": 2.8561973571777344, + "learning_rate": 1.0939158329022371e-06, + "loss": 0.3492, + "step": 18210 + }, + { + "epoch": 2.3705583756345177, + "grad_norm": 2.66711163520813, + "learning_rate": 1.0926115686705523e-06, + "loss": 0.3246, + "step": 18213 + }, + { + "epoch": 2.3709488481062087, + "grad_norm": 2.8468332290649414, + "learning_rate": 1.0913079870514055e-06, + "loss": 0.4033, + "step": 18216 + }, + { + "epoch": 2.371339320577899, + "grad_norm": 2.7105188369750977, + "learning_rate": 1.0900050882725316e-06, + "loss": 0.3426, + "step": 18219 + }, + { + "epoch": 2.37172979304959, + "grad_norm": 3.1098861694335938, + "learning_rate": 1.0887028725615433e-06, + "loss": 0.3738, + "step": 18222 + }, + { + "epoch": 2.3721202655212807, + "grad_norm": 2.924067258834839, + "learning_rate": 1.0874013401459338e-06, + "loss": 0.3517, + "step": 18225 + }, + { + "epoch": 2.3725107379929717, + "grad_norm": 2.9183595180511475, + "learning_rate": 1.0861004912530804e-06, + "loss": 0.3759, + "step": 18228 + }, + { + "epoch": 2.3729012104646623, + "grad_norm": 2.9503722190856934, + "learning_rate": 1.084800326110238e-06, + "loss": 0.3523, + "step": 18231 + }, + { + "epoch": 2.373291682936353, + "grad_norm": 2.875462770462036, + "learning_rate": 1.0835008449445406e-06, + "loss": 0.33, + "step": 18234 + }, + { + "epoch": 2.373682155408044, + "grad_norm": 2.600781202316284, + "learning_rate": 1.0822020479830064e-06, + "loss": 0.3591, + "step": 18237 + }, + { + "epoch": 2.3740726278797344, + "grad_norm": 2.702268123626709, + "learning_rate": 1.0809039354525342e-06, + "loss": 0.3282, + "step": 18240 + }, + { + "epoch": 2.3744631003514254, + "grad_norm": 2.848515748977661, + "learning_rate": 1.0796065075798995e-06, + "loss": 0.2933, + "step": 18243 + }, + { + "epoch": 2.374853572823116, + "grad_norm": 2.4967212677001953, + "learning_rate": 1.0783097645917594e-06, + "loss": 0.3887, + "step": 18246 + }, + { + "epoch": 2.3752440452948065, + "grad_norm": 2.6020994186401367, + "learning_rate": 1.0770137067146552e-06, + "loss": 0.3011, + "step": 18249 + }, + { + "epoch": 2.3756345177664975, + "grad_norm": 2.908611297607422, + "learning_rate": 1.0757183341750033e-06, + "loss": 0.3948, + "step": 18252 + }, + { + "epoch": 2.376024990238188, + "grad_norm": 3.1716883182525635, + "learning_rate": 1.0744236471991016e-06, + "loss": 0.3868, + "step": 18255 + }, + { + "epoch": 2.376415462709879, + "grad_norm": 2.6089026927948, + "learning_rate": 1.0731296460131319e-06, + "loss": 0.3903, + "step": 18258 + }, + { + "epoch": 2.3768059351815696, + "grad_norm": 2.7589609622955322, + "learning_rate": 1.0718363308431524e-06, + "loss": 0.3843, + "step": 18261 + }, + { + "epoch": 2.3771964076532606, + "grad_norm": 2.489070415496826, + "learning_rate": 1.0705437019151016e-06, + "loss": 0.3683, + "step": 18264 + }, + { + "epoch": 2.377586880124951, + "grad_norm": 2.6437742710113525, + "learning_rate": 1.069251759454799e-06, + "loss": 0.4111, + "step": 18267 + }, + { + "epoch": 2.377977352596642, + "grad_norm": 2.870964288711548, + "learning_rate": 1.067960503687946e-06, + "loss": 0.3937, + "step": 18270 + }, + { + "epoch": 
2.3783678250683327, + "grad_norm": 2.932481527328491, + "learning_rate": 1.066669934840121e-06, + "loss": 0.3452, + "step": 18273 + }, + { + "epoch": 2.378758297540023, + "grad_norm": 2.969200611114502, + "learning_rate": 1.065380053136783e-06, + "loss": 0.3703, + "step": 18276 + }, + { + "epoch": 2.379148770011714, + "grad_norm": 3.3855032920837402, + "learning_rate": 1.0640908588032722e-06, + "loss": 0.3405, + "step": 18279 + }, + { + "epoch": 2.3795392424834048, + "grad_norm": 2.7585716247558594, + "learning_rate": 1.0628023520648102e-06, + "loss": 0.3448, + "step": 18282 + }, + { + "epoch": 2.3799297149550958, + "grad_norm": 2.9133167266845703, + "learning_rate": 1.0615145331464937e-06, + "loss": 0.3375, + "step": 18285 + }, + { + "epoch": 2.3803201874267863, + "grad_norm": 2.9258482456207275, + "learning_rate": 1.0602274022733023e-06, + "loss": 0.4111, + "step": 18288 + }, + { + "epoch": 2.3807106598984773, + "grad_norm": 2.8225510120391846, + "learning_rate": 1.0589409596700966e-06, + "loss": 0.3206, + "step": 18291 + }, + { + "epoch": 2.381101132370168, + "grad_norm": 2.6298437118530273, + "learning_rate": 1.0576552055616151e-06, + "loss": 0.3135, + "step": 18294 + }, + { + "epoch": 2.381491604841859, + "grad_norm": 2.665774345397949, + "learning_rate": 1.0563701401724735e-06, + "loss": 0.4122, + "step": 18297 + }, + { + "epoch": 2.3818820773135494, + "grad_norm": 2.916292190551758, + "learning_rate": 1.0550857637271744e-06, + "loss": 0.4741, + "step": 18300 + }, + { + "epoch": 2.38227254978524, + "grad_norm": 2.903151512145996, + "learning_rate": 1.0538020764500929e-06, + "loss": 0.3216, + "step": 18303 + }, + { + "epoch": 2.382663022256931, + "grad_norm": 2.8354744911193848, + "learning_rate": 1.052519078565486e-06, + "loss": 0.3428, + "step": 18306 + }, + { + "epoch": 2.3830534947286215, + "grad_norm": 2.986859083175659, + "learning_rate": 1.051236770297493e-06, + "loss": 0.3477, + "step": 18309 + }, + { + "epoch": 2.3834439672003125, + "grad_norm": 2.7683563232421875, + "learning_rate": 1.0499551518701296e-06, + "loss": 0.3927, + "step": 18312 + }, + { + "epoch": 2.383834439672003, + "grad_norm": 2.669161319732666, + "learning_rate": 1.048674223507291e-06, + "loss": 0.3973, + "step": 18315 + }, + { + "epoch": 2.384224912143694, + "grad_norm": 2.9369685649871826, + "learning_rate": 1.047393985432752e-06, + "loss": 0.3683, + "step": 18318 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 2.722029209136963, + "learning_rate": 1.0461144378701688e-06, + "loss": 0.2915, + "step": 18321 + }, + { + "epoch": 2.3850058570870756, + "grad_norm": 2.844775438308716, + "learning_rate": 1.0448355810430766e-06, + "loss": 0.3954, + "step": 18324 + }, + { + "epoch": 2.385396329558766, + "grad_norm": 2.715589761734009, + "learning_rate": 1.0435574151748878e-06, + "loss": 0.3586, + "step": 18327 + }, + { + "epoch": 2.3857868020304567, + "grad_norm": 3.002556562423706, + "learning_rate": 1.0422799404888945e-06, + "loss": 0.4168, + "step": 18330 + }, + { + "epoch": 2.3861772745021477, + "grad_norm": 2.758995532989502, + "learning_rate": 1.0410031572082712e-06, + "loss": 0.3743, + "step": 18333 + }, + { + "epoch": 2.3865677469738382, + "grad_norm": 2.8067500591278076, + "learning_rate": 1.0397270655560676e-06, + "loss": 0.3647, + "step": 18336 + }, + { + "epoch": 2.386958219445529, + "grad_norm": 2.654280185699463, + "learning_rate": 1.0384516657552129e-06, + "loss": 0.3467, + "step": 18339 + }, + { + "epoch": 2.3873486919172198, + "grad_norm": 2.651869535446167, + "learning_rate": 
1.03717695802852e-06, + "loss": 0.3492, + "step": 18342 + }, + { + "epoch": 2.3877391643889108, + "grad_norm": 2.8447647094726562, + "learning_rate": 1.035902942598676e-06, + "loss": 0.3398, + "step": 18345 + }, + { + "epoch": 2.3881296368606013, + "grad_norm": 2.987757444381714, + "learning_rate": 1.034629619688247e-06, + "loss": 0.441, + "step": 18348 + }, + { + "epoch": 2.3885201093322923, + "grad_norm": 2.6804354190826416, + "learning_rate": 1.0333569895196832e-06, + "loss": 0.3859, + "step": 18351 + }, + { + "epoch": 2.388910581803983, + "grad_norm": 2.683786153793335, + "learning_rate": 1.0320850523153087e-06, + "loss": 0.3567, + "step": 18354 + }, + { + "epoch": 2.3893010542756734, + "grad_norm": 2.5274264812469482, + "learning_rate": 1.0308138082973285e-06, + "loss": 0.2944, + "step": 18357 + }, + { + "epoch": 2.3896915267473644, + "grad_norm": 2.8091444969177246, + "learning_rate": 1.0295432576878246e-06, + "loss": 0.343, + "step": 18360 + }, + { + "epoch": 2.390081999219055, + "grad_norm": 2.602357864379883, + "learning_rate": 1.0282734007087601e-06, + "loss": 0.312, + "step": 18363 + }, + { + "epoch": 2.390472471690746, + "grad_norm": 2.9505271911621094, + "learning_rate": 1.0270042375819795e-06, + "loss": 0.3419, + "step": 18366 + }, + { + "epoch": 2.3908629441624365, + "grad_norm": 2.739809989929199, + "learning_rate": 1.025735768529199e-06, + "loss": 0.3484, + "step": 18369 + }, + { + "epoch": 2.391253416634127, + "grad_norm": 2.7395405769348145, + "learning_rate": 1.0244679937720203e-06, + "loss": 0.3406, + "step": 18372 + }, + { + "epoch": 2.391643889105818, + "grad_norm": 3.019829034805298, + "learning_rate": 1.0232009135319198e-06, + "loss": 0.4535, + "step": 18375 + }, + { + "epoch": 2.3920343615775086, + "grad_norm": 2.979292631149292, + "learning_rate": 1.021934528030254e-06, + "loss": 0.4343, + "step": 18378 + }, + { + "epoch": 2.3924248340491996, + "grad_norm": 2.9323439598083496, + "learning_rate": 1.0206688374882562e-06, + "loss": 0.3784, + "step": 18381 + }, + { + "epoch": 2.39281530652089, + "grad_norm": 2.577038049697876, + "learning_rate": 1.0194038421270426e-06, + "loss": 0.3109, + "step": 18384 + }, + { + "epoch": 2.393205778992581, + "grad_norm": 3.026864767074585, + "learning_rate": 1.0181395421676038e-06, + "loss": 0.3493, + "step": 18387 + }, + { + "epoch": 2.3935962514642717, + "grad_norm": 2.9030680656433105, + "learning_rate": 1.0168759378308085e-06, + "loss": 0.348, + "step": 18390 + }, + { + "epoch": 2.3939867239359627, + "grad_norm": 2.8374221324920654, + "learning_rate": 1.0156130293374094e-06, + "loss": 0.3319, + "step": 18393 + }, + { + "epoch": 2.3943771964076532, + "grad_norm": 2.8880085945129395, + "learning_rate": 1.0143508169080323e-06, + "loss": 0.3814, + "step": 18396 + }, + { + "epoch": 2.394767668879344, + "grad_norm": 2.656985282897949, + "learning_rate": 1.013089300763181e-06, + "loss": 0.371, + "step": 18399 + }, + { + "epoch": 2.3951581413510348, + "grad_norm": 2.770444631576538, + "learning_rate": 1.0118284811232432e-06, + "loss": 0.3254, + "step": 18402 + }, + { + "epoch": 2.3955486138227253, + "grad_norm": 2.928893804550171, + "learning_rate": 1.010568358208479e-06, + "loss": 0.3686, + "step": 18405 + }, + { + "epoch": 2.3959390862944163, + "grad_norm": 3.908521890640259, + "learning_rate": 1.009308932239031e-06, + "loss": 0.3932, + "step": 18408 + }, + { + "epoch": 2.396329558766107, + "grad_norm": 2.9738271236419678, + "learning_rate": 1.008050203434916e-06, + "loss": 0.3626, + "step": 18411 + }, + { + "epoch": 
2.396720031237798, + "grad_norm": 3.0634217262268066, + "learning_rate": 1.006792172016034e-06, + "loss": 0.3842, + "step": 18414 + }, + { + "epoch": 2.3971105037094884, + "grad_norm": 2.783146858215332, + "learning_rate": 1.005534838202159e-06, + "loss": 0.3737, + "step": 18417 + }, + { + "epoch": 2.3975009761811794, + "grad_norm": 2.868107795715332, + "learning_rate": 1.004278202212945e-06, + "loss": 0.3692, + "step": 18420 + }, + { + "epoch": 2.39789144865287, + "grad_norm": 3.5386221408843994, + "learning_rate": 1.0030222642679217e-06, + "loss": 0.3371, + "step": 18423 + }, + { + "epoch": 2.3982819211245605, + "grad_norm": 2.8862059116363525, + "learning_rate": 1.0017670245865014e-06, + "loss": 0.3265, + "step": 18426 + }, + { + "epoch": 2.3986723935962515, + "grad_norm": 2.635298490524292, + "learning_rate": 1.0005124833879714e-06, + "loss": 0.329, + "step": 18429 + }, + { + "epoch": 2.399062866067942, + "grad_norm": 2.8082430362701416, + "learning_rate": 9.99258640891495e-07, + "loss": 0.3584, + "step": 18432 + }, + { + "epoch": 2.399453338539633, + "grad_norm": 3.525629997253418, + "learning_rate": 9.980054973161196e-07, + "loss": 0.381, + "step": 18435 + }, + { + "epoch": 2.3998438110113236, + "grad_norm": 3.2471086978912354, + "learning_rate": 9.967530528807644e-07, + "loss": 0.3784, + "step": 18438 + }, + { + "epoch": 2.4002342834830146, + "grad_norm": 3.815002202987671, + "learning_rate": 9.95501307804228e-07, + "loss": 0.3329, + "step": 18441 + }, + { + "epoch": 2.400624755954705, + "grad_norm": 3.0479953289031982, + "learning_rate": 9.942502623051908e-07, + "loss": 0.3522, + "step": 18444 + }, + { + "epoch": 2.401015228426396, + "grad_norm": 2.807485818862915, + "learning_rate": 9.929999166022042e-07, + "loss": 0.3269, + "step": 18447 + }, + { + "epoch": 2.4014057008980867, + "grad_norm": 2.907914400100708, + "learning_rate": 9.91750270913704e-07, + "loss": 0.3308, + "step": 18450 + }, + { + "epoch": 2.4017961733697772, + "grad_norm": 2.5052411556243896, + "learning_rate": 9.905013254579976e-07, + "loss": 0.3099, + "step": 18453 + }, + { + "epoch": 2.4021866458414682, + "grad_norm": 2.7181320190429688, + "learning_rate": 9.892530804532768e-07, + "loss": 0.3439, + "step": 18456 + }, + { + "epoch": 2.402577118313159, + "grad_norm": 3.0974040031433105, + "learning_rate": 9.880055361176049e-07, + "loss": 0.4185, + "step": 18459 + }, + { + "epoch": 2.40296759078485, + "grad_norm": 3.3100745677948, + "learning_rate": 9.867586926689249e-07, + "loss": 0.3995, + "step": 18462 + }, + { + "epoch": 2.4033580632565403, + "grad_norm": 2.790778160095215, + "learning_rate": 9.8551255032506e-07, + "loss": 0.3481, + "step": 18465 + }, + { + "epoch": 2.4037485357282313, + "grad_norm": 2.46254563331604, + "learning_rate": 9.842671093037075e-07, + "loss": 0.3661, + "step": 18468 + }, + { + "epoch": 2.404139008199922, + "grad_norm": 2.7810397148132324, + "learning_rate": 9.830223698224428e-07, + "loss": 0.365, + "step": 18471 + }, + { + "epoch": 2.404529480671613, + "grad_norm": 2.764301300048828, + "learning_rate": 9.817783320987183e-07, + "loss": 0.3454, + "step": 18474 + }, + { + "epoch": 2.4049199531433034, + "grad_norm": 3.241234302520752, + "learning_rate": 9.805349963498672e-07, + "loss": 0.3636, + "step": 18477 + }, + { + "epoch": 2.405310425614994, + "grad_norm": 2.733487844467163, + "learning_rate": 9.792923627930972e-07, + "loss": 0.3371, + "step": 18480 + }, + { + "epoch": 2.405700898086685, + "grad_norm": 2.4645349979400635, + "learning_rate": 9.780504316454915e-07, + "loss": 0.351, + 
"step": 18483 + }, + { + "epoch": 2.4060913705583755, + "grad_norm": 2.7485122680664062, + "learning_rate": 9.768092031240155e-07, + "loss": 0.399, + "step": 18486 + }, + { + "epoch": 2.4064818430300665, + "grad_norm": 2.626877784729004, + "learning_rate": 9.75568677445507e-07, + "loss": 0.3811, + "step": 18489 + }, + { + "epoch": 2.406872315501757, + "grad_norm": 2.7127082347869873, + "learning_rate": 9.743288548266855e-07, + "loss": 0.3375, + "step": 18492 + }, + { + "epoch": 2.407262787973448, + "grad_norm": 2.722588300704956, + "learning_rate": 9.730897354841435e-07, + "loss": 0.3514, + "step": 18495 + }, + { + "epoch": 2.4076532604451386, + "grad_norm": 3.016986846923828, + "learning_rate": 9.718513196343539e-07, + "loss": 0.3149, + "step": 18498 + }, + { + "epoch": 2.4080437329168296, + "grad_norm": 2.604357957839966, + "learning_rate": 9.70613607493665e-07, + "loss": 0.3478, + "step": 18501 + }, + { + "epoch": 2.40843420538852, + "grad_norm": 2.603797435760498, + "learning_rate": 9.693765992783017e-07, + "loss": 0.309, + "step": 18504 + }, + { + "epoch": 2.4088246778602107, + "grad_norm": 2.9746592044830322, + "learning_rate": 9.681402952043677e-07, + "loss": 0.3852, + "step": 18507 + }, + { + "epoch": 2.4092151503319017, + "grad_norm": 3.175025224685669, + "learning_rate": 9.669046954878425e-07, + "loss": 0.3769, + "step": 18510 + }, + { + "epoch": 2.4096056228035923, + "grad_norm": 2.841076374053955, + "learning_rate": 9.65669800344582e-07, + "loss": 0.3367, + "step": 18513 + }, + { + "epoch": 2.4099960952752832, + "grad_norm": 2.7390999794006348, + "learning_rate": 9.644356099903208e-07, + "loss": 0.3437, + "step": 18516 + }, + { + "epoch": 2.410386567746974, + "grad_norm": 2.828277349472046, + "learning_rate": 9.632021246406693e-07, + "loss": 0.3318, + "step": 18519 + }, + { + "epoch": 2.4107770402186643, + "grad_norm": 2.4388415813446045, + "learning_rate": 9.619693445111145e-07, + "loss": 0.344, + "step": 18522 + }, + { + "epoch": 2.4111675126903553, + "grad_norm": 2.6832144260406494, + "learning_rate": 9.607372698170191e-07, + "loss": 0.3834, + "step": 18525 + }, + { + "epoch": 2.411557985162046, + "grad_norm": 2.858412742614746, + "learning_rate": 9.595059007736268e-07, + "loss": 0.3879, + "step": 18528 + }, + { + "epoch": 2.411948457633737, + "grad_norm": 2.7546768188476562, + "learning_rate": 9.582752375960519e-07, + "loss": 0.3568, + "step": 18531 + }, + { + "epoch": 2.4123389301054274, + "grad_norm": 2.633880853652954, + "learning_rate": 9.570452804992925e-07, + "loss": 0.3444, + "step": 18534 + }, + { + "epoch": 2.4127294025771184, + "grad_norm": 2.5289742946624756, + "learning_rate": 9.558160296982154e-07, + "loss": 0.346, + "step": 18537 + }, + { + "epoch": 2.413119875048809, + "grad_norm": 2.8704309463500977, + "learning_rate": 9.54587485407572e-07, + "loss": 0.3572, + "step": 18540 + }, + { + "epoch": 2.4135103475205, + "grad_norm": 2.7778470516204834, + "learning_rate": 9.533596478419843e-07, + "loss": 0.417, + "step": 18543 + }, + { + "epoch": 2.4139008199921905, + "grad_norm": 2.997497320175171, + "learning_rate": 9.521325172159518e-07, + "loss": 0.3553, + "step": 18546 + }, + { + "epoch": 2.414291292463881, + "grad_norm": 3.075047016143799, + "learning_rate": 9.509060937438546e-07, + "loss": 0.3914, + "step": 18549 + }, + { + "epoch": 2.414681764935572, + "grad_norm": 2.607013463973999, + "learning_rate": 9.496803776399449e-07, + "loss": 0.3676, + "step": 18552 + }, + { + "epoch": 2.4150722374072626, + "grad_norm": 2.543388843536377, + "learning_rate": 
9.484553691183512e-07, + "loss": 0.3634, + "step": 18555 + }, + { + "epoch": 2.4154627098789536, + "grad_norm": 2.790480852127075, + "learning_rate": 9.472310683930824e-07, + "loss": 0.4357, + "step": 18558 + }, + { + "epoch": 2.415853182350644, + "grad_norm": 2.679603338241577, + "learning_rate": 9.460074756780202e-07, + "loss": 0.3298, + "step": 18561 + }, + { + "epoch": 2.416243654822335, + "grad_norm": 2.8844940662384033, + "learning_rate": 9.44784591186923e-07, + "loss": 0.3547, + "step": 18564 + }, + { + "epoch": 2.4166341272940257, + "grad_norm": 2.9198670387268066, + "learning_rate": 9.435624151334272e-07, + "loss": 0.3595, + "step": 18567 + }, + { + "epoch": 2.4170245997657167, + "grad_norm": 2.825845956802368, + "learning_rate": 9.423409477310446e-07, + "loss": 0.3814, + "step": 18570 + }, + { + "epoch": 2.4174150722374073, + "grad_norm": 2.549248456954956, + "learning_rate": 9.411201891931609e-07, + "loss": 0.3238, + "step": 18573 + }, + { + "epoch": 2.417805544709098, + "grad_norm": 3.1343023777008057, + "learning_rate": 9.399001397330415e-07, + "loss": 0.4215, + "step": 18576 + }, + { + "epoch": 2.418196017180789, + "grad_norm": 2.932382822036743, + "learning_rate": 9.386807995638275e-07, + "loss": 0.3348, + "step": 18579 + }, + { + "epoch": 2.4185864896524794, + "grad_norm": 2.6527950763702393, + "learning_rate": 9.374621688985341e-07, + "loss": 0.3421, + "step": 18582 + }, + { + "epoch": 2.4189769621241703, + "grad_norm": 2.6357645988464355, + "learning_rate": 9.362442479500539e-07, + "loss": 0.33, + "step": 18585 + }, + { + "epoch": 2.419367434595861, + "grad_norm": 2.9315764904022217, + "learning_rate": 9.350270369311531e-07, + "loss": 0.3976, + "step": 18588 + }, + { + "epoch": 2.419757907067552, + "grad_norm": 3.037123441696167, + "learning_rate": 9.338105360544786e-07, + "loss": 0.3024, + "step": 18591 + }, + { + "epoch": 2.4201483795392424, + "grad_norm": 2.859992742538452, + "learning_rate": 9.325947455325496e-07, + "loss": 0.4043, + "step": 18594 + }, + { + "epoch": 2.4205388520109334, + "grad_norm": 2.6656060218811035, + "learning_rate": 9.313796655777613e-07, + "loss": 0.3422, + "step": 18597 + }, + { + "epoch": 2.420929324482624, + "grad_norm": 2.7755658626556396, + "learning_rate": 9.301652964023866e-07, + "loss": 0.3565, + "step": 18600 + }, + { + "epoch": 2.4213197969543145, + "grad_norm": 2.9836435317993164, + "learning_rate": 9.289516382185737e-07, + "loss": 0.3223, + "step": 18603 + }, + { + "epoch": 2.4217102694260055, + "grad_norm": 2.6815292835235596, + "learning_rate": 9.277386912383435e-07, + "loss": 0.403, + "step": 18606 + }, + { + "epoch": 2.422100741897696, + "grad_norm": 3.0170223712921143, + "learning_rate": 9.265264556735987e-07, + "loss": 0.2842, + "step": 18609 + }, + { + "epoch": 2.422491214369387, + "grad_norm": 2.64164662361145, + "learning_rate": 9.253149317361126e-07, + "loss": 0.3211, + "step": 18612 + }, + { + "epoch": 2.4228816868410776, + "grad_norm": 2.9962964057922363, + "learning_rate": 9.24104119637535e-07, + "loss": 0.3325, + "step": 18615 + }, + { + "epoch": 2.4232721593127686, + "grad_norm": 2.4860761165618896, + "learning_rate": 9.228940195893932e-07, + "loss": 0.3338, + "step": 18618 + }, + { + "epoch": 2.423662631784459, + "grad_norm": 2.7918808460235596, + "learning_rate": 9.216846318030908e-07, + "loss": 0.3377, + "step": 18621 + }, + { + "epoch": 2.42405310425615, + "grad_norm": 3.0834054946899414, + "learning_rate": 9.204759564899029e-07, + "loss": 0.3111, + "step": 18624 + }, + { + "epoch": 2.4244435767278407, + 
"grad_norm": 3.3791403770446777, + "learning_rate": 9.192679938609827e-07, + "loss": 0.3618, + "step": 18627 + }, + { + "epoch": 2.4248340491995313, + "grad_norm": 2.551917314529419, + "learning_rate": 9.180607441273604e-07, + "loss": 0.368, + "step": 18630 + }, + { + "epoch": 2.4252245216712223, + "grad_norm": 2.7034595012664795, + "learning_rate": 9.168542074999392e-07, + "loss": 0.3756, + "step": 18633 + }, + { + "epoch": 2.425614994142913, + "grad_norm": 2.543304443359375, + "learning_rate": 9.15648384189498e-07, + "loss": 0.3233, + "step": 18636 + }, + { + "epoch": 2.426005466614604, + "grad_norm": 3.1651837825775146, + "learning_rate": 9.144432744066905e-07, + "loss": 0.3206, + "step": 18639 + }, + { + "epoch": 2.4263959390862944, + "grad_norm": 2.559799909591675, + "learning_rate": 9.132388783620499e-07, + "loss": 0.3498, + "step": 18642 + }, + { + "epoch": 2.426786411557985, + "grad_norm": 2.8475115299224854, + "learning_rate": 9.120351962659796e-07, + "loss": 0.346, + "step": 18645 + }, + { + "epoch": 2.427176884029676, + "grad_norm": 2.8560378551483154, + "learning_rate": 9.108322283287596e-07, + "loss": 0.3304, + "step": 18648 + }, + { + "epoch": 2.427567356501367, + "grad_norm": 2.9788451194763184, + "learning_rate": 9.096299747605481e-07, + "loss": 0.3449, + "step": 18651 + }, + { + "epoch": 2.4279578289730575, + "grad_norm": 2.8146660327911377, + "learning_rate": 9.084284357713752e-07, + "loss": 0.4016, + "step": 18654 + }, + { + "epoch": 2.428348301444748, + "grad_norm": 2.616931200027466, + "learning_rate": 9.072276115711459e-07, + "loss": 0.3531, + "step": 18657 + }, + { + "epoch": 2.428738773916439, + "grad_norm": 2.9515371322631836, + "learning_rate": 9.06027502369643e-07, + "loss": 0.3594, + "step": 18660 + }, + { + "epoch": 2.4291292463881295, + "grad_norm": 2.725242853164673, + "learning_rate": 9.048281083765243e-07, + "loss": 0.3437, + "step": 18663 + }, + { + "epoch": 2.4295197188598205, + "grad_norm": 3.099229574203491, + "learning_rate": 9.036294298013199e-07, + "loss": 0.4032, + "step": 18666 + }, + { + "epoch": 2.429910191331511, + "grad_norm": 2.691160202026367, + "learning_rate": 9.024314668534356e-07, + "loss": 0.3516, + "step": 18669 + }, + { + "epoch": 2.4303006638032016, + "grad_norm": 2.8280410766601562, + "learning_rate": 9.012342197421548e-07, + "loss": 0.3906, + "step": 18672 + }, + { + "epoch": 2.4306911362748926, + "grad_norm": 2.8767683506011963, + "learning_rate": 9.000376886766337e-07, + "loss": 0.3625, + "step": 18675 + }, + { + "epoch": 2.431081608746583, + "grad_norm": 2.8653829097747803, + "learning_rate": 8.988418738659016e-07, + "loss": 0.3302, + "step": 18678 + }, + { + "epoch": 2.431472081218274, + "grad_norm": 2.558035135269165, + "learning_rate": 8.976467755188684e-07, + "loss": 0.3544, + "step": 18681 + }, + { + "epoch": 2.4318625536899647, + "grad_norm": 2.6624937057495117, + "learning_rate": 8.964523938443131e-07, + "loss": 0.3324, + "step": 18684 + }, + { + "epoch": 2.4322530261616557, + "grad_norm": 2.9672672748565674, + "learning_rate": 8.952587290508919e-07, + "loss": 0.3928, + "step": 18687 + }, + { + "epoch": 2.4326434986333463, + "grad_norm": 2.827082395553589, + "learning_rate": 8.940657813471349e-07, + "loss": 0.3936, + "step": 18690 + }, + { + "epoch": 2.4330339711050373, + "grad_norm": 2.829801321029663, + "learning_rate": 8.928735509414488e-07, + "loss": 0.3735, + "step": 18693 + }, + { + "epoch": 2.433424443576728, + "grad_norm": 2.6571168899536133, + "learning_rate": 8.916820380421138e-07, + "loss": 0.3167, + "step": 
18696 + }, + { + "epoch": 2.4338149160484184, + "grad_norm": 3.000652551651001, + "learning_rate": 8.904912428572827e-07, + "loss": 0.3927, + "step": 18699 + }, + { + "epoch": 2.4342053885201094, + "grad_norm": 3.0371198654174805, + "learning_rate": 8.893011655949862e-07, + "loss": 0.4359, + "step": 18702 + }, + { + "epoch": 2.4345958609918, + "grad_norm": 2.5225956439971924, + "learning_rate": 8.881118064631294e-07, + "loss": 0.3617, + "step": 18705 + }, + { + "epoch": 2.434986333463491, + "grad_norm": 3.0411794185638428, + "learning_rate": 8.869231656694904e-07, + "loss": 0.4085, + "step": 18708 + }, + { + "epoch": 2.4353768059351815, + "grad_norm": 2.582329750061035, + "learning_rate": 8.857352434217203e-07, + "loss": 0.3344, + "step": 18711 + }, + { + "epoch": 2.4357672784068725, + "grad_norm": 2.818873643875122, + "learning_rate": 8.845480399273493e-07, + "loss": 0.3445, + "step": 18714 + }, + { + "epoch": 2.436157750878563, + "grad_norm": 2.471068859100342, + "learning_rate": 8.83361555393778e-07, + "loss": 0.3724, + "step": 18717 + }, + { + "epoch": 2.436548223350254, + "grad_norm": 2.8934266567230225, + "learning_rate": 8.821757900282812e-07, + "loss": 0.3259, + "step": 18720 + }, + { + "epoch": 2.4369386958219446, + "grad_norm": 3.3927173614501953, + "learning_rate": 8.809907440380134e-07, + "loss": 0.4121, + "step": 18723 + }, + { + "epoch": 2.437329168293635, + "grad_norm": 2.8776323795318604, + "learning_rate": 8.798064176299964e-07, + "loss": 0.3497, + "step": 18726 + }, + { + "epoch": 2.437719640765326, + "grad_norm": 2.814983606338501, + "learning_rate": 8.7862281101113e-07, + "loss": 0.3731, + "step": 18729 + }, + { + "epoch": 2.4381101132370167, + "grad_norm": 2.808781862258911, + "learning_rate": 8.774399243881898e-07, + "loss": 0.403, + "step": 18732 + }, + { + "epoch": 2.4385005857087076, + "grad_norm": 3.211045980453491, + "learning_rate": 8.762577579678222e-07, + "loss": 0.3331, + "step": 18735 + }, + { + "epoch": 2.438891058180398, + "grad_norm": 2.9599382877349854, + "learning_rate": 8.750763119565486e-07, + "loss": 0.3298, + "step": 18738 + }, + { + "epoch": 2.439281530652089, + "grad_norm": 2.901627779006958, + "learning_rate": 8.73895586560764e-07, + "loss": 0.4148, + "step": 18741 + }, + { + "epoch": 2.4396720031237797, + "grad_norm": 2.719869375228882, + "learning_rate": 8.727155819867423e-07, + "loss": 0.3681, + "step": 18744 + }, + { + "epoch": 2.4400624755954707, + "grad_norm": 2.781661033630371, + "learning_rate": 8.715362984406261e-07, + "loss": 0.3967, + "step": 18747 + }, + { + "epoch": 2.4404529480671613, + "grad_norm": 3.1544594764709473, + "learning_rate": 8.703577361284338e-07, + "loss": 0.3487, + "step": 18750 + }, + { + "epoch": 2.440843420538852, + "grad_norm": 3.0680270195007324, + "learning_rate": 8.691798952560559e-07, + "loss": 0.3621, + "step": 18753 + }, + { + "epoch": 2.441233893010543, + "grad_norm": 2.580583333969116, + "learning_rate": 8.680027760292614e-07, + "loss": 0.3696, + "step": 18756 + }, + { + "epoch": 2.4416243654822334, + "grad_norm": 2.498162269592285, + "learning_rate": 8.668263786536896e-07, + "loss": 0.3002, + "step": 18759 + }, + { + "epoch": 2.4420148379539244, + "grad_norm": 2.785461902618408, + "learning_rate": 8.656507033348538e-07, + "loss": 0.3497, + "step": 18762 + }, + { + "epoch": 2.442405310425615, + "grad_norm": 2.962099313735962, + "learning_rate": 8.644757502781437e-07, + "loss": 0.4551, + "step": 18765 + }, + { + "epoch": 2.442795782897306, + "grad_norm": 2.782252550125122, + "learning_rate": 
8.633015196888201e-07, + "loss": 0.3868, + "step": 18768 + }, + { + "epoch": 2.4431862553689965, + "grad_norm": 2.9131968021392822, + "learning_rate": 8.621280117720171e-07, + "loss": 0.3787, + "step": 18771 + }, + { + "epoch": 2.4435767278406875, + "grad_norm": 2.526874303817749, + "learning_rate": 8.60955226732747e-07, + "loss": 0.306, + "step": 18774 + }, + { + "epoch": 2.443967200312378, + "grad_norm": 2.654900312423706, + "learning_rate": 8.59783164775892e-07, + "loss": 0.3161, + "step": 18777 + }, + { + "epoch": 2.4443576727840686, + "grad_norm": 2.776625633239746, + "learning_rate": 8.586118261062076e-07, + "loss": 0.3483, + "step": 18780 + }, + { + "epoch": 2.4447481452557596, + "grad_norm": 2.6517176628112793, + "learning_rate": 8.574412109283232e-07, + "loss": 0.3079, + "step": 18783 + }, + { + "epoch": 2.44513861772745, + "grad_norm": 2.642512321472168, + "learning_rate": 8.562713194467465e-07, + "loss": 0.3765, + "step": 18786 + }, + { + "epoch": 2.445529090199141, + "grad_norm": 2.8423049449920654, + "learning_rate": 8.551021518658536e-07, + "loss": 0.3594, + "step": 18789 + }, + { + "epoch": 2.4459195626708317, + "grad_norm": 3.162525177001953, + "learning_rate": 8.539337083898936e-07, + "loss": 0.3961, + "step": 18792 + }, + { + "epoch": 2.446310035142522, + "grad_norm": 2.591289520263672, + "learning_rate": 8.527659892229944e-07, + "loss": 0.4013, + "step": 18795 + }, + { + "epoch": 2.446700507614213, + "grad_norm": 2.7982068061828613, + "learning_rate": 8.515989945691522e-07, + "loss": 0.3741, + "step": 18798 + }, + { + "epoch": 2.4470909800859038, + "grad_norm": 2.731457471847534, + "learning_rate": 8.504327246322386e-07, + "loss": 0.3658, + "step": 18801 + }, + { + "epoch": 2.4474814525575947, + "grad_norm": 2.960292100906372, + "learning_rate": 8.492671796159968e-07, + "loss": 0.4037, + "step": 18804 + }, + { + "epoch": 2.4478719250292853, + "grad_norm": 2.7071995735168457, + "learning_rate": 8.48102359724049e-07, + "loss": 0.392, + "step": 18807 + }, + { + "epoch": 2.4482623975009763, + "grad_norm": 2.889734983444214, + "learning_rate": 8.46938265159884e-07, + "loss": 0.3443, + "step": 18810 + }, + { + "epoch": 2.448652869972667, + "grad_norm": 2.8918561935424805, + "learning_rate": 8.457748961268664e-07, + "loss": 0.3643, + "step": 18813 + }, + { + "epoch": 2.449043342444358, + "grad_norm": 3.017657518386841, + "learning_rate": 8.446122528282363e-07, + "loss": 0.3877, + "step": 18816 + }, + { + "epoch": 2.4494338149160484, + "grad_norm": 2.657172679901123, + "learning_rate": 8.434503354671042e-07, + "loss": 0.3669, + "step": 18819 + }, + { + "epoch": 2.449824287387739, + "grad_norm": 2.905705451965332, + "learning_rate": 8.422891442464531e-07, + "loss": 0.3495, + "step": 18822 + }, + { + "epoch": 2.45021475985943, + "grad_norm": 2.8718855381011963, + "learning_rate": 8.41128679369142e-07, + "loss": 0.4314, + "step": 18825 + }, + { + "epoch": 2.4506052323311205, + "grad_norm": 2.7882487773895264, + "learning_rate": 8.399689410379025e-07, + "loss": 0.3464, + "step": 18828 + }, + { + "epoch": 2.4509957048028115, + "grad_norm": 2.7789225578308105, + "learning_rate": 8.388099294553382e-07, + "loss": 0.3243, + "step": 18831 + }, + { + "epoch": 2.451386177274502, + "grad_norm": 2.7858150005340576, + "learning_rate": 8.376516448239236e-07, + "loss": 0.3557, + "step": 18834 + }, + { + "epoch": 2.451776649746193, + "grad_norm": 2.6816978454589844, + "learning_rate": 8.364940873460115e-07, + "loss": 0.382, + "step": 18837 + }, + { + "epoch": 2.4521671222178836, + "grad_norm": 
3.012741804122925, + "learning_rate": 8.353372572238238e-07, + "loss": 0.3826, + "step": 18840 + }, + { + "epoch": 2.4525575946895746, + "grad_norm": 2.689615488052368, + "learning_rate": 8.341811546594564e-07, + "loss": 0.3332, + "step": 18843 + }, + { + "epoch": 2.452948067161265, + "grad_norm": 2.9826838970184326, + "learning_rate": 8.330257798548763e-07, + "loss": 0.3838, + "step": 18846 + }, + { + "epoch": 2.4533385396329557, + "grad_norm": 2.5051567554473877, + "learning_rate": 8.318711330119272e-07, + "loss": 0.3054, + "step": 18849 + }, + { + "epoch": 2.4537290121046467, + "grad_norm": 2.6066651344299316, + "learning_rate": 8.307172143323233e-07, + "loss": 0.3481, + "step": 18852 + }, + { + "epoch": 2.454119484576337, + "grad_norm": 2.808889865875244, + "learning_rate": 8.295640240176494e-07, + "loss": 0.3487, + "step": 18855 + }, + { + "epoch": 2.454509957048028, + "grad_norm": 2.8130900859832764, + "learning_rate": 8.28411562269369e-07, + "loss": 0.3552, + "step": 18858 + }, + { + "epoch": 2.4549004295197188, + "grad_norm": 2.8734047412872314, + "learning_rate": 8.272598292888124e-07, + "loss": 0.3521, + "step": 18861 + }, + { + "epoch": 2.4552909019914098, + "grad_norm": 2.4436731338500977, + "learning_rate": 8.261088252771848e-07, + "loss": 0.3438, + "step": 18864 + }, + { + "epoch": 2.4556813744631003, + "grad_norm": 2.721705675125122, + "learning_rate": 8.249585504355645e-07, + "loss": 0.4103, + "step": 18867 + }, + { + "epoch": 2.4560718469347913, + "grad_norm": 2.883448839187622, + "learning_rate": 8.238090049649033e-07, + "loss": 0.368, + "step": 18870 + }, + { + "epoch": 2.456462319406482, + "grad_norm": 2.8921525478363037, + "learning_rate": 8.226601890660241e-07, + "loss": 0.3569, + "step": 18873 + }, + { + "epoch": 2.4568527918781724, + "grad_norm": 2.69545316696167, + "learning_rate": 8.215121029396206e-07, + "loss": 0.3431, + "step": 18876 + }, + { + "epoch": 2.4572432643498634, + "grad_norm": 3.1997456550598145, + "learning_rate": 8.203647467862636e-07, + "loss": 0.286, + "step": 18879 + }, + { + "epoch": 2.457633736821554, + "grad_norm": 3.0184547901153564, + "learning_rate": 8.192181208063926e-07, + "loss": 0.3426, + "step": 18882 + }, + { + "epoch": 2.458024209293245, + "grad_norm": 2.5744235515594482, + "learning_rate": 8.180722252003198e-07, + "loss": 0.3675, + "step": 18885 + }, + { + "epoch": 2.4584146817649355, + "grad_norm": 2.549051523208618, + "learning_rate": 8.169270601682328e-07, + "loss": 0.39, + "step": 18888 + }, + { + "epoch": 2.4588051542366265, + "grad_norm": 2.9480321407318115, + "learning_rate": 8.157826259101886e-07, + "loss": 0.3437, + "step": 18891 + }, + { + "epoch": 2.459195626708317, + "grad_norm": 2.6580848693847656, + "learning_rate": 8.146389226261176e-07, + "loss": 0.2907, + "step": 18894 + }, + { + "epoch": 2.459586099180008, + "grad_norm": 2.8073930740356445, + "learning_rate": 8.134959505158208e-07, + "loss": 0.346, + "step": 18897 + }, + { + "epoch": 2.4599765716516986, + "grad_norm": 2.8346378803253174, + "learning_rate": 8.123537097789752e-07, + "loss": 0.3428, + "step": 18900 + }, + { + "epoch": 2.460367044123389, + "grad_norm": 2.9591760635375977, + "learning_rate": 8.112122006151268e-07, + "loss": 0.3972, + "step": 18903 + }, + { + "epoch": 2.46075751659508, + "grad_norm": 3.013507843017578, + "learning_rate": 8.100714232236945e-07, + "loss": 0.3298, + "step": 18906 + }, + { + "epoch": 2.4611479890667707, + "grad_norm": 2.7471742630004883, + "learning_rate": 8.089313778039698e-07, + "loss": 0.309, + "step": 18909 + }, + { + 
"epoch": 2.4615384615384617, + "grad_norm": 3.1585850715637207, + "learning_rate": 8.077920645551179e-07, + "loss": 0.361, + "step": 18912 + }, + { + "epoch": 2.4619289340101522, + "grad_norm": 2.589958906173706, + "learning_rate": 8.066534836761736e-07, + "loss": 0.3519, + "step": 18915 + }, + { + "epoch": 2.462319406481843, + "grad_norm": 3.804331064224243, + "learning_rate": 8.055156353660426e-07, + "loss": 0.3417, + "step": 18918 + }, + { + "epoch": 2.4627098789535338, + "grad_norm": 3.136734962463379, + "learning_rate": 8.043785198235076e-07, + "loss": 0.3533, + "step": 18921 + }, + { + "epoch": 2.4631003514252248, + "grad_norm": 2.8037362098693848, + "learning_rate": 8.032421372472188e-07, + "loss": 0.3386, + "step": 18924 + }, + { + "epoch": 2.4634908238969153, + "grad_norm": 2.6273999214172363, + "learning_rate": 8.021064878356987e-07, + "loss": 0.3358, + "step": 18927 + }, + { + "epoch": 2.463881296368606, + "grad_norm": 2.6162285804748535, + "learning_rate": 8.009715717873451e-07, + "loss": 0.366, + "step": 18930 + }, + { + "epoch": 2.464271768840297, + "grad_norm": 2.9784958362579346, + "learning_rate": 7.998373893004246e-07, + "loss": 0.3863, + "step": 18933 + }, + { + "epoch": 2.4646622413119874, + "grad_norm": 3.3731791973114014, + "learning_rate": 7.987039405730757e-07, + "loss": 0.3216, + "step": 18936 + }, + { + "epoch": 2.4650527137836784, + "grad_norm": 2.652818441390991, + "learning_rate": 7.975712258033108e-07, + "loss": 0.3587, + "step": 18939 + }, + { + "epoch": 2.465443186255369, + "grad_norm": 2.968871831893921, + "learning_rate": 7.964392451890119e-07, + "loss": 0.3334, + "step": 18942 + }, + { + "epoch": 2.4658336587270595, + "grad_norm": 2.5186870098114014, + "learning_rate": 7.953079989279344e-07, + "loss": 0.3153, + "step": 18945 + }, + { + "epoch": 2.4662241311987505, + "grad_norm": 2.8169445991516113, + "learning_rate": 7.941774872177027e-07, + "loss": 0.3013, + "step": 18948 + }, + { + "epoch": 2.466614603670441, + "grad_norm": 2.731722831726074, + "learning_rate": 7.930477102558159e-07, + "loss": 0.3281, + "step": 18951 + }, + { + "epoch": 2.467005076142132, + "grad_norm": 2.587888240814209, + "learning_rate": 7.919186682396457e-07, + "loss": 0.3223, + "step": 18954 + }, + { + "epoch": 2.4673955486138226, + "grad_norm": 2.6688907146453857, + "learning_rate": 7.907903613664314e-07, + "loss": 0.3775, + "step": 18957 + }, + { + "epoch": 2.4677860210855136, + "grad_norm": 2.8114142417907715, + "learning_rate": 7.896627898332848e-07, + "loss": 0.3064, + "step": 18960 + }, + { + "epoch": 2.468176493557204, + "grad_norm": 2.9139301776885986, + "learning_rate": 7.885359538371929e-07, + "loss": 0.4012, + "step": 18963 + }, + { + "epoch": 2.468566966028895, + "grad_norm": 2.7291030883789062, + "learning_rate": 7.874098535750103e-07, + "loss": 0.3384, + "step": 18966 + }, + { + "epoch": 2.4689574385005857, + "grad_norm": 2.8256447315216064, + "learning_rate": 7.862844892434629e-07, + "loss": 0.3605, + "step": 18969 + }, + { + "epoch": 2.4693479109722762, + "grad_norm": 2.81532621383667, + "learning_rate": 7.85159861039152e-07, + "loss": 0.3608, + "step": 18972 + }, + { + "epoch": 2.4697383834439672, + "grad_norm": 2.861802816390991, + "learning_rate": 7.84035969158547e-07, + "loss": 0.3409, + "step": 18975 + }, + { + "epoch": 2.470128855915658, + "grad_norm": 2.916384696960449, + "learning_rate": 7.829128137979875e-07, + "loss": 0.3839, + "step": 18978 + }, + { + "epoch": 2.470519328387349, + "grad_norm": 2.9499900341033936, + "learning_rate": 7.817903951536892e-07, 
+ "loss": 0.3667, + "step": 18981 + }, + { + "epoch": 2.4709098008590393, + "grad_norm": 2.9499289989471436, + "learning_rate": 7.806687134217356e-07, + "loss": 0.3399, + "step": 18984 + }, + { + "epoch": 2.4713002733307303, + "grad_norm": 2.9317288398742676, + "learning_rate": 7.795477687980801e-07, + "loss": 0.3157, + "step": 18987 + }, + { + "epoch": 2.471690745802421, + "grad_norm": 3.2856903076171875, + "learning_rate": 7.784275614785519e-07, + "loss": 0.386, + "step": 18990 + }, + { + "epoch": 2.472081218274112, + "grad_norm": 3.034611225128174, + "learning_rate": 7.773080916588466e-07, + "loss": 0.3312, + "step": 18993 + }, + { + "epoch": 2.4724716907458024, + "grad_norm": 2.9121174812316895, + "learning_rate": 7.761893595345354e-07, + "loss": 0.4084, + "step": 18996 + }, + { + "epoch": 2.472862163217493, + "grad_norm": 2.7845377922058105, + "learning_rate": 7.750713653010567e-07, + "loss": 0.3619, + "step": 18999 + }, + { + "epoch": 2.473252635689184, + "grad_norm": 2.673917531967163, + "learning_rate": 7.73954109153724e-07, + "loss": 0.4123, + "step": 19002 + }, + { + "epoch": 2.4736431081608745, + "grad_norm": 2.777308225631714, + "learning_rate": 7.728375912877178e-07, + "loss": 0.3202, + "step": 19005 + }, + { + "epoch": 2.4740335806325655, + "grad_norm": 3.0712320804595947, + "learning_rate": 7.717218118980918e-07, + "loss": 0.3864, + "step": 19008 + }, + { + "epoch": 2.474424053104256, + "grad_norm": 3.155332088470459, + "learning_rate": 7.706067711797687e-07, + "loss": 0.3849, + "step": 19011 + }, + { + "epoch": 2.474814525575947, + "grad_norm": 2.9384818077087402, + "learning_rate": 7.694924693275468e-07, + "loss": 0.3452, + "step": 19014 + }, + { + "epoch": 2.4752049980476376, + "grad_norm": 2.8019471168518066, + "learning_rate": 7.683789065360908e-07, + "loss": 0.3621, + "step": 19017 + }, + { + "epoch": 2.4755954705193286, + "grad_norm": 3.059307098388672, + "learning_rate": 7.672660829999367e-07, + "loss": 0.3504, + "step": 19020 + }, + { + "epoch": 2.475985942991019, + "grad_norm": 3.4540011882781982, + "learning_rate": 7.661539989134947e-07, + "loss": 0.3184, + "step": 19023 + }, + { + "epoch": 2.4763764154627097, + "grad_norm": 2.706732988357544, + "learning_rate": 7.650426544710427e-07, + "loss": 0.3627, + "step": 19026 + }, + { + "epoch": 2.4767668879344007, + "grad_norm": 2.591989040374756, + "learning_rate": 7.63932049866728e-07, + "loss": 0.4024, + "step": 19029 + }, + { + "epoch": 2.4771573604060912, + "grad_norm": 2.8087947368621826, + "learning_rate": 7.628221852945744e-07, + "loss": 0.3468, + "step": 19032 + }, + { + "epoch": 2.4775478328777822, + "grad_norm": 2.681309700012207, + "learning_rate": 7.617130609484702e-07, + "loss": 0.3382, + "step": 19035 + }, + { + "epoch": 2.477938305349473, + "grad_norm": 3.0248937606811523, + "learning_rate": 7.606046770221792e-07, + "loss": 0.417, + "step": 19038 + }, + { + "epoch": 2.478328777821164, + "grad_norm": 2.839232921600342, + "learning_rate": 7.594970337093316e-07, + "loss": 0.3179, + "step": 19041 + }, + { + "epoch": 2.4787192502928543, + "grad_norm": 2.8499197959899902, + "learning_rate": 7.583901312034331e-07, + "loss": 0.367, + "step": 19044 + }, + { + "epoch": 2.4791097227645453, + "grad_norm": 2.664505958557129, + "learning_rate": 7.572839696978551e-07, + "loss": 0.3077, + "step": 19047 + }, + { + "epoch": 2.479500195236236, + "grad_norm": 2.8773906230926514, + "learning_rate": 7.561785493858409e-07, + "loss": 0.3265, + "step": 19050 + }, + { + "epoch": 2.4798906677079264, + "grad_norm": 3.14871883392334, 
+ "learning_rate": 7.550738704605082e-07, + "loss": 0.3526, + "step": 19053 + }, + { + "epoch": 2.4802811401796174, + "grad_norm": 2.6195621490478516, + "learning_rate": 7.539699331148398e-07, + "loss": 0.3692, + "step": 19056 + }, + { + "epoch": 2.480671612651308, + "grad_norm": 2.551382541656494, + "learning_rate": 7.52866737541692e-07, + "loss": 0.2936, + "step": 19059 + }, + { + "epoch": 2.481062085122999, + "grad_norm": 2.6689510345458984, + "learning_rate": 7.517642839337896e-07, + "loss": 0.3484, + "step": 19062 + }, + { + "epoch": 2.4814525575946895, + "grad_norm": 2.526609420776367, + "learning_rate": 7.506625724837302e-07, + "loss": 0.403, + "step": 19065 + }, + { + "epoch": 2.48184303006638, + "grad_norm": 2.6223344802856445, + "learning_rate": 7.495616033839808e-07, + "loss": 0.3567, + "step": 19068 + }, + { + "epoch": 2.482233502538071, + "grad_norm": 2.91428804397583, + "learning_rate": 7.484613768268762e-07, + "loss": 0.3981, + "step": 19071 + }, + { + "epoch": 2.4826239750097616, + "grad_norm": 2.6756317615509033, + "learning_rate": 7.473618930046267e-07, + "loss": 0.3424, + "step": 19074 + }, + { + "epoch": 2.4830144474814526, + "grad_norm": 2.6484274864196777, + "learning_rate": 7.462631521093066e-07, + "loss": 0.302, + "step": 19077 + }, + { + "epoch": 2.483404919953143, + "grad_norm": 2.8635783195495605, + "learning_rate": 7.451651543328664e-07, + "loss": 0.3657, + "step": 19080 + }, + { + "epoch": 2.483795392424834, + "grad_norm": 2.7862300872802734, + "learning_rate": 7.440678998671219e-07, + "loss": 0.3287, + "step": 19083 + }, + { + "epoch": 2.4841858648965247, + "grad_norm": 2.5042202472686768, + "learning_rate": 7.429713889037632e-07, + "loss": 0.3273, + "step": 19086 + }, + { + "epoch": 2.4845763373682157, + "grad_norm": 2.9982264041900635, + "learning_rate": 7.418756216343475e-07, + "loss": 0.4191, + "step": 19089 + }, + { + "epoch": 2.4849668098399063, + "grad_norm": 2.7572524547576904, + "learning_rate": 7.407805982503019e-07, + "loss": 0.3332, + "step": 19092 + }, + { + "epoch": 2.485357282311597, + "grad_norm": 2.9605908393859863, + "learning_rate": 7.396863189429265e-07, + "loss": 0.3352, + "step": 19095 + }, + { + "epoch": 2.485747754783288, + "grad_norm": 2.8530290126800537, + "learning_rate": 7.385927839033891e-07, + "loss": 0.3067, + "step": 19098 + }, + { + "epoch": 2.4861382272549784, + "grad_norm": 2.7384355068206787, + "learning_rate": 7.374999933227261e-07, + "loss": 0.365, + "step": 19101 + }, + { + "epoch": 2.4865286997266693, + "grad_norm": 2.806218385696411, + "learning_rate": 7.36407947391849e-07, + "loss": 0.3437, + "step": 19104 + }, + { + "epoch": 2.48691917219836, + "grad_norm": 2.798231363296509, + "learning_rate": 7.353166463015338e-07, + "loss": 0.3763, + "step": 19107 + }, + { + "epoch": 2.487309644670051, + "grad_norm": 2.5547235012054443, + "learning_rate": 7.342260902424292e-07, + "loss": 0.3022, + "step": 19110 + }, + { + "epoch": 2.4877001171417414, + "grad_norm": 2.6578516960144043, + "learning_rate": 7.331362794050512e-07, + "loss": 0.3696, + "step": 19113 + }, + { + "epoch": 2.4880905896134324, + "grad_norm": 2.6646409034729004, + "learning_rate": 7.320472139797902e-07, + "loss": 0.3517, + "step": 19116 + }, + { + "epoch": 2.488481062085123, + "grad_norm": 2.4392144680023193, + "learning_rate": 7.309588941569018e-07, + "loss": 0.308, + "step": 19119 + }, + { + "epoch": 2.4888715345568135, + "grad_norm": 2.8057920932769775, + "learning_rate": 7.298713201265145e-07, + "loss": 0.4005, + "step": 19122 + }, + { + "epoch": 
2.4892620070285045, + "grad_norm": 2.7063517570495605, + "learning_rate": 7.287844920786236e-07, + "loss": 0.3574, + "step": 19125 + }, + { + "epoch": 2.489652479500195, + "grad_norm": 2.536086082458496, + "learning_rate": 7.276984102030976e-07, + "loss": 0.3572, + "step": 19128 + }, + { + "epoch": 2.490042951971886, + "grad_norm": 2.7980449199676514, + "learning_rate": 7.266130746896722e-07, + "loss": 0.4331, + "step": 19131 + }, + { + "epoch": 2.4904334244435766, + "grad_norm": 2.7580485343933105, + "learning_rate": 7.25528485727951e-07, + "loss": 0.3337, + "step": 19134 + }, + { + "epoch": 2.4908238969152676, + "grad_norm": 2.91790771484375, + "learning_rate": 7.244446435074126e-07, + "loss": 0.3567, + "step": 19137 + }, + { + "epoch": 2.491214369386958, + "grad_norm": 2.8744025230407715, + "learning_rate": 7.233615482174005e-07, + "loss": 0.3997, + "step": 19140 + }, + { + "epoch": 2.491604841858649, + "grad_norm": 2.7797534465789795, + "learning_rate": 7.22279200047128e-07, + "loss": 0.3623, + "step": 19143 + }, + { + "epoch": 2.4919953143303397, + "grad_norm": 2.9023027420043945, + "learning_rate": 7.211975991856812e-07, + "loss": 0.3873, + "step": 19146 + }, + { + "epoch": 2.4923857868020303, + "grad_norm": 2.7867956161499023, + "learning_rate": 7.201167458220131e-07, + "loss": 0.4055, + "step": 19149 + }, + { + "epoch": 2.4927762592737213, + "grad_norm": 2.779937505722046, + "learning_rate": 7.190366401449444e-07, + "loss": 0.3783, + "step": 19152 + }, + { + "epoch": 2.493166731745412, + "grad_norm": 2.892500400543213, + "learning_rate": 7.179572823431702e-07, + "loss": 0.3472, + "step": 19155 + }, + { + "epoch": 2.493557204217103, + "grad_norm": 2.9717111587524414, + "learning_rate": 7.168786726052501e-07, + "loss": 0.3959, + "step": 19158 + }, + { + "epoch": 2.4939476766887934, + "grad_norm": 3.3151369094848633, + "learning_rate": 7.158008111196152e-07, + "loss": 0.3453, + "step": 19161 + }, + { + "epoch": 2.4943381491604844, + "grad_norm": 2.6525325775146484, + "learning_rate": 7.147236980745653e-07, + "loss": 0.2747, + "step": 19164 + }, + { + "epoch": 2.494728621632175, + "grad_norm": 2.505096197128296, + "learning_rate": 7.136473336582722e-07, + "loss": 0.3938, + "step": 19167 + }, + { + "epoch": 2.495119094103866, + "grad_norm": 3.1451778411865234, + "learning_rate": 7.125717180587721e-07, + "loss": 0.3658, + "step": 19170 + }, + { + "epoch": 2.4955095665755564, + "grad_norm": 3.026200294494629, + "learning_rate": 7.114968514639736e-07, + "loss": 0.3788, + "step": 19173 + }, + { + "epoch": 2.495900039047247, + "grad_norm": 2.9026551246643066, + "learning_rate": 7.104227340616527e-07, + "loss": 0.3708, + "step": 19176 + }, + { + "epoch": 2.496290511518938, + "grad_norm": 2.9777603149414062, + "learning_rate": 7.093493660394568e-07, + "loss": 0.3116, + "step": 19179 + }, + { + "epoch": 2.4966809839906285, + "grad_norm": 2.5679221153259277, + "learning_rate": 7.082767475849011e-07, + "loss": 0.3198, + "step": 19182 + }, + { + "epoch": 2.4970714564623195, + "grad_norm": 2.768268585205078, + "learning_rate": 7.072048788853675e-07, + "loss": 0.4067, + "step": 19185 + }, + { + "epoch": 2.49746192893401, + "grad_norm": 2.612443447113037, + "learning_rate": 7.061337601281121e-07, + "loss": 0.3287, + "step": 19188 + }, + { + "epoch": 2.497852401405701, + "grad_norm": 2.776230573654175, + "learning_rate": 7.050633915002559e-07, + "loss": 0.3972, + "step": 19191 + }, + { + "epoch": 2.4982428738773916, + "grad_norm": 2.519664764404297, + "learning_rate": 7.039937731887886e-07, + "loss": 
0.3023, + "step": 19194 + }, + { + "epoch": 2.4986333463490826, + "grad_norm": 3.676971197128296, + "learning_rate": 7.029249053805731e-07, + "loss": 0.398, + "step": 19197 + }, + { + "epoch": 2.499023818820773, + "grad_norm": 2.587867021560669, + "learning_rate": 7.018567882623368e-07, + "loss": 0.3471, + "step": 19200 + }, + { + "epoch": 2.4994142912924637, + "grad_norm": 3.008563280105591, + "learning_rate": 7.007894220206763e-07, + "loss": 0.3319, + "step": 19203 + }, + { + "epoch": 2.4998047637641547, + "grad_norm": 3.0095622539520264, + "learning_rate": 6.997228068420597e-07, + "loss": 0.299, + "step": 19206 + }, + { + "epoch": 2.5001952362358453, + "grad_norm": 3.0652036666870117, + "learning_rate": 6.986569429128238e-07, + "loss": 0.4295, + "step": 19209 + }, + { + "epoch": 2.5005857087075363, + "grad_norm": 2.4899280071258545, + "learning_rate": 6.975918304191709e-07, + "loss": 0.4072, + "step": 19212 + }, + { + "epoch": 2.500976181179227, + "grad_norm": 2.737656593322754, + "learning_rate": 6.965274695471729e-07, + "loss": 0.392, + "step": 19215 + }, + { + "epoch": 2.5013666536509174, + "grad_norm": 2.7664875984191895, + "learning_rate": 6.954638604827741e-07, + "loss": 0.2942, + "step": 19218 + }, + { + "epoch": 2.5017571261226084, + "grad_norm": 2.665586233139038, + "learning_rate": 6.944010034117837e-07, + "loss": 0.3066, + "step": 19221 + }, + { + "epoch": 2.5021475985942994, + "grad_norm": 2.6148107051849365, + "learning_rate": 6.933388985198802e-07, + "loss": 0.3325, + "step": 19224 + }, + { + "epoch": 2.50253807106599, + "grad_norm": 2.9085710048675537, + "learning_rate": 6.922775459926101e-07, + "loss": 0.3846, + "step": 19227 + }, + { + "epoch": 2.5029285435376805, + "grad_norm": 2.714949369430542, + "learning_rate": 6.912169460153917e-07, + "loss": 0.3562, + "step": 19230 + }, + { + "epoch": 2.5033190160093715, + "grad_norm": 2.8644955158233643, + "learning_rate": 6.901570987735085e-07, + "loss": 0.2756, + "step": 19233 + }, + { + "epoch": 2.503709488481062, + "grad_norm": 2.9508283138275146, + "learning_rate": 6.890980044521123e-07, + "loss": 0.4106, + "step": 19236 + }, + { + "epoch": 2.504099960952753, + "grad_norm": 2.899592161178589, + "learning_rate": 6.880396632362268e-07, + "loss": 0.3553, + "step": 19239 + }, + { + "epoch": 2.5044904334244436, + "grad_norm": 2.7175285816192627, + "learning_rate": 6.869820753107415e-07, + "loss": 0.3397, + "step": 19242 + }, + { + "epoch": 2.504880905896134, + "grad_norm": 2.8348207473754883, + "learning_rate": 6.859252408604134e-07, + "loss": 0.4374, + "step": 19245 + }, + { + "epoch": 2.505271378367825, + "grad_norm": 2.8067331314086914, + "learning_rate": 6.848691600698698e-07, + "loss": 0.3312, + "step": 19248 + }, + { + "epoch": 2.505661850839516, + "grad_norm": 3.3372321128845215, + "learning_rate": 6.838138331236077e-07, + "loss": 0.3608, + "step": 19251 + }, + { + "epoch": 2.5060523233112066, + "grad_norm": 2.9257891178131104, + "learning_rate": 6.827592602059891e-07, + "loss": 0.3447, + "step": 19254 + }, + { + "epoch": 2.506442795782897, + "grad_norm": 3.0877225399017334, + "learning_rate": 6.817054415012441e-07, + "loss": 0.3567, + "step": 19257 + }, + { + "epoch": 2.506833268254588, + "grad_norm": 3.7706382274627686, + "learning_rate": 6.806523771934753e-07, + "loss": 0.3516, + "step": 19260 + }, + { + "epoch": 2.5072237407262787, + "grad_norm": 2.557373285293579, + "learning_rate": 6.796000674666498e-07, + "loss": 0.3081, + "step": 19263 + }, + { + "epoch": 2.5076142131979697, + "grad_norm": 2.9034101963043213, + 
"learning_rate": 6.785485125046037e-07, + "loss": 0.3248, + "step": 19266 + }, + { + "epoch": 2.5080046856696603, + "grad_norm": 2.5061423778533936, + "learning_rate": 6.774977124910398e-07, + "loss": 0.328, + "step": 19269 + }, + { + "epoch": 2.508395158141351, + "grad_norm": 2.783360481262207, + "learning_rate": 6.764476676095344e-07, + "loss": 0.3268, + "step": 19272 + }, + { + "epoch": 2.508785630613042, + "grad_norm": 2.8032476902008057, + "learning_rate": 6.753983780435253e-07, + "loss": 0.3779, + "step": 19275 + }, + { + "epoch": 2.5091761030847324, + "grad_norm": 3.029937267303467, + "learning_rate": 6.743498439763213e-07, + "loss": 0.356, + "step": 19278 + }, + { + "epoch": 2.5095665755564234, + "grad_norm": 2.802793264389038, + "learning_rate": 6.733020655911004e-07, + "loss": 0.3366, + "step": 19281 + }, + { + "epoch": 2.509957048028114, + "grad_norm": 2.936530590057373, + "learning_rate": 6.722550430709068e-07, + "loss": 0.3973, + "step": 19284 + }, + { + "epoch": 2.510347520499805, + "grad_norm": 2.8553144931793213, + "learning_rate": 6.71208776598652e-07, + "loss": 0.3852, + "step": 19287 + }, + { + "epoch": 2.5107379929714955, + "grad_norm": 2.726348400115967, + "learning_rate": 6.701632663571172e-07, + "loss": 0.3928, + "step": 19290 + }, + { + "epoch": 2.5111284654431865, + "grad_norm": 3.0208041667938232, + "learning_rate": 6.691185125289523e-07, + "loss": 0.4231, + "step": 19293 + }, + { + "epoch": 2.511518937914877, + "grad_norm": 2.8433890342712402, + "learning_rate": 6.680745152966722e-07, + "loss": 0.3642, + "step": 19296 + }, + { + "epoch": 2.5119094103865676, + "grad_norm": 2.9164557456970215, + "learning_rate": 6.670312748426605e-07, + "loss": 0.3276, + "step": 19299 + }, + { + "epoch": 2.5122998828582586, + "grad_norm": 2.7807259559631348, + "learning_rate": 6.659887913491709e-07, + "loss": 0.4351, + "step": 19302 + }, + { + "epoch": 2.512690355329949, + "grad_norm": 2.888089179992676, + "learning_rate": 6.64947064998322e-07, + "loss": 0.354, + "step": 19305 + }, + { + "epoch": 2.51308082780164, + "grad_norm": 2.8115479946136475, + "learning_rate": 6.639060959721e-07, + "loss": 0.3543, + "step": 19308 + }, + { + "epoch": 2.5134713002733307, + "grad_norm": 2.8642683029174805, + "learning_rate": 6.628658844523622e-07, + "loss": 0.303, + "step": 19311 + }, + { + "epoch": 2.513861772745021, + "grad_norm": 2.651221752166748, + "learning_rate": 6.618264306208305e-07, + "loss": 0.3848, + "step": 19314 + }, + { + "epoch": 2.514252245216712, + "grad_norm": 3.1669631004333496, + "learning_rate": 6.607877346590958e-07, + "loss": 0.3538, + "step": 19317 + }, + { + "epoch": 2.514642717688403, + "grad_norm": 2.709857225418091, + "learning_rate": 6.597497967486139e-07, + "loss": 0.4179, + "step": 19320 + }, + { + "epoch": 2.5150331901600937, + "grad_norm": 2.7595081329345703, + "learning_rate": 6.587126170707125e-07, + "loss": 0.3483, + "step": 19323 + }, + { + "epoch": 2.5154236626317843, + "grad_norm": 2.8310909271240234, + "learning_rate": 6.576761958065847e-07, + "loss": 0.3987, + "step": 19326 + }, + { + "epoch": 2.5158141351034753, + "grad_norm": 2.7588865756988525, + "learning_rate": 6.566405331372899e-07, + "loss": 0.3472, + "step": 19329 + }, + { + "epoch": 2.516204607575166, + "grad_norm": 2.9078924655914307, + "learning_rate": 6.556056292437563e-07, + "loss": 0.4267, + "step": 19332 + }, + { + "epoch": 2.516595080046857, + "grad_norm": 2.8975186347961426, + "learning_rate": 6.545714843067813e-07, + "loss": 0.392, + "step": 19335 + }, + { + "epoch": 
2.5169855525185474, + "grad_norm": 2.952500581741333, + "learning_rate": 6.535380985070261e-07, + "loss": 0.3441, + "step": 19338 + }, + { + "epoch": 2.517376024990238, + "grad_norm": 2.9215312004089355, + "learning_rate": 6.525054720250207e-07, + "loss": 0.4316, + "step": 19341 + }, + { + "epoch": 2.517766497461929, + "grad_norm": 2.8318424224853516, + "learning_rate": 6.514736050411652e-07, + "loss": 0.3531, + "step": 19344 + }, + { + "epoch": 2.51815696993362, + "grad_norm": 3.1043033599853516, + "learning_rate": 6.504424977357221e-07, + "loss": 0.4214, + "step": 19347 + }, + { + "epoch": 2.5185474424053105, + "grad_norm": 2.7383644580841064, + "learning_rate": 6.49412150288824e-07, + "loss": 0.3557, + "step": 19350 + }, + { + "epoch": 2.518937914877001, + "grad_norm": 2.9225306510925293, + "learning_rate": 6.483825628804719e-07, + "loss": 0.3435, + "step": 19353 + }, + { + "epoch": 2.519328387348692, + "grad_norm": 2.5163514614105225, + "learning_rate": 6.473537356905313e-07, + "loss": 0.3363, + "step": 19356 + }, + { + "epoch": 2.5197188598203826, + "grad_norm": 2.8150081634521484, + "learning_rate": 6.463256688987357e-07, + "loss": 0.3409, + "step": 19359 + }, + { + "epoch": 2.5201093322920736, + "grad_norm": 2.6332247257232666, + "learning_rate": 6.452983626846876e-07, + "loss": 0.3493, + "step": 19362 + }, + { + "epoch": 2.520499804763764, + "grad_norm": 2.6498818397521973, + "learning_rate": 6.44271817227855e-07, + "loss": 0.3274, + "step": 19365 + }, + { + "epoch": 2.5208902772354547, + "grad_norm": 3.2936317920684814, + "learning_rate": 6.432460327075723e-07, + "loss": 0.3614, + "step": 19368 + }, + { + "epoch": 2.5212807497071457, + "grad_norm": 2.7875030040740967, + "learning_rate": 6.422210093030407e-07, + "loss": 0.4025, + "step": 19371 + }, + { + "epoch": 2.5216712221788367, + "grad_norm": 2.455900192260742, + "learning_rate": 6.41196747193334e-07, + "loss": 0.3627, + "step": 19374 + }, + { + "epoch": 2.522061694650527, + "grad_norm": 2.7226195335388184, + "learning_rate": 6.401732465573851e-07, + "loss": 0.335, + "step": 19377 + }, + { + "epoch": 2.5224521671222178, + "grad_norm": 2.6656506061553955, + "learning_rate": 6.391505075739984e-07, + "loss": 0.3006, + "step": 19380 + }, + { + "epoch": 2.5228426395939088, + "grad_norm": 2.610862970352173, + "learning_rate": 6.381285304218437e-07, + "loss": 0.3197, + "step": 19383 + }, + { + "epoch": 2.5232331120655993, + "grad_norm": 2.577313184738159, + "learning_rate": 6.371073152794593e-07, + "loss": 0.3171, + "step": 19386 + }, + { + "epoch": 2.5236235845372903, + "grad_norm": 2.949810028076172, + "learning_rate": 6.360868623252486e-07, + "loss": 0.3888, + "step": 19389 + }, + { + "epoch": 2.524014057008981, + "grad_norm": 3.948436975479126, + "learning_rate": 6.350671717374818e-07, + "loss": 0.3426, + "step": 19392 + }, + { + "epoch": 2.5244045294806714, + "grad_norm": 2.535118341445923, + "learning_rate": 6.340482436942991e-07, + "loss": 0.3528, + "step": 19395 + }, + { + "epoch": 2.5247950019523624, + "grad_norm": 2.7280428409576416, + "learning_rate": 6.330300783737031e-07, + "loss": 0.4199, + "step": 19398 + }, + { + "epoch": 2.525185474424053, + "grad_norm": 2.778590440750122, + "learning_rate": 6.320126759535645e-07, + "loss": 0.3757, + "step": 19401 + }, + { + "epoch": 2.525575946895744, + "grad_norm": 2.919978141784668, + "learning_rate": 6.309960366116242e-07, + "loss": 0.3648, + "step": 19404 + }, + { + "epoch": 2.5259664193674345, + "grad_norm": 2.7053189277648926, + "learning_rate": 6.299801605254846e-07, + 
"loss": 0.3384, + "step": 19407 + }, + { + "epoch": 2.5263568918391255, + "grad_norm": 2.8377797603607178, + "learning_rate": 6.289650478726167e-07, + "loss": 0.2978, + "step": 19410 + }, + { + "epoch": 2.526747364310816, + "grad_norm": 2.6428401470184326, + "learning_rate": 6.279506988303602e-07, + "loss": 0.3359, + "step": 19413 + }, + { + "epoch": 2.527137836782507, + "grad_norm": 2.710667610168457, + "learning_rate": 6.2693711357592e-07, + "loss": 0.3566, + "step": 19416 + }, + { + "epoch": 2.5275283092541976, + "grad_norm": 2.882359504699707, + "learning_rate": 6.259242922863662e-07, + "loss": 0.3227, + "step": 19419 + }, + { + "epoch": 2.527918781725888, + "grad_norm": 2.9712507724761963, + "learning_rate": 6.249122351386361e-07, + "loss": 0.4212, + "step": 19422 + }, + { + "epoch": 2.528309254197579, + "grad_norm": 2.5499699115753174, + "learning_rate": 6.239009423095355e-07, + "loss": 0.3462, + "step": 19425 + }, + { + "epoch": 2.5286997266692697, + "grad_norm": 3.2019617557525635, + "learning_rate": 6.228904139757347e-07, + "loss": 0.3946, + "step": 19428 + }, + { + "epoch": 2.5290901991409607, + "grad_norm": 2.812934398651123, + "learning_rate": 6.218806503137697e-07, + "loss": 0.3762, + "step": 19431 + }, + { + "epoch": 2.529480671612651, + "grad_norm": 2.723848581314087, + "learning_rate": 6.208716515000446e-07, + "loss": 0.377, + "step": 19434 + }, + { + "epoch": 2.529871144084342, + "grad_norm": 2.7258222103118896, + "learning_rate": 6.198634177108303e-07, + "loss": 0.3686, + "step": 19437 + }, + { + "epoch": 2.5302616165560328, + "grad_norm": 2.727672815322876, + "learning_rate": 6.188559491222628e-07, + "loss": 0.4371, + "step": 19440 + }, + { + "epoch": 2.5306520890277238, + "grad_norm": 2.907092809677124, + "learning_rate": 6.17849245910343e-07, + "loss": 0.3584, + "step": 19443 + }, + { + "epoch": 2.5310425614994143, + "grad_norm": 2.331328868865967, + "learning_rate": 6.168433082509423e-07, + "loss": 0.3078, + "step": 19446 + }, + { + "epoch": 2.531433033971105, + "grad_norm": 3.0786802768707275, + "learning_rate": 6.158381363197951e-07, + "loss": 0.384, + "step": 19449 + }, + { + "epoch": 2.531823506442796, + "grad_norm": 2.793458938598633, + "learning_rate": 6.148337302925011e-07, + "loss": 0.3512, + "step": 19452 + }, + { + "epoch": 2.5322139789144864, + "grad_norm": 2.8592000007629395, + "learning_rate": 6.138300903445299e-07, + "loss": 0.3117, + "step": 19455 + }, + { + "epoch": 2.5326044513861774, + "grad_norm": 2.7495779991149902, + "learning_rate": 6.128272166512156e-07, + "loss": 0.3858, + "step": 19458 + }, + { + "epoch": 2.532994923857868, + "grad_norm": 2.65899920463562, + "learning_rate": 6.11825109387757e-07, + "loss": 0.3549, + "step": 19461 + }, + { + "epoch": 2.5333853963295585, + "grad_norm": 2.677633285522461, + "learning_rate": 6.108237687292202e-07, + "loss": 0.3314, + "step": 19464 + }, + { + "epoch": 2.5337758688012495, + "grad_norm": 2.7956135272979736, + "learning_rate": 6.098231948505379e-07, + "loss": 0.4367, + "step": 19467 + }, + { + "epoch": 2.5341663412729405, + "grad_norm": 2.5864150524139404, + "learning_rate": 6.088233879265076e-07, + "loss": 0.3262, + "step": 19470 + }, + { + "epoch": 2.534556813744631, + "grad_norm": 2.944986343383789, + "learning_rate": 6.078243481317931e-07, + "loss": 0.4178, + "step": 19473 + }, + { + "epoch": 2.5349472862163216, + "grad_norm": 2.8859469890594482, + "learning_rate": 6.068260756409261e-07, + "loss": 0.443, + "step": 19476 + }, + { + "epoch": 2.5353377586880126, + "grad_norm": 2.6689913272857666, + 
"learning_rate": 6.058285706283023e-07, + "loss": 0.331, + "step": 19479 + }, + { + "epoch": 2.535728231159703, + "grad_norm": 2.97873592376709, + "learning_rate": 6.048318332681824e-07, + "loss": 0.3991, + "step": 19482 + }, + { + "epoch": 2.536118703631394, + "grad_norm": 2.8189284801483154, + "learning_rate": 6.038358637346947e-07, + "loss": 0.3456, + "step": 19485 + }, + { + "epoch": 2.5365091761030847, + "grad_norm": 2.695420026779175, + "learning_rate": 6.028406622018346e-07, + "loss": 0.3596, + "step": 19488 + }, + { + "epoch": 2.5368996485747752, + "grad_norm": 2.9843835830688477, + "learning_rate": 6.018462288434601e-07, + "loss": 0.3328, + "step": 19491 + }, + { + "epoch": 2.5372901210464662, + "grad_norm": 2.7242064476013184, + "learning_rate": 6.008525638332963e-07, + "loss": 0.4317, + "step": 19494 + }, + { + "epoch": 2.5376805935181572, + "grad_norm": 2.6902647018432617, + "learning_rate": 5.998596673449348e-07, + "loss": 0.3795, + "step": 19497 + }, + { + "epoch": 2.5380710659898478, + "grad_norm": 3.0032718181610107, + "learning_rate": 5.98867539551834e-07, + "loss": 0.3836, + "step": 19500 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 3.2121152877807617, + "learning_rate": 5.978761806273159e-07, + "loss": 0.3354, + "step": 19503 + }, + { + "epoch": 2.5388520109332293, + "grad_norm": 3.1183557510375977, + "learning_rate": 5.968855907445669e-07, + "loss": 0.2782, + "step": 19506 + }, + { + "epoch": 2.53924248340492, + "grad_norm": 2.6611621379852295, + "learning_rate": 5.958957700766432e-07, + "loss": 0.3565, + "step": 19509 + }, + { + "epoch": 2.539632955876611, + "grad_norm": 3.2292160987854004, + "learning_rate": 5.949067187964642e-07, + "loss": 0.3928, + "step": 19512 + }, + { + "epoch": 2.5400234283483014, + "grad_norm": 2.88360595703125, + "learning_rate": 5.939184370768131e-07, + "loss": 0.3521, + "step": 19515 + }, + { + "epoch": 2.540413900819992, + "grad_norm": 2.851783037185669, + "learning_rate": 5.929309250903425e-07, + "loss": 0.3978, + "step": 19518 + }, + { + "epoch": 2.540804373291683, + "grad_norm": 2.560197114944458, + "learning_rate": 5.91944183009569e-07, + "loss": 0.3088, + "step": 19521 + }, + { + "epoch": 2.541194845763374, + "grad_norm": 3.081719398498535, + "learning_rate": 5.909582110068724e-07, + "loss": 0.3519, + "step": 19524 + }, + { + "epoch": 2.5415853182350645, + "grad_norm": 2.6352970600128174, + "learning_rate": 5.899730092545014e-07, + "loss": 0.3859, + "step": 19527 + }, + { + "epoch": 2.541975790706755, + "grad_norm": 2.6711513996124268, + "learning_rate": 5.889885779245691e-07, + "loss": 0.3733, + "step": 19530 + }, + { + "epoch": 2.542366263178446, + "grad_norm": 2.8868939876556396, + "learning_rate": 5.880049171890523e-07, + "loss": 0.3391, + "step": 19533 + }, + { + "epoch": 2.5427567356501366, + "grad_norm": 2.6922616958618164, + "learning_rate": 5.870220272197941e-07, + "loss": 0.3535, + "step": 19536 + }, + { + "epoch": 2.5431472081218276, + "grad_norm": 2.6204869747161865, + "learning_rate": 5.860399081885043e-07, + "loss": 0.3502, + "step": 19539 + }, + { + "epoch": 2.543537680593518, + "grad_norm": 2.710123062133789, + "learning_rate": 5.85058560266758e-07, + "loss": 0.3269, + "step": 19542 + }, + { + "epoch": 2.5439281530652087, + "grad_norm": 2.685530424118042, + "learning_rate": 5.840779836259936e-07, + "loss": 0.3666, + "step": 19545 + }, + { + "epoch": 2.5443186255368997, + "grad_norm": 3.140651226043701, + "learning_rate": 5.830981784375145e-07, + "loss": 0.3366, + "step": 19548 + }, + { + "epoch": 
2.5447090980085902, + "grad_norm": 2.9641339778900146, + "learning_rate": 5.821191448724934e-07, + "loss": 0.3882, + "step": 19551 + }, + { + "epoch": 2.5450995704802812, + "grad_norm": 2.5823094844818115, + "learning_rate": 5.811408831019633e-07, + "loss": 0.3802, + "step": 19554 + }, + { + "epoch": 2.545490042951972, + "grad_norm": 3.241447925567627, + "learning_rate": 5.801633932968237e-07, + "loss": 0.4017, + "step": 19557 + }, + { + "epoch": 2.545880515423663, + "grad_norm": 2.6996021270751953, + "learning_rate": 5.79186675627843e-07, + "loss": 0.3679, + "step": 19560 + }, + { + "epoch": 2.5462709878953533, + "grad_norm": 3.129182815551758, + "learning_rate": 5.782107302656497e-07, + "loss": 0.4384, + "step": 19563 + }, + { + "epoch": 2.5466614603670443, + "grad_norm": 2.9826695919036865, + "learning_rate": 5.772355573807386e-07, + "loss": 0.392, + "step": 19566 + }, + { + "epoch": 2.547051932838735, + "grad_norm": 2.7523341178894043, + "learning_rate": 5.762611571434729e-07, + "loss": 0.337, + "step": 19569 + }, + { + "epoch": 2.5474424053104254, + "grad_norm": 2.7614572048187256, + "learning_rate": 5.752875297240762e-07, + "loss": 0.3406, + "step": 19572 + }, + { + "epoch": 2.5478328777821164, + "grad_norm": 3.031256675720215, + "learning_rate": 5.743146752926404e-07, + "loss": 0.3894, + "step": 19575 + }, + { + "epoch": 2.548223350253807, + "grad_norm": 2.6525719165802, + "learning_rate": 5.733425940191196e-07, + "loss": 0.3594, + "step": 19578 + }, + { + "epoch": 2.548613822725498, + "grad_norm": 3.2900519371032715, + "learning_rate": 5.723712860733349e-07, + "loss": 0.4027, + "step": 19581 + }, + { + "epoch": 2.5490042951971885, + "grad_norm": 2.974825620651245, + "learning_rate": 5.714007516249731e-07, + "loss": 0.3328, + "step": 19584 + }, + { + "epoch": 2.549394767668879, + "grad_norm": 3.1361944675445557, + "learning_rate": 5.704309908435829e-07, + "loss": 0.3499, + "step": 19587 + }, + { + "epoch": 2.54978524014057, + "grad_norm": 2.7374889850616455, + "learning_rate": 5.69462003898581e-07, + "loss": 0.3019, + "step": 19590 + }, + { + "epoch": 2.550175712612261, + "grad_norm": 2.7288341522216797, + "learning_rate": 5.684937909592464e-07, + "loss": 0.2799, + "step": 19593 + }, + { + "epoch": 2.5505661850839516, + "grad_norm": 2.551090717315674, + "learning_rate": 5.675263521947244e-07, + "loss": 0.3312, + "step": 19596 + }, + { + "epoch": 2.550956657555642, + "grad_norm": 2.9384982585906982, + "learning_rate": 5.665596877740226e-07, + "loss": 0.346, + "step": 19599 + }, + { + "epoch": 2.551347130027333, + "grad_norm": 2.4525489807128906, + "learning_rate": 5.655937978660181e-07, + "loss": 0.267, + "step": 19602 + }, + { + "epoch": 2.5517376024990237, + "grad_norm": 3.117671251296997, + "learning_rate": 5.646286826394487e-07, + "loss": 0.393, + "step": 19605 + }, + { + "epoch": 2.5521280749707147, + "grad_norm": 2.629549264907837, + "learning_rate": 5.636643422629162e-07, + "loss": 0.341, + "step": 19608 + }, + { + "epoch": 2.5525185474424053, + "grad_norm": 2.5423882007598877, + "learning_rate": 5.62700776904892e-07, + "loss": 0.3133, + "step": 19611 + }, + { + "epoch": 2.552909019914096, + "grad_norm": 2.8479764461517334, + "learning_rate": 5.617379867337069e-07, + "loss": 0.3234, + "step": 19614 + }, + { + "epoch": 2.553299492385787, + "grad_norm": 2.5962421894073486, + "learning_rate": 5.607759719175581e-07, + "loss": 0.3422, + "step": 19617 + }, + { + "epoch": 2.553689964857478, + "grad_norm": 2.9569382667541504, + "learning_rate": 5.59814732624509e-07, + "loss": 0.3549, 
+ "step": 19620 + }, + { + "epoch": 2.5540804373291683, + "grad_norm": 2.6492676734924316, + "learning_rate": 5.588542690224847e-07, + "loss": 0.3782, + "step": 19623 + }, + { + "epoch": 2.554470909800859, + "grad_norm": 2.628129482269287, + "learning_rate": 5.578945812792774e-07, + "loss": 0.2951, + "step": 19626 + }, + { + "epoch": 2.55486138227255, + "grad_norm": 2.4740099906921387, + "learning_rate": 5.569356695625411e-07, + "loss": 0.2915, + "step": 19629 + }, + { + "epoch": 2.5552518547442404, + "grad_norm": 2.6669182777404785, + "learning_rate": 5.559775340397972e-07, + "loss": 0.353, + "step": 19632 + }, + { + "epoch": 2.5556423272159314, + "grad_norm": 3.07163667678833, + "learning_rate": 5.550201748784295e-07, + "loss": 0.3871, + "step": 19635 + }, + { + "epoch": 2.556032799687622, + "grad_norm": 3.0605452060699463, + "learning_rate": 5.54063592245686e-07, + "loss": 0.3861, + "step": 19638 + }, + { + "epoch": 2.5564232721593125, + "grad_norm": 3.2628283500671387, + "learning_rate": 5.531077863086798e-07, + "loss": 0.3629, + "step": 19641 + }, + { + "epoch": 2.5568137446310035, + "grad_norm": 2.872985601425171, + "learning_rate": 5.521527572343888e-07, + "loss": 0.3232, + "step": 19644 + }, + { + "epoch": 2.5572042171026945, + "grad_norm": 2.832007884979248, + "learning_rate": 5.511985051896546e-07, + "loss": 0.3507, + "step": 19647 + }, + { + "epoch": 2.557594689574385, + "grad_norm": 3.344686985015869, + "learning_rate": 5.502450303411816e-07, + "loss": 0.2877, + "step": 19650 + }, + { + "epoch": 2.5579851620460756, + "grad_norm": 2.880563974380493, + "learning_rate": 5.492923328555416e-07, + "loss": 0.3184, + "step": 19653 + }, + { + "epoch": 2.5583756345177666, + "grad_norm": 2.620983839035034, + "learning_rate": 5.483404128991682e-07, + "loss": 0.3796, + "step": 19656 + }, + { + "epoch": 2.558766106989457, + "grad_norm": 2.6286096572875977, + "learning_rate": 5.473892706383587e-07, + "loss": 0.3424, + "step": 19659 + }, + { + "epoch": 2.559156579461148, + "grad_norm": 2.9008336067199707, + "learning_rate": 5.464389062392783e-07, + "loss": 0.3349, + "step": 19662 + }, + { + "epoch": 2.5595470519328387, + "grad_norm": 2.709076404571533, + "learning_rate": 5.454893198679507e-07, + "loss": 0.3707, + "step": 19665 + }, + { + "epoch": 2.5599375244045293, + "grad_norm": 3.218621253967285, + "learning_rate": 5.445405116902686e-07, + "loss": 0.3714, + "step": 19668 + }, + { + "epoch": 2.5603279968762203, + "grad_norm": 2.944977283477783, + "learning_rate": 5.435924818719857e-07, + "loss": 0.3407, + "step": 19671 + }, + { + "epoch": 2.560718469347911, + "grad_norm": 2.735933303833008, + "learning_rate": 5.426452305787222e-07, + "loss": 0.3715, + "step": 19674 + }, + { + "epoch": 2.561108941819602, + "grad_norm": 2.785769462585449, + "learning_rate": 5.416987579759597e-07, + "loss": 0.3397, + "step": 19677 + }, + { + "epoch": 2.5614994142912924, + "grad_norm": 3.0471670627593994, + "learning_rate": 5.407530642290442e-07, + "loss": 0.3995, + "step": 19680 + }, + { + "epoch": 2.5618898867629833, + "grad_norm": 2.528057336807251, + "learning_rate": 5.398081495031893e-07, + "loss": 0.3123, + "step": 19683 + }, + { + "epoch": 2.562280359234674, + "grad_norm": 2.9553000926971436, + "learning_rate": 5.388640139634671e-07, + "loss": 0.3747, + "step": 19686 + }, + { + "epoch": 2.562670831706365, + "grad_norm": 2.6886420249938965, + "learning_rate": 5.37920657774817e-07, + "loss": 0.3329, + "step": 19689 + }, + { + "epoch": 2.5630613041780554, + "grad_norm": 2.6885476112365723, + "learning_rate": 
5.369780811020403e-07, + "loss": 0.318, + "step": 19692 + }, + { + "epoch": 2.563451776649746, + "grad_norm": 2.7424001693725586, + "learning_rate": 5.360362841098043e-07, + "loss": 0.3761, + "step": 19695 + }, + { + "epoch": 2.563842249121437, + "grad_norm": 2.803973913192749, + "learning_rate": 5.350952669626397e-07, + "loss": 0.3694, + "step": 19698 + }, + { + "epoch": 2.5642327215931275, + "grad_norm": 2.916654109954834, + "learning_rate": 5.341550298249376e-07, + "loss": 0.4022, + "step": 19701 + }, + { + "epoch": 2.5646231940648185, + "grad_norm": 3.0839455127716064, + "learning_rate": 5.332155728609578e-07, + "loss": 0.3461, + "step": 19704 + }, + { + "epoch": 2.565013666536509, + "grad_norm": 2.8296546936035156, + "learning_rate": 5.322768962348201e-07, + "loss": 0.3642, + "step": 19707 + }, + { + "epoch": 2.5654041390082, + "grad_norm": 2.6688482761383057, + "learning_rate": 5.313390001105106e-07, + "loss": 0.3447, + "step": 19710 + }, + { + "epoch": 2.5657946114798906, + "grad_norm": 2.6866798400878906, + "learning_rate": 5.304018846518765e-07, + "loss": 0.3273, + "step": 19713 + }, + { + "epoch": 2.5661850839515816, + "grad_norm": 2.8025333881378174, + "learning_rate": 5.294655500226315e-07, + "loss": 0.35, + "step": 19716 + }, + { + "epoch": 2.566575556423272, + "grad_norm": 2.8305671215057373, + "learning_rate": 5.285299963863499e-07, + "loss": 0.3589, + "step": 19719 + }, + { + "epoch": 2.5669660288949627, + "grad_norm": 2.790302276611328, + "learning_rate": 5.275952239064708e-07, + "loss": 0.2947, + "step": 19722 + }, + { + "epoch": 2.5673565013666537, + "grad_norm": 2.739863872528076, + "learning_rate": 5.266612327462978e-07, + "loss": 0.3707, + "step": 19725 + }, + { + "epoch": 2.5677469738383443, + "grad_norm": 2.8916966915130615, + "learning_rate": 5.257280230689976e-07, + "loss": 0.373, + "step": 19728 + }, + { + "epoch": 2.5681374463100353, + "grad_norm": 2.925980567932129, + "learning_rate": 5.247955950375977e-07, + "loss": 0.3426, + "step": 19731 + }, + { + "epoch": 2.568527918781726, + "grad_norm": 2.7908196449279785, + "learning_rate": 5.238639488149944e-07, + "loss": 0.3189, + "step": 19734 + }, + { + "epoch": 2.5689183912534164, + "grad_norm": 2.8355512619018555, + "learning_rate": 5.229330845639424e-07, + "loss": 0.3587, + "step": 19737 + }, + { + "epoch": 2.5693088637251074, + "grad_norm": 2.771878480911255, + "learning_rate": 5.220030024470623e-07, + "loss": 0.3447, + "step": 19740 + }, + { + "epoch": 2.5696993361967984, + "grad_norm": 2.8163154125213623, + "learning_rate": 5.210737026268364e-07, + "loss": 0.3142, + "step": 19743 + }, + { + "epoch": 2.570089808668489, + "grad_norm": 2.9766600131988525, + "learning_rate": 5.201451852656137e-07, + "loss": 0.396, + "step": 19746 + }, + { + "epoch": 2.5704802811401795, + "grad_norm": 2.753567934036255, + "learning_rate": 5.192174505256014e-07, + "loss": 0.3244, + "step": 19749 + }, + { + "epoch": 2.5708707536118705, + "grad_norm": 2.696614980697632, + "learning_rate": 5.182904985688758e-07, + "loss": 0.3593, + "step": 19752 + }, + { + "epoch": 2.571261226083561, + "grad_norm": 3.015610456466675, + "learning_rate": 5.173643295573704e-07, + "loss": 0.3158, + "step": 19755 + }, + { + "epoch": 2.571651698555252, + "grad_norm": 2.938066244125366, + "learning_rate": 5.164389436528877e-07, + "loss": 0.3686, + "step": 19758 + }, + { + "epoch": 2.5720421710269425, + "grad_norm": 2.7356908321380615, + "learning_rate": 5.155143410170899e-07, + "loss": 0.3215, + "step": 19761 + }, + { + "epoch": 2.572432643498633, + 
"grad_norm": 2.872227668762207, + "learning_rate": 5.14590521811501e-07, + "loss": 0.4187, + "step": 19764 + }, + { + "epoch": 2.572823115970324, + "grad_norm": 2.6312549114227295, + "learning_rate": 5.136674861975138e-07, + "loss": 0.3705, + "step": 19767 + }, + { + "epoch": 2.573213588442015, + "grad_norm": 3.510446310043335, + "learning_rate": 5.127452343363787e-07, + "loss": 0.3801, + "step": 19770 + }, + { + "epoch": 2.5736040609137056, + "grad_norm": 2.8721954822540283, + "learning_rate": 5.118237663892101e-07, + "loss": 0.2959, + "step": 19773 + }, + { + "epoch": 2.573994533385396, + "grad_norm": 2.6207385063171387, + "learning_rate": 5.109030825169886e-07, + "loss": 0.3646, + "step": 19776 + }, + { + "epoch": 2.574385005857087, + "grad_norm": 2.5428993701934814, + "learning_rate": 5.099831828805552e-07, + "loss": 0.329, + "step": 19779 + }, + { + "epoch": 2.5747754783287777, + "grad_norm": 2.899235725402832, + "learning_rate": 5.090640676406134e-07, + "loss": 0.3592, + "step": 19782 + }, + { + "epoch": 2.5751659508004687, + "grad_norm": 2.9908111095428467, + "learning_rate": 5.081457369577319e-07, + "loss": 0.4394, + "step": 19785 + }, + { + "epoch": 2.5755564232721593, + "grad_norm": 2.572739601135254, + "learning_rate": 5.07228190992341e-07, + "loss": 0.3404, + "step": 19788 + }, + { + "epoch": 2.57594689574385, + "grad_norm": 2.801175117492676, + "learning_rate": 5.063114299047328e-07, + "loss": 0.3375, + "step": 19791 + }, + { + "epoch": 2.576337368215541, + "grad_norm": 2.8451387882232666, + "learning_rate": 5.053954538550643e-07, + "loss": 0.3875, + "step": 19794 + }, + { + "epoch": 2.576727840687232, + "grad_norm": 2.8498640060424805, + "learning_rate": 5.044802630033557e-07, + "loss": 0.4177, + "step": 19797 + }, + { + "epoch": 2.5771183131589224, + "grad_norm": 3.0200397968292236, + "learning_rate": 5.035658575094882e-07, + "loss": 0.3213, + "step": 19800 + }, + { + "epoch": 2.577508785630613, + "grad_norm": 3.0799405574798584, + "learning_rate": 5.02652237533206e-07, + "loss": 0.4103, + "step": 19803 + }, + { + "epoch": 2.577899258102304, + "grad_norm": 2.6357455253601074, + "learning_rate": 5.017394032341161e-07, + "loss": 0.2962, + "step": 19806 + }, + { + "epoch": 2.5782897305739945, + "grad_norm": 2.754474401473999, + "learning_rate": 5.008273547716902e-07, + "loss": 0.3561, + "step": 19809 + }, + { + "epoch": 2.5786802030456855, + "grad_norm": 2.776348829269409, + "learning_rate": 4.999160923052604e-07, + "loss": 0.3432, + "step": 19812 + }, + { + "epoch": 2.579070675517376, + "grad_norm": 2.6089048385620117, + "learning_rate": 4.990056159940221e-07, + "loss": 0.3669, + "step": 19815 + }, + { + "epoch": 2.5794611479890666, + "grad_norm": 2.7450947761535645, + "learning_rate": 4.980959259970347e-07, + "loss": 0.3924, + "step": 19818 + }, + { + "epoch": 2.5798516204607576, + "grad_norm": 2.4832842350006104, + "learning_rate": 4.971870224732184e-07, + "loss": 0.3488, + "step": 19821 + }, + { + "epoch": 2.580242092932448, + "grad_norm": 3.0447306632995605, + "learning_rate": 4.962789055813555e-07, + "loss": 0.3573, + "step": 19824 + }, + { + "epoch": 2.580632565404139, + "grad_norm": 2.752413272857666, + "learning_rate": 4.953715754800947e-07, + "loss": 0.3524, + "step": 19827 + }, + { + "epoch": 2.5810230378758297, + "grad_norm": 2.7045111656188965, + "learning_rate": 4.944650323279432e-07, + "loss": 0.3342, + "step": 19830 + }, + { + "epoch": 2.5814135103475206, + "grad_norm": 2.5593090057373047, + "learning_rate": 4.935592762832714e-07, + "loss": 0.3024, + "step": 
19833 + }, + { + "epoch": 2.581803982819211, + "grad_norm": 2.776071786880493, + "learning_rate": 4.926543075043133e-07, + "loss": 0.3213, + "step": 19836 + }, + { + "epoch": 2.582194455290902, + "grad_norm": 2.6728298664093018, + "learning_rate": 4.917501261491675e-07, + "loss": 0.3532, + "step": 19839 + }, + { + "epoch": 2.5825849277625927, + "grad_norm": 2.9024364948272705, + "learning_rate": 4.908467323757898e-07, + "loss": 0.3839, + "step": 19842 + }, + { + "epoch": 2.5829754002342833, + "grad_norm": 2.7096471786499023, + "learning_rate": 4.899441263420019e-07, + "loss": 0.3769, + "step": 19845 + }, + { + "epoch": 2.5833658727059743, + "grad_norm": 2.882371425628662, + "learning_rate": 4.890423082054879e-07, + "loss": 0.3751, + "step": 19848 + }, + { + "epoch": 2.583756345177665, + "grad_norm": 2.866162061691284, + "learning_rate": 4.881412781237927e-07, + "loss": 0.3513, + "step": 19851 + }, + { + "epoch": 2.584146817649356, + "grad_norm": 2.852534770965576, + "learning_rate": 4.872410362543251e-07, + "loss": 0.3674, + "step": 19854 + }, + { + "epoch": 2.5845372901210464, + "grad_norm": 2.7616591453552246, + "learning_rate": 4.863415827543539e-07, + "loss": 0.3503, + "step": 19857 + }, + { + "epoch": 2.5849277625927374, + "grad_norm": 2.753648519515991, + "learning_rate": 4.854429177810138e-07, + "loss": 0.3549, + "step": 19860 + }, + { + "epoch": 2.585318235064428, + "grad_norm": 2.8216161727905273, + "learning_rate": 4.845450414912989e-07, + "loss": 0.3334, + "step": 19863 + }, + { + "epoch": 2.585708707536119, + "grad_norm": 2.91745662689209, + "learning_rate": 4.836479540420653e-07, + "loss": 0.41, + "step": 19866 + }, + { + "epoch": 2.5860991800078095, + "grad_norm": 3.008896589279175, + "learning_rate": 4.827516555900335e-07, + "loss": 0.3873, + "step": 19869 + }, + { + "epoch": 2.5864896524795, + "grad_norm": 2.9034063816070557, + "learning_rate": 4.818561462917848e-07, + "loss": 0.3723, + "step": 19872 + }, + { + "epoch": 2.586880124951191, + "grad_norm": 3.033580780029297, + "learning_rate": 4.809614263037621e-07, + "loss": 0.4597, + "step": 19875 + }, + { + "epoch": 2.5872705974228816, + "grad_norm": 2.697132110595703, + "learning_rate": 4.800674957822709e-07, + "loss": 0.3399, + "step": 19878 + }, + { + "epoch": 2.5876610698945726, + "grad_norm": 2.8508565425872803, + "learning_rate": 4.791743548834809e-07, + "loss": 0.4714, + "step": 19881 + }, + { + "epoch": 2.588051542366263, + "grad_norm": 2.757885694503784, + "learning_rate": 4.782820037634206e-07, + "loss": 0.4015, + "step": 19884 + }, + { + "epoch": 2.5884420148379537, + "grad_norm": 2.6383426189422607, + "learning_rate": 4.773904425779807e-07, + "loss": 0.3587, + "step": 19887 + }, + { + "epoch": 2.5888324873096447, + "grad_norm": 2.7818305492401123, + "learning_rate": 4.7649967148291787e-07, + "loss": 0.3431, + "step": 19890 + }, + { + "epoch": 2.5892229597813357, + "grad_norm": 2.781233549118042, + "learning_rate": 4.7560969063384587e-07, + "loss": 0.3586, + "step": 19893 + }, + { + "epoch": 2.589613432253026, + "grad_norm": 2.4520440101623535, + "learning_rate": 4.747205001862421e-07, + "loss": 0.3147, + "step": 19896 + }, + { + "epoch": 2.5900039047247168, + "grad_norm": 2.895876407623291, + "learning_rate": 4.7383210029544826e-07, + "loss": 0.4083, + "step": 19899 + }, + { + "epoch": 2.5903943771964077, + "grad_norm": 2.7189314365386963, + "learning_rate": 4.7294449111666474e-07, + "loss": 0.3006, + "step": 19902 + }, + { + "epoch": 2.5907848496680983, + "grad_norm": 2.5522122383117676, + "learning_rate": 
4.720576728049553e-07, + "loss": 0.3353, + "step": 19905 + }, + { + "epoch": 2.5911753221397893, + "grad_norm": 2.8779947757720947, + "learning_rate": 4.711716455152437e-07, + "loss": 0.3466, + "step": 19908 + }, + { + "epoch": 2.59156579461148, + "grad_norm": 3.0736544132232666, + "learning_rate": 4.7028640940231964e-07, + "loss": 0.3948, + "step": 19911 + }, + { + "epoch": 2.5919562670831704, + "grad_norm": 2.9793858528137207, + "learning_rate": 4.694019646208309e-07, + "loss": 0.4169, + "step": 19914 + }, + { + "epoch": 2.5923467395548614, + "grad_norm": 2.964721441268921, + "learning_rate": 4.685183113252867e-07, + "loss": 0.3437, + "step": 19917 + }, + { + "epoch": 2.5927372120265524, + "grad_norm": 2.8925373554229736, + "learning_rate": 4.6763544967006027e-07, + "loss": 0.3583, + "step": 19920 + }, + { + "epoch": 2.593127684498243, + "grad_norm": 2.848095178604126, + "learning_rate": 4.667533798093876e-07, + "loss": 0.3217, + "step": 19923 + }, + { + "epoch": 2.5935181569699335, + "grad_norm": 2.6200132369995117, + "learning_rate": 4.6587210189736277e-07, + "loss": 0.4197, + "step": 19926 + }, + { + "epoch": 2.5939086294416245, + "grad_norm": 3.714834451675415, + "learning_rate": 4.6499161608794253e-07, + "loss": 0.3621, + "step": 19929 + }, + { + "epoch": 2.594299101913315, + "grad_norm": 2.842902898788452, + "learning_rate": 4.641119225349472e-07, + "loss": 0.4068, + "step": 19932 + }, + { + "epoch": 2.594689574385006, + "grad_norm": 2.6865339279174805, + "learning_rate": 4.632330213920572e-07, + "loss": 0.3629, + "step": 19935 + }, + { + "epoch": 2.5950800468566966, + "grad_norm": 2.490020751953125, + "learning_rate": 4.6235491281281354e-07, + "loss": 0.3555, + "step": 19938 + }, + { + "epoch": 2.595470519328387, + "grad_norm": 2.818798780441284, + "learning_rate": 4.614775969506219e-07, + "loss": 0.3819, + "step": 19941 + }, + { + "epoch": 2.595860991800078, + "grad_norm": 2.8724567890167236, + "learning_rate": 4.6060107395874575e-07, + "loss": 0.3519, + "step": 19944 + }, + { + "epoch": 2.596251464271769, + "grad_norm": 2.898418664932251, + "learning_rate": 4.597253439903121e-07, + "loss": 0.3509, + "step": 19947 + }, + { + "epoch": 2.5966419367434597, + "grad_norm": 2.461851119995117, + "learning_rate": 4.588504071983102e-07, + "loss": 0.3343, + "step": 19950 + }, + { + "epoch": 2.59703240921515, + "grad_norm": 3.4729630947113037, + "learning_rate": 4.579762637355889e-07, + "loss": 0.3653, + "step": 19953 + }, + { + "epoch": 2.597422881686841, + "grad_norm": 2.9097652435302734, + "learning_rate": 4.5710291375485995e-07, + "loss": 0.3598, + "step": 19956 + }, + { + "epoch": 2.5978133541585318, + "grad_norm": 2.9537763595581055, + "learning_rate": 4.5623035740869237e-07, + "loss": 0.3626, + "step": 19959 + }, + { + "epoch": 2.5982038266302228, + "grad_norm": 2.8022215366363525, + "learning_rate": 4.553585948495254e-07, + "loss": 0.3239, + "step": 19962 + }, + { + "epoch": 2.5985942991019133, + "grad_norm": 3.070570945739746, + "learning_rate": 4.544876262296505e-07, + "loss": 0.397, + "step": 19965 + }, + { + "epoch": 2.598984771573604, + "grad_norm": 2.5882253646850586, + "learning_rate": 4.536174517012254e-07, + "loss": 0.3869, + "step": 19968 + }, + { + "epoch": 2.599375244045295, + "grad_norm": 2.780194044113159, + "learning_rate": 4.527480714162663e-07, + "loss": 0.3459, + "step": 19971 + }, + { + "epoch": 2.5997657165169854, + "grad_norm": 2.8268415927886963, + "learning_rate": 4.5187948552665394e-07, + "loss": 0.3352, + "step": 19974 + }, + { + "epoch": 2.6001561889886764, 
+ "grad_norm": 2.7509450912475586, + "learning_rate": 4.5101169418412804e-07, + "loss": 0.3166, + "step": 19977 + }, + { + "epoch": 2.600546661460367, + "grad_norm": 2.9784107208251953, + "learning_rate": 4.501446975402879e-07, + "loss": 0.3974, + "step": 19980 + }, + { + "epoch": 2.600937133932058, + "grad_norm": 2.658228874206543, + "learning_rate": 4.49278495746599e-07, + "loss": 0.2955, + "step": 19983 + }, + { + "epoch": 2.6013276064037485, + "grad_norm": 2.9209887981414795, + "learning_rate": 4.4841308895438363e-07, + "loss": 0.4038, + "step": 19986 + }, + { + "epoch": 2.6017180788754395, + "grad_norm": 3.115154981613159, + "learning_rate": 4.475484773148253e-07, + "loss": 0.4056, + "step": 19989 + }, + { + "epoch": 2.60210855134713, + "grad_norm": 3.18581223487854, + "learning_rate": 4.4668466097897214e-07, + "loss": 0.3335, + "step": 19992 + }, + { + "epoch": 2.6024990238188206, + "grad_norm": 2.865859270095825, + "learning_rate": 4.458216400977294e-07, + "loss": 0.3325, + "step": 19995 + }, + { + "epoch": 2.6028894962905116, + "grad_norm": 2.806417465209961, + "learning_rate": 4.449594148218661e-07, + "loss": 0.3596, + "step": 19998 + }, + { + "epoch": 2.603279968762202, + "grad_norm": 2.626926898956299, + "learning_rate": 4.4409798530200887e-07, + "loss": 0.3196, + "step": 20001 + }, + { + "epoch": 2.603670441233893, + "grad_norm": 2.9506349563598633, + "learning_rate": 4.4323735168865067e-07, + "loss": 0.3033, + "step": 20004 + }, + { + "epoch": 2.6040609137055837, + "grad_norm": 2.665252685546875, + "learning_rate": 4.423775141321418e-07, + "loss": 0.3409, + "step": 20007 + }, + { + "epoch": 2.6044513861772742, + "grad_norm": 2.7964937686920166, + "learning_rate": 4.4151847278269213e-07, + "loss": 0.4056, + "step": 20010 + }, + { + "epoch": 2.6048418586489652, + "grad_norm": 2.7492387294769287, + "learning_rate": 4.40660227790376e-07, + "loss": 0.3769, + "step": 20013 + }, + { + "epoch": 2.605232331120656, + "grad_norm": 2.866215944290161, + "learning_rate": 4.398027793051274e-07, + "loss": 0.3084, + "step": 20016 + }, + { + "epoch": 2.6056228035923468, + "grad_norm": 2.538553237915039, + "learning_rate": 4.389461274767398e-07, + "loss": 0.3312, + "step": 20019 + }, + { + "epoch": 2.6060132760640373, + "grad_norm": 2.998384475708008, + "learning_rate": 4.3809027245486745e-07, + "loss": 0.3473, + "step": 20022 + }, + { + "epoch": 2.6064037485357283, + "grad_norm": 3.2550208568573, + "learning_rate": 4.3723521438902907e-07, + "loss": 0.3076, + "step": 20025 + }, + { + "epoch": 2.606794221007419, + "grad_norm": 2.7280972003936768, + "learning_rate": 4.3638095342859953e-07, + "loss": 0.393, + "step": 20028 + }, + { + "epoch": 2.60718469347911, + "grad_norm": 2.8212859630584717, + "learning_rate": 4.3552748972281623e-07, + "loss": 0.3391, + "step": 20031 + }, + { + "epoch": 2.6075751659508004, + "grad_norm": 2.6648576259613037, + "learning_rate": 4.3467482342077927e-07, + "loss": 0.2934, + "step": 20034 + }, + { + "epoch": 2.607965638422491, + "grad_norm": 2.6012842655181885, + "learning_rate": 4.3382295467144675e-07, + "loss": 0.3, + "step": 20037 + }, + { + "epoch": 2.608356110894182, + "grad_norm": 2.910468578338623, + "learning_rate": 4.329718836236374e-07, + "loss": 0.3812, + "step": 20040 + }, + { + "epoch": 2.608746583365873, + "grad_norm": 3.135700225830078, + "learning_rate": 4.3212161042603174e-07, + "loss": 0.3906, + "step": 20043 + }, + { + "epoch": 2.6091370558375635, + "grad_norm": 2.9458134174346924, + "learning_rate": 4.312721352271726e-07, + "loss": 0.3705, + 
"step": 20046 + }, + { + "epoch": 2.609527528309254, + "grad_norm": 2.7890570163726807, + "learning_rate": 4.304234581754602e-07, + "loss": 0.3879, + "step": 20049 + }, + { + "epoch": 2.609918000780945, + "grad_norm": 2.8146069049835205, + "learning_rate": 4.2957557941915586e-07, + "loss": 0.3378, + "step": 20052 + }, + { + "epoch": 2.6103084732526356, + "grad_norm": 2.8290700912475586, + "learning_rate": 4.287284991063839e-07, + "loss": 0.3459, + "step": 20055 + }, + { + "epoch": 2.6106989457243266, + "grad_norm": 2.8539445400238037, + "learning_rate": 4.278822173851266e-07, + "loss": 0.3173, + "step": 20058 + }, + { + "epoch": 2.611089418196017, + "grad_norm": 2.866018056869507, + "learning_rate": 4.270367344032278e-07, + "loss": 0.3547, + "step": 20061 + }, + { + "epoch": 2.6114798906677077, + "grad_norm": 2.7992513179779053, + "learning_rate": 4.2619205030838993e-07, + "loss": 0.2944, + "step": 20064 + }, + { + "epoch": 2.6118703631393987, + "grad_norm": 2.9336869716644287, + "learning_rate": 4.2534816524818054e-07, + "loss": 0.327, + "step": 20067 + }, + { + "epoch": 2.6122608356110897, + "grad_norm": 2.6383185386657715, + "learning_rate": 4.245050793700228e-07, + "loss": 0.3384, + "step": 20070 + }, + { + "epoch": 2.6126513080827802, + "grad_norm": 2.8900146484375, + "learning_rate": 4.2366279282120113e-07, + "loss": 0.3804, + "step": 20073 + }, + { + "epoch": 2.613041780554471, + "grad_norm": 2.861504316329956, + "learning_rate": 4.2282130574886336e-07, + "loss": 0.3865, + "step": 20076 + }, + { + "epoch": 2.6134322530261618, + "grad_norm": 3.0964694023132324, + "learning_rate": 4.2198061830001467e-07, + "loss": 0.3979, + "step": 20079 + }, + { + "epoch": 2.6138227254978523, + "grad_norm": 2.622819423675537, + "learning_rate": 4.211407306215198e-07, + "loss": 0.3653, + "step": 20082 + }, + { + "epoch": 2.6142131979695433, + "grad_norm": 2.8246631622314453, + "learning_rate": 4.20301642860107e-07, + "loss": 0.3648, + "step": 20085 + }, + { + "epoch": 2.614603670441234, + "grad_norm": 2.9626944065093994, + "learning_rate": 4.194633551623645e-07, + "loss": 0.3551, + "step": 20088 + }, + { + "epoch": 2.6149941429129244, + "grad_norm": 3.4414031505584717, + "learning_rate": 4.186258676747368e-07, + "loss": 0.3116, + "step": 20091 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 2.694920539855957, + "learning_rate": 4.177891805435319e-07, + "loss": 0.3415, + "step": 20094 + }, + { + "epoch": 2.615775087856306, + "grad_norm": 2.726844549179077, + "learning_rate": 4.169532939149185e-07, + "loss": 0.349, + "step": 20097 + }, + { + "epoch": 2.616165560327997, + "grad_norm": 2.7306230068206787, + "learning_rate": 4.1611820793492294e-07, + "loss": 0.3459, + "step": 20100 + }, + { + "epoch": 2.6165560327996875, + "grad_norm": 2.7479240894317627, + "learning_rate": 4.152839227494332e-07, + "loss": 0.3669, + "step": 20103 + }, + { + "epoch": 2.6169465052713785, + "grad_norm": 2.936683416366577, + "learning_rate": 4.1445043850419754e-07, + "loss": 0.3581, + "step": 20106 + }, + { + "epoch": 2.617336977743069, + "grad_norm": 2.6035518646240234, + "learning_rate": 4.1361775534482397e-07, + "loss": 0.3312, + "step": 20109 + }, + { + "epoch": 2.61772745021476, + "grad_norm": 2.8025290966033936, + "learning_rate": 4.1278587341678e-07, + "loss": 0.3405, + "step": 20112 + }, + { + "epoch": 2.6181179226864506, + "grad_norm": 2.8133556842803955, + "learning_rate": 4.119547928653933e-07, + "loss": 0.3747, + "step": 20115 + }, + { + "epoch": 2.618508395158141, + "grad_norm": 2.7612462043762207, + 
"learning_rate": 4.111245138358527e-07, + "loss": 0.3516, + "step": 20118 + }, + { + "epoch": 2.618898867629832, + "grad_norm": 2.857440710067749, + "learning_rate": 4.1029503647320666e-07, + "loss": 0.3266, + "step": 20121 + }, + { + "epoch": 2.6192893401015227, + "grad_norm": 2.9601211547851562, + "learning_rate": 4.094663609223615e-07, + "loss": 0.3357, + "step": 20124 + }, + { + "epoch": 2.6196798125732137, + "grad_norm": 2.95589017868042, + "learning_rate": 4.086384873280863e-07, + "loss": 0.3328, + "step": 20127 + }, + { + "epoch": 2.6200702850449042, + "grad_norm": 3.003638744354248, + "learning_rate": 4.0781141583500825e-07, + "loss": 0.3574, + "step": 20130 + }, + { + "epoch": 2.6204607575165952, + "grad_norm": 2.928009033203125, + "learning_rate": 4.069851465876157e-07, + "loss": 0.4451, + "step": 20133 + }, + { + "epoch": 2.620851229988286, + "grad_norm": 3.0818943977355957, + "learning_rate": 4.061596797302553e-07, + "loss": 0.3478, + "step": 20136 + }, + { + "epoch": 2.621241702459977, + "grad_norm": 3.1439309120178223, + "learning_rate": 4.053350154071356e-07, + "loss": 0.3554, + "step": 20139 + }, + { + "epoch": 2.6216321749316673, + "grad_norm": 2.61098313331604, + "learning_rate": 4.045111537623231e-07, + "loss": 0.3411, + "step": 20142 + }, + { + "epoch": 2.622022647403358, + "grad_norm": 2.8817806243896484, + "learning_rate": 4.0368809493974314e-07, + "loss": 0.4183, + "step": 20145 + }, + { + "epoch": 2.622413119875049, + "grad_norm": 2.906568765640259, + "learning_rate": 4.028658390831852e-07, + "loss": 0.3663, + "step": 20148 + }, + { + "epoch": 2.6228035923467394, + "grad_norm": 2.7763681411743164, + "learning_rate": 4.0204438633629383e-07, + "loss": 0.395, + "step": 20151 + }, + { + "epoch": 2.6231940648184304, + "grad_norm": 2.8315131664276123, + "learning_rate": 4.0122373684257474e-07, + "loss": 0.3649, + "step": 20154 + }, + { + "epoch": 2.623584537290121, + "grad_norm": 3.1497440338134766, + "learning_rate": 4.0040389074539555e-07, + "loss": 0.3771, + "step": 20157 + }, + { + "epoch": 2.6239750097618115, + "grad_norm": 2.719172477722168, + "learning_rate": 3.9958484818797995e-07, + "loss": 0.3331, + "step": 20160 + }, + { + "epoch": 2.6243654822335025, + "grad_norm": 3.178786277770996, + "learning_rate": 3.9876660931341405e-07, + "loss": 0.3777, + "step": 20163 + }, + { + "epoch": 2.6247559547051935, + "grad_norm": 2.772921085357666, + "learning_rate": 3.9794917426464074e-07, + "loss": 0.3759, + "step": 20166 + }, + { + "epoch": 2.625146427176884, + "grad_norm": 2.7855470180511475, + "learning_rate": 3.971325431844664e-07, + "loss": 0.3807, + "step": 20169 + }, + { + "epoch": 2.6255368996485746, + "grad_norm": 2.967480182647705, + "learning_rate": 3.963167162155529e-07, + "loss": 0.3764, + "step": 20172 + }, + { + "epoch": 2.6259273721202656, + "grad_norm": 2.7995481491088867, + "learning_rate": 3.9550169350042464e-07, + "loss": 0.3978, + "step": 20175 + }, + { + "epoch": 2.626317844591956, + "grad_norm": 3.274163246154785, + "learning_rate": 3.946874751814639e-07, + "loss": 0.3355, + "step": 20178 + }, + { + "epoch": 2.626708317063647, + "grad_norm": 2.6372921466827393, + "learning_rate": 3.93874061400914e-07, + "loss": 0.3227, + "step": 20181 + }, + { + "epoch": 2.6270987895353377, + "grad_norm": 2.845742702484131, + "learning_rate": 3.9306145230087524e-07, + "loss": 0.3701, + "step": 20184 + }, + { + "epoch": 2.6274892620070283, + "grad_norm": 2.5727438926696777, + "learning_rate": 3.9224964802330847e-07, + "loss": 0.3382, + "step": 20187 + }, + { + "epoch": 
2.6278797344787193, + "grad_norm": 3.014756679534912, + "learning_rate": 3.914386487100358e-07, + "loss": 0.3586, + "step": 20190 + }, + { + "epoch": 2.6282702069504102, + "grad_norm": 2.7406253814697266, + "learning_rate": 3.9062845450273613e-07, + "loss": 0.3508, + "step": 20193 + }, + { + "epoch": 2.628660679422101, + "grad_norm": 2.826164960861206, + "learning_rate": 3.898190655429479e-07, + "loss": 0.4258, + "step": 20196 + }, + { + "epoch": 2.6290511518937913, + "grad_norm": 3.219733238220215, + "learning_rate": 3.890104819720719e-07, + "loss": 0.3397, + "step": 20199 + }, + { + "epoch": 2.6294416243654823, + "grad_norm": 2.6495490074157715, + "learning_rate": 3.8820270393136403e-07, + "loss": 0.3296, + "step": 20202 + }, + { + "epoch": 2.629832096837173, + "grad_norm": 3.0028276443481445, + "learning_rate": 3.873957315619414e-07, + "loss": 0.3653, + "step": 20205 + }, + { + "epoch": 2.630222569308864, + "grad_norm": 2.973289728164673, + "learning_rate": 3.8658956500478127e-07, + "loss": 0.3419, + "step": 20208 + }, + { + "epoch": 2.6306130417805544, + "grad_norm": 3.0267601013183594, + "learning_rate": 3.857842044007193e-07, + "loss": 0.3816, + "step": 20211 + }, + { + "epoch": 2.631003514252245, + "grad_norm": 2.8618972301483154, + "learning_rate": 3.849796498904496e-07, + "loss": 0.3335, + "step": 20214 + }, + { + "epoch": 2.631393986723936, + "grad_norm": 3.1597700119018555, + "learning_rate": 3.841759016145258e-07, + "loss": 0.4617, + "step": 20217 + }, + { + "epoch": 2.631784459195627, + "grad_norm": 3.0783212184906006, + "learning_rate": 3.8337295971336285e-07, + "loss": 0.4516, + "step": 20220 + }, + { + "epoch": 2.6321749316673175, + "grad_norm": 2.588809013366699, + "learning_rate": 3.825708243272319e-07, + "loss": 0.3026, + "step": 20223 + }, + { + "epoch": 2.632565404139008, + "grad_norm": 2.936833381652832, + "learning_rate": 3.817694955962642e-07, + "loss": 0.3481, + "step": 20226 + }, + { + "epoch": 2.632955876610699, + "grad_norm": 2.49509596824646, + "learning_rate": 3.8096897366044936e-07, + "loss": 0.3365, + "step": 20229 + }, + { + "epoch": 2.6333463490823896, + "grad_norm": 2.5137100219726562, + "learning_rate": 3.801692586596384e-07, + "loss": 0.3061, + "step": 20232 + }, + { + "epoch": 2.6337368215540806, + "grad_norm": 3.77799916267395, + "learning_rate": 3.7937035073353947e-07, + "loss": 0.3653, + "step": 20235 + }, + { + "epoch": 2.634127294025771, + "grad_norm": 3.221705198287964, + "learning_rate": 3.785722500217193e-07, + "loss": 0.354, + "step": 20238 + }, + { + "epoch": 2.6345177664974617, + "grad_norm": 2.6136856079101562, + "learning_rate": 3.7777495666360574e-07, + "loss": 0.3911, + "step": 20241 + }, + { + "epoch": 2.6349082389691527, + "grad_norm": 2.5668489933013916, + "learning_rate": 3.76978470798483e-07, + "loss": 0.2889, + "step": 20244 + }, + { + "epoch": 2.6352987114408433, + "grad_norm": 2.760801076889038, + "learning_rate": 3.7618279256549597e-07, + "loss": 0.3479, + "step": 20247 + }, + { + "epoch": 2.6356891839125343, + "grad_norm": 2.920722723007202, + "learning_rate": 3.7538792210364825e-07, + "loss": 0.3856, + "step": 20250 + }, + { + "epoch": 2.636079656384225, + "grad_norm": 3.082993268966675, + "learning_rate": 3.745938595518017e-07, + "loss": 0.3859, + "step": 20253 + }, + { + "epoch": 2.636470128855916, + "grad_norm": 3.254218101501465, + "learning_rate": 3.7380060504867697e-07, + "loss": 0.3902, + "step": 20256 + }, + { + "epoch": 2.6368606013276064, + "grad_norm": 2.6879429817199707, + "learning_rate": 3.7300815873285435e-07, + 
"loss": 0.3567, + "step": 20259 + }, + { + "epoch": 2.6372510737992974, + "grad_norm": 2.662904977798462, + "learning_rate": 3.722165207427736e-07, + "loss": 0.3279, + "step": 20262 + }, + { + "epoch": 2.637641546270988, + "grad_norm": 2.596517562866211, + "learning_rate": 3.7142569121673135e-07, + "loss": 0.3851, + "step": 20265 + }, + { + "epoch": 2.6380320187426785, + "grad_norm": 2.8582632541656494, + "learning_rate": 3.706356702928826e-07, + "loss": 0.364, + "step": 20268 + }, + { + "epoch": 2.6384224912143694, + "grad_norm": 2.894286632537842, + "learning_rate": 3.6984645810924423e-07, + "loss": 0.3438, + "step": 20271 + }, + { + "epoch": 2.63881296368606, + "grad_norm": 2.674154758453369, + "learning_rate": 3.690580548036893e-07, + "loss": 0.3641, + "step": 20274 + }, + { + "epoch": 2.639203436157751, + "grad_norm": 2.6992299556732178, + "learning_rate": 3.6827046051395035e-07, + "loss": 0.3502, + "step": 20277 + }, + { + "epoch": 2.6395939086294415, + "grad_norm": 2.4692561626434326, + "learning_rate": 3.674836753776173e-07, + "loss": 0.3447, + "step": 20280 + }, + { + "epoch": 2.639984381101132, + "grad_norm": 2.7542686462402344, + "learning_rate": 3.666976995321414e-07, + "loss": 0.3577, + "step": 20283 + }, + { + "epoch": 2.640374853572823, + "grad_norm": 3.003816843032837, + "learning_rate": 3.6591253311483056e-07, + "loss": 0.2958, + "step": 20286 + }, + { + "epoch": 2.640765326044514, + "grad_norm": 2.7368361949920654, + "learning_rate": 3.6512817626285056e-07, + "loss": 0.3826, + "step": 20289 + }, + { + "epoch": 2.6411557985162046, + "grad_norm": 2.821338176727295, + "learning_rate": 3.6434462911322856e-07, + "loss": 0.3569, + "step": 20292 + }, + { + "epoch": 2.641546270987895, + "grad_norm": 2.715587615966797, + "learning_rate": 3.635618918028477e-07, + "loss": 0.369, + "step": 20295 + }, + { + "epoch": 2.641936743459586, + "grad_norm": 2.71569561958313, + "learning_rate": 3.627799644684504e-07, + "loss": 0.3227, + "step": 20298 + }, + { + "epoch": 2.6423272159312767, + "grad_norm": 3.0673129558563232, + "learning_rate": 3.6199884724663734e-07, + "loss": 0.3651, + "step": 20301 + }, + { + "epoch": 2.6427176884029677, + "grad_norm": 3.4038453102111816, + "learning_rate": 3.612185402738705e-07, + "loss": 0.3598, + "step": 20304 + }, + { + "epoch": 2.6431081608746583, + "grad_norm": 2.976976156234741, + "learning_rate": 3.604390436864652e-07, + "loss": 0.3942, + "step": 20307 + }, + { + "epoch": 2.643498633346349, + "grad_norm": 3.015279531478882, + "learning_rate": 3.596603576205987e-07, + "loss": 0.413, + "step": 20310 + }, + { + "epoch": 2.64388910581804, + "grad_norm": 2.8536202907562256, + "learning_rate": 3.5888248221230605e-07, + "loss": 0.3735, + "step": 20313 + }, + { + "epoch": 2.644279578289731, + "grad_norm": 2.8353049755096436, + "learning_rate": 3.5810541759748076e-07, + "loss": 0.3604, + "step": 20316 + }, + { + "epoch": 2.6446700507614214, + "grad_norm": 2.661177158355713, + "learning_rate": 3.5732916391187254e-07, + "loss": 0.3191, + "step": 20319 + }, + { + "epoch": 2.645060523233112, + "grad_norm": 2.5510129928588867, + "learning_rate": 3.5655372129109356e-07, + "loss": 0.3296, + "step": 20322 + }, + { + "epoch": 2.645450995704803, + "grad_norm": 2.8251776695251465, + "learning_rate": 3.557790898706115e-07, + "loss": 0.3705, + "step": 20325 + }, + { + "epoch": 2.6458414681764935, + "grad_norm": 2.7708895206451416, + "learning_rate": 3.55005269785752e-07, + "loss": 0.3361, + "step": 20328 + }, + { + "epoch": 2.6462319406481845, + "grad_norm": 
2.593419313430786, + "learning_rate": 3.5423226117169973e-07, + "loss": 0.3692, + "step": 20331 + }, + { + "epoch": 2.646622413119875, + "grad_norm": 3.5748655796051025, + "learning_rate": 3.5346006416349886e-07, + "loss": 0.3352, + "step": 20334 + }, + { + "epoch": 2.6470128855915656, + "grad_norm": 3.5419743061065674, + "learning_rate": 3.5268867889604983e-07, + "loss": 0.3423, + "step": 20337 + }, + { + "epoch": 2.6474033580632566, + "grad_norm": 2.934070587158203, + "learning_rate": 3.5191810550411155e-07, + "loss": 0.3658, + "step": 20340 + }, + { + "epoch": 2.6477938305349475, + "grad_norm": 2.5605757236480713, + "learning_rate": 3.511483441223018e-07, + "loss": 0.3037, + "step": 20343 + }, + { + "epoch": 2.648184303006638, + "grad_norm": 2.988960027694702, + "learning_rate": 3.503793948850975e-07, + "loss": 0.3674, + "step": 20346 + }, + { + "epoch": 2.6485747754783286, + "grad_norm": 2.7395620346069336, + "learning_rate": 3.4961125792683184e-07, + "loss": 0.3509, + "step": 20349 + }, + { + "epoch": 2.6489652479500196, + "grad_norm": 2.5244340896606445, + "learning_rate": 3.488439333816951e-07, + "loss": 0.3272, + "step": 20352 + }, + { + "epoch": 2.64935572042171, + "grad_norm": 2.6971912384033203, + "learning_rate": 3.480774213837396e-07, + "loss": 0.3596, + "step": 20355 + }, + { + "epoch": 2.649746192893401, + "grad_norm": 3.134498119354248, + "learning_rate": 3.4731172206687257e-07, + "loss": 0.3619, + "step": 20358 + }, + { + "epoch": 2.6501366653650917, + "grad_norm": 2.9849419593811035, + "learning_rate": 3.465468355648588e-07, + "loss": 0.3809, + "step": 20361 + }, + { + "epoch": 2.6505271378367823, + "grad_norm": 2.864431858062744, + "learning_rate": 3.457827620113241e-07, + "loss": 0.3084, + "step": 20364 + }, + { + "epoch": 2.6509176103084733, + "grad_norm": 3.1214239597320557, + "learning_rate": 3.4501950153975003e-07, + "loss": 0.3852, + "step": 20367 + }, + { + "epoch": 2.651308082780164, + "grad_norm": 2.6842384338378906, + "learning_rate": 3.4425705428347556e-07, + "loss": 0.3501, + "step": 20370 + }, + { + "epoch": 2.651698555251855, + "grad_norm": 3.266386032104492, + "learning_rate": 3.434954203757007e-07, + "loss": 0.3648, + "step": 20373 + }, + { + "epoch": 2.6520890277235454, + "grad_norm": 2.81931471824646, + "learning_rate": 3.427345999494797e-07, + "loss": 0.3676, + "step": 20376 + }, + { + "epoch": 2.6524795001952364, + "grad_norm": 2.9153671264648438, + "learning_rate": 3.419745931377261e-07, + "loss": 0.3825, + "step": 20379 + }, + { + "epoch": 2.652869972666927, + "grad_norm": 2.6502585411071777, + "learning_rate": 3.41215400073211e-07, + "loss": 0.3146, + "step": 20382 + }, + { + "epoch": 2.653260445138618, + "grad_norm": 2.8781330585479736, + "learning_rate": 3.404570208885666e-07, + "loss": 0.4162, + "step": 20385 + }, + { + "epoch": 2.6536509176103085, + "grad_norm": 3.011082410812378, + "learning_rate": 3.3969945571627805e-07, + "loss": 0.3475, + "step": 20388 + }, + { + "epoch": 2.654041390081999, + "grad_norm": 2.7348506450653076, + "learning_rate": 3.389427046886906e-07, + "loss": 0.3648, + "step": 20391 + }, + { + "epoch": 2.65443186255369, + "grad_norm": 2.8573429584503174, + "learning_rate": 3.381867679380069e-07, + "loss": 0.3945, + "step": 20394 + }, + { + "epoch": 2.6548223350253806, + "grad_norm": 2.513740301132202, + "learning_rate": 3.374316455962884e-07, + "loss": 0.302, + "step": 20397 + }, + { + "epoch": 2.6552128074970716, + "grad_norm": 2.592041015625, + "learning_rate": 3.36677337795453e-07, + "loss": 0.3118, + "step": 20400 + }, 
+ { + "epoch": 2.655603279968762, + "grad_norm": 2.622539520263672, + "learning_rate": 3.359238446672752e-07, + "loss": 0.346, + "step": 20403 + }, + { + "epoch": 2.655993752440453, + "grad_norm": 2.9038712978363037, + "learning_rate": 3.3517116634339097e-07, + "loss": 0.3932, + "step": 20406 + }, + { + "epoch": 2.6563842249121437, + "grad_norm": 2.3799030780792236, + "learning_rate": 3.344193029552911e-07, + "loss": 0.3106, + "step": 20409 + }, + { + "epoch": 2.6567746973838346, + "grad_norm": 2.7198572158813477, + "learning_rate": 3.336682546343228e-07, + "loss": 0.3262, + "step": 20412 + }, + { + "epoch": 2.657165169855525, + "grad_norm": 2.7029428482055664, + "learning_rate": 3.329180215116945e-07, + "loss": 0.3479, + "step": 20415 + }, + { + "epoch": 2.6575556423272158, + "grad_norm": 2.6045374870300293, + "learning_rate": 3.3216860371847013e-07, + "loss": 0.3007, + "step": 20418 + }, + { + "epoch": 2.6579461147989067, + "grad_norm": 2.965404748916626, + "learning_rate": 3.314200013855706e-07, + "loss": 0.3138, + "step": 20421 + }, + { + "epoch": 2.6583365872705973, + "grad_norm": 2.752427577972412, + "learning_rate": 3.3067221464377407e-07, + "loss": 0.3779, + "step": 20424 + }, + { + "epoch": 2.6587270597422883, + "grad_norm": 2.7018942832946777, + "learning_rate": 3.29925243623721e-07, + "loss": 0.3085, + "step": 20427 + }, + { + "epoch": 2.659117532213979, + "grad_norm": 2.695830821990967, + "learning_rate": 3.2917908845590263e-07, + "loss": 0.3486, + "step": 20430 + }, + { + "epoch": 2.6595080046856694, + "grad_norm": 2.9110679626464844, + "learning_rate": 3.2843374927067126e-07, + "loss": 0.3119, + "step": 20433 + }, + { + "epoch": 2.6598984771573604, + "grad_norm": 2.8462071418762207, + "learning_rate": 3.276892261982373e-07, + "loss": 0.3364, + "step": 20436 + }, + { + "epoch": 2.6602889496290514, + "grad_norm": 2.907721757888794, + "learning_rate": 3.26945519368666e-07, + "loss": 0.4112, + "step": 20439 + }, + { + "epoch": 2.660679422100742, + "grad_norm": 2.566709041595459, + "learning_rate": 3.2620262891188195e-07, + "loss": 0.3851, + "step": 20442 + }, + { + "epoch": 2.6610698945724325, + "grad_norm": 2.8268721103668213, + "learning_rate": 3.254605549576656e-07, + "loss": 0.3545, + "step": 20445 + }, + { + "epoch": 2.6614603670441235, + "grad_norm": 2.767282485961914, + "learning_rate": 3.2471929763565725e-07, + "loss": 0.3658, + "step": 20448 + }, + { + "epoch": 2.661850839515814, + "grad_norm": 3.0643694400787354, + "learning_rate": 3.2397885707535216e-07, + "loss": 0.3307, + "step": 20451 + }, + { + "epoch": 2.662241311987505, + "grad_norm": 3.210627555847168, + "learning_rate": 3.2323923340610296e-07, + "loss": 0.3483, + "step": 20454 + }, + { + "epoch": 2.6626317844591956, + "grad_norm": 2.5691981315612793, + "learning_rate": 3.2250042675712246e-07, + "loss": 0.2977, + "step": 20457 + }, + { + "epoch": 2.663022256930886, + "grad_norm": 2.906088352203369, + "learning_rate": 3.2176243725747736e-07, + "loss": 0.3708, + "step": 20460 + }, + { + "epoch": 2.663412729402577, + "grad_norm": 2.960641622543335, + "learning_rate": 3.210252650360918e-07, + "loss": 0.3184, + "step": 20463 + }, + { + "epoch": 2.663803201874268, + "grad_norm": 2.767688274383545, + "learning_rate": 3.2028891022174934e-07, + "loss": 0.364, + "step": 20466 + }, + { + "epoch": 2.6641936743459587, + "grad_norm": 2.6504275798797607, + "learning_rate": 3.1955337294309054e-07, + "loss": 0.3184, + "step": 20469 + }, + { + "epoch": 2.664584146817649, + "grad_norm": 2.9848546981811523, + "learning_rate": 
3.1881865332861086e-07, + "loss": 0.3828, + "step": 20472 + }, + { + "epoch": 2.66497461928934, + "grad_norm": 2.9928441047668457, + "learning_rate": 3.180847515066643e-07, + "loss": 0.3589, + "step": 20475 + }, + { + "epoch": 2.6653650917610308, + "grad_norm": 3.2330639362335205, + "learning_rate": 3.173516676054628e-07, + "loss": 0.3848, + "step": 20478 + }, + { + "epoch": 2.6657555642327218, + "grad_norm": 2.6625139713287354, + "learning_rate": 3.1661940175307437e-07, + "loss": 0.365, + "step": 20481 + }, + { + "epoch": 2.6661460367044123, + "grad_norm": 2.730323553085327, + "learning_rate": 3.15887954077424e-07, + "loss": 0.328, + "step": 20484 + }, + { + "epoch": 2.666536509176103, + "grad_norm": 2.785428524017334, + "learning_rate": 3.1515732470629335e-07, + "loss": 0.3185, + "step": 20487 + }, + { + "epoch": 2.666926981647794, + "grad_norm": 2.6901137828826904, + "learning_rate": 3.144275137673236e-07, + "loss": 0.298, + "step": 20490 + }, + { + "epoch": 2.667317454119485, + "grad_norm": 2.662198066711426, + "learning_rate": 3.1369852138801006e-07, + "loss": 0.3452, + "step": 20493 + }, + { + "epoch": 2.6677079265911754, + "grad_norm": 2.7666854858398438, + "learning_rate": 3.1297034769570523e-07, + "loss": 0.333, + "step": 20496 + }, + { + "epoch": 2.668098399062866, + "grad_norm": 2.821671962738037, + "learning_rate": 3.1224299281762184e-07, + "loss": 0.3391, + "step": 20499 + }, + { + "epoch": 2.668488871534557, + "grad_norm": 2.7123184204101562, + "learning_rate": 3.115164568808254e-07, + "loss": 0.3117, + "step": 20502 + }, + { + "epoch": 2.6688793440062475, + "grad_norm": 2.6388189792633057, + "learning_rate": 3.107907400122406e-07, + "loss": 0.3992, + "step": 20505 + }, + { + "epoch": 2.6692698164779385, + "grad_norm": 2.8583977222442627, + "learning_rate": 3.100658423386488e-07, + "loss": 0.3633, + "step": 20508 + }, + { + "epoch": 2.669660288949629, + "grad_norm": 2.892751932144165, + "learning_rate": 3.093417639866886e-07, + "loss": 0.3982, + "step": 20511 + }, + { + "epoch": 2.6700507614213196, + "grad_norm": 2.8561367988586426, + "learning_rate": 3.0861850508285496e-07, + "loss": 0.4017, + "step": 20514 + }, + { + "epoch": 2.6704412338930106, + "grad_norm": 3.0648305416107178, + "learning_rate": 3.07896065753498e-07, + "loss": 0.4046, + "step": 20517 + }, + { + "epoch": 2.670831706364701, + "grad_norm": 2.555460214614868, + "learning_rate": 3.0717444612482883e-07, + "loss": 0.3425, + "step": 20520 + }, + { + "epoch": 2.671222178836392, + "grad_norm": 2.744659185409546, + "learning_rate": 3.064536463229112e-07, + "loss": 0.4306, + "step": 20523 + }, + { + "epoch": 2.6716126513080827, + "grad_norm": 2.578082323074341, + "learning_rate": 3.0573366647366764e-07, + "loss": 0.3021, + "step": 20526 + }, + { + "epoch": 2.6720031237797737, + "grad_norm": 2.590057849884033, + "learning_rate": 3.0501450670287756e-07, + "loss": 0.3247, + "step": 20529 + }, + { + "epoch": 2.672393596251464, + "grad_norm": 2.5789554119110107, + "learning_rate": 3.0429616713617607e-07, + "loss": 0.3732, + "step": 20532 + }, + { + "epoch": 2.672784068723155, + "grad_norm": 2.6770694255828857, + "learning_rate": 3.0357864789905653e-07, + "loss": 0.3459, + "step": 20535 + }, + { + "epoch": 2.6731745411948458, + "grad_norm": 2.7113239765167236, + "learning_rate": 3.0286194911686606e-07, + "loss": 0.3877, + "step": 20538 + }, + { + "epoch": 2.6735650136665363, + "grad_norm": 2.6579864025115967, + "learning_rate": 3.0214607091481276e-07, + "loss": 0.3727, + "step": 20541 + }, + { + "epoch": 
2.6739554861382273, + "grad_norm": 2.62597918510437, + "learning_rate": 3.0143101341795823e-07, + "loss": 0.3935, + "step": 20544 + }, + { + "epoch": 2.674345958609918, + "grad_norm": 2.7485511302948, + "learning_rate": 3.0071677675122035e-07, + "loss": 0.4071, + "step": 20547 + }, + { + "epoch": 2.674736431081609, + "grad_norm": 2.8608615398406982, + "learning_rate": 3.0000336103937597e-07, + "loss": 0.3377, + "step": 20550 + }, + { + "epoch": 2.6751269035532994, + "grad_norm": 2.4845223426818848, + "learning_rate": 2.9929076640705714e-07, + "loss": 0.324, + "step": 20553 + }, + { + "epoch": 2.6755173760249904, + "grad_norm": 2.7939839363098145, + "learning_rate": 2.9857899297875304e-07, + "loss": 0.4306, + "step": 20556 + }, + { + "epoch": 2.675907848496681, + "grad_norm": 2.790100574493408, + "learning_rate": 2.9786804087880816e-07, + "loss": 0.4033, + "step": 20559 + }, + { + "epoch": 2.676298320968372, + "grad_norm": 2.7309770584106445, + "learning_rate": 2.9715791023142484e-07, + "loss": 0.3931, + "step": 20562 + }, + { + "epoch": 2.6766887934400625, + "grad_norm": 2.698127031326294, + "learning_rate": 2.9644860116066155e-07, + "loss": 0.3301, + "step": 20565 + }, + { + "epoch": 2.677079265911753, + "grad_norm": 2.612318754196167, + "learning_rate": 2.95740113790432e-07, + "loss": 0.3219, + "step": 20568 + }, + { + "epoch": 2.677469738383444, + "grad_norm": 2.7074458599090576, + "learning_rate": 2.950324482445088e-07, + "loss": 0.3249, + "step": 20571 + }, + { + "epoch": 2.6778602108551346, + "grad_norm": 2.573791265487671, + "learning_rate": 2.943256046465193e-07, + "loss": 0.3419, + "step": 20574 + }, + { + "epoch": 2.6782506833268256, + "grad_norm": 3.1822309494018555, + "learning_rate": 2.936195831199468e-07, + "loss": 0.3264, + "step": 20577 + }, + { + "epoch": 2.678641155798516, + "grad_norm": 2.9114439487457275, + "learning_rate": 2.929143837881326e-07, + "loss": 0.4111, + "step": 20580 + }, + { + "epoch": 2.6790316282702067, + "grad_norm": 2.9312427043914795, + "learning_rate": 2.922100067742739e-07, + "loss": 0.4224, + "step": 20583 + }, + { + "epoch": 2.6794221007418977, + "grad_norm": 3.390676259994507, + "learning_rate": 2.9150645220142273e-07, + "loss": 0.3307, + "step": 20586 + }, + { + "epoch": 2.6798125732135887, + "grad_norm": 2.750668525695801, + "learning_rate": 2.908037201924885e-07, + "loss": 0.4167, + "step": 20589 + }, + { + "epoch": 2.6802030456852792, + "grad_norm": 2.667370319366455, + "learning_rate": 2.9010181087023804e-07, + "loss": 0.3587, + "step": 20592 + }, + { + "epoch": 2.68059351815697, + "grad_norm": 3.1470043659210205, + "learning_rate": 2.894007243572933e-07, + "loss": 0.5031, + "step": 20595 + }, + { + "epoch": 2.6809839906286608, + "grad_norm": 2.8547494411468506, + "learning_rate": 2.887004607761329e-07, + "loss": 0.3249, + "step": 20598 + }, + { + "epoch": 2.6813744631003513, + "grad_norm": 2.8937506675720215, + "learning_rate": 2.880010202490896e-07, + "loss": 0.4048, + "step": 20601 + }, + { + "epoch": 2.6817649355720423, + "grad_norm": 2.470564365386963, + "learning_rate": 2.873024028983562e-07, + "loss": 0.2885, + "step": 20604 + }, + { + "epoch": 2.682155408043733, + "grad_norm": 2.804993152618408, + "learning_rate": 2.8660460884597953e-07, + "loss": 0.3877, + "step": 20607 + }, + { + "epoch": 2.6825458805154234, + "grad_norm": 2.796926259994507, + "learning_rate": 2.859076382138609e-07, + "loss": 0.3878, + "step": 20610 + }, + { + "epoch": 2.6829363529871144, + "grad_norm": 2.4991455078125, + "learning_rate": 2.852114911237619e-07, + 
"loss": 0.2931, + "step": 20613 + }, + { + "epoch": 2.6833268254588054, + "grad_norm": 2.7926175594329834, + "learning_rate": 2.845161676972968e-07, + "loss": 0.3678, + "step": 20616 + }, + { + "epoch": 2.683717297930496, + "grad_norm": 3.0255000591278076, + "learning_rate": 2.838216680559364e-07, + "loss": 0.3649, + "step": 20619 + }, + { + "epoch": 2.6841077704021865, + "grad_norm": 2.6406142711639404, + "learning_rate": 2.8312799232101007e-07, + "loss": 0.3252, + "step": 20622 + }, + { + "epoch": 2.6844982428738775, + "grad_norm": 2.9375603199005127, + "learning_rate": 2.824351406137005e-07, + "loss": 0.3596, + "step": 20625 + }, + { + "epoch": 2.684888715345568, + "grad_norm": 2.7059028148651123, + "learning_rate": 2.817431130550474e-07, + "loss": 0.4015, + "step": 20628 + }, + { + "epoch": 2.685279187817259, + "grad_norm": 2.975783586502075, + "learning_rate": 2.81051909765947e-07, + "loss": 0.345, + "step": 20631 + }, + { + "epoch": 2.6856696602889496, + "grad_norm": 2.8388760089874268, + "learning_rate": 2.8036153086714976e-07, + "loss": 0.3673, + "step": 20634 + }, + { + "epoch": 2.68606013276064, + "grad_norm": 2.8770854473114014, + "learning_rate": 2.7967197647926547e-07, + "loss": 0.3382, + "step": 20637 + }, + { + "epoch": 2.686450605232331, + "grad_norm": 2.836329460144043, + "learning_rate": 2.78983246722756e-07, + "loss": 0.4078, + "step": 20640 + }, + { + "epoch": 2.686841077704022, + "grad_norm": 2.931439161300659, + "learning_rate": 2.7829534171794236e-07, + "loss": 0.3589, + "step": 20643 + }, + { + "epoch": 2.6872315501757127, + "grad_norm": 2.7307281494140625, + "learning_rate": 2.7760826158499955e-07, + "loss": 0.3384, + "step": 20646 + }, + { + "epoch": 2.6876220226474032, + "grad_norm": 3.2210898399353027, + "learning_rate": 2.769220064439593e-07, + "loss": 0.4175, + "step": 20649 + }, + { + "epoch": 2.6880124951190942, + "grad_norm": 2.8683855533599854, + "learning_rate": 2.7623657641470734e-07, + "loss": 0.3799, + "step": 20652 + }, + { + "epoch": 2.688402967590785, + "grad_norm": 2.820208787918091, + "learning_rate": 2.755519716169891e-07, + "loss": 0.4057, + "step": 20655 + }, + { + "epoch": 2.688793440062476, + "grad_norm": 2.7478857040405273, + "learning_rate": 2.7486819217040273e-07, + "loss": 0.3818, + "step": 20658 + }, + { + "epoch": 2.6891839125341663, + "grad_norm": 2.4639649391174316, + "learning_rate": 2.7418523819440214e-07, + "loss": 0.2983, + "step": 20661 + }, + { + "epoch": 2.689574385005857, + "grad_norm": 2.7031710147857666, + "learning_rate": 2.735031098082996e-07, + "loss": 0.3065, + "step": 20664 + }, + { + "epoch": 2.689964857477548, + "grad_norm": 3.303699254989624, + "learning_rate": 2.728218071312605e-07, + "loss": 0.3628, + "step": 20667 + }, + { + "epoch": 2.6903553299492384, + "grad_norm": 2.7157487869262695, + "learning_rate": 2.721413302823067e-07, + "loss": 0.3855, + "step": 20670 + }, + { + "epoch": 2.6907458024209294, + "grad_norm": 2.9699556827545166, + "learning_rate": 2.714616793803171e-07, + "loss": 0.3907, + "step": 20673 + }, + { + "epoch": 2.69113627489262, + "grad_norm": 2.9308390617370605, + "learning_rate": 2.707828545440239e-07, + "loss": 0.3205, + "step": 20676 + }, + { + "epoch": 2.691526747364311, + "grad_norm": 2.65004563331604, + "learning_rate": 2.701048558920183e-07, + "loss": 0.2922, + "step": 20679 + }, + { + "epoch": 2.6919172198360015, + "grad_norm": 3.5007753372192383, + "learning_rate": 2.6942768354274283e-07, + "loss": 0.354, + "step": 20682 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 
2.951852560043335, + "learning_rate": 2.687513376145007e-07, + "loss": 0.3592, + "step": 20685 + }, + { + "epoch": 2.692698164779383, + "grad_norm": 2.6974475383758545, + "learning_rate": 2.6807581822544616e-07, + "loss": 0.3565, + "step": 20688 + }, + { + "epoch": 2.6930886372510736, + "grad_norm": 3.6439194679260254, + "learning_rate": 2.6740112549359154e-07, + "loss": 0.3602, + "step": 20691 + }, + { + "epoch": 2.6934791097227646, + "grad_norm": 2.9142510890960693, + "learning_rate": 2.6672725953680476e-07, + "loss": 0.4112, + "step": 20694 + }, + { + "epoch": 2.693869582194455, + "grad_norm": 2.7999353408813477, + "learning_rate": 2.660542204728084e-07, + "loss": 0.3124, + "step": 20697 + }, + { + "epoch": 2.694260054666146, + "grad_norm": 2.537796974182129, + "learning_rate": 2.6538200841918103e-07, + "loss": 0.3301, + "step": 20700 + }, + { + "epoch": 2.6946505271378367, + "grad_norm": 3.0205585956573486, + "learning_rate": 2.6471062349335606e-07, + "loss": 0.3173, + "step": 20703 + }, + { + "epoch": 2.6950409996095273, + "grad_norm": 2.9223034381866455, + "learning_rate": 2.640400658126241e-07, + "loss": 0.4226, + "step": 20706 + }, + { + "epoch": 2.6954314720812182, + "grad_norm": 2.9517760276794434, + "learning_rate": 2.633703354941297e-07, + "loss": 0.3851, + "step": 20709 + }, + { + "epoch": 2.6958219445529092, + "grad_norm": 3.43226957321167, + "learning_rate": 2.6270143265487214e-07, + "loss": 0.4494, + "step": 20712 + }, + { + "epoch": 2.6962124170246, + "grad_norm": 2.9508416652679443, + "learning_rate": 2.6203335741170955e-07, + "loss": 0.4241, + "step": 20715 + }, + { + "epoch": 2.6966028894962903, + "grad_norm": 2.6791441440582275, + "learning_rate": 2.613661098813519e-07, + "loss": 0.3442, + "step": 20718 + }, + { + "epoch": 2.6969933619679813, + "grad_norm": 2.6088688373565674, + "learning_rate": 2.6069969018036655e-07, + "loss": 0.3336, + "step": 20721 + }, + { + "epoch": 2.697383834439672, + "grad_norm": 2.9644179344177246, + "learning_rate": 2.6003409842517425e-07, + "loss": 0.3761, + "step": 20724 + }, + { + "epoch": 2.697774306911363, + "grad_norm": 2.931863307952881, + "learning_rate": 2.5936933473205473e-07, + "loss": 0.3512, + "step": 20727 + }, + { + "epoch": 2.6981647793830534, + "grad_norm": 3.0752289295196533, + "learning_rate": 2.5870539921713955e-07, + "loss": 0.3662, + "step": 20730 + }, + { + "epoch": 2.698555251854744, + "grad_norm": 2.8674252033233643, + "learning_rate": 2.5804229199641594e-07, + "loss": 0.3455, + "step": 20733 + }, + { + "epoch": 2.698945724326435, + "grad_norm": 2.8869755268096924, + "learning_rate": 2.57380013185729e-07, + "loss": 0.3543, + "step": 20736 + }, + { + "epoch": 2.699336196798126, + "grad_norm": 2.9068305492401123, + "learning_rate": 2.567185629007768e-07, + "loss": 0.4262, + "step": 20739 + }, + { + "epoch": 2.6997266692698165, + "grad_norm": 2.6254804134368896, + "learning_rate": 2.560579412571129e-07, + "loss": 0.4404, + "step": 20742 + }, + { + "epoch": 2.700117141741507, + "grad_norm": 3.4937686920166016, + "learning_rate": 2.5539814837014734e-07, + "loss": 0.3142, + "step": 20745 + }, + { + "epoch": 2.700507614213198, + "grad_norm": 2.548431158065796, + "learning_rate": 2.547391843551439e-07, + "loss": 0.3637, + "step": 20748 + }, + { + "epoch": 2.7008980866848886, + "grad_norm": 3.108660936355591, + "learning_rate": 2.540810493272222e-07, + "loss": 0.3424, + "step": 20751 + }, + { + "epoch": 2.7012885591565796, + "grad_norm": 2.6465303897857666, + "learning_rate": 2.53423743401357e-07, + "loss": 0.3347, + "step": 
20754 + }, + { + "epoch": 2.70167903162827, + "grad_norm": 2.571742057800293, + "learning_rate": 2.5276726669237917e-07, + "loss": 0.2957, + "step": 20757 + }, + { + "epoch": 2.7020695040999607, + "grad_norm": 2.738924741744995, + "learning_rate": 2.5211161931497195e-07, + "loss": 0.3669, + "step": 20760 + }, + { + "epoch": 2.7024599765716517, + "grad_norm": 2.8582653999328613, + "learning_rate": 2.5145680138367823e-07, + "loss": 0.4023, + "step": 20763 + }, + { + "epoch": 2.7028504490433427, + "grad_norm": 2.8408925533294678, + "learning_rate": 2.5080281301289034e-07, + "loss": 0.424, + "step": 20766 + }, + { + "epoch": 2.7032409215150333, + "grad_norm": 2.618551015853882, + "learning_rate": 2.5014965431686133e-07, + "loss": 0.2904, + "step": 20769 + }, + { + "epoch": 2.703631393986724, + "grad_norm": 2.550246000289917, + "learning_rate": 2.4949732540969553e-07, + "loss": 0.3352, + "step": 20772 + }, + { + "epoch": 2.704021866458415, + "grad_norm": 2.683685541152954, + "learning_rate": 2.488458264053523e-07, + "loss": 0.3334, + "step": 20775 + }, + { + "epoch": 2.7044123389301054, + "grad_norm": 2.9266393184661865, + "learning_rate": 2.481951574176494e-07, + "loss": 0.3804, + "step": 20778 + }, + { + "epoch": 2.7048028114017963, + "grad_norm": 2.7701256275177, + "learning_rate": 2.4754531856025557e-07, + "loss": 0.364, + "step": 20781 + }, + { + "epoch": 2.705193283873487, + "grad_norm": 2.6166975498199463, + "learning_rate": 2.4689630994669646e-07, + "loss": 0.3529, + "step": 20784 + }, + { + "epoch": 2.7055837563451774, + "grad_norm": 2.662623643875122, + "learning_rate": 2.462481316903537e-07, + "loss": 0.296, + "step": 20787 + }, + { + "epoch": 2.7059742288168684, + "grad_norm": 3.0897302627563477, + "learning_rate": 2.4560078390446216e-07, + "loss": 0.4447, + "step": 20790 + }, + { + "epoch": 2.706364701288559, + "grad_norm": 2.9640982151031494, + "learning_rate": 2.4495426670211154e-07, + "loss": 0.3694, + "step": 20793 + }, + { + "epoch": 2.70675517376025, + "grad_norm": 2.899473190307617, + "learning_rate": 2.443085801962469e-07, + "loss": 0.3705, + "step": 20796 + }, + { + "epoch": 2.7071456462319405, + "grad_norm": 2.732571601867676, + "learning_rate": 2.4366372449966924e-07, + "loss": 0.3467, + "step": 20799 + }, + { + "epoch": 2.7075361187036315, + "grad_norm": 4.155490398406982, + "learning_rate": 2.4301969972503223e-07, + "loss": 0.3475, + "step": 20802 + }, + { + "epoch": 2.707926591175322, + "grad_norm": 2.9572913646698, + "learning_rate": 2.4237650598484707e-07, + "loss": 0.3588, + "step": 20805 + }, + { + "epoch": 2.708317063647013, + "grad_norm": 2.9816484451293945, + "learning_rate": 2.417341433914777e-07, + "loss": 0.3269, + "step": 20808 + }, + { + "epoch": 2.7087075361187036, + "grad_norm": 2.436990261077881, + "learning_rate": 2.4109261205714386e-07, + "loss": 0.3758, + "step": 20811 + }, + { + "epoch": 2.709098008590394, + "grad_norm": 2.87477707862854, + "learning_rate": 2.404519120939197e-07, + "loss": 0.3635, + "step": 20814 + }, + { + "epoch": 2.709488481062085, + "grad_norm": 2.7576093673706055, + "learning_rate": 2.3981204361373247e-07, + "loss": 0.3162, + "step": 20817 + }, + { + "epoch": 2.7098789535337757, + "grad_norm": 3.003218650817871, + "learning_rate": 2.3917300672836876e-07, + "loss": 0.344, + "step": 20820 + }, + { + "epoch": 2.7102694260054667, + "grad_norm": 2.829570770263672, + "learning_rate": 2.385348015494648e-07, + "loss": 0.399, + "step": 20823 + }, + { + "epoch": 2.7106598984771573, + "grad_norm": 2.7657134532928467, + "learning_rate": 
2.378974281885138e-07, + "loss": 0.3849, + "step": 20826 + }, + { + "epoch": 2.7110503709488483, + "grad_norm": 3.6430046558380127, + "learning_rate": 2.3726088675686542e-07, + "loss": 0.3418, + "step": 20829 + }, + { + "epoch": 2.711440843420539, + "grad_norm": 3.0621814727783203, + "learning_rate": 2.366251773657202e-07, + "loss": 0.3826, + "step": 20832 + }, + { + "epoch": 2.71183131589223, + "grad_norm": 2.840770721435547, + "learning_rate": 2.359903001261349e-07, + "loss": 0.3095, + "step": 20835 + }, + { + "epoch": 2.7122217883639204, + "grad_norm": 2.6842424869537354, + "learning_rate": 2.353562551490235e-07, + "loss": 0.3479, + "step": 20838 + }, + { + "epoch": 2.712612260835611, + "grad_norm": 2.5753138065338135, + "learning_rate": 2.3472304254515022e-07, + "loss": 0.3303, + "step": 20841 + }, + { + "epoch": 2.713002733307302, + "grad_norm": 4.057579040527344, + "learning_rate": 2.3409066242513655e-07, + "loss": 0.3138, + "step": 20844 + }, + { + "epoch": 2.7133932057789925, + "grad_norm": 2.7594196796417236, + "learning_rate": 2.3345911489945806e-07, + "loss": 0.35, + "step": 20847 + }, + { + "epoch": 2.7137836782506835, + "grad_norm": 2.6751604080200195, + "learning_rate": 2.3282840007844586e-07, + "loss": 0.3223, + "step": 20850 + }, + { + "epoch": 2.714174150722374, + "grad_norm": 3.2291057109832764, + "learning_rate": 2.3219851807228298e-07, + "loss": 0.4123, + "step": 20853 + }, + { + "epoch": 2.7145646231940646, + "grad_norm": 2.8833391666412354, + "learning_rate": 2.3156946899100918e-07, + "loss": 0.3697, + "step": 20856 + }, + { + "epoch": 2.7149550956657555, + "grad_norm": 2.8411707878112793, + "learning_rate": 2.3094125294451709e-07, + "loss": 0.3905, + "step": 20859 + }, + { + "epoch": 2.7153455681374465, + "grad_norm": 2.843876361846924, + "learning_rate": 2.3031387004255667e-07, + "loss": 0.3087, + "step": 20862 + }, + { + "epoch": 2.715736040609137, + "grad_norm": 2.7246623039245605, + "learning_rate": 2.2968732039472864e-07, + "loss": 0.3058, + "step": 20865 + }, + { + "epoch": 2.7161265130808276, + "grad_norm": 3.1246981620788574, + "learning_rate": 2.2906160411048982e-07, + "loss": 0.4129, + "step": 20868 + }, + { + "epoch": 2.7165169855525186, + "grad_norm": 2.8314807415008545, + "learning_rate": 2.2843672129915284e-07, + "loss": 0.3662, + "step": 20871 + }, + { + "epoch": 2.716907458024209, + "grad_norm": 2.765364408493042, + "learning_rate": 2.278126720698831e-07, + "loss": 0.3987, + "step": 20874 + }, + { + "epoch": 2.7172979304959, + "grad_norm": 2.515321969985962, + "learning_rate": 2.2718945653169954e-07, + "loss": 0.3245, + "step": 20877 + }, + { + "epoch": 2.7176884029675907, + "grad_norm": 3.015044689178467, + "learning_rate": 2.2656707479347783e-07, + "loss": 0.3235, + "step": 20880 + }, + { + "epoch": 2.7180788754392813, + "grad_norm": 2.9350194931030273, + "learning_rate": 2.2594552696394655e-07, + "loss": 0.3248, + "step": 20883 + }, + { + "epoch": 2.7184693479109723, + "grad_norm": 2.5761871337890625, + "learning_rate": 2.2532481315168774e-07, + "loss": 0.4054, + "step": 20886 + }, + { + "epoch": 2.7188598203826633, + "grad_norm": 2.6208813190460205, + "learning_rate": 2.2470493346513967e-07, + "loss": 0.3896, + "step": 20889 + }, + { + "epoch": 2.719250292854354, + "grad_norm": 2.8215200901031494, + "learning_rate": 2.2408588801259456e-07, + "loss": 0.3916, + "step": 20892 + }, + { + "epoch": 2.7196407653260444, + "grad_norm": 2.862316608428955, + "learning_rate": 2.2346767690219762e-07, + "loss": 0.3927, + "step": 20895 + }, + { + "epoch": 
2.7200312377977354, + "grad_norm": 2.7122604846954346, + "learning_rate": 2.228503002419491e-07, + "loss": 0.3755, + "step": 20898 + }, + { + "epoch": 2.720421710269426, + "grad_norm": 2.7024004459381104, + "learning_rate": 2.2223375813970382e-07, + "loss": 0.382, + "step": 20901 + }, + { + "epoch": 2.720812182741117, + "grad_norm": 3.0531258583068848, + "learning_rate": 2.2161805070316955e-07, + "loss": 0.427, + "step": 20904 + }, + { + "epoch": 2.7212026552128075, + "grad_norm": 2.6628942489624023, + "learning_rate": 2.2100317803991023e-07, + "loss": 0.3531, + "step": 20907 + }, + { + "epoch": 2.721593127684498, + "grad_norm": 2.617644786834717, + "learning_rate": 2.203891402573416e-07, + "loss": 0.3378, + "step": 20910 + }, + { + "epoch": 2.721983600156189, + "grad_norm": 3.2814929485321045, + "learning_rate": 2.197759374627356e-07, + "loss": 0.4266, + "step": 20913 + }, + { + "epoch": 2.72237407262788, + "grad_norm": 2.880181074142456, + "learning_rate": 2.1916356976321717e-07, + "loss": 0.4139, + "step": 20916 + }, + { + "epoch": 2.7227645450995706, + "grad_norm": 3.127866506576538, + "learning_rate": 2.1855203726576512e-07, + "loss": 0.4289, + "step": 20919 + }, + { + "epoch": 2.723155017571261, + "grad_norm": 2.892545461654663, + "learning_rate": 2.17941340077214e-07, + "loss": 0.3724, + "step": 20922 + }, + { + "epoch": 2.723545490042952, + "grad_norm": 3.038330078125, + "learning_rate": 2.1733147830425129e-07, + "loss": 0.3305, + "step": 20925 + }, + { + "epoch": 2.7239359625146427, + "grad_norm": 2.841825246810913, + "learning_rate": 2.1672245205341668e-07, + "loss": 0.355, + "step": 20928 + }, + { + "epoch": 2.7243264349863336, + "grad_norm": 2.6401593685150146, + "learning_rate": 2.1611426143110792e-07, + "loss": 0.3588, + "step": 20931 + }, + { + "epoch": 2.724716907458024, + "grad_norm": 2.7327427864074707, + "learning_rate": 2.1550690654357387e-07, + "loss": 0.3542, + "step": 20934 + }, + { + "epoch": 2.7251073799297147, + "grad_norm": 2.9311492443084717, + "learning_rate": 2.1490038749691855e-07, + "loss": 0.3667, + "step": 20937 + }, + { + "epoch": 2.7254978524014057, + "grad_norm": 2.581843614578247, + "learning_rate": 2.1429470439709832e-07, + "loss": 0.3361, + "step": 20940 + }, + { + "epoch": 2.7258883248730963, + "grad_norm": 2.531803607940674, + "learning_rate": 2.1368985734992632e-07, + "loss": 0.3147, + "step": 20943 + }, + { + "epoch": 2.7262787973447873, + "grad_norm": 2.8539505004882812, + "learning_rate": 2.130858464610669e-07, + "loss": 0.3391, + "step": 20946 + }, + { + "epoch": 2.726669269816478, + "grad_norm": 2.8276593685150146, + "learning_rate": 2.124826718360401e-07, + "loss": 0.3272, + "step": 20949 + }, + { + "epoch": 2.727059742288169, + "grad_norm": 3.1101760864257812, + "learning_rate": 2.1188033358021887e-07, + "loss": 0.4169, + "step": 20952 + }, + { + "epoch": 2.7274502147598594, + "grad_norm": 2.8859715461730957, + "learning_rate": 2.1127883179883123e-07, + "loss": 0.343, + "step": 20955 + }, + { + "epoch": 2.7278406872315504, + "grad_norm": 2.626885414123535, + "learning_rate": 2.1067816659695705e-07, + "loss": 0.354, + "step": 20958 + }, + { + "epoch": 2.728231159703241, + "grad_norm": 3.709789276123047, + "learning_rate": 2.100783380795318e-07, + "loss": 0.4314, + "step": 20961 + }, + { + "epoch": 2.7286216321749315, + "grad_norm": 3.0039596557617188, + "learning_rate": 2.0947934635134504e-07, + "loss": 0.3791, + "step": 20964 + }, + { + "epoch": 2.7290121046466225, + "grad_norm": 2.7142927646636963, + "learning_rate": 
2.0888119151703855e-07, + "loss": 0.4363, + "step": 20967 + }, + { + "epoch": 2.729402577118313, + "grad_norm": 2.700593948364258, + "learning_rate": 2.0828387368110825e-07, + "loss": 0.2971, + "step": 20970 + }, + { + "epoch": 2.729793049590004, + "grad_norm": 2.7692973613739014, + "learning_rate": 2.0768739294790453e-07, + "loss": 0.3486, + "step": 20973 + }, + { + "epoch": 2.7301835220616946, + "grad_norm": 2.7702739238739014, + "learning_rate": 2.070917494216329e-07, + "loss": 0.3118, + "step": 20976 + }, + { + "epoch": 2.7305739945333856, + "grad_norm": 2.789076805114746, + "learning_rate": 2.0649694320634962e-07, + "loss": 0.4196, + "step": 20979 + }, + { + "epoch": 2.730964467005076, + "grad_norm": 2.585819721221924, + "learning_rate": 2.059029744059654e-07, + "loss": 0.3413, + "step": 20982 + }, + { + "epoch": 2.731354939476767, + "grad_norm": 2.9140193462371826, + "learning_rate": 2.0530984312424728e-07, + "loss": 0.3709, + "step": 20985 + }, + { + "epoch": 2.7317454119484577, + "grad_norm": 2.968364715576172, + "learning_rate": 2.0471754946481293e-07, + "loss": 0.3156, + "step": 20988 + }, + { + "epoch": 2.732135884420148, + "grad_norm": 2.853219985961914, + "learning_rate": 2.04126093531134e-07, + "loss": 0.3475, + "step": 20991 + }, + { + "epoch": 2.732526356891839, + "grad_norm": 2.715740919113159, + "learning_rate": 2.035354754265384e-07, + "loss": 0.337, + "step": 20994 + }, + { + "epoch": 2.7329168293635298, + "grad_norm": 2.861161947250366, + "learning_rate": 2.029456952542047e-07, + "loss": 0.3736, + "step": 20997 + }, + { + "epoch": 2.7333073018352207, + "grad_norm": 3.0572056770324707, + "learning_rate": 2.0235675311716606e-07, + "loss": 0.3697, + "step": 21000 + }, + { + "epoch": 2.7336977743069113, + "grad_norm": 3.1050283908843994, + "learning_rate": 2.0176864911831074e-07, + "loss": 0.3451, + "step": 21003 + }, + { + "epoch": 2.734088246778602, + "grad_norm": 2.7847084999084473, + "learning_rate": 2.0118138336037818e-07, + "loss": 0.3791, + "step": 21006 + }, + { + "epoch": 2.734478719250293, + "grad_norm": 2.646023988723755, + "learning_rate": 2.0059495594596245e-07, + "loss": 0.3152, + "step": 21009 + }, + { + "epoch": 2.734869191721984, + "grad_norm": 2.6113431453704834, + "learning_rate": 2.000093669775105e-07, + "loss": 0.3829, + "step": 21012 + }, + { + "epoch": 2.7352596641936744, + "grad_norm": 3.1198410987854004, + "learning_rate": 1.9942461655732604e-07, + "loss": 0.3661, + "step": 21015 + }, + { + "epoch": 2.735650136665365, + "grad_norm": 3.019252061843872, + "learning_rate": 1.9884070478756124e-07, + "loss": 0.3273, + "step": 21018 + }, + { + "epoch": 2.736040609137056, + "grad_norm": 2.6802523136138916, + "learning_rate": 1.982576317702256e-07, + "loss": 0.3518, + "step": 21021 + }, + { + "epoch": 2.7364310816087465, + "grad_norm": 2.885608196258545, + "learning_rate": 1.976753976071799e-07, + "loss": 0.3363, + "step": 21024 + }, + { + "epoch": 2.7368215540804375, + "grad_norm": 3.0259361267089844, + "learning_rate": 1.9709400240014e-07, + "loss": 0.4027, + "step": 21027 + }, + { + "epoch": 2.737212026552128, + "grad_norm": 2.8382019996643066, + "learning_rate": 1.9651344625067404e-07, + "loss": 0.3445, + "step": 21030 + }, + { + "epoch": 2.7376024990238186, + "grad_norm": 2.760059356689453, + "learning_rate": 1.959337292602037e-07, + "loss": 0.341, + "step": 21033 + }, + { + "epoch": 2.7379929714955096, + "grad_norm": 2.7008633613586426, + "learning_rate": 1.9535485153000467e-07, + "loss": 0.3574, + "step": 21036 + }, + { + "epoch": 
2.7383834439672006, + "grad_norm": 2.705885410308838, + "learning_rate": 1.94776813161206e-07, + "loss": 0.3451, + "step": 21039 + }, + { + "epoch": 2.738773916438891, + "grad_norm": 2.571812629699707, + "learning_rate": 1.941996142547886e-07, + "loss": 0.3598, + "step": 21042 + }, + { + "epoch": 2.7391643889105817, + "grad_norm": 2.7097597122192383, + "learning_rate": 1.9362325491158907e-07, + "loss": 0.4084, + "step": 21045 + }, + { + "epoch": 2.7395548613822727, + "grad_norm": 2.7397844791412354, + "learning_rate": 1.9304773523229626e-07, + "loss": 0.3158, + "step": 21048 + }, + { + "epoch": 2.739945333853963, + "grad_norm": 2.917585849761963, + "learning_rate": 1.924730553174503e-07, + "loss": 0.3487, + "step": 21051 + }, + { + "epoch": 2.740335806325654, + "grad_norm": 3.6193621158599854, + "learning_rate": 1.918992152674487e-07, + "loss": 0.3871, + "step": 21054 + }, + { + "epoch": 2.7407262787973448, + "grad_norm": 2.6469061374664307, + "learning_rate": 1.9132621518254014e-07, + "loss": 0.3751, + "step": 21057 + }, + { + "epoch": 2.7411167512690353, + "grad_norm": 2.455784797668457, + "learning_rate": 1.9075405516282562e-07, + "loss": 0.3128, + "step": 21060 + }, + { + "epoch": 2.7415072237407263, + "grad_norm": 2.900134325027466, + "learning_rate": 1.9018273530825972e-07, + "loss": 0.3579, + "step": 21063 + }, + { + "epoch": 2.7418976962124173, + "grad_norm": 2.7763450145721436, + "learning_rate": 1.8961225571865194e-07, + "loss": 0.3071, + "step": 21066 + }, + { + "epoch": 2.742288168684108, + "grad_norm": 3.2267343997955322, + "learning_rate": 1.890426164936643e-07, + "loss": 0.3962, + "step": 21069 + }, + { + "epoch": 2.7426786411557984, + "grad_norm": 2.72977876663208, + "learning_rate": 1.8847381773280991e-07, + "loss": 0.3407, + "step": 21072 + }, + { + "epoch": 2.7430691136274894, + "grad_norm": 3.236475944519043, + "learning_rate": 1.879058595354577e-07, + "loss": 0.4347, + "step": 21075 + }, + { + "epoch": 2.74345958609918, + "grad_norm": 2.5034587383270264, + "learning_rate": 1.873387420008288e-07, + "loss": 0.3305, + "step": 21078 + }, + { + "epoch": 2.743850058570871, + "grad_norm": 2.9593002796173096, + "learning_rate": 1.867724652279973e-07, + "loss": 0.3307, + "step": 21081 + }, + { + "epoch": 2.7442405310425615, + "grad_norm": 2.766495943069458, + "learning_rate": 1.8620702931589018e-07, + "loss": 0.3772, + "step": 21084 + }, + { + "epoch": 2.744631003514252, + "grad_norm": 2.7512776851654053, + "learning_rate": 1.85642434363289e-07, + "loss": 0.3637, + "step": 21087 + }, + { + "epoch": 2.745021475985943, + "grad_norm": 2.5375959873199463, + "learning_rate": 1.8507868046882648e-07, + "loss": 0.3328, + "step": 21090 + }, + { + "epoch": 2.7454119484576336, + "grad_norm": 3.020958662033081, + "learning_rate": 1.845157677309889e-07, + "loss": 0.3264, + "step": 21093 + }, + { + "epoch": 2.7458024209293246, + "grad_norm": 2.4167263507843018, + "learning_rate": 1.8395369624811643e-07, + "loss": 0.2931, + "step": 21096 + }, + { + "epoch": 2.746192893401015, + "grad_norm": 2.777470827102661, + "learning_rate": 1.833924661184022e-07, + "loss": 0.4431, + "step": 21099 + }, + { + "epoch": 2.746583365872706, + "grad_norm": 2.5936362743377686, + "learning_rate": 1.8283207743989118e-07, + "loss": 0.3583, + "step": 21102 + }, + { + "epoch": 2.7469738383443967, + "grad_norm": 2.837986946105957, + "learning_rate": 1.822725303104822e-07, + "loss": 0.3669, + "step": 21105 + }, + { + "epoch": 2.7473643108160877, + "grad_norm": 2.606499671936035, + "learning_rate": 
1.8171382482792765e-07, + "loss": 0.3378, + "step": 21108 + }, + { + "epoch": 2.7477547832877782, + "grad_norm": 3.059852123260498, + "learning_rate": 1.8115596108983168e-07, + "loss": 0.3491, + "step": 21111 + }, + { + "epoch": 2.7481452557594688, + "grad_norm": 2.9495184421539307, + "learning_rate": 1.8059893919365135e-07, + "loss": 0.4117, + "step": 21114 + }, + { + "epoch": 2.7485357282311598, + "grad_norm": 2.9733073711395264, + "learning_rate": 1.8004275923669824e-07, + "loss": 0.4087, + "step": 21117 + }, + { + "epoch": 2.7489262007028503, + "grad_norm": 2.503350019454956, + "learning_rate": 1.7948742131613571e-07, + "loss": 0.3365, + "step": 21120 + }, + { + "epoch": 2.7493166731745413, + "grad_norm": 2.9972898960113525, + "learning_rate": 1.7893292552897956e-07, + "loss": 0.3652, + "step": 21123 + }, + { + "epoch": 2.749707145646232, + "grad_norm": 2.9477357864379883, + "learning_rate": 1.783792719720989e-07, + "loss": 0.3609, + "step": 21126 + }, + { + "epoch": 2.7500976181179224, + "grad_norm": 2.971484422683716, + "learning_rate": 1.7782646074221643e-07, + "loss": 0.3444, + "step": 21129 + }, + { + "epoch": 2.7504880905896134, + "grad_norm": 2.396226644515991, + "learning_rate": 1.7727449193590707e-07, + "loss": 0.3327, + "step": 21132 + }, + { + "epoch": 2.7508785630613044, + "grad_norm": 2.7505605220794678, + "learning_rate": 1.7672336564959813e-07, + "loss": 0.3816, + "step": 21135 + }, + { + "epoch": 2.751269035532995, + "grad_norm": 2.9691169261932373, + "learning_rate": 1.761730819795704e-07, + "loss": 0.3654, + "step": 21138 + }, + { + "epoch": 2.7516595080046855, + "grad_norm": 3.171504259109497, + "learning_rate": 1.7562364102195806e-07, + "loss": 0.3361, + "step": 21141 + }, + { + "epoch": 2.7520499804763765, + "grad_norm": 2.5362813472747803, + "learning_rate": 1.7507504287274603e-07, + "loss": 0.2974, + "step": 21144 + }, + { + "epoch": 2.752440452948067, + "grad_norm": 2.5266830921173096, + "learning_rate": 1.7452728762777372e-07, + "loss": 0.4153, + "step": 21147 + }, + { + "epoch": 2.752830925419758, + "grad_norm": 3.282205820083618, + "learning_rate": 1.739803753827335e-07, + "loss": 0.2816, + "step": 21150 + }, + { + "epoch": 2.7532213978914486, + "grad_norm": 2.7888989448547363, + "learning_rate": 1.7343430623316947e-07, + "loss": 0.3416, + "step": 21153 + }, + { + "epoch": 2.753611870363139, + "grad_norm": 2.560568332672119, + "learning_rate": 1.728890802744776e-07, + "loss": 0.3341, + "step": 21156 + }, + { + "epoch": 2.75400234283483, + "grad_norm": 3.4520578384399414, + "learning_rate": 1.723446976019094e-07, + "loss": 0.3717, + "step": 21159 + }, + { + "epoch": 2.754392815306521, + "grad_norm": 2.7265379428863525, + "learning_rate": 1.7180115831056665e-07, + "loss": 0.3872, + "step": 21162 + }, + { + "epoch": 2.7547832877782117, + "grad_norm": 2.842808246612549, + "learning_rate": 1.712584624954039e-07, + "loss": 0.3927, + "step": 21165 + }, + { + "epoch": 2.7551737602499022, + "grad_norm": 2.4746949672698975, + "learning_rate": 1.707166102512303e-07, + "loss": 0.3752, + "step": 21168 + }, + { + "epoch": 2.7555642327215932, + "grad_norm": 2.7992329597473145, + "learning_rate": 1.7017560167270519e-07, + "loss": 0.3562, + "step": 21171 + }, + { + "epoch": 2.755954705193284, + "grad_norm": 3.015634059906006, + "learning_rate": 1.6963543685434236e-07, + "loss": 0.4082, + "step": 21174 + }, + { + "epoch": 2.7563451776649748, + "grad_norm": 2.884288787841797, + "learning_rate": 1.6909611589050635e-07, + "loss": 0.3104, + "step": 21177 + }, + { + "epoch": 
2.7567356501366653, + "grad_norm": 3.6291792392730713, + "learning_rate": 1.6855763887541565e-07, + "loss": 0.3426, + "step": 21180 + }, + { + "epoch": 2.757126122608356, + "grad_norm": 2.6148040294647217, + "learning_rate": 1.6802000590314283e-07, + "loss": 0.3069, + "step": 21183 + }, + { + "epoch": 2.757516595080047, + "grad_norm": 3.024834156036377, + "learning_rate": 1.6748321706760994e-07, + "loss": 0.3358, + "step": 21186 + }, + { + "epoch": 2.757907067551738, + "grad_norm": 2.5863471031188965, + "learning_rate": 1.66947272462592e-07, + "loss": 0.3238, + "step": 21189 + }, + { + "epoch": 2.7582975400234284, + "grad_norm": 3.284332036972046, + "learning_rate": 1.6641217218171912e-07, + "loss": 0.4084, + "step": 21192 + }, + { + "epoch": 2.758688012495119, + "grad_norm": 3.1912519931793213, + "learning_rate": 1.658779163184715e-07, + "loss": 0.353, + "step": 21195 + }, + { + "epoch": 2.75907848496681, + "grad_norm": 2.728577136993408, + "learning_rate": 1.6534450496618171e-07, + "loss": 0.408, + "step": 21198 + }, + { + "epoch": 2.7594689574385005, + "grad_norm": 2.7622365951538086, + "learning_rate": 1.648119382180363e-07, + "loss": 0.3854, + "step": 21201 + }, + { + "epoch": 2.7598594299101915, + "grad_norm": 3.1888158321380615, + "learning_rate": 1.6428021616707423e-07, + "loss": 0.4322, + "step": 21204 + }, + { + "epoch": 2.760249902381882, + "grad_norm": 2.6636276245117188, + "learning_rate": 1.6374933890618504e-07, + "loss": 0.3482, + "step": 21207 + }, + { + "epoch": 2.7606403748535726, + "grad_norm": 3.0101702213287354, + "learning_rate": 1.6321930652811236e-07, + "loss": 0.3703, + "step": 21210 + }, + { + "epoch": 2.7610308473252636, + "grad_norm": 3.2155416011810303, + "learning_rate": 1.6269011912545208e-07, + "loss": 0.384, + "step": 21213 + }, + { + "epoch": 2.761421319796954, + "grad_norm": 2.743858814239502, + "learning_rate": 1.6216177679065136e-07, + "loss": 0.3099, + "step": 21216 + }, + { + "epoch": 2.761811792268645, + "grad_norm": 2.7741172313690186, + "learning_rate": 1.6163427961601086e-07, + "loss": 0.3717, + "step": 21219 + }, + { + "epoch": 2.7622022647403357, + "grad_norm": 3.037571907043457, + "learning_rate": 1.6110762769368294e-07, + "loss": 0.3058, + "step": 21222 + }, + { + "epoch": 2.7625927372120267, + "grad_norm": 2.6147408485412598, + "learning_rate": 1.60581821115674e-07, + "loss": 0.3178, + "step": 21225 + }, + { + "epoch": 2.7629832096837172, + "grad_norm": 2.956163167953491, + "learning_rate": 1.6005685997383945e-07, + "loss": 0.3459, + "step": 21228 + }, + { + "epoch": 2.7633736821554082, + "grad_norm": 3.1551594734191895, + "learning_rate": 1.5953274435988985e-07, + "loss": 0.3189, + "step": 21231 + }, + { + "epoch": 2.763764154627099, + "grad_norm": 2.789609670639038, + "learning_rate": 1.590094743653875e-07, + "loss": 0.3035, + "step": 21234 + }, + { + "epoch": 2.7641546270987893, + "grad_norm": 3.474595308303833, + "learning_rate": 1.5848705008174535e-07, + "loss": 0.434, + "step": 21237 + }, + { + "epoch": 2.7645450995704803, + "grad_norm": 2.9483280181884766, + "learning_rate": 1.5796547160023046e-07, + "loss": 0.3975, + "step": 21240 + }, + { + "epoch": 2.764935572042171, + "grad_norm": 3.087123155593872, + "learning_rate": 1.5744473901196211e-07, + "loss": 0.306, + "step": 21243 + }, + { + "epoch": 2.765326044513862, + "grad_norm": 2.7233099937438965, + "learning_rate": 1.5692485240791034e-07, + "loss": 0.3202, + "step": 21246 + }, + { + "epoch": 2.7657165169855524, + "grad_norm": 3.1020421981811523, + "learning_rate": 
1.5640581187889857e-07, + "loss": 0.3874, + "step": 21249 + }, + { + "epoch": 2.7661069894572434, + "grad_norm": 3.102666139602661, + "learning_rate": 1.558876175156021e-07, + "loss": 0.3741, + "step": 21252 + }, + { + "epoch": 2.766497461928934, + "grad_norm": 2.6903672218322754, + "learning_rate": 1.5537026940854794e-07, + "loss": 0.3412, + "step": 21255 + }, + { + "epoch": 2.766887934400625, + "grad_norm": 2.9768612384796143, + "learning_rate": 1.54853767648116e-07, + "loss": 0.3017, + "step": 21258 + }, + { + "epoch": 2.7672784068723155, + "grad_norm": 3.4429922103881836, + "learning_rate": 1.543381123245391e-07, + "loss": 0.321, + "step": 21261 + }, + { + "epoch": 2.767668879344006, + "grad_norm": 2.8304970264434814, + "learning_rate": 1.538233035278991e-07, + "loss": 0.328, + "step": 21264 + }, + { + "epoch": 2.768059351815697, + "grad_norm": 2.5452582836151123, + "learning_rate": 1.5330934134813346e-07, + "loss": 0.2859, + "step": 21267 + }, + { + "epoch": 2.7684498242873876, + "grad_norm": 2.8561484813690186, + "learning_rate": 1.5279622587502986e-07, + "loss": 0.4397, + "step": 21270 + }, + { + "epoch": 2.7688402967590786, + "grad_norm": 3.0606303215026855, + "learning_rate": 1.5228395719822876e-07, + "loss": 0.4187, + "step": 21273 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 2.421595335006714, + "learning_rate": 1.5177253540722247e-07, + "loss": 0.2697, + "step": 21276 + }, + { + "epoch": 2.7696212417024597, + "grad_norm": 3.015394449234009, + "learning_rate": 1.512619605913551e-07, + "loss": 0.3576, + "step": 21279 + }, + { + "epoch": 2.7700117141741507, + "grad_norm": 2.740720510482788, + "learning_rate": 1.5075223283982255e-07, + "loss": 0.3246, + "step": 21282 + }, + { + "epoch": 2.7704021866458417, + "grad_norm": 2.849069833755493, + "learning_rate": 1.5024335224167407e-07, + "loss": 0.342, + "step": 21285 + }, + { + "epoch": 2.7707926591175323, + "grad_norm": 2.9401018619537354, + "learning_rate": 1.4973531888580916e-07, + "loss": 0.3456, + "step": 21288 + }, + { + "epoch": 2.771183131589223, + "grad_norm": 2.6727066040039062, + "learning_rate": 1.4922813286098016e-07, + "loss": 0.3463, + "step": 21291 + }, + { + "epoch": 2.771573604060914, + "grad_norm": 3.0198166370391846, + "learning_rate": 1.487217942557928e-07, + "loss": 0.3842, + "step": 21294 + }, + { + "epoch": 2.7719640765326043, + "grad_norm": 2.9366579055786133, + "learning_rate": 1.4821630315870194e-07, + "loss": 0.367, + "step": 21297 + }, + { + "epoch": 2.7723545490042953, + "grad_norm": 3.7701351642608643, + "learning_rate": 1.4771165965801582e-07, + "loss": 0.4056, + "step": 21300 + }, + { + "epoch": 2.772745021475986, + "grad_norm": 2.716285467147827, + "learning_rate": 1.4720786384189557e-07, + "loss": 0.3333, + "step": 21303 + }, + { + "epoch": 2.7731354939476764, + "grad_norm": 2.849595546722412, + "learning_rate": 1.4670491579835245e-07, + "loss": 0.3918, + "step": 21306 + }, + { + "epoch": 2.7735259664193674, + "grad_norm": 2.8046321868896484, + "learning_rate": 1.462028156152512e-07, + "loss": 0.3327, + "step": 21309 + }, + { + "epoch": 2.7739164388910584, + "grad_norm": 3.198570966720581, + "learning_rate": 1.4570156338030606e-07, + "loss": 0.3182, + "step": 21312 + }, + { + "epoch": 2.774306911362749, + "grad_norm": 2.8275673389434814, + "learning_rate": 1.4520115918108701e-07, + "loss": 0.3296, + "step": 21315 + }, + { + "epoch": 2.7746973838344395, + "grad_norm": 2.7337069511413574, + "learning_rate": 1.447016031050119e-07, + "loss": 0.3556, + "step": 21318 + }, + { + "epoch": 
2.7750878563061305, + "grad_norm": 2.702484369277954, + "learning_rate": 1.442028952393526e-07, + "loss": 0.4011, + "step": 21321 + }, + { + "epoch": 2.775478328777821, + "grad_norm": 2.6298041343688965, + "learning_rate": 1.4370503567123274e-07, + "loss": 0.3267, + "step": 21324 + }, + { + "epoch": 2.775868801249512, + "grad_norm": 2.7082111835479736, + "learning_rate": 1.4320802448762716e-07, + "loss": 0.3419, + "step": 21327 + }, + { + "epoch": 2.7762592737212026, + "grad_norm": 3.18163800239563, + "learning_rate": 1.4271186177536256e-07, + "loss": 0.3421, + "step": 21330 + }, + { + "epoch": 2.776649746192893, + "grad_norm": 2.9443161487579346, + "learning_rate": 1.4221654762111624e-07, + "loss": 0.3675, + "step": 21333 + }, + { + "epoch": 2.777040218664584, + "grad_norm": 3.03935170173645, + "learning_rate": 1.4172208211142124e-07, + "loss": 0.3989, + "step": 21336 + }, + { + "epoch": 2.777430691136275, + "grad_norm": 2.619476795196533, + "learning_rate": 1.4122846533265733e-07, + "loss": 0.3682, + "step": 21339 + }, + { + "epoch": 2.7778211636079657, + "grad_norm": 2.9960954189300537, + "learning_rate": 1.4073569737105942e-07, + "loss": 0.3476, + "step": 21342 + }, + { + "epoch": 2.7782116360796563, + "grad_norm": 3.048250675201416, + "learning_rate": 1.4024377831271253e-07, + "loss": 0.38, + "step": 21345 + }, + { + "epoch": 2.7786021085513473, + "grad_norm": 3.039592981338501, + "learning_rate": 1.3975270824355402e-07, + "loss": 0.2967, + "step": 21348 + }, + { + "epoch": 2.778992581023038, + "grad_norm": 3.194598913192749, + "learning_rate": 1.3926248724937363e-07, + "loss": 0.4696, + "step": 21351 + }, + { + "epoch": 2.779383053494729, + "grad_norm": 2.963008165359497, + "learning_rate": 1.3877311541581063e-07, + "loss": 0.3861, + "step": 21354 + }, + { + "epoch": 2.7797735259664194, + "grad_norm": 2.9709367752075195, + "learning_rate": 1.3828459282835828e-07, + "loss": 0.4144, + "step": 21357 + }, + { + "epoch": 2.78016399843811, + "grad_norm": 2.8352601528167725, + "learning_rate": 1.3779691957235996e-07, + "loss": 0.3747, + "step": 21360 + }, + { + "epoch": 2.780554470909801, + "grad_norm": 2.657566547393799, + "learning_rate": 1.3731009573301035e-07, + "loss": 0.3459, + "step": 21363 + }, + { + "epoch": 2.7809449433814915, + "grad_norm": 2.850067615509033, + "learning_rate": 1.368241213953586e-07, + "loss": 0.4104, + "step": 21366 + }, + { + "epoch": 2.7813354158531824, + "grad_norm": 3.0880982875823975, + "learning_rate": 1.3633899664430183e-07, + "loss": 0.36, + "step": 21369 + }, + { + "epoch": 2.781725888324873, + "grad_norm": 3.7259879112243652, + "learning_rate": 1.3585472156459e-07, + "loss": 0.3831, + "step": 21372 + }, + { + "epoch": 2.782116360796564, + "grad_norm": 3.025913953781128, + "learning_rate": 1.3537129624082657e-07, + "loss": 0.3829, + "step": 21375 + }, + { + "epoch": 2.7825068332682545, + "grad_norm": 3.209459066390991, + "learning_rate": 1.3488872075746395e-07, + "loss": 0.3427, + "step": 21378 + }, + { + "epoch": 2.7828973057399455, + "grad_norm": 2.893686294555664, + "learning_rate": 1.3440699519880695e-07, + "loss": 0.3317, + "step": 21381 + }, + { + "epoch": 2.783287778211636, + "grad_norm": 2.8265738487243652, + "learning_rate": 1.3392611964901159e-07, + "loss": 0.3864, + "step": 21384 + }, + { + "epoch": 2.7836782506833266, + "grad_norm": 2.7982900142669678, + "learning_rate": 1.334460941920873e-07, + "loss": 0.3053, + "step": 21387 + }, + { + "epoch": 2.7840687231550176, + "grad_norm": 2.68939471244812, + "learning_rate": 1.329669189118915e-07, + 
"loss": 0.4247, + "step": 21390 + }, + { + "epoch": 2.784459195626708, + "grad_norm": 2.8545985221862793, + "learning_rate": 1.324885938921372e-07, + "loss": 0.3723, + "step": 21393 + }, + { + "epoch": 2.784849668098399, + "grad_norm": 2.9172253608703613, + "learning_rate": 1.3201111921638532e-07, + "loss": 0.346, + "step": 21396 + }, + { + "epoch": 2.7852401405700897, + "grad_norm": 2.608978271484375, + "learning_rate": 1.3153449496805028e-07, + "loss": 0.3201, + "step": 21399 + }, + { + "epoch": 2.7856306130417803, + "grad_norm": 2.9829037189483643, + "learning_rate": 1.3105872123039765e-07, + "loss": 0.4295, + "step": 21402 + }, + { + "epoch": 2.7860210855134713, + "grad_norm": 2.7616302967071533, + "learning_rate": 1.305837980865432e-07, + "loss": 0.4219, + "step": 21405 + }, + { + "epoch": 2.7864115579851623, + "grad_norm": 3.1220195293426514, + "learning_rate": 1.3010972561945555e-07, + "loss": 0.3856, + "step": 21408 + }, + { + "epoch": 2.786802030456853, + "grad_norm": 2.6154117584228516, + "learning_rate": 1.2963650391195403e-07, + "loss": 0.3607, + "step": 21411 + }, + { + "epoch": 2.7871925029285434, + "grad_norm": 2.7787892818450928, + "learning_rate": 1.2916413304670972e-07, + "loss": 0.3516, + "step": 21414 + }, + { + "epoch": 2.7875829754002344, + "grad_norm": 3.176905632019043, + "learning_rate": 1.2869261310624437e-07, + "loss": 0.3978, + "step": 21417 + }, + { + "epoch": 2.787973447871925, + "grad_norm": 2.9123852252960205, + "learning_rate": 1.282219441729321e-07, + "loss": 0.3417, + "step": 21420 + }, + { + "epoch": 2.788363920343616, + "grad_norm": 2.5749077796936035, + "learning_rate": 1.2775212632899715e-07, + "loss": 0.345, + "step": 21423 + }, + { + "epoch": 2.7887543928153065, + "grad_norm": 2.712224006652832, + "learning_rate": 1.2728315965651606e-07, + "loss": 0.4133, + "step": 21426 + }, + { + "epoch": 2.789144865286997, + "grad_norm": 2.8069446086883545, + "learning_rate": 1.2681504423741665e-07, + "loss": 0.3276, + "step": 21429 + }, + { + "epoch": 2.789535337758688, + "grad_norm": 2.740924596786499, + "learning_rate": 1.2634778015347682e-07, + "loss": 0.3468, + "step": 21432 + }, + { + "epoch": 2.789925810230379, + "grad_norm": 2.735494613647461, + "learning_rate": 1.2588136748632685e-07, + "loss": 0.3151, + "step": 21435 + }, + { + "epoch": 2.7903162827020696, + "grad_norm": 2.7889745235443115, + "learning_rate": 1.2541580631744931e-07, + "loss": 0.338, + "step": 21438 + }, + { + "epoch": 2.79070675517376, + "grad_norm": 2.608374834060669, + "learning_rate": 1.249510967281753e-07, + "loss": 0.3711, + "step": 21441 + }, + { + "epoch": 2.791097227645451, + "grad_norm": 2.477095365524292, + "learning_rate": 1.2448723879968927e-07, + "loss": 0.3962, + "step": 21444 + }, + { + "epoch": 2.7914877001171416, + "grad_norm": 3.061278820037842, + "learning_rate": 1.2402423261302532e-07, + "loss": 0.3421, + "step": 21447 + }, + { + "epoch": 2.7918781725888326, + "grad_norm": 2.966050386428833, + "learning_rate": 1.2356207824907152e-07, + "loss": 0.3855, + "step": 21450 + }, + { + "epoch": 2.792268645060523, + "grad_norm": 2.758970022201538, + "learning_rate": 1.2310077578856328e-07, + "loss": 0.3178, + "step": 21453 + }, + { + "epoch": 2.7926591175322137, + "grad_norm": 2.657818555831909, + "learning_rate": 1.2264032531209004e-07, + "loss": 0.3399, + "step": 21456 + }, + { + "epoch": 2.7930495900039047, + "grad_norm": 2.7079572677612305, + "learning_rate": 1.2218072690009187e-07, + "loss": 0.3295, + "step": 21459 + }, + { + "epoch": 2.7934400624755957, + "grad_norm": 
2.721400737762451, + "learning_rate": 1.2172198063285957e-07, + "loss": 0.3228, + "step": 21462 + }, + { + "epoch": 2.7938305349472863, + "grad_norm": 2.8544654846191406, + "learning_rate": 1.2126408659053402e-07, + "loss": 0.3513, + "step": 21465 + }, + { + "epoch": 2.794221007418977, + "grad_norm": 3.039076566696167, + "learning_rate": 1.2080704485310957e-07, + "loss": 0.4097, + "step": 21468 + }, + { + "epoch": 2.794611479890668, + "grad_norm": 3.0144577026367188, + "learning_rate": 1.2035085550043013e-07, + "loss": 0.3648, + "step": 21471 + }, + { + "epoch": 2.7950019523623584, + "grad_norm": 2.8478612899780273, + "learning_rate": 1.1989551861219084e-07, + "loss": 0.3556, + "step": 21474 + }, + { + "epoch": 2.7953924248340494, + "grad_norm": 3.181854724884033, + "learning_rate": 1.1944103426793808e-07, + "loss": 0.2727, + "step": 21477 + }, + { + "epoch": 2.79578289730574, + "grad_norm": 2.986677885055542, + "learning_rate": 1.1898740254706942e-07, + "loss": 0.3345, + "step": 21480 + }, + { + "epoch": 2.7961733697774305, + "grad_norm": 2.7655344009399414, + "learning_rate": 1.1853462352883371e-07, + "loss": 0.4025, + "step": 21483 + }, + { + "epoch": 2.7965638422491215, + "grad_norm": 2.599412202835083, + "learning_rate": 1.180826972923299e-07, + "loss": 0.3651, + "step": 21486 + }, + { + "epoch": 2.796954314720812, + "grad_norm": 3.225376844406128, + "learning_rate": 1.1763162391650929e-07, + "loss": 0.3836, + "step": 21489 + }, + { + "epoch": 2.797344787192503, + "grad_norm": 2.734292507171631, + "learning_rate": 1.1718140348017271e-07, + "loss": 0.3683, + "step": 21492 + }, + { + "epoch": 2.7977352596641936, + "grad_norm": 2.984931230545044, + "learning_rate": 1.1673203606197336e-07, + "loss": 0.3585, + "step": 21495 + }, + { + "epoch": 2.7981257321358846, + "grad_norm": 2.9327147006988525, + "learning_rate": 1.1628352174041346e-07, + "loss": 0.4024, + "step": 21498 + }, + { + "epoch": 2.798516204607575, + "grad_norm": 2.466592788696289, + "learning_rate": 1.1583586059384921e-07, + "loss": 0.33, + "step": 21501 + }, + { + "epoch": 2.798906677079266, + "grad_norm": 2.7316441535949707, + "learning_rate": 1.1538905270048528e-07, + "loss": 0.312, + "step": 21504 + }, + { + "epoch": 2.7992971495509567, + "grad_norm": 2.5499911308288574, + "learning_rate": 1.1494309813837756e-07, + "loss": 0.3187, + "step": 21507 + }, + { + "epoch": 2.799687622022647, + "grad_norm": 2.7340035438537598, + "learning_rate": 1.1449799698543429e-07, + "loss": 0.3893, + "step": 21510 + }, + { + "epoch": 2.800078094494338, + "grad_norm": 2.626269578933716, + "learning_rate": 1.1405374931941382e-07, + "loss": 0.3604, + "step": 21513 + }, + { + "epoch": 2.8004685669660287, + "grad_norm": 2.762885808944702, + "learning_rate": 1.1361035521792407e-07, + "loss": 0.3397, + "step": 21516 + }, + { + "epoch": 2.8008590394377197, + "grad_norm": 2.9258060455322266, + "learning_rate": 1.1316781475842586e-07, + "loss": 0.3689, + "step": 21519 + }, + { + "epoch": 2.8012495119094103, + "grad_norm": 2.73075008392334, + "learning_rate": 1.1272612801823069e-07, + "loss": 0.3302, + "step": 21522 + }, + { + "epoch": 2.8016399843811013, + "grad_norm": 2.7661848068237305, + "learning_rate": 1.1228529507449904e-07, + "loss": 0.4068, + "step": 21525 + }, + { + "epoch": 2.802030456852792, + "grad_norm": 2.8157827854156494, + "learning_rate": 1.1184531600424431e-07, + "loss": 0.3384, + "step": 21528 + }, + { + "epoch": 2.802420929324483, + "grad_norm": 2.6738975048065186, + "learning_rate": 1.1140619088432946e-07, + "loss": 0.3366, + 
"step": 21531 + }, + { + "epoch": 2.8028114017961734, + "grad_norm": 3.107644557952881, + "learning_rate": 1.1096791979146981e-07, + "loss": 0.3749, + "step": 21534 + }, + { + "epoch": 2.803201874267864, + "grad_norm": 2.7247238159179688, + "learning_rate": 1.1053050280222855e-07, + "loss": 0.3524, + "step": 21537 + }, + { + "epoch": 2.803592346739555, + "grad_norm": 2.4612529277801514, + "learning_rate": 1.1009393999302287e-07, + "loss": 0.3645, + "step": 21540 + }, + { + "epoch": 2.8039828192112455, + "grad_norm": 2.981336832046509, + "learning_rate": 1.09658231440119e-07, + "loss": 0.4023, + "step": 21543 + }, + { + "epoch": 2.8043732916829365, + "grad_norm": 2.6686904430389404, + "learning_rate": 1.0922337721963494e-07, + "loss": 0.3697, + "step": 21546 + }, + { + "epoch": 2.804763764154627, + "grad_norm": 2.5626204013824463, + "learning_rate": 1.0878937740753714e-07, + "loss": 0.3182, + "step": 21549 + }, + { + "epoch": 2.8051542366263176, + "grad_norm": 3.065051317214966, + "learning_rate": 1.0835623207964607e-07, + "loss": 0.3935, + "step": 21552 + }, + { + "epoch": 2.8055447090980086, + "grad_norm": 4.100696086883545, + "learning_rate": 1.0792394131163064e-07, + "loss": 0.3165, + "step": 21555 + }, + { + "epoch": 2.8059351815696996, + "grad_norm": 2.8829846382141113, + "learning_rate": 1.07492505179011e-07, + "loss": 0.3021, + "step": 21558 + }, + { + "epoch": 2.80632565404139, + "grad_norm": 2.999645471572876, + "learning_rate": 1.0706192375715851e-07, + "loss": 0.3379, + "step": 21561 + }, + { + "epoch": 2.8067161265130807, + "grad_norm": 2.903561592102051, + "learning_rate": 1.0663219712129469e-07, + "loss": 0.4568, + "step": 21564 + }, + { + "epoch": 2.8071065989847717, + "grad_norm": 2.6506104469299316, + "learning_rate": 1.0620332534649225e-07, + "loss": 0.3272, + "step": 21567 + }, + { + "epoch": 2.807497071456462, + "grad_norm": 2.9835636615753174, + "learning_rate": 1.0577530850767348e-07, + "loss": 0.3592, + "step": 21570 + }, + { + "epoch": 2.807887543928153, + "grad_norm": 2.599337100982666, + "learning_rate": 1.0534814667961246e-07, + "loss": 0.3549, + "step": 21573 + }, + { + "epoch": 2.8082780163998438, + "grad_norm": 2.8641128540039062, + "learning_rate": 1.0492183993693394e-07, + "loss": 0.3638, + "step": 21576 + }, + { + "epoch": 2.8086684888715343, + "grad_norm": 2.740431547164917, + "learning_rate": 1.0449638835411114e-07, + "loss": 0.3511, + "step": 21579 + }, + { + "epoch": 2.8090589613432253, + "grad_norm": 2.9625089168548584, + "learning_rate": 1.0407179200547124e-07, + "loss": 0.4805, + "step": 21582 + }, + { + "epoch": 2.8094494338149163, + "grad_norm": 2.8378477096557617, + "learning_rate": 1.0364805096518993e-07, + "loss": 0.349, + "step": 21585 + }, + { + "epoch": 2.809839906286607, + "grad_norm": 2.665264368057251, + "learning_rate": 1.0322516530729298e-07, + "loss": 0.3023, + "step": 21588 + }, + { + "epoch": 2.8102303787582974, + "grad_norm": 2.7869749069213867, + "learning_rate": 1.0280313510565909e-07, + "loss": 0.3308, + "step": 21591 + }, + { + "epoch": 2.8106208512299884, + "grad_norm": 3.251950740814209, + "learning_rate": 1.0238196043401538e-07, + "loss": 0.2892, + "step": 21594 + }, + { + "epoch": 2.811011323701679, + "grad_norm": 2.933887004852295, + "learning_rate": 1.0196164136594022e-07, + "loss": 0.366, + "step": 21597 + }, + { + "epoch": 2.81140179617337, + "grad_norm": 2.547628164291382, + "learning_rate": 1.0154217797486098e-07, + "loss": 0.3066, + "step": 21600 + }, + { + "epoch": 2.8117922686450605, + "grad_norm": 
2.8008322715759277, + "learning_rate": 1.0112357033405962e-07, + "loss": 0.3796, + "step": 21603 + }, + { + "epoch": 2.812182741116751, + "grad_norm": 2.5808677673339844, + "learning_rate": 1.0070581851666428e-07, + "loss": 0.3241, + "step": 21606 + }, + { + "epoch": 2.812573213588442, + "grad_norm": 2.5986881256103516, + "learning_rate": 1.0028892259565658e-07, + "loss": 0.3361, + "step": 21609 + }, + { + "epoch": 2.812963686060133, + "grad_norm": 2.557717800140381, + "learning_rate": 9.987288264386552e-08, + "loss": 0.346, + "step": 21612 + }, + { + "epoch": 2.8133541585318236, + "grad_norm": 2.836681842803955, + "learning_rate": 9.945769873397404e-08, + "loss": 0.3852, + "step": 21615 + }, + { + "epoch": 2.813744631003514, + "grad_norm": 2.740900754928589, + "learning_rate": 9.904337093851358e-08, + "loss": 0.402, + "step": 21618 + }, + { + "epoch": 2.814135103475205, + "grad_norm": 2.7481210231781006, + "learning_rate": 9.862989932986566e-08, + "loss": 0.347, + "step": 21621 + }, + { + "epoch": 2.8145255759468957, + "grad_norm": 2.67153000831604, + "learning_rate": 9.82172839802642e-08, + "loss": 0.3092, + "step": 21624 + }, + { + "epoch": 2.8149160484185867, + "grad_norm": 2.9304611682891846, + "learning_rate": 9.780552496179096e-08, + "loss": 0.3068, + "step": 21627 + }, + { + "epoch": 2.815306520890277, + "grad_norm": 2.806035280227661, + "learning_rate": 9.739462234637898e-08, + "loss": 0.3729, + "step": 21630 + }, + { + "epoch": 2.8156969933619678, + "grad_norm": 3.032376766204834, + "learning_rate": 9.698457620581359e-08, + "loss": 0.3683, + "step": 21633 + }, + { + "epoch": 2.8160874658336588, + "grad_norm": 3.0666725635528564, + "learning_rate": 9.657538661172861e-08, + "loss": 0.3887, + "step": 21636 + }, + { + "epoch": 2.8164779383053493, + "grad_norm": 2.8096840381622314, + "learning_rate": 9.616705363560796e-08, + "loss": 0.3416, + "step": 21639 + }, + { + "epoch": 2.8168684107770403, + "grad_norm": 2.5482988357543945, + "learning_rate": 9.575957734878627e-08, + "loss": 0.3262, + "step": 21642 + }, + { + "epoch": 2.817258883248731, + "grad_norm": 2.7446038722991943, + "learning_rate": 9.535295782245046e-08, + "loss": 0.3893, + "step": 21645 + }, + { + "epoch": 2.817649355720422, + "grad_norm": 2.8740649223327637, + "learning_rate": 9.494719512763429e-08, + "loss": 0.3946, + "step": 21648 + }, + { + "epoch": 2.8180398281921124, + "grad_norm": 2.647369623184204, + "learning_rate": 9.454228933522491e-08, + "loss": 0.3482, + "step": 21651 + }, + { + "epoch": 2.8184303006638034, + "grad_norm": 2.5511598587036133, + "learning_rate": 9.413824051595799e-08, + "loss": 0.315, + "step": 21654 + }, + { + "epoch": 2.818820773135494, + "grad_norm": 2.809553623199463, + "learning_rate": 9.373504874041984e-08, + "loss": 0.4283, + "step": 21657 + }, + { + "epoch": 2.8192112456071845, + "grad_norm": 2.821474075317383, + "learning_rate": 9.333271407904743e-08, + "loss": 0.3556, + "step": 21660 + }, + { + "epoch": 2.8196017180788755, + "grad_norm": 2.612962484359741, + "learning_rate": 9.293123660212733e-08, + "loss": 0.3268, + "step": 21663 + }, + { + "epoch": 2.819992190550566, + "grad_norm": 3.18330454826355, + "learning_rate": 9.253061637979788e-08, + "loss": 0.3513, + "step": 21666 + }, + { + "epoch": 2.820382663022257, + "grad_norm": 2.551689624786377, + "learning_rate": 9.213085348204587e-08, + "loss": 0.3637, + "step": 21669 + }, + { + "epoch": 2.8207731354939476, + "grad_norm": 2.9806084632873535, + "learning_rate": 9.173194797870877e-08, + "loss": 0.4023, + "step": 21672 + }, + { + 
"epoch": 2.8211636079656386, + "grad_norm": 2.7304067611694336, + "learning_rate": 9.133389993947528e-08, + "loss": 0.3305, + "step": 21675 + }, + { + "epoch": 2.821554080437329, + "grad_norm": 3.107208013534546, + "learning_rate": 9.09367094338831e-08, + "loss": 0.3507, + "step": 21678 + }, + { + "epoch": 2.82194455290902, + "grad_norm": 2.694958448410034, + "learning_rate": 9.054037653132008e-08, + "loss": 0.3679, + "step": 21681 + }, + { + "epoch": 2.8223350253807107, + "grad_norm": 2.6622893810272217, + "learning_rate": 9.014490130102527e-08, + "loss": 0.3641, + "step": 21684 + }, + { + "epoch": 2.8227254978524012, + "grad_norm": 2.8670809268951416, + "learning_rate": 8.975028381208784e-08, + "loss": 0.3707, + "step": 21687 + }, + { + "epoch": 2.8231159703240922, + "grad_norm": 2.79952335357666, + "learning_rate": 8.935652413344598e-08, + "loss": 0.4008, + "step": 21690 + }, + { + "epoch": 2.823506442795783, + "grad_norm": 3.0517783164978027, + "learning_rate": 8.89636223338891e-08, + "loss": 0.4025, + "step": 21693 + }, + { + "epoch": 2.8238969152674738, + "grad_norm": 2.722447395324707, + "learning_rate": 8.857157848205566e-08, + "loss": 0.344, + "step": 21696 + }, + { + "epoch": 2.8242873877391643, + "grad_norm": 2.9243016242980957, + "learning_rate": 8.818039264643586e-08, + "loss": 0.3724, + "step": 21699 + }, + { + "epoch": 2.824677860210855, + "grad_norm": 3.2354788780212402, + "learning_rate": 8.779006489536834e-08, + "loss": 0.3583, + "step": 21702 + }, + { + "epoch": 2.825068332682546, + "grad_norm": 2.571131706237793, + "learning_rate": 8.740059529704248e-08, + "loss": 0.3751, + "step": 21705 + }, + { + "epoch": 2.825458805154237, + "grad_norm": 3.589383363723755, + "learning_rate": 8.701198391949827e-08, + "loss": 0.3349, + "step": 21708 + }, + { + "epoch": 2.8258492776259274, + "grad_norm": 2.8357677459716797, + "learning_rate": 8.662423083062532e-08, + "loss": 0.3465, + "step": 21711 + }, + { + "epoch": 2.826239750097618, + "grad_norm": 3.248215913772583, + "learning_rate": 8.62373360981622e-08, + "loss": 0.3655, + "step": 21714 + }, + { + "epoch": 2.826630222569309, + "grad_norm": 2.9728636741638184, + "learning_rate": 8.585129978969986e-08, + "loss": 0.3857, + "step": 21717 + }, + { + "epoch": 2.8270206950409995, + "grad_norm": 3.6474063396453857, + "learning_rate": 8.546612197267768e-08, + "loss": 0.3819, + "step": 21720 + }, + { + "epoch": 2.8274111675126905, + "grad_norm": 2.7672176361083984, + "learning_rate": 8.508180271438516e-08, + "loss": 0.4161, + "step": 21723 + }, + { + "epoch": 2.827801639984381, + "grad_norm": 2.943279504776001, + "learning_rate": 8.469834208196193e-08, + "loss": 0.3341, + "step": 21726 + }, + { + "epoch": 2.8281921124560716, + "grad_norm": 3.383356809616089, + "learning_rate": 8.431574014239885e-08, + "loss": 0.3586, + "step": 21729 + }, + { + "epoch": 2.8285825849277626, + "grad_norm": 2.866734266281128, + "learning_rate": 8.393399696253412e-08, + "loss": 0.3224, + "step": 21732 + }, + { + "epoch": 2.8289730573994536, + "grad_norm": 2.8714816570281982, + "learning_rate": 8.355311260905829e-08, + "loss": 0.416, + "step": 21735 + }, + { + "epoch": 2.829363529871144, + "grad_norm": 2.4566404819488525, + "learning_rate": 8.317308714851146e-08, + "loss": 0.3209, + "step": 21738 + }, + { + "epoch": 2.8297540023428347, + "grad_norm": 3.835968255996704, + "learning_rate": 8.279392064728276e-08, + "loss": 0.3415, + "step": 21741 + }, + { + "epoch": 2.8301444748145257, + "grad_norm": 2.9507713317871094, + "learning_rate": 8.24156131716114e-08, + 
"loss": 0.3388, + "step": 21744 + }, + { + "epoch": 2.8305349472862162, + "grad_norm": 2.818152904510498, + "learning_rate": 8.203816478758785e-08, + "loss": 0.3826, + "step": 21747 + }, + { + "epoch": 2.8309254197579072, + "grad_norm": 2.5711591243743896, + "learning_rate": 8.166157556115107e-08, + "loss": 0.3195, + "step": 21750 + }, + { + "epoch": 2.831315892229598, + "grad_norm": 3.0602824687957764, + "learning_rate": 8.128584555809005e-08, + "loss": 0.333, + "step": 21753 + }, + { + "epoch": 2.8317063647012883, + "grad_norm": 2.9426445960998535, + "learning_rate": 8.091097484404454e-08, + "loss": 0.3469, + "step": 21756 + }, + { + "epoch": 2.8320968371729793, + "grad_norm": 2.615579843521118, + "learning_rate": 8.053696348450324e-08, + "loss": 0.3241, + "step": 21759 + }, + { + "epoch": 2.8324873096446703, + "grad_norm": 2.8304200172424316, + "learning_rate": 8.016381154480557e-08, + "loss": 0.3492, + "step": 21762 + }, + { + "epoch": 2.832877782116361, + "grad_norm": 2.8419861793518066, + "learning_rate": 7.979151909013993e-08, + "loss": 0.3959, + "step": 21765 + }, + { + "epoch": 2.8332682545880514, + "grad_norm": 3.1419153213500977, + "learning_rate": 7.942008618554543e-08, + "loss": 0.3467, + "step": 21768 + }, + { + "epoch": 2.8336587270597424, + "grad_norm": 2.6105754375457764, + "learning_rate": 7.904951289591068e-08, + "loss": 0.3219, + "step": 21771 + }, + { + "epoch": 2.834049199531433, + "grad_norm": 2.874232530593872, + "learning_rate": 7.867979928597336e-08, + "loss": 0.4414, + "step": 21774 + }, + { + "epoch": 2.834439672003124, + "grad_norm": 2.67608642578125, + "learning_rate": 7.831094542032236e-08, + "loss": 0.3289, + "step": 21777 + }, + { + "epoch": 2.8348301444748145, + "grad_norm": 2.974731922149658, + "learning_rate": 7.794295136339613e-08, + "loss": 0.3602, + "step": 21780 + }, + { + "epoch": 2.835220616946505, + "grad_norm": 2.8034987449645996, + "learning_rate": 7.757581717948104e-08, + "loss": 0.4196, + "step": 21783 + }, + { + "epoch": 2.835611089418196, + "grad_norm": 2.63897442817688, + "learning_rate": 7.720954293271576e-08, + "loss": 0.3953, + "step": 21786 + }, + { + "epoch": 2.8360015618898866, + "grad_norm": 2.5943167209625244, + "learning_rate": 7.68441286870869e-08, + "loss": 0.2894, + "step": 21789 + }, + { + "epoch": 2.8363920343615776, + "grad_norm": 2.8868579864501953, + "learning_rate": 7.647957450643284e-08, + "loss": 0.4103, + "step": 21792 + }, + { + "epoch": 2.836782506833268, + "grad_norm": 2.7333850860595703, + "learning_rate": 7.611588045443874e-08, + "loss": 0.3394, + "step": 21795 + }, + { + "epoch": 2.837172979304959, + "grad_norm": 2.8235933780670166, + "learning_rate": 7.575304659464211e-08, + "loss": 0.3461, + "step": 21798 + }, + { + "epoch": 2.8375634517766497, + "grad_norm": 2.699286699295044, + "learning_rate": 7.539107299042947e-08, + "loss": 0.3186, + "step": 21801 + }, + { + "epoch": 2.8379539242483407, + "grad_norm": 2.984445095062256, + "learning_rate": 7.502995970503634e-08, + "loss": 0.3464, + "step": 21804 + }, + { + "epoch": 2.8383443967200312, + "grad_norm": 2.8218915462493896, + "learning_rate": 7.466970680154839e-08, + "loss": 0.3115, + "step": 21807 + }, + { + "epoch": 2.838734869191722, + "grad_norm": 2.748157262802124, + "learning_rate": 7.431031434290137e-08, + "loss": 0.3772, + "step": 21810 + }, + { + "epoch": 2.839125341663413, + "grad_norm": 2.9761040210723877, + "learning_rate": 7.395178239188006e-08, + "loss": 0.4132, + "step": 21813 + }, + { + "epoch": 2.8395158141351033, + "grad_norm": 
2.9955594539642334, + "learning_rate": 7.359411101111991e-08, + "loss": 0.3501, + "step": 21816 + }, + { + "epoch": 2.8399062866067943, + "grad_norm": 2.834073305130005, + "learning_rate": 7.323730026310483e-08, + "loss": 0.3444, + "step": 21819 + }, + { + "epoch": 2.840296759078485, + "grad_norm": 2.5343804359436035, + "learning_rate": 7.28813502101694e-08, + "loss": 0.3856, + "step": 21822 + }, + { + "epoch": 2.8406872315501754, + "grad_norm": 2.5103886127471924, + "learning_rate": 7.252626091449666e-08, + "loss": 0.3501, + "step": 21825 + }, + { + "epoch": 2.8410777040218664, + "grad_norm": 2.7703588008880615, + "learning_rate": 7.217203243811976e-08, + "loss": 0.3207, + "step": 21828 + }, + { + "epoch": 2.8414681764935574, + "grad_norm": 2.8706281185150146, + "learning_rate": 7.181866484292255e-08, + "loss": 0.36, + "step": 21831 + }, + { + "epoch": 2.841858648965248, + "grad_norm": 2.7826225757598877, + "learning_rate": 7.146615819063729e-08, + "loss": 0.3871, + "step": 21834 + }, + { + "epoch": 2.8422491214369385, + "grad_norm": 2.82706880569458, + "learning_rate": 7.111451254284584e-08, + "loss": 0.3323, + "step": 21837 + }, + { + "epoch": 2.8426395939086295, + "grad_norm": 2.9153218269348145, + "learning_rate": 7.076372796098019e-08, + "loss": 0.3573, + "step": 21840 + }, + { + "epoch": 2.84303006638032, + "grad_norm": 2.676373243331909, + "learning_rate": 7.041380450632185e-08, + "loss": 0.3178, + "step": 21843 + }, + { + "epoch": 2.843420538852011, + "grad_norm": 2.866745948791504, + "learning_rate": 7.006474224000138e-08, + "loss": 0.3973, + "step": 21846 + }, + { + "epoch": 2.8438110113237016, + "grad_norm": 2.364248514175415, + "learning_rate": 6.971654122299943e-08, + "loss": 0.3069, + "step": 21849 + }, + { + "epoch": 2.844201483795392, + "grad_norm": 2.777941942214966, + "learning_rate": 6.936920151614567e-08, + "loss": 0.2974, + "step": 21852 + }, + { + "epoch": 2.844591956267083, + "grad_norm": 2.982527017593384, + "learning_rate": 6.90227231801205e-08, + "loss": 0.3763, + "step": 21855 + }, + { + "epoch": 2.844982428738774, + "grad_norm": 2.7366418838500977, + "learning_rate": 6.867710627545154e-08, + "loss": 0.3435, + "step": 21858 + }, + { + "epoch": 2.8453729012104647, + "grad_norm": 3.094679832458496, + "learning_rate": 6.833235086251889e-08, + "loss": 0.3818, + "step": 21861 + }, + { + "epoch": 2.8457633736821553, + "grad_norm": 2.775191068649292, + "learning_rate": 6.798845700154987e-08, + "loss": 0.3325, + "step": 21864 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 2.8869082927703857, + "learning_rate": 6.764542475262147e-08, + "loss": 0.3708, + "step": 21867 + }, + { + "epoch": 2.846544318625537, + "grad_norm": 3.177225112915039, + "learning_rate": 6.73032541756613e-08, + "loss": 0.3381, + "step": 21870 + }, + { + "epoch": 2.846934791097228, + "grad_norm": 2.946268081665039, + "learning_rate": 6.696194533044598e-08, + "loss": 0.3494, + "step": 21873 + }, + { + "epoch": 2.8473252635689184, + "grad_norm": 3.021143674850464, + "learning_rate": 6.662149827660114e-08, + "loss": 0.3219, + "step": 21876 + }, + { + "epoch": 2.847715736040609, + "grad_norm": 2.826537847518921, + "learning_rate": 6.628191307360199e-08, + "loss": 0.341, + "step": 21879 + }, + { + "epoch": 2.8481062085123, + "grad_norm": 2.8287034034729004, + "learning_rate": 6.594318978077386e-08, + "loss": 0.3956, + "step": 21882 + }, + { + "epoch": 2.848496680983991, + "grad_norm": 3.175198793411255, + "learning_rate": 6.56053284572905e-08, + "loss": 0.2685, + "step": 21885 + }, + { + "epoch": 
2.8488871534556814, + "grad_norm": 2.6262009143829346, + "learning_rate": 6.526832916217584e-08, + "loss": 0.374, + "step": 21888 + }, + { + "epoch": 2.849277625927372, + "grad_norm": 2.9307057857513428, + "learning_rate": 6.493219195430334e-08, + "loss": 0.4002, + "step": 21891 + }, + { + "epoch": 2.849668098399063, + "grad_norm": 2.8721837997436523, + "learning_rate": 6.459691689239433e-08, + "loss": 0.3986, + "step": 21894 + }, + { + "epoch": 2.8500585708707535, + "grad_norm": 2.7488579750061035, + "learning_rate": 6.426250403502199e-08, + "loss": 0.333, + "step": 21897 + }, + { + "epoch": 2.8504490433424445, + "grad_norm": 2.9429168701171875, + "learning_rate": 6.392895344060623e-08, + "loss": 0.3574, + "step": 21900 + }, + { + "epoch": 2.850839515814135, + "grad_norm": 2.7157819271087646, + "learning_rate": 6.359626516741935e-08, + "loss": 0.3414, + "step": 21903 + }, + { + "epoch": 2.8512299882858256, + "grad_norm": 2.7233452796936035, + "learning_rate": 6.326443927357983e-08, + "loss": 0.3299, + "step": 21906 + }, + { + "epoch": 2.8516204607575166, + "grad_norm": 2.926488161087036, + "learning_rate": 6.293347581705689e-08, + "loss": 0.3944, + "step": 21909 + }, + { + "epoch": 2.852010933229207, + "grad_norm": 2.6980791091918945, + "learning_rate": 6.260337485567037e-08, + "loss": 0.3758, + "step": 21912 + }, + { + "epoch": 2.852401405700898, + "grad_norm": 2.777859926223755, + "learning_rate": 6.227413644708691e-08, + "loss": 0.2841, + "step": 21915 + }, + { + "epoch": 2.8527918781725887, + "grad_norm": 3.123918056488037, + "learning_rate": 6.194576064882496e-08, + "loss": 0.365, + "step": 21918 + }, + { + "epoch": 2.8531823506442797, + "grad_norm": 2.804738998413086, + "learning_rate": 6.161824751824974e-08, + "loss": 0.3655, + "step": 21921 + }, + { + "epoch": 2.8535728231159703, + "grad_norm": 2.6704933643341064, + "learning_rate": 6.129159711257826e-08, + "loss": 0.2882, + "step": 21924 + }, + { + "epoch": 2.8539632955876613, + "grad_norm": 2.9849040508270264, + "learning_rate": 6.096580948887543e-08, + "loss": 0.3722, + "step": 21927 + }, + { + "epoch": 2.854353768059352, + "grad_norm": 2.9017884731292725, + "learning_rate": 6.064088470405516e-08, + "loss": 0.3921, + "step": 21930 + }, + { + "epoch": 2.8547442405310424, + "grad_norm": 2.6805455684661865, + "learning_rate": 6.03168228148815e-08, + "loss": 0.3453, + "step": 21933 + }, + { + "epoch": 2.8551347130027334, + "grad_norm": 2.8592615127563477, + "learning_rate": 5.999362387796747e-08, + "loss": 0.3311, + "step": 21936 + }, + { + "epoch": 2.855525185474424, + "grad_norm": 3.1579458713531494, + "learning_rate": 5.967128794977462e-08, + "loss": 0.3666, + "step": 21939 + }, + { + "epoch": 2.855915657946115, + "grad_norm": 2.6894073486328125, + "learning_rate": 5.9349815086615084e-08, + "loss": 0.3648, + "step": 21942 + }, + { + "epoch": 2.8563061304178055, + "grad_norm": 2.765660047531128, + "learning_rate": 5.902920534464951e-08, + "loss": 0.3761, + "step": 21945 + }, + { + "epoch": 2.8566966028894965, + "grad_norm": 2.739934206008911, + "learning_rate": 5.870945877988754e-08, + "loss": 0.293, + "step": 21948 + }, + { + "epoch": 2.857087075361187, + "grad_norm": 3.662998914718628, + "learning_rate": 5.839057544818783e-08, + "loss": 0.3738, + "step": 21951 + }, + { + "epoch": 2.857477547832878, + "grad_norm": 2.789340019226074, + "learning_rate": 5.8072555405259135e-08, + "loss": 0.3837, + "step": 21954 + }, + { + "epoch": 2.8578680203045685, + "grad_norm": 2.4492013454437256, + "learning_rate": 5.7755398706658694e-08, + 
"loss": 0.3126, + "step": 21957 + }, + { + "epoch": 2.858258492776259, + "grad_norm": 2.5558364391326904, + "learning_rate": 5.7439105407792736e-08, + "loss": 0.31, + "step": 21960 + }, + { + "epoch": 2.85864896524795, + "grad_norm": 2.8237719535827637, + "learning_rate": 5.712367556391818e-08, + "loss": 0.3935, + "step": 21963 + }, + { + "epoch": 2.8590394377196406, + "grad_norm": 2.918532371520996, + "learning_rate": 5.6809109230138714e-08, + "loss": 0.3618, + "step": 21966 + }, + { + "epoch": 2.8594299101913316, + "grad_norm": 3.382922410964966, + "learning_rate": 5.6495406461409274e-08, + "loss": 0.3443, + "step": 21969 + }, + { + "epoch": 2.859820382663022, + "grad_norm": 2.8826119899749756, + "learning_rate": 5.6182567312532134e-08, + "loss": 0.4316, + "step": 21972 + }, + { + "epoch": 2.8602108551347127, + "grad_norm": 2.8180668354034424, + "learning_rate": 5.587059183816079e-08, + "loss": 0.3589, + "step": 21975 + }, + { + "epoch": 2.8606013276064037, + "grad_norm": 2.677135705947876, + "learning_rate": 5.5559480092795545e-08, + "loss": 0.3745, + "step": 21978 + }, + { + "epoch": 2.8609918000780947, + "grad_norm": 2.9708173274993896, + "learning_rate": 5.5249232130787924e-08, + "loss": 0.3381, + "step": 21981 + }, + { + "epoch": 2.8613822725497853, + "grad_norm": 3.1857826709747314, + "learning_rate": 5.493984800633734e-08, + "loss": 0.3931, + "step": 21984 + }, + { + "epoch": 2.861772745021476, + "grad_norm": 3.0874578952789307, + "learning_rate": 5.4631327773492226e-08, + "loss": 0.3146, + "step": 21987 + }, + { + "epoch": 2.862163217493167, + "grad_norm": 2.952526807785034, + "learning_rate": 5.4323671486150585e-08, + "loss": 0.3709, + "step": 21990 + }, + { + "epoch": 2.8625536899648574, + "grad_norm": 2.7421607971191406, + "learning_rate": 5.401687919805942e-08, + "loss": 0.3305, + "step": 21993 + }, + { + "epoch": 2.8629441624365484, + "grad_norm": 3.3687474727630615, + "learning_rate": 5.3710950962814755e-08, + "loss": 0.4446, + "step": 21996 + }, + { + "epoch": 2.863334634908239, + "grad_norm": 3.39333176612854, + "learning_rate": 5.3405886833861054e-08, + "loss": 0.3432, + "step": 21999 + }, + { + "epoch": 2.8637251073799295, + "grad_norm": 2.7644834518432617, + "learning_rate": 5.3101686864492904e-08, + "loss": 0.3486, + "step": 22002 + }, + { + "epoch": 2.8641155798516205, + "grad_norm": 2.7170677185058594, + "learning_rate": 5.279835110785392e-08, + "loss": 0.3633, + "step": 22005 + }, + { + "epoch": 2.8645060523233115, + "grad_norm": 3.4381911754608154, + "learning_rate": 5.249587961693503e-08, + "loss": 0.3431, + "step": 22008 + }, + { + "epoch": 2.864896524795002, + "grad_norm": 2.583317756652832, + "learning_rate": 5.219427244457842e-08, + "loss": 0.3621, + "step": 22011 + }, + { + "epoch": 2.8652869972666926, + "grad_norm": 2.8179497718811035, + "learning_rate": 5.189352964347305e-08, + "loss": 0.3758, + "step": 22014 + }, + { + "epoch": 2.8656774697383836, + "grad_norm": 2.632049560546875, + "learning_rate": 5.159365126615967e-08, + "loss": 0.3725, + "step": 22017 + }, + { + "epoch": 2.866067942210074, + "grad_norm": 3.091775894165039, + "learning_rate": 5.129463736502471e-08, + "loss": 0.3826, + "step": 22020 + }, + { + "epoch": 2.866458414681765, + "grad_norm": 2.7316322326660156, + "learning_rate": 5.099648799230583e-08, + "loss": 0.3998, + "step": 22023 + }, + { + "epoch": 2.8668488871534556, + "grad_norm": 2.750880002975464, + "learning_rate": 5.0699203200089694e-08, + "loss": 0.4074, + "step": 22026 + }, + { + "epoch": 2.867239359625146, + "grad_norm": 
3.477369785308838, + "learning_rate": 5.0402783040311435e-08, + "loss": 0.4091, + "step": 22029 + }, + { + "epoch": 2.867629832096837, + "grad_norm": 2.8512279987335205, + "learning_rate": 5.010722756475406e-08, + "loss": 0.3391, + "step": 22032 + }, + { + "epoch": 2.868020304568528, + "grad_norm": 2.6762948036193848, + "learning_rate": 4.981253682505127e-08, + "loss": 0.2929, + "step": 22035 + }, + { + "epoch": 2.8684107770402187, + "grad_norm": 3.086232900619507, + "learning_rate": 4.951871087268412e-08, + "loss": 0.3973, + "step": 22038 + }, + { + "epoch": 2.8688012495119093, + "grad_norm": 2.8311057090759277, + "learning_rate": 4.922574975898431e-08, + "loss": 0.3275, + "step": 22041 + }, + { + "epoch": 2.8691917219836003, + "grad_norm": 2.6848394870758057, + "learning_rate": 4.893365353513091e-08, + "loss": 0.3176, + "step": 22044 + }, + { + "epoch": 2.869582194455291, + "grad_norm": 2.7619194984436035, + "learning_rate": 4.86424222521531e-08, + "loss": 0.3016, + "step": 22047 + }, + { + "epoch": 2.869972666926982, + "grad_norm": 2.7256760597229004, + "learning_rate": 4.835205596092796e-08, + "loss": 0.3447, + "step": 22050 + }, + { + "epoch": 2.8703631393986724, + "grad_norm": 2.8061459064483643, + "learning_rate": 4.806255471218102e-08, + "loss": 0.3466, + "step": 22053 + }, + { + "epoch": 2.870753611870363, + "grad_norm": 2.5725607872009277, + "learning_rate": 4.7773918556489054e-08, + "loss": 0.3098, + "step": 22056 + }, + { + "epoch": 2.871144084342054, + "grad_norm": 2.703907012939453, + "learning_rate": 4.748614754427561e-08, + "loss": 0.3863, + "step": 22059 + }, + { + "epoch": 2.8715345568137445, + "grad_norm": 2.798107385635376, + "learning_rate": 4.71992417258138e-08, + "loss": 0.3247, + "step": 22062 + }, + { + "epoch": 2.8719250292854355, + "grad_norm": 2.4142208099365234, + "learning_rate": 4.691320115122466e-08, + "loss": 0.3145, + "step": 22065 + }, + { + "epoch": 2.872315501757126, + "grad_norm": 2.5614852905273438, + "learning_rate": 4.6628025870479875e-08, + "loss": 0.3519, + "step": 22068 + }, + { + "epoch": 2.872705974228817, + "grad_norm": 2.7241270542144775, + "learning_rate": 4.6343715933399034e-08, + "loss": 0.3275, + "step": 22071 + }, + { + "epoch": 2.8730964467005076, + "grad_norm": 2.801772356033325, + "learning_rate": 4.6060271389649635e-08, + "loss": 0.3661, + "step": 22074 + }, + { + "epoch": 2.8734869191721986, + "grad_norm": 2.6916000843048096, + "learning_rate": 4.577769228874873e-08, + "loss": 0.3467, + "step": 22077 + }, + { + "epoch": 2.873877391643889, + "grad_norm": 2.7385640144348145, + "learning_rate": 4.54959786800635e-08, + "loss": 0.3489, + "step": 22080 + }, + { + "epoch": 2.8742678641155797, + "grad_norm": 2.9576168060302734, + "learning_rate": 4.521513061280791e-08, + "loss": 0.333, + "step": 22083 + }, + { + "epoch": 2.8746583365872707, + "grad_norm": 2.8429853916168213, + "learning_rate": 4.4935148136045495e-08, + "loss": 0.3436, + "step": 22086 + }, + { + "epoch": 2.875048809058961, + "grad_norm": 2.8129966259002686, + "learning_rate": 4.465603129868934e-08, + "loss": 0.3853, + "step": 22089 + }, + { + "epoch": 2.875439281530652, + "grad_norm": 2.891261100769043, + "learning_rate": 4.437778014949934e-08, + "loss": 0.3138, + "step": 22092 + }, + { + "epoch": 2.8758297540023428, + "grad_norm": 2.6374096870422363, + "learning_rate": 4.410039473708605e-08, + "loss": 0.2911, + "step": 22095 + }, + { + "epoch": 2.8762202264740333, + "grad_norm": 2.888641119003296, + "learning_rate": 4.3823875109908486e-08, + "loss": 0.4262, + "step": 22098 
+ }, + { + "epoch": 2.8766106989457243, + "grad_norm": 3.0171844959259033, + "learning_rate": 4.354822131627357e-08, + "loss": 0.427, + "step": 22101 + }, + { + "epoch": 2.8770011714174153, + "grad_norm": 2.790205717086792, + "learning_rate": 4.327343340433721e-08, + "loss": 0.3196, + "step": 22104 + }, + { + "epoch": 2.877391643889106, + "grad_norm": 2.8684253692626953, + "learning_rate": 4.29995114221049e-08, + "loss": 0.3728, + "step": 22107 + }, + { + "epoch": 2.8777821163607964, + "grad_norm": 2.8882906436920166, + "learning_rate": 4.2726455417430037e-08, + "loss": 0.3488, + "step": 22110 + }, + { + "epoch": 2.8781725888324874, + "grad_norm": 2.429396152496338, + "learning_rate": 4.2454265438014454e-08, + "loss": 0.3179, + "step": 22113 + }, + { + "epoch": 2.878563061304178, + "grad_norm": 3.0140392780303955, + "learning_rate": 4.2182941531410113e-08, + "loss": 0.3496, + "step": 22116 + }, + { + "epoch": 2.878953533775869, + "grad_norm": 3.1088218688964844, + "learning_rate": 4.191248374501577e-08, + "loss": 0.3918, + "step": 22119 + }, + { + "epoch": 2.8793440062475595, + "grad_norm": 2.7448649406433105, + "learning_rate": 4.16428921260803e-08, + "loss": 0.4052, + "step": 22122 + }, + { + "epoch": 2.87973447871925, + "grad_norm": 2.797527551651001, + "learning_rate": 4.1374166721701026e-08, + "loss": 0.3408, + "step": 22125 + }, + { + "epoch": 2.880124951190941, + "grad_norm": 2.954578399658203, + "learning_rate": 4.110630757882261e-08, + "loss": 0.3704, + "step": 22128 + }, + { + "epoch": 2.880515423662632, + "grad_norm": 2.8430421352386475, + "learning_rate": 4.0839314744240966e-08, + "loss": 0.4094, + "step": 22131 + }, + { + "epoch": 2.8809058961343226, + "grad_norm": 2.5656564235687256, + "learning_rate": 4.057318826459822e-08, + "loss": 0.3717, + "step": 22134 + }, + { + "epoch": 2.881296368606013, + "grad_norm": 2.829988956451416, + "learning_rate": 4.0307928186386625e-08, + "loss": 0.3751, + "step": 22137 + }, + { + "epoch": 2.881686841077704, + "grad_norm": 2.584980010986328, + "learning_rate": 4.004353455594578e-08, + "loss": 0.3584, + "step": 22140 + }, + { + "epoch": 2.8820773135493947, + "grad_norm": 3.4724276065826416, + "learning_rate": 3.978000741946597e-08, + "loss": 0.3441, + "step": 22143 + }, + { + "epoch": 2.8824677860210857, + "grad_norm": 2.891096830368042, + "learning_rate": 3.951734682298314e-08, + "loss": 0.3937, + "step": 22146 + }, + { + "epoch": 2.882858258492776, + "grad_norm": 3.165163993835449, + "learning_rate": 3.9255552812385025e-08, + "loss": 0.3763, + "step": 22149 + }, + { + "epoch": 2.8832487309644668, + "grad_norm": 2.5909931659698486, + "learning_rate": 3.8994625433406155e-08, + "loss": 0.3113, + "step": 22152 + }, + { + "epoch": 2.8836392034361578, + "grad_norm": 2.9258785247802734, + "learning_rate": 3.873456473162951e-08, + "loss": 0.3515, + "step": 22155 + }, + { + "epoch": 2.8840296759078488, + "grad_norm": 2.447117805480957, + "learning_rate": 3.847537075248764e-08, + "loss": 0.333, + "step": 22158 + }, + { + "epoch": 2.8844201483795393, + "grad_norm": 2.608942985534668, + "learning_rate": 3.821704354126099e-08, + "loss": 0.3675, + "step": 22161 + }, + { + "epoch": 2.88481062085123, + "grad_norm": 2.856931447982788, + "learning_rate": 3.795958314307846e-08, + "loss": 0.3207, + "step": 22164 + }, + { + "epoch": 2.885201093322921, + "grad_norm": 3.8806073665618896, + "learning_rate": 3.770298960291796e-08, + "loss": 0.3051, + "step": 22167 + }, + { + "epoch": 2.8855915657946114, + "grad_norm": 2.8698956966400146, + "learning_rate": 
3.744726296560641e-08, + "loss": 0.3686, + "step": 22170 + }, + { + "epoch": 2.8859820382663024, + "grad_norm": 2.8077917098999023, + "learning_rate": 3.7192403275818636e-08, + "loss": 0.3602, + "step": 22173 + }, + { + "epoch": 2.886372510737993, + "grad_norm": 2.8903465270996094, + "learning_rate": 3.693841057807734e-08, + "loss": 0.3563, + "step": 22176 + }, + { + "epoch": 2.8867629832096835, + "grad_norm": 2.8778324127197266, + "learning_rate": 3.6685284916755384e-08, + "loss": 0.3589, + "step": 22179 + }, + { + "epoch": 2.8871534556813745, + "grad_norm": 2.758140802383423, + "learning_rate": 3.643302633607237e-08, + "loss": 0.3862, + "step": 22182 + }, + { + "epoch": 2.887543928153065, + "grad_norm": 2.449650526046753, + "learning_rate": 3.618163488009807e-08, + "loss": 0.3352, + "step": 22185 + }, + { + "epoch": 2.887934400624756, + "grad_norm": 2.983010768890381, + "learning_rate": 3.593111059274956e-08, + "loss": 0.3191, + "step": 22188 + }, + { + "epoch": 2.8883248730964466, + "grad_norm": 2.961902618408203, + "learning_rate": 3.5681453517793506e-08, + "loss": 0.3256, + "step": 22191 + }, + { + "epoch": 2.8887153455681376, + "grad_norm": 2.506606340408325, + "learning_rate": 3.54326636988439e-08, + "loss": 0.3348, + "step": 22194 + }, + { + "epoch": 2.889105818039828, + "grad_norm": 2.9733147621154785, + "learning_rate": 3.518474117936432e-08, + "loss": 0.3812, + "step": 22197 + }, + { + "epoch": 2.889496290511519, + "grad_norm": 2.4794435501098633, + "learning_rate": 3.493768600266567e-08, + "loss": 0.3011, + "step": 22200 + }, + { + "epoch": 2.8898867629832097, + "grad_norm": 2.974116563796997, + "learning_rate": 3.469149821190842e-08, + "loss": 0.337, + "step": 22203 + }, + { + "epoch": 2.8902772354549002, + "grad_norm": 3.0011672973632812, + "learning_rate": 3.4446177850100957e-08, + "loss": 0.4151, + "step": 22206 + }, + { + "epoch": 2.8906677079265912, + "grad_norm": 2.728896141052246, + "learning_rate": 3.42017249601001e-08, + "loss": 0.3306, + "step": 22209 + }, + { + "epoch": 2.8910581803982818, + "grad_norm": 2.9429938793182373, + "learning_rate": 3.395813958461169e-08, + "loss": 0.3284, + "step": 22212 + }, + { + "epoch": 2.8914486528699728, + "grad_norm": 2.7792723178863525, + "learning_rate": 3.371542176618891e-08, + "loss": 0.3404, + "step": 22215 + }, + { + "epoch": 2.8918391253416633, + "grad_norm": 2.7647454738616943, + "learning_rate": 3.347357154723452e-08, + "loss": 0.3339, + "step": 22218 + }, + { + "epoch": 2.8922295978133543, + "grad_norm": 2.6491119861602783, + "learning_rate": 3.323258896999915e-08, + "loss": 0.3466, + "step": 22221 + }, + { + "epoch": 2.892620070285045, + "grad_norm": 2.659365177154541, + "learning_rate": 3.2992474076581904e-08, + "loss": 0.3819, + "step": 22224 + }, + { + "epoch": 2.893010542756736, + "grad_norm": 3.4432740211486816, + "learning_rate": 3.27532269089309e-08, + "loss": 0.3364, + "step": 22227 + }, + { + "epoch": 2.8934010152284264, + "grad_norm": 2.9346230030059814, + "learning_rate": 3.251484750884048e-08, + "loss": 0.3694, + "step": 22230 + }, + { + "epoch": 2.893791487700117, + "grad_norm": 3.1397318840026855, + "learning_rate": 3.227733591795734e-08, + "loss": 0.3735, + "step": 22233 + }, + { + "epoch": 2.894181960171808, + "grad_norm": 2.8210697174072266, + "learning_rate": 3.204069217777217e-08, + "loss": 0.3545, + "step": 22236 + }, + { + "epoch": 2.8945724326434985, + "grad_norm": 2.865011692047119, + "learning_rate": 3.1804916329627456e-08, + "loss": 0.3738, + "step": 22239 + }, + { + "epoch": 2.8949629051151895, 
+ "grad_norm": 2.7513821125030518, + "learning_rate": 3.157000841471247e-08, + "loss": 0.3312, + "step": 22242 + }, + { + "epoch": 2.89535337758688, + "grad_norm": 2.894726037979126, + "learning_rate": 3.1335968474064395e-08, + "loss": 0.3482, + "step": 22245 + }, + { + "epoch": 2.8957438500585706, + "grad_norm": 2.9314725399017334, + "learning_rate": 3.110279654857051e-08, + "loss": 0.3727, + "step": 22248 + }, + { + "epoch": 2.8961343225302616, + "grad_norm": 2.801659107208252, + "learning_rate": 3.087049267896492e-08, + "loss": 0.3829, + "step": 22251 + }, + { + "epoch": 2.8965247950019526, + "grad_norm": 2.8862311840057373, + "learning_rate": 3.063905690583069e-08, + "loss": 0.3442, + "step": 22254 + }, + { + "epoch": 2.896915267473643, + "grad_norm": 2.668565273284912, + "learning_rate": 3.040848926959938e-08, + "loss": 0.3629, + "step": 22257 + }, + { + "epoch": 2.8973057399453337, + "grad_norm": 2.784465789794922, + "learning_rate": 3.017878981055045e-08, + "loss": 0.3723, + "step": 22260 + }, + { + "epoch": 2.8976962124170247, + "grad_norm": 2.8275580406188965, + "learning_rate": 2.9949958568811774e-08, + "loss": 0.4457, + "step": 22263 + }, + { + "epoch": 2.8980866848887152, + "grad_norm": 2.947969913482666, + "learning_rate": 2.9721995584360286e-08, + "loss": 0.3491, + "step": 22266 + }, + { + "epoch": 2.8984771573604062, + "grad_norm": 3.009458303451538, + "learning_rate": 2.9494900897019675e-08, + "loss": 0.3924, + "step": 22269 + }, + { + "epoch": 2.898867629832097, + "grad_norm": 2.875168561935425, + "learning_rate": 2.926867454646376e-08, + "loss": 0.3703, + "step": 22272 + }, + { + "epoch": 2.8992581023037873, + "grad_norm": 2.945216417312622, + "learning_rate": 2.904331657221371e-08, + "loss": 0.3615, + "step": 22275 + }, + { + "epoch": 2.8996485747754783, + "grad_norm": 3.359584331512451, + "learning_rate": 2.8818827013638583e-08, + "loss": 0.3504, + "step": 22278 + }, + { + "epoch": 2.9000390472471693, + "grad_norm": 2.8335955142974854, + "learning_rate": 2.859520590995646e-08, + "loss": 0.3456, + "step": 22281 + }, + { + "epoch": 2.90042951971886, + "grad_norm": 2.685680627822876, + "learning_rate": 2.837245330023386e-08, + "loss": 0.3402, + "step": 22284 + }, + { + "epoch": 2.9008199921905504, + "grad_norm": 2.90457820892334, + "learning_rate": 2.815056922338466e-08, + "loss": 0.3305, + "step": 22287 + }, + { + "epoch": 2.9012104646622414, + "grad_norm": 2.739206314086914, + "learning_rate": 2.79295537181723e-08, + "loss": 0.338, + "step": 22290 + }, + { + "epoch": 2.901600937133932, + "grad_norm": 3.032250165939331, + "learning_rate": 2.770940682320644e-08, + "loss": 0.3378, + "step": 22293 + }, + { + "epoch": 2.901991409605623, + "grad_norm": 2.631166696548462, + "learning_rate": 2.749012857694744e-08, + "loss": 0.3898, + "step": 22296 + }, + { + "epoch": 2.9023818820773135, + "grad_norm": 2.825129747390747, + "learning_rate": 2.7271719017702424e-08, + "loss": 0.3608, + "step": 22299 + }, + { + "epoch": 2.902772354549004, + "grad_norm": 2.757324457168579, + "learning_rate": 2.7054178183626988e-08, + "loss": 0.4074, + "step": 22302 + }, + { + "epoch": 2.903162827020695, + "grad_norm": 2.9708240032196045, + "learning_rate": 2.683750611272462e-08, + "loss": 0.314, + "step": 22305 + }, + { + "epoch": 2.903553299492386, + "grad_norm": 3.2644076347351074, + "learning_rate": 2.6621702842848372e-08, + "loss": 0.3526, + "step": 22308 + }, + { + "epoch": 2.9039437719640766, + "grad_norm": 2.9245588779449463, + "learning_rate": 2.6406768411698093e-08, + "loss": 0.3377, + 
"step": 22311 + }, + { + "epoch": 2.904334244435767, + "grad_norm": 2.6325671672821045, + "learning_rate": 2.6192702856822073e-08, + "loss": 0.3118, + "step": 22314 + }, + { + "epoch": 2.904724716907458, + "grad_norm": 3.0001933574676514, + "learning_rate": 2.597950621561818e-08, + "loss": 0.3108, + "step": 22317 + }, + { + "epoch": 2.9051151893791487, + "grad_norm": 3.227618932723999, + "learning_rate": 2.5767178525330504e-08, + "loss": 0.39, + "step": 22320 + }, + { + "epoch": 2.9055056618508397, + "grad_norm": 2.91813325881958, + "learning_rate": 2.555571982305216e-08, + "loss": 0.3227, + "step": 22323 + }, + { + "epoch": 2.9058961343225302, + "grad_norm": 2.577854633331299, + "learning_rate": 2.534513014572526e-08, + "loss": 0.2979, + "step": 22326 + }, + { + "epoch": 2.906286606794221, + "grad_norm": 2.738893985748291, + "learning_rate": 2.513540953013871e-08, + "loss": 0.377, + "step": 22329 + }, + { + "epoch": 2.906677079265912, + "grad_norm": 2.668468475341797, + "learning_rate": 2.4926558012930425e-08, + "loss": 0.3525, + "step": 22332 + }, + { + "epoch": 2.9070675517376023, + "grad_norm": 2.666583299636841, + "learning_rate": 2.471857563058677e-08, + "loss": 0.3823, + "step": 22335 + }, + { + "epoch": 2.9074580242092933, + "grad_norm": 3.1053268909454346, + "learning_rate": 2.4511462419441466e-08, + "loss": 0.3824, + "step": 22338 + }, + { + "epoch": 2.907848496680984, + "grad_norm": 2.7575018405914307, + "learning_rate": 2.4305218415677235e-08, + "loss": 0.3984, + "step": 22341 + }, + { + "epoch": 2.908238969152675, + "grad_norm": 3.4204914569854736, + "learning_rate": 2.409984365532303e-08, + "loss": 0.3681, + "step": 22344 + }, + { + "epoch": 2.9086294416243654, + "grad_norm": 2.5891804695129395, + "learning_rate": 2.389533817425904e-08, + "loss": 0.318, + "step": 22347 + }, + { + "epoch": 2.9090199140960564, + "grad_norm": 2.9645187854766846, + "learning_rate": 2.3691702008211136e-08, + "loss": 0.3515, + "step": 22350 + }, + { + "epoch": 2.909410386567747, + "grad_norm": 3.2020936012268066, + "learning_rate": 2.348893519275364e-08, + "loss": 0.3788, + "step": 22353 + }, + { + "epoch": 2.9098008590394375, + "grad_norm": 2.665403127670288, + "learning_rate": 2.3287037763310984e-08, + "loss": 0.3548, + "step": 22356 + }, + { + "epoch": 2.9101913315111285, + "grad_norm": 2.6909213066101074, + "learning_rate": 2.3086009755152738e-08, + "loss": 0.3125, + "step": 22359 + }, + { + "epoch": 2.910581803982819, + "grad_norm": 2.763498067855835, + "learning_rate": 2.2885851203399146e-08, + "loss": 0.3864, + "step": 22362 + }, + { + "epoch": 2.91097227645451, + "grad_norm": 2.798532485961914, + "learning_rate": 2.268656214301668e-08, + "loss": 0.3635, + "step": 22365 + }, + { + "epoch": 2.9113627489262006, + "grad_norm": 2.9937210083007812, + "learning_rate": 2.2488142608821373e-08, + "loss": 0.4099, + "step": 22368 + }, + { + "epoch": 2.9117532213978916, + "grad_norm": 2.972405195236206, + "learning_rate": 2.2290592635476615e-08, + "loss": 0.4077, + "step": 22371 + }, + { + "epoch": 2.912143693869582, + "grad_norm": 2.8520312309265137, + "learning_rate": 2.2093912257493133e-08, + "loss": 0.3675, + "step": 22374 + }, + { + "epoch": 2.912534166341273, + "grad_norm": 2.894411087036133, + "learning_rate": 2.1898101509231772e-08, + "loss": 0.3879, + "step": 22377 + }, + { + "epoch": 2.9129246388129637, + "grad_norm": 2.9919564723968506, + "learning_rate": 2.170316042489906e-08, + "loss": 0.4072, + "step": 22380 + }, + { + "epoch": 2.9133151112846543, + "grad_norm": 2.7866508960723877, + 
"learning_rate": 2.1509089038551645e-08, + "loss": 0.3156, + "step": 22383 + }, + { + "epoch": 2.9137055837563453, + "grad_norm": 3.07585072517395, + "learning_rate": 2.1315887384093513e-08, + "loss": 0.3686, + "step": 22386 + }, + { + "epoch": 2.914096056228036, + "grad_norm": 3.257378339767456, + "learning_rate": 2.1123555495276005e-08, + "loss": 0.3546, + "step": 22389 + }, + { + "epoch": 2.914486528699727, + "grad_norm": 3.2451460361480713, + "learning_rate": 2.093209340569946e-08, + "loss": 0.346, + "step": 22392 + }, + { + "epoch": 2.9148770011714173, + "grad_norm": 2.622567653656006, + "learning_rate": 2.074150114881157e-08, + "loss": 0.3506, + "step": 22395 + }, + { + "epoch": 2.915267473643108, + "grad_norm": 2.6424460411071777, + "learning_rate": 2.055177875790848e-08, + "loss": 0.3617, + "step": 22398 + }, + { + "epoch": 2.915657946114799, + "grad_norm": 2.814903974533081, + "learning_rate": 2.036292626613423e-08, + "loss": 0.3418, + "step": 22401 + }, + { + "epoch": 2.91604841858649, + "grad_norm": 2.869673728942871, + "learning_rate": 2.0174943706481874e-08, + "loss": 0.3141, + "step": 22404 + }, + { + "epoch": 2.9164388910581804, + "grad_norm": 2.6564950942993164, + "learning_rate": 1.9987831111790147e-08, + "loss": 0.3139, + "step": 22407 + }, + { + "epoch": 2.916829363529871, + "grad_norm": 2.9242870807647705, + "learning_rate": 1.980158851474845e-08, + "loss": 0.4097, + "step": 22410 + }, + { + "epoch": 2.917219836001562, + "grad_norm": 2.8616063594818115, + "learning_rate": 1.9616215947892427e-08, + "loss": 0.3452, + "step": 22413 + }, + { + "epoch": 2.9176103084732525, + "grad_norm": 2.781572103500366, + "learning_rate": 1.9431713443605616e-08, + "loss": 0.384, + "step": 22416 + }, + { + "epoch": 2.9180007809449435, + "grad_norm": 3.0392205715179443, + "learning_rate": 1.924808103412168e-08, + "loss": 0.4144, + "step": 22419 + }, + { + "epoch": 2.918391253416634, + "grad_norm": 2.8837602138519287, + "learning_rate": 1.9065318751519402e-08, + "loss": 0.3395, + "step": 22422 + }, + { + "epoch": 2.9187817258883246, + "grad_norm": 2.5549280643463135, + "learning_rate": 1.8883426627727684e-08, + "loss": 0.3312, + "step": 22425 + }, + { + "epoch": 2.9191721983600156, + "grad_norm": 3.014940023422241, + "learning_rate": 1.870240469452278e-08, + "loss": 0.3757, + "step": 22428 + }, + { + "epoch": 2.9195626708317066, + "grad_norm": 2.8069894313812256, + "learning_rate": 1.8522252983528832e-08, + "loss": 0.3599, + "step": 22431 + }, + { + "epoch": 2.919953143303397, + "grad_norm": 4.53384256362915, + "learning_rate": 1.8342971526217334e-08, + "loss": 0.3301, + "step": 22434 + }, + { + "epoch": 2.9203436157750877, + "grad_norm": 2.5881564617156982, + "learning_rate": 1.8164560353909344e-08, + "loss": 0.3605, + "step": 22437 + }, + { + "epoch": 2.9207340882467787, + "grad_norm": 2.660621166229248, + "learning_rate": 1.798701949777215e-08, + "loss": 0.3414, + "step": 22440 + }, + { + "epoch": 2.9211245607184693, + "grad_norm": 2.7579736709594727, + "learning_rate": 1.7810348988822058e-08, + "loss": 0.3739, + "step": 22443 + }, + { + "epoch": 2.9215150331901603, + "grad_norm": 3.0707061290740967, + "learning_rate": 1.7634548857922707e-08, + "loss": 0.3433, + "step": 22446 + }, + { + "epoch": 2.921905505661851, + "grad_norm": 2.9784891605377197, + "learning_rate": 1.7459619135786753e-08, + "loss": 0.3297, + "step": 22449 + }, + { + "epoch": 2.9222959781335414, + "grad_norm": 2.9023778438568115, + "learning_rate": 1.728555985297309e-08, + "loss": 0.3122, + "step": 22452 + }, + { + 
"epoch": 2.9226864506052324, + "grad_norm": 2.9779181480407715, + "learning_rate": 1.711237103989072e-08, + "loss": 0.3818, + "step": 22455 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 2.76466703414917, + "learning_rate": 1.6940052726793776e-08, + "loss": 0.3163, + "step": 22458 + }, + { + "epoch": 2.923467395548614, + "grad_norm": 2.9506185054779053, + "learning_rate": 1.6768604943787624e-08, + "loss": 0.2745, + "step": 22461 + }, + { + "epoch": 2.9238578680203045, + "grad_norm": 2.865941047668457, + "learning_rate": 1.659802772082275e-08, + "loss": 0.3797, + "step": 22464 + }, + { + "epoch": 2.9242483404919954, + "grad_norm": 2.825014591217041, + "learning_rate": 1.642832108769865e-08, + "loss": 0.4346, + "step": 22467 + }, + { + "epoch": 2.924638812963686, + "grad_norm": 2.9251394271850586, + "learning_rate": 1.6259485074063276e-08, + "loss": 0.3382, + "step": 22470 + }, + { + "epoch": 2.925029285435377, + "grad_norm": 2.5438222885131836, + "learning_rate": 1.6091519709411363e-08, + "loss": 0.3026, + "step": 22473 + }, + { + "epoch": 2.9254197579070675, + "grad_norm": 2.7614340782165527, + "learning_rate": 1.5924425023086665e-08, + "loss": 0.3606, + "step": 22476 + }, + { + "epoch": 2.925810230378758, + "grad_norm": 2.884199619293213, + "learning_rate": 1.575820104427972e-08, + "loss": 0.3452, + "step": 22479 + }, + { + "epoch": 2.926200702850449, + "grad_norm": 2.60504150390625, + "learning_rate": 1.559284780202952e-08, + "loss": 0.3219, + "step": 22482 + }, + { + "epoch": 2.9265911753221396, + "grad_norm": 3.0407423973083496, + "learning_rate": 1.5428365325223516e-08, + "loss": 0.3445, + "step": 22485 + }, + { + "epoch": 2.9269816477938306, + "grad_norm": 2.683939218521118, + "learning_rate": 1.5264753642595387e-08, + "loss": 0.3529, + "step": 22488 + }, + { + "epoch": 2.927372120265521, + "grad_norm": 2.9559898376464844, + "learning_rate": 1.510201278272949e-08, + "loss": 0.3729, + "step": 22491 + }, + { + "epoch": 2.927762592737212, + "grad_norm": 2.676271438598633, + "learning_rate": 1.4940142774054755e-08, + "loss": 0.4197, + "step": 22494 + }, + { + "epoch": 2.9281530652089027, + "grad_norm": 3.0455739498138428, + "learning_rate": 1.4779143644850225e-08, + "loss": 0.4026, + "step": 22497 + }, + { + "epoch": 2.9285435376805937, + "grad_norm": 3.0256705284118652, + "learning_rate": 1.4619015423241178e-08, + "loss": 0.4349, + "step": 22500 + }, + { + "epoch": 2.9289340101522843, + "grad_norm": 2.6953423023223877, + "learning_rate": 1.4459758137203572e-08, + "loss": 0.324, + "step": 22503 + }, + { + "epoch": 2.929324482623975, + "grad_norm": 3.021960735321045, + "learning_rate": 1.4301371814557374e-08, + "loss": 0.3427, + "step": 22506 + }, + { + "epoch": 2.929714955095666, + "grad_norm": 2.9862983226776123, + "learning_rate": 1.4143856482973228e-08, + "loss": 0.3093, + "step": 22509 + }, + { + "epoch": 2.9301054275673564, + "grad_norm": 2.836103916168213, + "learning_rate": 1.3987212169969121e-08, + "loss": 0.3272, + "step": 22512 + }, + { + "epoch": 2.9304959000390474, + "grad_norm": 2.806823492050171, + "learning_rate": 1.3831438902909834e-08, + "loss": 0.3633, + "step": 22515 + }, + { + "epoch": 2.930886372510738, + "grad_norm": 3.1762378215789795, + "learning_rate": 1.3676536709008593e-08, + "loss": 0.3304, + "step": 22518 + }, + { + "epoch": 2.9312768449824285, + "grad_norm": 2.651766300201416, + "learning_rate": 1.3522505615326531e-08, + "loss": 0.3698, + "step": 22521 + }, + { + "epoch": 2.9316673174541195, + "grad_norm": 2.5449414253234863, + "learning_rate": 
1.336934564877268e-08, + "loss": 0.4071, + "step": 22524 + }, + { + "epoch": 2.9320577899258105, + "grad_norm": 2.726243495941162, + "learning_rate": 1.3217056836104525e-08, + "loss": 0.4334, + "step": 22527 + }, + { + "epoch": 2.932448262397501, + "grad_norm": 2.689444065093994, + "learning_rate": 1.3065639203925229e-08, + "loss": 0.3296, + "step": 22530 + }, + { + "epoch": 2.9328387348691916, + "grad_norm": 2.9110662937164307, + "learning_rate": 1.2915092778688077e-08, + "loss": 0.381, + "step": 22533 + }, + { + "epoch": 2.9332292073408825, + "grad_norm": 3.017112970352173, + "learning_rate": 1.2765417586692586e-08, + "loss": 0.366, + "step": 22536 + }, + { + "epoch": 2.933619679812573, + "grad_norm": 2.5785021781921387, + "learning_rate": 1.2616613654087285e-08, + "loss": 0.3984, + "step": 22539 + }, + { + "epoch": 2.934010152284264, + "grad_norm": 2.9283220767974854, + "learning_rate": 1.2468681006868044e-08, + "loss": 0.3302, + "step": 22542 + }, + { + "epoch": 2.9344006247559546, + "grad_norm": 2.7230112552642822, + "learning_rate": 1.2321619670877527e-08, + "loss": 0.3774, + "step": 22545 + }, + { + "epoch": 2.934791097227645, + "grad_norm": 2.7927682399749756, + "learning_rate": 1.2175429671807405e-08, + "loss": 0.421, + "step": 22548 + }, + { + "epoch": 2.935181569699336, + "grad_norm": 2.6731719970703125, + "learning_rate": 1.2030111035197245e-08, + "loss": 0.3555, + "step": 22551 + }, + { + "epoch": 2.935572042171027, + "grad_norm": 2.5603408813476562, + "learning_rate": 1.1885663786433411e-08, + "loss": 0.3375, + "step": 22554 + }, + { + "epoch": 2.9359625146427177, + "grad_norm": 2.7101588249206543, + "learning_rate": 1.1742087950750714e-08, + "loss": 0.3384, + "step": 22557 + }, + { + "epoch": 2.9363529871144083, + "grad_norm": 2.6867103576660156, + "learning_rate": 1.1599383553231314e-08, + "loss": 0.3236, + "step": 22560 + }, + { + "epoch": 2.9367434595860993, + "grad_norm": 3.0580341815948486, + "learning_rate": 1.1457550618805824e-08, + "loss": 0.3854, + "step": 22563 + }, + { + "epoch": 2.93713393205779, + "grad_norm": 2.929643154144287, + "learning_rate": 1.1316589172251091e-08, + "loss": 0.344, + "step": 22566 + }, + { + "epoch": 2.937524404529481, + "grad_norm": 2.806877613067627, + "learning_rate": 1.1176499238194639e-08, + "loss": 0.3385, + "step": 22569 + }, + { + "epoch": 2.9379148770011714, + "grad_norm": 2.79831600189209, + "learning_rate": 1.1037280841108e-08, + "loss": 0.3901, + "step": 22572 + }, + { + "epoch": 2.938305349472862, + "grad_norm": 2.647631883621216, + "learning_rate": 1.0898934005313389e-08, + "loss": 0.3789, + "step": 22575 + }, + { + "epoch": 2.938695821944553, + "grad_norm": 2.671846866607666, + "learning_rate": 1.0761458754979804e-08, + "loss": 0.2923, + "step": 22578 + }, + { + "epoch": 2.939086294416244, + "grad_norm": 2.559683084487915, + "learning_rate": 1.0624855114123035e-08, + "loss": 0.3096, + "step": 22581 + }, + { + "epoch": 2.9394767668879345, + "grad_norm": 2.7834372520446777, + "learning_rate": 1.0489123106608434e-08, + "loss": 0.3449, + "step": 22584 + }, + { + "epoch": 2.939867239359625, + "grad_norm": 2.7174694538116455, + "learning_rate": 1.0354262756147593e-08, + "loss": 0.3508, + "step": 22587 + }, + { + "epoch": 2.940257711831316, + "grad_norm": 2.7829127311706543, + "learning_rate": 1.0220274086299998e-08, + "loss": 0.397, + "step": 22590 + }, + { + "epoch": 2.9406481843030066, + "grad_norm": 2.9258644580841064, + "learning_rate": 1.0087157120474145e-08, + "loss": 0.3757, + "step": 22593 + }, + { + "epoch": 
2.9410386567746976, + "grad_norm": 2.8890163898468018, + "learning_rate": 9.954911881924212e-09, + "loss": 0.3301, + "step": 22596 + }, + { + "epoch": 2.941429129246388, + "grad_norm": 2.4715800285339355, + "learning_rate": 9.82353839375394e-09, + "loss": 0.3031, + "step": 22599 + }, + { + "epoch": 2.9418196017180787, + "grad_norm": 2.8599045276641846, + "learning_rate": 9.693036678913303e-09, + "loss": 0.3638, + "step": 22602 + }, + { + "epoch": 2.9422100741897697, + "grad_norm": 2.724221706390381, + "learning_rate": 9.563406760201288e-09, + "loss": 0.4034, + "step": 22605 + }, + { + "epoch": 2.94260054666146, + "grad_norm": 2.7690560817718506, + "learning_rate": 9.434648660263668e-09, + "loss": 0.3274, + "step": 22608 + }, + { + "epoch": 2.942991019133151, + "grad_norm": 2.534320592880249, + "learning_rate": 9.30676240159467e-09, + "loss": 0.3847, + "step": 22611 + }, + { + "epoch": 2.9433814916048417, + "grad_norm": 2.9107460975646973, + "learning_rate": 9.179748006535317e-09, + "loss": 0.3455, + "step": 22614 + }, + { + "epoch": 2.9437719640765327, + "grad_norm": 2.507080554962158, + "learning_rate": 9.05360549727452e-09, + "loss": 0.3132, + "step": 22617 + }, + { + "epoch": 2.9441624365482233, + "grad_norm": 2.537360191345215, + "learning_rate": 8.928334895849656e-09, + "loss": 0.3198, + "step": 22620 + }, + { + "epoch": 2.9445529090199143, + "grad_norm": 2.297013282775879, + "learning_rate": 8.803936224144883e-09, + "loss": 0.2882, + "step": 22623 + }, + { + "epoch": 2.944943381491605, + "grad_norm": 2.7976059913635254, + "learning_rate": 8.680409503892817e-09, + "loss": 0.4199, + "step": 22626 + }, + { + "epoch": 2.9453338539632954, + "grad_norm": 2.64867901802063, + "learning_rate": 8.557754756672864e-09, + "loss": 0.3469, + "step": 22629 + }, + { + "epoch": 2.9457243264349864, + "grad_norm": 2.9491302967071533, + "learning_rate": 8.435972003912329e-09, + "loss": 0.3483, + "step": 22632 + }, + { + "epoch": 2.946114798906677, + "grad_norm": 2.6923861503601074, + "learning_rate": 8.31506126688697e-09, + "loss": 0.3447, + "step": 22635 + }, + { + "epoch": 2.946505271378368, + "grad_norm": 3.363018035888672, + "learning_rate": 8.19502256671989e-09, + "loss": 0.3398, + "step": 22638 + }, + { + "epoch": 2.9468957438500585, + "grad_norm": 2.8834447860717773, + "learning_rate": 8.075855924380427e-09, + "loss": 0.4022, + "step": 22641 + }, + { + "epoch": 2.9472862163217495, + "grad_norm": 2.857146978378296, + "learning_rate": 7.957561360688038e-09, + "loss": 0.3495, + "step": 22644 + }, + { + "epoch": 2.94767668879344, + "grad_norm": 2.984123468399048, + "learning_rate": 7.840138896307303e-09, + "loss": 0.308, + "step": 22647 + }, + { + "epoch": 2.948067161265131, + "grad_norm": 3.035522222518921, + "learning_rate": 7.723588551752925e-09, + "loss": 0.3807, + "step": 22650 + }, + { + "epoch": 2.9484576337368216, + "grad_norm": 2.7213213443756104, + "learning_rate": 7.607910347385283e-09, + "loss": 0.3534, + "step": 22653 + }, + { + "epoch": 2.948848106208512, + "grad_norm": 2.8660976886749268, + "learning_rate": 7.493104303413212e-09, + "loss": 0.3385, + "step": 22656 + }, + { + "epoch": 2.949238578680203, + "grad_norm": 2.926711320877075, + "learning_rate": 7.379170439892891e-09, + "loss": 0.3535, + "step": 22659 + }, + { + "epoch": 2.9496290511518937, + "grad_norm": 2.7307538986206055, + "learning_rate": 7.266108776728953e-09, + "loss": 0.3584, + "step": 22662 + }, + { + "epoch": 2.9500195236235847, + "grad_norm": 2.8166987895965576, + "learning_rate": 7.153919333672266e-09, + "loss": 
0.322, + "step": 22665 + }, + { + "epoch": 2.950409996095275, + "grad_norm": 2.866868734359741, + "learning_rate": 7.042602130322707e-09, + "loss": 0.3712, + "step": 22668 + }, + { + "epoch": 2.9508004685669658, + "grad_norm": 2.841460704803467, + "learning_rate": 6.932157186126942e-09, + "loss": 0.4131, + "step": 22671 + }, + { + "epoch": 2.9511909410386568, + "grad_norm": 2.5666918754577637, + "learning_rate": 6.822584520379538e-09, + "loss": 0.3417, + "step": 22674 + }, + { + "epoch": 2.9515814135103478, + "grad_norm": 3.38871431350708, + "learning_rate": 6.713884152222405e-09, + "loss": 0.3104, + "step": 22677 + }, + { + "epoch": 2.9519718859820383, + "grad_norm": 3.0104832649230957, + "learning_rate": 6.6060561006453525e-09, + "loss": 0.4312, + "step": 22680 + }, + { + "epoch": 2.952362358453729, + "grad_norm": 3.21549129486084, + "learning_rate": 6.499100384485535e-09, + "loss": 0.3817, + "step": 22683 + }, + { + "epoch": 2.95275283092542, + "grad_norm": 3.07248592376709, + "learning_rate": 6.393017022428005e-09, + "loss": 0.3889, + "step": 22686 + }, + { + "epoch": 2.9531433033971104, + "grad_norm": 2.6400251388549805, + "learning_rate": 6.287806033005717e-09, + "loss": 0.3869, + "step": 22689 + }, + { + "epoch": 2.9535337758688014, + "grad_norm": 2.670640707015991, + "learning_rate": 6.1834674345984115e-09, + "loss": 0.2964, + "step": 22692 + }, + { + "epoch": 2.953924248340492, + "grad_norm": 2.874105215072632, + "learning_rate": 6.080001245433731e-09, + "loss": 0.3724, + "step": 22695 + }, + { + "epoch": 2.9543147208121825, + "grad_norm": 2.8548583984375, + "learning_rate": 5.977407483587217e-09, + "loss": 0.3864, + "step": 22698 + }, + { + "epoch": 2.9547051932838735, + "grad_norm": 2.812763214111328, + "learning_rate": 5.875686166981753e-09, + "loss": 0.3374, + "step": 22701 + }, + { + "epoch": 2.9550956657555645, + "grad_norm": 2.711881637573242, + "learning_rate": 5.7748373133875715e-09, + "loss": 0.3739, + "step": 22704 + }, + { + "epoch": 2.955486138227255, + "grad_norm": 2.6988415718078613, + "learning_rate": 5.674860940423354e-09, + "loss": 0.334, + "step": 22707 + }, + { + "epoch": 2.9558766106989456, + "grad_norm": 2.473297357559204, + "learning_rate": 5.5757570655545765e-09, + "loss": 0.3704, + "step": 22710 + }, + { + "epoch": 2.9562670831706366, + "grad_norm": 2.6674001216888428, + "learning_rate": 5.477525706094056e-09, + "loss": 0.364, + "step": 22713 + }, + { + "epoch": 2.956657555642327, + "grad_norm": 2.651705741882324, + "learning_rate": 5.380166879202508e-09, + "loss": 0.3361, + "step": 22716 + }, + { + "epoch": 2.957048028114018, + "grad_norm": 2.93682599067688, + "learning_rate": 5.283680601889107e-09, + "loss": 0.4136, + "step": 22719 + }, + { + "epoch": 2.9574385005857087, + "grad_norm": 2.921872854232788, + "learning_rate": 5.188066891009258e-09, + "loss": 0.3346, + "step": 22722 + }, + { + "epoch": 2.9578289730573992, + "grad_norm": 2.8283815383911133, + "learning_rate": 5.093325763266821e-09, + "loss": 0.2995, + "step": 22725 + }, + { + "epoch": 2.95821944552909, + "grad_norm": 2.4506702423095703, + "learning_rate": 4.999457235212446e-09, + "loss": 0.3262, + "step": 22728 + }, + { + "epoch": 2.958609918000781, + "grad_norm": 2.559972047805786, + "learning_rate": 4.906461323244683e-09, + "loss": 0.3491, + "step": 22731 + }, + { + "epoch": 2.9590003904724718, + "grad_norm": 2.777678966522217, + "learning_rate": 4.814338043609979e-09, + "loss": 0.3715, + "step": 22734 + }, + { + "epoch": 2.9593908629441623, + "grad_norm": 2.797595977783203, + 
"learning_rate": 4.7230874124026825e-09, + "loss": 0.3736, + "step": 22737 + }, + { + "epoch": 2.9597813354158533, + "grad_norm": 2.576273202896118, + "learning_rate": 4.632709445562822e-09, + "loss": 0.3177, + "step": 22740 + }, + { + "epoch": 2.960171807887544, + "grad_norm": 2.8237977027893066, + "learning_rate": 4.543204158879988e-09, + "loss": 0.3287, + "step": 22743 + }, + { + "epoch": 2.960562280359235, + "grad_norm": 2.5393621921539307, + "learning_rate": 4.454571567991117e-09, + "loss": 0.3608, + "step": 22746 + }, + { + "epoch": 2.9609527528309254, + "grad_norm": 2.9079854488372803, + "learning_rate": 4.3668116883788245e-09, + "loss": 0.3497, + "step": 22749 + }, + { + "epoch": 2.961343225302616, + "grad_norm": 2.9478375911712646, + "learning_rate": 4.2799245353752905e-09, + "loss": 0.3858, + "step": 22752 + }, + { + "epoch": 2.961733697774307, + "grad_norm": 2.845107316970825, + "learning_rate": 4.193910124160039e-09, + "loss": 0.3556, + "step": 22755 + }, + { + "epoch": 2.9621241702459975, + "grad_norm": 2.638409376144409, + "learning_rate": 4.108768469758273e-09, + "loss": 0.2997, + "step": 22758 + }, + { + "epoch": 2.9625146427176885, + "grad_norm": 3.1192007064819336, + "learning_rate": 4.0244995870453166e-09, + "loss": 0.3626, + "step": 22761 + }, + { + "epoch": 2.962905115189379, + "grad_norm": 2.773621082305908, + "learning_rate": 3.941103490742171e-09, + "loss": 0.2836, + "step": 22764 + }, + { + "epoch": 2.96329558766107, + "grad_norm": 2.718259811401367, + "learning_rate": 3.858580195418293e-09, + "loss": 0.3435, + "step": 22767 + }, + { + "epoch": 2.9636860601327606, + "grad_norm": 2.8922083377838135, + "learning_rate": 3.77692971548993e-09, + "loss": 0.3966, + "step": 22770 + }, + { + "epoch": 2.9640765326044516, + "grad_norm": 2.6764767169952393, + "learning_rate": 3.6961520652212256e-09, + "loss": 0.3352, + "step": 22773 + }, + { + "epoch": 2.964467005076142, + "grad_norm": 2.891240119934082, + "learning_rate": 3.6162472587242257e-09, + "loss": 0.3341, + "step": 22776 + }, + { + "epoch": 2.9648574775478327, + "grad_norm": 2.7254581451416016, + "learning_rate": 3.5372153099583195e-09, + "loss": 0.3189, + "step": 22779 + }, + { + "epoch": 2.9652479500195237, + "grad_norm": 3.07497501373291, + "learning_rate": 3.459056232729685e-09, + "loss": 0.3985, + "step": 22782 + }, + { + "epoch": 2.9656384224912142, + "grad_norm": 2.8364953994750977, + "learning_rate": 3.3817700406924004e-09, + "loss": 0.3572, + "step": 22785 + }, + { + "epoch": 2.9660288949629052, + "grad_norm": 2.677969217300415, + "learning_rate": 3.305356747348998e-09, + "loss": 0.3667, + "step": 22788 + }, + { + "epoch": 2.9664193674345958, + "grad_norm": 2.8867709636688232, + "learning_rate": 3.229816366047689e-09, + "loss": 0.3836, + "step": 22791 + }, + { + "epoch": 2.9668098399062868, + "grad_norm": 2.7615604400634766, + "learning_rate": 3.15514890998625e-09, + "loss": 0.3543, + "step": 22794 + }, + { + "epoch": 2.9672003123779773, + "grad_norm": 2.8149735927581787, + "learning_rate": 3.0813543922081357e-09, + "loss": 0.3806, + "step": 22797 + }, + { + "epoch": 2.9675907848496683, + "grad_norm": 2.686567544937134, + "learning_rate": 3.0084328256058116e-09, + "loss": 0.3664, + "step": 22800 + }, + { + "epoch": 2.967981257321359, + "grad_norm": 2.6036901473999023, + "learning_rate": 2.9363842229179763e-09, + "loss": 0.3127, + "step": 22803 + }, + { + "epoch": 2.9683717297930494, + "grad_norm": 2.843297004699707, + "learning_rate": 2.865208596731783e-09, + "loss": 0.3899, + "step": 22806 + }, + { + 
"epoch": 2.9687622022647404, + "grad_norm": 2.4631242752075195, + "learning_rate": 2.7949059594806203e-09, + "loss": 0.3895, + "step": 22809 + }, + { + "epoch": 2.969152674736431, + "grad_norm": 2.619086980819702, + "learning_rate": 2.7254763234474404e-09, + "loss": 0.277, + "step": 22812 + }, + { + "epoch": 2.969543147208122, + "grad_norm": 2.584728717803955, + "learning_rate": 2.6569197007603186e-09, + "loss": 0.3257, + "step": 22815 + }, + { + "epoch": 2.9699336196798125, + "grad_norm": 3.1949074268341064, + "learning_rate": 2.589236103396897e-09, + "loss": 0.408, + "step": 22818 + }, + { + "epoch": 2.970324092151503, + "grad_norm": 2.800149440765381, + "learning_rate": 2.5224255431804957e-09, + "loss": 0.3305, + "step": 22821 + }, + { + "epoch": 2.970714564623194, + "grad_norm": 2.811697006225586, + "learning_rate": 2.4564880317834438e-09, + "loss": 0.3886, + "step": 22824 + }, + { + "epoch": 2.971105037094885, + "grad_norm": 2.7836661338806152, + "learning_rate": 2.391423580724861e-09, + "loss": 0.3741, + "step": 22827 + }, + { + "epoch": 2.9714955095665756, + "grad_norm": 2.9304723739624023, + "learning_rate": 2.327232201370655e-09, + "loss": 0.3688, + "step": 22830 + }, + { + "epoch": 2.971885982038266, + "grad_norm": 2.7736597061157227, + "learning_rate": 2.263913904935744e-09, + "loss": 0.3645, + "step": 22833 + }, + { + "epoch": 2.972276454509957, + "grad_norm": 2.527412176132202, + "learning_rate": 2.20146870248128e-09, + "loss": 0.3668, + "step": 22836 + }, + { + "epoch": 2.9726669269816477, + "grad_norm": 2.9895877838134766, + "learning_rate": 2.139896604916314e-09, + "loss": 0.4066, + "step": 22839 + }, + { + "epoch": 2.9730573994533387, + "grad_norm": 2.8589468002319336, + "learning_rate": 2.079197622997242e-09, + "loss": 0.3825, + "step": 22842 + }, + { + "epoch": 2.9734478719250292, + "grad_norm": 2.533005475997925, + "learning_rate": 2.0193717673283597e-09, + "loss": 0.3998, + "step": 22845 + }, + { + "epoch": 2.97383834439672, + "grad_norm": 2.7130699157714844, + "learning_rate": 1.9604190483613062e-09, + "loss": 0.3201, + "step": 22848 + }, + { + "epoch": 2.974228816868411, + "grad_norm": 2.8447370529174805, + "learning_rate": 1.9023394763945104e-09, + "loss": 0.3743, + "step": 22851 + }, + { + "epoch": 2.974619289340102, + "grad_norm": 3.0323686599731445, + "learning_rate": 1.8451330615748553e-09, + "loss": 0.3989, + "step": 22854 + }, + { + "epoch": 2.9750097618117923, + "grad_norm": 3.1013906002044678, + "learning_rate": 1.7887998138954587e-09, + "loss": 0.3767, + "step": 22857 + }, + { + "epoch": 2.975400234283483, + "grad_norm": 2.792752504348755, + "learning_rate": 1.7333397431984477e-09, + "loss": 0.3696, + "step": 22860 + }, + { + "epoch": 2.975790706755174, + "grad_norm": 2.7394256591796875, + "learning_rate": 1.6787528591716284e-09, + "loss": 0.3893, + "step": 22863 + }, + { + "epoch": 2.9761811792268644, + "grad_norm": 3.603832244873047, + "learning_rate": 1.6250391713523717e-09, + "loss": 0.3438, + "step": 22866 + }, + { + "epoch": 2.9765716516985554, + "grad_norm": 2.743468761444092, + "learning_rate": 1.5721986891231722e-09, + "loss": 0.3395, + "step": 22869 + }, + { + "epoch": 2.976962124170246, + "grad_norm": 2.7976534366607666, + "learning_rate": 1.5202314217160895e-09, + "loss": 0.3345, + "step": 22872 + }, + { + "epoch": 2.9773525966419365, + "grad_norm": 2.8233566284179688, + "learning_rate": 1.4691373782088625e-09, + "loss": 0.4015, + "step": 22875 + }, + { + "epoch": 2.9777430691136275, + "grad_norm": 2.91632080078125, + "learning_rate": 
1.418916567528239e-09, + "loss": 0.3493, + "step": 22878 + }, + { + "epoch": 2.9781335415853185, + "grad_norm": 2.812035322189331, + "learning_rate": 1.3695689984477566e-09, + "loss": 0.3718, + "step": 22881 + }, + { + "epoch": 2.978524014057009, + "grad_norm": 2.8257293701171875, + "learning_rate": 1.3210946795877423e-09, + "loss": 0.3545, + "step": 22884 + }, + { + "epoch": 2.9789144865286996, + "grad_norm": 2.968236207962036, + "learning_rate": 1.2734936194164215e-09, + "loss": 0.3936, + "step": 22887 + }, + { + "epoch": 2.9793049590003906, + "grad_norm": 2.8840749263763428, + "learning_rate": 1.2267658262504756e-09, + "loss": 0.4015, + "step": 22890 + }, + { + "epoch": 2.979695431472081, + "grad_norm": 2.828843355178833, + "learning_rate": 1.1809113082522638e-09, + "loss": 0.2979, + "step": 22893 + }, + { + "epoch": 2.980085903943772, + "grad_norm": 2.849477767944336, + "learning_rate": 1.1359300734331558e-09, + "loss": 0.3837, + "step": 22896 + }, + { + "epoch": 2.9804763764154627, + "grad_norm": 2.721007823944092, + "learning_rate": 1.0918221296507547e-09, + "loss": 0.4118, + "step": 22899 + }, + { + "epoch": 2.9808668488871533, + "grad_norm": 2.5880935192108154, + "learning_rate": 1.0485874846111189e-09, + "loss": 0.3802, + "step": 22902 + }, + { + "epoch": 2.9812573213588442, + "grad_norm": 2.763317108154297, + "learning_rate": 1.0062261458670952e-09, + "loss": 0.3266, + "step": 22905 + }, + { + "epoch": 2.981647793830535, + "grad_norm": 2.6199758052825928, + "learning_rate": 9.647381208188755e-10, + "loss": 0.3894, + "step": 22908 + }, + { + "epoch": 2.982038266302226, + "grad_norm": 3.725672960281372, + "learning_rate": 9.241234167139956e-10, + "loss": 0.4053, + "step": 22911 + }, + { + "epoch": 2.9824287387739163, + "grad_norm": 2.939629077911377, + "learning_rate": 8.843820406490011e-10, + "loss": 0.3776, + "step": 22914 + }, + { + "epoch": 2.9828192112456073, + "grad_norm": 2.6625006198883057, + "learning_rate": 8.455139995655615e-10, + "loss": 0.3032, + "step": 22917 + }, + { + "epoch": 2.983209683717298, + "grad_norm": 2.914444923400879, + "learning_rate": 8.075193002538006e-10, + "loss": 0.3967, + "step": 22920 + }, + { + "epoch": 2.983600156188989, + "grad_norm": 2.957470655441284, + "learning_rate": 7.703979493522973e-10, + "loss": 0.3776, + "step": 22923 + }, + { + "epoch": 2.9839906286606794, + "grad_norm": 3.152146100997925, + "learning_rate": 7.341499533447538e-10, + "loss": 0.3585, + "step": 22926 + }, + { + "epoch": 2.98438110113237, + "grad_norm": 2.8697428703308105, + "learning_rate": 6.987753185649926e-10, + "loss": 0.4167, + "step": 22929 + }, + { + "epoch": 2.984771573604061, + "grad_norm": 3.1371231079101562, + "learning_rate": 6.642740511914047e-10, + "loss": 0.3816, + "step": 22932 + }, + { + "epoch": 2.9851620460757515, + "grad_norm": 3.021503210067749, + "learning_rate": 6.306461572525013e-10, + "loss": 0.3535, + "step": 22935 + }, + { + "epoch": 2.9855525185474425, + "grad_norm": 2.5989933013916016, + "learning_rate": 5.978916426230274e-10, + "loss": 0.3127, + "step": 22938 + }, + { + "epoch": 2.985942991019133, + "grad_norm": 2.7822329998016357, + "learning_rate": 5.660105130239624e-10, + "loss": 0.354, + "step": 22941 + }, + { + "epoch": 2.9863334634908236, + "grad_norm": 2.7127301692962646, + "learning_rate": 5.350027740258501e-10, + "loss": 0.3868, + "step": 22944 + }, + { + "epoch": 2.9867239359625146, + "grad_norm": 3.0589182376861572, + "learning_rate": 5.048684310454688e-10, + "loss": 0.3267, + "step": 22947 + }, + { + "epoch": 2.9871144084342056, 
+ "grad_norm": 2.661449670791626, + "learning_rate": 4.756074893469409e-10, + "loss": 0.3292, + "step": 22950 + }, + { + "epoch": 2.987504880905896, + "grad_norm": 2.9641380310058594, + "learning_rate": 4.4721995404284345e-10, + "loss": 0.4186, + "step": 22953 + }, + { + "epoch": 2.9878953533775867, + "grad_norm": 2.9655439853668213, + "learning_rate": 4.197058300914325e-10, + "loss": 0.3803, + "step": 22956 + }, + { + "epoch": 2.9882858258492777, + "grad_norm": 3.2909016609191895, + "learning_rate": 3.930651222999737e-10, + "loss": 0.391, + "step": 22959 + }, + { + "epoch": 2.9886762983209683, + "grad_norm": 2.9940195083618164, + "learning_rate": 3.6729783532196696e-10, + "loss": 0.3484, + "step": 22962 + }, + { + "epoch": 2.9890667707926593, + "grad_norm": 3.0727415084838867, + "learning_rate": 3.424039736599216e-10, + "loss": 0.3758, + "step": 22965 + }, + { + "epoch": 2.98945724326435, + "grad_norm": 2.5275540351867676, + "learning_rate": 3.1838354166202623e-10, + "loss": 0.3679, + "step": 22968 + }, + { + "epoch": 2.9898477157360404, + "grad_norm": 2.776193380355835, + "learning_rate": 2.9523654352492384e-10, + "loss": 0.354, + "step": 22971 + }, + { + "epoch": 2.9902381882077314, + "grad_norm": 2.7540743350982666, + "learning_rate": 2.7296298329204664e-10, + "loss": 0.3332, + "step": 22974 + }, + { + "epoch": 2.9906286606794223, + "grad_norm": 2.7912518978118896, + "learning_rate": 2.5156286485417126e-10, + "loss": 0.327, + "step": 22977 + }, + { + "epoch": 2.991019133151113, + "grad_norm": 3.1262850761413574, + "learning_rate": 2.3103619195052884e-10, + "loss": 0.3989, + "step": 22980 + }, + { + "epoch": 2.9914096056228034, + "grad_norm": 3.24723744392395, + "learning_rate": 2.1138296816713976e-10, + "loss": 0.3658, + "step": 22983 + }, + { + "epoch": 2.9918000780944944, + "grad_norm": 2.843860149383545, + "learning_rate": 1.9260319693736874e-10, + "loss": 0.3578, + "step": 22986 + }, + { + "epoch": 2.992190550566185, + "grad_norm": 3.0490710735321045, + "learning_rate": 1.7469688154136966e-10, + "loss": 0.3589, + "step": 22989 + }, + { + "epoch": 2.992581023037876, + "grad_norm": 2.6627109050750732, + "learning_rate": 1.5766402510775102e-10, + "loss": 0.3676, + "step": 22992 + }, + { + "epoch": 2.9929714955095665, + "grad_norm": 3.4538979530334473, + "learning_rate": 1.4150463061191055e-10, + "loss": 0.3702, + "step": 22995 + }, + { + "epoch": 2.993361967981257, + "grad_norm": 2.638334274291992, + "learning_rate": 1.2621870087714538e-10, + "loss": 0.3135, + "step": 22998 + }, + { + "epoch": 2.993752440452948, + "grad_norm": 2.811047077178955, + "learning_rate": 1.11806238574097e-10, + "loss": 0.3619, + "step": 23001 + }, + { + "epoch": 2.994142912924639, + "grad_norm": 2.9179980754852295, + "learning_rate": 9.826724622019613e-11, + "loss": 0.3805, + "step": 23004 + }, + { + "epoch": 2.9945333853963296, + "grad_norm": 2.6326956748962402, + "learning_rate": 8.56017261807729e-11, + "loss": 0.3641, + "step": 23007 + }, + { + "epoch": 2.99492385786802, + "grad_norm": 2.799358367919922, + "learning_rate": 7.380968066794669e-11, + "loss": 0.3647, + "step": 23010 + }, + { + "epoch": 2.995314330339711, + "grad_norm": 2.7768449783325195, + "learning_rate": 6.289111174284657e-11, + "loss": 0.3445, + "step": 23013 + }, + { + "epoch": 2.9957048028114017, + "grad_norm": 2.603933095932007, + "learning_rate": 5.284602131228056e-11, + "loss": 0.3638, + "step": 23016 + }, + { + "epoch": 2.9960952752830927, + "grad_norm": 2.9793689250946045, + "learning_rate": 4.367441113095616e-11, + "loss": 0.393, 
+ "step": 23019 + }, + { + "epoch": 2.9964857477547833, + "grad_norm": 3.043372631072998, + "learning_rate": 3.53762828009252e-11, + "loss": 0.4288, + "step": 23022 + }, + { + "epoch": 2.996876220226474, + "grad_norm": 3.007805109024048, + "learning_rate": 2.795163777269405e-11, + "loss": 0.3576, + "step": 23025 + }, + { + "epoch": 2.997266692698165, + "grad_norm": 2.5332658290863037, + "learning_rate": 2.1400477343003213e-11, + "loss": 0.3063, + "step": 23028 + }, + { + "epoch": 2.9976571651698554, + "grad_norm": 2.6876015663146973, + "learning_rate": 1.572280265649262e-11, + "loss": 0.3617, + "step": 23031 + }, + { + "epoch": 2.9980476376415464, + "grad_norm": 3.169299840927124, + "learning_rate": 1.0918614704036323e-11, + "loss": 0.4214, + "step": 23034 + }, + { + "epoch": 2.998438110113237, + "grad_norm": 2.8437774181365967, + "learning_rate": 6.987914326073153e-12, + "loss": 0.3391, + "step": 23037 + }, + { + "epoch": 2.998828582584928, + "grad_norm": 3.0276384353637695, + "learning_rate": 3.93070220927605e-12, + "loss": 0.3807, + "step": 23040 + }, + { + "epoch": 2.9992190550566185, + "grad_norm": 2.6813414096832275, + "learning_rate": 1.7469788865520642e-12, + "loss": 0.3787, + "step": 23043 + }, + { + "epoch": 2.9996095275283094, + "grad_norm": 2.8834426403045654, + "learning_rate": 4.367447409281411e-13, + "loss": 0.3336, + "step": 23046 + }, + { + "epoch": 3.0, + "grad_norm": 5.5650634765625, + "learning_rate": 0.0, + "loss": 0.2652, + "step": 23049 + } + ], + "logging_steps": 3, + "max_steps": 23049, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.421608510493819e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}