diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30415 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 4339, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00023046784973496196, + "grad_norm": 49.091495513916016, + "learning_rate": 7.633587786259542e-07, + "loss": 2.2176, + "step": 1 + }, + { + "epoch": 0.0004609356994699239, + "grad_norm": 46.45875930786133, + "learning_rate": 1.5267175572519084e-06, + "loss": 2.1992, + "step": 2 + }, + { + "epoch": 0.0006914035492048859, + "grad_norm": 47.82147979736328, + "learning_rate": 2.2900763358778625e-06, + "loss": 2.2094, + "step": 3 + }, + { + "epoch": 0.0009218713989398478, + "grad_norm": 44.642494201660156, + "learning_rate": 3.053435114503817e-06, + "loss": 2.1912, + "step": 4 + }, + { + "epoch": 0.00115233924867481, + "grad_norm": 35.80204772949219, + "learning_rate": 3.816793893129772e-06, + "loss": 2.0999, + "step": 5 + }, + { + "epoch": 0.0013828070984097719, + "grad_norm": 30.77144432067871, + "learning_rate": 4.580152671755725e-06, + "loss": 2.0554, + "step": 6 + }, + { + "epoch": 0.0016132749481447338, + "grad_norm": 16.0904483795166, + "learning_rate": 5.343511450381679e-06, + "loss": 1.9752, + "step": 7 + }, + { + "epoch": 0.0018437427978796957, + "grad_norm": 21.500225067138672, + "learning_rate": 6.106870229007634e-06, + "loss": 1.8799, + "step": 8 + }, + { + "epoch": 0.0020742106476146576, + "grad_norm": 25.848834991455078, + "learning_rate": 6.870229007633589e-06, + "loss": 2.0303, + "step": 9 + }, + { + "epoch": 0.00230467849734962, + "grad_norm": 21.812026977539062, + "learning_rate": 7.633587786259543e-06, + "loss": 1.9578, + "step": 10 + }, + { + "epoch": 0.002535146347084582, + "grad_norm": 29.04991912841797, + "learning_rate": 8.396946564885497e-06, + "loss": 1.7629, + "step": 11 + }, + { + "epoch": 0.0027656141968195437, + "grad_norm": 6.3423566818237305, + "learning_rate": 9.16030534351145e-06, + "loss": 1.6577, + "step": 12 + }, + { + "epoch": 0.0029960820465545056, + "grad_norm": 37.8600959777832, + "learning_rate": 9.923664122137405e-06, + "loss": 1.7216, + "step": 13 + }, + { + "epoch": 0.0032265498962894676, + "grad_norm": 13.13255500793457, + "learning_rate": 1.0687022900763359e-05, + "loss": 1.6066, + "step": 14 + }, + { + "epoch": 0.0034570177460244295, + "grad_norm": 5.612797260284424, + "learning_rate": 1.1450381679389314e-05, + "loss": 1.5293, + "step": 15 + }, + { + "epoch": 0.0036874855957593914, + "grad_norm": 4.661881923675537, + "learning_rate": 1.2213740458015267e-05, + "loss": 1.5133, + "step": 16 + }, + { + "epoch": 0.003917953445494353, + "grad_norm": 2.878075361251831, + "learning_rate": 1.2977099236641221e-05, + "loss": 1.4992, + "step": 17 + }, + { + "epoch": 0.004148421295229315, + "grad_norm": 2.4321236610412598, + "learning_rate": 1.3740458015267178e-05, + "loss": 1.4797, + "step": 18 + }, + { + "epoch": 0.004378889144964277, + "grad_norm": 2.1403255462646484, + "learning_rate": 1.450381679389313e-05, + "loss": 1.4561, + "step": 19 + }, + { + "epoch": 0.00460935699469924, + "grad_norm": 1.831888198852539, + "learning_rate": 1.5267175572519086e-05, + "loss": 1.4504, + "step": 20 + }, + { + "epoch": 0.004839824844434202, + "grad_norm": 2.384835958480835, + "learning_rate": 1.6030534351145038e-05, + "loss": 1.4364, + "step": 21 + }, + { + "epoch": 0.005070292694169164, + "grad_norm": 1.8255668878555298, + "learning_rate": 1.6793893129770993e-05, + "loss": 1.4241, + "step": 22 + }, + { + "epoch": 0.005300760543904126, + "grad_norm": 20.282546997070312, + "learning_rate": 1.7557251908396945e-05, + "loss": 1.4204, + "step": 23 + }, + { + "epoch": 0.0055312283936390875, + "grad_norm": 1.2769057750701904, + "learning_rate": 1.83206106870229e-05, + "loss": 1.4212, + "step": 24 + }, + { + "epoch": 0.005761696243374049, + "grad_norm": 1.2663878202438354, + "learning_rate": 1.9083969465648855e-05, + "loss": 1.3945, + "step": 25 + }, + { + "epoch": 0.005992164093109011, + "grad_norm": 1.3147037029266357, + "learning_rate": 1.984732824427481e-05, + "loss": 1.3976, + "step": 26 + }, + { + "epoch": 0.006222631942843973, + "grad_norm": 1.3301242589950562, + "learning_rate": 2.0610687022900766e-05, + "loss": 1.3872, + "step": 27 + }, + { + "epoch": 0.006453099792578935, + "grad_norm": 1.579620599746704, + "learning_rate": 2.1374045801526718e-05, + "loss": 1.3758, + "step": 28 + }, + { + "epoch": 0.006683567642313897, + "grad_norm": 1.2853105068206787, + "learning_rate": 2.2137404580152673e-05, + "loss": 1.3719, + "step": 29 + }, + { + "epoch": 0.006914035492048859, + "grad_norm": 1.264007329940796, + "learning_rate": 2.2900763358778628e-05, + "loss": 1.3545, + "step": 30 + }, + { + "epoch": 0.007144503341783821, + "grad_norm": 1.1497324705123901, + "learning_rate": 2.3664122137404583e-05, + "loss": 1.3492, + "step": 31 + }, + { + "epoch": 0.007374971191518783, + "grad_norm": 0.9906431436538696, + "learning_rate": 2.4427480916030535e-05, + "loss": 1.3537, + "step": 32 + }, + { + "epoch": 0.0076054390412537455, + "grad_norm": 0.7753263115882874, + "learning_rate": 2.5190839694656487e-05, + "loss": 1.3452, + "step": 33 + }, + { + "epoch": 0.007835906890988707, + "grad_norm": 0.7904003858566284, + "learning_rate": 2.5954198473282442e-05, + "loss": 1.3312, + "step": 34 + }, + { + "epoch": 0.00806637474072367, + "grad_norm": 0.826572597026825, + "learning_rate": 2.6717557251908397e-05, + "loss": 1.3327, + "step": 35 + }, + { + "epoch": 0.00829684259045863, + "grad_norm": 0.7653324007987976, + "learning_rate": 2.7480916030534355e-05, + "loss": 1.3333, + "step": 36 + }, + { + "epoch": 0.008527310440193593, + "grad_norm": 0.6942451000213623, + "learning_rate": 2.824427480916031e-05, + "loss": 1.3149, + "step": 37 + }, + { + "epoch": 0.008757778289928554, + "grad_norm": 0.6542969942092896, + "learning_rate": 2.900763358778626e-05, + "loss": 1.3153, + "step": 38 + }, + { + "epoch": 0.008988246139663517, + "grad_norm": 0.619317889213562, + "learning_rate": 2.9770992366412214e-05, + "loss": 1.3035, + "step": 39 + }, + { + "epoch": 0.00921871398939848, + "grad_norm": 0.5745211839675903, + "learning_rate": 3.053435114503817e-05, + "loss": 1.303, + "step": 40 + }, + { + "epoch": 0.00944918183913344, + "grad_norm": 0.5202656388282776, + "learning_rate": 3.129770992366413e-05, + "loss": 1.2884, + "step": 41 + }, + { + "epoch": 0.009679649688868404, + "grad_norm": 0.5099622011184692, + "learning_rate": 3.2061068702290076e-05, + "loss": 1.2864, + "step": 42 + }, + { + "epoch": 0.009910117538603365, + "grad_norm": 0.5038639307022095, + "learning_rate": 3.282442748091603e-05, + "loss": 1.2713, + "step": 43 + }, + { + "epoch": 0.010140585388338327, + "grad_norm": 0.5165843963623047, + "learning_rate": 3.358778625954199e-05, + "loss": 1.2843, + "step": 44 + }, + { + "epoch": 0.010371053238073288, + "grad_norm": 0.4482625424861908, + "learning_rate": 3.435114503816794e-05, + "loss": 1.2722, + "step": 45 + }, + { + "epoch": 0.010601521087808251, + "grad_norm": 0.4263209402561188, + "learning_rate": 3.511450381679389e-05, + "loss": 1.2686, + "step": 46 + }, + { + "epoch": 0.010831988937543212, + "grad_norm": 0.4388543367385864, + "learning_rate": 3.5877862595419845e-05, + "loss": 1.2519, + "step": 47 + }, + { + "epoch": 0.011062456787278175, + "grad_norm": 0.40465718507766724, + "learning_rate": 3.66412213740458e-05, + "loss": 1.2509, + "step": 48 + }, + { + "epoch": 0.011292924637013136, + "grad_norm": 0.4082145094871521, + "learning_rate": 3.7404580152671756e-05, + "loss": 1.2527, + "step": 49 + }, + { + "epoch": 0.011523392486748099, + "grad_norm": 0.40573209524154663, + "learning_rate": 3.816793893129771e-05, + "loss": 1.2557, + "step": 50 + }, + { + "epoch": 0.01175386033648306, + "grad_norm": 0.39484360814094543, + "learning_rate": 3.8931297709923666e-05, + "loss": 1.2488, + "step": 51 + }, + { + "epoch": 0.011984328186218023, + "grad_norm": 0.32370519638061523, + "learning_rate": 3.969465648854962e-05, + "loss": 1.2396, + "step": 52 + }, + { + "epoch": 0.012214796035952985, + "grad_norm": 0.36196169257164, + "learning_rate": 4.0458015267175576e-05, + "loss": 1.2454, + "step": 53 + }, + { + "epoch": 0.012445263885687946, + "grad_norm": 0.36752983927726746, + "learning_rate": 4.122137404580153e-05, + "loss": 1.2354, + "step": 54 + }, + { + "epoch": 0.01267573173542291, + "grad_norm": 0.312123566865921, + "learning_rate": 4.198473282442748e-05, + "loss": 1.2354, + "step": 55 + }, + { + "epoch": 0.01290619958515787, + "grad_norm": 0.30579739809036255, + "learning_rate": 4.2748091603053435e-05, + "loss": 1.2225, + "step": 56 + }, + { + "epoch": 0.013136667434892833, + "grad_norm": 0.3238474428653717, + "learning_rate": 4.351145038167939e-05, + "loss": 1.233, + "step": 57 + }, + { + "epoch": 0.013367135284627794, + "grad_norm": 0.32208338379859924, + "learning_rate": 4.4274809160305345e-05, + "loss": 1.2306, + "step": 58 + }, + { + "epoch": 0.013597603134362757, + "grad_norm": 0.3156214654445648, + "learning_rate": 4.5038167938931294e-05, + "loss": 1.22, + "step": 59 + }, + { + "epoch": 0.013828070984097718, + "grad_norm": 0.27707698941230774, + "learning_rate": 4.5801526717557256e-05, + "loss": 1.2219, + "step": 60 + }, + { + "epoch": 0.01405853883383268, + "grad_norm": 0.29033875465393066, + "learning_rate": 4.656488549618321e-05, + "loss": 1.214, + "step": 61 + }, + { + "epoch": 0.014289006683567642, + "grad_norm": 0.316847562789917, + "learning_rate": 4.7328244274809166e-05, + "loss": 1.2227, + "step": 62 + }, + { + "epoch": 0.014519474533302604, + "grad_norm": 0.2887651324272156, + "learning_rate": 4.809160305343512e-05, + "loss": 1.2157, + "step": 63 + }, + { + "epoch": 0.014749942383037565, + "grad_norm": 0.36876314878463745, + "learning_rate": 4.885496183206107e-05, + "loss": 1.1988, + "step": 64 + }, + { + "epoch": 0.014980410232772528, + "grad_norm": 0.2880174517631531, + "learning_rate": 4.9618320610687025e-05, + "loss": 1.2203, + "step": 65 + }, + { + "epoch": 0.015210878082507491, + "grad_norm": 0.2820625603199005, + "learning_rate": 5.038167938931297e-05, + "loss": 1.2109, + "step": 66 + }, + { + "epoch": 0.015441345932242452, + "grad_norm": 0.27386295795440674, + "learning_rate": 5.114503816793893e-05, + "loss": 1.2097, + "step": 67 + }, + { + "epoch": 0.015671813781977413, + "grad_norm": 0.26974838972091675, + "learning_rate": 5.1908396946564884e-05, + "loss": 1.205, + "step": 68 + }, + { + "epoch": 0.015902281631712378, + "grad_norm": 0.2931689918041229, + "learning_rate": 5.267175572519084e-05, + "loss": 1.2097, + "step": 69 + }, + { + "epoch": 0.01613274948144734, + "grad_norm": 0.2843622863292694, + "learning_rate": 5.3435114503816794e-05, + "loss": 1.1955, + "step": 70 + }, + { + "epoch": 0.0163632173311823, + "grad_norm": 0.28659501671791077, + "learning_rate": 5.419847328244275e-05, + "loss": 1.1836, + "step": 71 + }, + { + "epoch": 0.01659368518091726, + "grad_norm": 0.2758232355117798, + "learning_rate": 5.496183206106871e-05, + "loss": 1.2005, + "step": 72 + }, + { + "epoch": 0.016824153030652225, + "grad_norm": 0.37796103954315186, + "learning_rate": 5.5725190839694666e-05, + "loss": 1.1996, + "step": 73 + }, + { + "epoch": 0.017054620880387186, + "grad_norm": 0.6489807963371277, + "learning_rate": 5.648854961832062e-05, + "loss": 1.1854, + "step": 74 + }, + { + "epoch": 0.017285088730122147, + "grad_norm": 0.9581599831581116, + "learning_rate": 5.725190839694656e-05, + "loss": 1.1948, + "step": 75 + }, + { + "epoch": 0.01751555657985711, + "grad_norm": 0.6904142498970032, + "learning_rate": 5.801526717557252e-05, + "loss": 1.1835, + "step": 76 + }, + { + "epoch": 0.017746024429592073, + "grad_norm": 0.546022355556488, + "learning_rate": 5.877862595419847e-05, + "loss": 1.1789, + "step": 77 + }, + { + "epoch": 0.017976492279327034, + "grad_norm": 0.7557573914527893, + "learning_rate": 5.954198473282443e-05, + "loss": 1.1842, + "step": 78 + }, + { + "epoch": 0.018206960129061995, + "grad_norm": 0.40329957008361816, + "learning_rate": 6.0305343511450384e-05, + "loss": 1.179, + "step": 79 + }, + { + "epoch": 0.01843742797879696, + "grad_norm": 0.5815610885620117, + "learning_rate": 6.106870229007635e-05, + "loss": 1.173, + "step": 80 + }, + { + "epoch": 0.01866789582853192, + "grad_norm": 0.5922554135322571, + "learning_rate": 6.18320610687023e-05, + "loss": 1.1773, + "step": 81 + }, + { + "epoch": 0.01889836367826688, + "grad_norm": 0.347842276096344, + "learning_rate": 6.259541984732826e-05, + "loss": 1.1756, + "step": 82 + }, + { + "epoch": 0.019128831528001843, + "grad_norm": 0.8533177971839905, + "learning_rate": 6.33587786259542e-05, + "loss": 1.1734, + "step": 83 + }, + { + "epoch": 0.019359299377736807, + "grad_norm": 0.9222707152366638, + "learning_rate": 6.412213740458015e-05, + "loss": 1.175, + "step": 84 + }, + { + "epoch": 0.019589767227471768, + "grad_norm": 0.4554316997528076, + "learning_rate": 6.488549618320611e-05, + "loss": 1.1612, + "step": 85 + }, + { + "epoch": 0.01982023507720673, + "grad_norm": 0.6344988346099854, + "learning_rate": 6.564885496183206e-05, + "loss": 1.1587, + "step": 86 + }, + { + "epoch": 0.02005070292694169, + "grad_norm": 0.8068343997001648, + "learning_rate": 6.641221374045802e-05, + "loss": 1.1671, + "step": 87 + }, + { + "epoch": 0.020281170776676655, + "grad_norm": 0.4366961121559143, + "learning_rate": 6.717557251908397e-05, + "loss": 1.169, + "step": 88 + }, + { + "epoch": 0.020511638626411616, + "grad_norm": 0.666015088558197, + "learning_rate": 6.793893129770993e-05, + "loss": 1.1451, + "step": 89 + }, + { + "epoch": 0.020742106476146577, + "grad_norm": 0.6575168371200562, + "learning_rate": 6.870229007633588e-05, + "loss": 1.1572, + "step": 90 + }, + { + "epoch": 0.020972574325881538, + "grad_norm": 0.4764537513256073, + "learning_rate": 6.946564885496184e-05, + "loss": 1.1506, + "step": 91 + }, + { + "epoch": 0.021203042175616502, + "grad_norm": 0.6614679098129272, + "learning_rate": 7.022900763358778e-05, + "loss": 1.1677, + "step": 92 + }, + { + "epoch": 0.021433510025351463, + "grad_norm": 0.6387923955917358, + "learning_rate": 7.099236641221374e-05, + "loss": 1.1541, + "step": 93 + }, + { + "epoch": 0.021663977875086424, + "grad_norm": 0.5480379462242126, + "learning_rate": 7.175572519083969e-05, + "loss": 1.1419, + "step": 94 + }, + { + "epoch": 0.02189444572482139, + "grad_norm": 0.7831230759620667, + "learning_rate": 7.251908396946565e-05, + "loss": 1.1501, + "step": 95 + }, + { + "epoch": 0.02212491357455635, + "grad_norm": 0.6071659326553345, + "learning_rate": 7.32824427480916e-05, + "loss": 1.1424, + "step": 96 + }, + { + "epoch": 0.02235538142429131, + "grad_norm": 0.4921586513519287, + "learning_rate": 7.404580152671756e-05, + "loss": 1.1442, + "step": 97 + }, + { + "epoch": 0.022585849274026272, + "grad_norm": 0.9020814299583435, + "learning_rate": 7.480916030534351e-05, + "loss": 1.1541, + "step": 98 + }, + { + "epoch": 0.022816317123761237, + "grad_norm": 0.9073303937911987, + "learning_rate": 7.557251908396947e-05, + "loss": 1.154, + "step": 99 + }, + { + "epoch": 0.023046784973496198, + "grad_norm": 0.5485337376594543, + "learning_rate": 7.633587786259542e-05, + "loss": 1.1372, + "step": 100 + }, + { + "epoch": 0.02327725282323116, + "grad_norm": 1.202399492263794, + "learning_rate": 7.709923664122138e-05, + "loss": 1.1339, + "step": 101 + }, + { + "epoch": 0.02350772067296612, + "grad_norm": 1.6502957344055176, + "learning_rate": 7.786259541984733e-05, + "loss": 1.1466, + "step": 102 + }, + { + "epoch": 0.023738188522701084, + "grad_norm": 0.9507301449775696, + "learning_rate": 7.862595419847329e-05, + "loss": 1.1418, + "step": 103 + }, + { + "epoch": 0.023968656372436045, + "grad_norm": 1.0010653734207153, + "learning_rate": 7.938931297709924e-05, + "loss": 1.1503, + "step": 104 + }, + { + "epoch": 0.024199124222171006, + "grad_norm": 0.6749522089958191, + "learning_rate": 8.01526717557252e-05, + "loss": 1.1335, + "step": 105 + }, + { + "epoch": 0.02442959207190597, + "grad_norm": 0.7454925775527954, + "learning_rate": 8.091603053435115e-05, + "loss": 1.1353, + "step": 106 + }, + { + "epoch": 0.024660059921640932, + "grad_norm": 0.6050741672515869, + "learning_rate": 8.167938931297711e-05, + "loss": 1.1396, + "step": 107 + }, + { + "epoch": 0.024890527771375893, + "grad_norm": 0.8601208925247192, + "learning_rate": 8.244274809160306e-05, + "loss": 1.135, + "step": 108 + }, + { + "epoch": 0.025120995621110854, + "grad_norm": 0.6721903085708618, + "learning_rate": 8.320610687022902e-05, + "loss": 1.1259, + "step": 109 + }, + { + "epoch": 0.02535146347084582, + "grad_norm": 0.5092166662216187, + "learning_rate": 8.396946564885496e-05, + "loss": 1.1276, + "step": 110 + }, + { + "epoch": 0.02558193132058078, + "grad_norm": 0.5667235851287842, + "learning_rate": 8.473282442748092e-05, + "loss": 1.1262, + "step": 111 + }, + { + "epoch": 0.02581239917031574, + "grad_norm": 0.6341401934623718, + "learning_rate": 8.549618320610687e-05, + "loss": 1.1269, + "step": 112 + }, + { + "epoch": 0.0260428670200507, + "grad_norm": 0.5853081941604614, + "learning_rate": 8.625954198473283e-05, + "loss": 1.1321, + "step": 113 + }, + { + "epoch": 0.026273334869785666, + "grad_norm": 0.514251708984375, + "learning_rate": 8.702290076335878e-05, + "loss": 1.1322, + "step": 114 + }, + { + "epoch": 0.026503802719520627, + "grad_norm": 0.8147274255752563, + "learning_rate": 8.778625954198474e-05, + "loss": 1.1153, + "step": 115 + }, + { + "epoch": 0.026734270569255588, + "grad_norm": 1.1456818580627441, + "learning_rate": 8.854961832061069e-05, + "loss": 1.1361, + "step": 116 + }, + { + "epoch": 0.02696473841899055, + "grad_norm": 1.8368473052978516, + "learning_rate": 8.931297709923665e-05, + "loss": 1.1387, + "step": 117 + }, + { + "epoch": 0.027195206268725514, + "grad_norm": 2.4498660564422607, + "learning_rate": 9.007633587786259e-05, + "loss": 1.1505, + "step": 118 + }, + { + "epoch": 0.027425674118460475, + "grad_norm": 2.034926652908325, + "learning_rate": 9.083969465648856e-05, + "loss": 1.1443, + "step": 119 + }, + { + "epoch": 0.027656141968195436, + "grad_norm": 1.2848972082138062, + "learning_rate": 9.160305343511451e-05, + "loss": 1.1466, + "step": 120 + }, + { + "epoch": 0.0278866098179304, + "grad_norm": 1.265265703201294, + "learning_rate": 9.236641221374047e-05, + "loss": 1.1345, + "step": 121 + }, + { + "epoch": 0.02811707766766536, + "grad_norm": 0.8716845512390137, + "learning_rate": 9.312977099236642e-05, + "loss": 1.1412, + "step": 122 + }, + { + "epoch": 0.028347545517400322, + "grad_norm": 0.9397414922714233, + "learning_rate": 9.389312977099238e-05, + "loss": 1.1358, + "step": 123 + }, + { + "epoch": 0.028578013367135283, + "grad_norm": 0.9730684757232666, + "learning_rate": 9.465648854961833e-05, + "loss": 1.1232, + "step": 124 + }, + { + "epoch": 0.028808481216870248, + "grad_norm": 0.8047937154769897, + "learning_rate": 9.541984732824429e-05, + "loss": 1.1329, + "step": 125 + }, + { + "epoch": 0.02903894906660521, + "grad_norm": 0.7371867895126343, + "learning_rate": 9.618320610687024e-05, + "loss": 1.1292, + "step": 126 + }, + { + "epoch": 0.02926941691634017, + "grad_norm": 0.9497559666633606, + "learning_rate": 9.694656488549618e-05, + "loss": 1.1213, + "step": 127 + }, + { + "epoch": 0.02949988476607513, + "grad_norm": 0.6228593587875366, + "learning_rate": 9.770992366412214e-05, + "loss": 1.1112, + "step": 128 + }, + { + "epoch": 0.029730352615810095, + "grad_norm": 0.8247762322425842, + "learning_rate": 9.84732824427481e-05, + "loss": 1.1187, + "step": 129 + }, + { + "epoch": 0.029960820465545056, + "grad_norm": 0.6637692451477051, + "learning_rate": 9.923664122137405e-05, + "loss": 1.1046, + "step": 130 + }, + { + "epoch": 0.030191288315280018, + "grad_norm": 0.8043597936630249, + "learning_rate": 0.0001, + "loss": 1.1031, + "step": 131 + }, + { + "epoch": 0.030421756165014982, + "grad_norm": 0.6135521531105042, + "learning_rate": 9.999998606560007e-05, + "loss": 1.1086, + "step": 132 + }, + { + "epoch": 0.030652224014749943, + "grad_norm": 0.5676527619361877, + "learning_rate": 9.999994426240797e-05, + "loss": 1.0931, + "step": 133 + }, + { + "epoch": 0.030882691864484904, + "grad_norm": 0.6945146918296814, + "learning_rate": 9.999987459044706e-05, + "loss": 1.1038, + "step": 134 + }, + { + "epoch": 0.031113159714219865, + "grad_norm": 0.6486102938652039, + "learning_rate": 9.999977704975617e-05, + "loss": 1.1031, + "step": 135 + }, + { + "epoch": 0.031343627563954826, + "grad_norm": 0.7887089848518372, + "learning_rate": 9.999965164038963e-05, + "loss": 1.1089, + "step": 136 + }, + { + "epoch": 0.03157409541368979, + "grad_norm": 1.3936797380447388, + "learning_rate": 9.999949836241736e-05, + "loss": 1.1004, + "step": 137 + }, + { + "epoch": 0.031804563263424755, + "grad_norm": 2.2348039150238037, + "learning_rate": 9.999931721592481e-05, + "loss": 1.1181, + "step": 138 + }, + { + "epoch": 0.032035031113159716, + "grad_norm": 1.54178786277771, + "learning_rate": 9.999910820101293e-05, + "loss": 1.1246, + "step": 139 + }, + { + "epoch": 0.03226549896289468, + "grad_norm": 0.7922399640083313, + "learning_rate": 9.99988713177982e-05, + "loss": 1.1064, + "step": 140 + }, + { + "epoch": 0.03249596681262964, + "grad_norm": 0.8318815231323242, + "learning_rate": 9.999860656641268e-05, + "loss": 1.1061, + "step": 141 + }, + { + "epoch": 0.0327264346623646, + "grad_norm": 0.800605297088623, + "learning_rate": 9.999831394700395e-05, + "loss": 1.1053, + "step": 142 + }, + { + "epoch": 0.03295690251209956, + "grad_norm": 0.6274577379226685, + "learning_rate": 9.999799345973506e-05, + "loss": 1.0969, + "step": 143 + }, + { + "epoch": 0.03318737036183452, + "grad_norm": 0.7087227702140808, + "learning_rate": 9.99976451047847e-05, + "loss": 1.0991, + "step": 144 + }, + { + "epoch": 0.03341783821156949, + "grad_norm": 0.68040931224823, + "learning_rate": 9.999726888234698e-05, + "loss": 1.0971, + "step": 145 + }, + { + "epoch": 0.03364830606130445, + "grad_norm": 0.6233592629432678, + "learning_rate": 9.999686479263164e-05, + "loss": 1.0923, + "step": 146 + }, + { + "epoch": 0.03387877391103941, + "grad_norm": 0.8178934454917908, + "learning_rate": 9.999643283586388e-05, + "loss": 1.0971, + "step": 147 + }, + { + "epoch": 0.03410924176077437, + "grad_norm": 0.6832984089851379, + "learning_rate": 9.999597301228448e-05, + "loss": 1.0957, + "step": 148 + }, + { + "epoch": 0.034339709610509334, + "grad_norm": 0.49561241269111633, + "learning_rate": 9.999548532214973e-05, + "loss": 1.0898, + "step": 149 + }, + { + "epoch": 0.034570177460244295, + "grad_norm": 0.6173511743545532, + "learning_rate": 9.999496976573145e-05, + "loss": 1.0912, + "step": 150 + }, + { + "epoch": 0.034800645309979256, + "grad_norm": 0.5704337358474731, + "learning_rate": 9.999442634331703e-05, + "loss": 1.0805, + "step": 151 + }, + { + "epoch": 0.03503111315971422, + "grad_norm": 0.4808043837547302, + "learning_rate": 9.999385505520931e-05, + "loss": 1.0877, + "step": 152 + }, + { + "epoch": 0.035261581009449185, + "grad_norm": 0.5703374743461609, + "learning_rate": 9.999325590172675e-05, + "loss": 1.0754, + "step": 153 + }, + { + "epoch": 0.035492048859184146, + "grad_norm": 0.7379648089408875, + "learning_rate": 9.999262888320329e-05, + "loss": 1.0987, + "step": 154 + }, + { + "epoch": 0.03572251670891911, + "grad_norm": 0.7264831066131592, + "learning_rate": 9.999197399998841e-05, + "loss": 1.0787, + "step": 155 + }, + { + "epoch": 0.03595298455865407, + "grad_norm": 0.5585530400276184, + "learning_rate": 9.999129125244714e-05, + "loss": 1.0797, + "step": 156 + }, + { + "epoch": 0.03618345240838903, + "grad_norm": 0.8825198411941528, + "learning_rate": 9.999058064096002e-05, + "loss": 1.0704, + "step": 157 + }, + { + "epoch": 0.03641392025812399, + "grad_norm": 1.1649545431137085, + "learning_rate": 9.998984216592313e-05, + "loss": 1.0844, + "step": 158 + }, + { + "epoch": 0.03664438810785895, + "grad_norm": 1.1949596405029297, + "learning_rate": 9.998907582774807e-05, + "loss": 1.071, + "step": 159 + }, + { + "epoch": 0.03687485595759392, + "grad_norm": 1.635244607925415, + "learning_rate": 9.998828162686197e-05, + "loss": 1.0777, + "step": 160 + }, + { + "epoch": 0.03710532380732888, + "grad_norm": 1.7061165571212769, + "learning_rate": 9.998745956370754e-05, + "loss": 1.0744, + "step": 161 + }, + { + "epoch": 0.03733579165706384, + "grad_norm": 1.4206252098083496, + "learning_rate": 9.998660963874294e-05, + "loss": 1.0777, + "step": 162 + }, + { + "epoch": 0.0375662595067988, + "grad_norm": 1.5643310546875, + "learning_rate": 9.998573185244192e-05, + "loss": 1.0783, + "step": 163 + }, + { + "epoch": 0.03779672735653376, + "grad_norm": 1.2039772272109985, + "learning_rate": 9.998482620529371e-05, + "loss": 1.0766, + "step": 164 + }, + { + "epoch": 0.038027195206268724, + "grad_norm": 0.8104932904243469, + "learning_rate": 9.998389269780312e-05, + "loss": 1.0693, + "step": 165 + }, + { + "epoch": 0.038257663056003685, + "grad_norm": 0.8776142597198486, + "learning_rate": 9.998293133049046e-05, + "loss": 1.0668, + "step": 166 + }, + { + "epoch": 0.038488130905738646, + "grad_norm": 0.837092936038971, + "learning_rate": 9.998194210389157e-05, + "loss": 1.061, + "step": 167 + }, + { + "epoch": 0.038718598755473614, + "grad_norm": 0.8989808559417725, + "learning_rate": 9.998092501855782e-05, + "loss": 1.0699, + "step": 168 + }, + { + "epoch": 0.038949066605208575, + "grad_norm": 0.97697913646698, + "learning_rate": 9.99798800750561e-05, + "loss": 1.0568, + "step": 169 + }, + { + "epoch": 0.039179534454943536, + "grad_norm": 0.8058460354804993, + "learning_rate": 9.997880727396886e-05, + "loss": 1.0501, + "step": 170 + }, + { + "epoch": 0.0394100023046785, + "grad_norm": 0.8557106852531433, + "learning_rate": 9.997770661589403e-05, + "loss": 1.0567, + "step": 171 + }, + { + "epoch": 0.03964047015441346, + "grad_norm": 0.794967532157898, + "learning_rate": 9.997657810144511e-05, + "loss": 1.0555, + "step": 172 + }, + { + "epoch": 0.03987093800414842, + "grad_norm": 0.7823812961578369, + "learning_rate": 9.99754217312511e-05, + "loss": 1.0373, + "step": 173 + }, + { + "epoch": 0.04010140585388338, + "grad_norm": 1.165642261505127, + "learning_rate": 9.997423750595651e-05, + "loss": 1.0486, + "step": 174 + }, + { + "epoch": 0.04033187370361835, + "grad_norm": 1.893602967262268, + "learning_rate": 9.997302542622144e-05, + "loss": 1.0738, + "step": 175 + }, + { + "epoch": 0.04056234155335331, + "grad_norm": 1.9267841577529907, + "learning_rate": 9.997178549272145e-05, + "loss": 1.0595, + "step": 176 + }, + { + "epoch": 0.04079280940308827, + "grad_norm": 1.4017184972763062, + "learning_rate": 9.997051770614765e-05, + "loss": 1.0589, + "step": 177 + }, + { + "epoch": 0.04102327725282323, + "grad_norm": 0.9901034235954285, + "learning_rate": 9.996922206720667e-05, + "loss": 1.0584, + "step": 178 + }, + { + "epoch": 0.04125374510255819, + "grad_norm": 0.9640342593193054, + "learning_rate": 9.996789857662068e-05, + "loss": 1.0513, + "step": 179 + }, + { + "epoch": 0.041484212952293154, + "grad_norm": 0.9942569136619568, + "learning_rate": 9.996654723512736e-05, + "loss": 1.0478, + "step": 180 + }, + { + "epoch": 0.041714680802028115, + "grad_norm": 0.9683095812797546, + "learning_rate": 9.996516804347991e-05, + "loss": 1.0514, + "step": 181 + }, + { + "epoch": 0.041945148651763076, + "grad_norm": 1.039068341255188, + "learning_rate": 9.996376100244704e-05, + "loss": 1.0368, + "step": 182 + }, + { + "epoch": 0.042175616501498044, + "grad_norm": 0.8390737175941467, + "learning_rate": 9.996232611281304e-05, + "loss": 1.033, + "step": 183 + }, + { + "epoch": 0.042406084351233005, + "grad_norm": 0.7644326090812683, + "learning_rate": 9.996086337537767e-05, + "loss": 1.0359, + "step": 184 + }, + { + "epoch": 0.042636552200967966, + "grad_norm": 0.7521729469299316, + "learning_rate": 9.995937279095621e-05, + "loss": 1.0259, + "step": 185 + }, + { + "epoch": 0.04286702005070293, + "grad_norm": 0.7316980361938477, + "learning_rate": 9.995785436037947e-05, + "loss": 1.0388, + "step": 186 + }, + { + "epoch": 0.04309748790043789, + "grad_norm": 0.7523071765899658, + "learning_rate": 9.995630808449383e-05, + "loss": 1.0286, + "step": 187 + }, + { + "epoch": 0.04332795575017285, + "grad_norm": 0.8001862168312073, + "learning_rate": 9.995473396416111e-05, + "loss": 1.0319, + "step": 188 + }, + { + "epoch": 0.04355842359990781, + "grad_norm": 0.859374463558197, + "learning_rate": 9.995313200025869e-05, + "loss": 1.0244, + "step": 189 + }, + { + "epoch": 0.04378889144964278, + "grad_norm": 0.9133301973342896, + "learning_rate": 9.995150219367946e-05, + "loss": 1.0106, + "step": 190 + }, + { + "epoch": 0.04401935929937774, + "grad_norm": 0.8615682125091553, + "learning_rate": 9.994984454533185e-05, + "loss": 1.0147, + "step": 191 + }, + { + "epoch": 0.0442498271491127, + "grad_norm": 0.7992554903030396, + "learning_rate": 9.994815905613981e-05, + "loss": 1.0163, + "step": 192 + }, + { + "epoch": 0.04448029499884766, + "grad_norm": 0.8807731866836548, + "learning_rate": 9.994644572704275e-05, + "loss": 1.0208, + "step": 193 + }, + { + "epoch": 0.04471076284858262, + "grad_norm": 1.1343860626220703, + "learning_rate": 9.994470455899568e-05, + "loss": 1.0152, + "step": 194 + }, + { + "epoch": 0.04494123069831758, + "grad_norm": 1.4376226663589478, + "learning_rate": 9.994293555296904e-05, + "loss": 1.0261, + "step": 195 + }, + { + "epoch": 0.045171698548052544, + "grad_norm": 1.5723721981048584, + "learning_rate": 9.994113870994888e-05, + "loss": 1.0137, + "step": 196 + }, + { + "epoch": 0.04540216639778751, + "grad_norm": 1.2862370014190674, + "learning_rate": 9.993931403093668e-05, + "loss": 1.0159, + "step": 197 + }, + { + "epoch": 0.04563263424752247, + "grad_norm": 1.0813417434692383, + "learning_rate": 9.99374615169495e-05, + "loss": 0.9976, + "step": 198 + }, + { + "epoch": 0.045863102097257434, + "grad_norm": 0.7794204354286194, + "learning_rate": 9.993558116901985e-05, + "loss": 1.0036, + "step": 199 + }, + { + "epoch": 0.046093569946992395, + "grad_norm": 1.225102186203003, + "learning_rate": 9.993367298819583e-05, + "loss": 0.9909, + "step": 200 + }, + { + "epoch": 0.046324037796727356, + "grad_norm": 1.1289259195327759, + "learning_rate": 9.9931736975541e-05, + "loss": 1.0123, + "step": 201 + }, + { + "epoch": 0.04655450564646232, + "grad_norm": 0.755088746547699, + "learning_rate": 9.992977313213443e-05, + "loss": 0.9916, + "step": 202 + }, + { + "epoch": 0.04678497349619728, + "grad_norm": 0.8576580286026001, + "learning_rate": 9.992778145907073e-05, + "loss": 0.9914, + "step": 203 + }, + { + "epoch": 0.04701544134593224, + "grad_norm": 0.8619646430015564, + "learning_rate": 9.992576195746003e-05, + "loss": 0.9963, + "step": 204 + }, + { + "epoch": 0.04724590919566721, + "grad_norm": 0.8409770727157593, + "learning_rate": 9.992371462842794e-05, + "loss": 0.9923, + "step": 205 + }, + { + "epoch": 0.04747637704540217, + "grad_norm": 0.7922968864440918, + "learning_rate": 9.992163947311557e-05, + "loss": 0.9859, + "step": 206 + }, + { + "epoch": 0.04770684489513713, + "grad_norm": 0.7922948002815247, + "learning_rate": 9.99195364926796e-05, + "loss": 0.9798, + "step": 207 + }, + { + "epoch": 0.04793731274487209, + "grad_norm": 0.8570684790611267, + "learning_rate": 9.991740568829215e-05, + "loss": 0.9644, + "step": 208 + }, + { + "epoch": 0.04816778059460705, + "grad_norm": 0.9860967993736267, + "learning_rate": 9.99152470611409e-05, + "loss": 0.976, + "step": 209 + }, + { + "epoch": 0.04839824844434201, + "grad_norm": 1.1492034196853638, + "learning_rate": 9.991306061242899e-05, + "loss": 0.9746, + "step": 210 + }, + { + "epoch": 0.048628716294076973, + "grad_norm": 1.3347015380859375, + "learning_rate": 9.991084634337511e-05, + "loss": 0.973, + "step": 211 + }, + { + "epoch": 0.04885918414381194, + "grad_norm": 1.097365140914917, + "learning_rate": 9.990860425521347e-05, + "loss": 0.9814, + "step": 212 + }, + { + "epoch": 0.0490896519935469, + "grad_norm": 1.006355881690979, + "learning_rate": 9.990633434919369e-05, + "loss": 0.9726, + "step": 213 + }, + { + "epoch": 0.049320119843281864, + "grad_norm": 0.8860179781913757, + "learning_rate": 9.990403662658104e-05, + "loss": 0.973, + "step": 214 + }, + { + "epoch": 0.049550587693016825, + "grad_norm": 0.8287469148635864, + "learning_rate": 9.990171108865614e-05, + "loss": 0.9673, + "step": 215 + }, + { + "epoch": 0.049781055542751786, + "grad_norm": 0.8757283687591553, + "learning_rate": 9.989935773671525e-05, + "loss": 0.9571, + "step": 216 + }, + { + "epoch": 0.05001152339248675, + "grad_norm": 0.8432666063308716, + "learning_rate": 9.989697657207002e-05, + "loss": 0.9543, + "step": 217 + }, + { + "epoch": 0.05024199124222171, + "grad_norm": 0.8379467725753784, + "learning_rate": 9.98945675960477e-05, + "loss": 0.9643, + "step": 218 + }, + { + "epoch": 0.05047245909195667, + "grad_norm": 0.7548062801361084, + "learning_rate": 9.989213080999097e-05, + "loss": 0.9637, + "step": 219 + }, + { + "epoch": 0.05070292694169164, + "grad_norm": 0.6869426965713501, + "learning_rate": 9.988966621525804e-05, + "loss": 0.9411, + "step": 220 + }, + { + "epoch": 0.0509333947914266, + "grad_norm": 0.8082894086837769, + "learning_rate": 9.988717381322262e-05, + "loss": 0.9441, + "step": 221 + }, + { + "epoch": 0.05116386264116156, + "grad_norm": 0.8997180461883545, + "learning_rate": 9.988465360527389e-05, + "loss": 0.9436, + "step": 222 + }, + { + "epoch": 0.05139433049089652, + "grad_norm": 0.9218918681144714, + "learning_rate": 9.988210559281658e-05, + "loss": 0.9616, + "step": 223 + }, + { + "epoch": 0.05162479834063148, + "grad_norm": 0.8721861839294434, + "learning_rate": 9.98795297772709e-05, + "loss": 0.9312, + "step": 224 + }, + { + "epoch": 0.05185526619036644, + "grad_norm": 0.8172184824943542, + "learning_rate": 9.987692616007253e-05, + "loss": 0.945, + "step": 225 + }, + { + "epoch": 0.0520857340401014, + "grad_norm": 0.8974547386169434, + "learning_rate": 9.987429474267268e-05, + "loss": 0.9419, + "step": 226 + }, + { + "epoch": 0.05231620188983637, + "grad_norm": 1.0265862941741943, + "learning_rate": 9.987163552653802e-05, + "loss": 0.9443, + "step": 227 + }, + { + "epoch": 0.05254666973957133, + "grad_norm": 1.184733510017395, + "learning_rate": 9.986894851315074e-05, + "loss": 0.9374, + "step": 228 + }, + { + "epoch": 0.05277713758930629, + "grad_norm": 1.3877384662628174, + "learning_rate": 9.98662337040085e-05, + "loss": 0.9444, + "step": 229 + }, + { + "epoch": 0.053007605439041254, + "grad_norm": 1.0500129461288452, + "learning_rate": 9.98634911006245e-05, + "loss": 0.9305, + "step": 230 + }, + { + "epoch": 0.053238073288776215, + "grad_norm": 0.6418678164482117, + "learning_rate": 9.986072070452738e-05, + "loss": 0.9261, + "step": 231 + }, + { + "epoch": 0.053468541138511176, + "grad_norm": 0.9703414440155029, + "learning_rate": 9.985792251726131e-05, + "loss": 0.9323, + "step": 232 + }, + { + "epoch": 0.05369900898824614, + "grad_norm": 0.8372150659561157, + "learning_rate": 9.985509654038591e-05, + "loss": 0.9335, + "step": 233 + }, + { + "epoch": 0.0539294768379811, + "grad_norm": 0.7052909135818481, + "learning_rate": 9.985224277547634e-05, + "loss": 0.9088, + "step": 234 + }, + { + "epoch": 0.054159944687716066, + "grad_norm": 0.7531741261482239, + "learning_rate": 9.984936122412319e-05, + "loss": 0.9182, + "step": 235 + }, + { + "epoch": 0.05439041253745103, + "grad_norm": 0.633127748966217, + "learning_rate": 9.98464518879326e-05, + "loss": 0.923, + "step": 236 + }, + { + "epoch": 0.05462088038718599, + "grad_norm": 0.5811619758605957, + "learning_rate": 9.984351476852613e-05, + "loss": 0.9184, + "step": 237 + }, + { + "epoch": 0.05485134823692095, + "grad_norm": 0.6498633027076721, + "learning_rate": 9.984054986754088e-05, + "loss": 0.919, + "step": 238 + }, + { + "epoch": 0.05508181608665591, + "grad_norm": 0.5947354435920715, + "learning_rate": 9.98375571866294e-05, + "loss": 0.9233, + "step": 239 + }, + { + "epoch": 0.05531228393639087, + "grad_norm": 0.5955505967140198, + "learning_rate": 9.983453672745975e-05, + "loss": 0.9212, + "step": 240 + }, + { + "epoch": 0.05554275178612583, + "grad_norm": 0.571624755859375, + "learning_rate": 9.983148849171546e-05, + "loss": 0.9166, + "step": 241 + }, + { + "epoch": 0.0557732196358608, + "grad_norm": 0.6101223230361938, + "learning_rate": 9.982841248109555e-05, + "loss": 0.912, + "step": 242 + }, + { + "epoch": 0.05600368748559576, + "grad_norm": 0.6820085644721985, + "learning_rate": 9.982530869731451e-05, + "loss": 0.9112, + "step": 243 + }, + { + "epoch": 0.05623415533533072, + "grad_norm": 0.9035933613777161, + "learning_rate": 9.982217714210232e-05, + "loss": 0.9129, + "step": 244 + }, + { + "epoch": 0.056464623185065684, + "grad_norm": 1.0833593606948853, + "learning_rate": 9.98190178172044e-05, + "loss": 0.9062, + "step": 245 + }, + { + "epoch": 0.056695091034800645, + "grad_norm": 1.1479918956756592, + "learning_rate": 9.981583072438173e-05, + "loss": 0.9166, + "step": 246 + }, + { + "epoch": 0.056925558884535606, + "grad_norm": 0.9499214291572571, + "learning_rate": 9.981261586541068e-05, + "loss": 0.9116, + "step": 247 + }, + { + "epoch": 0.05715602673427057, + "grad_norm": 0.8736324906349182, + "learning_rate": 9.980937324208317e-05, + "loss": 0.9076, + "step": 248 + }, + { + "epoch": 0.057386494584005535, + "grad_norm": 0.8064135313034058, + "learning_rate": 9.980610285620654e-05, + "loss": 0.9052, + "step": 249 + }, + { + "epoch": 0.057616962433740496, + "grad_norm": 0.7814821600914001, + "learning_rate": 9.980280470960363e-05, + "loss": 0.8956, + "step": 250 + }, + { + "epoch": 0.05784743028347546, + "grad_norm": 0.7428434491157532, + "learning_rate": 9.979947880411273e-05, + "loss": 0.8941, + "step": 251 + }, + { + "epoch": 0.05807789813321042, + "grad_norm": 0.6821320652961731, + "learning_rate": 9.979612514158765e-05, + "loss": 0.8985, + "step": 252 + }, + { + "epoch": 0.05830836598294538, + "grad_norm": 0.6651344895362854, + "learning_rate": 9.979274372389762e-05, + "loss": 0.8986, + "step": 253 + }, + { + "epoch": 0.05853883383268034, + "grad_norm": 0.7630136013031006, + "learning_rate": 9.978933455292736e-05, + "loss": 0.8916, + "step": 254 + }, + { + "epoch": 0.0587693016824153, + "grad_norm": 0.801031768321991, + "learning_rate": 9.978589763057708e-05, + "loss": 0.8947, + "step": 255 + }, + { + "epoch": 0.05899976953215026, + "grad_norm": 0.7151498794555664, + "learning_rate": 9.978243295876242e-05, + "loss": 0.8945, + "step": 256 + }, + { + "epoch": 0.05923023738188523, + "grad_norm": 0.7015817761421204, + "learning_rate": 9.97789405394145e-05, + "loss": 0.8903, + "step": 257 + }, + { + "epoch": 0.05946070523162019, + "grad_norm": 0.7685847282409668, + "learning_rate": 9.977542037447994e-05, + "loss": 0.8959, + "step": 258 + }, + { + "epoch": 0.05969117308135515, + "grad_norm": 0.8099703192710876, + "learning_rate": 9.977187246592076e-05, + "loss": 0.8984, + "step": 259 + }, + { + "epoch": 0.05992164093109011, + "grad_norm": 0.7858747243881226, + "learning_rate": 9.97682968157145e-05, + "loss": 0.8845, + "step": 260 + }, + { + "epoch": 0.060152108780825074, + "grad_norm": 0.8646757006645203, + "learning_rate": 9.976469342585413e-05, + "loss": 0.8776, + "step": 261 + }, + { + "epoch": 0.060382576630560035, + "grad_norm": 0.8440235257148743, + "learning_rate": 9.976106229834812e-05, + "loss": 0.8711, + "step": 262 + }, + { + "epoch": 0.060613044480294996, + "grad_norm": 0.8207287192344666, + "learning_rate": 9.975740343522033e-05, + "loss": 0.8806, + "step": 263 + }, + { + "epoch": 0.060843512330029964, + "grad_norm": 0.8601641058921814, + "learning_rate": 9.975371683851016e-05, + "loss": 0.8794, + "step": 264 + }, + { + "epoch": 0.061073980179764925, + "grad_norm": 0.8005819916725159, + "learning_rate": 9.975000251027242e-05, + "loss": 0.889, + "step": 265 + }, + { + "epoch": 0.061304448029499886, + "grad_norm": 0.7524363994598389, + "learning_rate": 9.974626045257738e-05, + "loss": 0.8734, + "step": 266 + }, + { + "epoch": 0.06153491587923485, + "grad_norm": 0.7972283959388733, + "learning_rate": 9.974249066751077e-05, + "loss": 0.8856, + "step": 267 + }, + { + "epoch": 0.06176538372896981, + "grad_norm": 0.925719678401947, + "learning_rate": 9.973869315717379e-05, + "loss": 0.8745, + "step": 268 + }, + { + "epoch": 0.06199585157870477, + "grad_norm": 0.8908208608627319, + "learning_rate": 9.973486792368307e-05, + "loss": 0.8676, + "step": 269 + }, + { + "epoch": 0.06222631942843973, + "grad_norm": 0.832204282283783, + "learning_rate": 9.973101496917072e-05, + "loss": 0.874, + "step": 270 + }, + { + "epoch": 0.06245678727817469, + "grad_norm": 0.8755982518196106, + "learning_rate": 9.972713429578427e-05, + "loss": 0.8564, + "step": 271 + }, + { + "epoch": 0.06268725512790965, + "grad_norm": 0.8395307660102844, + "learning_rate": 9.97232259056867e-05, + "loss": 0.8624, + "step": 272 + }, + { + "epoch": 0.06291772297764461, + "grad_norm": 0.7844563722610474, + "learning_rate": 9.97192898010565e-05, + "loss": 0.8637, + "step": 273 + }, + { + "epoch": 0.06314819082737957, + "grad_norm": 0.6658696532249451, + "learning_rate": 9.97153259840875e-05, + "loss": 0.8695, + "step": 274 + }, + { + "epoch": 0.06337865867711455, + "grad_norm": 0.7139994502067566, + "learning_rate": 9.971133445698908e-05, + "loss": 0.8486, + "step": 275 + }, + { + "epoch": 0.06360912652684951, + "grad_norm": 0.6378993391990662, + "learning_rate": 9.970731522198602e-05, + "loss": 0.8639, + "step": 276 + }, + { + "epoch": 0.06383959437658447, + "grad_norm": 0.6299728155136108, + "learning_rate": 9.970326828131852e-05, + "loss": 0.8541, + "step": 277 + }, + { + "epoch": 0.06407006222631943, + "grad_norm": 0.619757890701294, + "learning_rate": 9.969919363724226e-05, + "loss": 0.8506, + "step": 278 + }, + { + "epoch": 0.0643005300760544, + "grad_norm": 0.575935423374176, + "learning_rate": 9.969509129202837e-05, + "loss": 0.8599, + "step": 279 + }, + { + "epoch": 0.06453099792578935, + "grad_norm": 0.5778565406799316, + "learning_rate": 9.969096124796335e-05, + "loss": 0.8495, + "step": 280 + }, + { + "epoch": 0.06476146577552432, + "grad_norm": 0.6680006980895996, + "learning_rate": 9.968680350734922e-05, + "loss": 0.8468, + "step": 281 + }, + { + "epoch": 0.06499193362525928, + "grad_norm": 0.5861308574676514, + "learning_rate": 9.968261807250341e-05, + "loss": 0.8444, + "step": 282 + }, + { + "epoch": 0.06522240147499424, + "grad_norm": 0.5511894226074219, + "learning_rate": 9.967840494575879e-05, + "loss": 0.852, + "step": 283 + }, + { + "epoch": 0.0654528693247292, + "grad_norm": 0.6007016897201538, + "learning_rate": 9.967416412946362e-05, + "loss": 0.8503, + "step": 284 + }, + { + "epoch": 0.06568333717446416, + "grad_norm": 0.5989518761634827, + "learning_rate": 9.966989562598163e-05, + "loss": 0.8463, + "step": 285 + }, + { + "epoch": 0.06591380502419912, + "grad_norm": 0.623166561126709, + "learning_rate": 9.966559943769203e-05, + "loss": 0.8389, + "step": 286 + }, + { + "epoch": 0.06614427287393408, + "grad_norm": 0.5255123376846313, + "learning_rate": 9.966127556698936e-05, + "loss": 0.8486, + "step": 287 + }, + { + "epoch": 0.06637474072366904, + "grad_norm": 0.6101188063621521, + "learning_rate": 9.965692401628368e-05, + "loss": 0.8508, + "step": 288 + }, + { + "epoch": 0.066605208573404, + "grad_norm": 0.6269940733909607, + "learning_rate": 9.96525447880004e-05, + "loss": 0.8407, + "step": 289 + }, + { + "epoch": 0.06683567642313898, + "grad_norm": 0.7046296000480652, + "learning_rate": 9.964813788458043e-05, + "loss": 0.8329, + "step": 290 + }, + { + "epoch": 0.06706614427287394, + "grad_norm": 0.8176979422569275, + "learning_rate": 9.964370330848005e-05, + "loss": 0.8347, + "step": 291 + }, + { + "epoch": 0.0672966121226089, + "grad_norm": 0.8342549800872803, + "learning_rate": 9.963924106217102e-05, + "loss": 0.836, + "step": 292 + }, + { + "epoch": 0.06752707997234386, + "grad_norm": 0.8378840684890747, + "learning_rate": 9.963475114814045e-05, + "loss": 0.8339, + "step": 293 + }, + { + "epoch": 0.06775754782207882, + "grad_norm": 0.7933089733123779, + "learning_rate": 9.963023356889093e-05, + "loss": 0.8321, + "step": 294 + }, + { + "epoch": 0.06798801567181378, + "grad_norm": 0.7538984417915344, + "learning_rate": 9.962568832694044e-05, + "loss": 0.8395, + "step": 295 + }, + { + "epoch": 0.06821848352154875, + "grad_norm": 0.6032546162605286, + "learning_rate": 9.962111542482241e-05, + "loss": 0.8314, + "step": 296 + }, + { + "epoch": 0.0684489513712837, + "grad_norm": 0.694686770439148, + "learning_rate": 9.961651486508564e-05, + "loss": 0.826, + "step": 297 + }, + { + "epoch": 0.06867941922101867, + "grad_norm": 0.7310389876365662, + "learning_rate": 9.96118866502944e-05, + "loss": 0.8221, + "step": 298 + }, + { + "epoch": 0.06890988707075363, + "grad_norm": 0.6587698459625244, + "learning_rate": 9.960723078302832e-05, + "loss": 0.8275, + "step": 299 + }, + { + "epoch": 0.06914035492048859, + "grad_norm": 0.6249524354934692, + "learning_rate": 9.960254726588246e-05, + "loss": 0.8329, + "step": 300 + }, + { + "epoch": 0.06937082277022355, + "grad_norm": 0.6110243797302246, + "learning_rate": 9.959783610146733e-05, + "loss": 0.8296, + "step": 301 + }, + { + "epoch": 0.06960129061995851, + "grad_norm": 0.528419017791748, + "learning_rate": 9.959309729240882e-05, + "loss": 0.825, + "step": 302 + }, + { + "epoch": 0.06983175846969347, + "grad_norm": 0.47809135913848877, + "learning_rate": 9.95883308413482e-05, + "loss": 0.8163, + "step": 303 + }, + { + "epoch": 0.07006222631942843, + "grad_norm": 0.4991075098514557, + "learning_rate": 9.95835367509422e-05, + "loss": 0.8368, + "step": 304 + }, + { + "epoch": 0.07029269416916341, + "grad_norm": 0.5650983452796936, + "learning_rate": 9.957871502386291e-05, + "loss": 0.8194, + "step": 305 + }, + { + "epoch": 0.07052316201889837, + "grad_norm": 0.4641514718532562, + "learning_rate": 9.957386566279788e-05, + "loss": 0.8259, + "step": 306 + }, + { + "epoch": 0.07075362986863333, + "grad_norm": 0.4920820891857147, + "learning_rate": 9.956898867044999e-05, + "loss": 0.8188, + "step": 307 + }, + { + "epoch": 0.07098409771836829, + "grad_norm": 0.525382936000824, + "learning_rate": 9.956408404953756e-05, + "loss": 0.8156, + "step": 308 + }, + { + "epoch": 0.07121456556810325, + "grad_norm": 0.5120217800140381, + "learning_rate": 9.955915180279433e-05, + "loss": 0.8213, + "step": 309 + }, + { + "epoch": 0.07144503341783821, + "grad_norm": 0.448485791683197, + "learning_rate": 9.955419193296943e-05, + "loss": 0.8191, + "step": 310 + }, + { + "epoch": 0.07167550126757317, + "grad_norm": 0.5565616488456726, + "learning_rate": 9.954920444282732e-05, + "loss": 0.8244, + "step": 311 + }, + { + "epoch": 0.07190596911730814, + "grad_norm": 0.5660645365715027, + "learning_rate": 9.954418933514795e-05, + "loss": 0.8114, + "step": 312 + }, + { + "epoch": 0.0721364369670431, + "grad_norm": 0.5123659372329712, + "learning_rate": 9.953914661272661e-05, + "loss": 0.8135, + "step": 313 + }, + { + "epoch": 0.07236690481677806, + "grad_norm": 0.47135382890701294, + "learning_rate": 9.953407627837398e-05, + "loss": 0.813, + "step": 314 + }, + { + "epoch": 0.07259737266651302, + "grad_norm": 0.5098188519477844, + "learning_rate": 9.952897833491617e-05, + "loss": 0.8116, + "step": 315 + }, + { + "epoch": 0.07282784051624798, + "grad_norm": 0.5356022715568542, + "learning_rate": 9.952385278519462e-05, + "loss": 0.8152, + "step": 316 + }, + { + "epoch": 0.07305830836598294, + "grad_norm": 0.5658923983573914, + "learning_rate": 9.951869963206622e-05, + "loss": 0.8085, + "step": 317 + }, + { + "epoch": 0.0732887762157179, + "grad_norm": 0.6336301565170288, + "learning_rate": 9.951351887840317e-05, + "loss": 0.8137, + "step": 318 + }, + { + "epoch": 0.07351924406545286, + "grad_norm": 0.7044976949691772, + "learning_rate": 9.950831052709314e-05, + "loss": 0.809, + "step": 319 + }, + { + "epoch": 0.07374971191518784, + "grad_norm": 0.6830228567123413, + "learning_rate": 9.950307458103911e-05, + "loss": 0.8091, + "step": 320 + }, + { + "epoch": 0.0739801797649228, + "grad_norm": 0.6315388083457947, + "learning_rate": 9.949781104315951e-05, + "loss": 0.811, + "step": 321 + }, + { + "epoch": 0.07421064761465776, + "grad_norm": 0.44383642077445984, + "learning_rate": 9.949251991638806e-05, + "loss": 0.8062, + "step": 322 + }, + { + "epoch": 0.07444111546439272, + "grad_norm": 0.48055797815322876, + "learning_rate": 9.948720120367394e-05, + "loss": 0.8063, + "step": 323 + }, + { + "epoch": 0.07467158331412768, + "grad_norm": 0.452512264251709, + "learning_rate": 9.948185490798168e-05, + "loss": 0.8024, + "step": 324 + }, + { + "epoch": 0.07490205116386264, + "grad_norm": 0.46535223722457886, + "learning_rate": 9.947648103229113e-05, + "loss": 0.8065, + "step": 325 + }, + { + "epoch": 0.0751325190135976, + "grad_norm": 0.6422431468963623, + "learning_rate": 9.94710795795976e-05, + "loss": 0.8096, + "step": 326 + }, + { + "epoch": 0.07536298686333257, + "grad_norm": 0.6489291191101074, + "learning_rate": 9.946565055291174e-05, + "loss": 0.8136, + "step": 327 + }, + { + "epoch": 0.07559345471306753, + "grad_norm": 0.7198114991188049, + "learning_rate": 9.946019395525951e-05, + "loss": 0.8052, + "step": 328 + }, + { + "epoch": 0.07582392256280249, + "grad_norm": 0.6340928077697754, + "learning_rate": 9.945470978968234e-05, + "loss": 0.8, + "step": 329 + }, + { + "epoch": 0.07605439041253745, + "grad_norm": 0.6357820630073547, + "learning_rate": 9.944919805923694e-05, + "loss": 0.8031, + "step": 330 + }, + { + "epoch": 0.07628485826227241, + "grad_norm": 0.5786786675453186, + "learning_rate": 9.944365876699544e-05, + "loss": 0.8049, + "step": 331 + }, + { + "epoch": 0.07651532611200737, + "grad_norm": 0.4964509904384613, + "learning_rate": 9.943809191604527e-05, + "loss": 0.8092, + "step": 332 + }, + { + "epoch": 0.07674579396174233, + "grad_norm": 0.431325763463974, + "learning_rate": 9.943249750948929e-05, + "loss": 0.798, + "step": 333 + }, + { + "epoch": 0.07697626181147729, + "grad_norm": 0.41866233944892883, + "learning_rate": 9.942687555044568e-05, + "loss": 0.8017, + "step": 334 + }, + { + "epoch": 0.07720672966121227, + "grad_norm": 0.4726838767528534, + "learning_rate": 9.9421226042048e-05, + "loss": 0.8015, + "step": 335 + }, + { + "epoch": 0.07743719751094723, + "grad_norm": 0.473954439163208, + "learning_rate": 9.941554898744511e-05, + "loss": 0.7963, + "step": 336 + }, + { + "epoch": 0.07766766536068219, + "grad_norm": 0.49290353059768677, + "learning_rate": 9.940984438980131e-05, + "loss": 0.807, + "step": 337 + }, + { + "epoch": 0.07789813321041715, + "grad_norm": 0.4410178065299988, + "learning_rate": 9.940411225229618e-05, + "loss": 0.7958, + "step": 338 + }, + { + "epoch": 0.07812860106015211, + "grad_norm": 0.41889092326164246, + "learning_rate": 9.939835257812468e-05, + "loss": 0.7984, + "step": 339 + }, + { + "epoch": 0.07835906890988707, + "grad_norm": 0.4519578218460083, + "learning_rate": 9.939256537049711e-05, + "loss": 0.8046, + "step": 340 + }, + { + "epoch": 0.07858953675962203, + "grad_norm": 0.5191729664802551, + "learning_rate": 9.938675063263914e-05, + "loss": 0.8045, + "step": 341 + }, + { + "epoch": 0.078820004609357, + "grad_norm": 0.5066516399383545, + "learning_rate": 9.938090836779174e-05, + "loss": 0.7918, + "step": 342 + }, + { + "epoch": 0.07905047245909196, + "grad_norm": 0.4769982695579529, + "learning_rate": 9.937503857921125e-05, + "loss": 0.8122, + "step": 343 + }, + { + "epoch": 0.07928094030882692, + "grad_norm": 0.4449038505554199, + "learning_rate": 9.936914127016938e-05, + "loss": 0.8004, + "step": 344 + }, + { + "epoch": 0.07951140815856188, + "grad_norm": 0.38136690855026245, + "learning_rate": 9.936321644395312e-05, + "loss": 0.8063, + "step": 345 + }, + { + "epoch": 0.07974187600829684, + "grad_norm": 0.5003980994224548, + "learning_rate": 9.935726410386484e-05, + "loss": 0.7955, + "step": 346 + }, + { + "epoch": 0.0799723438580318, + "grad_norm": 0.47362205386161804, + "learning_rate": 9.93512842532222e-05, + "loss": 0.7994, + "step": 347 + }, + { + "epoch": 0.08020281170776676, + "grad_norm": 0.4894074499607086, + "learning_rate": 9.934527689535826e-05, + "loss": 0.8096, + "step": 348 + }, + { + "epoch": 0.08043327955750172, + "grad_norm": 0.4951101839542389, + "learning_rate": 9.933924203362138e-05, + "loss": 0.7788, + "step": 349 + }, + { + "epoch": 0.0806637474072367, + "grad_norm": 0.46733275055885315, + "learning_rate": 9.933317967137524e-05, + "loss": 0.7885, + "step": 350 + }, + { + "epoch": 0.08089421525697166, + "grad_norm": 0.48279932141304016, + "learning_rate": 9.932708981199883e-05, + "loss": 0.7924, + "step": 351 + }, + { + "epoch": 0.08112468310670662, + "grad_norm": 0.48102623224258423, + "learning_rate": 9.932097245888652e-05, + "loss": 0.7921, + "step": 352 + }, + { + "epoch": 0.08135515095644158, + "grad_norm": 0.4802628457546234, + "learning_rate": 9.931482761544797e-05, + "loss": 0.8044, + "step": 353 + }, + { + "epoch": 0.08158561880617654, + "grad_norm": 0.4573599398136139, + "learning_rate": 9.930865528510815e-05, + "loss": 0.7928, + "step": 354 + }, + { + "epoch": 0.0818160866559115, + "grad_norm": 0.5361927151679993, + "learning_rate": 9.93024554713074e-05, + "loss": 0.7967, + "step": 355 + }, + { + "epoch": 0.08204655450564646, + "grad_norm": 0.56016606092453, + "learning_rate": 9.929622817750133e-05, + "loss": 0.8019, + "step": 356 + }, + { + "epoch": 0.08227702235538142, + "grad_norm": 0.5165932774543762, + "learning_rate": 9.928997340716087e-05, + "loss": 0.7981, + "step": 357 + }, + { + "epoch": 0.08250749020511638, + "grad_norm": 0.4367765188217163, + "learning_rate": 9.928369116377231e-05, + "loss": 0.7892, + "step": 358 + }, + { + "epoch": 0.08273795805485135, + "grad_norm": 0.6103436946868896, + "learning_rate": 9.92773814508372e-05, + "loss": 0.7887, + "step": 359 + }, + { + "epoch": 0.08296842590458631, + "grad_norm": 0.6656332015991211, + "learning_rate": 9.927104427187243e-05, + "loss": 0.7924, + "step": 360 + }, + { + "epoch": 0.08319889375432127, + "grad_norm": 0.6310869455337524, + "learning_rate": 9.926467963041018e-05, + "loss": 0.7973, + "step": 361 + }, + { + "epoch": 0.08342936160405623, + "grad_norm": 0.6289921998977661, + "learning_rate": 9.925828752999797e-05, + "loss": 0.7897, + "step": 362 + }, + { + "epoch": 0.08365982945379119, + "grad_norm": 0.501392126083374, + "learning_rate": 9.925186797419858e-05, + "loss": 0.7915, + "step": 363 + }, + { + "epoch": 0.08389029730352615, + "grad_norm": 0.4528777301311493, + "learning_rate": 9.924542096659015e-05, + "loss": 0.7925, + "step": 364 + }, + { + "epoch": 0.08412076515326113, + "grad_norm": 0.5350551605224609, + "learning_rate": 9.923894651076605e-05, + "loss": 0.7941, + "step": 365 + }, + { + "epoch": 0.08435123300299609, + "grad_norm": 0.4819077253341675, + "learning_rate": 9.9232444610335e-05, + "loss": 0.7899, + "step": 366 + }, + { + "epoch": 0.08458170085273105, + "grad_norm": 0.47890499234199524, + "learning_rate": 9.9225915268921e-05, + "loss": 0.7899, + "step": 367 + }, + { + "epoch": 0.08481216870246601, + "grad_norm": 0.6021353602409363, + "learning_rate": 9.921935849016338e-05, + "loss": 0.7949, + "step": 368 + }, + { + "epoch": 0.08504263655220097, + "grad_norm": 0.624280571937561, + "learning_rate": 9.921277427771667e-05, + "loss": 0.7977, + "step": 369 + }, + { + "epoch": 0.08527310440193593, + "grad_norm": 0.48100805282592773, + "learning_rate": 9.92061626352508e-05, + "loss": 0.7847, + "step": 370 + }, + { + "epoch": 0.08550357225167089, + "grad_norm": 0.4303872287273407, + "learning_rate": 9.919952356645092e-05, + "loss": 0.7925, + "step": 371 + }, + { + "epoch": 0.08573404010140585, + "grad_norm": 0.4995540678501129, + "learning_rate": 9.91928570750175e-05, + "loss": 0.7931, + "step": 372 + }, + { + "epoch": 0.08596450795114081, + "grad_norm": 0.49925413727760315, + "learning_rate": 9.918616316466628e-05, + "loss": 0.7875, + "step": 373 + }, + { + "epoch": 0.08619497580087578, + "grad_norm": 0.4929007887840271, + "learning_rate": 9.917944183912828e-05, + "loss": 0.7821, + "step": 374 + }, + { + "epoch": 0.08642544365061074, + "grad_norm": 0.4669882357120514, + "learning_rate": 9.91726931021498e-05, + "loss": 0.786, + "step": 375 + }, + { + "epoch": 0.0866559115003457, + "grad_norm": 0.4014081358909607, + "learning_rate": 9.916591695749244e-05, + "loss": 0.7833, + "step": 376 + }, + { + "epoch": 0.08688637935008066, + "grad_norm": 0.38111427426338196, + "learning_rate": 9.915911340893305e-05, + "loss": 0.7894, + "step": 377 + }, + { + "epoch": 0.08711684719981562, + "grad_norm": 0.457692950963974, + "learning_rate": 9.915228246026376e-05, + "loss": 0.7912, + "step": 378 + }, + { + "epoch": 0.0873473150495506, + "grad_norm": 0.467342734336853, + "learning_rate": 9.9145424115292e-05, + "loss": 0.7843, + "step": 379 + }, + { + "epoch": 0.08757778289928556, + "grad_norm": 0.44467970728874207, + "learning_rate": 9.913853837784042e-05, + "loss": 0.7939, + "step": 380 + }, + { + "epoch": 0.08780825074902052, + "grad_norm": 0.4361025094985962, + "learning_rate": 9.913162525174697e-05, + "loss": 0.7803, + "step": 381 + }, + { + "epoch": 0.08803871859875548, + "grad_norm": 0.44507619738578796, + "learning_rate": 9.912468474086486e-05, + "loss": 0.7765, + "step": 382 + }, + { + "epoch": 0.08826918644849044, + "grad_norm": 0.5151132941246033, + "learning_rate": 9.911771684906257e-05, + "loss": 0.7824, + "step": 383 + }, + { + "epoch": 0.0884996542982254, + "grad_norm": 0.45923879742622375, + "learning_rate": 9.911072158022385e-05, + "loss": 0.7881, + "step": 384 + }, + { + "epoch": 0.08873012214796036, + "grad_norm": 0.41044628620147705, + "learning_rate": 9.910369893824767e-05, + "loss": 0.7841, + "step": 385 + }, + { + "epoch": 0.08896058999769532, + "grad_norm": 0.41591617465019226, + "learning_rate": 9.90966489270483e-05, + "loss": 0.7754, + "step": 386 + }, + { + "epoch": 0.08919105784743028, + "grad_norm": 0.5302747488021851, + "learning_rate": 9.908957155055523e-05, + "loss": 0.7784, + "step": 387 + }, + { + "epoch": 0.08942152569716524, + "grad_norm": 0.41914960741996765, + "learning_rate": 9.908246681271322e-05, + "loss": 0.7817, + "step": 388 + }, + { + "epoch": 0.0896519935469002, + "grad_norm": 0.38282233476638794, + "learning_rate": 9.907533471748231e-05, + "loss": 0.779, + "step": 389 + }, + { + "epoch": 0.08988246139663517, + "grad_norm": 0.4711924195289612, + "learning_rate": 9.906817526883774e-05, + "loss": 0.7853, + "step": 390 + }, + { + "epoch": 0.09011292924637013, + "grad_norm": 0.45906805992126465, + "learning_rate": 9.906098847076999e-05, + "loss": 0.7711, + "step": 391 + }, + { + "epoch": 0.09034339709610509, + "grad_norm": 0.44320812821388245, + "learning_rate": 9.905377432728484e-05, + "loss": 0.7792, + "step": 392 + }, + { + "epoch": 0.09057386494584005, + "grad_norm": 0.41927963495254517, + "learning_rate": 9.904653284240328e-05, + "loss": 0.784, + "step": 393 + }, + { + "epoch": 0.09080433279557502, + "grad_norm": 0.5191970467567444, + "learning_rate": 9.903926402016153e-05, + "loss": 0.7834, + "step": 394 + }, + { + "epoch": 0.09103480064530999, + "grad_norm": 0.6078465580940247, + "learning_rate": 9.903196786461106e-05, + "loss": 0.7784, + "step": 395 + }, + { + "epoch": 0.09126526849504495, + "grad_norm": 0.6223181486129761, + "learning_rate": 9.902464437981855e-05, + "loss": 0.7913, + "step": 396 + }, + { + "epoch": 0.09149573634477991, + "grad_norm": 0.6932902336120605, + "learning_rate": 9.901729356986597e-05, + "loss": 0.7817, + "step": 397 + }, + { + "epoch": 0.09172620419451487, + "grad_norm": 0.6165162920951843, + "learning_rate": 9.900991543885048e-05, + "loss": 0.775, + "step": 398 + }, + { + "epoch": 0.09195667204424983, + "grad_norm": 0.5054426193237305, + "learning_rate": 9.900250999088447e-05, + "loss": 0.7824, + "step": 399 + }, + { + "epoch": 0.09218713989398479, + "grad_norm": 0.5113014578819275, + "learning_rate": 9.899507723009554e-05, + "loss": 0.7827, + "step": 400 + }, + { + "epoch": 0.09241760774371975, + "grad_norm": 0.5064506530761719, + "learning_rate": 9.898761716062654e-05, + "loss": 0.7848, + "step": 401 + }, + { + "epoch": 0.09264807559345471, + "grad_norm": 0.4201597571372986, + "learning_rate": 9.898012978663553e-05, + "loss": 0.7804, + "step": 402 + }, + { + "epoch": 0.09287854344318967, + "grad_norm": 0.47890499234199524, + "learning_rate": 9.897261511229583e-05, + "loss": 0.7775, + "step": 403 + }, + { + "epoch": 0.09310901129292463, + "grad_norm": 0.5132191777229309, + "learning_rate": 9.896507314179588e-05, + "loss": 0.7739, + "step": 404 + }, + { + "epoch": 0.0933394791426596, + "grad_norm": 0.5053228735923767, + "learning_rate": 9.895750387933944e-05, + "loss": 0.7725, + "step": 405 + }, + { + "epoch": 0.09356994699239456, + "grad_norm": 0.5740991830825806, + "learning_rate": 9.894990732914541e-05, + "loss": 0.7784, + "step": 406 + }, + { + "epoch": 0.09380041484212952, + "grad_norm": 0.5145570635795593, + "learning_rate": 9.894228349544796e-05, + "loss": 0.7783, + "step": 407 + }, + { + "epoch": 0.09403088269186448, + "grad_norm": 0.5113042593002319, + "learning_rate": 9.893463238249638e-05, + "loss": 0.7835, + "step": 408 + }, + { + "epoch": 0.09426135054159945, + "grad_norm": 0.5256659984588623, + "learning_rate": 9.892695399455525e-05, + "loss": 0.7746, + "step": 409 + }, + { + "epoch": 0.09449181839133441, + "grad_norm": 0.5014989972114563, + "learning_rate": 9.891924833590431e-05, + "loss": 0.7715, + "step": 410 + }, + { + "epoch": 0.09472228624106938, + "grad_norm": 0.45504307746887207, + "learning_rate": 9.891151541083852e-05, + "loss": 0.7749, + "step": 411 + }, + { + "epoch": 0.09495275409080434, + "grad_norm": 0.4456666111946106, + "learning_rate": 9.8903755223668e-05, + "loss": 0.7685, + "step": 412 + }, + { + "epoch": 0.0951832219405393, + "grad_norm": 0.45063498616218567, + "learning_rate": 9.889596777871812e-05, + "loss": 0.7675, + "step": 413 + }, + { + "epoch": 0.09541368979027426, + "grad_norm": 0.48387956619262695, + "learning_rate": 9.88881530803294e-05, + "loss": 0.7759, + "step": 414 + }, + { + "epoch": 0.09564415764000922, + "grad_norm": 0.4509301483631134, + "learning_rate": 9.888031113285757e-05, + "loss": 0.7717, + "step": 415 + }, + { + "epoch": 0.09587462548974418, + "grad_norm": 0.453188419342041, + "learning_rate": 9.887244194067355e-05, + "loss": 0.7707, + "step": 416 + }, + { + "epoch": 0.09610509333947914, + "grad_norm": 0.4492356777191162, + "learning_rate": 9.886454550816342e-05, + "loss": 0.7736, + "step": 417 + }, + { + "epoch": 0.0963355611892141, + "grad_norm": 0.3863151967525482, + "learning_rate": 9.885662183972848e-05, + "loss": 0.7812, + "step": 418 + }, + { + "epoch": 0.09656602903894906, + "grad_norm": 0.4794161021709442, + "learning_rate": 9.884867093978519e-05, + "loss": 0.7704, + "step": 419 + }, + { + "epoch": 0.09679649688868402, + "grad_norm": 0.44275277853012085, + "learning_rate": 9.884069281276517e-05, + "loss": 0.7693, + "step": 420 + }, + { + "epoch": 0.09702696473841899, + "grad_norm": 0.4233629107475281, + "learning_rate": 9.883268746311528e-05, + "loss": 0.7742, + "step": 421 + }, + { + "epoch": 0.09725743258815395, + "grad_norm": 0.34396764636039734, + "learning_rate": 9.882465489529747e-05, + "loss": 0.772, + "step": 422 + }, + { + "epoch": 0.09748790043788891, + "grad_norm": 0.4056984484195709, + "learning_rate": 9.881659511378892e-05, + "loss": 0.7751, + "step": 423 + }, + { + "epoch": 0.09771836828762388, + "grad_norm": 0.4664100408554077, + "learning_rate": 9.880850812308196e-05, + "loss": 0.7706, + "step": 424 + }, + { + "epoch": 0.09794883613735884, + "grad_norm": 0.5346910357475281, + "learning_rate": 9.880039392768405e-05, + "loss": 0.7661, + "step": 425 + }, + { + "epoch": 0.0981793039870938, + "grad_norm": 0.4764558970928192, + "learning_rate": 9.87922525321179e-05, + "loss": 0.7794, + "step": 426 + }, + { + "epoch": 0.09840977183682877, + "grad_norm": 0.39610040187835693, + "learning_rate": 9.878408394092129e-05, + "loss": 0.7711, + "step": 427 + }, + { + "epoch": 0.09864023968656373, + "grad_norm": 0.42657968401908875, + "learning_rate": 9.877588815864722e-05, + "loss": 0.7739, + "step": 428 + }, + { + "epoch": 0.09887070753629869, + "grad_norm": 0.46280384063720703, + "learning_rate": 9.87676651898638e-05, + "loss": 0.7811, + "step": 429 + }, + { + "epoch": 0.09910117538603365, + "grad_norm": 0.44775912165641785, + "learning_rate": 9.875941503915434e-05, + "loss": 0.7733, + "step": 430 + }, + { + "epoch": 0.09933164323576861, + "grad_norm": 0.4808795154094696, + "learning_rate": 9.875113771111725e-05, + "loss": 0.7734, + "step": 431 + }, + { + "epoch": 0.09956211108550357, + "grad_norm": 0.42084455490112305, + "learning_rate": 9.874283321036615e-05, + "loss": 0.7679, + "step": 432 + }, + { + "epoch": 0.09979257893523853, + "grad_norm": 0.4245053231716156, + "learning_rate": 9.873450154152972e-05, + "loss": 0.7755, + "step": 433 + }, + { + "epoch": 0.1000230467849735, + "grad_norm": 0.4106716811656952, + "learning_rate": 9.872614270925188e-05, + "loss": 0.7757, + "step": 434 + }, + { + "epoch": 0.10025351463470845, + "grad_norm": 0.38926827907562256, + "learning_rate": 9.871775671819162e-05, + "loss": 0.7761, + "step": 435 + }, + { + "epoch": 0.10048398248444342, + "grad_norm": 0.3636447489261627, + "learning_rate": 9.870934357302308e-05, + "loss": 0.7722, + "step": 436 + }, + { + "epoch": 0.10071445033417838, + "grad_norm": 0.3969525992870331, + "learning_rate": 9.870090327843557e-05, + "loss": 0.7722, + "step": 437 + }, + { + "epoch": 0.10094491818391334, + "grad_norm": 0.36944982409477234, + "learning_rate": 9.869243583913348e-05, + "loss": 0.7716, + "step": 438 + }, + { + "epoch": 0.10117538603364831, + "grad_norm": 0.44781213998794556, + "learning_rate": 9.86839412598364e-05, + "loss": 0.7757, + "step": 439 + }, + { + "epoch": 0.10140585388338327, + "grad_norm": 0.5165262818336487, + "learning_rate": 9.867541954527894e-05, + "loss": 0.7678, + "step": 440 + }, + { + "epoch": 0.10163632173311823, + "grad_norm": 0.5470614433288574, + "learning_rate": 9.866687070021097e-05, + "loss": 0.7684, + "step": 441 + }, + { + "epoch": 0.1018667895828532, + "grad_norm": 0.4986436665058136, + "learning_rate": 9.865829472939736e-05, + "loss": 0.7682, + "step": 442 + }, + { + "epoch": 0.10209725743258816, + "grad_norm": 0.4134550988674164, + "learning_rate": 9.864969163761817e-05, + "loss": 0.7734, + "step": 443 + }, + { + "epoch": 0.10232772528232312, + "grad_norm": 0.5219729542732239, + "learning_rate": 9.864106142966856e-05, + "loss": 0.7744, + "step": 444 + }, + { + "epoch": 0.10255819313205808, + "grad_norm": 0.539275586605072, + "learning_rate": 9.863240411035878e-05, + "loss": 0.7721, + "step": 445 + }, + { + "epoch": 0.10278866098179304, + "grad_norm": 0.4756945073604584, + "learning_rate": 9.862371968451423e-05, + "loss": 0.7645, + "step": 446 + }, + { + "epoch": 0.103019128831528, + "grad_norm": 0.43748557567596436, + "learning_rate": 9.861500815697541e-05, + "loss": 0.7693, + "step": 447 + }, + { + "epoch": 0.10324959668126296, + "grad_norm": 0.356668084859848, + "learning_rate": 9.860626953259791e-05, + "loss": 0.7645, + "step": 448 + }, + { + "epoch": 0.10348006453099792, + "grad_norm": 0.31597620248794556, + "learning_rate": 9.859750381625241e-05, + "loss": 0.7626, + "step": 449 + }, + { + "epoch": 0.10371053238073288, + "grad_norm": 0.3589065968990326, + "learning_rate": 9.858871101282472e-05, + "loss": 0.782, + "step": 450 + }, + { + "epoch": 0.10394100023046784, + "grad_norm": 0.3751053810119629, + "learning_rate": 9.857989112721574e-05, + "loss": 0.7701, + "step": 451 + }, + { + "epoch": 0.1041714680802028, + "grad_norm": 0.38888126611709595, + "learning_rate": 9.857104416434147e-05, + "loss": 0.769, + "step": 452 + }, + { + "epoch": 0.10440193592993777, + "grad_norm": 0.3789352774620056, + "learning_rate": 9.856217012913299e-05, + "loss": 0.7725, + "step": 453 + }, + { + "epoch": 0.10463240377967274, + "grad_norm": 0.36712217330932617, + "learning_rate": 9.855326902653647e-05, + "loss": 0.7763, + "step": 454 + }, + { + "epoch": 0.1048628716294077, + "grad_norm": 0.38460057973861694, + "learning_rate": 9.854434086151318e-05, + "loss": 0.7603, + "step": 455 + }, + { + "epoch": 0.10509333947914266, + "grad_norm": 0.41386526823043823, + "learning_rate": 9.853538563903945e-05, + "loss": 0.7683, + "step": 456 + }, + { + "epoch": 0.10532380732887762, + "grad_norm": 0.42844244837760925, + "learning_rate": 9.852640336410671e-05, + "loss": 0.7587, + "step": 457 + }, + { + "epoch": 0.10555427517861259, + "grad_norm": 0.41544997692108154, + "learning_rate": 9.851739404172147e-05, + "loss": 0.7622, + "step": 458 + }, + { + "epoch": 0.10578474302834755, + "grad_norm": 0.42414671182632446, + "learning_rate": 9.850835767690532e-05, + "loss": 0.7644, + "step": 459 + }, + { + "epoch": 0.10601521087808251, + "grad_norm": 0.4521081745624542, + "learning_rate": 9.849929427469488e-05, + "loss": 0.7621, + "step": 460 + }, + { + "epoch": 0.10624567872781747, + "grad_norm": 0.4264926314353943, + "learning_rate": 9.849020384014192e-05, + "loss": 0.7661, + "step": 461 + }, + { + "epoch": 0.10647614657755243, + "grad_norm": 0.4072660207748413, + "learning_rate": 9.848108637831319e-05, + "loss": 0.7686, + "step": 462 + }, + { + "epoch": 0.10670661442728739, + "grad_norm": 0.38507211208343506, + "learning_rate": 9.847194189429058e-05, + "loss": 0.7705, + "step": 463 + }, + { + "epoch": 0.10693708227702235, + "grad_norm": 0.4261547923088074, + "learning_rate": 9.846277039317095e-05, + "loss": 0.7827, + "step": 464 + }, + { + "epoch": 0.10716755012675731, + "grad_norm": 0.5000432133674622, + "learning_rate": 9.845357188006635e-05, + "loss": 0.7627, + "step": 465 + }, + { + "epoch": 0.10739801797649227, + "grad_norm": 0.5209843516349792, + "learning_rate": 9.844434636010373e-05, + "loss": 0.7754, + "step": 466 + }, + { + "epoch": 0.10762848582622724, + "grad_norm": 0.4857335090637207, + "learning_rate": 9.843509383842525e-05, + "loss": 0.7687, + "step": 467 + }, + { + "epoch": 0.1078589536759622, + "grad_norm": 0.5346872806549072, + "learning_rate": 9.842581432018798e-05, + "loss": 0.772, + "step": 468 + }, + { + "epoch": 0.10808942152569717, + "grad_norm": 0.6131623387336731, + "learning_rate": 9.841650781056413e-05, + "loss": 0.7808, + "step": 469 + }, + { + "epoch": 0.10831988937543213, + "grad_norm": 0.5165646076202393, + "learning_rate": 9.840717431474094e-05, + "loss": 0.772, + "step": 470 + }, + { + "epoch": 0.1085503572251671, + "grad_norm": 0.39740481972694397, + "learning_rate": 9.839781383792064e-05, + "loss": 0.7671, + "step": 471 + }, + { + "epoch": 0.10878082507490205, + "grad_norm": 0.42045560479164124, + "learning_rate": 9.838842638532056e-05, + "loss": 0.7667, + "step": 472 + }, + { + "epoch": 0.10901129292463702, + "grad_norm": 0.4750736355781555, + "learning_rate": 9.837901196217303e-05, + "loss": 0.7565, + "step": 473 + }, + { + "epoch": 0.10924176077437198, + "grad_norm": 0.3689127266407013, + "learning_rate": 9.836957057372544e-05, + "loss": 0.7652, + "step": 474 + }, + { + "epoch": 0.10947222862410694, + "grad_norm": 0.3930782675743103, + "learning_rate": 9.836010222524018e-05, + "loss": 0.7538, + "step": 475 + }, + { + "epoch": 0.1097026964738419, + "grad_norm": 0.45599618554115295, + "learning_rate": 9.835060692199468e-05, + "loss": 0.7724, + "step": 476 + }, + { + "epoch": 0.10993316432357686, + "grad_norm": 0.5148651003837585, + "learning_rate": 9.83410846692814e-05, + "loss": 0.7612, + "step": 477 + }, + { + "epoch": 0.11016363217331182, + "grad_norm": 0.4592342674732208, + "learning_rate": 9.83315354724078e-05, + "loss": 0.7667, + "step": 478 + }, + { + "epoch": 0.11039410002304678, + "grad_norm": 0.40691468119621277, + "learning_rate": 9.83219593366964e-05, + "loss": 0.7676, + "step": 479 + }, + { + "epoch": 0.11062456787278174, + "grad_norm": 0.4532166123390198, + "learning_rate": 9.831235626748467e-05, + "loss": 0.7683, + "step": 480 + }, + { + "epoch": 0.1108550357225167, + "grad_norm": 0.42121079564094543, + "learning_rate": 9.830272627012518e-05, + "loss": 0.7691, + "step": 481 + }, + { + "epoch": 0.11108550357225166, + "grad_norm": 0.4073924422264099, + "learning_rate": 9.82930693499854e-05, + "loss": 0.7705, + "step": 482 + }, + { + "epoch": 0.11131597142198664, + "grad_norm": 0.39546170830726624, + "learning_rate": 9.828338551244794e-05, + "loss": 0.7684, + "step": 483 + }, + { + "epoch": 0.1115464392717216, + "grad_norm": 0.38143351674079895, + "learning_rate": 9.827367476291027e-05, + "loss": 0.7554, + "step": 484 + }, + { + "epoch": 0.11177690712145656, + "grad_norm": 0.3818657100200653, + "learning_rate": 9.826393710678497e-05, + "loss": 0.7632, + "step": 485 + }, + { + "epoch": 0.11200737497119152, + "grad_norm": 0.40947970747947693, + "learning_rate": 9.825417254949953e-05, + "loss": 0.7628, + "step": 486 + }, + { + "epoch": 0.11223784282092648, + "grad_norm": 0.40739309787750244, + "learning_rate": 9.824438109649654e-05, + "loss": 0.7659, + "step": 487 + }, + { + "epoch": 0.11246831067066144, + "grad_norm": 0.4805825650691986, + "learning_rate": 9.823456275323348e-05, + "loss": 0.7682, + "step": 488 + }, + { + "epoch": 0.1126987785203964, + "grad_norm": 0.48237094283103943, + "learning_rate": 9.822471752518288e-05, + "loss": 0.7658, + "step": 489 + }, + { + "epoch": 0.11292924637013137, + "grad_norm": 0.44534748792648315, + "learning_rate": 9.821484541783221e-05, + "loss": 0.769, + "step": 490 + }, + { + "epoch": 0.11315971421986633, + "grad_norm": 0.4775294363498688, + "learning_rate": 9.820494643668396e-05, + "loss": 0.7654, + "step": 491 + }, + { + "epoch": 0.11339018206960129, + "grad_norm": 0.4316689968109131, + "learning_rate": 9.81950205872556e-05, + "loss": 0.7689, + "step": 492 + }, + { + "epoch": 0.11362064991933625, + "grad_norm": 0.4406385123729706, + "learning_rate": 9.818506787507952e-05, + "loss": 0.7615, + "step": 493 + }, + { + "epoch": 0.11385111776907121, + "grad_norm": 0.37954282760620117, + "learning_rate": 9.817508830570318e-05, + "loss": 0.7581, + "step": 494 + }, + { + "epoch": 0.11408158561880617, + "grad_norm": 0.37608101963996887, + "learning_rate": 9.81650818846889e-05, + "loss": 0.7619, + "step": 495 + }, + { + "epoch": 0.11431205346854113, + "grad_norm": 0.35068264603614807, + "learning_rate": 9.815504861761404e-05, + "loss": 0.763, + "step": 496 + }, + { + "epoch": 0.1145425213182761, + "grad_norm": 0.3810550570487976, + "learning_rate": 9.81449885100709e-05, + "loss": 0.7551, + "step": 497 + }, + { + "epoch": 0.11477298916801107, + "grad_norm": 0.3873169422149658, + "learning_rate": 9.813490156766676e-05, + "loss": 0.7606, + "step": 498 + }, + { + "epoch": 0.11500345701774603, + "grad_norm": 0.39904019236564636, + "learning_rate": 9.812478779602381e-05, + "loss": 0.7562, + "step": 499 + }, + { + "epoch": 0.11523392486748099, + "grad_norm": 0.379290908575058, + "learning_rate": 9.811464720077923e-05, + "loss": 0.7544, + "step": 500 + }, + { + "epoch": 0.11546439271721595, + "grad_norm": 0.3641830384731293, + "learning_rate": 9.810447978758517e-05, + "loss": 0.7587, + "step": 501 + }, + { + "epoch": 0.11569486056695091, + "grad_norm": 0.3822626769542694, + "learning_rate": 9.809428556210867e-05, + "loss": 0.7577, + "step": 502 + }, + { + "epoch": 0.11592532841668587, + "grad_norm": 0.42176035046577454, + "learning_rate": 9.808406453003175e-05, + "loss": 0.7602, + "step": 503 + }, + { + "epoch": 0.11615579626642084, + "grad_norm": 0.3986082971096039, + "learning_rate": 9.80738166970514e-05, + "loss": 0.7504, + "step": 504 + }, + { + "epoch": 0.1163862641161558, + "grad_norm": 0.3723549246788025, + "learning_rate": 9.806354206887949e-05, + "loss": 0.764, + "step": 505 + }, + { + "epoch": 0.11661673196589076, + "grad_norm": 0.3810165524482727, + "learning_rate": 9.805324065124283e-05, + "loss": 0.7632, + "step": 506 + }, + { + "epoch": 0.11684719981562572, + "grad_norm": 0.3603302538394928, + "learning_rate": 9.804291244988324e-05, + "loss": 0.7572, + "step": 507 + }, + { + "epoch": 0.11707766766536068, + "grad_norm": 0.3849985897541046, + "learning_rate": 9.803255747055737e-05, + "loss": 0.7562, + "step": 508 + }, + { + "epoch": 0.11730813551509564, + "grad_norm": 0.3381344974040985, + "learning_rate": 9.802217571903685e-05, + "loss": 0.7643, + "step": 509 + }, + { + "epoch": 0.1175386033648306, + "grad_norm": 0.35623160004615784, + "learning_rate": 9.80117672011082e-05, + "loss": 0.752, + "step": 510 + }, + { + "epoch": 0.11776907121456556, + "grad_norm": 0.36817556619644165, + "learning_rate": 9.800133192257291e-05, + "loss": 0.7709, + "step": 511 + }, + { + "epoch": 0.11799953906430052, + "grad_norm": 0.5149376392364502, + "learning_rate": 9.799086988924733e-05, + "loss": 0.7566, + "step": 512 + }, + { + "epoch": 0.1182300069140355, + "grad_norm": 0.4210844039916992, + "learning_rate": 9.798038110696275e-05, + "loss": 0.7523, + "step": 513 + }, + { + "epoch": 0.11846047476377046, + "grad_norm": 0.4472396671772003, + "learning_rate": 9.796986558156538e-05, + "loss": 0.7586, + "step": 514 + }, + { + "epoch": 0.11869094261350542, + "grad_norm": 0.45051324367523193, + "learning_rate": 9.79593233189163e-05, + "loss": 0.7561, + "step": 515 + }, + { + "epoch": 0.11892141046324038, + "grad_norm": 0.4668648838996887, + "learning_rate": 9.794875432489152e-05, + "loss": 0.7548, + "step": 516 + }, + { + "epoch": 0.11915187831297534, + "grad_norm": 0.3892306685447693, + "learning_rate": 9.793815860538197e-05, + "loss": 0.7536, + "step": 517 + }, + { + "epoch": 0.1193823461627103, + "grad_norm": 0.36023855209350586, + "learning_rate": 9.792753616629342e-05, + "loss": 0.7595, + "step": 518 + }, + { + "epoch": 0.11961281401244526, + "grad_norm": 0.40682411193847656, + "learning_rate": 9.791688701354656e-05, + "loss": 0.7583, + "step": 519 + }, + { + "epoch": 0.11984328186218023, + "grad_norm": 0.464751660823822, + "learning_rate": 9.790621115307699e-05, + "loss": 0.7654, + "step": 520 + }, + { + "epoch": 0.12007374971191519, + "grad_norm": 0.4933690130710602, + "learning_rate": 9.789550859083517e-05, + "loss": 0.7567, + "step": 521 + }, + { + "epoch": 0.12030421756165015, + "grad_norm": 0.459985613822937, + "learning_rate": 9.788477933278646e-05, + "loss": 0.7464, + "step": 522 + }, + { + "epoch": 0.12053468541138511, + "grad_norm": 0.43470343947410583, + "learning_rate": 9.787402338491108e-05, + "loss": 0.7539, + "step": 523 + }, + { + "epoch": 0.12076515326112007, + "grad_norm": 0.3725113868713379, + "learning_rate": 9.786324075320414e-05, + "loss": 0.7536, + "step": 524 + }, + { + "epoch": 0.12099562111085503, + "grad_norm": 0.3641432225704193, + "learning_rate": 9.785243144367562e-05, + "loss": 0.7546, + "step": 525 + }, + { + "epoch": 0.12122608896058999, + "grad_norm": 0.3699088990688324, + "learning_rate": 9.784159546235037e-05, + "loss": 0.7544, + "step": 526 + }, + { + "epoch": 0.12145655681032495, + "grad_norm": 0.3829246461391449, + "learning_rate": 9.78307328152681e-05, + "loss": 0.7613, + "step": 527 + }, + { + "epoch": 0.12168702466005993, + "grad_norm": 0.42919662594795227, + "learning_rate": 9.78198435084834e-05, + "loss": 0.7546, + "step": 528 + }, + { + "epoch": 0.12191749250979489, + "grad_norm": 0.446468323469162, + "learning_rate": 9.780892754806571e-05, + "loss": 0.7496, + "step": 529 + }, + { + "epoch": 0.12214796035952985, + "grad_norm": 0.459417462348938, + "learning_rate": 9.779798494009931e-05, + "loss": 0.7559, + "step": 530 + }, + { + "epoch": 0.12237842820926481, + "grad_norm": 0.4683552086353302, + "learning_rate": 9.778701569068336e-05, + "loss": 0.7681, + "step": 531 + }, + { + "epoch": 0.12260889605899977, + "grad_norm": 0.38343241810798645, + "learning_rate": 9.777601980593184e-05, + "loss": 0.7649, + "step": 532 + }, + { + "epoch": 0.12283936390873473, + "grad_norm": 0.4223579466342926, + "learning_rate": 9.776499729197362e-05, + "loss": 0.75, + "step": 533 + }, + { + "epoch": 0.1230698317584697, + "grad_norm": 0.4097927510738373, + "learning_rate": 9.775394815495236e-05, + "loss": 0.7476, + "step": 534 + }, + { + "epoch": 0.12330029960820466, + "grad_norm": 0.4324532151222229, + "learning_rate": 9.77428724010266e-05, + "loss": 0.7568, + "step": 535 + }, + { + "epoch": 0.12353076745793962, + "grad_norm": 0.4441714584827423, + "learning_rate": 9.773177003636969e-05, + "loss": 0.7574, + "step": 536 + }, + { + "epoch": 0.12376123530767458, + "grad_norm": 0.49834537506103516, + "learning_rate": 9.77206410671698e-05, + "loss": 0.7559, + "step": 537 + }, + { + "epoch": 0.12399170315740954, + "grad_norm": 0.5015109181404114, + "learning_rate": 9.770948549963e-05, + "loss": 0.7566, + "step": 538 + }, + { + "epoch": 0.1242221710071445, + "grad_norm": 0.3993091881275177, + "learning_rate": 9.76983033399681e-05, + "loss": 0.7574, + "step": 539 + }, + { + "epoch": 0.12445263885687946, + "grad_norm": 0.37793558835983276, + "learning_rate": 9.768709459441676e-05, + "loss": 0.7433, + "step": 540 + }, + { + "epoch": 0.12468310670661442, + "grad_norm": 0.48581400513648987, + "learning_rate": 9.76758592692235e-05, + "loss": 0.7547, + "step": 541 + }, + { + "epoch": 0.12491357455634938, + "grad_norm": 0.5075335502624512, + "learning_rate": 9.76645973706506e-05, + "loss": 0.7625, + "step": 542 + }, + { + "epoch": 0.12514404240608434, + "grad_norm": 0.5207210779190063, + "learning_rate": 9.765330890497518e-05, + "loss": 0.7551, + "step": 543 + }, + { + "epoch": 0.1253745102558193, + "grad_norm": 0.42284563183784485, + "learning_rate": 9.764199387848915e-05, + "loss": 0.7599, + "step": 544 + }, + { + "epoch": 0.12560497810555427, + "grad_norm": 0.3669079542160034, + "learning_rate": 9.763065229749923e-05, + "loss": 0.7433, + "step": 545 + }, + { + "epoch": 0.12583544595528923, + "grad_norm": 0.3668546676635742, + "learning_rate": 9.761928416832695e-05, + "loss": 0.7589, + "step": 546 + }, + { + "epoch": 0.1260659138050242, + "grad_norm": 0.520101010799408, + "learning_rate": 9.760788949730866e-05, + "loss": 0.7503, + "step": 547 + }, + { + "epoch": 0.12629638165475915, + "grad_norm": 0.5019710659980774, + "learning_rate": 9.759646829079543e-05, + "loss": 0.7548, + "step": 548 + }, + { + "epoch": 0.1265268495044941, + "grad_norm": 0.4565122425556183, + "learning_rate": 9.75850205551532e-05, + "loss": 0.7479, + "step": 549 + }, + { + "epoch": 0.1267573173542291, + "grad_norm": 0.4054650664329529, + "learning_rate": 9.757354629676265e-05, + "loss": 0.7455, + "step": 550 + }, + { + "epoch": 0.12698778520396406, + "grad_norm": 0.5145794153213501, + "learning_rate": 9.756204552201926e-05, + "loss": 0.7514, + "step": 551 + }, + { + "epoch": 0.12721825305369902, + "grad_norm": 0.4342021644115448, + "learning_rate": 9.755051823733328e-05, + "loss": 0.7457, + "step": 552 + }, + { + "epoch": 0.12744872090343398, + "grad_norm": 0.3916120231151581, + "learning_rate": 9.753896444912973e-05, + "loss": 0.7557, + "step": 553 + }, + { + "epoch": 0.12767918875316894, + "grad_norm": 0.3801400363445282, + "learning_rate": 9.752738416384844e-05, + "loss": 0.7486, + "step": 554 + }, + { + "epoch": 0.1279096566029039, + "grad_norm": 0.4345585107803345, + "learning_rate": 9.751577738794398e-05, + "loss": 0.7561, + "step": 555 + }, + { + "epoch": 0.12814012445263886, + "grad_norm": 0.3579169809818268, + "learning_rate": 9.750414412788567e-05, + "loss": 0.7552, + "step": 556 + }, + { + "epoch": 0.12837059230237383, + "grad_norm": 0.3728508949279785, + "learning_rate": 9.749248439015763e-05, + "loss": 0.7605, + "step": 557 + }, + { + "epoch": 0.1286010601521088, + "grad_norm": 0.4141934812068939, + "learning_rate": 9.74807981812587e-05, + "loss": 0.7514, + "step": 558 + }, + { + "epoch": 0.12883152800184375, + "grad_norm": 0.34202003479003906, + "learning_rate": 9.746908550770252e-05, + "loss": 0.742, + "step": 559 + }, + { + "epoch": 0.1290619958515787, + "grad_norm": 0.39449524879455566, + "learning_rate": 9.745734637601743e-05, + "loss": 0.745, + "step": 560 + }, + { + "epoch": 0.12929246370131367, + "grad_norm": 0.3760475218296051, + "learning_rate": 9.744558079274652e-05, + "loss": 0.7586, + "step": 561 + }, + { + "epoch": 0.12952293155104863, + "grad_norm": 0.3431721329689026, + "learning_rate": 9.743378876444769e-05, + "loss": 0.7523, + "step": 562 + }, + { + "epoch": 0.1297533994007836, + "grad_norm": 0.3754710555076599, + "learning_rate": 9.74219702976935e-05, + "loss": 0.7374, + "step": 563 + }, + { + "epoch": 0.12998386725051855, + "grad_norm": 0.45317164063453674, + "learning_rate": 9.741012539907131e-05, + "loss": 0.7599, + "step": 564 + }, + { + "epoch": 0.13021433510025351, + "grad_norm": 0.4640779495239258, + "learning_rate": 9.739825407518314e-05, + "loss": 0.7408, + "step": 565 + }, + { + "epoch": 0.13044480294998848, + "grad_norm": 0.49257683753967285, + "learning_rate": 9.738635633264581e-05, + "loss": 0.752, + "step": 566 + }, + { + "epoch": 0.13067527079972344, + "grad_norm": 0.4536359906196594, + "learning_rate": 9.737443217809083e-05, + "loss": 0.7566, + "step": 567 + }, + { + "epoch": 0.1309057386494584, + "grad_norm": 0.4063389003276825, + "learning_rate": 9.736248161816446e-05, + "loss": 0.7556, + "step": 568 + }, + { + "epoch": 0.13113620649919336, + "grad_norm": 0.33868643641471863, + "learning_rate": 9.735050465952761e-05, + "loss": 0.751, + "step": 569 + }, + { + "epoch": 0.13136667434892832, + "grad_norm": 0.3786289095878601, + "learning_rate": 9.733850130885598e-05, + "loss": 0.7568, + "step": 570 + }, + { + "epoch": 0.13159714219866328, + "grad_norm": 0.43096908926963806, + "learning_rate": 9.732647157283994e-05, + "loss": 0.7508, + "step": 571 + }, + { + "epoch": 0.13182761004839824, + "grad_norm": 0.39179953932762146, + "learning_rate": 9.731441545818459e-05, + "loss": 0.749, + "step": 572 + }, + { + "epoch": 0.1320580778981332, + "grad_norm": 0.3335493803024292, + "learning_rate": 9.730233297160969e-05, + "loss": 0.7491, + "step": 573 + }, + { + "epoch": 0.13228854574786816, + "grad_norm": 0.4363969564437866, + "learning_rate": 9.729022411984975e-05, + "loss": 0.7496, + "step": 574 + }, + { + "epoch": 0.13251901359760312, + "grad_norm": 0.3922342360019684, + "learning_rate": 9.727808890965396e-05, + "loss": 0.7481, + "step": 575 + }, + { + "epoch": 0.13274948144733809, + "grad_norm": 0.3694200813770294, + "learning_rate": 9.726592734778617e-05, + "loss": 0.7561, + "step": 576 + }, + { + "epoch": 0.13297994929707305, + "grad_norm": 0.40044036507606506, + "learning_rate": 9.725373944102496e-05, + "loss": 0.7463, + "step": 577 + }, + { + "epoch": 0.133210417146808, + "grad_norm": 0.39088964462280273, + "learning_rate": 9.724152519616358e-05, + "loss": 0.7528, + "step": 578 + }, + { + "epoch": 0.13344088499654297, + "grad_norm": 0.35432836413383484, + "learning_rate": 9.722928462000995e-05, + "loss": 0.7393, + "step": 579 + }, + { + "epoch": 0.13367135284627796, + "grad_norm": 0.36177772283554077, + "learning_rate": 9.721701771938666e-05, + "loss": 0.7511, + "step": 580 + }, + { + "epoch": 0.13390182069601292, + "grad_norm": 0.3491220474243164, + "learning_rate": 9.7204724501131e-05, + "loss": 0.7516, + "step": 581 + }, + { + "epoch": 0.13413228854574788, + "grad_norm": 0.3720824420452118, + "learning_rate": 9.719240497209493e-05, + "loss": 0.7416, + "step": 582 + }, + { + "epoch": 0.13436275639548284, + "grad_norm": 0.3904234766960144, + "learning_rate": 9.718005913914503e-05, + "loss": 0.7429, + "step": 583 + }, + { + "epoch": 0.1345932242452178, + "grad_norm": 0.3584286868572235, + "learning_rate": 9.716768700916258e-05, + "loss": 0.7443, + "step": 584 + }, + { + "epoch": 0.13482369209495276, + "grad_norm": 0.36260986328125, + "learning_rate": 9.715528858904353e-05, + "loss": 0.7491, + "step": 585 + }, + { + "epoch": 0.13505415994468772, + "grad_norm": 0.41411370038986206, + "learning_rate": 9.714286388569845e-05, + "loss": 0.7493, + "step": 586 + }, + { + "epoch": 0.13528462779442268, + "grad_norm": 0.4128418266773224, + "learning_rate": 9.713041290605254e-05, + "loss": 0.7531, + "step": 587 + }, + { + "epoch": 0.13551509564415765, + "grad_norm": 0.37664923071861267, + "learning_rate": 9.711793565704572e-05, + "loss": 0.7491, + "step": 588 + }, + { + "epoch": 0.1357455634938926, + "grad_norm": 0.3493359386920929, + "learning_rate": 9.71054321456325e-05, + "loss": 0.743, + "step": 589 + }, + { + "epoch": 0.13597603134362757, + "grad_norm": 0.4216603934764862, + "learning_rate": 9.709290237878202e-05, + "loss": 0.7446, + "step": 590 + }, + { + "epoch": 0.13620649919336253, + "grad_norm": 0.4375728964805603, + "learning_rate": 9.708034636347807e-05, + "loss": 0.7533, + "step": 591 + }, + { + "epoch": 0.1364369670430975, + "grad_norm": 0.3984435796737671, + "learning_rate": 9.70677641067191e-05, + "loss": 0.7379, + "step": 592 + }, + { + "epoch": 0.13666743489283245, + "grad_norm": 0.3685891330242157, + "learning_rate": 9.705515561551814e-05, + "loss": 0.7462, + "step": 593 + }, + { + "epoch": 0.1368979027425674, + "grad_norm": 0.35260993242263794, + "learning_rate": 9.704252089690284e-05, + "loss": 0.7491, + "step": 594 + }, + { + "epoch": 0.13712837059230237, + "grad_norm": 0.33761847019195557, + "learning_rate": 9.702985995791554e-05, + "loss": 0.7403, + "step": 595 + }, + { + "epoch": 0.13735883844203733, + "grad_norm": 0.4064388573169708, + "learning_rate": 9.701717280561309e-05, + "loss": 0.7511, + "step": 596 + }, + { + "epoch": 0.1375893062917723, + "grad_norm": 0.4629051089286804, + "learning_rate": 9.700445944706704e-05, + "loss": 0.746, + "step": 597 + }, + { + "epoch": 0.13781977414150726, + "grad_norm": 0.45266193151474, + "learning_rate": 9.699171988936349e-05, + "loss": 0.7394, + "step": 598 + }, + { + "epoch": 0.13805024199124222, + "grad_norm": 0.3218851387500763, + "learning_rate": 9.697895413960319e-05, + "loss": 0.7477, + "step": 599 + }, + { + "epoch": 0.13828070984097718, + "grad_norm": 0.4335274398326874, + "learning_rate": 9.696616220490143e-05, + "loss": 0.7356, + "step": 600 + }, + { + "epoch": 0.13851117769071214, + "grad_norm": 0.45627596974372864, + "learning_rate": 9.695334409238813e-05, + "loss": 0.746, + "step": 601 + }, + { + "epoch": 0.1387416455404471, + "grad_norm": 0.35524532198905945, + "learning_rate": 9.694049980920783e-05, + "loss": 0.7494, + "step": 602 + }, + { + "epoch": 0.13897211339018206, + "grad_norm": 0.40399858355522156, + "learning_rate": 9.69276293625196e-05, + "loss": 0.7468, + "step": 603 + }, + { + "epoch": 0.13920258123991702, + "grad_norm": 0.4082375168800354, + "learning_rate": 9.691473275949712e-05, + "loss": 0.7518, + "step": 604 + }, + { + "epoch": 0.13943304908965198, + "grad_norm": 0.3440902829170227, + "learning_rate": 9.690181000732864e-05, + "loss": 0.7421, + "step": 605 + }, + { + "epoch": 0.13966351693938694, + "grad_norm": 0.481870174407959, + "learning_rate": 9.688886111321703e-05, + "loss": 0.7558, + "step": 606 + }, + { + "epoch": 0.1398939847891219, + "grad_norm": 0.3942829370498657, + "learning_rate": 9.687588608437963e-05, + "loss": 0.7387, + "step": 607 + }, + { + "epoch": 0.14012445263885687, + "grad_norm": 0.3642619550228119, + "learning_rate": 9.686288492804846e-05, + "loss": 0.7369, + "step": 608 + }, + { + "epoch": 0.14035492048859183, + "grad_norm": 0.4085538685321808, + "learning_rate": 9.684985765147006e-05, + "loss": 0.7455, + "step": 609 + }, + { + "epoch": 0.14058538833832682, + "grad_norm": 0.36684495210647583, + "learning_rate": 9.683680426190547e-05, + "loss": 0.7427, + "step": 610 + }, + { + "epoch": 0.14081585618806178, + "grad_norm": 0.3928658068180084, + "learning_rate": 9.682372476663037e-05, + "loss": 0.7503, + "step": 611 + }, + { + "epoch": 0.14104632403779674, + "grad_norm": 0.40327298641204834, + "learning_rate": 9.681061917293497e-05, + "loss": 0.7497, + "step": 612 + }, + { + "epoch": 0.1412767918875317, + "grad_norm": 0.45683008432388306, + "learning_rate": 9.679748748812397e-05, + "loss": 0.7443, + "step": 613 + }, + { + "epoch": 0.14150725973726666, + "grad_norm": 0.3893783986568451, + "learning_rate": 9.678432971951669e-05, + "loss": 0.7487, + "step": 614 + }, + { + "epoch": 0.14173772758700162, + "grad_norm": 0.36293652653694153, + "learning_rate": 9.677114587444696e-05, + "loss": 0.7477, + "step": 615 + }, + { + "epoch": 0.14196819543673658, + "grad_norm": 0.3704957365989685, + "learning_rate": 9.67579359602631e-05, + "loss": 0.7442, + "step": 616 + }, + { + "epoch": 0.14219866328647154, + "grad_norm": 0.3900201618671417, + "learning_rate": 9.674469998432802e-05, + "loss": 0.7408, + "step": 617 + }, + { + "epoch": 0.1424291311362065, + "grad_norm": 0.35476595163345337, + "learning_rate": 9.673143795401915e-05, + "loss": 0.7369, + "step": 618 + }, + { + "epoch": 0.14265959898594147, + "grad_norm": 0.39594173431396484, + "learning_rate": 9.671814987672842e-05, + "loss": 0.7495, + "step": 619 + }, + { + "epoch": 0.14289006683567643, + "grad_norm": 0.3954264521598816, + "learning_rate": 9.670483575986229e-05, + "loss": 0.7536, + "step": 620 + }, + { + "epoch": 0.1431205346854114, + "grad_norm": 0.329731285572052, + "learning_rate": 9.66914956108417e-05, + "loss": 0.7432, + "step": 621 + }, + { + "epoch": 0.14335100253514635, + "grad_norm": 0.33711501955986023, + "learning_rate": 9.667812943710215e-05, + "loss": 0.733, + "step": 622 + }, + { + "epoch": 0.1435814703848813, + "grad_norm": 0.3300781548023224, + "learning_rate": 9.666473724609364e-05, + "loss": 0.7366, + "step": 623 + }, + { + "epoch": 0.14381193823461627, + "grad_norm": 0.32129067182540894, + "learning_rate": 9.665131904528063e-05, + "loss": 0.7457, + "step": 624 + }, + { + "epoch": 0.14404240608435123, + "grad_norm": 0.3335452973842621, + "learning_rate": 9.66378748421421e-05, + "loss": 0.7453, + "step": 625 + }, + { + "epoch": 0.1442728739340862, + "grad_norm": 0.31821250915527344, + "learning_rate": 9.662440464417155e-05, + "loss": 0.7381, + "step": 626 + }, + { + "epoch": 0.14450334178382115, + "grad_norm": 0.6273105144500732, + "learning_rate": 9.661090845887693e-05, + "loss": 0.7383, + "step": 627 + }, + { + "epoch": 0.14473380963355612, + "grad_norm": 0.32164305448532104, + "learning_rate": 9.65973862937807e-05, + "loss": 0.7398, + "step": 628 + }, + { + "epoch": 0.14496427748329108, + "grad_norm": 0.3327626585960388, + "learning_rate": 9.658383815641978e-05, + "loss": 0.7357, + "step": 629 + }, + { + "epoch": 0.14519474533302604, + "grad_norm": 0.30448755621910095, + "learning_rate": 9.657026405434557e-05, + "loss": 0.7309, + "step": 630 + }, + { + "epoch": 0.145425213182761, + "grad_norm": 0.30915525555610657, + "learning_rate": 9.655666399512397e-05, + "loss": 0.7407, + "step": 631 + }, + { + "epoch": 0.14565568103249596, + "grad_norm": 0.3181747794151306, + "learning_rate": 9.654303798633532e-05, + "loss": 0.7443, + "step": 632 + }, + { + "epoch": 0.14588614888223092, + "grad_norm": 0.3349412977695465, + "learning_rate": 9.652938603557442e-05, + "loss": 0.7459, + "step": 633 + }, + { + "epoch": 0.14611661673196588, + "grad_norm": 0.32056349515914917, + "learning_rate": 9.651570815045054e-05, + "loss": 0.7415, + "step": 634 + }, + { + "epoch": 0.14634708458170084, + "grad_norm": 0.34287700057029724, + "learning_rate": 9.650200433858741e-05, + "loss": 0.7412, + "step": 635 + }, + { + "epoch": 0.1465775524314358, + "grad_norm": 0.3507481813430786, + "learning_rate": 9.648827460762322e-05, + "loss": 0.7487, + "step": 636 + }, + { + "epoch": 0.14680802028117076, + "grad_norm": 0.3356325328350067, + "learning_rate": 9.647451896521055e-05, + "loss": 0.7423, + "step": 637 + }, + { + "epoch": 0.14703848813090573, + "grad_norm": 0.37254631519317627, + "learning_rate": 9.646073741901652e-05, + "loss": 0.743, + "step": 638 + }, + { + "epoch": 0.14726895598064071, + "grad_norm": 0.3373221755027771, + "learning_rate": 9.64469299767226e-05, + "loss": 0.7406, + "step": 639 + }, + { + "epoch": 0.14749942383037568, + "grad_norm": 0.3344241976737976, + "learning_rate": 9.643309664602474e-05, + "loss": 0.7377, + "step": 640 + }, + { + "epoch": 0.14772989168011064, + "grad_norm": 0.34580978751182556, + "learning_rate": 9.641923743463327e-05, + "loss": 0.7454, + "step": 641 + }, + { + "epoch": 0.1479603595298456, + "grad_norm": 0.31924277544021606, + "learning_rate": 9.640535235027303e-05, + "loss": 0.7369, + "step": 642 + }, + { + "epoch": 0.14819082737958056, + "grad_norm": 0.33480551838874817, + "learning_rate": 9.639144140068324e-05, + "loss": 0.7582, + "step": 643 + }, + { + "epoch": 0.14842129522931552, + "grad_norm": 0.3792799413204193, + "learning_rate": 9.637750459361748e-05, + "loss": 0.7495, + "step": 644 + }, + { + "epoch": 0.14865176307905048, + "grad_norm": 0.42829740047454834, + "learning_rate": 9.63635419368438e-05, + "loss": 0.7406, + "step": 645 + }, + { + "epoch": 0.14888223092878544, + "grad_norm": 0.48589882254600525, + "learning_rate": 9.634955343814469e-05, + "loss": 0.7468, + "step": 646 + }, + { + "epoch": 0.1491126987785204, + "grad_norm": 0.43215352296829224, + "learning_rate": 9.633553910531697e-05, + "loss": 0.7444, + "step": 647 + }, + { + "epoch": 0.14934316662825536, + "grad_norm": 0.38566887378692627, + "learning_rate": 9.632149894617191e-05, + "loss": 0.7447, + "step": 648 + }, + { + "epoch": 0.14957363447799032, + "grad_norm": 0.31605324149131775, + "learning_rate": 9.630743296853513e-05, + "loss": 0.7462, + "step": 649 + }, + { + "epoch": 0.14980410232772529, + "grad_norm": 0.3389308452606201, + "learning_rate": 9.629334118024669e-05, + "loss": 0.7345, + "step": 650 + }, + { + "epoch": 0.15003457017746025, + "grad_norm": 0.3665103614330292, + "learning_rate": 9.627922358916102e-05, + "loss": 0.7393, + "step": 651 + }, + { + "epoch": 0.1502650380271952, + "grad_norm": 0.33083248138427734, + "learning_rate": 9.626508020314693e-05, + "loss": 0.7439, + "step": 652 + }, + { + "epoch": 0.15049550587693017, + "grad_norm": 0.3108620345592499, + "learning_rate": 9.625091103008757e-05, + "loss": 0.7347, + "step": 653 + }, + { + "epoch": 0.15072597372666513, + "grad_norm": 0.32993149757385254, + "learning_rate": 9.623671607788054e-05, + "loss": 0.7429, + "step": 654 + }, + { + "epoch": 0.1509564415764001, + "grad_norm": 0.35567355155944824, + "learning_rate": 9.622249535443773e-05, + "loss": 0.7365, + "step": 655 + }, + { + "epoch": 0.15118690942613505, + "grad_norm": 0.3489120602607727, + "learning_rate": 9.620824886768545e-05, + "loss": 0.7314, + "step": 656 + }, + { + "epoch": 0.15141737727587, + "grad_norm": 0.34390199184417725, + "learning_rate": 9.619397662556435e-05, + "loss": 0.7468, + "step": 657 + }, + { + "epoch": 0.15164784512560497, + "grad_norm": 0.3184950351715088, + "learning_rate": 9.617967863602941e-05, + "loss": 0.7474, + "step": 658 + }, + { + "epoch": 0.15187831297533994, + "grad_norm": 0.3136204779148102, + "learning_rate": 9.616535490705004e-05, + "loss": 0.7423, + "step": 659 + }, + { + "epoch": 0.1521087808250749, + "grad_norm": 0.29085636138916016, + "learning_rate": 9.615100544660987e-05, + "loss": 0.7374, + "step": 660 + }, + { + "epoch": 0.15233924867480986, + "grad_norm": 0.35054919123649597, + "learning_rate": 9.6136630262707e-05, + "loss": 0.7436, + "step": 661 + }, + { + "epoch": 0.15256971652454482, + "grad_norm": 0.32511407136917114, + "learning_rate": 9.61222293633538e-05, + "loss": 0.7401, + "step": 662 + }, + { + "epoch": 0.15280018437427978, + "grad_norm": 0.3348987400531769, + "learning_rate": 9.610780275657698e-05, + "loss": 0.7387, + "step": 663 + }, + { + "epoch": 0.15303065222401474, + "grad_norm": 0.3278294801712036, + "learning_rate": 9.609335045041759e-05, + "loss": 0.7402, + "step": 664 + }, + { + "epoch": 0.1532611200737497, + "grad_norm": 0.3211444020271301, + "learning_rate": 9.6078872452931e-05, + "loss": 0.7448, + "step": 665 + }, + { + "epoch": 0.15349158792348466, + "grad_norm": 0.341467946767807, + "learning_rate": 9.606436877218688e-05, + "loss": 0.7395, + "step": 666 + }, + { + "epoch": 0.15372205577321962, + "grad_norm": 0.3915879726409912, + "learning_rate": 9.604983941626924e-05, + "loss": 0.7405, + "step": 667 + }, + { + "epoch": 0.15395252362295458, + "grad_norm": 0.41400331258773804, + "learning_rate": 9.603528439327642e-05, + "loss": 0.7456, + "step": 668 + }, + { + "epoch": 0.15418299147268957, + "grad_norm": 0.35899117588996887, + "learning_rate": 9.602070371132102e-05, + "loss": 0.7282, + "step": 669 + }, + { + "epoch": 0.15441345932242453, + "grad_norm": 0.273082971572876, + "learning_rate": 9.600609737852995e-05, + "loss": 0.7323, + "step": 670 + }, + { + "epoch": 0.1546439271721595, + "grad_norm": 0.34714290499687195, + "learning_rate": 9.599146540304444e-05, + "loss": 0.7426, + "step": 671 + }, + { + "epoch": 0.15487439502189446, + "grad_norm": 0.3830714523792267, + "learning_rate": 9.597680779302e-05, + "loss": 0.7362, + "step": 672 + }, + { + "epoch": 0.15510486287162942, + "grad_norm": 0.30310484766960144, + "learning_rate": 9.596212455662645e-05, + "loss": 0.7368, + "step": 673 + }, + { + "epoch": 0.15533533072136438, + "grad_norm": 0.3948887586593628, + "learning_rate": 9.594741570204787e-05, + "loss": 0.7377, + "step": 674 + }, + { + "epoch": 0.15556579857109934, + "grad_norm": 0.42148318886756897, + "learning_rate": 9.593268123748259e-05, + "loss": 0.7415, + "step": 675 + }, + { + "epoch": 0.1557962664208343, + "grad_norm": 0.5412166118621826, + "learning_rate": 9.591792117114328e-05, + "loss": 0.7277, + "step": 676 + }, + { + "epoch": 0.15602673427056926, + "grad_norm": 0.6122573614120483, + "learning_rate": 9.590313551125683e-05, + "loss": 0.7376, + "step": 677 + }, + { + "epoch": 0.15625720212030422, + "grad_norm": 0.4985668361186981, + "learning_rate": 9.58883242660644e-05, + "loss": 0.742, + "step": 678 + }, + { + "epoch": 0.15648766997003918, + "grad_norm": 0.356157124042511, + "learning_rate": 9.587348744382145e-05, + "loss": 0.73, + "step": 679 + }, + { + "epoch": 0.15671813781977414, + "grad_norm": 0.3795771300792694, + "learning_rate": 9.585862505279766e-05, + "loss": 0.735, + "step": 680 + }, + { + "epoch": 0.1569486056695091, + "grad_norm": 0.4135006070137024, + "learning_rate": 9.584373710127697e-05, + "loss": 0.7361, + "step": 681 + }, + { + "epoch": 0.15717907351924407, + "grad_norm": 0.3758498728275299, + "learning_rate": 9.582882359755756e-05, + "loss": 0.7335, + "step": 682 + }, + { + "epoch": 0.15740954136897903, + "grad_norm": 0.35648760199546814, + "learning_rate": 9.581388454995187e-05, + "loss": 0.7387, + "step": 683 + }, + { + "epoch": 0.157640009218714, + "grad_norm": 0.3326440453529358, + "learning_rate": 9.579891996678655e-05, + "loss": 0.7326, + "step": 684 + }, + { + "epoch": 0.15787047706844895, + "grad_norm": 0.3023831248283386, + "learning_rate": 9.578392985640252e-05, + "loss": 0.7302, + "step": 685 + }, + { + "epoch": 0.1581009449181839, + "grad_norm": 0.3539700508117676, + "learning_rate": 9.576891422715489e-05, + "loss": 0.729, + "step": 686 + }, + { + "epoch": 0.15833141276791887, + "grad_norm": 0.38474544882774353, + "learning_rate": 9.575387308741301e-05, + "loss": 0.7299, + "step": 687 + }, + { + "epoch": 0.15856188061765383, + "grad_norm": 0.3182377517223358, + "learning_rate": 9.573880644556047e-05, + "loss": 0.7424, + "step": 688 + }, + { + "epoch": 0.1587923484673888, + "grad_norm": 0.3241504430770874, + "learning_rate": 9.572371430999506e-05, + "loss": 0.7363, + "step": 689 + }, + { + "epoch": 0.15902281631712376, + "grad_norm": 0.3680818974971771, + "learning_rate": 9.570859668912873e-05, + "loss": 0.7336, + "step": 690 + }, + { + "epoch": 0.15925328416685872, + "grad_norm": 0.3773689270019531, + "learning_rate": 9.569345359138771e-05, + "loss": 0.7435, + "step": 691 + }, + { + "epoch": 0.15948375201659368, + "grad_norm": 0.34049540758132935, + "learning_rate": 9.567828502521239e-05, + "loss": 0.7327, + "step": 692 + }, + { + "epoch": 0.15971421986632864, + "grad_norm": 0.34470391273498535, + "learning_rate": 9.566309099905739e-05, + "loss": 0.7308, + "step": 693 + }, + { + "epoch": 0.1599446877160636, + "grad_norm": 0.3475648760795593, + "learning_rate": 9.564787152139146e-05, + "loss": 0.7393, + "step": 694 + }, + { + "epoch": 0.16017515556579856, + "grad_norm": 0.304973840713501, + "learning_rate": 9.563262660069759e-05, + "loss": 0.7398, + "step": 695 + }, + { + "epoch": 0.16040562341553352, + "grad_norm": 0.33955731987953186, + "learning_rate": 9.561735624547294e-05, + "loss": 0.7388, + "step": 696 + }, + { + "epoch": 0.16063609126526848, + "grad_norm": 0.31284675002098083, + "learning_rate": 9.560206046422881e-05, + "loss": 0.7468, + "step": 697 + }, + { + "epoch": 0.16086655911500344, + "grad_norm": 0.3111793100833893, + "learning_rate": 9.558673926549075e-05, + "loss": 0.7445, + "step": 698 + }, + { + "epoch": 0.16109702696473843, + "grad_norm": 0.32062941789627075, + "learning_rate": 9.557139265779838e-05, + "loss": 0.7373, + "step": 699 + }, + { + "epoch": 0.1613274948144734, + "grad_norm": 0.32092171907424927, + "learning_rate": 9.555602064970554e-05, + "loss": 0.7315, + "step": 700 + }, + { + "epoch": 0.16155796266420835, + "grad_norm": 0.29449769854545593, + "learning_rate": 9.554062324978025e-05, + "loss": 0.7327, + "step": 701 + }, + { + "epoch": 0.16178843051394332, + "grad_norm": 0.339097261428833, + "learning_rate": 9.552520046660462e-05, + "loss": 0.7433, + "step": 702 + }, + { + "epoch": 0.16201889836367828, + "grad_norm": 0.3251134157180786, + "learning_rate": 9.550975230877495e-05, + "loss": 0.742, + "step": 703 + }, + { + "epoch": 0.16224936621341324, + "grad_norm": 0.2941744923591614, + "learning_rate": 9.549427878490168e-05, + "loss": 0.7321, + "step": 704 + }, + { + "epoch": 0.1624798340631482, + "grad_norm": 0.31333979964256287, + "learning_rate": 9.547877990360935e-05, + "loss": 0.7374, + "step": 705 + }, + { + "epoch": 0.16271030191288316, + "grad_norm": 0.3424711525440216, + "learning_rate": 9.546325567353671e-05, + "loss": 0.7381, + "step": 706 + }, + { + "epoch": 0.16294076976261812, + "grad_norm": 0.37201154232025146, + "learning_rate": 9.544770610333655e-05, + "loss": 0.7333, + "step": 707 + }, + { + "epoch": 0.16317123761235308, + "grad_norm": 0.29893067479133606, + "learning_rate": 9.543213120167586e-05, + "loss": 0.7397, + "step": 708 + }, + { + "epoch": 0.16340170546208804, + "grad_norm": 0.2980719804763794, + "learning_rate": 9.54165309772357e-05, + "loss": 0.7293, + "step": 709 + }, + { + "epoch": 0.163632173311823, + "grad_norm": 0.3534228503704071, + "learning_rate": 9.540090543871126e-05, + "loss": 0.7227, + "step": 710 + }, + { + "epoch": 0.16386264116155796, + "grad_norm": 0.42632773518562317, + "learning_rate": 9.538525459481185e-05, + "loss": 0.7304, + "step": 711 + }, + { + "epoch": 0.16409310901129293, + "grad_norm": 0.3898342549800873, + "learning_rate": 9.536957845426086e-05, + "loss": 0.7268, + "step": 712 + }, + { + "epoch": 0.1643235768610279, + "grad_norm": 0.290547251701355, + "learning_rate": 9.535387702579581e-05, + "loss": 0.7349, + "step": 713 + }, + { + "epoch": 0.16455404471076285, + "grad_norm": 0.2991439402103424, + "learning_rate": 9.53381503181683e-05, + "loss": 0.7406, + "step": 714 + }, + { + "epoch": 0.1647845125604978, + "grad_norm": 0.30085012316703796, + "learning_rate": 9.5322398340144e-05, + "loss": 0.7353, + "step": 715 + }, + { + "epoch": 0.16501498041023277, + "grad_norm": 0.39195868372917175, + "learning_rate": 9.53066211005027e-05, + "loss": 0.7314, + "step": 716 + }, + { + "epoch": 0.16524544825996773, + "grad_norm": 0.3485901951789856, + "learning_rate": 9.529081860803825e-05, + "loss": 0.7291, + "step": 717 + }, + { + "epoch": 0.1654759161097027, + "grad_norm": 0.27913734316825867, + "learning_rate": 9.527499087155857e-05, + "loss": 0.7291, + "step": 718 + }, + { + "epoch": 0.16570638395943765, + "grad_norm": 0.3681480288505554, + "learning_rate": 9.52591378998857e-05, + "loss": 0.7409, + "step": 719 + }, + { + "epoch": 0.16593685180917261, + "grad_norm": 0.37066853046417236, + "learning_rate": 9.524325970185565e-05, + "loss": 0.7291, + "step": 720 + }, + { + "epoch": 0.16616731965890758, + "grad_norm": 0.2994784712791443, + "learning_rate": 9.52273562863186e-05, + "loss": 0.7274, + "step": 721 + }, + { + "epoch": 0.16639778750864254, + "grad_norm": 0.3262505531311035, + "learning_rate": 9.521142766213869e-05, + "loss": 0.7387, + "step": 722 + }, + { + "epoch": 0.1666282553583775, + "grad_norm": 0.37113088369369507, + "learning_rate": 9.519547383819416e-05, + "loss": 0.7303, + "step": 723 + }, + { + "epoch": 0.16685872320811246, + "grad_norm": 0.35534918308258057, + "learning_rate": 9.517949482337732e-05, + "loss": 0.7383, + "step": 724 + }, + { + "epoch": 0.16708919105784742, + "grad_norm": 0.3642905056476593, + "learning_rate": 9.516349062659444e-05, + "loss": 0.7299, + "step": 725 + }, + { + "epoch": 0.16731965890758238, + "grad_norm": 0.3798482418060303, + "learning_rate": 9.514746125676593e-05, + "loss": 0.7279, + "step": 726 + }, + { + "epoch": 0.16755012675731734, + "grad_norm": 0.38027462363243103, + "learning_rate": 9.513140672282612e-05, + "loss": 0.7324, + "step": 727 + }, + { + "epoch": 0.1677805946070523, + "grad_norm": 0.34657275676727295, + "learning_rate": 9.511532703372348e-05, + "loss": 0.7394, + "step": 728 + }, + { + "epoch": 0.1680110624567873, + "grad_norm": 0.35259348154067993, + "learning_rate": 9.50992221984204e-05, + "loss": 0.7282, + "step": 729 + }, + { + "epoch": 0.16824153030652225, + "grad_norm": 0.3222431242465973, + "learning_rate": 9.508309222589333e-05, + "loss": 0.7242, + "step": 730 + }, + { + "epoch": 0.1684719981562572, + "grad_norm": 0.32721391320228577, + "learning_rate": 9.506693712513274e-05, + "loss": 0.7302, + "step": 731 + }, + { + "epoch": 0.16870246600599217, + "grad_norm": 0.3368002474308014, + "learning_rate": 9.505075690514312e-05, + "loss": 0.7265, + "step": 732 + }, + { + "epoch": 0.16893293385572714, + "grad_norm": 0.28599828481674194, + "learning_rate": 9.503455157494289e-05, + "loss": 0.7267, + "step": 733 + }, + { + "epoch": 0.1691634017054621, + "grad_norm": 0.30026113986968994, + "learning_rate": 9.501832114356453e-05, + "loss": 0.7375, + "step": 734 + }, + { + "epoch": 0.16939386955519706, + "grad_norm": 0.3616209030151367, + "learning_rate": 9.500206562005451e-05, + "loss": 0.736, + "step": 735 + }, + { + "epoch": 0.16962433740493202, + "grad_norm": 0.34600475430488586, + "learning_rate": 9.498578501347327e-05, + "loss": 0.7412, + "step": 736 + }, + { + "epoch": 0.16985480525466698, + "grad_norm": 0.33249667286872864, + "learning_rate": 9.49694793328952e-05, + "loss": 0.7355, + "step": 737 + }, + { + "epoch": 0.17008527310440194, + "grad_norm": 0.31944364309310913, + "learning_rate": 9.495314858740869e-05, + "loss": 0.7263, + "step": 738 + }, + { + "epoch": 0.1703157409541369, + "grad_norm": 0.29929810762405396, + "learning_rate": 9.493679278611616e-05, + "loss": 0.7244, + "step": 739 + }, + { + "epoch": 0.17054620880387186, + "grad_norm": 0.3145100474357605, + "learning_rate": 9.492041193813388e-05, + "loss": 0.7316, + "step": 740 + }, + { + "epoch": 0.17077667665360682, + "grad_norm": 0.34142231941223145, + "learning_rate": 9.490400605259218e-05, + "loss": 0.7281, + "step": 741 + }, + { + "epoch": 0.17100714450334178, + "grad_norm": 0.39629876613616943, + "learning_rate": 9.488757513863531e-05, + "loss": 0.7353, + "step": 742 + }, + { + "epoch": 0.17123761235307675, + "grad_norm": 0.43608033657073975, + "learning_rate": 9.487111920542143e-05, + "loss": 0.7323, + "step": 743 + }, + { + "epoch": 0.1714680802028117, + "grad_norm": 0.4607144296169281, + "learning_rate": 9.48546382621227e-05, + "loss": 0.7307, + "step": 744 + }, + { + "epoch": 0.17169854805254667, + "grad_norm": 0.4801574945449829, + "learning_rate": 9.483813231792523e-05, + "loss": 0.7299, + "step": 745 + }, + { + "epoch": 0.17192901590228163, + "grad_norm": 0.4828302264213562, + "learning_rate": 9.4821601382029e-05, + "loss": 0.734, + "step": 746 + }, + { + "epoch": 0.1721594837520166, + "grad_norm": 0.35751068592071533, + "learning_rate": 9.4805045463648e-05, + "loss": 0.7269, + "step": 747 + }, + { + "epoch": 0.17238995160175155, + "grad_norm": 0.35991883277893066, + "learning_rate": 9.478846457201003e-05, + "loss": 0.7308, + "step": 748 + }, + { + "epoch": 0.1726204194514865, + "grad_norm": 0.38963592052459717, + "learning_rate": 9.477185871635694e-05, + "loss": 0.7394, + "step": 749 + }, + { + "epoch": 0.17285088730122147, + "grad_norm": 0.3973761796951294, + "learning_rate": 9.475522790594443e-05, + "loss": 0.7279, + "step": 750 + }, + { + "epoch": 0.17308135515095643, + "grad_norm": 0.35446715354919434, + "learning_rate": 9.473857215004208e-05, + "loss": 0.7328, + "step": 751 + }, + { + "epoch": 0.1733118230006914, + "grad_norm": 0.32000207901000977, + "learning_rate": 9.472189145793345e-05, + "loss": 0.7304, + "step": 752 + }, + { + "epoch": 0.17354229085042636, + "grad_norm": 0.3462921679019928, + "learning_rate": 9.470518583891592e-05, + "loss": 0.7266, + "step": 753 + }, + { + "epoch": 0.17377275870016132, + "grad_norm": 0.37639400362968445, + "learning_rate": 9.468845530230084e-05, + "loss": 0.7261, + "step": 754 + }, + { + "epoch": 0.17400322654989628, + "grad_norm": 0.3057329058647156, + "learning_rate": 9.467169985741337e-05, + "loss": 0.7332, + "step": 755 + }, + { + "epoch": 0.17423369439963124, + "grad_norm": 0.33508437871932983, + "learning_rate": 9.465491951359265e-05, + "loss": 0.7242, + "step": 756 + }, + { + "epoch": 0.1744641622493662, + "grad_norm": 0.41482967138290405, + "learning_rate": 9.463811428019156e-05, + "loss": 0.7284, + "step": 757 + }, + { + "epoch": 0.1746946300991012, + "grad_norm": 0.3846109211444855, + "learning_rate": 9.4621284166577e-05, + "loss": 0.7358, + "step": 758 + }, + { + "epoch": 0.17492509794883615, + "grad_norm": 0.35918307304382324, + "learning_rate": 9.460442918212965e-05, + "loss": 0.7249, + "step": 759 + }, + { + "epoch": 0.1751555657985711, + "grad_norm": 0.38481009006500244, + "learning_rate": 9.458754933624406e-05, + "loss": 0.736, + "step": 760 + }, + { + "epoch": 0.17538603364830607, + "grad_norm": 0.34131038188934326, + "learning_rate": 9.457064463832868e-05, + "loss": 0.7291, + "step": 761 + }, + { + "epoch": 0.17561650149804103, + "grad_norm": 0.3763889670372009, + "learning_rate": 9.455371509780575e-05, + "loss": 0.7322, + "step": 762 + }, + { + "epoch": 0.175846969347776, + "grad_norm": 0.36019256711006165, + "learning_rate": 9.453676072411142e-05, + "loss": 0.7206, + "step": 763 + }, + { + "epoch": 0.17607743719751096, + "grad_norm": 0.324862003326416, + "learning_rate": 9.451978152669563e-05, + "loss": 0.7277, + "step": 764 + }, + { + "epoch": 0.17630790504724592, + "grad_norm": 0.3834820091724396, + "learning_rate": 9.450277751502218e-05, + "loss": 0.7416, + "step": 765 + }, + { + "epoch": 0.17653837289698088, + "grad_norm": 0.37926480174064636, + "learning_rate": 9.44857486985687e-05, + "loss": 0.7293, + "step": 766 + }, + { + "epoch": 0.17676884074671584, + "grad_norm": 0.3776107430458069, + "learning_rate": 9.446869508682666e-05, + "loss": 0.7384, + "step": 767 + }, + { + "epoch": 0.1769993085964508, + "grad_norm": 0.2972352206707001, + "learning_rate": 9.445161668930129e-05, + "loss": 0.7351, + "step": 768 + }, + { + "epoch": 0.17722977644618576, + "grad_norm": 0.2998862862586975, + "learning_rate": 9.443451351551174e-05, + "loss": 0.7326, + "step": 769 + }, + { + "epoch": 0.17746024429592072, + "grad_norm": 0.31847459077835083, + "learning_rate": 9.441738557499087e-05, + "loss": 0.7267, + "step": 770 + }, + { + "epoch": 0.17769071214565568, + "grad_norm": 0.3504515588283539, + "learning_rate": 9.440023287728537e-05, + "loss": 0.732, + "step": 771 + }, + { + "epoch": 0.17792117999539064, + "grad_norm": 0.2948881983757019, + "learning_rate": 9.438305543195579e-05, + "loss": 0.735, + "step": 772 + }, + { + "epoch": 0.1781516478451256, + "grad_norm": 0.3130739629268646, + "learning_rate": 9.436585324857636e-05, + "loss": 0.7325, + "step": 773 + }, + { + "epoch": 0.17838211569486057, + "grad_norm": 0.30023884773254395, + "learning_rate": 9.434862633673523e-05, + "loss": 0.7274, + "step": 774 + }, + { + "epoch": 0.17861258354459553, + "grad_norm": 0.36194613575935364, + "learning_rate": 9.433137470603424e-05, + "loss": 0.7273, + "step": 775 + }, + { + "epoch": 0.1788430513943305, + "grad_norm": 0.330105721950531, + "learning_rate": 9.431409836608902e-05, + "loss": 0.7236, + "step": 776 + }, + { + "epoch": 0.17907351924406545, + "grad_norm": 0.3341142535209656, + "learning_rate": 9.429679732652901e-05, + "loss": 0.7262, + "step": 777 + }, + { + "epoch": 0.1793039870938004, + "grad_norm": 0.32330140471458435, + "learning_rate": 9.427947159699738e-05, + "loss": 0.7299, + "step": 778 + }, + { + "epoch": 0.17953445494353537, + "grad_norm": 0.3477638363838196, + "learning_rate": 9.426212118715108e-05, + "loss": 0.7379, + "step": 779 + }, + { + "epoch": 0.17976492279327033, + "grad_norm": 0.36434227228164673, + "learning_rate": 9.424474610666082e-05, + "loss": 0.7247, + "step": 780 + }, + { + "epoch": 0.1799953906430053, + "grad_norm": 0.3425809144973755, + "learning_rate": 9.422734636521104e-05, + "loss": 0.7276, + "step": 781 + }, + { + "epoch": 0.18022585849274025, + "grad_norm": 0.33053991198539734, + "learning_rate": 9.420992197249994e-05, + "loss": 0.7255, + "step": 782 + }, + { + "epoch": 0.18045632634247522, + "grad_norm": 0.3649672567844391, + "learning_rate": 9.419247293823947e-05, + "loss": 0.7249, + "step": 783 + }, + { + "epoch": 0.18068679419221018, + "grad_norm": 0.4266059696674347, + "learning_rate": 9.417499927215528e-05, + "loss": 0.7247, + "step": 784 + }, + { + "epoch": 0.18091726204194514, + "grad_norm": 0.360500693321228, + "learning_rate": 9.41575009839868e-05, + "loss": 0.7343, + "step": 785 + }, + { + "epoch": 0.1811477298916801, + "grad_norm": 0.37427565455436707, + "learning_rate": 9.413997808348714e-05, + "loss": 0.7195, + "step": 786 + }, + { + "epoch": 0.18137819774141506, + "grad_norm": 0.42820265889167786, + "learning_rate": 9.412243058042315e-05, + "loss": 0.7426, + "step": 787 + }, + { + "epoch": 0.18160866559115005, + "grad_norm": 0.36763229966163635, + "learning_rate": 9.410485848457537e-05, + "loss": 0.7335, + "step": 788 + }, + { + "epoch": 0.181839133440885, + "grad_norm": 0.3920682668685913, + "learning_rate": 9.408726180573811e-05, + "loss": 0.7293, + "step": 789 + }, + { + "epoch": 0.18206960129061997, + "grad_norm": 0.41110143065452576, + "learning_rate": 9.406964055371928e-05, + "loss": 0.7226, + "step": 790 + }, + { + "epoch": 0.18230006914035493, + "grad_norm": 0.31723544001579285, + "learning_rate": 9.405199473834057e-05, + "loss": 0.7195, + "step": 791 + }, + { + "epoch": 0.1825305369900899, + "grad_norm": 0.3424417972564697, + "learning_rate": 9.403432436943733e-05, + "loss": 0.727, + "step": 792 + }, + { + "epoch": 0.18276100483982485, + "grad_norm": 0.4001381993293762, + "learning_rate": 9.40166294568586e-05, + "loss": 0.7267, + "step": 793 + }, + { + "epoch": 0.18299147268955981, + "grad_norm": 0.4231438636779785, + "learning_rate": 9.399891001046712e-05, + "loss": 0.7256, + "step": 794 + }, + { + "epoch": 0.18322194053929478, + "grad_norm": 0.3733879029750824, + "learning_rate": 9.398116604013925e-05, + "loss": 0.7341, + "step": 795 + }, + { + "epoch": 0.18345240838902974, + "grad_norm": 0.28780287504196167, + "learning_rate": 9.396339755576506e-05, + "loss": 0.7255, + "step": 796 + }, + { + "epoch": 0.1836828762387647, + "grad_norm": 0.3469153046607971, + "learning_rate": 9.39456045672483e-05, + "loss": 0.716, + "step": 797 + }, + { + "epoch": 0.18391334408849966, + "grad_norm": 0.30638718605041504, + "learning_rate": 9.392778708450633e-05, + "loss": 0.7111, + "step": 798 + }, + { + "epoch": 0.18414381193823462, + "grad_norm": 0.3063083291053772, + "learning_rate": 9.39099451174702e-05, + "loss": 0.7271, + "step": 799 + }, + { + "epoch": 0.18437427978796958, + "grad_norm": 0.32252591848373413, + "learning_rate": 9.389207867608461e-05, + "loss": 0.7266, + "step": 800 + }, + { + "epoch": 0.18460474763770454, + "grad_norm": 0.3048442304134369, + "learning_rate": 9.387418777030784e-05, + "loss": 0.7214, + "step": 801 + }, + { + "epoch": 0.1848352154874395, + "grad_norm": 0.3063707649707794, + "learning_rate": 9.385627241011187e-05, + "loss": 0.7223, + "step": 802 + }, + { + "epoch": 0.18506568333717446, + "grad_norm": 0.32035815715789795, + "learning_rate": 9.383833260548233e-05, + "loss": 0.724, + "step": 803 + }, + { + "epoch": 0.18529615118690942, + "grad_norm": 0.3516310453414917, + "learning_rate": 9.382036836641839e-05, + "loss": 0.7246, + "step": 804 + }, + { + "epoch": 0.18552661903664439, + "grad_norm": 0.34494486451148987, + "learning_rate": 9.380237970293291e-05, + "loss": 0.7116, + "step": 805 + }, + { + "epoch": 0.18575708688637935, + "grad_norm": 0.34283608198165894, + "learning_rate": 9.378436662505232e-05, + "loss": 0.7211, + "step": 806 + }, + { + "epoch": 0.1859875547361143, + "grad_norm": 0.30393049120903015, + "learning_rate": 9.376632914281669e-05, + "loss": 0.724, + "step": 807 + }, + { + "epoch": 0.18621802258584927, + "grad_norm": 0.27396926283836365, + "learning_rate": 9.374826726627969e-05, + "loss": 0.7202, + "step": 808 + }, + { + "epoch": 0.18644849043558423, + "grad_norm": 0.3302895724773407, + "learning_rate": 9.373018100550855e-05, + "loss": 0.7303, + "step": 809 + }, + { + "epoch": 0.1866789582853192, + "grad_norm": 0.34311676025390625, + "learning_rate": 9.371207037058414e-05, + "loss": 0.7205, + "step": 810 + }, + { + "epoch": 0.18690942613505415, + "grad_norm": 0.3147958219051361, + "learning_rate": 9.369393537160089e-05, + "loss": 0.7234, + "step": 811 + }, + { + "epoch": 0.1871398939847891, + "grad_norm": 0.29841968417167664, + "learning_rate": 9.36757760186668e-05, + "loss": 0.7242, + "step": 812 + }, + { + "epoch": 0.18737036183452407, + "grad_norm": 0.3176330029964447, + "learning_rate": 9.365759232190348e-05, + "loss": 0.7222, + "step": 813 + }, + { + "epoch": 0.18760082968425904, + "grad_norm": 0.3466809093952179, + "learning_rate": 9.363938429144605e-05, + "loss": 0.7221, + "step": 814 + }, + { + "epoch": 0.187831297533994, + "grad_norm": 0.35498663783073425, + "learning_rate": 9.362115193744328e-05, + "loss": 0.7221, + "step": 815 + }, + { + "epoch": 0.18806176538372896, + "grad_norm": 0.4183621108531952, + "learning_rate": 9.360289527005739e-05, + "loss": 0.7296, + "step": 816 + }, + { + "epoch": 0.18829223323346392, + "grad_norm": 0.41364750266075134, + "learning_rate": 9.358461429946425e-05, + "loss": 0.7226, + "step": 817 + }, + { + "epoch": 0.1885227010831989, + "grad_norm": 0.3849059045314789, + "learning_rate": 9.356630903585321e-05, + "loss": 0.7242, + "step": 818 + }, + { + "epoch": 0.18875316893293387, + "grad_norm": 0.3003298342227936, + "learning_rate": 9.354797948942719e-05, + "loss": 0.7267, + "step": 819 + }, + { + "epoch": 0.18898363678266883, + "grad_norm": 0.294464647769928, + "learning_rate": 9.352962567040266e-05, + "loss": 0.7208, + "step": 820 + }, + { + "epoch": 0.1892141046324038, + "grad_norm": 0.3354666531085968, + "learning_rate": 9.35112475890096e-05, + "loss": 0.7165, + "step": 821 + }, + { + "epoch": 0.18944457248213875, + "grad_norm": 0.3083663582801819, + "learning_rate": 9.349284525549147e-05, + "loss": 0.7148, + "step": 822 + }, + { + "epoch": 0.1896750403318737, + "grad_norm": 0.28768306970596313, + "learning_rate": 9.347441868010531e-05, + "loss": 0.7136, + "step": 823 + }, + { + "epoch": 0.18990550818160867, + "grad_norm": 0.2938915193080902, + "learning_rate": 9.345596787312165e-05, + "loss": 0.7293, + "step": 824 + }, + { + "epoch": 0.19013597603134363, + "grad_norm": 0.3318259119987488, + "learning_rate": 9.343749284482454e-05, + "loss": 0.7088, + "step": 825 + }, + { + "epoch": 0.1903664438810786, + "grad_norm": 0.3234262764453888, + "learning_rate": 9.341899360551151e-05, + "loss": 0.7196, + "step": 826 + }, + { + "epoch": 0.19059691173081356, + "grad_norm": 0.32499444484710693, + "learning_rate": 9.340047016549358e-05, + "loss": 0.7169, + "step": 827 + }, + { + "epoch": 0.19082737958054852, + "grad_norm": 0.28975191712379456, + "learning_rate": 9.33819225350953e-05, + "loss": 0.7216, + "step": 828 + }, + { + "epoch": 0.19105784743028348, + "grad_norm": 0.28860896825790405, + "learning_rate": 9.336335072465465e-05, + "loss": 0.7174, + "step": 829 + }, + { + "epoch": 0.19128831528001844, + "grad_norm": 0.3400057554244995, + "learning_rate": 9.334475474452308e-05, + "loss": 0.7273, + "step": 830 + }, + { + "epoch": 0.1915187831297534, + "grad_norm": 0.3378770649433136, + "learning_rate": 9.332613460506563e-05, + "loss": 0.7221, + "step": 831 + }, + { + "epoch": 0.19174925097948836, + "grad_norm": 0.2862728536128998, + "learning_rate": 9.330749031666064e-05, + "loss": 0.7144, + "step": 832 + }, + { + "epoch": 0.19197971882922332, + "grad_norm": 0.3653225004673004, + "learning_rate": 9.328882188970003e-05, + "loss": 0.7235, + "step": 833 + }, + { + "epoch": 0.19221018667895828, + "grad_norm": 0.40798598527908325, + "learning_rate": 9.327012933458909e-05, + "loss": 0.7232, + "step": 834 + }, + { + "epoch": 0.19244065452869324, + "grad_norm": 0.3767978250980377, + "learning_rate": 9.325141266174666e-05, + "loss": 0.7354, + "step": 835 + }, + { + "epoch": 0.1926711223784282, + "grad_norm": 0.3316027522087097, + "learning_rate": 9.323267188160494e-05, + "loss": 0.7245, + "step": 836 + }, + { + "epoch": 0.19290159022816317, + "grad_norm": 0.2936481535434723, + "learning_rate": 9.321390700460956e-05, + "loss": 0.7238, + "step": 837 + }, + { + "epoch": 0.19313205807789813, + "grad_norm": 0.29495328664779663, + "learning_rate": 9.319511804121967e-05, + "loss": 0.7136, + "step": 838 + }, + { + "epoch": 0.1933625259276331, + "grad_norm": 0.3090824484825134, + "learning_rate": 9.317630500190774e-05, + "loss": 0.7112, + "step": 839 + }, + { + "epoch": 0.19359299377736805, + "grad_norm": 0.288487046957016, + "learning_rate": 9.315746789715973e-05, + "loss": 0.722, + "step": 840 + }, + { + "epoch": 0.193823461627103, + "grad_norm": 0.31703829765319824, + "learning_rate": 9.313860673747496e-05, + "loss": 0.7235, + "step": 841 + }, + { + "epoch": 0.19405392947683797, + "grad_norm": 0.36949485540390015, + "learning_rate": 9.311972153336623e-05, + "loss": 0.7211, + "step": 842 + }, + { + "epoch": 0.19428439732657293, + "grad_norm": 0.3008967936038971, + "learning_rate": 9.310081229535968e-05, + "loss": 0.722, + "step": 843 + }, + { + "epoch": 0.1945148651763079, + "grad_norm": 0.28252121806144714, + "learning_rate": 9.308187903399486e-05, + "loss": 0.7231, + "step": 844 + }, + { + "epoch": 0.19474533302604286, + "grad_norm": 0.32899102568626404, + "learning_rate": 9.306292175982472e-05, + "loss": 0.7103, + "step": 845 + }, + { + "epoch": 0.19497580087577782, + "grad_norm": 0.331802099943161, + "learning_rate": 9.304394048341559e-05, + "loss": 0.7263, + "step": 846 + }, + { + "epoch": 0.1952062687255128, + "grad_norm": 0.2927376925945282, + "learning_rate": 9.302493521534719e-05, + "loss": 0.7177, + "step": 847 + }, + { + "epoch": 0.19543673657524777, + "grad_norm": 0.28728216886520386, + "learning_rate": 9.300590596621257e-05, + "loss": 0.7118, + "step": 848 + }, + { + "epoch": 0.19566720442498273, + "grad_norm": 0.2990799844264984, + "learning_rate": 9.29868527466182e-05, + "loss": 0.7141, + "step": 849 + }, + { + "epoch": 0.1958976722747177, + "grad_norm": 0.2656897306442261, + "learning_rate": 9.296777556718387e-05, + "loss": 0.728, + "step": 850 + }, + { + "epoch": 0.19612814012445265, + "grad_norm": 0.30828338861465454, + "learning_rate": 9.294867443854278e-05, + "loss": 0.7262, + "step": 851 + }, + { + "epoch": 0.1963586079741876, + "grad_norm": 0.29851728677749634, + "learning_rate": 9.292954937134142e-05, + "loss": 0.7277, + "step": 852 + }, + { + "epoch": 0.19658907582392257, + "grad_norm": 0.3060734272003174, + "learning_rate": 9.291040037623961e-05, + "loss": 0.7239, + "step": 853 + }, + { + "epoch": 0.19681954367365753, + "grad_norm": 0.3331679105758667, + "learning_rate": 9.28912274639106e-05, + "loss": 0.7227, + "step": 854 + }, + { + "epoch": 0.1970500115233925, + "grad_norm": 0.28434881567955017, + "learning_rate": 9.287203064504084e-05, + "loss": 0.7143, + "step": 855 + }, + { + "epoch": 0.19728047937312745, + "grad_norm": 0.2975858449935913, + "learning_rate": 9.285280993033024e-05, + "loss": 0.7194, + "step": 856 + }, + { + "epoch": 0.19751094722286242, + "grad_norm": 0.26783379912376404, + "learning_rate": 9.283356533049194e-05, + "loss": 0.7158, + "step": 857 + }, + { + "epoch": 0.19774141507259738, + "grad_norm": 0.31870049238204956, + "learning_rate": 9.281429685625243e-05, + "loss": 0.7282, + "step": 858 + }, + { + "epoch": 0.19797188292233234, + "grad_norm": 0.3032819330692291, + "learning_rate": 9.279500451835145e-05, + "loss": 0.7217, + "step": 859 + }, + { + "epoch": 0.1982023507720673, + "grad_norm": 0.2742626667022705, + "learning_rate": 9.277568832754216e-05, + "loss": 0.7199, + "step": 860 + }, + { + "epoch": 0.19843281862180226, + "grad_norm": 0.2852528989315033, + "learning_rate": 9.275634829459087e-05, + "loss": 0.7242, + "step": 861 + }, + { + "epoch": 0.19866328647153722, + "grad_norm": 0.26641765236854553, + "learning_rate": 9.27369844302773e-05, + "loss": 0.7202, + "step": 862 + }, + { + "epoch": 0.19889375432127218, + "grad_norm": 0.27515074610710144, + "learning_rate": 9.271759674539438e-05, + "loss": 0.7143, + "step": 863 + }, + { + "epoch": 0.19912422217100714, + "grad_norm": 0.3010948598384857, + "learning_rate": 9.269818525074833e-05, + "loss": 0.7182, + "step": 864 + }, + { + "epoch": 0.1993546900207421, + "grad_norm": 0.28209689259529114, + "learning_rate": 9.267874995715868e-05, + "loss": 0.724, + "step": 865 + }, + { + "epoch": 0.19958515787047706, + "grad_norm": 0.29856470227241516, + "learning_rate": 9.26592908754582e-05, + "loss": 0.7151, + "step": 866 + }, + { + "epoch": 0.19981562572021203, + "grad_norm": 0.3309277594089508, + "learning_rate": 9.263980801649286e-05, + "loss": 0.7171, + "step": 867 + }, + { + "epoch": 0.200046093569947, + "grad_norm": 0.264207661151886, + "learning_rate": 9.262030139112198e-05, + "loss": 0.723, + "step": 868 + }, + { + "epoch": 0.20027656141968195, + "grad_norm": 0.31711748242378235, + "learning_rate": 9.260077101021811e-05, + "loss": 0.7258, + "step": 869 + }, + { + "epoch": 0.2005070292694169, + "grad_norm": 0.3416388928890228, + "learning_rate": 9.258121688466696e-05, + "loss": 0.7271, + "step": 870 + }, + { + "epoch": 0.20073749711915187, + "grad_norm": 0.37464427947998047, + "learning_rate": 9.256163902536756e-05, + "loss": 0.7173, + "step": 871 + }, + { + "epoch": 0.20096796496888683, + "grad_norm": 0.36626744270324707, + "learning_rate": 9.254203744323216e-05, + "loss": 0.7294, + "step": 872 + }, + { + "epoch": 0.2011984328186218, + "grad_norm": 0.27999523282051086, + "learning_rate": 9.252241214918615e-05, + "loss": 0.7204, + "step": 873 + }, + { + "epoch": 0.20142890066835675, + "grad_norm": 0.4048886001110077, + "learning_rate": 9.250276315416825e-05, + "loss": 0.7151, + "step": 874 + }, + { + "epoch": 0.20165936851809171, + "grad_norm": 0.4276858866214752, + "learning_rate": 9.248309046913032e-05, + "loss": 0.7311, + "step": 875 + }, + { + "epoch": 0.20188983636782667, + "grad_norm": 0.3747239410877228, + "learning_rate": 9.246339410503745e-05, + "loss": 0.7223, + "step": 876 + }, + { + "epoch": 0.20212030421756166, + "grad_norm": 0.2862272560596466, + "learning_rate": 9.24436740728679e-05, + "loss": 0.7135, + "step": 877 + }, + { + "epoch": 0.20235077206729662, + "grad_norm": 0.3424084484577179, + "learning_rate": 9.242393038361316e-05, + "loss": 0.7224, + "step": 878 + }, + { + "epoch": 0.20258123991703159, + "grad_norm": 0.3472643792629242, + "learning_rate": 9.24041630482779e-05, + "loss": 0.7144, + "step": 879 + }, + { + "epoch": 0.20281170776676655, + "grad_norm": 0.31430667638778687, + "learning_rate": 9.238437207787994e-05, + "loss": 0.7234, + "step": 880 + }, + { + "epoch": 0.2030421756165015, + "grad_norm": 0.2940575182437897, + "learning_rate": 9.23645574834503e-05, + "loss": 0.7105, + "step": 881 + }, + { + "epoch": 0.20327264346623647, + "grad_norm": 0.3699895739555359, + "learning_rate": 9.234471927603314e-05, + "loss": 0.7107, + "step": 882 + }, + { + "epoch": 0.20350311131597143, + "grad_norm": 0.3853522837162018, + "learning_rate": 9.232485746668584e-05, + "loss": 0.721, + "step": 883 + }, + { + "epoch": 0.2037335791657064, + "grad_norm": 0.37054017186164856, + "learning_rate": 9.230497206647885e-05, + "loss": 0.7209, + "step": 884 + }, + { + "epoch": 0.20396404701544135, + "grad_norm": 0.2768975496292114, + "learning_rate": 9.228506308649585e-05, + "loss": 0.7226, + "step": 885 + }, + { + "epoch": 0.2041945148651763, + "grad_norm": 0.3139522671699524, + "learning_rate": 9.22651305378336e-05, + "loss": 0.7189, + "step": 886 + }, + { + "epoch": 0.20442498271491127, + "grad_norm": 0.33646050095558167, + "learning_rate": 9.224517443160205e-05, + "loss": 0.7163, + "step": 887 + }, + { + "epoch": 0.20465545056464624, + "grad_norm": 0.28335535526275635, + "learning_rate": 9.222519477892425e-05, + "loss": 0.7233, + "step": 888 + }, + { + "epoch": 0.2048859184143812, + "grad_norm": 0.31536251306533813, + "learning_rate": 9.220519159093637e-05, + "loss": 0.7152, + "step": 889 + }, + { + "epoch": 0.20511638626411616, + "grad_norm": 0.2616060972213745, + "learning_rate": 9.21851648787877e-05, + "loss": 0.7144, + "step": 890 + }, + { + "epoch": 0.20534685411385112, + "grad_norm": 0.30261725187301636, + "learning_rate": 9.216511465364066e-05, + "loss": 0.717, + "step": 891 + }, + { + "epoch": 0.20557732196358608, + "grad_norm": 0.3098640739917755, + "learning_rate": 9.214504092667075e-05, + "loss": 0.7182, + "step": 892 + }, + { + "epoch": 0.20580778981332104, + "grad_norm": 0.32920923829078674, + "learning_rate": 9.212494370906661e-05, + "loss": 0.7217, + "step": 893 + }, + { + "epoch": 0.206038257663056, + "grad_norm": 0.31871381402015686, + "learning_rate": 9.210482301202994e-05, + "loss": 0.7135, + "step": 894 + }, + { + "epoch": 0.20626872551279096, + "grad_norm": 0.2803013324737549, + "learning_rate": 9.208467884677551e-05, + "loss": 0.7218, + "step": 895 + }, + { + "epoch": 0.20649919336252592, + "grad_norm": 0.31041911244392395, + "learning_rate": 9.206451122453122e-05, + "loss": 0.7207, + "step": 896 + }, + { + "epoch": 0.20672966121226088, + "grad_norm": 0.3201538324356079, + "learning_rate": 9.204432015653801e-05, + "loss": 0.7163, + "step": 897 + }, + { + "epoch": 0.20696012906199585, + "grad_norm": 0.26762691140174866, + "learning_rate": 9.202410565404988e-05, + "loss": 0.7103, + "step": 898 + }, + { + "epoch": 0.2071905969117308, + "grad_norm": 0.3141172528266907, + "learning_rate": 9.200386772833394e-05, + "loss": 0.7251, + "step": 899 + }, + { + "epoch": 0.20742106476146577, + "grad_norm": 0.3220638632774353, + "learning_rate": 9.19836063906703e-05, + "loss": 0.716, + "step": 900 + }, + { + "epoch": 0.20765153261120073, + "grad_norm": 0.3352587819099426, + "learning_rate": 9.196332165235215e-05, + "loss": 0.7235, + "step": 901 + }, + { + "epoch": 0.2078820004609357, + "grad_norm": 0.3438172936439514, + "learning_rate": 9.194301352468572e-05, + "loss": 0.713, + "step": 902 + }, + { + "epoch": 0.20811246831067065, + "grad_norm": 0.3374473452568054, + "learning_rate": 9.192268201899028e-05, + "loss": 0.7203, + "step": 903 + }, + { + "epoch": 0.2083429361604056, + "grad_norm": 0.32671868801116943, + "learning_rate": 9.19023271465981e-05, + "loss": 0.7264, + "step": 904 + }, + { + "epoch": 0.20857340401014057, + "grad_norm": 0.27988868951797485, + "learning_rate": 9.188194891885453e-05, + "loss": 0.7129, + "step": 905 + }, + { + "epoch": 0.20880387185987553, + "grad_norm": 0.2564987540245056, + "learning_rate": 9.186154734711786e-05, + "loss": 0.7251, + "step": 906 + }, + { + "epoch": 0.20903433970961052, + "grad_norm": 0.30010056495666504, + "learning_rate": 9.184112244275948e-05, + "loss": 0.7137, + "step": 907 + }, + { + "epoch": 0.20926480755934548, + "grad_norm": 0.2974473536014557, + "learning_rate": 9.182067421716372e-05, + "loss": 0.7259, + "step": 908 + }, + { + "epoch": 0.20949527540908044, + "grad_norm": 0.31991463899612427, + "learning_rate": 9.180020268172794e-05, + "loss": 0.7124, + "step": 909 + }, + { + "epoch": 0.2097257432588154, + "grad_norm": 0.32082507014274597, + "learning_rate": 9.177970784786245e-05, + "loss": 0.7184, + "step": 910 + }, + { + "epoch": 0.20995621110855037, + "grad_norm": 0.32305994629859924, + "learning_rate": 9.175918972699063e-05, + "loss": 0.7147, + "step": 911 + }, + { + "epoch": 0.21018667895828533, + "grad_norm": 0.3100352883338928, + "learning_rate": 9.173864833054875e-05, + "loss": 0.7199, + "step": 912 + }, + { + "epoch": 0.2104171468080203, + "grad_norm": 0.30127498507499695, + "learning_rate": 9.17180836699861e-05, + "loss": 0.7188, + "step": 913 + }, + { + "epoch": 0.21064761465775525, + "grad_norm": 0.27540695667266846, + "learning_rate": 9.169749575676496e-05, + "loss": 0.7133, + "step": 914 + }, + { + "epoch": 0.2108780825074902, + "grad_norm": 0.3038792610168457, + "learning_rate": 9.167688460236049e-05, + "loss": 0.716, + "step": 915 + }, + { + "epoch": 0.21110855035722517, + "grad_norm": 0.3074706792831421, + "learning_rate": 9.165625021826087e-05, + "loss": 0.7136, + "step": 916 + }, + { + "epoch": 0.21133901820696013, + "grad_norm": 0.31850558519363403, + "learning_rate": 9.163559261596723e-05, + "loss": 0.7147, + "step": 917 + }, + { + "epoch": 0.2115694860566951, + "grad_norm": 0.326848566532135, + "learning_rate": 9.161491180699359e-05, + "loss": 0.7126, + "step": 918 + }, + { + "epoch": 0.21179995390643006, + "grad_norm": 0.278940349817276, + "learning_rate": 9.159420780286699e-05, + "loss": 0.7173, + "step": 919 + }, + { + "epoch": 0.21203042175616502, + "grad_norm": 0.3120673894882202, + "learning_rate": 9.157348061512727e-05, + "loss": 0.7157, + "step": 920 + }, + { + "epoch": 0.21226088960589998, + "grad_norm": 0.2851211428642273, + "learning_rate": 9.155273025532731e-05, + "loss": 0.7153, + "step": 921 + }, + { + "epoch": 0.21249135745563494, + "grad_norm": 0.2742621600627899, + "learning_rate": 9.153195673503286e-05, + "loss": 0.7158, + "step": 922 + }, + { + "epoch": 0.2127218253053699, + "grad_norm": 0.29812031984329224, + "learning_rate": 9.151116006582259e-05, + "loss": 0.7083, + "step": 923 + }, + { + "epoch": 0.21295229315510486, + "grad_norm": 0.33117279410362244, + "learning_rate": 9.149034025928804e-05, + "loss": 0.7196, + "step": 924 + }, + { + "epoch": 0.21318276100483982, + "grad_norm": 0.2820033133029938, + "learning_rate": 9.146949732703368e-05, + "loss": 0.7102, + "step": 925 + }, + { + "epoch": 0.21341322885457478, + "grad_norm": 0.2883508801460266, + "learning_rate": 9.144863128067687e-05, + "loss": 0.7246, + "step": 926 + }, + { + "epoch": 0.21364369670430974, + "grad_norm": 0.34615033864974976, + "learning_rate": 9.142774213184784e-05, + "loss": 0.7164, + "step": 927 + }, + { + "epoch": 0.2138741645540447, + "grad_norm": 0.33778396248817444, + "learning_rate": 9.140682989218969e-05, + "loss": 0.7152, + "step": 928 + }, + { + "epoch": 0.21410463240377967, + "grad_norm": 0.37993401288986206, + "learning_rate": 9.138589457335842e-05, + "loss": 0.715, + "step": 929 + }, + { + "epoch": 0.21433510025351463, + "grad_norm": 0.32394763827323914, + "learning_rate": 9.136493618702284e-05, + "loss": 0.7263, + "step": 930 + }, + { + "epoch": 0.2145655681032496, + "grad_norm": 0.2940785884857178, + "learning_rate": 9.13439547448647e-05, + "loss": 0.7267, + "step": 931 + }, + { + "epoch": 0.21479603595298455, + "grad_norm": 0.32346901297569275, + "learning_rate": 9.132295025857851e-05, + "loss": 0.719, + "step": 932 + }, + { + "epoch": 0.2150265038027195, + "grad_norm": 0.40225303173065186, + "learning_rate": 9.130192273987168e-05, + "loss": 0.7186, + "step": 933 + }, + { + "epoch": 0.21525697165245447, + "grad_norm": 0.28617575764656067, + "learning_rate": 9.128087220046445e-05, + "loss": 0.7148, + "step": 934 + }, + { + "epoch": 0.21548743950218943, + "grad_norm": 0.3431677222251892, + "learning_rate": 9.125979865208988e-05, + "loss": 0.7135, + "step": 935 + }, + { + "epoch": 0.2157179073519244, + "grad_norm": 0.4484764039516449, + "learning_rate": 9.123870210649387e-05, + "loss": 0.7096, + "step": 936 + }, + { + "epoch": 0.21594837520165938, + "grad_norm": 0.39285802841186523, + "learning_rate": 9.121758257543511e-05, + "loss": 0.7138, + "step": 937 + }, + { + "epoch": 0.21617884305139434, + "grad_norm": 0.2899276912212372, + "learning_rate": 9.119644007068513e-05, + "loss": 0.7072, + "step": 938 + }, + { + "epoch": 0.2164093109011293, + "grad_norm": 0.39116156101226807, + "learning_rate": 9.117527460402826e-05, + "loss": 0.7193, + "step": 939 + }, + { + "epoch": 0.21663977875086426, + "grad_norm": 0.3671886622905731, + "learning_rate": 9.115408618726162e-05, + "loss": 0.7171, + "step": 940 + }, + { + "epoch": 0.21687024660059923, + "grad_norm": 0.35713937878608704, + "learning_rate": 9.113287483219511e-05, + "loss": 0.7105, + "step": 941 + }, + { + "epoch": 0.2171007144503342, + "grad_norm": 0.3286116123199463, + "learning_rate": 9.111164055065145e-05, + "loss": 0.7185, + "step": 942 + }, + { + "epoch": 0.21733118230006915, + "grad_norm": 0.31977954506874084, + "learning_rate": 9.109038335446612e-05, + "loss": 0.7177, + "step": 943 + }, + { + "epoch": 0.2175616501498041, + "grad_norm": 0.3611249625682831, + "learning_rate": 9.106910325548734e-05, + "loss": 0.7169, + "step": 944 + }, + { + "epoch": 0.21779211799953907, + "grad_norm": 0.352547824382782, + "learning_rate": 9.104780026557618e-05, + "loss": 0.7129, + "step": 945 + }, + { + "epoch": 0.21802258584927403, + "grad_norm": 0.33489990234375, + "learning_rate": 9.102647439660637e-05, + "loss": 0.7123, + "step": 946 + }, + { + "epoch": 0.218253053699009, + "grad_norm": 0.37026363611221313, + "learning_rate": 9.100512566046444e-05, + "loss": 0.7225, + "step": 947 + }, + { + "epoch": 0.21848352154874395, + "grad_norm": 0.3738853335380554, + "learning_rate": 9.098375406904968e-05, + "loss": 0.7086, + "step": 948 + }, + { + "epoch": 0.21871398939847891, + "grad_norm": 0.2526826560497284, + "learning_rate": 9.09623596342741e-05, + "loss": 0.7142, + "step": 949 + }, + { + "epoch": 0.21894445724821388, + "grad_norm": 0.3173041045665741, + "learning_rate": 9.094094236806244e-05, + "loss": 0.7133, + "step": 950 + }, + { + "epoch": 0.21917492509794884, + "grad_norm": 0.35794541239738464, + "learning_rate": 9.091950228235217e-05, + "loss": 0.7211, + "step": 951 + }, + { + "epoch": 0.2194053929476838, + "grad_norm": 0.36241787672042847, + "learning_rate": 9.089803938909349e-05, + "loss": 0.7164, + "step": 952 + }, + { + "epoch": 0.21963586079741876, + "grad_norm": 0.28174248337745667, + "learning_rate": 9.087655370024928e-05, + "loss": 0.7085, + "step": 953 + }, + { + "epoch": 0.21986632864715372, + "grad_norm": 0.3472788631916046, + "learning_rate": 9.085504522779517e-05, + "loss": 0.7166, + "step": 954 + }, + { + "epoch": 0.22009679649688868, + "grad_norm": 0.39354413747787476, + "learning_rate": 9.083351398371944e-05, + "loss": 0.7086, + "step": 955 + }, + { + "epoch": 0.22032726434662364, + "grad_norm": 0.4095928966999054, + "learning_rate": 9.081195998002312e-05, + "loss": 0.7136, + "step": 956 + }, + { + "epoch": 0.2205577321963586, + "grad_norm": 0.3270823061466217, + "learning_rate": 9.079038322871987e-05, + "loss": 0.7218, + "step": 957 + }, + { + "epoch": 0.22078820004609356, + "grad_norm": 0.30970245599746704, + "learning_rate": 9.076878374183606e-05, + "loss": 0.7144, + "step": 958 + }, + { + "epoch": 0.22101866789582852, + "grad_norm": 0.33222824335098267, + "learning_rate": 9.074716153141074e-05, + "loss": 0.7069, + "step": 959 + }, + { + "epoch": 0.22124913574556349, + "grad_norm": 0.29064905643463135, + "learning_rate": 9.072551660949558e-05, + "loss": 0.7171, + "step": 960 + }, + { + "epoch": 0.22147960359529845, + "grad_norm": 0.2900223135948181, + "learning_rate": 9.070384898815497e-05, + "loss": 0.7182, + "step": 961 + }, + { + "epoch": 0.2217100714450334, + "grad_norm": 0.33210277557373047, + "learning_rate": 9.068215867946591e-05, + "loss": 0.7184, + "step": 962 + }, + { + "epoch": 0.22194053929476837, + "grad_norm": 0.35366907715797424, + "learning_rate": 9.066044569551807e-05, + "loss": 0.7209, + "step": 963 + }, + { + "epoch": 0.22217100714450333, + "grad_norm": 0.29984578490257263, + "learning_rate": 9.063871004841371e-05, + "loss": 0.7083, + "step": 964 + }, + { + "epoch": 0.2224014749942383, + "grad_norm": 0.2960159182548523, + "learning_rate": 9.061695175026779e-05, + "loss": 0.7146, + "step": 965 + }, + { + "epoch": 0.22263194284397328, + "grad_norm": 0.3301765024662018, + "learning_rate": 9.059517081320787e-05, + "loss": 0.7001, + "step": 966 + }, + { + "epoch": 0.22286241069370824, + "grad_norm": 0.3346138894557953, + "learning_rate": 9.057336724937409e-05, + "loss": 0.7132, + "step": 967 + }, + { + "epoch": 0.2230928785434432, + "grad_norm": 0.32978126406669617, + "learning_rate": 9.055154107091925e-05, + "loss": 0.7125, + "step": 968 + }, + { + "epoch": 0.22332334639317816, + "grad_norm": 0.3108513653278351, + "learning_rate": 9.052969229000874e-05, + "loss": 0.7251, + "step": 969 + }, + { + "epoch": 0.22355381424291312, + "grad_norm": 0.36438095569610596, + "learning_rate": 9.050782091882056e-05, + "loss": 0.7118, + "step": 970 + }, + { + "epoch": 0.22378428209264808, + "grad_norm": 0.3605962097644806, + "learning_rate": 9.048592696954524e-05, + "loss": 0.7099, + "step": 971 + }, + { + "epoch": 0.22401474994238305, + "grad_norm": 0.2907812297344208, + "learning_rate": 9.0464010454386e-05, + "loss": 0.7062, + "step": 972 + }, + { + "epoch": 0.224245217792118, + "grad_norm": 0.30365508794784546, + "learning_rate": 9.044207138555854e-05, + "loss": 0.7057, + "step": 973 + }, + { + "epoch": 0.22447568564185297, + "grad_norm": 0.3420591950416565, + "learning_rate": 9.042010977529118e-05, + "loss": 0.7136, + "step": 974 + }, + { + "epoch": 0.22470615349158793, + "grad_norm": 0.27449890971183777, + "learning_rate": 9.039812563582482e-05, + "loss": 0.7185, + "step": 975 + }, + { + "epoch": 0.2249366213413229, + "grad_norm": 0.3264601230621338, + "learning_rate": 9.037611897941283e-05, + "loss": 0.7205, + "step": 976 + }, + { + "epoch": 0.22516708919105785, + "grad_norm": 0.3168565332889557, + "learning_rate": 9.035408981832126e-05, + "loss": 0.723, + "step": 977 + }, + { + "epoch": 0.2253975570407928, + "grad_norm": 0.34214240312576294, + "learning_rate": 9.03320381648286e-05, + "loss": 0.7068, + "step": 978 + }, + { + "epoch": 0.22562802489052777, + "grad_norm": 0.3359646797180176, + "learning_rate": 9.030996403122592e-05, + "loss": 0.7159, + "step": 979 + }, + { + "epoch": 0.22585849274026273, + "grad_norm": 0.32919085025787354, + "learning_rate": 9.02878674298168e-05, + "loss": 0.7126, + "step": 980 + }, + { + "epoch": 0.2260889605899977, + "grad_norm": 0.2955667972564697, + "learning_rate": 9.026574837291739e-05, + "loss": 0.7104, + "step": 981 + }, + { + "epoch": 0.22631942843973266, + "grad_norm": 0.307659775018692, + "learning_rate": 9.024360687285629e-05, + "loss": 0.7065, + "step": 982 + }, + { + "epoch": 0.22654989628946762, + "grad_norm": 0.31468522548675537, + "learning_rate": 9.022144294197466e-05, + "loss": 0.7091, + "step": 983 + }, + { + "epoch": 0.22678036413920258, + "grad_norm": 0.2780282497406006, + "learning_rate": 9.019925659262612e-05, + "loss": 0.7115, + "step": 984 + }, + { + "epoch": 0.22701083198893754, + "grad_norm": 0.32122835516929626, + "learning_rate": 9.01770478371768e-05, + "loss": 0.7219, + "step": 985 + }, + { + "epoch": 0.2272412998386725, + "grad_norm": 0.33038368821144104, + "learning_rate": 9.015481668800537e-05, + "loss": 0.7051, + "step": 986 + }, + { + "epoch": 0.22747176768840746, + "grad_norm": 0.3402576744556427, + "learning_rate": 9.013256315750291e-05, + "loss": 0.7195, + "step": 987 + }, + { + "epoch": 0.22770223553814242, + "grad_norm": 0.37470388412475586, + "learning_rate": 9.011028725807302e-05, + "loss": 0.7078, + "step": 988 + }, + { + "epoch": 0.22793270338787738, + "grad_norm": 0.3758593797683716, + "learning_rate": 9.008798900213173e-05, + "loss": 0.7092, + "step": 989 + }, + { + "epoch": 0.22816317123761234, + "grad_norm": 0.31909018754959106, + "learning_rate": 9.006566840210757e-05, + "loss": 0.7129, + "step": 990 + }, + { + "epoch": 0.2283936390873473, + "grad_norm": 0.34421950578689575, + "learning_rate": 9.00433254704415e-05, + "loss": 0.7181, + "step": 991 + }, + { + "epoch": 0.22862410693708227, + "grad_norm": 0.3339763879776001, + "learning_rate": 9.002096021958693e-05, + "loss": 0.7227, + "step": 992 + }, + { + "epoch": 0.22885457478681723, + "grad_norm": 0.29700416326522827, + "learning_rate": 8.999857266200973e-05, + "loss": 0.712, + "step": 993 + }, + { + "epoch": 0.2290850426365522, + "grad_norm": 0.3107481896877289, + "learning_rate": 8.997616281018816e-05, + "loss": 0.7106, + "step": 994 + }, + { + "epoch": 0.22931551048628715, + "grad_norm": 0.3676799535751343, + "learning_rate": 8.995373067661296e-05, + "loss": 0.7113, + "step": 995 + }, + { + "epoch": 0.22954597833602214, + "grad_norm": 0.2843577563762665, + "learning_rate": 8.993127627378727e-05, + "loss": 0.7127, + "step": 996 + }, + { + "epoch": 0.2297764461857571, + "grad_norm": 0.30162549018859863, + "learning_rate": 8.99087996142266e-05, + "loss": 0.7159, + "step": 997 + }, + { + "epoch": 0.23000691403549206, + "grad_norm": 0.4627409279346466, + "learning_rate": 8.988630071045892e-05, + "loss": 0.714, + "step": 998 + }, + { + "epoch": 0.23023738188522702, + "grad_norm": 0.4146185517311096, + "learning_rate": 8.986377957502459e-05, + "loss": 0.7239, + "step": 999 + }, + { + "epoch": 0.23046784973496198, + "grad_norm": 0.29513731598854065, + "learning_rate": 8.984123622047632e-05, + "loss": 0.7084, + "step": 1000 + }, + { + "epoch": 0.23069831758469694, + "grad_norm": 0.29365894198417664, + "learning_rate": 8.981867065937925e-05, + "loss": 0.7183, + "step": 1001 + }, + { + "epoch": 0.2309287854344319, + "grad_norm": 0.2846957743167877, + "learning_rate": 8.97960829043109e-05, + "loss": 0.7087, + "step": 1002 + }, + { + "epoch": 0.23115925328416687, + "grad_norm": 0.3023827373981476, + "learning_rate": 8.977347296786113e-05, + "loss": 0.7142, + "step": 1003 + }, + { + "epoch": 0.23138972113390183, + "grad_norm": 0.2838916480541229, + "learning_rate": 8.975084086263217e-05, + "loss": 0.7173, + "step": 1004 + }, + { + "epoch": 0.2316201889836368, + "grad_norm": 0.3648954927921295, + "learning_rate": 8.97281866012386e-05, + "loss": 0.7182, + "step": 1005 + }, + { + "epoch": 0.23185065683337175, + "grad_norm": 0.29563209414482117, + "learning_rate": 8.97055101963074e-05, + "loss": 0.7105, + "step": 1006 + }, + { + "epoch": 0.2320811246831067, + "grad_norm": 0.3027130365371704, + "learning_rate": 8.968281166047781e-05, + "loss": 0.7092, + "step": 1007 + }, + { + "epoch": 0.23231159253284167, + "grad_norm": 0.3136472702026367, + "learning_rate": 8.966009100640148e-05, + "loss": 0.7097, + "step": 1008 + }, + { + "epoch": 0.23254206038257663, + "grad_norm": 0.2663586735725403, + "learning_rate": 8.963734824674235e-05, + "loss": 0.716, + "step": 1009 + }, + { + "epoch": 0.2327725282323116, + "grad_norm": 0.27872374653816223, + "learning_rate": 8.961458339417669e-05, + "loss": 0.7119, + "step": 1010 + }, + { + "epoch": 0.23300299608204655, + "grad_norm": 0.32824763655662537, + "learning_rate": 8.959179646139306e-05, + "loss": 0.7013, + "step": 1011 + }, + { + "epoch": 0.23323346393178152, + "grad_norm": 0.2956524193286896, + "learning_rate": 8.95689874610924e-05, + "loss": 0.7092, + "step": 1012 + }, + { + "epoch": 0.23346393178151648, + "grad_norm": 0.2657110095024109, + "learning_rate": 8.954615640598782e-05, + "loss": 0.7061, + "step": 1013 + }, + { + "epoch": 0.23369439963125144, + "grad_norm": 0.31145045161247253, + "learning_rate": 8.952330330880487e-05, + "loss": 0.7105, + "step": 1014 + }, + { + "epoch": 0.2339248674809864, + "grad_norm": 0.31407973170280457, + "learning_rate": 8.95004281822813e-05, + "loss": 0.7045, + "step": 1015 + }, + { + "epoch": 0.23415533533072136, + "grad_norm": 0.24322229623794556, + "learning_rate": 8.947753103916716e-05, + "loss": 0.7169, + "step": 1016 + }, + { + "epoch": 0.23438580318045632, + "grad_norm": 0.34636759757995605, + "learning_rate": 8.945461189222474e-05, + "loss": 0.7111, + "step": 1017 + }, + { + "epoch": 0.23461627103019128, + "grad_norm": 0.3195311725139618, + "learning_rate": 8.943167075422867e-05, + "loss": 0.7136, + "step": 1018 + }, + { + "epoch": 0.23484673887992624, + "grad_norm": 0.30927303433418274, + "learning_rate": 8.940870763796574e-05, + "loss": 0.7086, + "step": 1019 + }, + { + "epoch": 0.2350772067296612, + "grad_norm": 0.29347729682922363, + "learning_rate": 8.938572255623509e-05, + "loss": 0.7046, + "step": 1020 + }, + { + "epoch": 0.23530767457939616, + "grad_norm": 0.28396493196487427, + "learning_rate": 8.9362715521848e-05, + "loss": 0.7064, + "step": 1021 + }, + { + "epoch": 0.23553814242913113, + "grad_norm": 0.26140013337135315, + "learning_rate": 8.933968654762808e-05, + "loss": 0.7112, + "step": 1022 + }, + { + "epoch": 0.2357686102788661, + "grad_norm": 0.2797393500804901, + "learning_rate": 8.93166356464111e-05, + "loss": 0.7127, + "step": 1023 + }, + { + "epoch": 0.23599907812860105, + "grad_norm": 0.289408802986145, + "learning_rate": 8.92935628310451e-05, + "loss": 0.7079, + "step": 1024 + }, + { + "epoch": 0.236229545978336, + "grad_norm": 0.31490978598594666, + "learning_rate": 8.92704681143903e-05, + "loss": 0.7066, + "step": 1025 + }, + { + "epoch": 0.236460013828071, + "grad_norm": 0.30412665009498596, + "learning_rate": 8.924735150931915e-05, + "loss": 0.7068, + "step": 1026 + }, + { + "epoch": 0.23669048167780596, + "grad_norm": 0.3499104380607605, + "learning_rate": 8.922421302871629e-05, + "loss": 0.7169, + "step": 1027 + }, + { + "epoch": 0.23692094952754092, + "grad_norm": 0.3591751158237457, + "learning_rate": 8.920105268547854e-05, + "loss": 0.7108, + "step": 1028 + }, + { + "epoch": 0.23715141737727588, + "grad_norm": 0.35666418075561523, + "learning_rate": 8.917787049251493e-05, + "loss": 0.7055, + "step": 1029 + }, + { + "epoch": 0.23738188522701084, + "grad_norm": 0.35615837574005127, + "learning_rate": 8.915466646274666e-05, + "loss": 0.7131, + "step": 1030 + }, + { + "epoch": 0.2376123530767458, + "grad_norm": 0.37170687317848206, + "learning_rate": 8.91314406091071e-05, + "loss": 0.7194, + "step": 1031 + }, + { + "epoch": 0.23784282092648076, + "grad_norm": 0.35245972871780396, + "learning_rate": 8.910819294454177e-05, + "loss": 0.7031, + "step": 1032 + }, + { + "epoch": 0.23807328877621572, + "grad_norm": 0.2725777328014374, + "learning_rate": 8.908492348200838e-05, + "loss": 0.7062, + "step": 1033 + }, + { + "epoch": 0.23830375662595069, + "grad_norm": 0.3370811343193054, + "learning_rate": 8.906163223447676e-05, + "loss": 0.7078, + "step": 1034 + }, + { + "epoch": 0.23853422447568565, + "grad_norm": 0.35790640115737915, + "learning_rate": 8.903831921492889e-05, + "loss": 0.7107, + "step": 1035 + }, + { + "epoch": 0.2387646923254206, + "grad_norm": 0.2598463296890259, + "learning_rate": 8.90149844363589e-05, + "loss": 0.7191, + "step": 1036 + }, + { + "epoch": 0.23899516017515557, + "grad_norm": 0.2851402461528778, + "learning_rate": 8.899162791177301e-05, + "loss": 0.7103, + "step": 1037 + }, + { + "epoch": 0.23922562802489053, + "grad_norm": 0.34928953647613525, + "learning_rate": 8.896824965418961e-05, + "loss": 0.7202, + "step": 1038 + }, + { + "epoch": 0.2394560958746255, + "grad_norm": 0.3223908245563507, + "learning_rate": 8.894484967663917e-05, + "loss": 0.7086, + "step": 1039 + }, + { + "epoch": 0.23968656372436045, + "grad_norm": 0.31179115176200867, + "learning_rate": 8.892142799216428e-05, + "loss": 0.7122, + "step": 1040 + }, + { + "epoch": 0.2399170315740954, + "grad_norm": 0.3074735403060913, + "learning_rate": 8.889798461381961e-05, + "loss": 0.7091, + "step": 1041 + }, + { + "epoch": 0.24014749942383037, + "grad_norm": 0.2974226474761963, + "learning_rate": 8.887451955467195e-05, + "loss": 0.7096, + "step": 1042 + }, + { + "epoch": 0.24037796727356533, + "grad_norm": 0.3330132067203522, + "learning_rate": 8.885103282780016e-05, + "loss": 0.7104, + "step": 1043 + }, + { + "epoch": 0.2406084351233003, + "grad_norm": 0.28745895624160767, + "learning_rate": 8.882752444629518e-05, + "loss": 0.7174, + "step": 1044 + }, + { + "epoch": 0.24083890297303526, + "grad_norm": 0.2735026180744171, + "learning_rate": 8.880399442326001e-05, + "loss": 0.7211, + "step": 1045 + }, + { + "epoch": 0.24106937082277022, + "grad_norm": 0.2821330726146698, + "learning_rate": 8.878044277180975e-05, + "loss": 0.7153, + "step": 1046 + }, + { + "epoch": 0.24129983867250518, + "grad_norm": 0.30076584219932556, + "learning_rate": 8.875686950507147e-05, + "loss": 0.7149, + "step": 1047 + }, + { + "epoch": 0.24153030652224014, + "grad_norm": 0.24282529950141907, + "learning_rate": 8.873327463618438e-05, + "loss": 0.7039, + "step": 1048 + }, + { + "epoch": 0.2417607743719751, + "grad_norm": 0.26797229051589966, + "learning_rate": 8.87096581782997e-05, + "loss": 0.7115, + "step": 1049 + }, + { + "epoch": 0.24199124222171006, + "grad_norm": 0.27635279297828674, + "learning_rate": 8.868602014458065e-05, + "loss": 0.7142, + "step": 1050 + }, + { + "epoch": 0.24222171007144502, + "grad_norm": 0.6189936399459839, + "learning_rate": 8.866236054820251e-05, + "loss": 0.7268, + "step": 1051 + }, + { + "epoch": 0.24245217792117998, + "grad_norm": 0.25462353229522705, + "learning_rate": 8.86386794023526e-05, + "loss": 0.7054, + "step": 1052 + }, + { + "epoch": 0.24268264577091495, + "grad_norm": 0.2922193706035614, + "learning_rate": 8.861497672023018e-05, + "loss": 0.712, + "step": 1053 + }, + { + "epoch": 0.2429131136206499, + "grad_norm": 0.28528186678886414, + "learning_rate": 8.859125251504657e-05, + "loss": 0.7158, + "step": 1054 + }, + { + "epoch": 0.24314358147038487, + "grad_norm": 0.26813432574272156, + "learning_rate": 8.85675068000251e-05, + "loss": 0.7048, + "step": 1055 + }, + { + "epoch": 0.24337404932011986, + "grad_norm": 0.27089470624923706, + "learning_rate": 8.854373958840102e-05, + "loss": 0.6999, + "step": 1056 + }, + { + "epoch": 0.24360451716985482, + "grad_norm": 0.27272212505340576, + "learning_rate": 8.851995089342163e-05, + "loss": 0.7162, + "step": 1057 + }, + { + "epoch": 0.24383498501958978, + "grad_norm": 0.30942168831825256, + "learning_rate": 8.849614072834617e-05, + "loss": 0.7128, + "step": 1058 + }, + { + "epoch": 0.24406545286932474, + "grad_norm": 0.27839475870132446, + "learning_rate": 8.847230910644586e-05, + "loss": 0.7007, + "step": 1059 + }, + { + "epoch": 0.2442959207190597, + "grad_norm": 0.28866031765937805, + "learning_rate": 8.844845604100387e-05, + "loss": 0.707, + "step": 1060 + }, + { + "epoch": 0.24452638856879466, + "grad_norm": 0.30991750955581665, + "learning_rate": 8.842458154531533e-05, + "loss": 0.7081, + "step": 1061 + }, + { + "epoch": 0.24475685641852962, + "grad_norm": 0.2961871325969696, + "learning_rate": 8.840068563268728e-05, + "loss": 0.7093, + "step": 1062 + }, + { + "epoch": 0.24498732426826458, + "grad_norm": 0.26729321479797363, + "learning_rate": 8.837676831643877e-05, + "loss": 0.7093, + "step": 1063 + }, + { + "epoch": 0.24521779211799954, + "grad_norm": 0.32758355140686035, + "learning_rate": 8.835282960990073e-05, + "loss": 0.7103, + "step": 1064 + }, + { + "epoch": 0.2454482599677345, + "grad_norm": 0.2944065034389496, + "learning_rate": 8.8328869526416e-05, + "loss": 0.7166, + "step": 1065 + }, + { + "epoch": 0.24567872781746947, + "grad_norm": 0.2752934992313385, + "learning_rate": 8.830488807933937e-05, + "loss": 0.7063, + "step": 1066 + }, + { + "epoch": 0.24590919566720443, + "grad_norm": 0.29886919260025024, + "learning_rate": 8.828088528203753e-05, + "loss": 0.7187, + "step": 1067 + }, + { + "epoch": 0.2461396635169394, + "grad_norm": 0.32834017276763916, + "learning_rate": 8.825686114788904e-05, + "loss": 0.7117, + "step": 1068 + }, + { + "epoch": 0.24637013136667435, + "grad_norm": 0.3167296051979065, + "learning_rate": 8.823281569028439e-05, + "loss": 0.7019, + "step": 1069 + }, + { + "epoch": 0.2466005992164093, + "grad_norm": 0.29267629981040955, + "learning_rate": 8.820874892262596e-05, + "loss": 0.7158, + "step": 1070 + }, + { + "epoch": 0.24683106706614427, + "grad_norm": 0.29060524702072144, + "learning_rate": 8.818466085832796e-05, + "loss": 0.7046, + "step": 1071 + }, + { + "epoch": 0.24706153491587923, + "grad_norm": 0.29323479533195496, + "learning_rate": 8.81605515108165e-05, + "loss": 0.7078, + "step": 1072 + }, + { + "epoch": 0.2472920027656142, + "grad_norm": 0.27811145782470703, + "learning_rate": 8.813642089352957e-05, + "loss": 0.7155, + "step": 1073 + }, + { + "epoch": 0.24752247061534915, + "grad_norm": 0.31902894377708435, + "learning_rate": 8.811226901991698e-05, + "loss": 0.7104, + "step": 1074 + }, + { + "epoch": 0.24775293846508412, + "grad_norm": 0.2658187747001648, + "learning_rate": 8.808809590344042e-05, + "loss": 0.6963, + "step": 1075 + }, + { + "epoch": 0.24798340631481908, + "grad_norm": 0.2968449592590332, + "learning_rate": 8.806390155757339e-05, + "loss": 0.703, + "step": 1076 + }, + { + "epoch": 0.24821387416455404, + "grad_norm": 0.29377731680870056, + "learning_rate": 8.803968599580125e-05, + "loss": 0.7063, + "step": 1077 + }, + { + "epoch": 0.248444342014289, + "grad_norm": 0.2845487594604492, + "learning_rate": 8.801544923162116e-05, + "loss": 0.7152, + "step": 1078 + }, + { + "epoch": 0.24867480986402396, + "grad_norm": 0.2773647904396057, + "learning_rate": 8.799119127854212e-05, + "loss": 0.7128, + "step": 1079 + }, + { + "epoch": 0.24890527771375892, + "grad_norm": 0.26741766929626465, + "learning_rate": 8.796691215008492e-05, + "loss": 0.7124, + "step": 1080 + }, + { + "epoch": 0.24913574556349388, + "grad_norm": 0.28756871819496155, + "learning_rate": 8.794261185978219e-05, + "loss": 0.7144, + "step": 1081 + }, + { + "epoch": 0.24936621341322884, + "grad_norm": 0.29482561349868774, + "learning_rate": 8.79182904211783e-05, + "loss": 0.712, + "step": 1082 + }, + { + "epoch": 0.2495966812629638, + "grad_norm": 0.2730870842933655, + "learning_rate": 8.789394784782945e-05, + "loss": 0.7076, + "step": 1083 + }, + { + "epoch": 0.24982714911269877, + "grad_norm": 0.2692764103412628, + "learning_rate": 8.786958415330359e-05, + "loss": 0.7024, + "step": 1084 + }, + { + "epoch": 0.2500576169624337, + "grad_norm": 0.2866170108318329, + "learning_rate": 8.78451993511805e-05, + "loss": 0.7082, + "step": 1085 + }, + { + "epoch": 0.2502880848121687, + "grad_norm": 0.300509512424469, + "learning_rate": 8.782079345505163e-05, + "loss": 0.7082, + "step": 1086 + }, + { + "epoch": 0.25051855266190365, + "grad_norm": 0.2729628086090088, + "learning_rate": 8.779636647852027e-05, + "loss": 0.7143, + "step": 1087 + }, + { + "epoch": 0.2507490205116386, + "grad_norm": 0.26800036430358887, + "learning_rate": 8.777191843520142e-05, + "loss": 0.7149, + "step": 1088 + }, + { + "epoch": 0.25097948836137357, + "grad_norm": 0.2820512056350708, + "learning_rate": 8.774744933872186e-05, + "loss": 0.7154, + "step": 1089 + }, + { + "epoch": 0.25120995621110853, + "grad_norm": 0.2764631509780884, + "learning_rate": 8.772295920272003e-05, + "loss": 0.7179, + "step": 1090 + }, + { + "epoch": 0.2514404240608435, + "grad_norm": 0.23512467741966248, + "learning_rate": 8.769844804084619e-05, + "loss": 0.7126, + "step": 1091 + }, + { + "epoch": 0.25167089191057845, + "grad_norm": 0.25535455346107483, + "learning_rate": 8.767391586676223e-05, + "loss": 0.7041, + "step": 1092 + }, + { + "epoch": 0.2519013597603134, + "grad_norm": 0.250704288482666, + "learning_rate": 8.764936269414184e-05, + "loss": 0.7144, + "step": 1093 + }, + { + "epoch": 0.2521318276100484, + "grad_norm": 0.353876531124115, + "learning_rate": 8.762478853667033e-05, + "loss": 0.7028, + "step": 1094 + }, + { + "epoch": 0.25236229545978334, + "grad_norm": 0.2544647455215454, + "learning_rate": 8.760019340804478e-05, + "loss": 0.7035, + "step": 1095 + }, + { + "epoch": 0.2525927633095183, + "grad_norm": 0.2700643241405487, + "learning_rate": 8.75755773219739e-05, + "loss": 0.7063, + "step": 1096 + }, + { + "epoch": 0.25282323115925326, + "grad_norm": 0.3172542154788971, + "learning_rate": 8.755094029217809e-05, + "loss": 0.7113, + "step": 1097 + }, + { + "epoch": 0.2530536990089882, + "grad_norm": 0.2928423583507538, + "learning_rate": 8.75262823323895e-05, + "loss": 0.7074, + "step": 1098 + }, + { + "epoch": 0.2532841668587232, + "grad_norm": 0.27802032232284546, + "learning_rate": 8.750160345635183e-05, + "loss": 0.7005, + "step": 1099 + }, + { + "epoch": 0.2535146347084582, + "grad_norm": 0.313966304063797, + "learning_rate": 8.74769036778205e-05, + "loss": 0.7019, + "step": 1100 + }, + { + "epoch": 0.25374510255819316, + "grad_norm": 0.3200061619281769, + "learning_rate": 8.74521830105626e-05, + "loss": 0.7082, + "step": 1101 + }, + { + "epoch": 0.2539755704079281, + "grad_norm": 0.30982211232185364, + "learning_rate": 8.74274414683568e-05, + "loss": 0.7161, + "step": 1102 + }, + { + "epoch": 0.2542060382576631, + "grad_norm": 0.26937606930732727, + "learning_rate": 8.740267906499346e-05, + "loss": 0.7037, + "step": 1103 + }, + { + "epoch": 0.25443650610739804, + "grad_norm": 0.33216673135757446, + "learning_rate": 8.737789581427455e-05, + "loss": 0.7087, + "step": 1104 + }, + { + "epoch": 0.254666973957133, + "grad_norm": 0.31464841961860657, + "learning_rate": 8.73530917300137e-05, + "loss": 0.7027, + "step": 1105 + }, + { + "epoch": 0.25489744180686796, + "grad_norm": 0.2963610887527466, + "learning_rate": 8.732826682603603e-05, + "loss": 0.7244, + "step": 1106 + }, + { + "epoch": 0.2551279096566029, + "grad_norm": 0.28852593898773193, + "learning_rate": 8.73034211161784e-05, + "loss": 0.703, + "step": 1107 + }, + { + "epoch": 0.2553583775063379, + "grad_norm": 0.2891587018966675, + "learning_rate": 8.72785546142892e-05, + "loss": 0.6958, + "step": 1108 + }, + { + "epoch": 0.25558884535607285, + "grad_norm": 0.3704643249511719, + "learning_rate": 8.725366733422842e-05, + "loss": 0.7071, + "step": 1109 + }, + { + "epoch": 0.2558193132058078, + "grad_norm": 0.3873230516910553, + "learning_rate": 8.722875928986762e-05, + "loss": 0.708, + "step": 1110 + }, + { + "epoch": 0.25604978105554277, + "grad_norm": 0.29551389813423157, + "learning_rate": 8.720383049508997e-05, + "loss": 0.6972, + "step": 1111 + }, + { + "epoch": 0.25628024890527773, + "grad_norm": 0.2649083137512207, + "learning_rate": 8.717888096379018e-05, + "loss": 0.7027, + "step": 1112 + }, + { + "epoch": 0.2565107167550127, + "grad_norm": 0.33623412251472473, + "learning_rate": 8.71539107098745e-05, + "loss": 0.7038, + "step": 1113 + }, + { + "epoch": 0.25674118460474765, + "grad_norm": 0.450360506772995, + "learning_rate": 8.712891974726076e-05, + "loss": 0.7067, + "step": 1114 + }, + { + "epoch": 0.2569716524544826, + "grad_norm": 1.8468785285949707, + "learning_rate": 8.710390808987833e-05, + "loss": 0.7232, + "step": 1115 + }, + { + "epoch": 0.2572021203042176, + "grad_norm": 0.29168421030044556, + "learning_rate": 8.70788757516681e-05, + "loss": 0.6999, + "step": 1116 + }, + { + "epoch": 0.25743258815395254, + "grad_norm": 0.3151211738586426, + "learning_rate": 8.705382274658249e-05, + "loss": 0.7001, + "step": 1117 + }, + { + "epoch": 0.2576630560036875, + "grad_norm": 0.3864755630493164, + "learning_rate": 8.702874908858545e-05, + "loss": 0.7102, + "step": 1118 + }, + { + "epoch": 0.25789352385342246, + "grad_norm": 0.3055296242237091, + "learning_rate": 8.700365479165244e-05, + "loss": 0.6979, + "step": 1119 + }, + { + "epoch": 0.2581239917031574, + "grad_norm": 0.3239305317401886, + "learning_rate": 8.697853986977041e-05, + "loss": 0.7155, + "step": 1120 + }, + { + "epoch": 0.2583544595528924, + "grad_norm": 0.29787853360176086, + "learning_rate": 8.695340433693781e-05, + "loss": 0.7129, + "step": 1121 + }, + { + "epoch": 0.25858492740262734, + "grad_norm": 0.2536768615245819, + "learning_rate": 8.692824820716461e-05, + "loss": 0.7032, + "step": 1122 + }, + { + "epoch": 0.2588153952523623, + "grad_norm": 0.31833314895629883, + "learning_rate": 8.690307149447221e-05, + "loss": 0.709, + "step": 1123 + }, + { + "epoch": 0.25904586310209726, + "grad_norm": 0.30274853110313416, + "learning_rate": 8.68778742128935e-05, + "loss": 0.7057, + "step": 1124 + }, + { + "epoch": 0.2592763309518322, + "grad_norm": 0.2601548135280609, + "learning_rate": 8.685265637647284e-05, + "loss": 0.7002, + "step": 1125 + }, + { + "epoch": 0.2595067988015672, + "grad_norm": 0.26416724920272827, + "learning_rate": 8.682741799926609e-05, + "loss": 0.7109, + "step": 1126 + }, + { + "epoch": 0.25973726665130215, + "grad_norm": 0.3520905375480652, + "learning_rate": 8.680215909534044e-05, + "loss": 0.7183, + "step": 1127 + }, + { + "epoch": 0.2599677345010371, + "grad_norm": 0.2665805220603943, + "learning_rate": 8.677687967877466e-05, + "loss": 0.7108, + "step": 1128 + }, + { + "epoch": 0.26019820235077207, + "grad_norm": 0.3360787034034729, + "learning_rate": 8.675157976365886e-05, + "loss": 0.7033, + "step": 1129 + }, + { + "epoch": 0.26042867020050703, + "grad_norm": 0.28345534205436707, + "learning_rate": 8.672625936409462e-05, + "loss": 0.7068, + "step": 1130 + }, + { + "epoch": 0.260659138050242, + "grad_norm": 0.2835051119327545, + "learning_rate": 8.67009184941949e-05, + "loss": 0.7021, + "step": 1131 + }, + { + "epoch": 0.26088960589997695, + "grad_norm": 0.2675539553165436, + "learning_rate": 8.667555716808414e-05, + "loss": 0.7028, + "step": 1132 + }, + { + "epoch": 0.2611200737497119, + "grad_norm": 0.29232192039489746, + "learning_rate": 8.665017539989808e-05, + "loss": 0.7108, + "step": 1133 + }, + { + "epoch": 0.2613505415994469, + "grad_norm": 0.27097558975219727, + "learning_rate": 8.662477320378395e-05, + "loss": 0.7142, + "step": 1134 + }, + { + "epoch": 0.26158100944918183, + "grad_norm": 0.2470446676015854, + "learning_rate": 8.659935059390028e-05, + "loss": 0.7061, + "step": 1135 + }, + { + "epoch": 0.2618114772989168, + "grad_norm": 0.2592596113681793, + "learning_rate": 8.657390758441708e-05, + "loss": 0.705, + "step": 1136 + }, + { + "epoch": 0.26204194514865176, + "grad_norm": 0.27617162466049194, + "learning_rate": 8.654844418951563e-05, + "loss": 0.7039, + "step": 1137 + }, + { + "epoch": 0.2622724129983867, + "grad_norm": 0.2819935977458954, + "learning_rate": 8.652296042338861e-05, + "loss": 0.7089, + "step": 1138 + }, + { + "epoch": 0.2625028808481217, + "grad_norm": 0.24662001430988312, + "learning_rate": 8.64974563002401e-05, + "loss": 0.7052, + "step": 1139 + }, + { + "epoch": 0.26273334869785664, + "grad_norm": 0.28374093770980835, + "learning_rate": 8.647193183428545e-05, + "loss": 0.7071, + "step": 1140 + }, + { + "epoch": 0.2629638165475916, + "grad_norm": 0.2856753468513489, + "learning_rate": 8.64463870397514e-05, + "loss": 0.7074, + "step": 1141 + }, + { + "epoch": 0.26319428439732656, + "grad_norm": 0.2997211813926697, + "learning_rate": 8.6420821930876e-05, + "loss": 0.7084, + "step": 1142 + }, + { + "epoch": 0.2634247522470615, + "grad_norm": 0.26690179109573364, + "learning_rate": 8.639523652190864e-05, + "loss": 0.7025, + "step": 1143 + }, + { + "epoch": 0.2636552200967965, + "grad_norm": 0.27999091148376465, + "learning_rate": 8.636963082710999e-05, + "loss": 0.7008, + "step": 1144 + }, + { + "epoch": 0.26388568794653144, + "grad_norm": 0.3172776997089386, + "learning_rate": 8.634400486075207e-05, + "loss": 0.7137, + "step": 1145 + }, + { + "epoch": 0.2641161557962664, + "grad_norm": 0.30888575315475464, + "learning_rate": 8.631835863711817e-05, + "loss": 0.7105, + "step": 1146 + }, + { + "epoch": 0.26434662364600137, + "grad_norm": 0.28774750232696533, + "learning_rate": 8.629269217050289e-05, + "loss": 0.7057, + "step": 1147 + }, + { + "epoch": 0.2645770914957363, + "grad_norm": 0.24954736232757568, + "learning_rate": 8.626700547521209e-05, + "loss": 0.7165, + "step": 1148 + }, + { + "epoch": 0.2648075593454713, + "grad_norm": 0.2624035179615021, + "learning_rate": 8.624129856556291e-05, + "loss": 0.7006, + "step": 1149 + }, + { + "epoch": 0.26503802719520625, + "grad_norm": 0.2522975504398346, + "learning_rate": 8.62155714558838e-05, + "loss": 0.7096, + "step": 1150 + }, + { + "epoch": 0.2652684950449412, + "grad_norm": 0.2908228039741516, + "learning_rate": 8.618982416051438e-05, + "loss": 0.7077, + "step": 1151 + }, + { + "epoch": 0.26549896289467617, + "grad_norm": 0.23220433294773102, + "learning_rate": 8.616405669380561e-05, + "loss": 0.7094, + "step": 1152 + }, + { + "epoch": 0.26572943074441113, + "grad_norm": 0.31848591566085815, + "learning_rate": 8.613826907011965e-05, + "loss": 0.7091, + "step": 1153 + }, + { + "epoch": 0.2659598985941461, + "grad_norm": 0.2557988464832306, + "learning_rate": 8.611246130382992e-05, + "loss": 0.7028, + "step": 1154 + }, + { + "epoch": 0.26619036644388105, + "grad_norm": 0.24541644752025604, + "learning_rate": 8.608663340932104e-05, + "loss": 0.6998, + "step": 1155 + }, + { + "epoch": 0.266420834293616, + "grad_norm": 0.25283804535865784, + "learning_rate": 8.606078540098883e-05, + "loss": 0.7044, + "step": 1156 + }, + { + "epoch": 0.266651302143351, + "grad_norm": 0.24628406763076782, + "learning_rate": 8.603491729324039e-05, + "loss": 0.7136, + "step": 1157 + }, + { + "epoch": 0.26688176999308594, + "grad_norm": 0.24645006656646729, + "learning_rate": 8.600902910049393e-05, + "loss": 0.7039, + "step": 1158 + }, + { + "epoch": 0.2671122378428209, + "grad_norm": 0.23496174812316895, + "learning_rate": 8.598312083717896e-05, + "loss": 0.7146, + "step": 1159 + }, + { + "epoch": 0.2673427056925559, + "grad_norm": 0.26855039596557617, + "learning_rate": 8.59571925177361e-05, + "loss": 0.7036, + "step": 1160 + }, + { + "epoch": 0.2675731735422909, + "grad_norm": 0.21970762312412262, + "learning_rate": 8.59312441566172e-05, + "loss": 0.7109, + "step": 1161 + }, + { + "epoch": 0.26780364139202584, + "grad_norm": 0.2488812357187271, + "learning_rate": 8.59052757682852e-05, + "loss": 0.703, + "step": 1162 + }, + { + "epoch": 0.2680341092417608, + "grad_norm": 0.2618003487586975, + "learning_rate": 8.587928736721432e-05, + "loss": 0.695, + "step": 1163 + }, + { + "epoch": 0.26826457709149576, + "grad_norm": 0.25669237971305847, + "learning_rate": 8.585327896788981e-05, + "loss": 0.6989, + "step": 1164 + }, + { + "epoch": 0.2684950449412307, + "grad_norm": 0.26360106468200684, + "learning_rate": 8.582725058480816e-05, + "loss": 0.7075, + "step": 1165 + }, + { + "epoch": 0.2687255127909657, + "grad_norm": 0.24905282258987427, + "learning_rate": 8.580120223247697e-05, + "loss": 0.7065, + "step": 1166 + }, + { + "epoch": 0.26895598064070064, + "grad_norm": 0.343464195728302, + "learning_rate": 8.577513392541496e-05, + "loss": 0.6978, + "step": 1167 + }, + { + "epoch": 0.2691864484904356, + "grad_norm": 0.2586437463760376, + "learning_rate": 8.574904567815196e-05, + "loss": 0.7056, + "step": 1168 + }, + { + "epoch": 0.26941691634017056, + "grad_norm": 0.23977968096733093, + "learning_rate": 8.572293750522897e-05, + "loss": 0.7056, + "step": 1169 + }, + { + "epoch": 0.2696473841899055, + "grad_norm": 0.2433982640504837, + "learning_rate": 8.569680942119804e-05, + "loss": 0.7021, + "step": 1170 + }, + { + "epoch": 0.2698778520396405, + "grad_norm": 0.26769572496414185, + "learning_rate": 8.567066144062232e-05, + "loss": 0.6999, + "step": 1171 + }, + { + "epoch": 0.27010831988937545, + "grad_norm": 0.24118681252002716, + "learning_rate": 8.564449357807608e-05, + "loss": 0.707, + "step": 1172 + }, + { + "epoch": 0.2703387877391104, + "grad_norm": 0.2884605824947357, + "learning_rate": 8.561830584814468e-05, + "loss": 0.7099, + "step": 1173 + }, + { + "epoch": 0.27056925558884537, + "grad_norm": 0.28720924258232117, + "learning_rate": 8.559209826542451e-05, + "loss": 0.6992, + "step": 1174 + }, + { + "epoch": 0.27079972343858033, + "grad_norm": 0.24960888922214508, + "learning_rate": 8.556587084452305e-05, + "loss": 0.7054, + "step": 1175 + }, + { + "epoch": 0.2710301912883153, + "grad_norm": 0.2377004772424698, + "learning_rate": 8.553962360005882e-05, + "loss": 0.7014, + "step": 1176 + }, + { + "epoch": 0.27126065913805025, + "grad_norm": 0.2665834426879883, + "learning_rate": 8.551335654666144e-05, + "loss": 0.7019, + "step": 1177 + }, + { + "epoch": 0.2714911269877852, + "grad_norm": 0.25975316762924194, + "learning_rate": 8.54870696989715e-05, + "loss": 0.6971, + "step": 1178 + }, + { + "epoch": 0.2717215948375202, + "grad_norm": 0.2380094677209854, + "learning_rate": 8.546076307164068e-05, + "loss": 0.6961, + "step": 1179 + }, + { + "epoch": 0.27195206268725514, + "grad_norm": 0.24472801387310028, + "learning_rate": 8.543443667933166e-05, + "loss": 0.7049, + "step": 1180 + }, + { + "epoch": 0.2721825305369901, + "grad_norm": 0.2646014094352722, + "learning_rate": 8.540809053671812e-05, + "loss": 0.7148, + "step": 1181 + }, + { + "epoch": 0.27241299838672506, + "grad_norm": 0.23404592275619507, + "learning_rate": 8.538172465848479e-05, + "loss": 0.7094, + "step": 1182 + }, + { + "epoch": 0.27264346623646, + "grad_norm": 0.2830381691455841, + "learning_rate": 8.535533905932738e-05, + "loss": 0.7061, + "step": 1183 + }, + { + "epoch": 0.272873934086195, + "grad_norm": 0.2919880747795105, + "learning_rate": 8.532893375395257e-05, + "loss": 0.7036, + "step": 1184 + }, + { + "epoch": 0.27310440193592994, + "grad_norm": 0.24687440693378448, + "learning_rate": 8.530250875707806e-05, + "loss": 0.7004, + "step": 1185 + }, + { + "epoch": 0.2733348697856649, + "grad_norm": 0.2443513572216034, + "learning_rate": 8.527606408343249e-05, + "loss": 0.6899, + "step": 1186 + }, + { + "epoch": 0.27356533763539986, + "grad_norm": 0.2695756256580353, + "learning_rate": 8.524959974775551e-05, + "loss": 0.7065, + "step": 1187 + }, + { + "epoch": 0.2737958054851348, + "grad_norm": 0.24225404858589172, + "learning_rate": 8.522311576479768e-05, + "loss": 0.7019, + "step": 1188 + }, + { + "epoch": 0.2740262733348698, + "grad_norm": 0.2715757191181183, + "learning_rate": 8.519661214932055e-05, + "loss": 0.7099, + "step": 1189 + }, + { + "epoch": 0.27425674118460475, + "grad_norm": 0.31206077337265015, + "learning_rate": 8.517008891609661e-05, + "loss": 0.7088, + "step": 1190 + }, + { + "epoch": 0.2744872090343397, + "grad_norm": 0.2775088846683502, + "learning_rate": 8.514354607990926e-05, + "loss": 0.7031, + "step": 1191 + }, + { + "epoch": 0.27471767688407467, + "grad_norm": 0.2421555519104004, + "learning_rate": 8.511698365555285e-05, + "loss": 0.7112, + "step": 1192 + }, + { + "epoch": 0.27494814473380963, + "grad_norm": 0.31836438179016113, + "learning_rate": 8.509040165783263e-05, + "loss": 0.7034, + "step": 1193 + }, + { + "epoch": 0.2751786125835446, + "grad_norm": 0.3080047369003296, + "learning_rate": 8.506380010156476e-05, + "loss": 0.7053, + "step": 1194 + }, + { + "epoch": 0.27540908043327955, + "grad_norm": 0.2516467869281769, + "learning_rate": 8.503717900157632e-05, + "loss": 0.7077, + "step": 1195 + }, + { + "epoch": 0.2756395482830145, + "grad_norm": 0.26254069805145264, + "learning_rate": 8.501053837270528e-05, + "loss": 0.7066, + "step": 1196 + }, + { + "epoch": 0.2758700161327495, + "grad_norm": 0.3127571940422058, + "learning_rate": 8.498387822980046e-05, + "loss": 0.6957, + "step": 1197 + }, + { + "epoch": 0.27610048398248443, + "grad_norm": 0.2957381308078766, + "learning_rate": 8.49571985877216e-05, + "loss": 0.6919, + "step": 1198 + }, + { + "epoch": 0.2763309518322194, + "grad_norm": 0.29202744364738464, + "learning_rate": 8.49304994613393e-05, + "loss": 0.7101, + "step": 1199 + }, + { + "epoch": 0.27656141968195436, + "grad_norm": 0.272582083940506, + "learning_rate": 8.490378086553499e-05, + "loss": 0.6988, + "step": 1200 + }, + { + "epoch": 0.2767918875316893, + "grad_norm": 0.24930858612060547, + "learning_rate": 8.4877042815201e-05, + "loss": 0.699, + "step": 1201 + }, + { + "epoch": 0.2770223553814243, + "grad_norm": 0.2611529231071472, + "learning_rate": 8.485028532524046e-05, + "loss": 0.7102, + "step": 1202 + }, + { + "epoch": 0.27725282323115924, + "grad_norm": 0.3048509359359741, + "learning_rate": 8.482350841056737e-05, + "loss": 0.7006, + "step": 1203 + }, + { + "epoch": 0.2774832910808942, + "grad_norm": 0.26629912853240967, + "learning_rate": 8.479671208610653e-05, + "loss": 0.6953, + "step": 1204 + }, + { + "epoch": 0.27771375893062916, + "grad_norm": 0.27922797203063965, + "learning_rate": 8.476989636679355e-05, + "loss": 0.7101, + "step": 1205 + }, + { + "epoch": 0.2779442267803641, + "grad_norm": 0.26804566383361816, + "learning_rate": 8.47430612675749e-05, + "loss": 0.7082, + "step": 1206 + }, + { + "epoch": 0.2781746946300991, + "grad_norm": 0.3235054910182953, + "learning_rate": 8.471620680340779e-05, + "loss": 0.6958, + "step": 1207 + }, + { + "epoch": 0.27840516247983405, + "grad_norm": 0.260946124792099, + "learning_rate": 8.468933298926029e-05, + "loss": 0.6972, + "step": 1208 + }, + { + "epoch": 0.278635630329569, + "grad_norm": 0.2844315469264984, + "learning_rate": 8.466243984011117e-05, + "loss": 0.6987, + "step": 1209 + }, + { + "epoch": 0.27886609817930397, + "grad_norm": 0.2835547626018524, + "learning_rate": 8.463552737095007e-05, + "loss": 0.6999, + "step": 1210 + }, + { + "epoch": 0.27909656602903893, + "grad_norm": 0.2883650064468384, + "learning_rate": 8.460859559677734e-05, + "loss": 0.7028, + "step": 1211 + }, + { + "epoch": 0.2793270338787739, + "grad_norm": 0.2849369943141937, + "learning_rate": 8.458164453260408e-05, + "loss": 0.7007, + "step": 1212 + }, + { + "epoch": 0.27955750172850885, + "grad_norm": 0.24277077615261078, + "learning_rate": 8.455467419345222e-05, + "loss": 0.698, + "step": 1213 + }, + { + "epoch": 0.2797879695782438, + "grad_norm": 0.2855810821056366, + "learning_rate": 8.452768459435434e-05, + "loss": 0.6947, + "step": 1214 + }, + { + "epoch": 0.2800184374279788, + "grad_norm": 0.2892487645149231, + "learning_rate": 8.450067575035378e-05, + "loss": 0.7041, + "step": 1215 + }, + { + "epoch": 0.28024890527771373, + "grad_norm": 0.2550940215587616, + "learning_rate": 8.447364767650468e-05, + "loss": 0.6982, + "step": 1216 + }, + { + "epoch": 0.2804793731274487, + "grad_norm": 0.28303179144859314, + "learning_rate": 8.444660038787178e-05, + "loss": 0.6891, + "step": 1217 + }, + { + "epoch": 0.28070984097718366, + "grad_norm": 0.29353299736976624, + "learning_rate": 8.441953389953062e-05, + "loss": 0.699, + "step": 1218 + }, + { + "epoch": 0.28094030882691867, + "grad_norm": 0.2536478340625763, + "learning_rate": 8.43924482265674e-05, + "loss": 0.7046, + "step": 1219 + }, + { + "epoch": 0.28117077667665363, + "grad_norm": 0.32717081904411316, + "learning_rate": 8.436534338407902e-05, + "loss": 0.7079, + "step": 1220 + }, + { + "epoch": 0.2814012445263886, + "grad_norm": 0.2975577116012573, + "learning_rate": 8.43382193871731e-05, + "loss": 0.7002, + "step": 1221 + }, + { + "epoch": 0.28163171237612356, + "grad_norm": 0.25119921565055847, + "learning_rate": 8.431107625096787e-05, + "loss": 0.702, + "step": 1222 + }, + { + "epoch": 0.2818621802258585, + "grad_norm": 0.30405279994010925, + "learning_rate": 8.428391399059228e-05, + "loss": 0.6982, + "step": 1223 + }, + { + "epoch": 0.2820926480755935, + "grad_norm": 0.2723596692085266, + "learning_rate": 8.42567326211859e-05, + "loss": 0.7001, + "step": 1224 + }, + { + "epoch": 0.28232311592532844, + "grad_norm": 0.226847305893898, + "learning_rate": 8.4229532157899e-05, + "loss": 0.6981, + "step": 1225 + }, + { + "epoch": 0.2825535837750634, + "grad_norm": 0.2864554226398468, + "learning_rate": 8.420231261589246e-05, + "loss": 0.7076, + "step": 1226 + }, + { + "epoch": 0.28278405162479836, + "grad_norm": 0.24330449104309082, + "learning_rate": 8.417507401033779e-05, + "loss": 0.6981, + "step": 1227 + }, + { + "epoch": 0.2830145194745333, + "grad_norm": 0.3172352612018585, + "learning_rate": 8.414781635641714e-05, + "loss": 0.6957, + "step": 1228 + }, + { + "epoch": 0.2832449873242683, + "grad_norm": 0.2853315770626068, + "learning_rate": 8.412053966932326e-05, + "loss": 0.7, + "step": 1229 + }, + { + "epoch": 0.28347545517400324, + "grad_norm": 0.3434127867221832, + "learning_rate": 8.409324396425954e-05, + "loss": 0.7085, + "step": 1230 + }, + { + "epoch": 0.2837059230237382, + "grad_norm": 0.2804059088230133, + "learning_rate": 8.406592925643995e-05, + "loss": 0.7037, + "step": 1231 + }, + { + "epoch": 0.28393639087347317, + "grad_norm": 0.26557254791259766, + "learning_rate": 8.403859556108904e-05, + "loss": 0.699, + "step": 1232 + }, + { + "epoch": 0.2841668587232081, + "grad_norm": 0.28296953439712524, + "learning_rate": 8.401124289344195e-05, + "loss": 0.6907, + "step": 1233 + }, + { + "epoch": 0.2843973265729431, + "grad_norm": 0.2905629873275757, + "learning_rate": 8.398387126874443e-05, + "loss": 0.6962, + "step": 1234 + }, + { + "epoch": 0.28462779442267805, + "grad_norm": 0.2673647403717041, + "learning_rate": 8.395648070225272e-05, + "loss": 0.6941, + "step": 1235 + }, + { + "epoch": 0.284858262272413, + "grad_norm": 0.30280497670173645, + "learning_rate": 8.392907120923373e-05, + "loss": 0.6993, + "step": 1236 + }, + { + "epoch": 0.28508873012214797, + "grad_norm": 0.28587183356285095, + "learning_rate": 8.39016428049648e-05, + "loss": 0.696, + "step": 1237 + }, + { + "epoch": 0.28531919797188293, + "grad_norm": 0.24387072026729584, + "learning_rate": 8.387419550473387e-05, + "loss": 0.7027, + "step": 1238 + }, + { + "epoch": 0.2855496658216179, + "grad_norm": 0.2583426237106323, + "learning_rate": 8.384672932383942e-05, + "loss": 0.701, + "step": 1239 + }, + { + "epoch": 0.28578013367135285, + "grad_norm": 0.2734989821910858, + "learning_rate": 8.381924427759044e-05, + "loss": 0.6959, + "step": 1240 + }, + { + "epoch": 0.2860106015210878, + "grad_norm": 0.2484637349843979, + "learning_rate": 8.379174038130643e-05, + "loss": 0.6997, + "step": 1241 + }, + { + "epoch": 0.2862410693708228, + "grad_norm": 0.2433558702468872, + "learning_rate": 8.376421765031741e-05, + "loss": 0.6998, + "step": 1242 + }, + { + "epoch": 0.28647153722055774, + "grad_norm": 0.2775626480579376, + "learning_rate": 8.373667609996387e-05, + "loss": 0.6962, + "step": 1243 + }, + { + "epoch": 0.2867020050702927, + "grad_norm": 0.26966193318367004, + "learning_rate": 8.370911574559683e-05, + "loss": 0.7024, + "step": 1244 + }, + { + "epoch": 0.28693247292002766, + "grad_norm": 0.2636489272117615, + "learning_rate": 8.368153660257776e-05, + "loss": 0.7041, + "step": 1245 + }, + { + "epoch": 0.2871629407697626, + "grad_norm": 0.2798607051372528, + "learning_rate": 8.365393868627862e-05, + "loss": 0.6912, + "step": 1246 + }, + { + "epoch": 0.2873934086194976, + "grad_norm": 0.26497724652290344, + "learning_rate": 8.362632201208181e-05, + "loss": 0.6995, + "step": 1247 + }, + { + "epoch": 0.28762387646923254, + "grad_norm": 0.26089614629745483, + "learning_rate": 8.359868659538022e-05, + "loss": 0.6994, + "step": 1248 + }, + { + "epoch": 0.2878543443189675, + "grad_norm": 0.2735275328159332, + "learning_rate": 8.357103245157715e-05, + "loss": 0.6995, + "step": 1249 + }, + { + "epoch": 0.28808481216870246, + "grad_norm": 0.27281323075294495, + "learning_rate": 8.354335959608638e-05, + "loss": 0.6947, + "step": 1250 + }, + { + "epoch": 0.2883152800184374, + "grad_norm": 0.2844576835632324, + "learning_rate": 8.351566804433207e-05, + "loss": 0.7073, + "step": 1251 + }, + { + "epoch": 0.2885457478681724, + "grad_norm": 0.2630547285079956, + "learning_rate": 8.348795781174885e-05, + "loss": 0.7021, + "step": 1252 + }, + { + "epoch": 0.28877621571790735, + "grad_norm": 0.23190078139305115, + "learning_rate": 8.346022891378172e-05, + "loss": 0.7013, + "step": 1253 + }, + { + "epoch": 0.2890066835676423, + "grad_norm": 0.28771939873695374, + "learning_rate": 8.34324813658861e-05, + "loss": 0.7027, + "step": 1254 + }, + { + "epoch": 0.28923715141737727, + "grad_norm": 0.2860211133956909, + "learning_rate": 8.340471518352781e-05, + "loss": 0.7008, + "step": 1255 + }, + { + "epoch": 0.28946761926711223, + "grad_norm": 0.26749560236930847, + "learning_rate": 8.337693038218308e-05, + "loss": 0.7003, + "step": 1256 + }, + { + "epoch": 0.2896980871168472, + "grad_norm": 0.22241248190402985, + "learning_rate": 8.334912697733845e-05, + "loss": 0.7045, + "step": 1257 + }, + { + "epoch": 0.28992855496658215, + "grad_norm": 0.2874812185764313, + "learning_rate": 8.33213049844909e-05, + "loss": 0.696, + "step": 1258 + }, + { + "epoch": 0.2901590228163171, + "grad_norm": 0.28802555799484253, + "learning_rate": 8.329346441914774e-05, + "loss": 0.6997, + "step": 1259 + }, + { + "epoch": 0.2903894906660521, + "grad_norm": 0.24786509573459625, + "learning_rate": 8.326560529682661e-05, + "loss": 0.6983, + "step": 1260 + }, + { + "epoch": 0.29061995851578704, + "grad_norm": 0.2500215768814087, + "learning_rate": 8.323772763305554e-05, + "loss": 0.7038, + "step": 1261 + }, + { + "epoch": 0.290850426365522, + "grad_norm": 0.22261877357959747, + "learning_rate": 8.320983144337286e-05, + "loss": 0.6928, + "step": 1262 + }, + { + "epoch": 0.29108089421525696, + "grad_norm": 0.26974332332611084, + "learning_rate": 8.318191674332724e-05, + "loss": 0.7024, + "step": 1263 + }, + { + "epoch": 0.2913113620649919, + "grad_norm": 0.2661108374595642, + "learning_rate": 8.315398354847766e-05, + "loss": 0.6996, + "step": 1264 + }, + { + "epoch": 0.2915418299147269, + "grad_norm": 0.2478044480085373, + "learning_rate": 8.31260318743934e-05, + "loss": 0.6913, + "step": 1265 + }, + { + "epoch": 0.29177229776446184, + "grad_norm": 0.28388431668281555, + "learning_rate": 8.309806173665409e-05, + "loss": 0.6914, + "step": 1266 + }, + { + "epoch": 0.2920027656141968, + "grad_norm": 0.26973721385002136, + "learning_rate": 8.307007315084958e-05, + "loss": 0.706, + "step": 1267 + }, + { + "epoch": 0.29223323346393176, + "grad_norm": 0.2543336749076843, + "learning_rate": 8.304206613258003e-05, + "loss": 0.6988, + "step": 1268 + }, + { + "epoch": 0.2924637013136667, + "grad_norm": 0.2953673303127289, + "learning_rate": 8.301404069745592e-05, + "loss": 0.6876, + "step": 1269 + }, + { + "epoch": 0.2926941691634017, + "grad_norm": 0.24503032863140106, + "learning_rate": 8.298599686109792e-05, + "loss": 0.6943, + "step": 1270 + }, + { + "epoch": 0.29292463701313665, + "grad_norm": 0.2405981868505478, + "learning_rate": 8.2957934639137e-05, + "loss": 0.7012, + "step": 1271 + }, + { + "epoch": 0.2931551048628716, + "grad_norm": 0.23449143767356873, + "learning_rate": 8.292985404721438e-05, + "loss": 0.6919, + "step": 1272 + }, + { + "epoch": 0.29338557271260657, + "grad_norm": 0.2791789472103119, + "learning_rate": 8.29017551009815e-05, + "loss": 0.6987, + "step": 1273 + }, + { + "epoch": 0.29361604056234153, + "grad_norm": 0.26588696241378784, + "learning_rate": 8.287363781610003e-05, + "loss": 0.7009, + "step": 1274 + }, + { + "epoch": 0.2938465084120765, + "grad_norm": 0.2526492178440094, + "learning_rate": 8.284550220824187e-05, + "loss": 0.7001, + "step": 1275 + }, + { + "epoch": 0.29407697626181145, + "grad_norm": 0.23610974848270416, + "learning_rate": 8.281734829308914e-05, + "loss": 0.6982, + "step": 1276 + }, + { + "epoch": 0.2943074441115464, + "grad_norm": 0.24589475989341736, + "learning_rate": 8.278917608633416e-05, + "loss": 0.7077, + "step": 1277 + }, + { + "epoch": 0.29453791196128143, + "grad_norm": 0.25068679451942444, + "learning_rate": 8.276098560367944e-05, + "loss": 0.6999, + "step": 1278 + }, + { + "epoch": 0.2947683798110164, + "grad_norm": 0.24742679297924042, + "learning_rate": 8.273277686083767e-05, + "loss": 0.7018, + "step": 1279 + }, + { + "epoch": 0.29499884766075135, + "grad_norm": 0.25572851300239563, + "learning_rate": 8.270454987353173e-05, + "loss": 0.7108, + "step": 1280 + }, + { + "epoch": 0.2952293155104863, + "grad_norm": 0.2460428923368454, + "learning_rate": 8.267630465749467e-05, + "loss": 0.6992, + "step": 1281 + }, + { + "epoch": 0.2954597833602213, + "grad_norm": 0.2824234068393707, + "learning_rate": 8.264804122846969e-05, + "loss": 0.6957, + "step": 1282 + }, + { + "epoch": 0.29569025120995623, + "grad_norm": 0.23757845163345337, + "learning_rate": 8.261975960221017e-05, + "loss": 0.6955, + "step": 1283 + }, + { + "epoch": 0.2959207190596912, + "grad_norm": 0.2634974718093872, + "learning_rate": 8.259145979447957e-05, + "loss": 0.6982, + "step": 1284 + }, + { + "epoch": 0.29615118690942616, + "grad_norm": 0.26144111156463623, + "learning_rate": 8.256314182105155e-05, + "loss": 0.7052, + "step": 1285 + }, + { + "epoch": 0.2963816547591611, + "grad_norm": 0.2734619081020355, + "learning_rate": 8.253480569770986e-05, + "loss": 0.705, + "step": 1286 + }, + { + "epoch": 0.2966121226088961, + "grad_norm": 0.28824108839035034, + "learning_rate": 8.250645144024838e-05, + "loss": 0.7058, + "step": 1287 + }, + { + "epoch": 0.29684259045863104, + "grad_norm": 0.2391139417886734, + "learning_rate": 8.247807906447108e-05, + "loss": 0.6906, + "step": 1288 + }, + { + "epoch": 0.297073058308366, + "grad_norm": 0.2693355977535248, + "learning_rate": 8.244968858619206e-05, + "loss": 0.7039, + "step": 1289 + }, + { + "epoch": 0.29730352615810096, + "grad_norm": 0.2826348841190338, + "learning_rate": 8.242128002123548e-05, + "loss": 0.696, + "step": 1290 + }, + { + "epoch": 0.2975339940078359, + "grad_norm": 0.22388167679309845, + "learning_rate": 8.239285338543558e-05, + "loss": 0.7022, + "step": 1291 + }, + { + "epoch": 0.2977644618575709, + "grad_norm": 0.29318729043006897, + "learning_rate": 8.236440869463671e-05, + "loss": 0.6948, + "step": 1292 + }, + { + "epoch": 0.29799492970730584, + "grad_norm": 0.28733378648757935, + "learning_rate": 8.233594596469325e-05, + "loss": 0.7068, + "step": 1293 + }, + { + "epoch": 0.2982253975570408, + "grad_norm": 0.2567828893661499, + "learning_rate": 8.230746521146963e-05, + "loss": 0.6986, + "step": 1294 + }, + { + "epoch": 0.29845586540677577, + "grad_norm": 0.2597467601299286, + "learning_rate": 8.227896645084036e-05, + "loss": 0.685, + "step": 1295 + }, + { + "epoch": 0.2986863332565107, + "grad_norm": 0.26486629247665405, + "learning_rate": 8.225044969868994e-05, + "loss": 0.694, + "step": 1296 + }, + { + "epoch": 0.2989168011062457, + "grad_norm": 0.2641950249671936, + "learning_rate": 8.222191497091294e-05, + "loss": 0.7005, + "step": 1297 + }, + { + "epoch": 0.29914726895598065, + "grad_norm": 0.3094748556613922, + "learning_rate": 8.219336228341392e-05, + "loss": 0.7035, + "step": 1298 + }, + { + "epoch": 0.2993777368057156, + "grad_norm": 0.2441185563802719, + "learning_rate": 8.216479165210748e-05, + "loss": 0.6908, + "step": 1299 + }, + { + "epoch": 0.29960820465545057, + "grad_norm": 0.23661471903324127, + "learning_rate": 8.213620309291819e-05, + "loss": 0.7005, + "step": 1300 + }, + { + "epoch": 0.29983867250518553, + "grad_norm": 0.3035421371459961, + "learning_rate": 8.210759662178064e-05, + "loss": 0.6909, + "step": 1301 + }, + { + "epoch": 0.3000691403549205, + "grad_norm": 0.2798277735710144, + "learning_rate": 8.207897225463938e-05, + "loss": 0.6936, + "step": 1302 + }, + { + "epoch": 0.30029960820465545, + "grad_norm": 0.232849583029747, + "learning_rate": 8.205033000744894e-05, + "loss": 0.6943, + "step": 1303 + }, + { + "epoch": 0.3005300760543904, + "grad_norm": 0.26353907585144043, + "learning_rate": 8.202166989617383e-05, + "loss": 0.6917, + "step": 1304 + }, + { + "epoch": 0.3007605439041254, + "grad_norm": 0.27309128642082214, + "learning_rate": 8.199299193678851e-05, + "loss": 0.702, + "step": 1305 + }, + { + "epoch": 0.30099101175386034, + "grad_norm": 0.2615521252155304, + "learning_rate": 8.196429614527737e-05, + "loss": 0.6962, + "step": 1306 + }, + { + "epoch": 0.3012214796035953, + "grad_norm": 0.25091928243637085, + "learning_rate": 8.193558253763478e-05, + "loss": 0.6988, + "step": 1307 + }, + { + "epoch": 0.30145194745333026, + "grad_norm": 0.25908082723617554, + "learning_rate": 8.1906851129865e-05, + "loss": 0.6996, + "step": 1308 + }, + { + "epoch": 0.3016824153030652, + "grad_norm": 0.2851544916629791, + "learning_rate": 8.187810193798223e-05, + "loss": 0.6932, + "step": 1309 + }, + { + "epoch": 0.3019128831528002, + "grad_norm": 0.2424258440732956, + "learning_rate": 8.184933497801059e-05, + "loss": 0.6926, + "step": 1310 + }, + { + "epoch": 0.30214335100253514, + "grad_norm": 0.24669015407562256, + "learning_rate": 8.182055026598408e-05, + "loss": 0.701, + "step": 1311 + }, + { + "epoch": 0.3023738188522701, + "grad_norm": 0.27151423692703247, + "learning_rate": 8.179174781794658e-05, + "loss": 0.6909, + "step": 1312 + }, + { + "epoch": 0.30260428670200507, + "grad_norm": 0.2658521831035614, + "learning_rate": 8.176292764995195e-05, + "loss": 0.6906, + "step": 1313 + }, + { + "epoch": 0.30283475455174, + "grad_norm": 0.2563072443008423, + "learning_rate": 8.173408977806382e-05, + "loss": 0.6948, + "step": 1314 + }, + { + "epoch": 0.303065222401475, + "grad_norm": 0.27026668190956116, + "learning_rate": 8.170523421835572e-05, + "loss": 0.6986, + "step": 1315 + }, + { + "epoch": 0.30329569025120995, + "grad_norm": 0.2493722289800644, + "learning_rate": 8.167636098691105e-05, + "loss": 0.695, + "step": 1316 + }, + { + "epoch": 0.3035261581009449, + "grad_norm": 0.46334898471832275, + "learning_rate": 8.164747009982308e-05, + "loss": 0.6948, + "step": 1317 + }, + { + "epoch": 0.30375662595067987, + "grad_norm": 0.3177790343761444, + "learning_rate": 8.161856157319488e-05, + "loss": 0.692, + "step": 1318 + }, + { + "epoch": 0.30398709380041483, + "grad_norm": 0.25527554750442505, + "learning_rate": 8.158963542313937e-05, + "loss": 0.7024, + "step": 1319 + }, + { + "epoch": 0.3042175616501498, + "grad_norm": 0.2921774983406067, + "learning_rate": 8.156069166577928e-05, + "loss": 0.7102, + "step": 1320 + }, + { + "epoch": 0.30444802949988475, + "grad_norm": 0.3124329149723053, + "learning_rate": 8.153173031724719e-05, + "loss": 0.6898, + "step": 1321 + }, + { + "epoch": 0.3046784973496197, + "grad_norm": 0.2984555959701538, + "learning_rate": 8.150275139368544e-05, + "loss": 0.6944, + "step": 1322 + }, + { + "epoch": 0.3049089651993547, + "grad_norm": 0.2839076817035675, + "learning_rate": 8.14737549112462e-05, + "loss": 0.6978, + "step": 1323 + }, + { + "epoch": 0.30513943304908964, + "grad_norm": 0.25131693482398987, + "learning_rate": 8.144474088609141e-05, + "loss": 0.6931, + "step": 1324 + }, + { + "epoch": 0.3053699008988246, + "grad_norm": 0.2686595916748047, + "learning_rate": 8.141570933439277e-05, + "loss": 0.6873, + "step": 1325 + }, + { + "epoch": 0.30560036874855956, + "grad_norm": 0.2873900830745697, + "learning_rate": 8.138666027233181e-05, + "loss": 0.6959, + "step": 1326 + }, + { + "epoch": 0.3058308365982945, + "grad_norm": 0.26454657316207886, + "learning_rate": 8.135759371609976e-05, + "loss": 0.6977, + "step": 1327 + }, + { + "epoch": 0.3060613044480295, + "grad_norm": 0.27393782138824463, + "learning_rate": 8.132850968189762e-05, + "loss": 0.6977, + "step": 1328 + }, + { + "epoch": 0.30629177229776444, + "grad_norm": 0.29595717787742615, + "learning_rate": 8.129940818593612e-05, + "loss": 0.6941, + "step": 1329 + }, + { + "epoch": 0.3065222401474994, + "grad_norm": 0.2389097660779953, + "learning_rate": 8.127028924443573e-05, + "loss": 0.7001, + "step": 1330 + }, + { + "epoch": 0.30675270799723436, + "grad_norm": 0.7923142910003662, + "learning_rate": 8.12411528736267e-05, + "loss": 0.6915, + "step": 1331 + }, + { + "epoch": 0.3069831758469693, + "grad_norm": 0.40648335218429565, + "learning_rate": 8.12119990897489e-05, + "loss": 0.6905, + "step": 1332 + }, + { + "epoch": 0.3072136436967043, + "grad_norm": 0.2911185622215271, + "learning_rate": 8.118282790905197e-05, + "loss": 0.6953, + "step": 1333 + }, + { + "epoch": 0.30744411154643925, + "grad_norm": 0.2990292012691498, + "learning_rate": 8.11536393477952e-05, + "loss": 0.6953, + "step": 1334 + }, + { + "epoch": 0.3076745793961742, + "grad_norm": 0.28816699981689453, + "learning_rate": 8.112443342224761e-05, + "loss": 0.6954, + "step": 1335 + }, + { + "epoch": 0.30790504724590917, + "grad_norm": 0.32540613412857056, + "learning_rate": 8.109521014868789e-05, + "loss": 0.6935, + "step": 1336 + }, + { + "epoch": 0.30813551509564413, + "grad_norm": 0.29871416091918945, + "learning_rate": 8.106596954340438e-05, + "loss": 0.7029, + "step": 1337 + }, + { + "epoch": 0.30836598294537915, + "grad_norm": 0.29098618030548096, + "learning_rate": 8.10367116226951e-05, + "loss": 0.7083, + "step": 1338 + }, + { + "epoch": 0.3085964507951141, + "grad_norm": 0.27469146251678467, + "learning_rate": 8.100743640286768e-05, + "loss": 0.7025, + "step": 1339 + }, + { + "epoch": 0.30882691864484907, + "grad_norm": 0.34525662660598755, + "learning_rate": 8.097814390023947e-05, + "loss": 0.6983, + "step": 1340 + }, + { + "epoch": 0.30905738649458403, + "grad_norm": 0.3090488314628601, + "learning_rate": 8.09488341311374e-05, + "loss": 0.7003, + "step": 1341 + }, + { + "epoch": 0.309287854344319, + "grad_norm": 0.2643933892250061, + "learning_rate": 8.091950711189801e-05, + "loss": 0.689, + "step": 1342 + }, + { + "epoch": 0.30951832219405395, + "grad_norm": 0.3033878803253174, + "learning_rate": 8.089016285886748e-05, + "loss": 0.6958, + "step": 1343 + }, + { + "epoch": 0.3097487900437889, + "grad_norm": 0.29167822003364563, + "learning_rate": 8.086080138840162e-05, + "loss": 0.6976, + "step": 1344 + }, + { + "epoch": 0.3099792578935239, + "grad_norm": 0.29147741198539734, + "learning_rate": 8.083142271686577e-05, + "loss": 0.6985, + "step": 1345 + }, + { + "epoch": 0.31020972574325884, + "grad_norm": 0.2536722719669342, + "learning_rate": 8.080202686063492e-05, + "loss": 0.7038, + "step": 1346 + }, + { + "epoch": 0.3104401935929938, + "grad_norm": 0.2939794361591339, + "learning_rate": 8.077261383609363e-05, + "loss": 0.6915, + "step": 1347 + }, + { + "epoch": 0.31067066144272876, + "grad_norm": 0.260345458984375, + "learning_rate": 8.074318365963597e-05, + "loss": 0.6942, + "step": 1348 + }, + { + "epoch": 0.3109011292924637, + "grad_norm": 0.25490236282348633, + "learning_rate": 8.071373634766563e-05, + "loss": 0.6947, + "step": 1349 + }, + { + "epoch": 0.3111315971421987, + "grad_norm": 0.2755090892314911, + "learning_rate": 8.068427191659586e-05, + "loss": 0.7035, + "step": 1350 + }, + { + "epoch": 0.31136206499193364, + "grad_norm": 0.2627999484539032, + "learning_rate": 8.065479038284942e-05, + "loss": 0.6992, + "step": 1351 + }, + { + "epoch": 0.3115925328416686, + "grad_norm": 0.25387728214263916, + "learning_rate": 8.062529176285858e-05, + "loss": 0.7024, + "step": 1352 + }, + { + "epoch": 0.31182300069140356, + "grad_norm": 0.27479302883148193, + "learning_rate": 8.059577607306518e-05, + "loss": 0.6976, + "step": 1353 + }, + { + "epoch": 0.3120534685411385, + "grad_norm": 0.23171396553516388, + "learning_rate": 8.056624332992057e-05, + "loss": 0.6977, + "step": 1354 + }, + { + "epoch": 0.3122839363908735, + "grad_norm": 0.26171013712882996, + "learning_rate": 8.05366935498856e-05, + "loss": 0.6927, + "step": 1355 + }, + { + "epoch": 0.31251440424060845, + "grad_norm": 0.27590593695640564, + "learning_rate": 8.050712674943055e-05, + "loss": 0.6868, + "step": 1356 + }, + { + "epoch": 0.3127448720903434, + "grad_norm": 0.2615238428115845, + "learning_rate": 8.047754294503531e-05, + "loss": 0.6913, + "step": 1357 + }, + { + "epoch": 0.31297533994007837, + "grad_norm": 0.2482849508523941, + "learning_rate": 8.044794215318916e-05, + "loss": 0.6949, + "step": 1358 + }, + { + "epoch": 0.31320580778981333, + "grad_norm": 0.26599544286727905, + "learning_rate": 8.041832439039084e-05, + "loss": 0.6972, + "step": 1359 + }, + { + "epoch": 0.3134362756395483, + "grad_norm": 0.2676306962966919, + "learning_rate": 8.038868967314863e-05, + "loss": 0.7023, + "step": 1360 + }, + { + "epoch": 0.31366674348928325, + "grad_norm": 0.2429536134004593, + "learning_rate": 8.035903801798018e-05, + "loss": 0.691, + "step": 1361 + }, + { + "epoch": 0.3138972113390182, + "grad_norm": 0.264345645904541, + "learning_rate": 8.032936944141261e-05, + "loss": 0.6948, + "step": 1362 + }, + { + "epoch": 0.3141276791887532, + "grad_norm": 0.2679714262485504, + "learning_rate": 8.02996839599825e-05, + "loss": 0.6977, + "step": 1363 + }, + { + "epoch": 0.31435814703848813, + "grad_norm": 0.28790920972824097, + "learning_rate": 8.02699815902358e-05, + "loss": 0.6931, + "step": 1364 + }, + { + "epoch": 0.3145886148882231, + "grad_norm": 0.23752626776695251, + "learning_rate": 8.02402623487279e-05, + "loss": 0.692, + "step": 1365 + }, + { + "epoch": 0.31481908273795806, + "grad_norm": 0.25893813371658325, + "learning_rate": 8.021052625202359e-05, + "loss": 0.7112, + "step": 1366 + }, + { + "epoch": 0.315049550587693, + "grad_norm": 0.29539117217063904, + "learning_rate": 8.018077331669706e-05, + "loss": 0.6844, + "step": 1367 + }, + { + "epoch": 0.315280018437428, + "grad_norm": 0.2858338952064514, + "learning_rate": 8.015100355933189e-05, + "loss": 0.6944, + "step": 1368 + }, + { + "epoch": 0.31551048628716294, + "grad_norm": 0.24025440216064453, + "learning_rate": 8.012121699652103e-05, + "loss": 0.6995, + "step": 1369 + }, + { + "epoch": 0.3157409541368979, + "grad_norm": 0.2733021676540375, + "learning_rate": 8.009141364486679e-05, + "loss": 0.6892, + "step": 1370 + }, + { + "epoch": 0.31597142198663286, + "grad_norm": 0.286783903837204, + "learning_rate": 8.006159352098082e-05, + "loss": 0.6881, + "step": 1371 + }, + { + "epoch": 0.3162018898363678, + "grad_norm": 0.2588256001472473, + "learning_rate": 8.003175664148416e-05, + "loss": 0.6927, + "step": 1372 + }, + { + "epoch": 0.3164323576861028, + "grad_norm": 0.26540815830230713, + "learning_rate": 8.000190302300721e-05, + "loss": 0.6959, + "step": 1373 + }, + { + "epoch": 0.31666282553583774, + "grad_norm": 0.2769984006881714, + "learning_rate": 7.99720326821896e-05, + "loss": 0.6986, + "step": 1374 + }, + { + "epoch": 0.3168932933855727, + "grad_norm": 0.23313313722610474, + "learning_rate": 7.994214563568036e-05, + "loss": 0.6915, + "step": 1375 + }, + { + "epoch": 0.31712376123530767, + "grad_norm": 0.24230745434761047, + "learning_rate": 7.991224190013782e-05, + "loss": 0.7006, + "step": 1376 + }, + { + "epoch": 0.3173542290850426, + "grad_norm": 0.2684518098831177, + "learning_rate": 7.988232149222959e-05, + "loss": 0.6968, + "step": 1377 + }, + { + "epoch": 0.3175846969347776, + "grad_norm": 0.24516130983829498, + "learning_rate": 7.985238442863261e-05, + "loss": 0.6957, + "step": 1378 + }, + { + "epoch": 0.31781516478451255, + "grad_norm": 0.22360306978225708, + "learning_rate": 7.982243072603306e-05, + "loss": 0.7003, + "step": 1379 + }, + { + "epoch": 0.3180456326342475, + "grad_norm": 0.2547612190246582, + "learning_rate": 7.979246040112643e-05, + "loss": 0.6904, + "step": 1380 + }, + { + "epoch": 0.31827610048398247, + "grad_norm": 0.3107689619064331, + "learning_rate": 7.976247347061745e-05, + "loss": 0.6902, + "step": 1381 + }, + { + "epoch": 0.31850656833371743, + "grad_norm": 0.2589125335216522, + "learning_rate": 7.973246995122013e-05, + "loss": 0.6908, + "step": 1382 + }, + { + "epoch": 0.3187370361834524, + "grad_norm": 0.24334730207920074, + "learning_rate": 7.970244985965767e-05, + "loss": 0.6942, + "step": 1383 + }, + { + "epoch": 0.31896750403318735, + "grad_norm": 0.249411478638649, + "learning_rate": 7.96724132126626e-05, + "loss": 0.6968, + "step": 1384 + }, + { + "epoch": 0.3191979718829223, + "grad_norm": 0.29693132638931274, + "learning_rate": 7.964236002697661e-05, + "loss": 0.6869, + "step": 1385 + }, + { + "epoch": 0.3194284397326573, + "grad_norm": 0.2691076993942261, + "learning_rate": 7.96122903193506e-05, + "loss": 0.6914, + "step": 1386 + }, + { + "epoch": 0.31965890758239224, + "grad_norm": 0.2546314597129822, + "learning_rate": 7.958220410654475e-05, + "loss": 0.6944, + "step": 1387 + }, + { + "epoch": 0.3198893754321272, + "grad_norm": 0.29681459069252014, + "learning_rate": 7.955210140532833e-05, + "loss": 0.6801, + "step": 1388 + }, + { + "epoch": 0.32011984328186216, + "grad_norm": 0.24881964921951294, + "learning_rate": 7.952198223247993e-05, + "loss": 0.6966, + "step": 1389 + }, + { + "epoch": 0.3203503111315971, + "grad_norm": 0.2598433196544647, + "learning_rate": 7.949184660478721e-05, + "loss": 0.6955, + "step": 1390 + }, + { + "epoch": 0.3205807789813321, + "grad_norm": 0.2911115884780884, + "learning_rate": 7.946169453904706e-05, + "loss": 0.696, + "step": 1391 + }, + { + "epoch": 0.32081124683106704, + "grad_norm": 0.2831457853317261, + "learning_rate": 7.943152605206551e-05, + "loss": 0.6951, + "step": 1392 + }, + { + "epoch": 0.321041714680802, + "grad_norm": 0.25077253580093384, + "learning_rate": 7.940134116065776e-05, + "loss": 0.691, + "step": 1393 + }, + { + "epoch": 0.32127218253053696, + "grad_norm": 0.2983539402484894, + "learning_rate": 7.937113988164814e-05, + "loss": 0.69, + "step": 1394 + }, + { + "epoch": 0.3215026503802719, + "grad_norm": 0.28595560789108276, + "learning_rate": 7.93409222318701e-05, + "loss": 0.6948, + "step": 1395 + }, + { + "epoch": 0.3217331182300069, + "grad_norm": 0.2612264156341553, + "learning_rate": 7.931068822816627e-05, + "loss": 0.6897, + "step": 1396 + }, + { + "epoch": 0.3219635860797419, + "grad_norm": 0.2846459150314331, + "learning_rate": 7.928043788738835e-05, + "loss": 0.7021, + "step": 1397 + }, + { + "epoch": 0.32219405392947686, + "grad_norm": 0.26386579871177673, + "learning_rate": 7.925017122639712e-05, + "loss": 0.6866, + "step": 1398 + }, + { + "epoch": 0.3224245217792118, + "grad_norm": 0.24749140441417694, + "learning_rate": 7.921988826206252e-05, + "loss": 0.6953, + "step": 1399 + }, + { + "epoch": 0.3226549896289468, + "grad_norm": 0.24181775748729706, + "learning_rate": 7.918958901126354e-05, + "loss": 0.6928, + "step": 1400 + }, + { + "epoch": 0.32288545747868175, + "grad_norm": 0.26094570755958557, + "learning_rate": 7.915927349088825e-05, + "loss": 0.6997, + "step": 1401 + }, + { + "epoch": 0.3231159253284167, + "grad_norm": 0.25756776332855225, + "learning_rate": 7.912894171783383e-05, + "loss": 0.6834, + "step": 1402 + }, + { + "epoch": 0.32334639317815167, + "grad_norm": 0.2835976183414459, + "learning_rate": 7.909859370900642e-05, + "loss": 0.6971, + "step": 1403 + }, + { + "epoch": 0.32357686102788663, + "grad_norm": 0.27739542722702026, + "learning_rate": 7.906822948132131e-05, + "loss": 0.6931, + "step": 1404 + }, + { + "epoch": 0.3238073288776216, + "grad_norm": 0.2250133603811264, + "learning_rate": 7.903784905170277e-05, + "loss": 0.6969, + "step": 1405 + }, + { + "epoch": 0.32403779672735655, + "grad_norm": 0.2541544735431671, + "learning_rate": 7.900745243708416e-05, + "loss": 0.6874, + "step": 1406 + }, + { + "epoch": 0.3242682645770915, + "grad_norm": 0.24582208693027496, + "learning_rate": 7.89770396544078e-05, + "loss": 0.688, + "step": 1407 + }, + { + "epoch": 0.3244987324268265, + "grad_norm": 0.2596662938594818, + "learning_rate": 7.894661072062503e-05, + "loss": 0.7033, + "step": 1408 + }, + { + "epoch": 0.32472920027656144, + "grad_norm": 0.24980351328849792, + "learning_rate": 7.891616565269623e-05, + "loss": 0.6844, + "step": 1409 + }, + { + "epoch": 0.3249596681262964, + "grad_norm": 0.2832036316394806, + "learning_rate": 7.888570446759074e-05, + "loss": 0.6904, + "step": 1410 + }, + { + "epoch": 0.32519013597603136, + "grad_norm": 0.27952510118484497, + "learning_rate": 7.88552271822869e-05, + "loss": 0.6913, + "step": 1411 + }, + { + "epoch": 0.3254206038257663, + "grad_norm": 0.2454599142074585, + "learning_rate": 7.882473381377202e-05, + "loss": 0.6917, + "step": 1412 + }, + { + "epoch": 0.3256510716755013, + "grad_norm": 0.219132199883461, + "learning_rate": 7.879422437904236e-05, + "loss": 0.6959, + "step": 1413 + }, + { + "epoch": 0.32588153952523624, + "grad_norm": 0.25890830159187317, + "learning_rate": 7.876369889510316e-05, + "loss": 0.6987, + "step": 1414 + }, + { + "epoch": 0.3261120073749712, + "grad_norm": 0.22995416820049286, + "learning_rate": 7.873315737896856e-05, + "loss": 0.6895, + "step": 1415 + }, + { + "epoch": 0.32634247522470616, + "grad_norm": 0.26824015378952026, + "learning_rate": 7.87025998476617e-05, + "loss": 0.6919, + "step": 1416 + }, + { + "epoch": 0.3265729430744411, + "grad_norm": 0.26897183060646057, + "learning_rate": 7.867202631821462e-05, + "loss": 0.6913, + "step": 1417 + }, + { + "epoch": 0.3268034109241761, + "grad_norm": 0.24150390923023224, + "learning_rate": 7.864143680766826e-05, + "loss": 0.6967, + "step": 1418 + }, + { + "epoch": 0.32703387877391105, + "grad_norm": 0.26501011848449707, + "learning_rate": 7.861083133307247e-05, + "loss": 0.6967, + "step": 1419 + }, + { + "epoch": 0.327264346623646, + "grad_norm": 0.23892751336097717, + "learning_rate": 7.858020991148602e-05, + "loss": 0.6999, + "step": 1420 + }, + { + "epoch": 0.32749481447338097, + "grad_norm": 0.2643808126449585, + "learning_rate": 7.854957255997653e-05, + "loss": 0.6919, + "step": 1421 + }, + { + "epoch": 0.32772528232311593, + "grad_norm": 0.26628732681274414, + "learning_rate": 7.851891929562057e-05, + "loss": 0.6963, + "step": 1422 + }, + { + "epoch": 0.3279557501728509, + "grad_norm": 0.27681294083595276, + "learning_rate": 7.84882501355035e-05, + "loss": 0.6879, + "step": 1423 + }, + { + "epoch": 0.32818621802258585, + "grad_norm": 0.25981584191322327, + "learning_rate": 7.845756509671955e-05, + "loss": 0.6974, + "step": 1424 + }, + { + "epoch": 0.3284166858723208, + "grad_norm": 0.2389248162508011, + "learning_rate": 7.842686419637189e-05, + "loss": 0.6961, + "step": 1425 + }, + { + "epoch": 0.3286471537220558, + "grad_norm": 0.27327385544776917, + "learning_rate": 7.839614745157242e-05, + "loss": 0.6834, + "step": 1426 + }, + { + "epoch": 0.32887762157179073, + "grad_norm": 0.2433241307735443, + "learning_rate": 7.836541487944193e-05, + "loss": 0.6887, + "step": 1427 + }, + { + "epoch": 0.3291080894215257, + "grad_norm": 0.23156027495861053, + "learning_rate": 7.833466649711002e-05, + "loss": 0.6872, + "step": 1428 + }, + { + "epoch": 0.32933855727126066, + "grad_norm": 0.27620989084243774, + "learning_rate": 7.83039023217151e-05, + "loss": 0.7045, + "step": 1429 + }, + { + "epoch": 0.3295690251209956, + "grad_norm": 0.2832176983356476, + "learning_rate": 7.827312237040435e-05, + "loss": 0.6952, + "step": 1430 + }, + { + "epoch": 0.3297994929707306, + "grad_norm": 0.2531898319721222, + "learning_rate": 7.824232666033382e-05, + "loss": 0.6888, + "step": 1431 + }, + { + "epoch": 0.33002996082046554, + "grad_norm": 0.21987776458263397, + "learning_rate": 7.821151520866829e-05, + "loss": 0.687, + "step": 1432 + }, + { + "epoch": 0.3302604286702005, + "grad_norm": 0.26392921805381775, + "learning_rate": 7.818068803258131e-05, + "loss": 0.6815, + "step": 1433 + }, + { + "epoch": 0.33049089651993546, + "grad_norm": 0.23336456716060638, + "learning_rate": 7.814984514925521e-05, + "loss": 0.6834, + "step": 1434 + }, + { + "epoch": 0.3307213643696704, + "grad_norm": 0.25581541657447815, + "learning_rate": 7.811898657588109e-05, + "loss": 0.6811, + "step": 1435 + }, + { + "epoch": 0.3309518322194054, + "grad_norm": 0.23013727366924286, + "learning_rate": 7.808811232965876e-05, + "loss": 0.676, + "step": 1436 + }, + { + "epoch": 0.33118230006914035, + "grad_norm": 0.222098246216774, + "learning_rate": 7.805722242779678e-05, + "loss": 0.6909, + "step": 1437 + }, + { + "epoch": 0.3314127679188753, + "grad_norm": 0.24615336954593658, + "learning_rate": 7.802631688751247e-05, + "loss": 0.6924, + "step": 1438 + }, + { + "epoch": 0.33164323576861027, + "grad_norm": 0.23416768014431, + "learning_rate": 7.79953957260318e-05, + "loss": 0.7015, + "step": 1439 + }, + { + "epoch": 0.33187370361834523, + "grad_norm": 0.2338051199913025, + "learning_rate": 7.796445896058953e-05, + "loss": 0.6929, + "step": 1440 + }, + { + "epoch": 0.3321041714680802, + "grad_norm": 0.24458903074264526, + "learning_rate": 7.793350660842902e-05, + "loss": 0.6852, + "step": 1441 + }, + { + "epoch": 0.33233463931781515, + "grad_norm": 0.2856997847557068, + "learning_rate": 7.790253868680237e-05, + "loss": 0.6926, + "step": 1442 + }, + { + "epoch": 0.3325651071675501, + "grad_norm": 0.24041031301021576, + "learning_rate": 7.78715552129704e-05, + "loss": 0.689, + "step": 1443 + }, + { + "epoch": 0.3327955750172851, + "grad_norm": 0.286312997341156, + "learning_rate": 7.784055620420251e-05, + "loss": 0.6954, + "step": 1444 + }, + { + "epoch": 0.33302604286702003, + "grad_norm": 0.25002339482307434, + "learning_rate": 7.780954167777684e-05, + "loss": 0.6929, + "step": 1445 + }, + { + "epoch": 0.333256510716755, + "grad_norm": 0.2585028409957886, + "learning_rate": 7.777851165098012e-05, + "loss": 0.6919, + "step": 1446 + }, + { + "epoch": 0.33348697856648996, + "grad_norm": 0.25653818249702454, + "learning_rate": 7.774746614110775e-05, + "loss": 0.6909, + "step": 1447 + }, + { + "epoch": 0.3337174464162249, + "grad_norm": 0.2610546052455902, + "learning_rate": 7.771640516546373e-05, + "loss": 0.6921, + "step": 1448 + }, + { + "epoch": 0.3339479142659599, + "grad_norm": 0.23188693821430206, + "learning_rate": 7.768532874136074e-05, + "loss": 0.6898, + "step": 1449 + }, + { + "epoch": 0.33417838211569484, + "grad_norm": 0.2562350332736969, + "learning_rate": 7.765423688612001e-05, + "loss": 0.6815, + "step": 1450 + }, + { + "epoch": 0.3344088499654298, + "grad_norm": 0.2621154487133026, + "learning_rate": 7.762312961707141e-05, + "loss": 0.6978, + "step": 1451 + }, + { + "epoch": 0.33463931781516476, + "grad_norm": 0.23152048885822296, + "learning_rate": 7.759200695155336e-05, + "loss": 0.6961, + "step": 1452 + }, + { + "epoch": 0.3348697856648997, + "grad_norm": 0.2416309267282486, + "learning_rate": 7.75608689069129e-05, + "loss": 0.6855, + "step": 1453 + }, + { + "epoch": 0.3351002535146347, + "grad_norm": 0.2472970187664032, + "learning_rate": 7.752971550050563e-05, + "loss": 0.6993, + "step": 1454 + }, + { + "epoch": 0.33533072136436964, + "grad_norm": 0.24622397124767303, + "learning_rate": 7.749854674969573e-05, + "loss": 0.7027, + "step": 1455 + }, + { + "epoch": 0.3355611892141046, + "grad_norm": 0.21482546627521515, + "learning_rate": 7.746736267185587e-05, + "loss": 0.6861, + "step": 1456 + }, + { + "epoch": 0.3357916570638396, + "grad_norm": 0.2635815143585205, + "learning_rate": 7.743616328436733e-05, + "loss": 0.6936, + "step": 1457 + }, + { + "epoch": 0.3360221249135746, + "grad_norm": 0.22743616998195648, + "learning_rate": 7.740494860461991e-05, + "loss": 0.6925, + "step": 1458 + }, + { + "epoch": 0.33625259276330954, + "grad_norm": 0.24108994007110596, + "learning_rate": 7.73737186500119e-05, + "loss": 0.6888, + "step": 1459 + }, + { + "epoch": 0.3364830606130445, + "grad_norm": 0.2540665566921234, + "learning_rate": 7.734247343795016e-05, + "loss": 0.6934, + "step": 1460 + }, + { + "epoch": 0.33671352846277947, + "grad_norm": 0.22767134010791779, + "learning_rate": 7.731121298585e-05, + "loss": 0.6871, + "step": 1461 + }, + { + "epoch": 0.3369439963125144, + "grad_norm": 0.2467404305934906, + "learning_rate": 7.727993731113523e-05, + "loss": 0.6895, + "step": 1462 + }, + { + "epoch": 0.3371744641622494, + "grad_norm": 0.23155854642391205, + "learning_rate": 7.724864643123819e-05, + "loss": 0.683, + "step": 1463 + }, + { + "epoch": 0.33740493201198435, + "grad_norm": 0.2407347857952118, + "learning_rate": 7.721734036359964e-05, + "loss": 0.6873, + "step": 1464 + }, + { + "epoch": 0.3376353998617193, + "grad_norm": 0.254625141620636, + "learning_rate": 7.718601912566887e-05, + "loss": 0.6934, + "step": 1465 + }, + { + "epoch": 0.33786586771145427, + "grad_norm": 0.2507195770740509, + "learning_rate": 7.715468273490354e-05, + "loss": 0.6922, + "step": 1466 + }, + { + "epoch": 0.33809633556118923, + "grad_norm": 0.2584257423877716, + "learning_rate": 7.712333120876983e-05, + "loss": 0.6855, + "step": 1467 + }, + { + "epoch": 0.3383268034109242, + "grad_norm": 0.24727049469947815, + "learning_rate": 7.709196456474231e-05, + "loss": 0.6748, + "step": 1468 + }, + { + "epoch": 0.33855727126065915, + "grad_norm": 0.2488059401512146, + "learning_rate": 7.7060582820304e-05, + "loss": 0.6869, + "step": 1469 + }, + { + "epoch": 0.3387877391103941, + "grad_norm": 0.24992689490318298, + "learning_rate": 7.702918599294636e-05, + "loss": 0.6857, + "step": 1470 + }, + { + "epoch": 0.3390182069601291, + "grad_norm": 0.2532345652580261, + "learning_rate": 7.69977741001692e-05, + "loss": 0.6923, + "step": 1471 + }, + { + "epoch": 0.33924867480986404, + "grad_norm": 0.2411700189113617, + "learning_rate": 7.696634715948072e-05, + "loss": 0.6909, + "step": 1472 + }, + { + "epoch": 0.339479142659599, + "grad_norm": 0.2590501606464386, + "learning_rate": 7.693490518839763e-05, + "loss": 0.6771, + "step": 1473 + }, + { + "epoch": 0.33970961050933396, + "grad_norm": 0.2995712161064148, + "learning_rate": 7.690344820444486e-05, + "loss": 0.6867, + "step": 1474 + }, + { + "epoch": 0.3399400783590689, + "grad_norm": 0.26080286502838135, + "learning_rate": 7.68719762251558e-05, + "loss": 0.696, + "step": 1475 + }, + { + "epoch": 0.3401705462088039, + "grad_norm": 0.2599622309207916, + "learning_rate": 7.684048926807219e-05, + "loss": 0.6828, + "step": 1476 + }, + { + "epoch": 0.34040101405853884, + "grad_norm": 0.2792719006538391, + "learning_rate": 7.680898735074407e-05, + "loss": 0.6876, + "step": 1477 + }, + { + "epoch": 0.3406314819082738, + "grad_norm": 0.25866684317588806, + "learning_rate": 7.677747049072987e-05, + "loss": 0.6922, + "step": 1478 + }, + { + "epoch": 0.34086194975800876, + "grad_norm": 0.23737114667892456, + "learning_rate": 7.674593870559634e-05, + "loss": 0.692, + "step": 1479 + }, + { + "epoch": 0.3410924176077437, + "grad_norm": 0.26033899188041687, + "learning_rate": 7.671439201291853e-05, + "loss": 0.69, + "step": 1480 + }, + { + "epoch": 0.3413228854574787, + "grad_norm": 0.24097611010074615, + "learning_rate": 7.668283043027982e-05, + "loss": 0.6896, + "step": 1481 + }, + { + "epoch": 0.34155335330721365, + "grad_norm": 0.26150503754615784, + "learning_rate": 7.665125397527187e-05, + "loss": 0.6919, + "step": 1482 + }, + { + "epoch": 0.3417838211569486, + "grad_norm": 0.21474456787109375, + "learning_rate": 7.661966266549463e-05, + "loss": 0.7017, + "step": 1483 + }, + { + "epoch": 0.34201428900668357, + "grad_norm": 0.2468489408493042, + "learning_rate": 7.658805651855636e-05, + "loss": 0.6858, + "step": 1484 + }, + { + "epoch": 0.34224475685641853, + "grad_norm": 0.2597835659980774, + "learning_rate": 7.655643555207355e-05, + "loss": 0.6919, + "step": 1485 + }, + { + "epoch": 0.3424752247061535, + "grad_norm": 0.24256932735443115, + "learning_rate": 7.652479978367097e-05, + "loss": 0.6915, + "step": 1486 + }, + { + "epoch": 0.34270569255588845, + "grad_norm": 0.2565065026283264, + "learning_rate": 7.649314923098164e-05, + "loss": 0.6863, + "step": 1487 + }, + { + "epoch": 0.3429361604056234, + "grad_norm": 0.27215030789375305, + "learning_rate": 7.646148391164682e-05, + "loss": 0.6871, + "step": 1488 + }, + { + "epoch": 0.3431666282553584, + "grad_norm": 0.27064988017082214, + "learning_rate": 7.6429803843316e-05, + "loss": 0.6914, + "step": 1489 + }, + { + "epoch": 0.34339709610509334, + "grad_norm": 0.2974797487258911, + "learning_rate": 7.63981090436469e-05, + "loss": 0.6805, + "step": 1490 + }, + { + "epoch": 0.3436275639548283, + "grad_norm": 0.21655718982219696, + "learning_rate": 7.636639953030541e-05, + "loss": 0.688, + "step": 1491 + }, + { + "epoch": 0.34385803180456326, + "grad_norm": 0.26122844219207764, + "learning_rate": 7.633467532096567e-05, + "loss": 0.675, + "step": 1492 + }, + { + "epoch": 0.3440884996542982, + "grad_norm": 0.22643567621707916, + "learning_rate": 7.630293643331001e-05, + "loss": 0.6917, + "step": 1493 + }, + { + "epoch": 0.3443189675040332, + "grad_norm": 0.26686912775039673, + "learning_rate": 7.627118288502889e-05, + "loss": 0.6975, + "step": 1494 + }, + { + "epoch": 0.34454943535376814, + "grad_norm": 0.25299954414367676, + "learning_rate": 7.623941469382099e-05, + "loss": 0.6944, + "step": 1495 + }, + { + "epoch": 0.3447799032035031, + "grad_norm": 0.2686970829963684, + "learning_rate": 7.620763187739315e-05, + "loss": 0.6879, + "step": 1496 + }, + { + "epoch": 0.34501037105323806, + "grad_norm": 0.2587553858757019, + "learning_rate": 7.617583445346033e-05, + "loss": 0.6885, + "step": 1497 + }, + { + "epoch": 0.345240838902973, + "grad_norm": 0.25488153100013733, + "learning_rate": 7.614402243974568e-05, + "loss": 0.7005, + "step": 1498 + }, + { + "epoch": 0.345471306752708, + "grad_norm": 0.2613796889781952, + "learning_rate": 7.61121958539804e-05, + "loss": 0.6872, + "step": 1499 + }, + { + "epoch": 0.34570177460244295, + "grad_norm": 0.21812820434570312, + "learning_rate": 7.608035471390394e-05, + "loss": 0.6909, + "step": 1500 + }, + { + "epoch": 0.3459322424521779, + "grad_norm": 0.24839772284030914, + "learning_rate": 7.604849903726372e-05, + "loss": 0.6839, + "step": 1501 + }, + { + "epoch": 0.34616271030191287, + "grad_norm": 0.25548359751701355, + "learning_rate": 7.601662884181534e-05, + "loss": 0.6927, + "step": 1502 + }, + { + "epoch": 0.34639317815164783, + "grad_norm": 0.25990694761276245, + "learning_rate": 7.598474414532252e-05, + "loss": 0.6875, + "step": 1503 + }, + { + "epoch": 0.3466236460013828, + "grad_norm": 0.23840296268463135, + "learning_rate": 7.595284496555698e-05, + "loss": 0.6963, + "step": 1504 + }, + { + "epoch": 0.34685411385111775, + "grad_norm": 0.24960501492023468, + "learning_rate": 7.592093132029861e-05, + "loss": 0.6853, + "step": 1505 + }, + { + "epoch": 0.3470845817008527, + "grad_norm": 0.22266826033592224, + "learning_rate": 7.588900322733526e-05, + "loss": 0.6832, + "step": 1506 + }, + { + "epoch": 0.3473150495505877, + "grad_norm": 0.23632311820983887, + "learning_rate": 7.585706070446288e-05, + "loss": 0.6813, + "step": 1507 + }, + { + "epoch": 0.34754551740032263, + "grad_norm": 0.22808539867401123, + "learning_rate": 7.582510376948552e-05, + "loss": 0.6788, + "step": 1508 + }, + { + "epoch": 0.3477759852500576, + "grad_norm": 0.2298205941915512, + "learning_rate": 7.579313244021515e-05, + "loss": 0.6926, + "step": 1509 + }, + { + "epoch": 0.34800645309979256, + "grad_norm": 0.25376659631729126, + "learning_rate": 7.576114673447186e-05, + "loss": 0.6953, + "step": 1510 + }, + { + "epoch": 0.3482369209495275, + "grad_norm": 0.24706676602363586, + "learning_rate": 7.57291466700837e-05, + "loss": 0.685, + "step": 1511 + }, + { + "epoch": 0.3484673887992625, + "grad_norm": 0.25202682614326477, + "learning_rate": 7.569713226488674e-05, + "loss": 0.6867, + "step": 1512 + }, + { + "epoch": 0.34869785664899744, + "grad_norm": 0.2461564689874649, + "learning_rate": 7.566510353672504e-05, + "loss": 0.6946, + "step": 1513 + }, + { + "epoch": 0.3489283244987324, + "grad_norm": 0.26574838161468506, + "learning_rate": 7.563306050345062e-05, + "loss": 0.699, + "step": 1514 + }, + { + "epoch": 0.34915879234846736, + "grad_norm": 0.256676584482193, + "learning_rate": 7.560100318292355e-05, + "loss": 0.6896, + "step": 1515 + }, + { + "epoch": 0.3493892601982024, + "grad_norm": 0.23021619021892548, + "learning_rate": 7.556893159301178e-05, + "loss": 0.693, + "step": 1516 + }, + { + "epoch": 0.34961972804793734, + "grad_norm": 0.23799525201320648, + "learning_rate": 7.553684575159124e-05, + "loss": 0.6855, + "step": 1517 + }, + { + "epoch": 0.3498501958976723, + "grad_norm": 0.23646856844425201, + "learning_rate": 7.550474567654583e-05, + "loss": 0.6894, + "step": 1518 + }, + { + "epoch": 0.35008066374740726, + "grad_norm": 0.2123948484659195, + "learning_rate": 7.547263138576732e-05, + "loss": 0.6744, + "step": 1519 + }, + { + "epoch": 0.3503111315971422, + "grad_norm": 0.22038936614990234, + "learning_rate": 7.54405028971555e-05, + "loss": 0.6812, + "step": 1520 + }, + { + "epoch": 0.3505415994468772, + "grad_norm": 0.22953416407108307, + "learning_rate": 7.540836022861797e-05, + "loss": 0.679, + "step": 1521 + }, + { + "epoch": 0.35077206729661214, + "grad_norm": 0.23723843693733215, + "learning_rate": 7.53762033980703e-05, + "loss": 0.6828, + "step": 1522 + }, + { + "epoch": 0.3510025351463471, + "grad_norm": 0.2359115332365036, + "learning_rate": 7.534403242343595e-05, + "loss": 0.6821, + "step": 1523 + }, + { + "epoch": 0.35123300299608207, + "grad_norm": 0.23382125794887543, + "learning_rate": 7.531184732264624e-05, + "loss": 0.6885, + "step": 1524 + }, + { + "epoch": 0.351463470845817, + "grad_norm": 0.2295057624578476, + "learning_rate": 7.527964811364035e-05, + "loss": 0.686, + "step": 1525 + }, + { + "epoch": 0.351693938695552, + "grad_norm": 0.21867454051971436, + "learning_rate": 7.524743481436537e-05, + "loss": 0.6864, + "step": 1526 + }, + { + "epoch": 0.35192440654528695, + "grad_norm": 0.21911703050136566, + "learning_rate": 7.521520744277623e-05, + "loss": 0.6882, + "step": 1527 + }, + { + "epoch": 0.3521548743950219, + "grad_norm": 0.227437362074852, + "learning_rate": 7.518296601683567e-05, + "loss": 0.6894, + "step": 1528 + }, + { + "epoch": 0.35238534224475687, + "grad_norm": 0.23779958486557007, + "learning_rate": 7.515071055451429e-05, + "loss": 0.6749, + "step": 1529 + }, + { + "epoch": 0.35261581009449183, + "grad_norm": 0.22434940934181213, + "learning_rate": 7.511844107379052e-05, + "loss": 0.6897, + "step": 1530 + }, + { + "epoch": 0.3528462779442268, + "grad_norm": 0.22103999555110931, + "learning_rate": 7.508615759265059e-05, + "loss": 0.68, + "step": 1531 + }, + { + "epoch": 0.35307674579396175, + "grad_norm": 0.22952067852020264, + "learning_rate": 7.505386012908853e-05, + "loss": 0.6892, + "step": 1532 + }, + { + "epoch": 0.3533072136436967, + "grad_norm": 0.24926556646823883, + "learning_rate": 7.50215487011062e-05, + "loss": 0.6829, + "step": 1533 + }, + { + "epoch": 0.3535376814934317, + "grad_norm": 0.2560986280441284, + "learning_rate": 7.498922332671317e-05, + "loss": 0.6971, + "step": 1534 + }, + { + "epoch": 0.35376814934316664, + "grad_norm": 0.256628155708313, + "learning_rate": 7.495688402392686e-05, + "loss": 0.6908, + "step": 1535 + }, + { + "epoch": 0.3539986171929016, + "grad_norm": 0.23531514406204224, + "learning_rate": 7.492453081077241e-05, + "loss": 0.6811, + "step": 1536 + }, + { + "epoch": 0.35422908504263656, + "grad_norm": 0.24269689619541168, + "learning_rate": 7.489216370528273e-05, + "loss": 0.6824, + "step": 1537 + }, + { + "epoch": 0.3544595528923715, + "grad_norm": 0.2383260726928711, + "learning_rate": 7.485978272549847e-05, + "loss": 0.6876, + "step": 1538 + }, + { + "epoch": 0.3546900207421065, + "grad_norm": 0.25450727343559265, + "learning_rate": 7.482738788946799e-05, + "loss": 0.6852, + "step": 1539 + }, + { + "epoch": 0.35492048859184144, + "grad_norm": 0.21417759358882904, + "learning_rate": 7.479497921524741e-05, + "loss": 0.6848, + "step": 1540 + }, + { + "epoch": 0.3551509564415764, + "grad_norm": 0.23762138187885284, + "learning_rate": 7.476255672090055e-05, + "loss": 0.6943, + "step": 1541 + }, + { + "epoch": 0.35538142429131137, + "grad_norm": 0.2219182252883911, + "learning_rate": 7.473012042449894e-05, + "loss": 0.687, + "step": 1542 + }, + { + "epoch": 0.3556118921410463, + "grad_norm": 0.23807905614376068, + "learning_rate": 7.469767034412176e-05, + "loss": 0.6859, + "step": 1543 + }, + { + "epoch": 0.3558423599907813, + "grad_norm": 0.24784959852695465, + "learning_rate": 7.466520649785593e-05, + "loss": 0.6787, + "step": 1544 + }, + { + "epoch": 0.35607282784051625, + "grad_norm": 0.2389153093099594, + "learning_rate": 7.463272890379602e-05, + "loss": 0.6922, + "step": 1545 + }, + { + "epoch": 0.3563032956902512, + "grad_norm": 0.24661684036254883, + "learning_rate": 7.460023758004426e-05, + "loss": 0.6782, + "step": 1546 + }, + { + "epoch": 0.35653376353998617, + "grad_norm": 0.22186362743377686, + "learning_rate": 7.456773254471053e-05, + "loss": 0.684, + "step": 1547 + }, + { + "epoch": 0.35676423138972113, + "grad_norm": 0.2341771274805069, + "learning_rate": 7.453521381591233e-05, + "loss": 0.688, + "step": 1548 + }, + { + "epoch": 0.3569946992394561, + "grad_norm": 0.2423945963382721, + "learning_rate": 7.450268141177486e-05, + "loss": 0.6738, + "step": 1549 + }, + { + "epoch": 0.35722516708919105, + "grad_norm": 0.2303714007139206, + "learning_rate": 7.44701353504309e-05, + "loss": 0.6826, + "step": 1550 + }, + { + "epoch": 0.357455634938926, + "grad_norm": 0.26467540860176086, + "learning_rate": 7.443757565002081e-05, + "loss": 0.6836, + "step": 1551 + }, + { + "epoch": 0.357686102788661, + "grad_norm": 0.25847116112709045, + "learning_rate": 7.440500232869262e-05, + "loss": 0.6886, + "step": 1552 + }, + { + "epoch": 0.35791657063839594, + "grad_norm": 0.22903743386268616, + "learning_rate": 7.43724154046019e-05, + "loss": 0.6929, + "step": 1553 + }, + { + "epoch": 0.3581470384881309, + "grad_norm": 0.2548694312572479, + "learning_rate": 7.433981489591181e-05, + "loss": 0.6879, + "step": 1554 + }, + { + "epoch": 0.35837750633786586, + "grad_norm": 0.2554045617580414, + "learning_rate": 7.43072008207931e-05, + "loss": 0.6834, + "step": 1555 + }, + { + "epoch": 0.3586079741876008, + "grad_norm": 0.23032429814338684, + "learning_rate": 7.42745731974241e-05, + "loss": 0.6771, + "step": 1556 + }, + { + "epoch": 0.3588384420373358, + "grad_norm": 0.23225395381450653, + "learning_rate": 7.424193204399061e-05, + "loss": 0.6906, + "step": 1557 + }, + { + "epoch": 0.35906890988707074, + "grad_norm": 0.24884352087974548, + "learning_rate": 7.420927737868608e-05, + "loss": 0.6937, + "step": 1558 + }, + { + "epoch": 0.3592993777368057, + "grad_norm": 0.22079841792583466, + "learning_rate": 7.417660921971141e-05, + "loss": 0.6965, + "step": 1559 + }, + { + "epoch": 0.35952984558654066, + "grad_norm": 0.2275458425283432, + "learning_rate": 7.414392758527504e-05, + "loss": 0.6788, + "step": 1560 + }, + { + "epoch": 0.3597603134362756, + "grad_norm": 0.25046491622924805, + "learning_rate": 7.411123249359294e-05, + "loss": 0.6874, + "step": 1561 + }, + { + "epoch": 0.3599907812860106, + "grad_norm": 0.24402521550655365, + "learning_rate": 7.407852396288857e-05, + "loss": 0.6775, + "step": 1562 + }, + { + "epoch": 0.36022124913574555, + "grad_norm": 0.2387145459651947, + "learning_rate": 7.404580201139286e-05, + "loss": 0.6817, + "step": 1563 + }, + { + "epoch": 0.3604517169854805, + "grad_norm": 0.2291865348815918, + "learning_rate": 7.401306665734429e-05, + "loss": 0.691, + "step": 1564 + }, + { + "epoch": 0.36068218483521547, + "grad_norm": 0.24180877208709717, + "learning_rate": 7.398031791898872e-05, + "loss": 0.6863, + "step": 1565 + }, + { + "epoch": 0.36091265268495043, + "grad_norm": 0.2156798541545868, + "learning_rate": 7.394755581457949e-05, + "loss": 0.683, + "step": 1566 + }, + { + "epoch": 0.3611431205346854, + "grad_norm": 0.2224697768688202, + "learning_rate": 7.391478036237747e-05, + "loss": 0.6863, + "step": 1567 + }, + { + "epoch": 0.36137358838442035, + "grad_norm": 0.23175424337387085, + "learning_rate": 7.388199158065086e-05, + "loss": 0.6896, + "step": 1568 + }, + { + "epoch": 0.3616040562341553, + "grad_norm": 0.20944839715957642, + "learning_rate": 7.384918948767538e-05, + "loss": 0.6949, + "step": 1569 + }, + { + "epoch": 0.3618345240838903, + "grad_norm": 0.24800141155719757, + "learning_rate": 7.38163741017341e-05, + "loss": 0.6827, + "step": 1570 + }, + { + "epoch": 0.36206499193362524, + "grad_norm": 0.2055944949388504, + "learning_rate": 7.378354544111755e-05, + "loss": 0.6859, + "step": 1571 + }, + { + "epoch": 0.3622954597833602, + "grad_norm": 0.23914718627929688, + "learning_rate": 7.37507035241236e-05, + "loss": 0.691, + "step": 1572 + }, + { + "epoch": 0.36252592763309516, + "grad_norm": 0.24815700948238373, + "learning_rate": 7.371784836905758e-05, + "loss": 0.6964, + "step": 1573 + }, + { + "epoch": 0.3627563954828301, + "grad_norm": 0.22123336791992188, + "learning_rate": 7.368497999423216e-05, + "loss": 0.6821, + "step": 1574 + }, + { + "epoch": 0.3629868633325651, + "grad_norm": 0.22671015560626984, + "learning_rate": 7.365209841796738e-05, + "loss": 0.6824, + "step": 1575 + }, + { + "epoch": 0.3632173311823001, + "grad_norm": 0.2351810485124588, + "learning_rate": 7.361920365859066e-05, + "loss": 0.6829, + "step": 1576 + }, + { + "epoch": 0.36344779903203506, + "grad_norm": 0.27589890360832214, + "learning_rate": 7.35862957344367e-05, + "loss": 0.6909, + "step": 1577 + }, + { + "epoch": 0.36367826688177, + "grad_norm": 0.24565422534942627, + "learning_rate": 7.355337466384761e-05, + "loss": 0.6882, + "step": 1578 + }, + { + "epoch": 0.363908734731505, + "grad_norm": 0.24734973907470703, + "learning_rate": 7.352044046517285e-05, + "loss": 0.694, + "step": 1579 + }, + { + "epoch": 0.36413920258123994, + "grad_norm": 0.23439617455005646, + "learning_rate": 7.34874931567691e-05, + "loss": 0.6883, + "step": 1580 + }, + { + "epoch": 0.3643696704309749, + "grad_norm": 0.2454935908317566, + "learning_rate": 7.34545327570004e-05, + "loss": 0.692, + "step": 1581 + }, + { + "epoch": 0.36460013828070986, + "grad_norm": 0.21465803682804108, + "learning_rate": 7.342155928423812e-05, + "loss": 0.68, + "step": 1582 + }, + { + "epoch": 0.3648306061304448, + "grad_norm": 0.24188017845153809, + "learning_rate": 7.338857275686084e-05, + "loss": 0.683, + "step": 1583 + }, + { + "epoch": 0.3650610739801798, + "grad_norm": 0.2033400982618332, + "learning_rate": 7.335557319325449e-05, + "loss": 0.6855, + "step": 1584 + }, + { + "epoch": 0.36529154182991475, + "grad_norm": 0.2551295757293701, + "learning_rate": 7.332256061181222e-05, + "loss": 0.6839, + "step": 1585 + }, + { + "epoch": 0.3655220096796497, + "grad_norm": 0.218764066696167, + "learning_rate": 7.328953503093446e-05, + "loss": 0.6823, + "step": 1586 + }, + { + "epoch": 0.36575247752938467, + "grad_norm": 0.23315462470054626, + "learning_rate": 7.325649646902887e-05, + "loss": 0.6884, + "step": 1587 + }, + { + "epoch": 0.36598294537911963, + "grad_norm": 0.2543986439704895, + "learning_rate": 7.322344494451034e-05, + "loss": 0.681, + "step": 1588 + }, + { + "epoch": 0.3662134132288546, + "grad_norm": 0.3002406060695648, + "learning_rate": 7.319038047580102e-05, + "loss": 0.6907, + "step": 1589 + }, + { + "epoch": 0.36644388107858955, + "grad_norm": 0.24670617282390594, + "learning_rate": 7.315730308133023e-05, + "loss": 0.6922, + "step": 1590 + }, + { + "epoch": 0.3666743489283245, + "grad_norm": 0.23123899102210999, + "learning_rate": 7.312421277953454e-05, + "loss": 0.682, + "step": 1591 + }, + { + "epoch": 0.3669048167780595, + "grad_norm": 0.3154587745666504, + "learning_rate": 7.309110958885768e-05, + "loss": 0.6841, + "step": 1592 + }, + { + "epoch": 0.36713528462779443, + "grad_norm": 0.24491167068481445, + "learning_rate": 7.305799352775055e-05, + "loss": 0.6869, + "step": 1593 + }, + { + "epoch": 0.3673657524775294, + "grad_norm": 0.24614278972148895, + "learning_rate": 7.302486461467128e-05, + "loss": 0.678, + "step": 1594 + }, + { + "epoch": 0.36759622032726436, + "grad_norm": 0.2878095507621765, + "learning_rate": 7.299172286808511e-05, + "loss": 0.6899, + "step": 1595 + }, + { + "epoch": 0.3678266881769993, + "grad_norm": 0.26783236861228943, + "learning_rate": 7.295856830646446e-05, + "loss": 0.693, + "step": 1596 + }, + { + "epoch": 0.3680571560267343, + "grad_norm": 0.2804826498031616, + "learning_rate": 7.29254009482889e-05, + "loss": 0.687, + "step": 1597 + }, + { + "epoch": 0.36828762387646924, + "grad_norm": 0.29956531524658203, + "learning_rate": 7.28922208120451e-05, + "loss": 0.6868, + "step": 1598 + }, + { + "epoch": 0.3685180917262042, + "grad_norm": 0.22456161677837372, + "learning_rate": 7.285902791622688e-05, + "loss": 0.6829, + "step": 1599 + }, + { + "epoch": 0.36874855957593916, + "grad_norm": 0.2787242829799652, + "learning_rate": 7.282582227933517e-05, + "loss": 0.6845, + "step": 1600 + }, + { + "epoch": 0.3689790274256741, + "grad_norm": 0.28973785042762756, + "learning_rate": 7.279260391987799e-05, + "loss": 0.6921, + "step": 1601 + }, + { + "epoch": 0.3692094952754091, + "grad_norm": 0.25046855211257935, + "learning_rate": 7.275937285637044e-05, + "loss": 0.6825, + "step": 1602 + }, + { + "epoch": 0.36943996312514404, + "grad_norm": 0.25190284848213196, + "learning_rate": 7.272612910733475e-05, + "loss": 0.6882, + "step": 1603 + }, + { + "epoch": 0.369670430974879, + "grad_norm": 0.2506854832172394, + "learning_rate": 7.269287269130017e-05, + "loss": 0.6851, + "step": 1604 + }, + { + "epoch": 0.36990089882461397, + "grad_norm": 0.2547498643398285, + "learning_rate": 7.2659603626803e-05, + "loss": 0.6797, + "step": 1605 + }, + { + "epoch": 0.3701313666743489, + "grad_norm": 0.23294281959533691, + "learning_rate": 7.262632193238668e-05, + "loss": 0.6831, + "step": 1606 + }, + { + "epoch": 0.3703618345240839, + "grad_norm": 0.24481774866580963, + "learning_rate": 7.259302762660157e-05, + "loss": 0.6835, + "step": 1607 + }, + { + "epoch": 0.37059230237381885, + "grad_norm": 0.24376718699932098, + "learning_rate": 7.255972072800514e-05, + "loss": 0.6762, + "step": 1608 + }, + { + "epoch": 0.3708227702235538, + "grad_norm": 0.22260530292987823, + "learning_rate": 7.252640125516189e-05, + "loss": 0.6831, + "step": 1609 + }, + { + "epoch": 0.37105323807328877, + "grad_norm": 0.23927612602710724, + "learning_rate": 7.249306922664322e-05, + "loss": 0.6926, + "step": 1610 + }, + { + "epoch": 0.37128370592302373, + "grad_norm": 0.23161613941192627, + "learning_rate": 7.245972466102766e-05, + "loss": 0.6795, + "step": 1611 + }, + { + "epoch": 0.3715141737727587, + "grad_norm": 0.2783213257789612, + "learning_rate": 7.242636757690064e-05, + "loss": 0.6919, + "step": 1612 + }, + { + "epoch": 0.37174464162249365, + "grad_norm": 0.3050868809223175, + "learning_rate": 7.239299799285462e-05, + "loss": 0.6864, + "step": 1613 + }, + { + "epoch": 0.3719751094722286, + "grad_norm": 0.2619262635707855, + "learning_rate": 7.235961592748901e-05, + "loss": 0.6822, + "step": 1614 + }, + { + "epoch": 0.3722055773219636, + "grad_norm": 0.24144136905670166, + "learning_rate": 7.232622139941016e-05, + "loss": 0.6942, + "step": 1615 + }, + { + "epoch": 0.37243604517169854, + "grad_norm": 0.2681979238986969, + "learning_rate": 7.229281442723136e-05, + "loss": 0.686, + "step": 1616 + }, + { + "epoch": 0.3726665130214335, + "grad_norm": 0.23774147033691406, + "learning_rate": 7.225939502957287e-05, + "loss": 0.6849, + "step": 1617 + }, + { + "epoch": 0.37289698087116846, + "grad_norm": 0.24318799376487732, + "learning_rate": 7.222596322506188e-05, + "loss": 0.6866, + "step": 1618 + }, + { + "epoch": 0.3731274487209034, + "grad_norm": 0.2579040825366974, + "learning_rate": 7.219251903233246e-05, + "loss": 0.6873, + "step": 1619 + }, + { + "epoch": 0.3733579165706384, + "grad_norm": 0.26902881264686584, + "learning_rate": 7.215906247002557e-05, + "loss": 0.684, + "step": 1620 + }, + { + "epoch": 0.37358838442037334, + "grad_norm": 0.22406496107578278, + "learning_rate": 7.212559355678915e-05, + "loss": 0.6909, + "step": 1621 + }, + { + "epoch": 0.3738188522701083, + "grad_norm": 0.2592219114303589, + "learning_rate": 7.209211231127791e-05, + "loss": 0.6849, + "step": 1622 + }, + { + "epoch": 0.37404932011984326, + "grad_norm": 0.22760163247585297, + "learning_rate": 7.205861875215357e-05, + "loss": 0.6806, + "step": 1623 + }, + { + "epoch": 0.3742797879695782, + "grad_norm": 0.24689623713493347, + "learning_rate": 7.202511289808456e-05, + "loss": 0.6842, + "step": 1624 + }, + { + "epoch": 0.3745102558193132, + "grad_norm": 0.23074333369731903, + "learning_rate": 7.199159476774627e-05, + "loss": 0.6808, + "step": 1625 + }, + { + "epoch": 0.37474072366904815, + "grad_norm": 0.23396410048007965, + "learning_rate": 7.19580643798209e-05, + "loss": 0.6801, + "step": 1626 + }, + { + "epoch": 0.3749711915187831, + "grad_norm": 0.21473470330238342, + "learning_rate": 7.192452175299748e-05, + "loss": 0.6891, + "step": 1627 + }, + { + "epoch": 0.37520165936851807, + "grad_norm": 0.24224236607551575, + "learning_rate": 7.189096690597188e-05, + "loss": 0.6844, + "step": 1628 + }, + { + "epoch": 0.37543212721825303, + "grad_norm": 0.21217916905879974, + "learning_rate": 7.185739985744675e-05, + "loss": 0.681, + "step": 1629 + }, + { + "epoch": 0.375662595067988, + "grad_norm": 0.24271690845489502, + "learning_rate": 7.182382062613156e-05, + "loss": 0.6834, + "step": 1630 + }, + { + "epoch": 0.37589306291772295, + "grad_norm": 0.2046801894903183, + "learning_rate": 7.179022923074258e-05, + "loss": 0.685, + "step": 1631 + }, + { + "epoch": 0.3761235307674579, + "grad_norm": 0.23926329612731934, + "learning_rate": 7.175662569000282e-05, + "loss": 0.6768, + "step": 1632 + }, + { + "epoch": 0.3763539986171929, + "grad_norm": 0.2330237329006195, + "learning_rate": 7.172301002264212e-05, + "loss": 0.6862, + "step": 1633 + }, + { + "epoch": 0.37658446646692784, + "grad_norm": 0.2538076937198639, + "learning_rate": 7.168938224739704e-05, + "loss": 0.6871, + "step": 1634 + }, + { + "epoch": 0.37681493431666285, + "grad_norm": 0.2078879177570343, + "learning_rate": 7.165574238301085e-05, + "loss": 0.6845, + "step": 1635 + }, + { + "epoch": 0.3770454021663978, + "grad_norm": 0.23577922582626343, + "learning_rate": 7.162209044823367e-05, + "loss": 0.6771, + "step": 1636 + }, + { + "epoch": 0.3772758700161328, + "grad_norm": 0.22798825800418854, + "learning_rate": 7.158842646182222e-05, + "loss": 0.6861, + "step": 1637 + }, + { + "epoch": 0.37750633786586774, + "grad_norm": 0.2441009134054184, + "learning_rate": 7.155475044254006e-05, + "loss": 0.6834, + "step": 1638 + }, + { + "epoch": 0.3777368057156027, + "grad_norm": 0.2254721075296402, + "learning_rate": 7.152106240915735e-05, + "loss": 0.6814, + "step": 1639 + }, + { + "epoch": 0.37796727356533766, + "grad_norm": 0.23852422833442688, + "learning_rate": 7.148736238045098e-05, + "loss": 0.6843, + "step": 1640 + }, + { + "epoch": 0.3781977414150726, + "grad_norm": 0.23249536752700806, + "learning_rate": 7.14536503752046e-05, + "loss": 0.69, + "step": 1641 + }, + { + "epoch": 0.3784282092648076, + "grad_norm": 0.2591071128845215, + "learning_rate": 7.141992641220841e-05, + "loss": 0.6809, + "step": 1642 + }, + { + "epoch": 0.37865867711454254, + "grad_norm": 0.24217630922794342, + "learning_rate": 7.138619051025935e-05, + "loss": 0.6746, + "step": 1643 + }, + { + "epoch": 0.3788891449642775, + "grad_norm": 0.24814555048942566, + "learning_rate": 7.135244268816102e-05, + "loss": 0.6856, + "step": 1644 + }, + { + "epoch": 0.37911961281401246, + "grad_norm": 0.24192722141742706, + "learning_rate": 7.131868296472366e-05, + "loss": 0.6856, + "step": 1645 + }, + { + "epoch": 0.3793500806637474, + "grad_norm": 0.27118441462516785, + "learning_rate": 7.12849113587641e-05, + "loss": 0.6829, + "step": 1646 + }, + { + "epoch": 0.3795805485134824, + "grad_norm": 0.2808471918106079, + "learning_rate": 7.125112788910581e-05, + "loss": 0.6844, + "step": 1647 + }, + { + "epoch": 0.37981101636321735, + "grad_norm": 0.23738627135753632, + "learning_rate": 7.121733257457893e-05, + "loss": 0.6808, + "step": 1648 + }, + { + "epoch": 0.3800414842129523, + "grad_norm": 0.241289883852005, + "learning_rate": 7.11835254340201e-05, + "loss": 0.6853, + "step": 1649 + }, + { + "epoch": 0.38027195206268727, + "grad_norm": 0.240788996219635, + "learning_rate": 7.114970648627267e-05, + "loss": 0.6805, + "step": 1650 + }, + { + "epoch": 0.38050241991242223, + "grad_norm": 0.2523535192012787, + "learning_rate": 7.111587575018648e-05, + "loss": 0.6764, + "step": 1651 + }, + { + "epoch": 0.3807328877621572, + "grad_norm": 0.2584069073200226, + "learning_rate": 7.108203324461795e-05, + "loss": 0.6913, + "step": 1652 + }, + { + "epoch": 0.38096335561189215, + "grad_norm": 0.24159248173236847, + "learning_rate": 7.104817898843013e-05, + "loss": 0.6815, + "step": 1653 + }, + { + "epoch": 0.3811938234616271, + "grad_norm": 0.22381487488746643, + "learning_rate": 7.101431300049253e-05, + "loss": 0.6824, + "step": 1654 + }, + { + "epoch": 0.3814242913113621, + "grad_norm": 0.25524625182151794, + "learning_rate": 7.098043529968123e-05, + "loss": 0.6801, + "step": 1655 + }, + { + "epoch": 0.38165475916109703, + "grad_norm": 0.24695929884910583, + "learning_rate": 7.09465459048789e-05, + "loss": 0.6799, + "step": 1656 + }, + { + "epoch": 0.381885227010832, + "grad_norm": 0.21066734194755554, + "learning_rate": 7.091264483497463e-05, + "loss": 0.6713, + "step": 1657 + }, + { + "epoch": 0.38211569486056696, + "grad_norm": 0.25811514258384705, + "learning_rate": 7.087873210886406e-05, + "loss": 0.6852, + "step": 1658 + }, + { + "epoch": 0.3823461627103019, + "grad_norm": 0.21766284108161926, + "learning_rate": 7.084480774544937e-05, + "loss": 0.6869, + "step": 1659 + }, + { + "epoch": 0.3825766305600369, + "grad_norm": 0.2117396891117096, + "learning_rate": 7.081087176363916e-05, + "loss": 0.6815, + "step": 1660 + }, + { + "epoch": 0.38280709840977184, + "grad_norm": 0.23894590139389038, + "learning_rate": 7.077692418234852e-05, + "loss": 0.6919, + "step": 1661 + }, + { + "epoch": 0.3830375662595068, + "grad_norm": 0.2442779392004013, + "learning_rate": 7.074296502049903e-05, + "loss": 0.6762, + "step": 1662 + }, + { + "epoch": 0.38326803410924176, + "grad_norm": 0.20315802097320557, + "learning_rate": 7.070899429701873e-05, + "loss": 0.6747, + "step": 1663 + }, + { + "epoch": 0.3834985019589767, + "grad_norm": 0.22201156616210938, + "learning_rate": 7.067501203084203e-05, + "loss": 0.6802, + "step": 1664 + }, + { + "epoch": 0.3837289698087117, + "grad_norm": 0.21427859365940094, + "learning_rate": 7.064101824090991e-05, + "loss": 0.6758, + "step": 1665 + }, + { + "epoch": 0.38395943765844665, + "grad_norm": 0.20603862404823303, + "learning_rate": 7.060701294616963e-05, + "loss": 0.6759, + "step": 1666 + }, + { + "epoch": 0.3841899055081816, + "grad_norm": 0.2058810144662857, + "learning_rate": 7.057299616557493e-05, + "loss": 0.6883, + "step": 1667 + }, + { + "epoch": 0.38442037335791657, + "grad_norm": 0.20582230389118195, + "learning_rate": 7.053896791808598e-05, + "loss": 0.6883, + "step": 1668 + }, + { + "epoch": 0.38465084120765153, + "grad_norm": 0.20562750101089478, + "learning_rate": 7.050492822266929e-05, + "loss": 0.6827, + "step": 1669 + }, + { + "epoch": 0.3848813090573865, + "grad_norm": 0.1945854127407074, + "learning_rate": 7.047087709829777e-05, + "loss": 0.6869, + "step": 1670 + }, + { + "epoch": 0.38511177690712145, + "grad_norm": 0.22347545623779297, + "learning_rate": 7.043681456395068e-05, + "loss": 0.6801, + "step": 1671 + }, + { + "epoch": 0.3853422447568564, + "grad_norm": 0.21079128980636597, + "learning_rate": 7.04027406386137e-05, + "loss": 0.6774, + "step": 1672 + }, + { + "epoch": 0.3855727126065914, + "grad_norm": 0.2209564447402954, + "learning_rate": 7.036865534127879e-05, + "loss": 0.6855, + "step": 1673 + }, + { + "epoch": 0.38580318045632633, + "grad_norm": 0.226369246840477, + "learning_rate": 7.033455869094428e-05, + "loss": 0.6808, + "step": 1674 + }, + { + "epoch": 0.3860336483060613, + "grad_norm": 0.2145013064146042, + "learning_rate": 7.030045070661484e-05, + "loss": 0.6904, + "step": 1675 + }, + { + "epoch": 0.38626411615579626, + "grad_norm": 0.20242154598236084, + "learning_rate": 7.02663314073014e-05, + "loss": 0.6812, + "step": 1676 + }, + { + "epoch": 0.3864945840055312, + "grad_norm": 0.2313269078731537, + "learning_rate": 7.023220081202128e-05, + "loss": 0.6791, + "step": 1677 + }, + { + "epoch": 0.3867250518552662, + "grad_norm": 0.2067273110151291, + "learning_rate": 7.019805893979805e-05, + "loss": 0.6795, + "step": 1678 + }, + { + "epoch": 0.38695551970500114, + "grad_norm": 0.2214699238538742, + "learning_rate": 7.016390580966157e-05, + "loss": 0.6751, + "step": 1679 + }, + { + "epoch": 0.3871859875547361, + "grad_norm": 0.23702481389045715, + "learning_rate": 7.012974144064795e-05, + "loss": 0.6868, + "step": 1680 + }, + { + "epoch": 0.38741645540447106, + "grad_norm": 0.23808321356773376, + "learning_rate": 7.00955658517996e-05, + "loss": 0.6805, + "step": 1681 + }, + { + "epoch": 0.387646923254206, + "grad_norm": 0.22761982679367065, + "learning_rate": 7.00613790621652e-05, + "loss": 0.6794, + "step": 1682 + }, + { + "epoch": 0.387877391103941, + "grad_norm": 0.22034461796283722, + "learning_rate": 7.002718109079964e-05, + "loss": 0.6863, + "step": 1683 + }, + { + "epoch": 0.38810785895367594, + "grad_norm": 0.24931013584136963, + "learning_rate": 6.999297195676399e-05, + "loss": 0.6806, + "step": 1684 + }, + { + "epoch": 0.3883383268034109, + "grad_norm": 0.24123869836330414, + "learning_rate": 6.995875167912566e-05, + "loss": 0.6765, + "step": 1685 + }, + { + "epoch": 0.38856879465314587, + "grad_norm": 0.20375779271125793, + "learning_rate": 6.992452027695821e-05, + "loss": 0.677, + "step": 1686 + }, + { + "epoch": 0.3887992625028808, + "grad_norm": 0.2386665791273117, + "learning_rate": 6.989027776934138e-05, + "loss": 0.6855, + "step": 1687 + }, + { + "epoch": 0.3890297303526158, + "grad_norm": 0.21312054991722107, + "learning_rate": 6.985602417536112e-05, + "loss": 0.6694, + "step": 1688 + }, + { + "epoch": 0.38926019820235075, + "grad_norm": 0.21280424296855927, + "learning_rate": 6.982175951410957e-05, + "loss": 0.674, + "step": 1689 + }, + { + "epoch": 0.3894906660520857, + "grad_norm": 0.24086953699588776, + "learning_rate": 6.978748380468504e-05, + "loss": 0.6634, + "step": 1690 + }, + { + "epoch": 0.38972113390182067, + "grad_norm": 0.21306774020195007, + "learning_rate": 6.975319706619197e-05, + "loss": 0.6797, + "step": 1691 + }, + { + "epoch": 0.38995160175155563, + "grad_norm": 0.22666017711162567, + "learning_rate": 6.971889931774098e-05, + "loss": 0.6839, + "step": 1692 + }, + { + "epoch": 0.3901820696012906, + "grad_norm": 0.22966359555721283, + "learning_rate": 6.968459057844881e-05, + "loss": 0.6755, + "step": 1693 + }, + { + "epoch": 0.3904125374510256, + "grad_norm": 0.2565663158893585, + "learning_rate": 6.96502708674383e-05, + "loss": 0.6831, + "step": 1694 + }, + { + "epoch": 0.39064300530076057, + "grad_norm": 0.21080222725868225, + "learning_rate": 6.961594020383848e-05, + "loss": 0.6786, + "step": 1695 + }, + { + "epoch": 0.39087347315049553, + "grad_norm": 0.22605536878108978, + "learning_rate": 6.958159860678439e-05, + "loss": 0.6843, + "step": 1696 + }, + { + "epoch": 0.3911039410002305, + "grad_norm": 0.23553648591041565, + "learning_rate": 6.954724609541727e-05, + "loss": 0.6775, + "step": 1697 + }, + { + "epoch": 0.39133440884996545, + "grad_norm": 0.2261909395456314, + "learning_rate": 6.951288268888431e-05, + "loss": 0.6903, + "step": 1698 + }, + { + "epoch": 0.3915648766997004, + "grad_norm": 0.201494961977005, + "learning_rate": 6.947850840633892e-05, + "loss": 0.6812, + "step": 1699 + }, + { + "epoch": 0.3917953445494354, + "grad_norm": 0.24182942509651184, + "learning_rate": 6.944412326694046e-05, + "loss": 0.6807, + "step": 1700 + }, + { + "epoch": 0.39202581239917034, + "grad_norm": 0.2183101326227188, + "learning_rate": 6.940972728985438e-05, + "loss": 0.6847, + "step": 1701 + }, + { + "epoch": 0.3922562802489053, + "grad_norm": 0.21912945806980133, + "learning_rate": 6.93753204942522e-05, + "loss": 0.6768, + "step": 1702 + }, + { + "epoch": 0.39248674809864026, + "grad_norm": 0.24168923497200012, + "learning_rate": 6.93409028993114e-05, + "loss": 0.6825, + "step": 1703 + }, + { + "epoch": 0.3927172159483752, + "grad_norm": 0.21193869411945343, + "learning_rate": 6.930647452421557e-05, + "loss": 0.6891, + "step": 1704 + }, + { + "epoch": 0.3929476837981102, + "grad_norm": 0.23223961889743805, + "learning_rate": 6.927203538815422e-05, + "loss": 0.6724, + "step": 1705 + }, + { + "epoch": 0.39317815164784514, + "grad_norm": 0.2386191338300705, + "learning_rate": 6.923758551032291e-05, + "loss": 0.6857, + "step": 1706 + }, + { + "epoch": 0.3934086194975801, + "grad_norm": 0.2278871089220047, + "learning_rate": 6.92031249099232e-05, + "loss": 0.6871, + "step": 1707 + }, + { + "epoch": 0.39363908734731506, + "grad_norm": 0.25711220502853394, + "learning_rate": 6.916865360616256e-05, + "loss": 0.6841, + "step": 1708 + }, + { + "epoch": 0.39386955519705, + "grad_norm": 0.21602317690849304, + "learning_rate": 6.91341716182545e-05, + "loss": 0.6761, + "step": 1709 + }, + { + "epoch": 0.394100023046785, + "grad_norm": 0.24047702550888062, + "learning_rate": 6.909967896541843e-05, + "loss": 0.6855, + "step": 1710 + }, + { + "epoch": 0.39433049089651995, + "grad_norm": 0.2949325740337372, + "learning_rate": 6.906517566687973e-05, + "loss": 0.6794, + "step": 1711 + }, + { + "epoch": 0.3945609587462549, + "grad_norm": 0.2666512727737427, + "learning_rate": 6.903066174186974e-05, + "loss": 0.685, + "step": 1712 + }, + { + "epoch": 0.39479142659598987, + "grad_norm": 0.22977623343467712, + "learning_rate": 6.899613720962566e-05, + "loss": 0.6855, + "step": 1713 + }, + { + "epoch": 0.39502189444572483, + "grad_norm": 0.2291439175605774, + "learning_rate": 6.896160208939063e-05, + "loss": 0.6755, + "step": 1714 + }, + { + "epoch": 0.3952523622954598, + "grad_norm": 0.26552703976631165, + "learning_rate": 6.892705640041373e-05, + "loss": 0.6886, + "step": 1715 + }, + { + "epoch": 0.39548283014519475, + "grad_norm": 0.20833250880241394, + "learning_rate": 6.889250016194988e-05, + "loss": 0.6832, + "step": 1716 + }, + { + "epoch": 0.3957132979949297, + "grad_norm": 0.23018650710582733, + "learning_rate": 6.88579333932599e-05, + "loss": 0.6717, + "step": 1717 + }, + { + "epoch": 0.3959437658446647, + "grad_norm": 0.23923005163669586, + "learning_rate": 6.882335611361046e-05, + "loss": 0.6858, + "step": 1718 + }, + { + "epoch": 0.39617423369439964, + "grad_norm": 0.22628365457057953, + "learning_rate": 6.878876834227413e-05, + "loss": 0.6803, + "step": 1719 + }, + { + "epoch": 0.3964047015441346, + "grad_norm": 0.232679545879364, + "learning_rate": 6.87541700985293e-05, + "loss": 0.6903, + "step": 1720 + }, + { + "epoch": 0.39663516939386956, + "grad_norm": 0.24142929911613464, + "learning_rate": 6.871956140166019e-05, + "loss": 0.676, + "step": 1721 + }, + { + "epoch": 0.3968656372436045, + "grad_norm": 0.21797588467597961, + "learning_rate": 6.868494227095688e-05, + "loss": 0.6824, + "step": 1722 + }, + { + "epoch": 0.3970961050933395, + "grad_norm": 0.24194461107254028, + "learning_rate": 6.86503127257152e-05, + "loss": 0.6743, + "step": 1723 + }, + { + "epoch": 0.39732657294307444, + "grad_norm": 0.21700245141983032, + "learning_rate": 6.861567278523687e-05, + "loss": 0.6746, + "step": 1724 + }, + { + "epoch": 0.3975570407928094, + "grad_norm": 0.2519386410713196, + "learning_rate": 6.858102246882934e-05, + "loss": 0.6816, + "step": 1725 + }, + { + "epoch": 0.39778750864254436, + "grad_norm": 0.2088238000869751, + "learning_rate": 6.854636179580587e-05, + "loss": 0.6775, + "step": 1726 + }, + { + "epoch": 0.3980179764922793, + "grad_norm": 0.23535437881946564, + "learning_rate": 6.851169078548549e-05, + "loss": 0.6774, + "step": 1727 + }, + { + "epoch": 0.3982484443420143, + "grad_norm": 0.22251394391059875, + "learning_rate": 6.847700945719298e-05, + "loss": 0.68, + "step": 1728 + }, + { + "epoch": 0.39847891219174925, + "grad_norm": 0.21713124215602875, + "learning_rate": 6.844231783025888e-05, + "loss": 0.6772, + "step": 1729 + }, + { + "epoch": 0.3987093800414842, + "grad_norm": 0.21638789772987366, + "learning_rate": 6.840761592401948e-05, + "loss": 0.6854, + "step": 1730 + }, + { + "epoch": 0.39893984789121917, + "grad_norm": 0.22768516838550568, + "learning_rate": 6.837290375781678e-05, + "loss": 0.6828, + "step": 1731 + }, + { + "epoch": 0.39917031574095413, + "grad_norm": 0.22035862505435944, + "learning_rate": 6.833818135099852e-05, + "loss": 0.678, + "step": 1732 + }, + { + "epoch": 0.3994007835906891, + "grad_norm": 0.2309066206216812, + "learning_rate": 6.830344872291812e-05, + "loss": 0.6797, + "step": 1733 + }, + { + "epoch": 0.39963125144042405, + "grad_norm": 0.2268085926771164, + "learning_rate": 6.826870589293474e-05, + "loss": 0.6789, + "step": 1734 + }, + { + "epoch": 0.399861719290159, + "grad_norm": 0.22533169388771057, + "learning_rate": 6.823395288041316e-05, + "loss": 0.6791, + "step": 1735 + }, + { + "epoch": 0.400092187139894, + "grad_norm": 0.22329865396022797, + "learning_rate": 6.819918970472391e-05, + "loss": 0.6779, + "step": 1736 + }, + { + "epoch": 0.40032265498962893, + "grad_norm": 0.22880035638809204, + "learning_rate": 6.816441638524315e-05, + "loss": 0.6845, + "step": 1737 + }, + { + "epoch": 0.4005531228393639, + "grad_norm": 0.20969046652317047, + "learning_rate": 6.812963294135265e-05, + "loss": 0.6739, + "step": 1738 + }, + { + "epoch": 0.40078359068909886, + "grad_norm": 0.20743465423583984, + "learning_rate": 6.809483939243992e-05, + "loss": 0.6761, + "step": 1739 + }, + { + "epoch": 0.4010140585388338, + "grad_norm": 0.22215551137924194, + "learning_rate": 6.806003575789801e-05, + "loss": 0.6696, + "step": 1740 + }, + { + "epoch": 0.4012445263885688, + "grad_norm": 0.20283448696136475, + "learning_rate": 6.802522205712564e-05, + "loss": 0.6786, + "step": 1741 + }, + { + "epoch": 0.40147499423830374, + "grad_norm": 0.21726839244365692, + "learning_rate": 6.799039830952718e-05, + "loss": 0.6883, + "step": 1742 + }, + { + "epoch": 0.4017054620880387, + "grad_norm": 0.20720744132995605, + "learning_rate": 6.795556453451246e-05, + "loss": 0.6844, + "step": 1743 + }, + { + "epoch": 0.40193592993777366, + "grad_norm": 0.21686407923698425, + "learning_rate": 6.792072075149707e-05, + "loss": 0.6776, + "step": 1744 + }, + { + "epoch": 0.4021663977875086, + "grad_norm": 0.21741721034049988, + "learning_rate": 6.788586697990206e-05, + "loss": 0.6675, + "step": 1745 + }, + { + "epoch": 0.4023968656372436, + "grad_norm": 0.22603777050971985, + "learning_rate": 6.785100323915409e-05, + "loss": 0.6708, + "step": 1746 + }, + { + "epoch": 0.40262733348697854, + "grad_norm": 0.2569337487220764, + "learning_rate": 6.781612954868538e-05, + "loss": 0.6787, + "step": 1747 + }, + { + "epoch": 0.4028578013367135, + "grad_norm": 0.21277722716331482, + "learning_rate": 6.778124592793368e-05, + "loss": 0.6772, + "step": 1748 + }, + { + "epoch": 0.40308826918644847, + "grad_norm": 0.23380626738071442, + "learning_rate": 6.774635239634229e-05, + "loss": 0.6775, + "step": 1749 + }, + { + "epoch": 0.40331873703618343, + "grad_norm": 0.22136418521404266, + "learning_rate": 6.771144897336001e-05, + "loss": 0.679, + "step": 1750 + }, + { + "epoch": 0.4035492048859184, + "grad_norm": 0.22446192800998688, + "learning_rate": 6.767653567844121e-05, + "loss": 0.6803, + "step": 1751 + }, + { + "epoch": 0.40377967273565335, + "grad_norm": 0.21622978150844574, + "learning_rate": 6.764161253104567e-05, + "loss": 0.6771, + "step": 1752 + }, + { + "epoch": 0.4040101405853883, + "grad_norm": 0.2152906060218811, + "learning_rate": 6.760667955063876e-05, + "loss": 0.6848, + "step": 1753 + }, + { + "epoch": 0.4042406084351233, + "grad_norm": 0.19828496873378754, + "learning_rate": 6.757173675669127e-05, + "loss": 0.6788, + "step": 1754 + }, + { + "epoch": 0.4044710762848583, + "grad_norm": 0.20052795112133026, + "learning_rate": 6.753678416867944e-05, + "loss": 0.6759, + "step": 1755 + }, + { + "epoch": 0.40470154413459325, + "grad_norm": 0.21270537376403809, + "learning_rate": 6.750182180608505e-05, + "loss": 0.6801, + "step": 1756 + }, + { + "epoch": 0.4049320119843282, + "grad_norm": 0.20280341804027557, + "learning_rate": 6.746684968839525e-05, + "loss": 0.6724, + "step": 1757 + }, + { + "epoch": 0.40516247983406317, + "grad_norm": 0.22918108105659485, + "learning_rate": 6.743186783510269e-05, + "loss": 0.6797, + "step": 1758 + }, + { + "epoch": 0.40539294768379813, + "grad_norm": 0.23558905720710754, + "learning_rate": 6.739687626570541e-05, + "loss": 0.6875, + "step": 1759 + }, + { + "epoch": 0.4056234155335331, + "grad_norm": 0.19638538360595703, + "learning_rate": 6.736187499970684e-05, + "loss": 0.6751, + "step": 1760 + }, + { + "epoch": 0.40585388338326805, + "grad_norm": 0.19849823415279388, + "learning_rate": 6.732686405661587e-05, + "loss": 0.6771, + "step": 1761 + }, + { + "epoch": 0.406084351233003, + "grad_norm": 0.2098354548215866, + "learning_rate": 6.729184345594677e-05, + "loss": 0.6775, + "step": 1762 + }, + { + "epoch": 0.406314819082738, + "grad_norm": 0.20642666518688202, + "learning_rate": 6.725681321721916e-05, + "loss": 0.6746, + "step": 1763 + }, + { + "epoch": 0.40654528693247294, + "grad_norm": 0.23004627227783203, + "learning_rate": 6.722177335995805e-05, + "loss": 0.6811, + "step": 1764 + }, + { + "epoch": 0.4067757547822079, + "grad_norm": 0.21482717990875244, + "learning_rate": 6.718672390369384e-05, + "loss": 0.6762, + "step": 1765 + }, + { + "epoch": 0.40700622263194286, + "grad_norm": 0.19076307117938995, + "learning_rate": 6.715166486796225e-05, + "loss": 0.6849, + "step": 1766 + }, + { + "epoch": 0.4072366904816778, + "grad_norm": 0.23472625017166138, + "learning_rate": 6.711659627230432e-05, + "loss": 0.6638, + "step": 1767 + }, + { + "epoch": 0.4074671583314128, + "grad_norm": 0.20769445598125458, + "learning_rate": 6.708151813626648e-05, + "loss": 0.6732, + "step": 1768 + }, + { + "epoch": 0.40769762618114774, + "grad_norm": 0.2301913946866989, + "learning_rate": 6.704643047940041e-05, + "loss": 0.6707, + "step": 1769 + }, + { + "epoch": 0.4079280940308827, + "grad_norm": 0.2207859307527542, + "learning_rate": 6.701133332126313e-05, + "loss": 0.6799, + "step": 1770 + }, + { + "epoch": 0.40815856188061767, + "grad_norm": 0.20174138247966766, + "learning_rate": 6.697622668141698e-05, + "loss": 0.6815, + "step": 1771 + }, + { + "epoch": 0.4083890297303526, + "grad_norm": 0.24854540824890137, + "learning_rate": 6.694111057942953e-05, + "loss": 0.6759, + "step": 1772 + }, + { + "epoch": 0.4086194975800876, + "grad_norm": 0.2082047462463379, + "learning_rate": 6.690598503487368e-05, + "loss": 0.6792, + "step": 1773 + }, + { + "epoch": 0.40884996542982255, + "grad_norm": 0.2406819462776184, + "learning_rate": 6.687085006732755e-05, + "loss": 0.6852, + "step": 1774 + }, + { + "epoch": 0.4090804332795575, + "grad_norm": 0.22995240986347198, + "learning_rate": 6.683570569637451e-05, + "loss": 0.675, + "step": 1775 + }, + { + "epoch": 0.40931090112929247, + "grad_norm": 0.21281692385673523, + "learning_rate": 6.680055194160323e-05, + "loss": 0.6801, + "step": 1776 + }, + { + "epoch": 0.40954136897902743, + "grad_norm": 0.2113114297389984, + "learning_rate": 6.676538882260752e-05, + "loss": 0.6841, + "step": 1777 + }, + { + "epoch": 0.4097718368287624, + "grad_norm": 0.22743813693523407, + "learning_rate": 6.67302163589865e-05, + "loss": 0.6817, + "step": 1778 + }, + { + "epoch": 0.41000230467849735, + "grad_norm": 0.23003269731998444, + "learning_rate": 6.669503457034446e-05, + "loss": 0.6772, + "step": 1779 + }, + { + "epoch": 0.4102327725282323, + "grad_norm": 0.22872349619865417, + "learning_rate": 6.665984347629083e-05, + "loss": 0.6782, + "step": 1780 + }, + { + "epoch": 0.4104632403779673, + "grad_norm": 0.2347979098558426, + "learning_rate": 6.662464309644034e-05, + "loss": 0.678, + "step": 1781 + }, + { + "epoch": 0.41069370822770224, + "grad_norm": 0.21414168179035187, + "learning_rate": 6.658943345041279e-05, + "loss": 0.6821, + "step": 1782 + }, + { + "epoch": 0.4109241760774372, + "grad_norm": 0.21668468415737152, + "learning_rate": 6.655421455783324e-05, + "loss": 0.6705, + "step": 1783 + }, + { + "epoch": 0.41115464392717216, + "grad_norm": 0.22316017746925354, + "learning_rate": 6.651898643833182e-05, + "loss": 0.6775, + "step": 1784 + }, + { + "epoch": 0.4113851117769071, + "grad_norm": 0.21024148166179657, + "learning_rate": 6.648374911154385e-05, + "loss": 0.678, + "step": 1785 + }, + { + "epoch": 0.4116155796266421, + "grad_norm": 0.2320425808429718, + "learning_rate": 6.644850259710978e-05, + "loss": 0.6739, + "step": 1786 + }, + { + "epoch": 0.41184604747637704, + "grad_norm": 0.21307185292243958, + "learning_rate": 6.641324691467514e-05, + "loss": 0.6781, + "step": 1787 + }, + { + "epoch": 0.412076515326112, + "grad_norm": 0.21239805221557617, + "learning_rate": 6.637798208389063e-05, + "loss": 0.671, + "step": 1788 + }, + { + "epoch": 0.41230698317584696, + "grad_norm": 0.20224244892597198, + "learning_rate": 6.634270812441201e-05, + "loss": 0.6801, + "step": 1789 + }, + { + "epoch": 0.4125374510255819, + "grad_norm": 0.228766068816185, + "learning_rate": 6.630742505590015e-05, + "loss": 0.6785, + "step": 1790 + }, + { + "epoch": 0.4127679188753169, + "grad_norm": 0.24188366532325745, + "learning_rate": 6.627213289802098e-05, + "loss": 0.6889, + "step": 1791 + }, + { + "epoch": 0.41299838672505185, + "grad_norm": 0.2053944319486618, + "learning_rate": 6.623683167044548e-05, + "loss": 0.6838, + "step": 1792 + }, + { + "epoch": 0.4132288545747868, + "grad_norm": 0.22618556022644043, + "learning_rate": 6.620152139284974e-05, + "loss": 0.6778, + "step": 1793 + }, + { + "epoch": 0.41345932242452177, + "grad_norm": 0.20091716945171356, + "learning_rate": 6.616620208491482e-05, + "loss": 0.6835, + "step": 1794 + }, + { + "epoch": 0.41368979027425673, + "grad_norm": 0.2244507372379303, + "learning_rate": 6.61308737663269e-05, + "loss": 0.6756, + "step": 1795 + }, + { + "epoch": 0.4139202581239917, + "grad_norm": 0.24260321259498596, + "learning_rate": 6.609553645677714e-05, + "loss": 0.6853, + "step": 1796 + }, + { + "epoch": 0.41415072597372665, + "grad_norm": 0.21369081735610962, + "learning_rate": 6.606019017596164e-05, + "loss": 0.6953, + "step": 1797 + }, + { + "epoch": 0.4143811938234616, + "grad_norm": 0.20895330607891083, + "learning_rate": 6.602483494358164e-05, + "loss": 0.6743, + "step": 1798 + }, + { + "epoch": 0.4146116616731966, + "grad_norm": 0.231523796916008, + "learning_rate": 6.598947077934326e-05, + "loss": 0.6732, + "step": 1799 + }, + { + "epoch": 0.41484212952293154, + "grad_norm": 0.23038837313652039, + "learning_rate": 6.595409770295765e-05, + "loss": 0.6803, + "step": 1800 + }, + { + "epoch": 0.4150725973726665, + "grad_norm": 0.2283259779214859, + "learning_rate": 6.591871573414093e-05, + "loss": 0.6748, + "step": 1801 + }, + { + "epoch": 0.41530306522240146, + "grad_norm": 0.23936063051223755, + "learning_rate": 6.588332489261412e-05, + "loss": 0.676, + "step": 1802 + }, + { + "epoch": 0.4155335330721364, + "grad_norm": 0.20093607902526855, + "learning_rate": 6.584792519810325e-05, + "loss": 0.6767, + "step": 1803 + }, + { + "epoch": 0.4157640009218714, + "grad_norm": 0.24578706920146942, + "learning_rate": 6.581251667033927e-05, + "loss": 0.678, + "step": 1804 + }, + { + "epoch": 0.41599446877160634, + "grad_norm": 0.21614333987236023, + "learning_rate": 6.577709932905802e-05, + "loss": 0.6824, + "step": 1805 + }, + { + "epoch": 0.4162249366213413, + "grad_norm": 0.22684995830059052, + "learning_rate": 6.574167319400028e-05, + "loss": 0.6815, + "step": 1806 + }, + { + "epoch": 0.41645540447107626, + "grad_norm": 0.2361087203025818, + "learning_rate": 6.570623828491175e-05, + "loss": 0.681, + "step": 1807 + }, + { + "epoch": 0.4166858723208112, + "grad_norm": 0.2213921695947647, + "learning_rate": 6.567079462154298e-05, + "loss": 0.6683, + "step": 1808 + }, + { + "epoch": 0.4169163401705462, + "grad_norm": 0.22478877007961273, + "learning_rate": 6.563534222364941e-05, + "loss": 0.6801, + "step": 1809 + }, + { + "epoch": 0.41714680802028115, + "grad_norm": 0.22775450348854065, + "learning_rate": 6.559988111099139e-05, + "loss": 0.6733, + "step": 1810 + }, + { + "epoch": 0.4173772758700161, + "grad_norm": 0.21958664059638977, + "learning_rate": 6.556441130333403e-05, + "loss": 0.6759, + "step": 1811 + }, + { + "epoch": 0.41760774371975107, + "grad_norm": 0.21274985373020172, + "learning_rate": 6.55289328204474e-05, + "loss": 0.6758, + "step": 1812 + }, + { + "epoch": 0.4178382115694861, + "grad_norm": 0.2310141921043396, + "learning_rate": 6.549344568210636e-05, + "loss": 0.6769, + "step": 1813 + }, + { + "epoch": 0.41806867941922105, + "grad_norm": 0.20216527581214905, + "learning_rate": 6.545794990809056e-05, + "loss": 0.6665, + "step": 1814 + }, + { + "epoch": 0.418299147268956, + "grad_norm": 0.22055986523628235, + "learning_rate": 6.54224455181845e-05, + "loss": 0.6646, + "step": 1815 + }, + { + "epoch": 0.41852961511869097, + "grad_norm": 0.22518932819366455, + "learning_rate": 6.538693253217748e-05, + "loss": 0.6865, + "step": 1816 + }, + { + "epoch": 0.41876008296842593, + "grad_norm": 0.23729877173900604, + "learning_rate": 6.535141096986359e-05, + "loss": 0.6711, + "step": 1817 + }, + { + "epoch": 0.4189905508181609, + "grad_norm": 0.2299748808145523, + "learning_rate": 6.531588085104169e-05, + "loss": 0.6771, + "step": 1818 + }, + { + "epoch": 0.41922101866789585, + "grad_norm": 0.248263880610466, + "learning_rate": 6.528034219551543e-05, + "loss": 0.6709, + "step": 1819 + }, + { + "epoch": 0.4194514865176308, + "grad_norm": 0.22252872586250305, + "learning_rate": 6.524479502309315e-05, + "loss": 0.6814, + "step": 1820 + }, + { + "epoch": 0.4196819543673658, + "grad_norm": 0.24352537095546722, + "learning_rate": 6.520923935358806e-05, + "loss": 0.6792, + "step": 1821 + }, + { + "epoch": 0.41991242221710073, + "grad_norm": 0.24360835552215576, + "learning_rate": 6.517367520681801e-05, + "loss": 0.6874, + "step": 1822 + }, + { + "epoch": 0.4201428900668357, + "grad_norm": 0.24170473217964172, + "learning_rate": 6.513810260260558e-05, + "loss": 0.6815, + "step": 1823 + }, + { + "epoch": 0.42037335791657066, + "grad_norm": 0.2242230921983719, + "learning_rate": 6.510252156077813e-05, + "loss": 0.676, + "step": 1824 + }, + { + "epoch": 0.4206038257663056, + "grad_norm": 0.22861500084400177, + "learning_rate": 6.506693210116764e-05, + "loss": 0.671, + "step": 1825 + }, + { + "epoch": 0.4208342936160406, + "grad_norm": 0.23668934404850006, + "learning_rate": 6.503133424361082e-05, + "loss": 0.6788, + "step": 1826 + }, + { + "epoch": 0.42106476146577554, + "grad_norm": 0.2199348360300064, + "learning_rate": 6.499572800794911e-05, + "loss": 0.6786, + "step": 1827 + }, + { + "epoch": 0.4212952293155105, + "grad_norm": 0.23262521624565125, + "learning_rate": 6.496011341402852e-05, + "loss": 0.6797, + "step": 1828 + }, + { + "epoch": 0.42152569716524546, + "grad_norm": 0.2463759034872055, + "learning_rate": 6.492449048169977e-05, + "loss": 0.6763, + "step": 1829 + }, + { + "epoch": 0.4217561650149804, + "grad_norm": 0.2045678198337555, + "learning_rate": 6.488885923081827e-05, + "loss": 0.6856, + "step": 1830 + }, + { + "epoch": 0.4219866328647154, + "grad_norm": 0.23405107855796814, + "learning_rate": 6.485321968124398e-05, + "loss": 0.671, + "step": 1831 + }, + { + "epoch": 0.42221710071445034, + "grad_norm": 0.21921029686927795, + "learning_rate": 6.481757185284157e-05, + "loss": 0.6799, + "step": 1832 + }, + { + "epoch": 0.4224475685641853, + "grad_norm": 0.218912273645401, + "learning_rate": 6.478191576548024e-05, + "loss": 0.672, + "step": 1833 + }, + { + "epoch": 0.42267803641392027, + "grad_norm": 0.20524494349956512, + "learning_rate": 6.474625143903387e-05, + "loss": 0.686, + "step": 1834 + }, + { + "epoch": 0.4229085042636552, + "grad_norm": 0.2237442284822464, + "learning_rate": 6.471057889338089e-05, + "loss": 0.6793, + "step": 1835 + }, + { + "epoch": 0.4231389721133902, + "grad_norm": 0.21341606974601746, + "learning_rate": 6.46748981484043e-05, + "loss": 0.678, + "step": 1836 + }, + { + "epoch": 0.42336943996312515, + "grad_norm": 0.258621484041214, + "learning_rate": 6.463920922399173e-05, + "loss": 0.6849, + "step": 1837 + }, + { + "epoch": 0.4235999078128601, + "grad_norm": 0.23613744974136353, + "learning_rate": 6.46035121400353e-05, + "loss": 0.6809, + "step": 1838 + }, + { + "epoch": 0.42383037566259507, + "grad_norm": 0.21923868358135223, + "learning_rate": 6.456780691643171e-05, + "loss": 0.6752, + "step": 1839 + }, + { + "epoch": 0.42406084351233003, + "grad_norm": 0.26361706852912903, + "learning_rate": 6.453209357308224e-05, + "loss": 0.6801, + "step": 1840 + }, + { + "epoch": 0.424291311362065, + "grad_norm": 0.22276267409324646, + "learning_rate": 6.449637212989256e-05, + "loss": 0.6755, + "step": 1841 + }, + { + "epoch": 0.42452177921179995, + "grad_norm": 0.2298184633255005, + "learning_rate": 6.446064260677303e-05, + "loss": 0.6751, + "step": 1842 + }, + { + "epoch": 0.4247522470615349, + "grad_norm": 0.2290985882282257, + "learning_rate": 6.442490502363838e-05, + "loss": 0.6794, + "step": 1843 + }, + { + "epoch": 0.4249827149112699, + "grad_norm": 0.2414330095052719, + "learning_rate": 6.438915940040791e-05, + "loss": 0.6783, + "step": 1844 + }, + { + "epoch": 0.42521318276100484, + "grad_norm": 0.21323642134666443, + "learning_rate": 6.435340575700536e-05, + "loss": 0.6757, + "step": 1845 + }, + { + "epoch": 0.4254436506107398, + "grad_norm": 0.24010320007801056, + "learning_rate": 6.431764411335894e-05, + "loss": 0.6741, + "step": 1846 + }, + { + "epoch": 0.42567411846047476, + "grad_norm": 0.22818125784397125, + "learning_rate": 6.428187448940136e-05, + "loss": 0.676, + "step": 1847 + }, + { + "epoch": 0.4259045863102097, + "grad_norm": 0.26238662004470825, + "learning_rate": 6.424609690506972e-05, + "loss": 0.6724, + "step": 1848 + }, + { + "epoch": 0.4261350541599447, + "grad_norm": 0.21544672548770905, + "learning_rate": 6.421031138030562e-05, + "loss": 0.6768, + "step": 1849 + }, + { + "epoch": 0.42636552200967964, + "grad_norm": 0.25987011194229126, + "learning_rate": 6.417451793505502e-05, + "loss": 0.6761, + "step": 1850 + }, + { + "epoch": 0.4265959898594146, + "grad_norm": 0.23844222724437714, + "learning_rate": 6.413871658926833e-05, + "loss": 0.6709, + "step": 1851 + }, + { + "epoch": 0.42682645770914956, + "grad_norm": 0.25137385725975037, + "learning_rate": 6.41029073629004e-05, + "loss": 0.6798, + "step": 1852 + }, + { + "epoch": 0.4270569255588845, + "grad_norm": 0.2629540264606476, + "learning_rate": 6.406709027591039e-05, + "loss": 0.6829, + "step": 1853 + }, + { + "epoch": 0.4272873934086195, + "grad_norm": 0.27725306153297424, + "learning_rate": 6.403126534826189e-05, + "loss": 0.6825, + "step": 1854 + }, + { + "epoch": 0.42751786125835445, + "grad_norm": 0.2338194102048874, + "learning_rate": 6.399543259992288e-05, + "loss": 0.6717, + "step": 1855 + }, + { + "epoch": 0.4277483291080894, + "grad_norm": 0.26456090807914734, + "learning_rate": 6.395959205086564e-05, + "loss": 0.6735, + "step": 1856 + }, + { + "epoch": 0.42797879695782437, + "grad_norm": 0.22425277531147003, + "learning_rate": 6.392374372106686e-05, + "loss": 0.6744, + "step": 1857 + }, + { + "epoch": 0.42820926480755933, + "grad_norm": 0.26653608679771423, + "learning_rate": 6.388788763050753e-05, + "loss": 0.6773, + "step": 1858 + }, + { + "epoch": 0.4284397326572943, + "grad_norm": 0.29735860228538513, + "learning_rate": 6.385202379917297e-05, + "loss": 0.6769, + "step": 1859 + }, + { + "epoch": 0.42867020050702925, + "grad_norm": 0.25533658266067505, + "learning_rate": 6.381615224705283e-05, + "loss": 0.6652, + "step": 1860 + }, + { + "epoch": 0.4289006683567642, + "grad_norm": 0.27232304215431213, + "learning_rate": 6.378027299414104e-05, + "loss": 0.6763, + "step": 1861 + }, + { + "epoch": 0.4291311362064992, + "grad_norm": 0.25832486152648926, + "learning_rate": 6.374438606043582e-05, + "loss": 0.6839, + "step": 1862 + }, + { + "epoch": 0.42936160405623414, + "grad_norm": 0.22207960486412048, + "learning_rate": 6.370849146593973e-05, + "loss": 0.6797, + "step": 1863 + }, + { + "epoch": 0.4295920719059691, + "grad_norm": 0.2290259748697281, + "learning_rate": 6.367258923065951e-05, + "loss": 0.6747, + "step": 1864 + }, + { + "epoch": 0.42982253975570406, + "grad_norm": 0.24262063205242157, + "learning_rate": 6.363667937460624e-05, + "loss": 0.6784, + "step": 1865 + }, + { + "epoch": 0.430053007605439, + "grad_norm": 0.21408423781394958, + "learning_rate": 6.360076191779519e-05, + "loss": 0.6623, + "step": 1866 + }, + { + "epoch": 0.430283475455174, + "grad_norm": 0.2212885320186615, + "learning_rate": 6.356483688024588e-05, + "loss": 0.6676, + "step": 1867 + }, + { + "epoch": 0.43051394330490894, + "grad_norm": 0.23267394304275513, + "learning_rate": 6.352890428198208e-05, + "loss": 0.6764, + "step": 1868 + }, + { + "epoch": 0.4307444111546439, + "grad_norm": 0.21774008870124817, + "learning_rate": 6.349296414303176e-05, + "loss": 0.6752, + "step": 1869 + }, + { + "epoch": 0.43097487900437886, + "grad_norm": 0.22179049253463745, + "learning_rate": 6.345701648342709e-05, + "loss": 0.6886, + "step": 1870 + }, + { + "epoch": 0.4312053468541138, + "grad_norm": 0.2196541428565979, + "learning_rate": 6.342106132320442e-05, + "loss": 0.6793, + "step": 1871 + }, + { + "epoch": 0.4314358147038488, + "grad_norm": 0.22896122932434082, + "learning_rate": 6.338509868240432e-05, + "loss": 0.6643, + "step": 1872 + }, + { + "epoch": 0.4316662825535838, + "grad_norm": 0.22660693526268005, + "learning_rate": 6.334912858107147e-05, + "loss": 0.6817, + "step": 1873 + }, + { + "epoch": 0.43189675040331876, + "grad_norm": 0.1979314535856247, + "learning_rate": 6.331315103925475e-05, + "loss": 0.6786, + "step": 1874 + }, + { + "epoch": 0.4321272182530537, + "grad_norm": 0.22958844900131226, + "learning_rate": 6.327716607700719e-05, + "loss": 0.6719, + "step": 1875 + }, + { + "epoch": 0.4323576861027887, + "grad_norm": 0.20848886668682098, + "learning_rate": 6.324117371438593e-05, + "loss": 0.6718, + "step": 1876 + }, + { + "epoch": 0.43258815395252365, + "grad_norm": 0.21527381241321564, + "learning_rate": 6.320517397145228e-05, + "loss": 0.6845, + "step": 1877 + }, + { + "epoch": 0.4328186218022586, + "grad_norm": 0.20677097141742706, + "learning_rate": 6.316916686827159e-05, + "loss": 0.6769, + "step": 1878 + }, + { + "epoch": 0.43304908965199357, + "grad_norm": 0.20856349170207977, + "learning_rate": 6.313315242491338e-05, + "loss": 0.6758, + "step": 1879 + }, + { + "epoch": 0.43327955750172853, + "grad_norm": 0.21471652388572693, + "learning_rate": 6.309713066145123e-05, + "loss": 0.6686, + "step": 1880 + }, + { + "epoch": 0.4335100253514635, + "grad_norm": 0.22820867598056793, + "learning_rate": 6.306110159796282e-05, + "loss": 0.6764, + "step": 1881 + }, + { + "epoch": 0.43374049320119845, + "grad_norm": 0.21438585221767426, + "learning_rate": 6.302506525452986e-05, + "loss": 0.675, + "step": 1882 + }, + { + "epoch": 0.4339709610509334, + "grad_norm": 0.21153029799461365, + "learning_rate": 6.298902165123815e-05, + "loss": 0.6701, + "step": 1883 + }, + { + "epoch": 0.4342014289006684, + "grad_norm": 0.21586838364601135, + "learning_rate": 6.295297080817754e-05, + "loss": 0.6741, + "step": 1884 + }, + { + "epoch": 0.43443189675040333, + "grad_norm": 0.20780424773693085, + "learning_rate": 6.29169127454419e-05, + "loss": 0.6765, + "step": 1885 + }, + { + "epoch": 0.4346623646001383, + "grad_norm": 0.2166220098733902, + "learning_rate": 6.288084748312915e-05, + "loss": 0.6734, + "step": 1886 + }, + { + "epoch": 0.43489283244987326, + "grad_norm": 0.209901362657547, + "learning_rate": 6.284477504134116e-05, + "loss": 0.6738, + "step": 1887 + }, + { + "epoch": 0.4351233002996082, + "grad_norm": 0.21214714646339417, + "learning_rate": 6.280869544018385e-05, + "loss": 0.6764, + "step": 1888 + }, + { + "epoch": 0.4353537681493432, + "grad_norm": 0.22302688658237457, + "learning_rate": 6.277260869976716e-05, + "loss": 0.6727, + "step": 1889 + }, + { + "epoch": 0.43558423599907814, + "grad_norm": 0.21137331426143646, + "learning_rate": 6.273651484020492e-05, + "loss": 0.6813, + "step": 1890 + }, + { + "epoch": 0.4358147038488131, + "grad_norm": 0.23412688076496124, + "learning_rate": 6.270041388161503e-05, + "loss": 0.6748, + "step": 1891 + }, + { + "epoch": 0.43604517169854806, + "grad_norm": 0.21604032814502716, + "learning_rate": 6.26643058441193e-05, + "loss": 0.676, + "step": 1892 + }, + { + "epoch": 0.436275639548283, + "grad_norm": 0.2466844618320465, + "learning_rate": 6.262819074784343e-05, + "loss": 0.6804, + "step": 1893 + }, + { + "epoch": 0.436506107398018, + "grad_norm": 0.2448147088289261, + "learning_rate": 6.259206861291716e-05, + "loss": 0.6694, + "step": 1894 + }, + { + "epoch": 0.43673657524775295, + "grad_norm": 0.23058217763900757, + "learning_rate": 6.255593945947407e-05, + "loss": 0.6764, + "step": 1895 + }, + { + "epoch": 0.4369670430974879, + "grad_norm": 0.2556455731391907, + "learning_rate": 6.251980330765171e-05, + "loss": 0.6752, + "step": 1896 + }, + { + "epoch": 0.43719751094722287, + "grad_norm": 0.251406729221344, + "learning_rate": 6.248366017759146e-05, + "loss": 0.6751, + "step": 1897 + }, + { + "epoch": 0.43742797879695783, + "grad_norm": 0.2086581438779831, + "learning_rate": 6.244751008943867e-05, + "loss": 0.6774, + "step": 1898 + }, + { + "epoch": 0.4376584466466928, + "grad_norm": 0.2590232789516449, + "learning_rate": 6.241135306334254e-05, + "loss": 0.6764, + "step": 1899 + }, + { + "epoch": 0.43788891449642775, + "grad_norm": 0.23755843937397003, + "learning_rate": 6.237518911945608e-05, + "loss": 0.675, + "step": 1900 + }, + { + "epoch": 0.4381193823461627, + "grad_norm": 0.20899838209152222, + "learning_rate": 6.233901827793625e-05, + "loss": 0.6699, + "step": 1901 + }, + { + "epoch": 0.4383498501958977, + "grad_norm": 0.2480127364397049, + "learning_rate": 6.230284055894379e-05, + "loss": 0.6698, + "step": 1902 + }, + { + "epoch": 0.43858031804563263, + "grad_norm": 0.23556362092494965, + "learning_rate": 6.22666559826433e-05, + "loss": 0.6734, + "step": 1903 + }, + { + "epoch": 0.4388107858953676, + "grad_norm": 0.2578951418399811, + "learning_rate": 6.223046456920321e-05, + "loss": 0.6689, + "step": 1904 + }, + { + "epoch": 0.43904125374510256, + "grad_norm": 0.275614470243454, + "learning_rate": 6.21942663387957e-05, + "loss": 0.6684, + "step": 1905 + }, + { + "epoch": 0.4392717215948375, + "grad_norm": 0.21438564360141754, + "learning_rate": 6.215806131159683e-05, + "loss": 0.6766, + "step": 1906 + }, + { + "epoch": 0.4395021894445725, + "grad_norm": 0.26986971497535706, + "learning_rate": 6.21218495077864e-05, + "loss": 0.6716, + "step": 1907 + }, + { + "epoch": 0.43973265729430744, + "grad_norm": 0.25460711121559143, + "learning_rate": 6.208563094754802e-05, + "loss": 0.6763, + "step": 1908 + }, + { + "epoch": 0.4399631251440424, + "grad_norm": 0.2263481318950653, + "learning_rate": 6.2049405651069e-05, + "loss": 0.6738, + "step": 1909 + }, + { + "epoch": 0.44019359299377736, + "grad_norm": 0.26499027013778687, + "learning_rate": 6.20131736385405e-05, + "loss": 0.6708, + "step": 1910 + }, + { + "epoch": 0.4404240608435123, + "grad_norm": 0.2109207808971405, + "learning_rate": 6.197693493015734e-05, + "loss": 0.6737, + "step": 1911 + }, + { + "epoch": 0.4406545286932473, + "grad_norm": 0.22647859156131744, + "learning_rate": 6.194068954611814e-05, + "loss": 0.6719, + "step": 1912 + }, + { + "epoch": 0.44088499654298224, + "grad_norm": 0.24029402434825897, + "learning_rate": 6.190443750662518e-05, + "loss": 0.6752, + "step": 1913 + }, + { + "epoch": 0.4411154643927172, + "grad_norm": 0.20866861939430237, + "learning_rate": 6.186817883188449e-05, + "loss": 0.6746, + "step": 1914 + }, + { + "epoch": 0.44134593224245217, + "grad_norm": 0.20965643227100372, + "learning_rate": 6.183191354210577e-05, + "loss": 0.675, + "step": 1915 + }, + { + "epoch": 0.4415764000921871, + "grad_norm": 0.23772816359996796, + "learning_rate": 6.179564165750244e-05, + "loss": 0.6806, + "step": 1916 + }, + { + "epoch": 0.4418068679419221, + "grad_norm": 0.2268814593553543, + "learning_rate": 6.175936319829157e-05, + "loss": 0.6682, + "step": 1917 + }, + { + "epoch": 0.44203733579165705, + "grad_norm": 0.24102425575256348, + "learning_rate": 6.17230781846939e-05, + "loss": 0.679, + "step": 1918 + }, + { + "epoch": 0.442267803641392, + "grad_norm": 0.22678816318511963, + "learning_rate": 6.168678663693382e-05, + "loss": 0.6703, + "step": 1919 + }, + { + "epoch": 0.44249827149112697, + "grad_norm": 0.22168827056884766, + "learning_rate": 6.165048857523938e-05, + "loss": 0.6867, + "step": 1920 + }, + { + "epoch": 0.44272873934086193, + "grad_norm": 0.23149913549423218, + "learning_rate": 6.161418401984225e-05, + "loss": 0.6705, + "step": 1921 + }, + { + "epoch": 0.4429592071905969, + "grad_norm": 0.22349926829338074, + "learning_rate": 6.157787299097771e-05, + "loss": 0.6817, + "step": 1922 + }, + { + "epoch": 0.44318967504033185, + "grad_norm": 0.2404746562242508, + "learning_rate": 6.154155550888466e-05, + "loss": 0.6758, + "step": 1923 + }, + { + "epoch": 0.4434201428900668, + "grad_norm": 0.2592242956161499, + "learning_rate": 6.150523159380558e-05, + "loss": 0.6769, + "step": 1924 + }, + { + "epoch": 0.4436506107398018, + "grad_norm": 0.20442263782024384, + "learning_rate": 6.146890126598657e-05, + "loss": 0.6711, + "step": 1925 + }, + { + "epoch": 0.44388107858953674, + "grad_norm": 0.22541704773902893, + "learning_rate": 6.143256454567727e-05, + "loss": 0.674, + "step": 1926 + }, + { + "epoch": 0.4441115464392717, + "grad_norm": 0.20725850760936737, + "learning_rate": 6.139622145313089e-05, + "loss": 0.6663, + "step": 1927 + }, + { + "epoch": 0.44434201428900666, + "grad_norm": 0.21723538637161255, + "learning_rate": 6.13598720086042e-05, + "loss": 0.675, + "step": 1928 + }, + { + "epoch": 0.4445724821387416, + "grad_norm": 0.21730880439281464, + "learning_rate": 6.132351623235753e-05, + "loss": 0.6746, + "step": 1929 + }, + { + "epoch": 0.4448029499884766, + "grad_norm": 0.23787382245063782, + "learning_rate": 6.12871541446547e-05, + "loss": 0.6767, + "step": 1930 + }, + { + "epoch": 0.44503341783821154, + "grad_norm": 0.23728126287460327, + "learning_rate": 6.125078576576306e-05, + "loss": 0.6811, + "step": 1931 + }, + { + "epoch": 0.44526388568794656, + "grad_norm": 0.2103312760591507, + "learning_rate": 6.121441111595347e-05, + "loss": 0.6702, + "step": 1932 + }, + { + "epoch": 0.4454943535376815, + "grad_norm": 0.23246845602989197, + "learning_rate": 6.117803021550028e-05, + "loss": 0.6714, + "step": 1933 + }, + { + "epoch": 0.4457248213874165, + "grad_norm": 0.2163802981376648, + "learning_rate": 6.114164308468136e-05, + "loss": 0.6749, + "step": 1934 + }, + { + "epoch": 0.44595528923715144, + "grad_norm": 0.22772881388664246, + "learning_rate": 6.110524974377802e-05, + "loss": 0.6758, + "step": 1935 + }, + { + "epoch": 0.4461857570868864, + "grad_norm": 0.21084089577198029, + "learning_rate": 6.1068850213075e-05, + "loss": 0.6662, + "step": 1936 + }, + { + "epoch": 0.44641622493662136, + "grad_norm": 0.21491113305091858, + "learning_rate": 6.1032444512860556e-05, + "loss": 0.6724, + "step": 1937 + }, + { + "epoch": 0.4466466927863563, + "grad_norm": 0.2205878645181656, + "learning_rate": 6.099603266342635e-05, + "loss": 0.6817, + "step": 1938 + }, + { + "epoch": 0.4468771606360913, + "grad_norm": 0.22424796223640442, + "learning_rate": 6.0959614685067444e-05, + "loss": 0.6858, + "step": 1939 + }, + { + "epoch": 0.44710762848582625, + "grad_norm": 0.2058129459619522, + "learning_rate": 6.092319059808238e-05, + "loss": 0.6659, + "step": 1940 + }, + { + "epoch": 0.4473380963355612, + "grad_norm": 0.22552870213985443, + "learning_rate": 6.088676042277306e-05, + "loss": 0.6786, + "step": 1941 + }, + { + "epoch": 0.44756856418529617, + "grad_norm": 0.24123747646808624, + "learning_rate": 6.085032417944477e-05, + "loss": 0.6724, + "step": 1942 + }, + { + "epoch": 0.44779903203503113, + "grad_norm": 0.2038024514913559, + "learning_rate": 6.081388188840623e-05, + "loss": 0.6653, + "step": 1943 + }, + { + "epoch": 0.4480294998847661, + "grad_norm": 0.21841315925121307, + "learning_rate": 6.077743356996947e-05, + "loss": 0.6684, + "step": 1944 + }, + { + "epoch": 0.44825996773450105, + "grad_norm": 0.2158724069595337, + "learning_rate": 6.074097924444992e-05, + "loss": 0.6716, + "step": 1945 + }, + { + "epoch": 0.448490435584236, + "grad_norm": 0.2177586555480957, + "learning_rate": 6.0704518932166356e-05, + "loss": 0.6714, + "step": 1946 + }, + { + "epoch": 0.448720903433971, + "grad_norm": 0.21237412095069885, + "learning_rate": 6.066805265344084e-05, + "loss": 0.6753, + "step": 1947 + }, + { + "epoch": 0.44895137128370594, + "grad_norm": 0.22231976687908173, + "learning_rate": 6.0631580428598864e-05, + "loss": 0.6767, + "step": 1948 + }, + { + "epoch": 0.4491818391334409, + "grad_norm": 0.200894296169281, + "learning_rate": 6.059510227796912e-05, + "loss": 0.6752, + "step": 1949 + }, + { + "epoch": 0.44941230698317586, + "grad_norm": 0.22096696496009827, + "learning_rate": 6.0558618221883664e-05, + "loss": 0.6732, + "step": 1950 + }, + { + "epoch": 0.4496427748329108, + "grad_norm": 0.1898728609085083, + "learning_rate": 6.052212828067787e-05, + "loss": 0.6762, + "step": 1951 + }, + { + "epoch": 0.4498732426826458, + "grad_norm": 0.21051624417304993, + "learning_rate": 6.0485632474690304e-05, + "loss": 0.675, + "step": 1952 + }, + { + "epoch": 0.45010371053238074, + "grad_norm": 0.20307575166225433, + "learning_rate": 6.0449130824262864e-05, + "loss": 0.6638, + "step": 1953 + }, + { + "epoch": 0.4503341783821157, + "grad_norm": 0.2037128061056137, + "learning_rate": 6.041262334974072e-05, + "loss": 0.6732, + "step": 1954 + }, + { + "epoch": 0.45056464623185066, + "grad_norm": 0.20359142124652863, + "learning_rate": 6.0376110071472234e-05, + "loss": 0.6662, + "step": 1955 + }, + { + "epoch": 0.4507951140815856, + "grad_norm": 0.19084343314170837, + "learning_rate": 6.033959100980905e-05, + "loss": 0.6774, + "step": 1956 + }, + { + "epoch": 0.4510255819313206, + "grad_norm": 0.2061081826686859, + "learning_rate": 6.0303066185106e-05, + "loss": 0.674, + "step": 1957 + }, + { + "epoch": 0.45125604978105555, + "grad_norm": 0.19711264967918396, + "learning_rate": 6.026653561772118e-05, + "loss": 0.6689, + "step": 1958 + }, + { + "epoch": 0.4514865176307905, + "grad_norm": 0.19926705956459045, + "learning_rate": 6.022999932801579e-05, + "loss": 0.6764, + "step": 1959 + }, + { + "epoch": 0.45171698548052547, + "grad_norm": 0.20394420623779297, + "learning_rate": 6.019345733635433e-05, + "loss": 0.6714, + "step": 1960 + }, + { + "epoch": 0.45194745333026043, + "grad_norm": 0.19170762598514557, + "learning_rate": 6.015690966310441e-05, + "loss": 0.6676, + "step": 1961 + }, + { + "epoch": 0.4521779211799954, + "grad_norm": 0.20363013446331024, + "learning_rate": 6.012035632863683e-05, + "loss": 0.6661, + "step": 1962 + }, + { + "epoch": 0.45240838902973035, + "grad_norm": 0.1984439343214035, + "learning_rate": 6.008379735332556e-05, + "loss": 0.67, + "step": 1963 + }, + { + "epoch": 0.4526388568794653, + "grad_norm": 0.20422999560832977, + "learning_rate": 6.0047232757547654e-05, + "loss": 0.6804, + "step": 1964 + }, + { + "epoch": 0.4528693247292003, + "grad_norm": 0.2078217715024948, + "learning_rate": 6.001066256168337e-05, + "loss": 0.6704, + "step": 1965 + }, + { + "epoch": 0.45309979257893523, + "grad_norm": 0.21460744738578796, + "learning_rate": 5.997408678611606e-05, + "loss": 0.674, + "step": 1966 + }, + { + "epoch": 0.4533302604286702, + "grad_norm": 0.2205062061548233, + "learning_rate": 5.9937505451232165e-05, + "loss": 0.6722, + "step": 1967 + }, + { + "epoch": 0.45356072827840516, + "grad_norm": 0.18799827992916107, + "learning_rate": 5.990091857742126e-05, + "loss": 0.6728, + "step": 1968 + }, + { + "epoch": 0.4537911961281401, + "grad_norm": 0.2258245050907135, + "learning_rate": 5.986432618507598e-05, + "loss": 0.6753, + "step": 1969 + }, + { + "epoch": 0.4540216639778751, + "grad_norm": 0.20360851287841797, + "learning_rate": 5.982772829459204e-05, + "loss": 0.6741, + "step": 1970 + }, + { + "epoch": 0.45425213182761004, + "grad_norm": 0.21974268555641174, + "learning_rate": 5.979112492636824e-05, + "loss": 0.6732, + "step": 1971 + }, + { + "epoch": 0.454482599677345, + "grad_norm": 0.18492695689201355, + "learning_rate": 5.9754516100806423e-05, + "loss": 0.667, + "step": 1972 + }, + { + "epoch": 0.45471306752707996, + "grad_norm": 0.2193833887577057, + "learning_rate": 5.971790183831145e-05, + "loss": 0.6748, + "step": 1973 + }, + { + "epoch": 0.4549435353768149, + "grad_norm": 0.19800156354904175, + "learning_rate": 5.968128215929123e-05, + "loss": 0.6672, + "step": 1974 + }, + { + "epoch": 0.4551740032265499, + "grad_norm": 0.2352752387523651, + "learning_rate": 5.964465708415673e-05, + "loss": 0.6753, + "step": 1975 + }, + { + "epoch": 0.45540447107628484, + "grad_norm": 0.22403676807880402, + "learning_rate": 5.9608026633321846e-05, + "loss": 0.6752, + "step": 1976 + }, + { + "epoch": 0.4556349389260198, + "grad_norm": 0.22172044217586517, + "learning_rate": 5.957139082720353e-05, + "loss": 0.6721, + "step": 1977 + }, + { + "epoch": 0.45586540677575477, + "grad_norm": 0.2300235778093338, + "learning_rate": 5.9534749686221715e-05, + "loss": 0.6779, + "step": 1978 + }, + { + "epoch": 0.45609587462548973, + "grad_norm": 0.2155136615037918, + "learning_rate": 5.9498103230799274e-05, + "loss": 0.6717, + "step": 1979 + }, + { + "epoch": 0.4563263424752247, + "grad_norm": 0.2152310460805893, + "learning_rate": 5.9461451481362054e-05, + "loss": 0.6599, + "step": 1980 + }, + { + "epoch": 0.45655681032495965, + "grad_norm": 0.23655883967876434, + "learning_rate": 5.942479445833887e-05, + "loss": 0.6705, + "step": 1981 + }, + { + "epoch": 0.4567872781746946, + "grad_norm": 0.20639467239379883, + "learning_rate": 5.938813218216149e-05, + "loss": 0.674, + "step": 1982 + }, + { + "epoch": 0.45701774602442957, + "grad_norm": 0.24684162437915802, + "learning_rate": 5.9351464673264556e-05, + "loss": 0.6707, + "step": 1983 + }, + { + "epoch": 0.45724821387416453, + "grad_norm": 0.2441384196281433, + "learning_rate": 5.931479195208566e-05, + "loss": 0.6608, + "step": 1984 + }, + { + "epoch": 0.4574786817238995, + "grad_norm": 0.20324283838272095, + "learning_rate": 5.927811403906531e-05, + "loss": 0.6749, + "step": 1985 + }, + { + "epoch": 0.45770914957363446, + "grad_norm": 0.24871313571929932, + "learning_rate": 5.9241430954646884e-05, + "loss": 0.6715, + "step": 1986 + }, + { + "epoch": 0.4579396174233694, + "grad_norm": 0.20936191082000732, + "learning_rate": 5.9204742719276676e-05, + "loss": 0.6773, + "step": 1987 + }, + { + "epoch": 0.4581700852731044, + "grad_norm": 0.2653404474258423, + "learning_rate": 5.916804935340379e-05, + "loss": 0.6766, + "step": 1988 + }, + { + "epoch": 0.45840055312283934, + "grad_norm": 0.24194635450839996, + "learning_rate": 5.913135087748025e-05, + "loss": 0.6688, + "step": 1989 + }, + { + "epoch": 0.4586310209725743, + "grad_norm": 0.20647594332695007, + "learning_rate": 5.90946473119609e-05, + "loss": 0.675, + "step": 1990 + }, + { + "epoch": 0.45886148882230926, + "grad_norm": 0.26715630292892456, + "learning_rate": 5.9057938677303435e-05, + "loss": 0.6729, + "step": 1991 + }, + { + "epoch": 0.4590919566720443, + "grad_norm": 0.21820828318595886, + "learning_rate": 5.902122499396836e-05, + "loss": 0.6683, + "step": 1992 + }, + { + "epoch": 0.45932242452177924, + "grad_norm": 0.2330540269613266, + "learning_rate": 5.898450628241899e-05, + "loss": 0.6566, + "step": 1993 + }, + { + "epoch": 0.4595528923715142, + "grad_norm": 0.2669338583946228, + "learning_rate": 5.894778256312149e-05, + "loss": 0.6655, + "step": 1994 + }, + { + "epoch": 0.45978336022124916, + "grad_norm": 0.21196076273918152, + "learning_rate": 5.891105385654474e-05, + "loss": 0.6765, + "step": 1995 + }, + { + "epoch": 0.4600138280709841, + "grad_norm": 0.23386694490909576, + "learning_rate": 5.887432018316045e-05, + "loss": 0.6729, + "step": 1996 + }, + { + "epoch": 0.4602442959207191, + "grad_norm": 0.21577976644039154, + "learning_rate": 5.88375815634431e-05, + "loss": 0.6745, + "step": 1997 + }, + { + "epoch": 0.46047476377045404, + "grad_norm": 0.2088993936777115, + "learning_rate": 5.8800838017869886e-05, + "loss": 0.671, + "step": 1998 + }, + { + "epoch": 0.460705231620189, + "grad_norm": 0.2429114431142807, + "learning_rate": 5.876408956692083e-05, + "loss": 0.6655, + "step": 1999 + }, + { + "epoch": 0.46093569946992397, + "grad_norm": 0.19586068391799927, + "learning_rate": 5.87273362310786e-05, + "loss": 0.6832, + "step": 2000 + }, + { + "epoch": 0.4611661673196589, + "grad_norm": 0.22119756042957306, + "learning_rate": 5.86905780308286e-05, + "loss": 0.674, + "step": 2001 + }, + { + "epoch": 0.4613966351693939, + "grad_norm": 0.2294689267873764, + "learning_rate": 5.8653814986659026e-05, + "loss": 0.6654, + "step": 2002 + }, + { + "epoch": 0.46162710301912885, + "grad_norm": 0.20585887134075165, + "learning_rate": 5.861704711906067e-05, + "loss": 0.6715, + "step": 2003 + }, + { + "epoch": 0.4618575708688638, + "grad_norm": 0.22693046927452087, + "learning_rate": 5.8580274448527094e-05, + "loss": 0.6639, + "step": 2004 + }, + { + "epoch": 0.46208803871859877, + "grad_norm": 0.23352816700935364, + "learning_rate": 5.854349699555448e-05, + "loss": 0.674, + "step": 2005 + }, + { + "epoch": 0.46231850656833373, + "grad_norm": 0.21101726591587067, + "learning_rate": 5.850671478064169e-05, + "loss": 0.6763, + "step": 2006 + }, + { + "epoch": 0.4625489744180687, + "grad_norm": 0.21366450190544128, + "learning_rate": 5.846992782429027e-05, + "loss": 0.6712, + "step": 2007 + }, + { + "epoch": 0.46277944226780365, + "grad_norm": 0.20312468707561493, + "learning_rate": 5.843313614700438e-05, + "loss": 0.6683, + "step": 2008 + }, + { + "epoch": 0.4630099101175386, + "grad_norm": 0.21990333497524261, + "learning_rate": 5.8396339769290795e-05, + "loss": 0.6773, + "step": 2009 + }, + { + "epoch": 0.4632403779672736, + "grad_norm": 0.1999209225177765, + "learning_rate": 5.8359538711658976e-05, + "loss": 0.6837, + "step": 2010 + }, + { + "epoch": 0.46347084581700854, + "grad_norm": 0.22440659999847412, + "learning_rate": 5.832273299462092e-05, + "loss": 0.6775, + "step": 2011 + }, + { + "epoch": 0.4637013136667435, + "grad_norm": 0.23072776198387146, + "learning_rate": 5.8285922638691246e-05, + "loss": 0.6676, + "step": 2012 + }, + { + "epoch": 0.46393178151647846, + "grad_norm": 0.2057875543832779, + "learning_rate": 5.824910766438718e-05, + "loss": 0.6673, + "step": 2013 + }, + { + "epoch": 0.4641622493662134, + "grad_norm": 0.23501604795455933, + "learning_rate": 5.8212288092228504e-05, + "loss": 0.6752, + "step": 2014 + }, + { + "epoch": 0.4643927172159484, + "grad_norm": 0.22704358398914337, + "learning_rate": 5.817546394273754e-05, + "loss": 0.6716, + "step": 2015 + }, + { + "epoch": 0.46462318506568334, + "grad_norm": 0.20482905209064484, + "learning_rate": 5.8138635236439207e-05, + "loss": 0.6605, + "step": 2016 + }, + { + "epoch": 0.4648536529154183, + "grad_norm": 0.22710943222045898, + "learning_rate": 5.810180199386096e-05, + "loss": 0.6605, + "step": 2017 + }, + { + "epoch": 0.46508412076515326, + "grad_norm": 0.20111124217510223, + "learning_rate": 5.8064964235532705e-05, + "loss": 0.668, + "step": 2018 + }, + { + "epoch": 0.4653145886148882, + "grad_norm": 0.21100562810897827, + "learning_rate": 5.802812198198699e-05, + "loss": 0.6779, + "step": 2019 + }, + { + "epoch": 0.4655450564646232, + "grad_norm": 0.22643785178661346, + "learning_rate": 5.799127525375876e-05, + "loss": 0.6764, + "step": 2020 + }, + { + "epoch": 0.46577552431435815, + "grad_norm": 0.20172421634197235, + "learning_rate": 5.7954424071385505e-05, + "loss": 0.6709, + "step": 2021 + }, + { + "epoch": 0.4660059921640931, + "grad_norm": 0.20389583706855774, + "learning_rate": 5.791756845540721e-05, + "loss": 0.6713, + "step": 2022 + }, + { + "epoch": 0.46623646001382807, + "grad_norm": 0.1967121958732605, + "learning_rate": 5.788070842636629e-05, + "loss": 0.6654, + "step": 2023 + }, + { + "epoch": 0.46646692786356303, + "grad_norm": 0.20393262803554535, + "learning_rate": 5.784384400480765e-05, + "loss": 0.6693, + "step": 2024 + }, + { + "epoch": 0.466697395713298, + "grad_norm": 0.22986234724521637, + "learning_rate": 5.780697521127862e-05, + "loss": 0.6671, + "step": 2025 + }, + { + "epoch": 0.46692786356303295, + "grad_norm": 0.2337898313999176, + "learning_rate": 5.7770102066329e-05, + "loss": 0.6716, + "step": 2026 + }, + { + "epoch": 0.4671583314127679, + "grad_norm": 0.21430113911628723, + "learning_rate": 5.773322459051098e-05, + "loss": 0.668, + "step": 2027 + }, + { + "epoch": 0.4673887992625029, + "grad_norm": 0.2504282295703888, + "learning_rate": 5.769634280437919e-05, + "loss": 0.6752, + "step": 2028 + }, + { + "epoch": 0.46761926711223784, + "grad_norm": 0.2045115828514099, + "learning_rate": 5.765945672849066e-05, + "loss": 0.6687, + "step": 2029 + }, + { + "epoch": 0.4678497349619728, + "grad_norm": 0.26005640625953674, + "learning_rate": 5.7622566383404774e-05, + "loss": 0.6812, + "step": 2030 + }, + { + "epoch": 0.46808020281170776, + "grad_norm": 0.21131455898284912, + "learning_rate": 5.758567178968336e-05, + "loss": 0.6679, + "step": 2031 + }, + { + "epoch": 0.4683106706614427, + "grad_norm": 0.24453459680080414, + "learning_rate": 5.754877296789056e-05, + "loss": 0.6715, + "step": 2032 + }, + { + "epoch": 0.4685411385111777, + "grad_norm": 0.25293347239494324, + "learning_rate": 5.751186993859287e-05, + "loss": 0.6685, + "step": 2033 + }, + { + "epoch": 0.46877160636091264, + "grad_norm": 0.20162692666053772, + "learning_rate": 5.74749627223592e-05, + "loss": 0.6718, + "step": 2034 + }, + { + "epoch": 0.4690020742106476, + "grad_norm": 0.2305351048707962, + "learning_rate": 5.743805133976071e-05, + "loss": 0.6754, + "step": 2035 + }, + { + "epoch": 0.46923254206038256, + "grad_norm": 0.22010259330272675, + "learning_rate": 5.740113581137094e-05, + "loss": 0.6668, + "step": 2036 + }, + { + "epoch": 0.4694630099101175, + "grad_norm": 0.23104584217071533, + "learning_rate": 5.736421615776573e-05, + "loss": 0.6791, + "step": 2037 + }, + { + "epoch": 0.4696934777598525, + "grad_norm": 0.2572833299636841, + "learning_rate": 5.732729239952316e-05, + "loss": 0.6708, + "step": 2038 + }, + { + "epoch": 0.46992394560958745, + "grad_norm": 0.230802983045578, + "learning_rate": 5.7290364557223685e-05, + "loss": 0.6669, + "step": 2039 + }, + { + "epoch": 0.4701544134593224, + "grad_norm": 0.20792588591575623, + "learning_rate": 5.725343265144999e-05, + "loss": 0.6723, + "step": 2040 + }, + { + "epoch": 0.47038488130905737, + "grad_norm": 0.22476236522197723, + "learning_rate": 5.721649670278704e-05, + "loss": 0.6714, + "step": 2041 + }, + { + "epoch": 0.47061534915879233, + "grad_norm": 0.1980828195810318, + "learning_rate": 5.717955673182202e-05, + "loss": 0.6707, + "step": 2042 + }, + { + "epoch": 0.4708458170085273, + "grad_norm": 0.2145676165819168, + "learning_rate": 5.714261275914442e-05, + "loss": 0.6716, + "step": 2043 + }, + { + "epoch": 0.47107628485826225, + "grad_norm": 0.19914093613624573, + "learning_rate": 5.71056648053459e-05, + "loss": 0.6762, + "step": 2044 + }, + { + "epoch": 0.4713067527079972, + "grad_norm": 0.21874842047691345, + "learning_rate": 5.706871289102036e-05, + "loss": 0.667, + "step": 2045 + }, + { + "epoch": 0.4715372205577322, + "grad_norm": 0.20119917392730713, + "learning_rate": 5.7031757036763934e-05, + "loss": 0.6658, + "step": 2046 + }, + { + "epoch": 0.47176768840746713, + "grad_norm": 0.21794843673706055, + "learning_rate": 5.69947972631749e-05, + "loss": 0.6702, + "step": 2047 + }, + { + "epoch": 0.4719981562572021, + "grad_norm": 0.20612038671970367, + "learning_rate": 5.695783359085377e-05, + "loss": 0.663, + "step": 2048 + }, + { + "epoch": 0.47222862410693706, + "grad_norm": 0.20984764397144318, + "learning_rate": 5.69208660404032e-05, + "loss": 0.6743, + "step": 2049 + }, + { + "epoch": 0.472459091956672, + "grad_norm": 0.22582849860191345, + "learning_rate": 5.6883894632428005e-05, + "loss": 0.6827, + "step": 2050 + }, + { + "epoch": 0.47268955980640703, + "grad_norm": 0.1955862045288086, + "learning_rate": 5.684691938753517e-05, + "loss": 0.6679, + "step": 2051 + }, + { + "epoch": 0.472920027656142, + "grad_norm": 0.22272345423698425, + "learning_rate": 5.680994032633381e-05, + "loss": 0.6657, + "step": 2052 + }, + { + "epoch": 0.47315049550587696, + "grad_norm": 0.208807110786438, + "learning_rate": 5.6772957469435176e-05, + "loss": 0.6673, + "step": 2053 + }, + { + "epoch": 0.4733809633556119, + "grad_norm": 0.21152140200138092, + "learning_rate": 5.67359708374526e-05, + "loss": 0.6739, + "step": 2054 + }, + { + "epoch": 0.4736114312053469, + "grad_norm": 0.22693230211734772, + "learning_rate": 5.669898045100156e-05, + "loss": 0.6648, + "step": 2055 + }, + { + "epoch": 0.47384189905508184, + "grad_norm": 0.22745169699192047, + "learning_rate": 5.6661986330699615e-05, + "loss": 0.6723, + "step": 2056 + }, + { + "epoch": 0.4740723669048168, + "grad_norm": 0.217105895280838, + "learning_rate": 5.662498849716636e-05, + "loss": 0.6751, + "step": 2057 + }, + { + "epoch": 0.47430283475455176, + "grad_norm": 0.2365586906671524, + "learning_rate": 5.6587986971023564e-05, + "loss": 0.6833, + "step": 2058 + }, + { + "epoch": 0.4745333026042867, + "grad_norm": 0.22789350152015686, + "learning_rate": 5.655098177289496e-05, + "loss": 0.673, + "step": 2059 + }, + { + "epoch": 0.4747637704540217, + "grad_norm": 0.20388175547122955, + "learning_rate": 5.651397292340632e-05, + "loss": 0.6667, + "step": 2060 + }, + { + "epoch": 0.47499423830375664, + "grad_norm": 0.22663913667201996, + "learning_rate": 5.6476960443185546e-05, + "loss": 0.676, + "step": 2061 + }, + { + "epoch": 0.4752247061534916, + "grad_norm": 0.19801859557628632, + "learning_rate": 5.6439944352862476e-05, + "loss": 0.6707, + "step": 2062 + }, + { + "epoch": 0.47545517400322657, + "grad_norm": 0.21870480477809906, + "learning_rate": 5.640292467306899e-05, + "loss": 0.6705, + "step": 2063 + }, + { + "epoch": 0.4756856418529615, + "grad_norm": 0.21825294196605682, + "learning_rate": 5.6365901424438985e-05, + "loss": 0.6729, + "step": 2064 + }, + { + "epoch": 0.4759161097026965, + "grad_norm": 0.19710072875022888, + "learning_rate": 5.632887462760831e-05, + "loss": 0.6783, + "step": 2065 + }, + { + "epoch": 0.47614657755243145, + "grad_norm": 0.21672062575817108, + "learning_rate": 5.6291844303214826e-05, + "loss": 0.6733, + "step": 2066 + }, + { + "epoch": 0.4763770454021664, + "grad_norm": 0.1974102258682251, + "learning_rate": 5.625481047189835e-05, + "loss": 0.6797, + "step": 2067 + }, + { + "epoch": 0.47660751325190137, + "grad_norm": 0.220162034034729, + "learning_rate": 5.6217773154300646e-05, + "loss": 0.6668, + "step": 2068 + }, + { + "epoch": 0.47683798110163633, + "grad_norm": 0.20030367374420166, + "learning_rate": 5.618073237106541e-05, + "loss": 0.6704, + "step": 2069 + }, + { + "epoch": 0.4770684489513713, + "grad_norm": 0.1981177181005478, + "learning_rate": 5.614368814283831e-05, + "loss": 0.6609, + "step": 2070 + }, + { + "epoch": 0.47729891680110625, + "grad_norm": 0.19721218943595886, + "learning_rate": 5.6106640490266904e-05, + "loss": 0.665, + "step": 2071 + }, + { + "epoch": 0.4775293846508412, + "grad_norm": 0.20160210132598877, + "learning_rate": 5.606958943400066e-05, + "loss": 0.6687, + "step": 2072 + }, + { + "epoch": 0.4777598525005762, + "grad_norm": 0.20998936891555786, + "learning_rate": 5.6032534994690945e-05, + "loss": 0.6719, + "step": 2073 + }, + { + "epoch": 0.47799032035031114, + "grad_norm": 0.19076962769031525, + "learning_rate": 5.599547719299102e-05, + "loss": 0.6719, + "step": 2074 + }, + { + "epoch": 0.4782207882000461, + "grad_norm": 0.2014273852109909, + "learning_rate": 5.595841604955601e-05, + "loss": 0.6757, + "step": 2075 + }, + { + "epoch": 0.47845125604978106, + "grad_norm": 0.20173531770706177, + "learning_rate": 5.5921351585042915e-05, + "loss": 0.6707, + "step": 2076 + }, + { + "epoch": 0.478681723899516, + "grad_norm": 0.2177407145500183, + "learning_rate": 5.588428382011055e-05, + "loss": 0.6611, + "step": 2077 + }, + { + "epoch": 0.478912191749251, + "grad_norm": 0.22235806286334991, + "learning_rate": 5.584721277541964e-05, + "loss": 0.6692, + "step": 2078 + }, + { + "epoch": 0.47914265959898594, + "grad_norm": 0.19388116896152496, + "learning_rate": 5.581013847163267e-05, + "loss": 0.6679, + "step": 2079 + }, + { + "epoch": 0.4793731274487209, + "grad_norm": 0.23979228734970093, + "learning_rate": 5.577306092941397e-05, + "loss": 0.6729, + "step": 2080 + }, + { + "epoch": 0.47960359529845586, + "grad_norm": 0.22609175741672516, + "learning_rate": 5.573598016942968e-05, + "loss": 0.6638, + "step": 2081 + }, + { + "epoch": 0.4798340631481908, + "grad_norm": 0.20589977502822876, + "learning_rate": 5.569889621234771e-05, + "loss": 0.667, + "step": 2082 + }, + { + "epoch": 0.4800645309979258, + "grad_norm": 0.23106610774993896, + "learning_rate": 5.566180907883777e-05, + "loss": 0.6704, + "step": 2083 + }, + { + "epoch": 0.48029499884766075, + "grad_norm": 0.19989155232906342, + "learning_rate": 5.562471878957135e-05, + "loss": 0.674, + "step": 2084 + }, + { + "epoch": 0.4805254666973957, + "grad_norm": 0.22377397119998932, + "learning_rate": 5.55876253652217e-05, + "loss": 0.6624, + "step": 2085 + }, + { + "epoch": 0.48075593454713067, + "grad_norm": 0.21617180109024048, + "learning_rate": 5.5550528826463754e-05, + "loss": 0.6764, + "step": 2086 + }, + { + "epoch": 0.48098640239686563, + "grad_norm": 0.20691335201263428, + "learning_rate": 5.551342919397429e-05, + "loss": 0.6638, + "step": 2087 + }, + { + "epoch": 0.4812168702466006, + "grad_norm": 0.20599327981472015, + "learning_rate": 5.547632648843172e-05, + "loss": 0.665, + "step": 2088 + }, + { + "epoch": 0.48144733809633555, + "grad_norm": 0.21349526941776276, + "learning_rate": 5.54392207305162e-05, + "loss": 0.6734, + "step": 2089 + }, + { + "epoch": 0.4816778059460705, + "grad_norm": 0.20899753272533417, + "learning_rate": 5.5402111940909595e-05, + "loss": 0.6702, + "step": 2090 + }, + { + "epoch": 0.4819082737958055, + "grad_norm": 0.22185955941677094, + "learning_rate": 5.536500014029547e-05, + "loss": 0.6638, + "step": 2091 + }, + { + "epoch": 0.48213874164554044, + "grad_norm": 0.19926568865776062, + "learning_rate": 5.532788534935902e-05, + "loss": 0.6649, + "step": 2092 + }, + { + "epoch": 0.4823692094952754, + "grad_norm": 0.22930283844470978, + "learning_rate": 5.529076758878718e-05, + "loss": 0.6691, + "step": 2093 + }, + { + "epoch": 0.48259967734501036, + "grad_norm": 0.2192930430173874, + "learning_rate": 5.525364687926846e-05, + "loss": 0.6681, + "step": 2094 + }, + { + "epoch": 0.4828301451947453, + "grad_norm": 0.23378318548202515, + "learning_rate": 5.521652324149307e-05, + "loss": 0.6736, + "step": 2095 + }, + { + "epoch": 0.4830606130444803, + "grad_norm": 0.20761679112911224, + "learning_rate": 5.517939669615284e-05, + "loss": 0.6678, + "step": 2096 + }, + { + "epoch": 0.48329108089421524, + "grad_norm": 0.22278013825416565, + "learning_rate": 5.5142267263941204e-05, + "loss": 0.6649, + "step": 2097 + }, + { + "epoch": 0.4835215487439502, + "grad_norm": 0.2117822915315628, + "learning_rate": 5.510513496555322e-05, + "loss": 0.6664, + "step": 2098 + }, + { + "epoch": 0.48375201659368516, + "grad_norm": 0.20482023060321808, + "learning_rate": 5.506799982168553e-05, + "loss": 0.6709, + "step": 2099 + }, + { + "epoch": 0.4839824844434201, + "grad_norm": 0.2443208247423172, + "learning_rate": 5.50308618530364e-05, + "loss": 0.6694, + "step": 2100 + }, + { + "epoch": 0.4842129522931551, + "grad_norm": 0.1926216185092926, + "learning_rate": 5.4993721080305614e-05, + "loss": 0.6608, + "step": 2101 + }, + { + "epoch": 0.48444342014289005, + "grad_norm": 0.22089843451976776, + "learning_rate": 5.495657752419455e-05, + "loss": 0.6718, + "step": 2102 + }, + { + "epoch": 0.484673887992625, + "grad_norm": 0.22440844774246216, + "learning_rate": 5.491943120540616e-05, + "loss": 0.6648, + "step": 2103 + }, + { + "epoch": 0.48490435584235997, + "grad_norm": 0.19866810739040375, + "learning_rate": 5.488228214464487e-05, + "loss": 0.6596, + "step": 2104 + }, + { + "epoch": 0.48513482369209493, + "grad_norm": 0.22278371453285217, + "learning_rate": 5.484513036261671e-05, + "loss": 0.6685, + "step": 2105 + }, + { + "epoch": 0.4853652915418299, + "grad_norm": 0.21866042912006378, + "learning_rate": 5.480797588002918e-05, + "loss": 0.6683, + "step": 2106 + }, + { + "epoch": 0.48559575939156485, + "grad_norm": 0.20206943154335022, + "learning_rate": 5.47708187175913e-05, + "loss": 0.6705, + "step": 2107 + }, + { + "epoch": 0.4858262272412998, + "grad_norm": 0.2211609184741974, + "learning_rate": 5.4733658896013575e-05, + "loss": 0.6669, + "step": 2108 + }, + { + "epoch": 0.4860566950910348, + "grad_norm": 0.20941609144210815, + "learning_rate": 5.4696496436008e-05, + "loss": 0.6641, + "step": 2109 + }, + { + "epoch": 0.48628716294076973, + "grad_norm": 0.21404802799224854, + "learning_rate": 5.465933135828802e-05, + "loss": 0.6754, + "step": 2110 + }, + { + "epoch": 0.48651763079050475, + "grad_norm": 0.2135351300239563, + "learning_rate": 5.4622163683568584e-05, + "loss": 0.6755, + "step": 2111 + }, + { + "epoch": 0.4867480986402397, + "grad_norm": 0.22787488996982574, + "learning_rate": 5.4584993432566066e-05, + "loss": 0.6608, + "step": 2112 + }, + { + "epoch": 0.4869785664899747, + "grad_norm": 0.22770029306411743, + "learning_rate": 5.4547820625998244e-05, + "loss": 0.6669, + "step": 2113 + }, + { + "epoch": 0.48720903433970963, + "grad_norm": 0.2112642526626587, + "learning_rate": 5.4510645284584364e-05, + "loss": 0.6636, + "step": 2114 + }, + { + "epoch": 0.4874395021894446, + "grad_norm": 0.22324852645397186, + "learning_rate": 5.447346742904508e-05, + "loss": 0.679, + "step": 2115 + }, + { + "epoch": 0.48766997003917956, + "grad_norm": 0.2138347178697586, + "learning_rate": 5.443628708010239e-05, + "loss": 0.6636, + "step": 2116 + }, + { + "epoch": 0.4879004378889145, + "grad_norm": 0.22339309751987457, + "learning_rate": 5.439910425847979e-05, + "loss": 0.6648, + "step": 2117 + }, + { + "epoch": 0.4881309057386495, + "grad_norm": 0.22265255451202393, + "learning_rate": 5.436191898490207e-05, + "loss": 0.6686, + "step": 2118 + }, + { + "epoch": 0.48836137358838444, + "grad_norm": 0.22078566253185272, + "learning_rate": 5.4324731280095374e-05, + "loss": 0.6704, + "step": 2119 + }, + { + "epoch": 0.4885918414381194, + "grad_norm": 0.19747048616409302, + "learning_rate": 5.428754116478729e-05, + "loss": 0.6723, + "step": 2120 + }, + { + "epoch": 0.48882230928785436, + "grad_norm": 0.22434230148792267, + "learning_rate": 5.425034865970666e-05, + "loss": 0.6769, + "step": 2121 + }, + { + "epoch": 0.4890527771375893, + "grad_norm": 0.2001955658197403, + "learning_rate": 5.4213153785583705e-05, + "loss": 0.6748, + "step": 2122 + }, + { + "epoch": 0.4892832449873243, + "grad_norm": 0.2093842476606369, + "learning_rate": 5.417595656314997e-05, + "loss": 0.666, + "step": 2123 + }, + { + "epoch": 0.48951371283705924, + "grad_norm": 0.18802720308303833, + "learning_rate": 5.413875701313825e-05, + "loss": 0.6667, + "step": 2124 + }, + { + "epoch": 0.4897441806867942, + "grad_norm": 0.2099105566740036, + "learning_rate": 5.410155515628272e-05, + "loss": 0.6793, + "step": 2125 + }, + { + "epoch": 0.48997464853652917, + "grad_norm": 0.1877232789993286, + "learning_rate": 5.406435101331879e-05, + "loss": 0.6583, + "step": 2126 + }, + { + "epoch": 0.49020511638626413, + "grad_norm": 0.21019746363162994, + "learning_rate": 5.402714460498318e-05, + "loss": 0.6643, + "step": 2127 + }, + { + "epoch": 0.4904355842359991, + "grad_norm": 0.19312387704849243, + "learning_rate": 5.39899359520138e-05, + "loss": 0.666, + "step": 2128 + }, + { + "epoch": 0.49066605208573405, + "grad_norm": 0.19418299198150635, + "learning_rate": 5.39527250751499e-05, + "loss": 0.6663, + "step": 2129 + }, + { + "epoch": 0.490896519935469, + "grad_norm": 0.19869892299175262, + "learning_rate": 5.391551199513192e-05, + "loss": 0.67, + "step": 2130 + }, + { + "epoch": 0.491126987785204, + "grad_norm": 0.19379819929599762, + "learning_rate": 5.3878296732701515e-05, + "loss": 0.6661, + "step": 2131 + }, + { + "epoch": 0.49135745563493893, + "grad_norm": 0.20687328279018402, + "learning_rate": 5.384107930860162e-05, + "loss": 0.6815, + "step": 2132 + }, + { + "epoch": 0.4915879234846739, + "grad_norm": 0.20180322229862213, + "learning_rate": 5.38038597435763e-05, + "loss": 0.6631, + "step": 2133 + }, + { + "epoch": 0.49181839133440886, + "grad_norm": 0.19170688092708588, + "learning_rate": 5.3766638058370855e-05, + "loss": 0.6794, + "step": 2134 + }, + { + "epoch": 0.4920488591841438, + "grad_norm": 0.1976936012506485, + "learning_rate": 5.372941427373178e-05, + "loss": 0.6709, + "step": 2135 + }, + { + "epoch": 0.4922793270338788, + "grad_norm": 0.1848510205745697, + "learning_rate": 5.3692188410406695e-05, + "loss": 0.6664, + "step": 2136 + }, + { + "epoch": 0.49250979488361374, + "grad_norm": 0.19432882964611053, + "learning_rate": 5.36549604891444e-05, + "loss": 0.6613, + "step": 2137 + }, + { + "epoch": 0.4927402627333487, + "grad_norm": 0.19802847504615784, + "learning_rate": 5.361773053069487e-05, + "loss": 0.663, + "step": 2138 + }, + { + "epoch": 0.49297073058308366, + "grad_norm": 0.19331657886505127, + "learning_rate": 5.3580498555809163e-05, + "loss": 0.6743, + "step": 2139 + }, + { + "epoch": 0.4932011984328186, + "grad_norm": 0.20699891448020935, + "learning_rate": 5.354326458523952e-05, + "loss": 0.6685, + "step": 2140 + }, + { + "epoch": 0.4934316662825536, + "grad_norm": 0.17387264966964722, + "learning_rate": 5.350602863973923e-05, + "loss": 0.6707, + "step": 2141 + }, + { + "epoch": 0.49366213413228854, + "grad_norm": 0.205754354596138, + "learning_rate": 5.346879074006271e-05, + "loss": 0.6634, + "step": 2142 + }, + { + "epoch": 0.4938926019820235, + "grad_norm": 0.19446855783462524, + "learning_rate": 5.343155090696551e-05, + "loss": 0.6784, + "step": 2143 + }, + { + "epoch": 0.49412306983175847, + "grad_norm": 0.20317377150058746, + "learning_rate": 5.33943091612042e-05, + "loss": 0.6599, + "step": 2144 + }, + { + "epoch": 0.4943535376814934, + "grad_norm": 0.1977040022611618, + "learning_rate": 5.335706552353643e-05, + "loss": 0.6631, + "step": 2145 + }, + { + "epoch": 0.4945840055312284, + "grad_norm": 0.19229522347450256, + "learning_rate": 5.331982001472091e-05, + "loss": 0.6602, + "step": 2146 + }, + { + "epoch": 0.49481447338096335, + "grad_norm": 0.20331966876983643, + "learning_rate": 5.3282572655517416e-05, + "loss": 0.6738, + "step": 2147 + }, + { + "epoch": 0.4950449412306983, + "grad_norm": 0.20568066835403442, + "learning_rate": 5.324532346668668e-05, + "loss": 0.6712, + "step": 2148 + }, + { + "epoch": 0.49527540908043327, + "grad_norm": 0.20340462028980255, + "learning_rate": 5.3208072468990555e-05, + "loss": 0.6651, + "step": 2149 + }, + { + "epoch": 0.49550587693016823, + "grad_norm": 0.2173525094985962, + "learning_rate": 5.317081968319185e-05, + "loss": 0.6651, + "step": 2150 + }, + { + "epoch": 0.4957363447799032, + "grad_norm": 0.19157260656356812, + "learning_rate": 5.313356513005433e-05, + "loss": 0.6687, + "step": 2151 + }, + { + "epoch": 0.49596681262963815, + "grad_norm": 0.20568935573101044, + "learning_rate": 5.3096308830342844e-05, + "loss": 0.6671, + "step": 2152 + }, + { + "epoch": 0.4961972804793731, + "grad_norm": 0.19956238567829132, + "learning_rate": 5.305905080482312e-05, + "loss": 0.6649, + "step": 2153 + }, + { + "epoch": 0.4964277483291081, + "grad_norm": 0.19893619418144226, + "learning_rate": 5.302179107426191e-05, + "loss": 0.6692, + "step": 2154 + }, + { + "epoch": 0.49665821617884304, + "grad_norm": 0.20772667229175568, + "learning_rate": 5.298452965942687e-05, + "loss": 0.6686, + "step": 2155 + }, + { + "epoch": 0.496888684028578, + "grad_norm": 0.1938534379005432, + "learning_rate": 5.294726658108665e-05, + "loss": 0.6659, + "step": 2156 + }, + { + "epoch": 0.49711915187831296, + "grad_norm": 0.19728915393352509, + "learning_rate": 5.291000186001076e-05, + "loss": 0.6745, + "step": 2157 + }, + { + "epoch": 0.4973496197280479, + "grad_norm": 0.19719786942005157, + "learning_rate": 5.2872735516969695e-05, + "loss": 0.6713, + "step": 2158 + }, + { + "epoch": 0.4975800875777829, + "grad_norm": 0.18405082821846008, + "learning_rate": 5.28354675727348e-05, + "loss": 0.6542, + "step": 2159 + }, + { + "epoch": 0.49781055542751784, + "grad_norm": 0.2271628975868225, + "learning_rate": 5.279819804807834e-05, + "loss": 0.6674, + "step": 2160 + }, + { + "epoch": 0.4980410232772528, + "grad_norm": 0.18239864706993103, + "learning_rate": 5.2760926963773436e-05, + "loss": 0.6755, + "step": 2161 + }, + { + "epoch": 0.49827149112698776, + "grad_norm": 0.19707393646240234, + "learning_rate": 5.272365434059413e-05, + "loss": 0.6676, + "step": 2162 + }, + { + "epoch": 0.4985019589767227, + "grad_norm": 0.21636438369750977, + "learning_rate": 5.2686380199315244e-05, + "loss": 0.6743, + "step": 2163 + }, + { + "epoch": 0.4987324268264577, + "grad_norm": 0.2144385576248169, + "learning_rate": 5.2649104560712536e-05, + "loss": 0.6648, + "step": 2164 + }, + { + "epoch": 0.49896289467619265, + "grad_norm": 0.21302272379398346, + "learning_rate": 5.261182744556252e-05, + "loss": 0.6673, + "step": 2165 + }, + { + "epoch": 0.4991933625259276, + "grad_norm": 0.22367598116397858, + "learning_rate": 5.257454887464258e-05, + "loss": 0.6754, + "step": 2166 + }, + { + "epoch": 0.49942383037566257, + "grad_norm": 0.1969953179359436, + "learning_rate": 5.253726886873089e-05, + "loss": 0.6759, + "step": 2167 + }, + { + "epoch": 0.49965429822539753, + "grad_norm": 0.22727370262145996, + "learning_rate": 5.2499987448606436e-05, + "loss": 0.6622, + "step": 2168 + }, + { + "epoch": 0.4998847660751325, + "grad_norm": 0.19945083558559418, + "learning_rate": 5.246270463504898e-05, + "loss": 0.6689, + "step": 2169 + }, + { + "epoch": 0.5001152339248675, + "grad_norm": 0.19829121232032776, + "learning_rate": 5.2425420448839055e-05, + "loss": 0.6707, + "step": 2170 + }, + { + "epoch": 0.5003457017746025, + "grad_norm": 0.19157467782497406, + "learning_rate": 5.2388134910758015e-05, + "loss": 0.6674, + "step": 2171 + }, + { + "epoch": 0.5005761696243374, + "grad_norm": 0.19981160759925842, + "learning_rate": 5.235084804158787e-05, + "loss": 0.6703, + "step": 2172 + }, + { + "epoch": 0.5008066374740724, + "grad_norm": 0.2080090194940567, + "learning_rate": 5.231355986211146e-05, + "loss": 0.6659, + "step": 2173 + }, + { + "epoch": 0.5010371053238073, + "grad_norm": 0.19367657601833344, + "learning_rate": 5.2276270393112325e-05, + "loss": 0.6576, + "step": 2174 + }, + { + "epoch": 0.5012675731735423, + "grad_norm": 0.21728312969207764, + "learning_rate": 5.223897965537469e-05, + "loss": 0.6723, + "step": 2175 + }, + { + "epoch": 0.5014980410232772, + "grad_norm": 0.19311760365962982, + "learning_rate": 5.220168766968355e-05, + "loss": 0.6622, + "step": 2176 + }, + { + "epoch": 0.5017285088730122, + "grad_norm": 0.20615801215171814, + "learning_rate": 5.216439445682455e-05, + "loss": 0.6652, + "step": 2177 + }, + { + "epoch": 0.5019589767227471, + "grad_norm": 0.1940842866897583, + "learning_rate": 5.212710003758401e-05, + "loss": 0.667, + "step": 2178 + }, + { + "epoch": 0.5021894445724822, + "grad_norm": 0.19929523766040802, + "learning_rate": 5.208980443274899e-05, + "loss": 0.6649, + "step": 2179 + }, + { + "epoch": 0.5024199124222171, + "grad_norm": 0.181834876537323, + "learning_rate": 5.205250766310712e-05, + "loss": 0.6654, + "step": 2180 + }, + { + "epoch": 0.5026503802719521, + "grad_norm": 0.20930717885494232, + "learning_rate": 5.201520974944675e-05, + "loss": 0.6627, + "step": 2181 + }, + { + "epoch": 0.502880848121687, + "grad_norm": 0.18552522361278534, + "learning_rate": 5.1977910712556834e-05, + "loss": 0.6738, + "step": 2182 + }, + { + "epoch": 0.503111315971422, + "grad_norm": 0.21850310266017914, + "learning_rate": 5.1940610573226955e-05, + "loss": 0.6776, + "step": 2183 + }, + { + "epoch": 0.5033417838211569, + "grad_norm": 0.18333658576011658, + "learning_rate": 5.190330935224732e-05, + "loss": 0.6657, + "step": 2184 + }, + { + "epoch": 0.5035722516708919, + "grad_norm": 0.21493695676326752, + "learning_rate": 5.186600707040874e-05, + "loss": 0.6724, + "step": 2185 + }, + { + "epoch": 0.5038027195206268, + "grad_norm": 0.19420945644378662, + "learning_rate": 5.1828703748502614e-05, + "loss": 0.6571, + "step": 2186 + }, + { + "epoch": 0.5040331873703618, + "grad_norm": 0.19441378116607666, + "learning_rate": 5.179139940732091e-05, + "loss": 0.6548, + "step": 2187 + }, + { + "epoch": 0.5042636552200968, + "grad_norm": 0.201436385512352, + "learning_rate": 5.1754094067656164e-05, + "loss": 0.6734, + "step": 2188 + }, + { + "epoch": 0.5044941230698318, + "grad_norm": 0.222244992852211, + "learning_rate": 5.17167877503015e-05, + "loss": 0.67, + "step": 2189 + }, + { + "epoch": 0.5047245909195667, + "grad_norm": 0.20612792670726776, + "learning_rate": 5.1679480476050525e-05, + "loss": 0.6591, + "step": 2190 + }, + { + "epoch": 0.5049550587693017, + "grad_norm": 0.20908065140247345, + "learning_rate": 5.164217226569747e-05, + "loss": 0.6708, + "step": 2191 + }, + { + "epoch": 0.5051855266190366, + "grad_norm": 0.1922689974308014, + "learning_rate": 5.1604863140037e-05, + "loss": 0.6586, + "step": 2192 + }, + { + "epoch": 0.5054159944687716, + "grad_norm": 0.22337596118450165, + "learning_rate": 5.156755311986433e-05, + "loss": 0.6675, + "step": 2193 + }, + { + "epoch": 0.5056464623185065, + "grad_norm": 0.20729419589042664, + "learning_rate": 5.153024222597519e-05, + "loss": 0.6646, + "step": 2194 + }, + { + "epoch": 0.5058769301682415, + "grad_norm": 0.20641185343265533, + "learning_rate": 5.149293047916576e-05, + "loss": 0.6679, + "step": 2195 + }, + { + "epoch": 0.5061073980179764, + "grad_norm": 0.20790040493011475, + "learning_rate": 5.14556179002327e-05, + "loss": 0.6635, + "step": 2196 + }, + { + "epoch": 0.5063378658677115, + "grad_norm": 0.19893454015254974, + "learning_rate": 5.141830450997316e-05, + "loss": 0.6687, + "step": 2197 + }, + { + "epoch": 0.5065683337174464, + "grad_norm": 0.2278234213590622, + "learning_rate": 5.138099032918475e-05, + "loss": 0.6625, + "step": 2198 + }, + { + "epoch": 0.5067988015671814, + "grad_norm": 0.20414131879806519, + "learning_rate": 5.1343675378665455e-05, + "loss": 0.6677, + "step": 2199 + }, + { + "epoch": 0.5070292694169164, + "grad_norm": 0.20839910209178925, + "learning_rate": 5.130635967921377e-05, + "loss": 0.6679, + "step": 2200 + }, + { + "epoch": 0.5072597372666513, + "grad_norm": 0.1961502730846405, + "learning_rate": 5.1269043251628556e-05, + "loss": 0.6703, + "step": 2201 + }, + { + "epoch": 0.5074902051163863, + "grad_norm": 0.19088955223560333, + "learning_rate": 5.123172611670907e-05, + "loss": 0.6757, + "step": 2202 + }, + { + "epoch": 0.5077206729661212, + "grad_norm": 0.20697516202926636, + "learning_rate": 5.119440829525504e-05, + "loss": 0.6694, + "step": 2203 + }, + { + "epoch": 0.5079511408158562, + "grad_norm": 0.20802126824855804, + "learning_rate": 5.115708980806647e-05, + "loss": 0.6742, + "step": 2204 + }, + { + "epoch": 0.5081816086655911, + "grad_norm": 0.19614768028259277, + "learning_rate": 5.111977067594382e-05, + "loss": 0.6664, + "step": 2205 + }, + { + "epoch": 0.5084120765153262, + "grad_norm": 0.1986846625804901, + "learning_rate": 5.1082450919687884e-05, + "loss": 0.6523, + "step": 2206 + }, + { + "epoch": 0.5086425443650611, + "grad_norm": 0.20315955579280853, + "learning_rate": 5.1045130560099776e-05, + "loss": 0.6673, + "step": 2207 + }, + { + "epoch": 0.5088730122147961, + "grad_norm": 0.20175747573375702, + "learning_rate": 5.100780961798098e-05, + "loss": 0.6709, + "step": 2208 + }, + { + "epoch": 0.509103480064531, + "grad_norm": 0.19836615025997162, + "learning_rate": 5.097048811413331e-05, + "loss": 0.6712, + "step": 2209 + }, + { + "epoch": 0.509333947914266, + "grad_norm": 0.2123645395040512, + "learning_rate": 5.093316606935883e-05, + "loss": 0.662, + "step": 2210 + }, + { + "epoch": 0.5095644157640009, + "grad_norm": 0.19712963700294495, + "learning_rate": 5.0895843504460005e-05, + "loss": 0.6771, + "step": 2211 + }, + { + "epoch": 0.5097948836137359, + "grad_norm": 0.19324639439582825, + "learning_rate": 5.08585204402395e-05, + "loss": 0.6606, + "step": 2212 + }, + { + "epoch": 0.5100253514634708, + "grad_norm": 0.19227439165115356, + "learning_rate": 5.08211968975003e-05, + "loss": 0.6669, + "step": 2213 + }, + { + "epoch": 0.5102558193132058, + "grad_norm": 0.21038465201854706, + "learning_rate": 5.078387289704568e-05, + "loss": 0.6682, + "step": 2214 + }, + { + "epoch": 0.5104862871629408, + "grad_norm": 0.1920362412929535, + "learning_rate": 5.074654845967911e-05, + "loss": 0.6558, + "step": 2215 + }, + { + "epoch": 0.5107167550126758, + "grad_norm": 0.20892934501171112, + "learning_rate": 5.0709223606204345e-05, + "loss": 0.6653, + "step": 2216 + }, + { + "epoch": 0.5109472228624107, + "grad_norm": 0.1770077347755432, + "learning_rate": 5.0671898357425366e-05, + "loss": 0.6642, + "step": 2217 + }, + { + "epoch": 0.5111776907121457, + "grad_norm": 0.1956147402524948, + "learning_rate": 5.063457273414638e-05, + "loss": 0.6676, + "step": 2218 + }, + { + "epoch": 0.5114081585618806, + "grad_norm": 0.1727982610464096, + "learning_rate": 5.059724675717177e-05, + "loss": 0.6607, + "step": 2219 + }, + { + "epoch": 0.5116386264116156, + "grad_norm": 0.18610809743404388, + "learning_rate": 5.055992044730615e-05, + "loss": 0.6625, + "step": 2220 + }, + { + "epoch": 0.5118690942613505, + "grad_norm": 0.1783786416053772, + "learning_rate": 5.0522593825354336e-05, + "loss": 0.6589, + "step": 2221 + }, + { + "epoch": 0.5120995621110855, + "grad_norm": 0.18354859948158264, + "learning_rate": 5.048526691212123e-05, + "loss": 0.6544, + "step": 2222 + }, + { + "epoch": 0.5123300299608204, + "grad_norm": 0.1823883056640625, + "learning_rate": 5.044793972841203e-05, + "loss": 0.6713, + "step": 2223 + }, + { + "epoch": 0.5125604978105555, + "grad_norm": 0.19022877514362335, + "learning_rate": 5.041061229503196e-05, + "loss": 0.6669, + "step": 2224 + }, + { + "epoch": 0.5127909656602904, + "grad_norm": 0.19528958201408386, + "learning_rate": 5.037328463278646e-05, + "loss": 0.6589, + "step": 2225 + }, + { + "epoch": 0.5130214335100254, + "grad_norm": 0.1989632546901703, + "learning_rate": 5.033595676248106e-05, + "loss": 0.6609, + "step": 2226 + }, + { + "epoch": 0.5132519013597603, + "grad_norm": 0.17961591482162476, + "learning_rate": 5.029862870492142e-05, + "loss": 0.6652, + "step": 2227 + }, + { + "epoch": 0.5134823692094953, + "grad_norm": 0.18870680034160614, + "learning_rate": 5.026130048091331e-05, + "loss": 0.6658, + "step": 2228 + }, + { + "epoch": 0.5137128370592302, + "grad_norm": 0.2052677571773529, + "learning_rate": 5.0223972111262584e-05, + "loss": 0.6663, + "step": 2229 + }, + { + "epoch": 0.5139433049089652, + "grad_norm": 0.23451951146125793, + "learning_rate": 5.018664361677519e-05, + "loss": 0.6642, + "step": 2230 + }, + { + "epoch": 0.5141737727587001, + "grad_norm": 0.18537165224552155, + "learning_rate": 5.01493150182571e-05, + "loss": 0.6698, + "step": 2231 + }, + { + "epoch": 0.5144042406084351, + "grad_norm": 0.21171410381793976, + "learning_rate": 5.011198633651442e-05, + "loss": 0.6687, + "step": 2232 + }, + { + "epoch": 0.51463470845817, + "grad_norm": 0.22823838889598846, + "learning_rate": 5.0074657592353246e-05, + "loss": 0.6657, + "step": 2233 + }, + { + "epoch": 0.5148651763079051, + "grad_norm": 0.1847493201494217, + "learning_rate": 5.003732880657971e-05, + "loss": 0.6638, + "step": 2234 + }, + { + "epoch": 0.51509564415764, + "grad_norm": 0.2114555537700653, + "learning_rate": 5e-05, + "loss": 0.668, + "step": 2235 + }, + { + "epoch": 0.515326112007375, + "grad_norm": 0.17518627643585205, + "learning_rate": 4.996267119342029e-05, + "loss": 0.6551, + "step": 2236 + }, + { + "epoch": 0.5155565798571099, + "grad_norm": 0.23371440172195435, + "learning_rate": 4.992534240764677e-05, + "loss": 0.6697, + "step": 2237 + }, + { + "epoch": 0.5157870477068449, + "grad_norm": 0.1804964542388916, + "learning_rate": 4.9888013663485586e-05, + "loss": 0.6622, + "step": 2238 + }, + { + "epoch": 0.5160175155565798, + "grad_norm": 0.22412839531898499, + "learning_rate": 4.98506849817429e-05, + "loss": 0.6576, + "step": 2239 + }, + { + "epoch": 0.5162479834063148, + "grad_norm": 0.2174907922744751, + "learning_rate": 4.981335638322484e-05, + "loss": 0.6597, + "step": 2240 + }, + { + "epoch": 0.5164784512560497, + "grad_norm": 0.20137697458267212, + "learning_rate": 4.9776027888737434e-05, + "loss": 0.6747, + "step": 2241 + }, + { + "epoch": 0.5167089191057848, + "grad_norm": 0.2253003716468811, + "learning_rate": 4.973869951908669e-05, + "loss": 0.6686, + "step": 2242 + }, + { + "epoch": 0.5169393869555197, + "grad_norm": 0.2619769871234894, + "learning_rate": 4.9701371295078603e-05, + "loss": 0.6616, + "step": 2243 + }, + { + "epoch": 0.5171698548052547, + "grad_norm": 0.26056036353111267, + "learning_rate": 4.966404323751896e-05, + "loss": 0.6653, + "step": 2244 + }, + { + "epoch": 0.5174003226549896, + "grad_norm": 0.21332813799381256, + "learning_rate": 4.962671536721355e-05, + "loss": 0.6609, + "step": 2245 + }, + { + "epoch": 0.5176307905047246, + "grad_norm": 0.25862419605255127, + "learning_rate": 4.9589387704968054e-05, + "loss": 0.6685, + "step": 2246 + }, + { + "epoch": 0.5178612583544595, + "grad_norm": 0.24932904541492462, + "learning_rate": 4.955206027158798e-05, + "loss": 0.6642, + "step": 2247 + }, + { + "epoch": 0.5180917262041945, + "grad_norm": 0.21036414802074432, + "learning_rate": 4.951473308787876e-05, + "loss": 0.6636, + "step": 2248 + }, + { + "epoch": 0.5183221940539294, + "grad_norm": 0.27057451009750366, + "learning_rate": 4.947740617464568e-05, + "loss": 0.6603, + "step": 2249 + }, + { + "epoch": 0.5185526619036644, + "grad_norm": 0.2021026611328125, + "learning_rate": 4.9440079552693854e-05, + "loss": 0.6779, + "step": 2250 + }, + { + "epoch": 0.5187831297533994, + "grad_norm": 0.24280990660190582, + "learning_rate": 4.940275324282824e-05, + "loss": 0.6528, + "step": 2251 + }, + { + "epoch": 0.5190135976031344, + "grad_norm": 0.21159270405769348, + "learning_rate": 4.9365427265853644e-05, + "loss": 0.6706, + "step": 2252 + }, + { + "epoch": 0.5192440654528693, + "grad_norm": 0.2115175724029541, + "learning_rate": 4.9328101642574646e-05, + "loss": 0.6645, + "step": 2253 + }, + { + "epoch": 0.5194745333026043, + "grad_norm": 0.21375073492527008, + "learning_rate": 4.929077639379566e-05, + "loss": 0.6633, + "step": 2254 + }, + { + "epoch": 0.5197050011523392, + "grad_norm": 0.2012326568365097, + "learning_rate": 4.925345154032092e-05, + "loss": 0.658, + "step": 2255 + }, + { + "epoch": 0.5199354690020742, + "grad_norm": 0.2035793513059616, + "learning_rate": 4.921612710295433e-05, + "loss": 0.6658, + "step": 2256 + }, + { + "epoch": 0.5201659368518091, + "grad_norm": 0.2022104114294052, + "learning_rate": 4.91788031024997e-05, + "loss": 0.664, + "step": 2257 + }, + { + "epoch": 0.5203964047015441, + "grad_norm": 0.2398633062839508, + "learning_rate": 4.9141479559760517e-05, + "loss": 0.6608, + "step": 2258 + }, + { + "epoch": 0.5206268725512792, + "grad_norm": 0.1719241738319397, + "learning_rate": 4.910415649554001e-05, + "loss": 0.663, + "step": 2259 + }, + { + "epoch": 0.5208573404010141, + "grad_norm": 0.22672203183174133, + "learning_rate": 4.906683393064117e-05, + "loss": 0.6632, + "step": 2260 + }, + { + "epoch": 0.5210878082507491, + "grad_norm": 0.19581197202205658, + "learning_rate": 4.9029511885866717e-05, + "loss": 0.6602, + "step": 2261 + }, + { + "epoch": 0.521318276100484, + "grad_norm": 0.22291046380996704, + "learning_rate": 4.899219038201903e-05, + "loss": 0.6525, + "step": 2262 + }, + { + "epoch": 0.521548743950219, + "grad_norm": 0.19381098449230194, + "learning_rate": 4.895486943990023e-05, + "loss": 0.6722, + "step": 2263 + }, + { + "epoch": 0.5217792117999539, + "grad_norm": 0.234474778175354, + "learning_rate": 4.891754908031213e-05, + "loss": 0.6639, + "step": 2264 + }, + { + "epoch": 0.5220096796496889, + "grad_norm": 0.21784402430057526, + "learning_rate": 4.8880229324056184e-05, + "loss": 0.6639, + "step": 2265 + }, + { + "epoch": 0.5222401474994238, + "grad_norm": 0.20804181694984436, + "learning_rate": 4.8842910191933526e-05, + "loss": 0.6595, + "step": 2266 + }, + { + "epoch": 0.5224706153491588, + "grad_norm": 0.2230507731437683, + "learning_rate": 4.880559170474499e-05, + "loss": 0.6716, + "step": 2267 + }, + { + "epoch": 0.5227010831988937, + "grad_norm": 0.21720674633979797, + "learning_rate": 4.876827388329094e-05, + "loss": 0.6584, + "step": 2268 + }, + { + "epoch": 0.5229315510486288, + "grad_norm": 0.23036789894104004, + "learning_rate": 4.873095674837146e-05, + "loss": 0.6654, + "step": 2269 + }, + { + "epoch": 0.5231620188983637, + "grad_norm": 0.21202722191810608, + "learning_rate": 4.869364032078625e-05, + "loss": 0.6704, + "step": 2270 + }, + { + "epoch": 0.5233924867480987, + "grad_norm": 0.22500722110271454, + "learning_rate": 4.8656324621334557e-05, + "loss": 0.6703, + "step": 2271 + }, + { + "epoch": 0.5236229545978336, + "grad_norm": 0.20429456233978271, + "learning_rate": 4.8619009670815265e-05, + "loss": 0.6656, + "step": 2272 + }, + { + "epoch": 0.5238534224475686, + "grad_norm": 0.23129715025424957, + "learning_rate": 4.8581695490026845e-05, + "loss": 0.6689, + "step": 2273 + }, + { + "epoch": 0.5240838902973035, + "grad_norm": 0.21389861404895782, + "learning_rate": 4.854438209976731e-05, + "loss": 0.6718, + "step": 2274 + }, + { + "epoch": 0.5243143581470385, + "grad_norm": 0.20860935747623444, + "learning_rate": 4.850706952083426e-05, + "loss": 0.6725, + "step": 2275 + }, + { + "epoch": 0.5245448259967734, + "grad_norm": 0.20308835804462433, + "learning_rate": 4.846975777402483e-05, + "loss": 0.6665, + "step": 2276 + }, + { + "epoch": 0.5247752938465085, + "grad_norm": 0.2039281278848648, + "learning_rate": 4.843244688013568e-05, + "loss": 0.671, + "step": 2277 + }, + { + "epoch": 0.5250057616962434, + "grad_norm": 0.21113687753677368, + "learning_rate": 4.839513685996301e-05, + "loss": 0.6605, + "step": 2278 + }, + { + "epoch": 0.5252362295459784, + "grad_norm": 0.1845845878124237, + "learning_rate": 4.835782773430255e-05, + "loss": 0.656, + "step": 2279 + }, + { + "epoch": 0.5254666973957133, + "grad_norm": 0.22030138969421387, + "learning_rate": 4.832051952394948e-05, + "loss": 0.672, + "step": 2280 + }, + { + "epoch": 0.5256971652454483, + "grad_norm": 0.19281761348247528, + "learning_rate": 4.8283212249698515e-05, + "loss": 0.6572, + "step": 2281 + }, + { + "epoch": 0.5259276330951832, + "grad_norm": 0.21558502316474915, + "learning_rate": 4.824590593234386e-05, + "loss": 0.6674, + "step": 2282 + }, + { + "epoch": 0.5261581009449182, + "grad_norm": 0.1933777928352356, + "learning_rate": 4.82086005926791e-05, + "loss": 0.6634, + "step": 2283 + }, + { + "epoch": 0.5263885687946531, + "grad_norm": 0.19798806309700012, + "learning_rate": 4.81712962514974e-05, + "loss": 0.6658, + "step": 2284 + }, + { + "epoch": 0.5266190366443881, + "grad_norm": 0.21283666789531708, + "learning_rate": 4.8133992929591265e-05, + "loss": 0.6731, + "step": 2285 + }, + { + "epoch": 0.526849504494123, + "grad_norm": 0.2087125927209854, + "learning_rate": 4.809669064775269e-05, + "loss": 0.6658, + "step": 2286 + }, + { + "epoch": 0.5270799723438581, + "grad_norm": 0.21951022744178772, + "learning_rate": 4.805938942677306e-05, + "loss": 0.6581, + "step": 2287 + }, + { + "epoch": 0.527310440193593, + "grad_norm": 0.19603340327739716, + "learning_rate": 4.802208928744319e-05, + "loss": 0.6689, + "step": 2288 + }, + { + "epoch": 0.527540908043328, + "grad_norm": 0.20134252309799194, + "learning_rate": 4.798479025055327e-05, + "loss": 0.6639, + "step": 2289 + }, + { + "epoch": 0.5277713758930629, + "grad_norm": 0.20186404883861542, + "learning_rate": 4.7947492336892894e-05, + "loss": 0.668, + "step": 2290 + }, + { + "epoch": 0.5280018437427979, + "grad_norm": 0.1830950379371643, + "learning_rate": 4.791019556725104e-05, + "loss": 0.6719, + "step": 2291 + }, + { + "epoch": 0.5282323115925328, + "grad_norm": 0.20880278944969177, + "learning_rate": 4.7872899962416e-05, + "loss": 0.6679, + "step": 2292 + }, + { + "epoch": 0.5284627794422678, + "grad_norm": 0.19241443276405334, + "learning_rate": 4.783560554317546e-05, + "loss": 0.6723, + "step": 2293 + }, + { + "epoch": 0.5286932472920027, + "grad_norm": 0.2183217704296112, + "learning_rate": 4.779831233031647e-05, + "loss": 0.6537, + "step": 2294 + }, + { + "epoch": 0.5289237151417377, + "grad_norm": 0.20928408205509186, + "learning_rate": 4.776102034462532e-05, + "loss": 0.6647, + "step": 2295 + }, + { + "epoch": 0.5291541829914727, + "grad_norm": 0.18923519551753998, + "learning_rate": 4.772372960688768e-05, + "loss": 0.6654, + "step": 2296 + }, + { + "epoch": 0.5293846508412077, + "grad_norm": 0.20825320482254028, + "learning_rate": 4.7686440137888555e-05, + "loss": 0.664, + "step": 2297 + }, + { + "epoch": 0.5296151186909426, + "grad_norm": 0.21106471121311188, + "learning_rate": 4.764915195841214e-05, + "loss": 0.6672, + "step": 2298 + }, + { + "epoch": 0.5298455865406776, + "grad_norm": 0.2134147584438324, + "learning_rate": 4.7611865089242004e-05, + "loss": 0.6697, + "step": 2299 + }, + { + "epoch": 0.5300760543904125, + "grad_norm": 0.19978366792201996, + "learning_rate": 4.757457955116095e-05, + "loss": 0.66, + "step": 2300 + }, + { + "epoch": 0.5303065222401475, + "grad_norm": 0.21647867560386658, + "learning_rate": 4.753729536495104e-05, + "loss": 0.6638, + "step": 2301 + }, + { + "epoch": 0.5305369900898824, + "grad_norm": 0.1859664022922516, + "learning_rate": 4.750001255139358e-05, + "loss": 0.658, + "step": 2302 + }, + { + "epoch": 0.5307674579396174, + "grad_norm": 0.1935039460659027, + "learning_rate": 4.7462731131269114e-05, + "loss": 0.6568, + "step": 2303 + }, + { + "epoch": 0.5309979257893523, + "grad_norm": 0.21178743243217468, + "learning_rate": 4.7425451125357435e-05, + "loss": 0.6627, + "step": 2304 + }, + { + "epoch": 0.5312283936390874, + "grad_norm": 0.20362286269664764, + "learning_rate": 4.738817255443749e-05, + "loss": 0.6597, + "step": 2305 + }, + { + "epoch": 0.5314588614888223, + "grad_norm": 0.20342570543289185, + "learning_rate": 4.735089543928746e-05, + "loss": 0.6692, + "step": 2306 + }, + { + "epoch": 0.5316893293385573, + "grad_norm": 0.1900341808795929, + "learning_rate": 4.731361980068476e-05, + "loss": 0.6613, + "step": 2307 + }, + { + "epoch": 0.5319197971882922, + "grad_norm": 0.19465115666389465, + "learning_rate": 4.727634565940588e-05, + "loss": 0.6727, + "step": 2308 + }, + { + "epoch": 0.5321502650380272, + "grad_norm": 0.19808262586593628, + "learning_rate": 4.723907303622656e-05, + "loss": 0.6605, + "step": 2309 + }, + { + "epoch": 0.5323807328877621, + "grad_norm": 0.1885158270597458, + "learning_rate": 4.7201801951921676e-05, + "loss": 0.6574, + "step": 2310 + }, + { + "epoch": 0.5326112007374971, + "grad_norm": 0.2195241004228592, + "learning_rate": 4.7164532427265204e-05, + "loss": 0.6655, + "step": 2311 + }, + { + "epoch": 0.532841668587232, + "grad_norm": 0.182617649435997, + "learning_rate": 4.712726448303031e-05, + "loss": 0.6608, + "step": 2312 + }, + { + "epoch": 0.533072136436967, + "grad_norm": 0.19868247210979462, + "learning_rate": 4.708999813998924e-05, + "loss": 0.656, + "step": 2313 + }, + { + "epoch": 0.533302604286702, + "grad_norm": 0.17593586444854736, + "learning_rate": 4.7052733418913366e-05, + "loss": 0.6577, + "step": 2314 + }, + { + "epoch": 0.533533072136437, + "grad_norm": 0.20548640191555023, + "learning_rate": 4.701547034057313e-05, + "loss": 0.6563, + "step": 2315 + }, + { + "epoch": 0.5337635399861719, + "grad_norm": 0.20077823102474213, + "learning_rate": 4.697820892573811e-05, + "loss": 0.6607, + "step": 2316 + }, + { + "epoch": 0.5339940078359069, + "grad_norm": 0.2142491191625595, + "learning_rate": 4.694094919517689e-05, + "loss": 0.6625, + "step": 2317 + }, + { + "epoch": 0.5342244756856418, + "grad_norm": 0.1935112625360489, + "learning_rate": 4.6903691169657154e-05, + "loss": 0.6687, + "step": 2318 + }, + { + "epoch": 0.5344549435353768, + "grad_norm": 0.1958986222743988, + "learning_rate": 4.686643486994568e-05, + "loss": 0.6676, + "step": 2319 + }, + { + "epoch": 0.5346854113851118, + "grad_norm": 0.1859496831893921, + "learning_rate": 4.6829180316808165e-05, + "loss": 0.6581, + "step": 2320 + }, + { + "epoch": 0.5349158792348467, + "grad_norm": 0.21781980991363525, + "learning_rate": 4.6791927531009436e-05, + "loss": 0.6678, + "step": 2321 + }, + { + "epoch": 0.5351463470845818, + "grad_norm": 0.19916626811027527, + "learning_rate": 4.675467653331333e-05, + "loss": 0.6597, + "step": 2322 + }, + { + "epoch": 0.5353768149343167, + "grad_norm": 0.21010006964206696, + "learning_rate": 4.67174273444826e-05, + "loss": 0.6651, + "step": 2323 + }, + { + "epoch": 0.5356072827840517, + "grad_norm": 0.19810812175273895, + "learning_rate": 4.668017998527909e-05, + "loss": 0.6601, + "step": 2324 + }, + { + "epoch": 0.5358377506337866, + "grad_norm": 0.21458233892917633, + "learning_rate": 4.664293447646358e-05, + "loss": 0.6638, + "step": 2325 + }, + { + "epoch": 0.5360682184835216, + "grad_norm": 0.1828889101743698, + "learning_rate": 4.660569083879581e-05, + "loss": 0.6631, + "step": 2326 + }, + { + "epoch": 0.5362986863332565, + "grad_norm": 0.21997065842151642, + "learning_rate": 4.656844909303449e-05, + "loss": 0.6602, + "step": 2327 + }, + { + "epoch": 0.5365291541829915, + "grad_norm": 0.18035876750946045, + "learning_rate": 4.653120925993729e-05, + "loss": 0.6686, + "step": 2328 + }, + { + "epoch": 0.5367596220327264, + "grad_norm": 0.1994839310646057, + "learning_rate": 4.649397136026079e-05, + "loss": 0.66, + "step": 2329 + }, + { + "epoch": 0.5369900898824614, + "grad_norm": 0.1981378048658371, + "learning_rate": 4.645673541476049e-05, + "loss": 0.6705, + "step": 2330 + }, + { + "epoch": 0.5372205577321963, + "grad_norm": 0.1892865151166916, + "learning_rate": 4.641950144419085e-05, + "loss": 0.6569, + "step": 2331 + }, + { + "epoch": 0.5374510255819314, + "grad_norm": 0.19472159445285797, + "learning_rate": 4.6382269469305143e-05, + "loss": 0.6662, + "step": 2332 + }, + { + "epoch": 0.5376814934316663, + "grad_norm": 0.20402705669403076, + "learning_rate": 4.634503951085559e-05, + "loss": 0.6651, + "step": 2333 + }, + { + "epoch": 0.5379119612814013, + "grad_norm": 0.19563409686088562, + "learning_rate": 4.630781158959332e-05, + "loss": 0.6596, + "step": 2334 + }, + { + "epoch": 0.5381424291311362, + "grad_norm": 0.20649971067905426, + "learning_rate": 4.627058572626823e-05, + "loss": 0.6567, + "step": 2335 + }, + { + "epoch": 0.5383728969808712, + "grad_norm": 0.178875133395195, + "learning_rate": 4.6233361941629136e-05, + "loss": 0.672, + "step": 2336 + }, + { + "epoch": 0.5386033648306061, + "grad_norm": 0.20355398952960968, + "learning_rate": 4.619614025642371e-05, + "loss": 0.6639, + "step": 2337 + }, + { + "epoch": 0.5388338326803411, + "grad_norm": 0.19655583798885345, + "learning_rate": 4.615892069139839e-05, + "loss": 0.6685, + "step": 2338 + }, + { + "epoch": 0.539064300530076, + "grad_norm": 0.1810363531112671, + "learning_rate": 4.612170326729849e-05, + "loss": 0.6592, + "step": 2339 + }, + { + "epoch": 0.539294768379811, + "grad_norm": 0.19939152896404266, + "learning_rate": 4.6084488004868105e-05, + "loss": 0.6565, + "step": 2340 + }, + { + "epoch": 0.539525236229546, + "grad_norm": 0.18242943286895752, + "learning_rate": 4.604727492485011e-05, + "loss": 0.6669, + "step": 2341 + }, + { + "epoch": 0.539755704079281, + "grad_norm": 0.20084112882614136, + "learning_rate": 4.601006404798621e-05, + "loss": 0.6603, + "step": 2342 + }, + { + "epoch": 0.5399861719290159, + "grad_norm": 0.18553341925144196, + "learning_rate": 4.597285539501684e-05, + "loss": 0.655, + "step": 2343 + }, + { + "epoch": 0.5402166397787509, + "grad_norm": 0.18034282326698303, + "learning_rate": 4.5935648986681215e-05, + "loss": 0.6612, + "step": 2344 + }, + { + "epoch": 0.5404471076284858, + "grad_norm": 0.20821136236190796, + "learning_rate": 4.5898444843717275e-05, + "loss": 0.6679, + "step": 2345 + }, + { + "epoch": 0.5406775754782208, + "grad_norm": 0.18303748965263367, + "learning_rate": 4.586124298686177e-05, + "loss": 0.6627, + "step": 2346 + }, + { + "epoch": 0.5409080433279557, + "grad_norm": 0.2008381336927414, + "learning_rate": 4.582404343685005e-05, + "loss": 0.6603, + "step": 2347 + }, + { + "epoch": 0.5411385111776907, + "grad_norm": 0.18936984241008759, + "learning_rate": 4.578684621441629e-05, + "loss": 0.6602, + "step": 2348 + }, + { + "epoch": 0.5413689790274256, + "grad_norm": 0.22308258712291718, + "learning_rate": 4.574965134029335e-05, + "loss": 0.6597, + "step": 2349 + }, + { + "epoch": 0.5415994468771607, + "grad_norm": 0.1877029538154602, + "learning_rate": 4.5712458835212716e-05, + "loss": 0.6599, + "step": 2350 + }, + { + "epoch": 0.5418299147268956, + "grad_norm": 0.21585100889205933, + "learning_rate": 4.5675268719904624e-05, + "loss": 0.6659, + "step": 2351 + }, + { + "epoch": 0.5420603825766306, + "grad_norm": 0.20248469710350037, + "learning_rate": 4.5638081015097956e-05, + "loss": 0.6575, + "step": 2352 + }, + { + "epoch": 0.5422908504263655, + "grad_norm": 0.19426268339157104, + "learning_rate": 4.560089574152021e-05, + "loss": 0.6667, + "step": 2353 + }, + { + "epoch": 0.5425213182761005, + "grad_norm": 0.22340944409370422, + "learning_rate": 4.5563712919897606e-05, + "loss": 0.6619, + "step": 2354 + }, + { + "epoch": 0.5427517861258354, + "grad_norm": 0.1823105663061142, + "learning_rate": 4.552653257095495e-05, + "loss": 0.6642, + "step": 2355 + }, + { + "epoch": 0.5429822539755704, + "grad_norm": 0.23891690373420715, + "learning_rate": 4.548935471541565e-05, + "loss": 0.6654, + "step": 2356 + }, + { + "epoch": 0.5432127218253053, + "grad_norm": 0.21343962848186493, + "learning_rate": 4.545217937400177e-05, + "loss": 0.6641, + "step": 2357 + }, + { + "epoch": 0.5434431896750404, + "grad_norm": 0.2255440205335617, + "learning_rate": 4.541500656743396e-05, + "loss": 0.6682, + "step": 2358 + }, + { + "epoch": 0.5436736575247753, + "grad_norm": 0.24012601375579834, + "learning_rate": 4.537783631643143e-05, + "loss": 0.6729, + "step": 2359 + }, + { + "epoch": 0.5439041253745103, + "grad_norm": 0.18320710957050323, + "learning_rate": 4.534066864171198e-05, + "loss": 0.6559, + "step": 2360 + }, + { + "epoch": 0.5441345932242452, + "grad_norm": 0.22548629343509674, + "learning_rate": 4.530350356399203e-05, + "loss": 0.6601, + "step": 2361 + }, + { + "epoch": 0.5443650610739802, + "grad_norm": 0.20092882215976715, + "learning_rate": 4.5266341103986436e-05, + "loss": 0.661, + "step": 2362 + }, + { + "epoch": 0.5445955289237151, + "grad_norm": 0.18173551559448242, + "learning_rate": 4.5229181282408705e-05, + "loss": 0.6608, + "step": 2363 + }, + { + "epoch": 0.5448259967734501, + "grad_norm": 0.19635426998138428, + "learning_rate": 4.519202411997083e-05, + "loss": 0.6637, + "step": 2364 + }, + { + "epoch": 0.545056464623185, + "grad_norm": 0.1842816174030304, + "learning_rate": 4.515486963738329e-05, + "loss": 0.664, + "step": 2365 + }, + { + "epoch": 0.54528693247292, + "grad_norm": 0.19681525230407715, + "learning_rate": 4.511771785535513e-05, + "loss": 0.6692, + "step": 2366 + }, + { + "epoch": 0.5455174003226549, + "grad_norm": 0.21367399394512177, + "learning_rate": 4.5080568794593865e-05, + "loss": 0.6541, + "step": 2367 + }, + { + "epoch": 0.54574786817239, + "grad_norm": 0.19998280704021454, + "learning_rate": 4.504342247580546e-05, + "loss": 0.6609, + "step": 2368 + }, + { + "epoch": 0.5459783360221249, + "grad_norm": 0.20067118108272552, + "learning_rate": 4.50062789196944e-05, + "loss": 0.6614, + "step": 2369 + }, + { + "epoch": 0.5462088038718599, + "grad_norm": 0.19367091357707977, + "learning_rate": 4.4969138146963625e-05, + "loss": 0.663, + "step": 2370 + }, + { + "epoch": 0.5464392717215948, + "grad_norm": 0.19135761260986328, + "learning_rate": 4.493200017831448e-05, + "loss": 0.6648, + "step": 2371 + }, + { + "epoch": 0.5466697395713298, + "grad_norm": 0.21066224575042725, + "learning_rate": 4.4894865034446784e-05, + "loss": 0.6591, + "step": 2372 + }, + { + "epoch": 0.5469002074210647, + "grad_norm": 0.19303461909294128, + "learning_rate": 4.4857732736058814e-05, + "loss": 0.6621, + "step": 2373 + }, + { + "epoch": 0.5471306752707997, + "grad_norm": 0.2217341661453247, + "learning_rate": 4.482060330384716e-05, + "loss": 0.6596, + "step": 2374 + }, + { + "epoch": 0.5473611431205346, + "grad_norm": 0.20083634555339813, + "learning_rate": 4.478347675850693e-05, + "loss": 0.6644, + "step": 2375 + }, + { + "epoch": 0.5475916109702696, + "grad_norm": 0.22229771316051483, + "learning_rate": 4.474635312073155e-05, + "loss": 0.6623, + "step": 2376 + }, + { + "epoch": 0.5478220788200046, + "grad_norm": 0.20578941702842712, + "learning_rate": 4.470923241121283e-05, + "loss": 0.654, + "step": 2377 + }, + { + "epoch": 0.5480525466697396, + "grad_norm": 0.21209031343460083, + "learning_rate": 4.467211465064097e-05, + "loss": 0.6662, + "step": 2378 + }, + { + "epoch": 0.5482830145194746, + "grad_norm": 0.2017475962638855, + "learning_rate": 4.4634999859704546e-05, + "loss": 0.6629, + "step": 2379 + }, + { + "epoch": 0.5485134823692095, + "grad_norm": 0.19928203523159027, + "learning_rate": 4.459788805909041e-05, + "loss": 0.6593, + "step": 2380 + }, + { + "epoch": 0.5487439502189445, + "grad_norm": 0.19713225960731506, + "learning_rate": 4.456077926948381e-05, + "loss": 0.6611, + "step": 2381 + }, + { + "epoch": 0.5489744180686794, + "grad_norm": 0.198060542345047, + "learning_rate": 4.45236735115683e-05, + "loss": 0.6569, + "step": 2382 + }, + { + "epoch": 0.5492048859184144, + "grad_norm": 0.19292356073856354, + "learning_rate": 4.448657080602573e-05, + "loss": 0.6666, + "step": 2383 + }, + { + "epoch": 0.5494353537681493, + "grad_norm": 0.18970555067062378, + "learning_rate": 4.444947117353625e-05, + "loss": 0.6612, + "step": 2384 + }, + { + "epoch": 0.5496658216178844, + "grad_norm": 0.19588854908943176, + "learning_rate": 4.441237463477833e-05, + "loss": 0.6697, + "step": 2385 + }, + { + "epoch": 0.5498962894676193, + "grad_norm": 0.20147165656089783, + "learning_rate": 4.437528121042866e-05, + "loss": 0.6583, + "step": 2386 + }, + { + "epoch": 0.5501267573173543, + "grad_norm": 0.21111683547496796, + "learning_rate": 4.433819092116223e-05, + "loss": 0.6589, + "step": 2387 + }, + { + "epoch": 0.5503572251670892, + "grad_norm": 0.17622540891170502, + "learning_rate": 4.430110378765232e-05, + "loss": 0.6652, + "step": 2388 + }, + { + "epoch": 0.5505876930168242, + "grad_norm": 0.22303026914596558, + "learning_rate": 4.4264019830570334e-05, + "loss": 0.6743, + "step": 2389 + }, + { + "epoch": 0.5508181608665591, + "grad_norm": 0.19800515472888947, + "learning_rate": 4.4226939070586035e-05, + "loss": 0.6707, + "step": 2390 + }, + { + "epoch": 0.5510486287162941, + "grad_norm": 0.19506274163722992, + "learning_rate": 4.418986152836734e-05, + "loss": 0.6626, + "step": 2391 + }, + { + "epoch": 0.551279096566029, + "grad_norm": 0.197477325797081, + "learning_rate": 4.4152787224580364e-05, + "loss": 0.6636, + "step": 2392 + }, + { + "epoch": 0.551509564415764, + "grad_norm": 0.2039872705936432, + "learning_rate": 4.411571617988945e-05, + "loss": 0.6642, + "step": 2393 + }, + { + "epoch": 0.551740032265499, + "grad_norm": 0.19217687845230103, + "learning_rate": 4.407864841495711e-05, + "loss": 0.6634, + "step": 2394 + }, + { + "epoch": 0.551970500115234, + "grad_norm": 0.1925874948501587, + "learning_rate": 4.4041583950444004e-05, + "loss": 0.6565, + "step": 2395 + }, + { + "epoch": 0.5522009679649689, + "grad_norm": 0.20073387026786804, + "learning_rate": 4.400452280700899e-05, + "loss": 0.6565, + "step": 2396 + }, + { + "epoch": 0.5524314358147039, + "grad_norm": 0.19481593370437622, + "learning_rate": 4.3967465005309073e-05, + "loss": 0.6638, + "step": 2397 + }, + { + "epoch": 0.5526619036644388, + "grad_norm": 0.18334874510765076, + "learning_rate": 4.393041056599936e-05, + "loss": 0.6625, + "step": 2398 + }, + { + "epoch": 0.5528923715141738, + "grad_norm": 0.187678724527359, + "learning_rate": 4.38933595097331e-05, + "loss": 0.6575, + "step": 2399 + }, + { + "epoch": 0.5531228393639087, + "grad_norm": 0.20493364334106445, + "learning_rate": 4.3856311857161705e-05, + "loss": 0.6672, + "step": 2400 + }, + { + "epoch": 0.5533533072136437, + "grad_norm": 0.1791137009859085, + "learning_rate": 4.381926762893459e-05, + "loss": 0.6603, + "step": 2401 + }, + { + "epoch": 0.5535837750633786, + "grad_norm": 0.19737230241298676, + "learning_rate": 4.378222684569937e-05, + "loss": 0.6646, + "step": 2402 + }, + { + "epoch": 0.5538142429131137, + "grad_norm": 0.22899094223976135, + "learning_rate": 4.374518952810167e-05, + "loss": 0.66, + "step": 2403 + }, + { + "epoch": 0.5540447107628486, + "grad_norm": 0.18935391306877136, + "learning_rate": 4.3708155696785186e-05, + "loss": 0.6459, + "step": 2404 + }, + { + "epoch": 0.5542751786125836, + "grad_norm": 0.17492151260375977, + "learning_rate": 4.3671125372391704e-05, + "loss": 0.6542, + "step": 2405 + }, + { + "epoch": 0.5545056464623185, + "grad_norm": 0.21164800226688385, + "learning_rate": 4.363409857556104e-05, + "loss": 0.6638, + "step": 2406 + }, + { + "epoch": 0.5547361143120535, + "grad_norm": 0.2046276330947876, + "learning_rate": 4.359707532693102e-05, + "loss": 0.6591, + "step": 2407 + }, + { + "epoch": 0.5549665821617884, + "grad_norm": 0.20502637326717377, + "learning_rate": 4.3560055647137536e-05, + "loss": 0.6636, + "step": 2408 + }, + { + "epoch": 0.5551970500115234, + "grad_norm": 0.19672958552837372, + "learning_rate": 4.352303955681447e-05, + "loss": 0.6641, + "step": 2409 + }, + { + "epoch": 0.5554275178612583, + "grad_norm": 0.2050708532333374, + "learning_rate": 4.348602707659369e-05, + "loss": 0.6539, + "step": 2410 + }, + { + "epoch": 0.5556579857109933, + "grad_norm": 0.20396442711353302, + "learning_rate": 4.344901822710506e-05, + "loss": 0.6589, + "step": 2411 + }, + { + "epoch": 0.5558884535607282, + "grad_norm": 0.20161718130111694, + "learning_rate": 4.341201302897645e-05, + "loss": 0.6672, + "step": 2412 + }, + { + "epoch": 0.5561189214104633, + "grad_norm": 0.19954702258110046, + "learning_rate": 4.337501150283364e-05, + "loss": 0.6578, + "step": 2413 + }, + { + "epoch": 0.5563493892601982, + "grad_norm": 0.18954674899578094, + "learning_rate": 4.3338013669300396e-05, + "loss": 0.6642, + "step": 2414 + }, + { + "epoch": 0.5565798571099332, + "grad_norm": 0.2116153985261917, + "learning_rate": 4.3301019548998465e-05, + "loss": 0.6569, + "step": 2415 + }, + { + "epoch": 0.5568103249596681, + "grad_norm": 0.2053348273038864, + "learning_rate": 4.326402916254741e-05, + "loss": 0.6547, + "step": 2416 + }, + { + "epoch": 0.5570407928094031, + "grad_norm": 0.21485279500484467, + "learning_rate": 4.322704253056483e-05, + "loss": 0.6657, + "step": 2417 + }, + { + "epoch": 0.557271260659138, + "grad_norm": 0.21614287793636322, + "learning_rate": 4.31900596736662e-05, + "loss": 0.664, + "step": 2418 + }, + { + "epoch": 0.557501728508873, + "grad_norm": 0.18734347820281982, + "learning_rate": 4.3153080612464835e-05, + "loss": 0.6608, + "step": 2419 + }, + { + "epoch": 0.5577321963586079, + "grad_norm": 0.2166302502155304, + "learning_rate": 4.3116105367572e-05, + "loss": 0.666, + "step": 2420 + }, + { + "epoch": 0.557962664208343, + "grad_norm": 0.18908052146434784, + "learning_rate": 4.3079133959596825e-05, + "loss": 0.6561, + "step": 2421 + }, + { + "epoch": 0.5581931320580779, + "grad_norm": 0.21493315696716309, + "learning_rate": 4.304216640914625e-05, + "loss": 0.658, + "step": 2422 + }, + { + "epoch": 0.5584235999078129, + "grad_norm": 0.17654471099376678, + "learning_rate": 4.300520273682511e-05, + "loss": 0.6602, + "step": 2423 + }, + { + "epoch": 0.5586540677575478, + "grad_norm": 0.22438247501850128, + "learning_rate": 4.2968242963236084e-05, + "loss": 0.6561, + "step": 2424 + }, + { + "epoch": 0.5588845356072828, + "grad_norm": 0.19954872131347656, + "learning_rate": 4.293128710897965e-05, + "loss": 0.6625, + "step": 2425 + }, + { + "epoch": 0.5591150034570177, + "grad_norm": 0.2289677858352661, + "learning_rate": 4.28943351946541e-05, + "loss": 0.6567, + "step": 2426 + }, + { + "epoch": 0.5593454713067527, + "grad_norm": 0.19166941940784454, + "learning_rate": 4.28573872408556e-05, + "loss": 0.6689, + "step": 2427 + }, + { + "epoch": 0.5595759391564876, + "grad_norm": 0.21529777348041534, + "learning_rate": 4.282044326817798e-05, + "loss": 0.6652, + "step": 2428 + }, + { + "epoch": 0.5598064070062226, + "grad_norm": 0.2114769071340561, + "learning_rate": 4.2783503297212975e-05, + "loss": 0.6591, + "step": 2429 + }, + { + "epoch": 0.5600368748559575, + "grad_norm": 0.19013917446136475, + "learning_rate": 4.2746567348550024e-05, + "loss": 0.6567, + "step": 2430 + }, + { + "epoch": 0.5602673427056926, + "grad_norm": 0.2237829715013504, + "learning_rate": 4.2709635442776333e-05, + "loss": 0.6673, + "step": 2431 + }, + { + "epoch": 0.5604978105554275, + "grad_norm": 0.18001073598861694, + "learning_rate": 4.267270760047685e-05, + "loss": 0.663, + "step": 2432 + }, + { + "epoch": 0.5607282784051625, + "grad_norm": 0.21743640303611755, + "learning_rate": 4.263578384223429e-05, + "loss": 0.6747, + "step": 2433 + }, + { + "epoch": 0.5609587462548974, + "grad_norm": 0.1813591569662094, + "learning_rate": 4.2598864188629065e-05, + "loss": 0.6593, + "step": 2434 + }, + { + "epoch": 0.5611892141046324, + "grad_norm": 0.19372020661830902, + "learning_rate": 4.256194866023929e-05, + "loss": 0.6569, + "step": 2435 + }, + { + "epoch": 0.5614196819543673, + "grad_norm": 0.17591120302677155, + "learning_rate": 4.25250372776408e-05, + "loss": 0.6594, + "step": 2436 + }, + { + "epoch": 0.5616501498041023, + "grad_norm": 0.19922824203968048, + "learning_rate": 4.248813006140714e-05, + "loss": 0.6646, + "step": 2437 + }, + { + "epoch": 0.5618806176538373, + "grad_norm": 0.1966255158185959, + "learning_rate": 4.245122703210945e-05, + "loss": 0.6507, + "step": 2438 + }, + { + "epoch": 0.5621110855035722, + "grad_norm": 0.18422779440879822, + "learning_rate": 4.241432821031664e-05, + "loss": 0.6578, + "step": 2439 + }, + { + "epoch": 0.5623415533533073, + "grad_norm": 0.1995001882314682, + "learning_rate": 4.2377433616595244e-05, + "loss": 0.6682, + "step": 2440 + }, + { + "epoch": 0.5625720212030422, + "grad_norm": 0.1927601844072342, + "learning_rate": 4.234054327150935e-05, + "loss": 0.6568, + "step": 2441 + }, + { + "epoch": 0.5628024890527772, + "grad_norm": 0.20397785305976868, + "learning_rate": 4.2303657195620804e-05, + "loss": 0.6562, + "step": 2442 + }, + { + "epoch": 0.5630329569025121, + "grad_norm": 0.18410134315490723, + "learning_rate": 4.2266775409489023e-05, + "loss": 0.6654, + "step": 2443 + }, + { + "epoch": 0.5632634247522471, + "grad_norm": 0.18105067312717438, + "learning_rate": 4.2229897933671006e-05, + "loss": 0.6571, + "step": 2444 + }, + { + "epoch": 0.563493892601982, + "grad_norm": 0.16961467266082764, + "learning_rate": 4.219302478872138e-05, + "loss": 0.6645, + "step": 2445 + }, + { + "epoch": 0.563724360451717, + "grad_norm": 0.19589442014694214, + "learning_rate": 4.2156155995192364e-05, + "loss": 0.6596, + "step": 2446 + }, + { + "epoch": 0.5639548283014519, + "grad_norm": 0.17968355119228363, + "learning_rate": 4.211929157363372e-05, + "loss": 0.6652, + "step": 2447 + }, + { + "epoch": 0.564185296151187, + "grad_norm": 0.19263651967048645, + "learning_rate": 4.208243154459279e-05, + "loss": 0.6628, + "step": 2448 + }, + { + "epoch": 0.5644157640009219, + "grad_norm": 0.22454223036766052, + "learning_rate": 4.20455759286145e-05, + "loss": 0.6667, + "step": 2449 + }, + { + "epoch": 0.5646462318506569, + "grad_norm": 0.18612834811210632, + "learning_rate": 4.200872474624125e-05, + "loss": 0.6652, + "step": 2450 + }, + { + "epoch": 0.5648766997003918, + "grad_norm": 0.1997823268175125, + "learning_rate": 4.197187801801301e-05, + "loss": 0.6521, + "step": 2451 + }, + { + "epoch": 0.5651071675501268, + "grad_norm": 0.1961279809474945, + "learning_rate": 4.1935035764467306e-05, + "loss": 0.6687, + "step": 2452 + }, + { + "epoch": 0.5653376353998617, + "grad_norm": 0.20370718836784363, + "learning_rate": 4.189819800613906e-05, + "loss": 0.6631, + "step": 2453 + }, + { + "epoch": 0.5655681032495967, + "grad_norm": 0.2019963413476944, + "learning_rate": 4.1861364763560785e-05, + "loss": 0.6546, + "step": 2454 + }, + { + "epoch": 0.5657985710993316, + "grad_norm": 0.2010907530784607, + "learning_rate": 4.182453605726247e-05, + "loss": 0.6675, + "step": 2455 + }, + { + "epoch": 0.5660290389490666, + "grad_norm": 0.19456058740615845, + "learning_rate": 4.178771190777151e-05, + "loss": 0.6623, + "step": 2456 + }, + { + "epoch": 0.5662595067988015, + "grad_norm": 0.19006440043449402, + "learning_rate": 4.175089233561282e-05, + "loss": 0.6699, + "step": 2457 + }, + { + "epoch": 0.5664899746485366, + "grad_norm": 0.19521141052246094, + "learning_rate": 4.171407736130876e-05, + "loss": 0.6509, + "step": 2458 + }, + { + "epoch": 0.5667204424982715, + "grad_norm": 0.1771191656589508, + "learning_rate": 4.167726700537909e-05, + "loss": 0.6619, + "step": 2459 + }, + { + "epoch": 0.5669509103480065, + "grad_norm": 0.21703976392745972, + "learning_rate": 4.164046128834103e-05, + "loss": 0.6601, + "step": 2460 + }, + { + "epoch": 0.5671813781977414, + "grad_norm": 0.18353256583213806, + "learning_rate": 4.160366023070921e-05, + "loss": 0.6564, + "step": 2461 + }, + { + "epoch": 0.5674118460474764, + "grad_norm": 0.20448799431324005, + "learning_rate": 4.1566863852995635e-05, + "loss": 0.6552, + "step": 2462 + }, + { + "epoch": 0.5676423138972113, + "grad_norm": 0.18381868302822113, + "learning_rate": 4.153007217570973e-05, + "loss": 0.6629, + "step": 2463 + }, + { + "epoch": 0.5678727817469463, + "grad_norm": 0.20256194472312927, + "learning_rate": 4.149328521935832e-05, + "loss": 0.662, + "step": 2464 + }, + { + "epoch": 0.5681032495966812, + "grad_norm": 0.19661514461040497, + "learning_rate": 4.145650300444553e-05, + "loss": 0.6552, + "step": 2465 + }, + { + "epoch": 0.5683337174464163, + "grad_norm": 0.19295766949653625, + "learning_rate": 4.141972555147291e-05, + "loss": 0.6603, + "step": 2466 + }, + { + "epoch": 0.5685641852961512, + "grad_norm": 0.19877560436725616, + "learning_rate": 4.1382952880939346e-05, + "loss": 0.6532, + "step": 2467 + }, + { + "epoch": 0.5687946531458862, + "grad_norm": 0.20013585686683655, + "learning_rate": 4.1346185013340985e-05, + "loss": 0.6658, + "step": 2468 + }, + { + "epoch": 0.5690251209956211, + "grad_norm": 0.1775607168674469, + "learning_rate": 4.13094219691714e-05, + "loss": 0.6679, + "step": 2469 + }, + { + "epoch": 0.5692555888453561, + "grad_norm": 0.1954258233308792, + "learning_rate": 4.1272663768921426e-05, + "loss": 0.6593, + "step": 2470 + }, + { + "epoch": 0.569486056695091, + "grad_norm": 0.18076026439666748, + "learning_rate": 4.123591043307918e-05, + "loss": 0.66, + "step": 2471 + }, + { + "epoch": 0.569716524544826, + "grad_norm": 0.18170256912708282, + "learning_rate": 4.1199161982130105e-05, + "loss": 0.6651, + "step": 2472 + }, + { + "epoch": 0.5699469923945609, + "grad_norm": 0.19181379675865173, + "learning_rate": 4.116241843655692e-05, + "loss": 0.6556, + "step": 2473 + }, + { + "epoch": 0.5701774602442959, + "grad_norm": 0.1833178848028183, + "learning_rate": 4.1125679816839564e-05, + "loss": 0.6556, + "step": 2474 + }, + { + "epoch": 0.5704079280940308, + "grad_norm": 0.19886410236358643, + "learning_rate": 4.108894614345527e-05, + "loss": 0.6608, + "step": 2475 + }, + { + "epoch": 0.5706383959437659, + "grad_norm": 0.18102464079856873, + "learning_rate": 4.105221743687853e-05, + "loss": 0.6604, + "step": 2476 + }, + { + "epoch": 0.5708688637935008, + "grad_norm": 0.18585921823978424, + "learning_rate": 4.101549371758101e-05, + "loss": 0.6546, + "step": 2477 + }, + { + "epoch": 0.5710993316432358, + "grad_norm": 0.18972377479076385, + "learning_rate": 4.097877500603164e-05, + "loss": 0.6664, + "step": 2478 + }, + { + "epoch": 0.5713297994929707, + "grad_norm": 0.18075346946716309, + "learning_rate": 4.0942061322696584e-05, + "loss": 0.6573, + "step": 2479 + }, + { + "epoch": 0.5715602673427057, + "grad_norm": 0.185418963432312, + "learning_rate": 4.09053526880391e-05, + "loss": 0.6631, + "step": 2480 + }, + { + "epoch": 0.5717907351924406, + "grad_norm": 0.1810043305158615, + "learning_rate": 4.0868649122519756e-05, + "loss": 0.6557, + "step": 2481 + }, + { + "epoch": 0.5720212030421756, + "grad_norm": 0.1840018481016159, + "learning_rate": 4.083195064659622e-05, + "loss": 0.6653, + "step": 2482 + }, + { + "epoch": 0.5722516708919105, + "grad_norm": 0.19294288754463196, + "learning_rate": 4.079525728072334e-05, + "loss": 0.6607, + "step": 2483 + }, + { + "epoch": 0.5724821387416456, + "grad_norm": 0.1916273981332779, + "learning_rate": 4.0758569045353114e-05, + "loss": 0.6646, + "step": 2484 + }, + { + "epoch": 0.5727126065913805, + "grad_norm": 0.18346838653087616, + "learning_rate": 4.07218859609347e-05, + "loss": 0.6529, + "step": 2485 + }, + { + "epoch": 0.5729430744411155, + "grad_norm": 0.19156713783740997, + "learning_rate": 4.0685208047914346e-05, + "loss": 0.6647, + "step": 2486 + }, + { + "epoch": 0.5731735422908504, + "grad_norm": 0.18695339560508728, + "learning_rate": 4.064853532673546e-05, + "loss": 0.6557, + "step": 2487 + }, + { + "epoch": 0.5734040101405854, + "grad_norm": 0.20183449983596802, + "learning_rate": 4.061186781783853e-05, + "loss": 0.6653, + "step": 2488 + }, + { + "epoch": 0.5736344779903203, + "grad_norm": 0.19282028079032898, + "learning_rate": 4.0575205541661135e-05, + "loss": 0.6603, + "step": 2489 + }, + { + "epoch": 0.5738649458400553, + "grad_norm": 0.20781488716602325, + "learning_rate": 4.0538548518637944e-05, + "loss": 0.6528, + "step": 2490 + }, + { + "epoch": 0.5740954136897902, + "grad_norm": 0.21310502290725708, + "learning_rate": 4.050189676920075e-05, + "loss": 0.6605, + "step": 2491 + }, + { + "epoch": 0.5743258815395252, + "grad_norm": 0.2098270207643509, + "learning_rate": 4.046525031377829e-05, + "loss": 0.6566, + "step": 2492 + }, + { + "epoch": 0.5745563493892601, + "grad_norm": 0.19469839334487915, + "learning_rate": 4.042860917279647e-05, + "loss": 0.6613, + "step": 2493 + }, + { + "epoch": 0.5747868172389952, + "grad_norm": 0.20969228446483612, + "learning_rate": 4.039197336667816e-05, + "loss": 0.661, + "step": 2494 + }, + { + "epoch": 0.5750172850887301, + "grad_norm": 0.20014849305152893, + "learning_rate": 4.035534291584328e-05, + "loss": 0.6572, + "step": 2495 + }, + { + "epoch": 0.5752477529384651, + "grad_norm": 0.20189215242862701, + "learning_rate": 4.0318717840708766e-05, + "loss": 0.6551, + "step": 2496 + }, + { + "epoch": 0.5754782207882001, + "grad_norm": 0.20174747705459595, + "learning_rate": 4.028209816168857e-05, + "loss": 0.663, + "step": 2497 + }, + { + "epoch": 0.575708688637935, + "grad_norm": 0.18536518514156342, + "learning_rate": 4.0245483899193595e-05, + "loss": 0.6579, + "step": 2498 + }, + { + "epoch": 0.57593915648767, + "grad_norm": 0.2104630321264267, + "learning_rate": 4.0208875073631767e-05, + "loss": 0.6583, + "step": 2499 + }, + { + "epoch": 0.5761696243374049, + "grad_norm": 0.20068304240703583, + "learning_rate": 4.017227170540797e-05, + "loss": 0.6642, + "step": 2500 + }, + { + "epoch": 0.57640009218714, + "grad_norm": 0.2080548256635666, + "learning_rate": 4.013567381492404e-05, + "loss": 0.6574, + "step": 2501 + }, + { + "epoch": 0.5766305600368749, + "grad_norm": 0.1887027621269226, + "learning_rate": 4.009908142257875e-05, + "loss": 0.6603, + "step": 2502 + }, + { + "epoch": 0.5768610278866099, + "grad_norm": 0.2056475281715393, + "learning_rate": 4.006249454876785e-05, + "loss": 0.6586, + "step": 2503 + }, + { + "epoch": 0.5770914957363448, + "grad_norm": 0.19129523634910583, + "learning_rate": 4.002591321388395e-05, + "loss": 0.6669, + "step": 2504 + }, + { + "epoch": 0.5773219635860798, + "grad_norm": 0.20545504987239838, + "learning_rate": 3.998933743831663e-05, + "loss": 0.6603, + "step": 2505 + }, + { + "epoch": 0.5775524314358147, + "grad_norm": 0.1966099590063095, + "learning_rate": 3.9952767242452364e-05, + "loss": 0.6575, + "step": 2506 + }, + { + "epoch": 0.5777828992855497, + "grad_norm": 0.20709416270256042, + "learning_rate": 3.9916202646674454e-05, + "loss": 0.6619, + "step": 2507 + }, + { + "epoch": 0.5780133671352846, + "grad_norm": 0.20298060774803162, + "learning_rate": 3.987964367136317e-05, + "loss": 0.6602, + "step": 2508 + }, + { + "epoch": 0.5782438349850196, + "grad_norm": 0.19368208944797516, + "learning_rate": 3.98430903368956e-05, + "loss": 0.6637, + "step": 2509 + }, + { + "epoch": 0.5784743028347545, + "grad_norm": 0.21980130672454834, + "learning_rate": 3.980654266364569e-05, + "loss": 0.6596, + "step": 2510 + }, + { + "epoch": 0.5787047706844896, + "grad_norm": 0.20827548205852509, + "learning_rate": 3.977000067198422e-05, + "loss": 0.6553, + "step": 2511 + }, + { + "epoch": 0.5789352385342245, + "grad_norm": 0.2053709775209427, + "learning_rate": 3.9733464382278854e-05, + "loss": 0.6532, + "step": 2512 + }, + { + "epoch": 0.5791657063839595, + "grad_norm": 0.2115817666053772, + "learning_rate": 3.969693381489401e-05, + "loss": 0.6573, + "step": 2513 + }, + { + "epoch": 0.5793961742336944, + "grad_norm": 0.20240476727485657, + "learning_rate": 3.966040899019096e-05, + "loss": 0.6572, + "step": 2514 + }, + { + "epoch": 0.5796266420834294, + "grad_norm": 0.20553265511989594, + "learning_rate": 3.962388992852778e-05, + "loss": 0.6593, + "step": 2515 + }, + { + "epoch": 0.5798571099331643, + "grad_norm": 0.20434992015361786, + "learning_rate": 3.95873766502593e-05, + "loss": 0.657, + "step": 2516 + }, + { + "epoch": 0.5800875777828993, + "grad_norm": 0.19303986430168152, + "learning_rate": 3.955086917573714e-05, + "loss": 0.6632, + "step": 2517 + }, + { + "epoch": 0.5803180456326342, + "grad_norm": 0.18680398166179657, + "learning_rate": 3.951436752530973e-05, + "loss": 0.6624, + "step": 2518 + }, + { + "epoch": 0.5805485134823692, + "grad_norm": 0.19972355663776398, + "learning_rate": 3.947787171932215e-05, + "loss": 0.656, + "step": 2519 + }, + { + "epoch": 0.5807789813321041, + "grad_norm": 0.18271125853061676, + "learning_rate": 3.944138177811633e-05, + "loss": 0.6557, + "step": 2520 + }, + { + "epoch": 0.5810094491818392, + "grad_norm": 0.21198797225952148, + "learning_rate": 3.9404897722030886e-05, + "loss": 0.6563, + "step": 2521 + }, + { + "epoch": 0.5812399170315741, + "grad_norm": 0.18243679404258728, + "learning_rate": 3.936841957140115e-05, + "loss": 0.6591, + "step": 2522 + }, + { + "epoch": 0.5814703848813091, + "grad_norm": 0.21174080669879913, + "learning_rate": 3.933194734655916e-05, + "loss": 0.6634, + "step": 2523 + }, + { + "epoch": 0.581700852731044, + "grad_norm": 0.19727849960327148, + "learning_rate": 3.929548106783367e-05, + "loss": 0.6504, + "step": 2524 + }, + { + "epoch": 0.581931320580779, + "grad_norm": 0.19438689947128296, + "learning_rate": 3.925902075555009e-05, + "loss": 0.6678, + "step": 2525 + }, + { + "epoch": 0.5821617884305139, + "grad_norm": 0.20954324305057526, + "learning_rate": 3.9222566430030536e-05, + "loss": 0.6592, + "step": 2526 + }, + { + "epoch": 0.5823922562802489, + "grad_norm": 0.1915033459663391, + "learning_rate": 3.918611811159379e-05, + "loss": 0.6503, + "step": 2527 + }, + { + "epoch": 0.5826227241299838, + "grad_norm": 0.21163514256477356, + "learning_rate": 3.914967582055524e-05, + "loss": 0.6506, + "step": 2528 + }, + { + "epoch": 0.5828531919797189, + "grad_norm": 0.19895008206367493, + "learning_rate": 3.911323957722694e-05, + "loss": 0.659, + "step": 2529 + }, + { + "epoch": 0.5830836598294538, + "grad_norm": 0.1981336772441864, + "learning_rate": 3.907680940191764e-05, + "loss": 0.6543, + "step": 2530 + }, + { + "epoch": 0.5833141276791888, + "grad_norm": 0.19202502071857452, + "learning_rate": 3.904038531493257e-05, + "loss": 0.6523, + "step": 2531 + }, + { + "epoch": 0.5835445955289237, + "grad_norm": 0.2164509892463684, + "learning_rate": 3.900396733657366e-05, + "loss": 0.6538, + "step": 2532 + }, + { + "epoch": 0.5837750633786587, + "grad_norm": 0.18908166885375977, + "learning_rate": 3.896755548713946e-05, + "loss": 0.6575, + "step": 2533 + }, + { + "epoch": 0.5840055312283936, + "grad_norm": 0.21118958294391632, + "learning_rate": 3.8931149786925e-05, + "loss": 0.6605, + "step": 2534 + }, + { + "epoch": 0.5842359990781286, + "grad_norm": 0.1941760927438736, + "learning_rate": 3.889475025622199e-05, + "loss": 0.6668, + "step": 2535 + }, + { + "epoch": 0.5844664669278635, + "grad_norm": 0.20846354961395264, + "learning_rate": 3.885835691531865e-05, + "loss": 0.6565, + "step": 2536 + }, + { + "epoch": 0.5846969347775985, + "grad_norm": 0.19090569019317627, + "learning_rate": 3.882196978449972e-05, + "loss": 0.6618, + "step": 2537 + }, + { + "epoch": 0.5849274026273334, + "grad_norm": 0.2103571891784668, + "learning_rate": 3.878558888404655e-05, + "loss": 0.6521, + "step": 2538 + }, + { + "epoch": 0.5851578704770685, + "grad_norm": 0.19896583259105682, + "learning_rate": 3.874921423423697e-05, + "loss": 0.6568, + "step": 2539 + }, + { + "epoch": 0.5853883383268034, + "grad_norm": 0.18624718487262726, + "learning_rate": 3.8712845855345325e-05, + "loss": 0.6519, + "step": 2540 + }, + { + "epoch": 0.5856188061765384, + "grad_norm": 0.19909995794296265, + "learning_rate": 3.867648376764248e-05, + "loss": 0.6624, + "step": 2541 + }, + { + "epoch": 0.5858492740262733, + "grad_norm": 0.18448121845722198, + "learning_rate": 3.864012799139581e-05, + "loss": 0.6622, + "step": 2542 + }, + { + "epoch": 0.5860797418760083, + "grad_norm": 0.20322075486183167, + "learning_rate": 3.860377854686913e-05, + "loss": 0.6648, + "step": 2543 + }, + { + "epoch": 0.5863102097257432, + "grad_norm": 0.19707633554935455, + "learning_rate": 3.8567435454322745e-05, + "loss": 0.6581, + "step": 2544 + }, + { + "epoch": 0.5865406775754782, + "grad_norm": 0.18212544918060303, + "learning_rate": 3.853109873401346e-05, + "loss": 0.6537, + "step": 2545 + }, + { + "epoch": 0.5867711454252131, + "grad_norm": 0.19013650715351105, + "learning_rate": 3.849476840619443e-05, + "loss": 0.6569, + "step": 2546 + }, + { + "epoch": 0.5870016132749482, + "grad_norm": 0.20143313705921173, + "learning_rate": 3.845844449111535e-05, + "loss": 0.6615, + "step": 2547 + }, + { + "epoch": 0.5872320811246831, + "grad_norm": 0.20364569127559662, + "learning_rate": 3.842212700902231e-05, + "loss": 0.6565, + "step": 2548 + }, + { + "epoch": 0.5874625489744181, + "grad_norm": 0.1817772090435028, + "learning_rate": 3.838581598015776e-05, + "loss": 0.6533, + "step": 2549 + }, + { + "epoch": 0.587693016824153, + "grad_norm": 0.1907220184803009, + "learning_rate": 3.834951142476063e-05, + "loss": 0.6604, + "step": 2550 + }, + { + "epoch": 0.587923484673888, + "grad_norm": 0.19262264668941498, + "learning_rate": 3.83132133630662e-05, + "loss": 0.656, + "step": 2551 + }, + { + "epoch": 0.5881539525236229, + "grad_norm": 0.18963393568992615, + "learning_rate": 3.827692181530612e-05, + "loss": 0.6608, + "step": 2552 + }, + { + "epoch": 0.5883844203733579, + "grad_norm": 0.20289820432662964, + "learning_rate": 3.8240636801708444e-05, + "loss": 0.6643, + "step": 2553 + }, + { + "epoch": 0.5886148882230928, + "grad_norm": 0.17837589979171753, + "learning_rate": 3.8204358342497584e-05, + "loss": 0.6626, + "step": 2554 + }, + { + "epoch": 0.5888453560728278, + "grad_norm": 0.2154434323310852, + "learning_rate": 3.816808645789425e-05, + "loss": 0.6643, + "step": 2555 + }, + { + "epoch": 0.5890758239225629, + "grad_norm": 0.19185711443424225, + "learning_rate": 3.813182116811552e-05, + "loss": 0.6568, + "step": 2556 + }, + { + "epoch": 0.5893062917722978, + "grad_norm": 0.18890288472175598, + "learning_rate": 3.809556249337484e-05, + "loss": 0.652, + "step": 2557 + }, + { + "epoch": 0.5895367596220328, + "grad_norm": 0.20426598191261292, + "learning_rate": 3.805931045388188e-05, + "loss": 0.6576, + "step": 2558 + }, + { + "epoch": 0.5897672274717677, + "grad_norm": 0.18901459872722626, + "learning_rate": 3.8023065069842653e-05, + "loss": 0.6577, + "step": 2559 + }, + { + "epoch": 0.5899976953215027, + "grad_norm": 0.19619275629520416, + "learning_rate": 3.7986826361459524e-05, + "loss": 0.6664, + "step": 2560 + }, + { + "epoch": 0.5902281631712376, + "grad_norm": 0.1934560090303421, + "learning_rate": 3.795059434893101e-05, + "loss": 0.6597, + "step": 2561 + }, + { + "epoch": 0.5904586310209726, + "grad_norm": 0.1980142444372177, + "learning_rate": 3.7914369052452006e-05, + "loss": 0.6619, + "step": 2562 + }, + { + "epoch": 0.5906890988707075, + "grad_norm": 0.1985742449760437, + "learning_rate": 3.787815049221361e-05, + "loss": 0.6524, + "step": 2563 + }, + { + "epoch": 0.5909195667204425, + "grad_norm": 0.1910460889339447, + "learning_rate": 3.784193868840318e-05, + "loss": 0.6597, + "step": 2564 + }, + { + "epoch": 0.5911500345701775, + "grad_norm": 0.1858750432729721, + "learning_rate": 3.7805733661204306e-05, + "loss": 0.6636, + "step": 2565 + }, + { + "epoch": 0.5913805024199125, + "grad_norm": 0.20883874595165253, + "learning_rate": 3.77695354307968e-05, + "loss": 0.6565, + "step": 2566 + }, + { + "epoch": 0.5916109702696474, + "grad_norm": 0.18166543543338776, + "learning_rate": 3.77333440173567e-05, + "loss": 0.6525, + "step": 2567 + }, + { + "epoch": 0.5918414381193824, + "grad_norm": 0.1856246143579483, + "learning_rate": 3.7697159441056205e-05, + "loss": 0.6615, + "step": 2568 + }, + { + "epoch": 0.5920719059691173, + "grad_norm": 0.19537490606307983, + "learning_rate": 3.7660981722063745e-05, + "loss": 0.6563, + "step": 2569 + }, + { + "epoch": 0.5923023738188523, + "grad_norm": 0.19945433735847473, + "learning_rate": 3.762481088054393e-05, + "loss": 0.6573, + "step": 2570 + }, + { + "epoch": 0.5925328416685872, + "grad_norm": 0.20831739902496338, + "learning_rate": 3.758864693665748e-05, + "loss": 0.6633, + "step": 2571 + }, + { + "epoch": 0.5927633095183222, + "grad_norm": 0.21567675471305847, + "learning_rate": 3.7552489910561326e-05, + "loss": 0.6572, + "step": 2572 + }, + { + "epoch": 0.5929937773680571, + "grad_norm": 0.215605229139328, + "learning_rate": 3.751633982240855e-05, + "loss": 0.6625, + "step": 2573 + }, + { + "epoch": 0.5932242452177922, + "grad_norm": 0.19509264826774597, + "learning_rate": 3.7480196692348315e-05, + "loss": 0.6619, + "step": 2574 + }, + { + "epoch": 0.5934547130675271, + "grad_norm": 0.20573195815086365, + "learning_rate": 3.744406054052594e-05, + "loss": 0.6536, + "step": 2575 + }, + { + "epoch": 0.5936851809172621, + "grad_norm": 0.18587082624435425, + "learning_rate": 3.740793138708285e-05, + "loss": 0.6453, + "step": 2576 + }, + { + "epoch": 0.593915648766997, + "grad_norm": 0.20044218003749847, + "learning_rate": 3.737180925215658e-05, + "loss": 0.6622, + "step": 2577 + }, + { + "epoch": 0.594146116616732, + "grad_norm": 0.20294342935085297, + "learning_rate": 3.733569415588071e-05, + "loss": 0.6649, + "step": 2578 + }, + { + "epoch": 0.5943765844664669, + "grad_norm": 0.1900031715631485, + "learning_rate": 3.729958611838496e-05, + "loss": 0.6607, + "step": 2579 + }, + { + "epoch": 0.5946070523162019, + "grad_norm": 0.21229608356952667, + "learning_rate": 3.7263485159795075e-05, + "loss": 0.6624, + "step": 2580 + }, + { + "epoch": 0.5948375201659368, + "grad_norm": 0.18590863049030304, + "learning_rate": 3.7227391300232845e-05, + "loss": 0.6529, + "step": 2581 + }, + { + "epoch": 0.5950679880156718, + "grad_norm": 0.20582017302513123, + "learning_rate": 3.7191304559816165e-05, + "loss": 0.6516, + "step": 2582 + }, + { + "epoch": 0.5952984558654068, + "grad_norm": 0.18912175297737122, + "learning_rate": 3.715522495865885e-05, + "loss": 0.6669, + "step": 2583 + }, + { + "epoch": 0.5955289237151418, + "grad_norm": 0.2016802579164505, + "learning_rate": 3.711915251687086e-05, + "loss": 0.6562, + "step": 2584 + }, + { + "epoch": 0.5957593915648767, + "grad_norm": 0.18063144385814667, + "learning_rate": 3.7083087254558116e-05, + "loss": 0.6661, + "step": 2585 + }, + { + "epoch": 0.5959898594146117, + "grad_norm": 0.20485232770442963, + "learning_rate": 3.7047029191822455e-05, + "loss": 0.6597, + "step": 2586 + }, + { + "epoch": 0.5962203272643466, + "grad_norm": 0.17791825532913208, + "learning_rate": 3.701097834876185e-05, + "loss": 0.6555, + "step": 2587 + }, + { + "epoch": 0.5964507951140816, + "grad_norm": 0.2008713036775589, + "learning_rate": 3.697493474547016e-05, + "loss": 0.6649, + "step": 2588 + }, + { + "epoch": 0.5966812629638165, + "grad_norm": 0.19855642318725586, + "learning_rate": 3.693889840203719e-05, + "loss": 0.6608, + "step": 2589 + }, + { + "epoch": 0.5969117308135515, + "grad_norm": 0.1890726089477539, + "learning_rate": 3.690286933854877e-05, + "loss": 0.6555, + "step": 2590 + }, + { + "epoch": 0.5971421986632864, + "grad_norm": 0.19872720539569855, + "learning_rate": 3.686684757508663e-05, + "loss": 0.6596, + "step": 2591 + }, + { + "epoch": 0.5973726665130215, + "grad_norm": 0.1832602471113205, + "learning_rate": 3.683083313172841e-05, + "loss": 0.6586, + "step": 2592 + }, + { + "epoch": 0.5976031343627564, + "grad_norm": 0.2064426690340042, + "learning_rate": 3.679482602854773e-05, + "loss": 0.6593, + "step": 2593 + }, + { + "epoch": 0.5978336022124914, + "grad_norm": 0.17292998731136322, + "learning_rate": 3.675882628561408e-05, + "loss": 0.6601, + "step": 2594 + }, + { + "epoch": 0.5980640700622263, + "grad_norm": 0.2106057107448578, + "learning_rate": 3.672283392299282e-05, + "loss": 0.6503, + "step": 2595 + }, + { + "epoch": 0.5982945379119613, + "grad_norm": 0.1862911581993103, + "learning_rate": 3.6686848960745244e-05, + "loss": 0.6562, + "step": 2596 + }, + { + "epoch": 0.5985250057616962, + "grad_norm": 0.1884952336549759, + "learning_rate": 3.6650871418928556e-05, + "loss": 0.6618, + "step": 2597 + }, + { + "epoch": 0.5987554736114312, + "grad_norm": 0.17884479463100433, + "learning_rate": 3.66149013175957e-05, + "loss": 0.6558, + "step": 2598 + }, + { + "epoch": 0.5989859414611661, + "grad_norm": 0.1988133192062378, + "learning_rate": 3.6578938676795575e-05, + "loss": 0.6535, + "step": 2599 + }, + { + "epoch": 0.5992164093109011, + "grad_norm": 0.18073412775993347, + "learning_rate": 3.6542983516572915e-05, + "loss": 0.6506, + "step": 2600 + }, + { + "epoch": 0.599446877160636, + "grad_norm": 0.19410258531570435, + "learning_rate": 3.650703585696824e-05, + "loss": 0.6578, + "step": 2601 + }, + { + "epoch": 0.5996773450103711, + "grad_norm": 0.1835651844739914, + "learning_rate": 3.647109571801792e-05, + "loss": 0.6606, + "step": 2602 + }, + { + "epoch": 0.599907812860106, + "grad_norm": 0.21423934400081635, + "learning_rate": 3.643516311975413e-05, + "loss": 0.6515, + "step": 2603 + }, + { + "epoch": 0.600138280709841, + "grad_norm": 0.19270770251750946, + "learning_rate": 3.639923808220483e-05, + "loss": 0.6521, + "step": 2604 + }, + { + "epoch": 0.6003687485595759, + "grad_norm": 0.18155768513679504, + "learning_rate": 3.636332062539378e-05, + "loss": 0.6557, + "step": 2605 + }, + { + "epoch": 0.6005992164093109, + "grad_norm": 0.2070186883211136, + "learning_rate": 3.6327410769340505e-05, + "loss": 0.6535, + "step": 2606 + }, + { + "epoch": 0.6008296842590458, + "grad_norm": 0.17192067205905914, + "learning_rate": 3.6291508534060296e-05, + "loss": 0.6545, + "step": 2607 + }, + { + "epoch": 0.6010601521087808, + "grad_norm": 0.20923547446727753, + "learning_rate": 3.625561393956417e-05, + "loss": 0.6645, + "step": 2608 + }, + { + "epoch": 0.6012906199585157, + "grad_norm": 0.18093320727348328, + "learning_rate": 3.621972700585898e-05, + "loss": 0.653, + "step": 2609 + }, + { + "epoch": 0.6015210878082508, + "grad_norm": 0.19666415452957153, + "learning_rate": 3.618384775294718e-05, + "loss": 0.6512, + "step": 2610 + }, + { + "epoch": 0.6017515556579857, + "grad_norm": 0.1838379055261612, + "learning_rate": 3.614797620082703e-05, + "loss": 0.658, + "step": 2611 + }, + { + "epoch": 0.6019820235077207, + "grad_norm": 0.19848011434078217, + "learning_rate": 3.611211236949248e-05, + "loss": 0.6495, + "step": 2612 + }, + { + "epoch": 0.6022124913574556, + "grad_norm": 0.19375987350940704, + "learning_rate": 3.6076256278933145e-05, + "loss": 0.6578, + "step": 2613 + }, + { + "epoch": 0.6024429592071906, + "grad_norm": 0.18482361733913422, + "learning_rate": 3.604040794913437e-05, + "loss": 0.6623, + "step": 2614 + }, + { + "epoch": 0.6026734270569255, + "grad_norm": 0.18653304874897003, + "learning_rate": 3.600456740007714e-05, + "loss": 0.6625, + "step": 2615 + }, + { + "epoch": 0.6029038949066605, + "grad_norm": 0.17585410177707672, + "learning_rate": 3.5968734651738124e-05, + "loss": 0.6578, + "step": 2616 + }, + { + "epoch": 0.6031343627563955, + "grad_norm": 0.18889284133911133, + "learning_rate": 3.5932909724089626e-05, + "loss": 0.6487, + "step": 2617 + }, + { + "epoch": 0.6033648306061304, + "grad_norm": 0.1867912858724594, + "learning_rate": 3.589709263709963e-05, + "loss": 0.65, + "step": 2618 + }, + { + "epoch": 0.6035952984558655, + "grad_norm": 0.18602915108203888, + "learning_rate": 3.586128341073167e-05, + "loss": 0.6587, + "step": 2619 + }, + { + "epoch": 0.6038257663056004, + "grad_norm": 0.16733336448669434, + "learning_rate": 3.582548206494499e-05, + "loss": 0.6489, + "step": 2620 + }, + { + "epoch": 0.6040562341553354, + "grad_norm": 0.1732863187789917, + "learning_rate": 3.578968861969441e-05, + "loss": 0.6551, + "step": 2621 + }, + { + "epoch": 0.6042867020050703, + "grad_norm": 0.18000958859920502, + "learning_rate": 3.575390309493029e-05, + "loss": 0.6646, + "step": 2622 + }, + { + "epoch": 0.6045171698548053, + "grad_norm": 0.17645825445652008, + "learning_rate": 3.5718125510598645e-05, + "loss": 0.6579, + "step": 2623 + }, + { + "epoch": 0.6047476377045402, + "grad_norm": 0.18449077010154724, + "learning_rate": 3.5682355886641075e-05, + "loss": 0.6467, + "step": 2624 + }, + { + "epoch": 0.6049781055542752, + "grad_norm": 0.1781143993139267, + "learning_rate": 3.5646594242994646e-05, + "loss": 0.6607, + "step": 2625 + }, + { + "epoch": 0.6052085734040101, + "grad_norm": 0.1856512576341629, + "learning_rate": 3.5610840599592096e-05, + "loss": 0.6588, + "step": 2626 + }, + { + "epoch": 0.6054390412537451, + "grad_norm": 0.18362030386924744, + "learning_rate": 3.5575094976361625e-05, + "loss": 0.6636, + "step": 2627 + }, + { + "epoch": 0.60566950910348, + "grad_norm": 0.172404944896698, + "learning_rate": 3.553935739322698e-05, + "loss": 0.6543, + "step": 2628 + }, + { + "epoch": 0.6058999769532151, + "grad_norm": 0.18513377010822296, + "learning_rate": 3.550362787010744e-05, + "loss": 0.6549, + "step": 2629 + }, + { + "epoch": 0.60613044480295, + "grad_norm": 0.1736796498298645, + "learning_rate": 3.5467906426917795e-05, + "loss": 0.6516, + "step": 2630 + }, + { + "epoch": 0.606360912652685, + "grad_norm": 0.18415535986423492, + "learning_rate": 3.54321930835683e-05, + "loss": 0.6537, + "step": 2631 + }, + { + "epoch": 0.6065913805024199, + "grad_norm": 0.1662503480911255, + "learning_rate": 3.539648785996471e-05, + "loss": 0.6554, + "step": 2632 + }, + { + "epoch": 0.6068218483521549, + "grad_norm": 0.17874038219451904, + "learning_rate": 3.536079077600829e-05, + "loss": 0.6543, + "step": 2633 + }, + { + "epoch": 0.6070523162018898, + "grad_norm": 0.17104828357696533, + "learning_rate": 3.532510185159572e-05, + "loss": 0.6532, + "step": 2634 + }, + { + "epoch": 0.6072827840516248, + "grad_norm": 0.1689852625131607, + "learning_rate": 3.5289421106619126e-05, + "loss": 0.6549, + "step": 2635 + }, + { + "epoch": 0.6075132519013597, + "grad_norm": 0.17767558991909027, + "learning_rate": 3.525374856096616e-05, + "loss": 0.6609, + "step": 2636 + }, + { + "epoch": 0.6077437197510948, + "grad_norm": 0.1855451464653015, + "learning_rate": 3.5218084234519764e-05, + "loss": 0.6619, + "step": 2637 + }, + { + "epoch": 0.6079741876008297, + "grad_norm": 0.17707620561122894, + "learning_rate": 3.518242814715844e-05, + "loss": 0.6615, + "step": 2638 + }, + { + "epoch": 0.6082046554505647, + "grad_norm": 0.18121664226055145, + "learning_rate": 3.514678031875602e-05, + "loss": 0.663, + "step": 2639 + }, + { + "epoch": 0.6084351233002996, + "grad_norm": 0.1841084361076355, + "learning_rate": 3.5111140769181735e-05, + "loss": 0.6526, + "step": 2640 + }, + { + "epoch": 0.6086655911500346, + "grad_norm": 0.18913474678993225, + "learning_rate": 3.5075509518300224e-05, + "loss": 0.6576, + "step": 2641 + }, + { + "epoch": 0.6088960589997695, + "grad_norm": 0.1743757128715515, + "learning_rate": 3.5039886585971496e-05, + "loss": 0.6534, + "step": 2642 + }, + { + "epoch": 0.6091265268495045, + "grad_norm": 0.1718021035194397, + "learning_rate": 3.500427199205091e-05, + "loss": 0.667, + "step": 2643 + }, + { + "epoch": 0.6093569946992394, + "grad_norm": 0.18834391236305237, + "learning_rate": 3.4968665756389174e-05, + "loss": 0.6608, + "step": 2644 + }, + { + "epoch": 0.6095874625489744, + "grad_norm": 0.17941658198833466, + "learning_rate": 3.4933067898832375e-05, + "loss": 0.6586, + "step": 2645 + }, + { + "epoch": 0.6098179303987094, + "grad_norm": 0.1901034712791443, + "learning_rate": 3.489747843922189e-05, + "loss": 0.6558, + "step": 2646 + }, + { + "epoch": 0.6100483982484444, + "grad_norm": 0.1816999912261963, + "learning_rate": 3.4861897397394415e-05, + "loss": 0.6595, + "step": 2647 + }, + { + "epoch": 0.6102788660981793, + "grad_norm": 0.187819242477417, + "learning_rate": 3.482632479318201e-05, + "loss": 0.6464, + "step": 2648 + }, + { + "epoch": 0.6105093339479143, + "grad_norm": 0.20110177993774414, + "learning_rate": 3.479076064641195e-05, + "loss": 0.6544, + "step": 2649 + }, + { + "epoch": 0.6107398017976492, + "grad_norm": 0.19999714195728302, + "learning_rate": 3.475520497690684e-05, + "loss": 0.6536, + "step": 2650 + }, + { + "epoch": 0.6109702696473842, + "grad_norm": 0.20862232148647308, + "learning_rate": 3.471965780448461e-05, + "loss": 0.6516, + "step": 2651 + }, + { + "epoch": 0.6112007374971191, + "grad_norm": 0.1882224977016449, + "learning_rate": 3.4684119148958314e-05, + "loss": 0.6561, + "step": 2652 + }, + { + "epoch": 0.6114312053468541, + "grad_norm": 0.22021904587745667, + "learning_rate": 3.464858903013641e-05, + "loss": 0.6543, + "step": 2653 + }, + { + "epoch": 0.611661673196589, + "grad_norm": 0.1685914695262909, + "learning_rate": 3.461306746782253e-05, + "loss": 0.6563, + "step": 2654 + }, + { + "epoch": 0.6118921410463241, + "grad_norm": 0.19287706911563873, + "learning_rate": 3.457755448181551e-05, + "loss": 0.647, + "step": 2655 + }, + { + "epoch": 0.612122608896059, + "grad_norm": 0.18482106924057007, + "learning_rate": 3.454205009190945e-05, + "loss": 0.6534, + "step": 2656 + }, + { + "epoch": 0.612353076745794, + "grad_norm": 0.18831658363342285, + "learning_rate": 3.450655431789366e-05, + "loss": 0.6577, + "step": 2657 + }, + { + "epoch": 0.6125835445955289, + "grad_norm": 0.1809016317129135, + "learning_rate": 3.447106717955261e-05, + "loss": 0.6562, + "step": 2658 + }, + { + "epoch": 0.6128140124452639, + "grad_norm": 0.19347211718559265, + "learning_rate": 3.443558869666598e-05, + "loss": 0.649, + "step": 2659 + }, + { + "epoch": 0.6130444802949988, + "grad_norm": 0.19009865820407867, + "learning_rate": 3.4400118889008646e-05, + "loss": 0.6468, + "step": 2660 + }, + { + "epoch": 0.6132749481447338, + "grad_norm": 0.19617056846618652, + "learning_rate": 3.4364657776350605e-05, + "loss": 0.658, + "step": 2661 + }, + { + "epoch": 0.6135054159944687, + "grad_norm": 0.19712841510772705, + "learning_rate": 3.432920537845703e-05, + "loss": 0.6584, + "step": 2662 + }, + { + "epoch": 0.6137358838442037, + "grad_norm": 0.18756158649921417, + "learning_rate": 3.429376171508827e-05, + "loss": 0.6505, + "step": 2663 + }, + { + "epoch": 0.6139663516939387, + "grad_norm": 0.19545352458953857, + "learning_rate": 3.425832680599972e-05, + "loss": 0.6567, + "step": 2664 + }, + { + "epoch": 0.6141968195436737, + "grad_norm": 0.19188739359378815, + "learning_rate": 3.4222900670941995e-05, + "loss": 0.6664, + "step": 2665 + }, + { + "epoch": 0.6144272873934086, + "grad_norm": 0.19670219719409943, + "learning_rate": 3.4187483329660755e-05, + "loss": 0.6572, + "step": 2666 + }, + { + "epoch": 0.6146577552431436, + "grad_norm": 0.18227505683898926, + "learning_rate": 3.415207480189676e-05, + "loss": 0.6524, + "step": 2667 + }, + { + "epoch": 0.6148882230928785, + "grad_norm": 0.19035394489765167, + "learning_rate": 3.4116675107385885e-05, + "loss": 0.6538, + "step": 2668 + }, + { + "epoch": 0.6151186909426135, + "grad_norm": 0.18735790252685547, + "learning_rate": 3.408128426585909e-05, + "loss": 0.6554, + "step": 2669 + }, + { + "epoch": 0.6153491587923484, + "grad_norm": 0.20038571953773499, + "learning_rate": 3.4045902297042356e-05, + "loss": 0.6627, + "step": 2670 + }, + { + "epoch": 0.6155796266420834, + "grad_norm": 0.17390772700309753, + "learning_rate": 3.401052922065675e-05, + "loss": 0.6551, + "step": 2671 + }, + { + "epoch": 0.6158100944918183, + "grad_norm": 0.19239278137683868, + "learning_rate": 3.397516505641838e-05, + "loss": 0.654, + "step": 2672 + }, + { + "epoch": 0.6160405623415534, + "grad_norm": 0.1672012060880661, + "learning_rate": 3.393980982403837e-05, + "loss": 0.6537, + "step": 2673 + }, + { + "epoch": 0.6162710301912883, + "grad_norm": 0.19565598666667938, + "learning_rate": 3.3904463543222876e-05, + "loss": 0.6523, + "step": 2674 + }, + { + "epoch": 0.6165014980410233, + "grad_norm": 0.17734399437904358, + "learning_rate": 3.386912623367311e-05, + "loss": 0.6533, + "step": 2675 + }, + { + "epoch": 0.6167319658907583, + "grad_norm": 0.18294768035411835, + "learning_rate": 3.383379791508519e-05, + "loss": 0.6492, + "step": 2676 + }, + { + "epoch": 0.6169624337404932, + "grad_norm": 0.187822625041008, + "learning_rate": 3.3798478607150274e-05, + "loss": 0.6568, + "step": 2677 + }, + { + "epoch": 0.6171929015902282, + "grad_norm": 0.17213211953639984, + "learning_rate": 3.376316832955454e-05, + "loss": 0.6634, + "step": 2678 + }, + { + "epoch": 0.6174233694399631, + "grad_norm": 0.18802215158939362, + "learning_rate": 3.3727867101979036e-05, + "loss": 0.6512, + "step": 2679 + }, + { + "epoch": 0.6176538372896981, + "grad_norm": 0.16777820885181427, + "learning_rate": 3.369257494409985e-05, + "loss": 0.6475, + "step": 2680 + }, + { + "epoch": 0.617884305139433, + "grad_norm": 0.19203679263591766, + "learning_rate": 3.3657291875587996e-05, + "loss": 0.6552, + "step": 2681 + }, + { + "epoch": 0.6181147729891681, + "grad_norm": 0.19129416346549988, + "learning_rate": 3.3622017916109376e-05, + "loss": 0.6645, + "step": 2682 + }, + { + "epoch": 0.618345240838903, + "grad_norm": 0.17216765880584717, + "learning_rate": 3.358675308532486e-05, + "loss": 0.6611, + "step": 2683 + }, + { + "epoch": 0.618575708688638, + "grad_norm": 0.18110480904579163, + "learning_rate": 3.355149740289024e-05, + "loss": 0.6505, + "step": 2684 + }, + { + "epoch": 0.6188061765383729, + "grad_norm": 0.18171606957912445, + "learning_rate": 3.3516250888456165e-05, + "loss": 0.6439, + "step": 2685 + }, + { + "epoch": 0.6190366443881079, + "grad_norm": 0.17510759830474854, + "learning_rate": 3.348101356166819e-05, + "loss": 0.6511, + "step": 2686 + }, + { + "epoch": 0.6192671122378428, + "grad_norm": 0.18047381937503815, + "learning_rate": 3.344578544216678e-05, + "loss": 0.6561, + "step": 2687 + }, + { + "epoch": 0.6194975800875778, + "grad_norm": 0.17559714615345, + "learning_rate": 3.341056654958722e-05, + "loss": 0.6538, + "step": 2688 + }, + { + "epoch": 0.6197280479373127, + "grad_norm": 0.18052367866039276, + "learning_rate": 3.3375356903559676e-05, + "loss": 0.6521, + "step": 2689 + }, + { + "epoch": 0.6199585157870477, + "grad_norm": 0.17809216678142548, + "learning_rate": 3.3340156523709185e-05, + "loss": 0.657, + "step": 2690 + }, + { + "epoch": 0.6201889836367827, + "grad_norm": 0.18209248781204224, + "learning_rate": 3.330496542965556e-05, + "loss": 0.651, + "step": 2691 + }, + { + "epoch": 0.6204194514865177, + "grad_norm": 0.1755290925502777, + "learning_rate": 3.32697836410135e-05, + "loss": 0.6581, + "step": 2692 + }, + { + "epoch": 0.6206499193362526, + "grad_norm": 0.16924412548542023, + "learning_rate": 3.323461117739248e-05, + "loss": 0.6497, + "step": 2693 + }, + { + "epoch": 0.6208803871859876, + "grad_norm": 0.18715137243270874, + "learning_rate": 3.3199448058396786e-05, + "loss": 0.6562, + "step": 2694 + }, + { + "epoch": 0.6211108550357225, + "grad_norm": 0.17708511650562286, + "learning_rate": 3.3164294303625486e-05, + "loss": 0.6584, + "step": 2695 + }, + { + "epoch": 0.6213413228854575, + "grad_norm": 0.1799190789461136, + "learning_rate": 3.312914993267246e-05, + "loss": 0.6591, + "step": 2696 + }, + { + "epoch": 0.6215717907351924, + "grad_norm": 0.19582661986351013, + "learning_rate": 3.309401496512633e-05, + "loss": 0.6479, + "step": 2697 + }, + { + "epoch": 0.6218022585849274, + "grad_norm": 0.17881685495376587, + "learning_rate": 3.305888942057047e-05, + "loss": 0.653, + "step": 2698 + }, + { + "epoch": 0.6220327264346623, + "grad_norm": 0.1858767569065094, + "learning_rate": 3.302377331858302e-05, + "loss": 0.652, + "step": 2699 + }, + { + "epoch": 0.6222631942843974, + "grad_norm": 0.16495144367218018, + "learning_rate": 3.298866667873688e-05, + "loss": 0.6534, + "step": 2700 + }, + { + "epoch": 0.6224936621341323, + "grad_norm": 0.1828274130821228, + "learning_rate": 3.2953569520599606e-05, + "loss": 0.6452, + "step": 2701 + }, + { + "epoch": 0.6227241299838673, + "grad_norm": 0.1840105652809143, + "learning_rate": 3.291848186373353e-05, + "loss": 0.6509, + "step": 2702 + }, + { + "epoch": 0.6229545978336022, + "grad_norm": 0.1832989603281021, + "learning_rate": 3.2883403727695694e-05, + "loss": 0.6469, + "step": 2703 + }, + { + "epoch": 0.6231850656833372, + "grad_norm": 0.18650995194911957, + "learning_rate": 3.2848335132037763e-05, + "loss": 0.654, + "step": 2704 + }, + { + "epoch": 0.6234155335330721, + "grad_norm": 0.17295216023921967, + "learning_rate": 3.2813276096306156e-05, + "loss": 0.654, + "step": 2705 + }, + { + "epoch": 0.6236460013828071, + "grad_norm": 0.17538554966449738, + "learning_rate": 3.2778226640041956e-05, + "loss": 0.6558, + "step": 2706 + }, + { + "epoch": 0.623876469232542, + "grad_norm": 0.15956903994083405, + "learning_rate": 3.274318678278085e-05, + "loss": 0.6538, + "step": 2707 + }, + { + "epoch": 0.624106937082277, + "grad_norm": 0.18525734543800354, + "learning_rate": 3.270815654405324e-05, + "loss": 0.6538, + "step": 2708 + }, + { + "epoch": 0.624337404932012, + "grad_norm": 0.16766776144504547, + "learning_rate": 3.267313594338414e-05, + "loss": 0.6503, + "step": 2709 + }, + { + "epoch": 0.624567872781747, + "grad_norm": 0.19396458566188812, + "learning_rate": 3.2638125000293165e-05, + "loss": 0.6553, + "step": 2710 + }, + { + "epoch": 0.6247983406314819, + "grad_norm": 0.1728057563304901, + "learning_rate": 3.2603123734294606e-05, + "loss": 0.6621, + "step": 2711 + }, + { + "epoch": 0.6250288084812169, + "grad_norm": 0.179381862282753, + "learning_rate": 3.2568132164897316e-05, + "loss": 0.6579, + "step": 2712 + }, + { + "epoch": 0.6252592763309518, + "grad_norm": 0.1791330873966217, + "learning_rate": 3.253315031160475e-05, + "loss": 0.66, + "step": 2713 + }, + { + "epoch": 0.6254897441806868, + "grad_norm": 0.18653972446918488, + "learning_rate": 3.249817819391495e-05, + "loss": 0.6593, + "step": 2714 + }, + { + "epoch": 0.6257202120304217, + "grad_norm": 0.17987924814224243, + "learning_rate": 3.246321583132058e-05, + "loss": 0.6539, + "step": 2715 + }, + { + "epoch": 0.6259506798801567, + "grad_norm": 0.19162432849407196, + "learning_rate": 3.242826324330875e-05, + "loss": 0.6572, + "step": 2716 + }, + { + "epoch": 0.6261811477298916, + "grad_norm": 0.19111159443855286, + "learning_rate": 3.2393320449361246e-05, + "loss": 0.6546, + "step": 2717 + }, + { + "epoch": 0.6264116155796267, + "grad_norm": 0.17735035717487335, + "learning_rate": 3.235838746895434e-05, + "loss": 0.6561, + "step": 2718 + }, + { + "epoch": 0.6266420834293616, + "grad_norm": 0.1921262890100479, + "learning_rate": 3.2323464321558804e-05, + "loss": 0.6506, + "step": 2719 + }, + { + "epoch": 0.6268725512790966, + "grad_norm": 0.1894858479499817, + "learning_rate": 3.2288551026639986e-05, + "loss": 0.6502, + "step": 2720 + }, + { + "epoch": 0.6271030191288315, + "grad_norm": 0.19639429450035095, + "learning_rate": 3.225364760365772e-05, + "loss": 0.6546, + "step": 2721 + }, + { + "epoch": 0.6273334869785665, + "grad_norm": 0.1766640543937683, + "learning_rate": 3.221875407206633e-05, + "loss": 0.66, + "step": 2722 + }, + { + "epoch": 0.6275639548283014, + "grad_norm": 0.18973618745803833, + "learning_rate": 3.2183870451314624e-05, + "loss": 0.6616, + "step": 2723 + }, + { + "epoch": 0.6277944226780364, + "grad_norm": 0.19568420946598053, + "learning_rate": 3.2148996760845924e-05, + "loss": 0.653, + "step": 2724 + }, + { + "epoch": 0.6280248905277713, + "grad_norm": 0.18159765005111694, + "learning_rate": 3.2114133020097945e-05, + "loss": 0.6494, + "step": 2725 + }, + { + "epoch": 0.6282553583775063, + "grad_norm": 0.2033732682466507, + "learning_rate": 3.2079279248502925e-05, + "loss": 0.6529, + "step": 2726 + }, + { + "epoch": 0.6284858262272413, + "grad_norm": 0.17295779287815094, + "learning_rate": 3.204443546548754e-05, + "loss": 0.65, + "step": 2727 + }, + { + "epoch": 0.6287162940769763, + "grad_norm": 0.1940968781709671, + "learning_rate": 3.2009601690472834e-05, + "loss": 0.6572, + "step": 2728 + }, + { + "epoch": 0.6289467619267112, + "grad_norm": 0.17500554025173187, + "learning_rate": 3.197477794287435e-05, + "loss": 0.6498, + "step": 2729 + }, + { + "epoch": 0.6291772297764462, + "grad_norm": 0.18749865889549255, + "learning_rate": 3.1939964242102004e-05, + "loss": 0.65, + "step": 2730 + }, + { + "epoch": 0.6294076976261811, + "grad_norm": 0.19215604662895203, + "learning_rate": 3.190516060756009e-05, + "loss": 0.6679, + "step": 2731 + }, + { + "epoch": 0.6296381654759161, + "grad_norm": 0.18513312935829163, + "learning_rate": 3.187036705864736e-05, + "loss": 0.6617, + "step": 2732 + }, + { + "epoch": 0.629868633325651, + "grad_norm": 0.20599323511123657, + "learning_rate": 3.183558361475687e-05, + "loss": 0.652, + "step": 2733 + }, + { + "epoch": 0.630099101175386, + "grad_norm": 0.18766285479068756, + "learning_rate": 3.1800810295276095e-05, + "loss": 0.6536, + "step": 2734 + }, + { + "epoch": 0.630329569025121, + "grad_norm": 0.19500939548015594, + "learning_rate": 3.1766047119586845e-05, + "loss": 0.6591, + "step": 2735 + }, + { + "epoch": 0.630560036874856, + "grad_norm": 0.19361793994903564, + "learning_rate": 3.173129410706528e-05, + "loss": 0.6434, + "step": 2736 + }, + { + "epoch": 0.630790504724591, + "grad_norm": 0.17595571279525757, + "learning_rate": 3.169655127708189e-05, + "loss": 0.6405, + "step": 2737 + }, + { + "epoch": 0.6310209725743259, + "grad_norm": 0.19558657705783844, + "learning_rate": 3.1661818649001494e-05, + "loss": 0.6524, + "step": 2738 + }, + { + "epoch": 0.6312514404240609, + "grad_norm": 0.18385924398899078, + "learning_rate": 3.1627096242183243e-05, + "loss": 0.6614, + "step": 2739 + }, + { + "epoch": 0.6314819082737958, + "grad_norm": 0.19142907857894897, + "learning_rate": 3.159238407598054e-05, + "loss": 0.657, + "step": 2740 + }, + { + "epoch": 0.6317123761235308, + "grad_norm": 0.18829232454299927, + "learning_rate": 3.155768216974112e-05, + "loss": 0.6483, + "step": 2741 + }, + { + "epoch": 0.6319428439732657, + "grad_norm": 0.1925613284111023, + "learning_rate": 3.1522990542807046e-05, + "loss": 0.6478, + "step": 2742 + }, + { + "epoch": 0.6321733118230007, + "grad_norm": 0.17510046064853668, + "learning_rate": 3.148830921451452e-05, + "loss": 0.6551, + "step": 2743 + }, + { + "epoch": 0.6324037796727356, + "grad_norm": 0.17866365611553192, + "learning_rate": 3.1453638204194135e-05, + "loss": 0.6585, + "step": 2744 + }, + { + "epoch": 0.6326342475224707, + "grad_norm": 0.18686874210834503, + "learning_rate": 3.141897753117067e-05, + "loss": 0.6517, + "step": 2745 + }, + { + "epoch": 0.6328647153722056, + "grad_norm": 0.16817151010036469, + "learning_rate": 3.1384327214763135e-05, + "loss": 0.6591, + "step": 2746 + }, + { + "epoch": 0.6330951832219406, + "grad_norm": 0.1925475150346756, + "learning_rate": 3.13496872742848e-05, + "loss": 0.6576, + "step": 2747 + }, + { + "epoch": 0.6333256510716755, + "grad_norm": 0.19002029299736023, + "learning_rate": 3.131505772904314e-05, + "loss": 0.6528, + "step": 2748 + }, + { + "epoch": 0.6335561189214105, + "grad_norm": 0.1708815097808838, + "learning_rate": 3.128043859833981e-05, + "loss": 0.6493, + "step": 2749 + }, + { + "epoch": 0.6337865867711454, + "grad_norm": 0.19003167748451233, + "learning_rate": 3.12458299014707e-05, + "loss": 0.6568, + "step": 2750 + }, + { + "epoch": 0.6340170546208804, + "grad_norm": 0.1745605319738388, + "learning_rate": 3.121123165772588e-05, + "loss": 0.6547, + "step": 2751 + }, + { + "epoch": 0.6342475224706153, + "grad_norm": 0.18086248636245728, + "learning_rate": 3.1176643886389554e-05, + "loss": 0.6454, + "step": 2752 + }, + { + "epoch": 0.6344779903203503, + "grad_norm": 0.17058676481246948, + "learning_rate": 3.114206660674011e-05, + "loss": 0.6611, + "step": 2753 + }, + { + "epoch": 0.6347084581700853, + "grad_norm": 0.1747319996356964, + "learning_rate": 3.1107499838050146e-05, + "loss": 0.6428, + "step": 2754 + }, + { + "epoch": 0.6349389260198203, + "grad_norm": 0.17745546996593475, + "learning_rate": 3.107294359958628e-05, + "loss": 0.6602, + "step": 2755 + }, + { + "epoch": 0.6351693938695552, + "grad_norm": 0.16670075058937073, + "learning_rate": 3.103839791060937e-05, + "loss": 0.6608, + "step": 2756 + }, + { + "epoch": 0.6353998617192902, + "grad_norm": 0.1686505526304245, + "learning_rate": 3.1003862790374364e-05, + "loss": 0.6535, + "step": 2757 + }, + { + "epoch": 0.6356303295690251, + "grad_norm": 0.1769195795059204, + "learning_rate": 3.0969338258130274e-05, + "loss": 0.6537, + "step": 2758 + }, + { + "epoch": 0.6358607974187601, + "grad_norm": 0.16672547161579132, + "learning_rate": 3.093482433312027e-05, + "loss": 0.644, + "step": 2759 + }, + { + "epoch": 0.636091265268495, + "grad_norm": 0.17630359530448914, + "learning_rate": 3.090032103458159e-05, + "loss": 0.6616, + "step": 2760 + }, + { + "epoch": 0.63632173311823, + "grad_norm": 0.1749248057603836, + "learning_rate": 3.086582838174551e-05, + "loss": 0.6506, + "step": 2761 + }, + { + "epoch": 0.6365522009679649, + "grad_norm": 0.16957643628120422, + "learning_rate": 3.0831346393837445e-05, + "loss": 0.6552, + "step": 2762 + }, + { + "epoch": 0.6367826688177, + "grad_norm": 0.16888821125030518, + "learning_rate": 3.079687509007682e-05, + "loss": 0.6457, + "step": 2763 + }, + { + "epoch": 0.6370131366674349, + "grad_norm": 0.17261339724063873, + "learning_rate": 3.076241448967709e-05, + "loss": 0.6568, + "step": 2764 + }, + { + "epoch": 0.6372436045171699, + "grad_norm": 0.18425433337688446, + "learning_rate": 3.072796461184579e-05, + "loss": 0.6558, + "step": 2765 + }, + { + "epoch": 0.6374740723669048, + "grad_norm": 0.1994054913520813, + "learning_rate": 3.069352547578445e-05, + "loss": 0.6559, + "step": 2766 + }, + { + "epoch": 0.6377045402166398, + "grad_norm": 0.17605717480182648, + "learning_rate": 3.0659097100688607e-05, + "loss": 0.6496, + "step": 2767 + }, + { + "epoch": 0.6379350080663747, + "grad_norm": 0.18694092333316803, + "learning_rate": 3.062467950574781e-05, + "loss": 0.6579, + "step": 2768 + }, + { + "epoch": 0.6381654759161097, + "grad_norm": 0.18670520186424255, + "learning_rate": 3.059027271014564e-05, + "loss": 0.6445, + "step": 2769 + }, + { + "epoch": 0.6383959437658446, + "grad_norm": 0.18028950691223145, + "learning_rate": 3.055587673305955e-05, + "loss": 0.6488, + "step": 2770 + }, + { + "epoch": 0.6386264116155796, + "grad_norm": 0.18050305545330048, + "learning_rate": 3.052149159366109e-05, + "loss": 0.6566, + "step": 2771 + }, + { + "epoch": 0.6388568794653146, + "grad_norm": 0.18738004565238953, + "learning_rate": 3.0487117311115687e-05, + "loss": 0.6465, + "step": 2772 + }, + { + "epoch": 0.6390873473150496, + "grad_norm": 0.1994061917066574, + "learning_rate": 3.0452753904582747e-05, + "loss": 0.65, + "step": 2773 + }, + { + "epoch": 0.6393178151647845, + "grad_norm": 0.1728476881980896, + "learning_rate": 3.04184013932156e-05, + "loss": 0.6544, + "step": 2774 + }, + { + "epoch": 0.6395482830145195, + "grad_norm": 0.20216725766658783, + "learning_rate": 3.0384059796161535e-05, + "loss": 0.6532, + "step": 2775 + }, + { + "epoch": 0.6397787508642544, + "grad_norm": 0.17996247112751007, + "learning_rate": 3.03497291325617e-05, + "loss": 0.6548, + "step": 2776 + }, + { + "epoch": 0.6400092187139894, + "grad_norm": 0.20604853332042694, + "learning_rate": 3.03154094215512e-05, + "loss": 0.6472, + "step": 2777 + }, + { + "epoch": 0.6402396865637243, + "grad_norm": 0.1871616393327713, + "learning_rate": 3.0281100682259032e-05, + "loss": 0.6537, + "step": 2778 + }, + { + "epoch": 0.6404701544134593, + "grad_norm": 0.18597093224525452, + "learning_rate": 3.024680293380804e-05, + "loss": 0.6539, + "step": 2779 + }, + { + "epoch": 0.6407006222631942, + "grad_norm": 0.2045731246471405, + "learning_rate": 3.0212516195314955e-05, + "loss": 0.6574, + "step": 2780 + }, + { + "epoch": 0.6409310901129293, + "grad_norm": 0.19084343314170837, + "learning_rate": 3.017824048589044e-05, + "loss": 0.6615, + "step": 2781 + }, + { + "epoch": 0.6411615579626642, + "grad_norm": 0.17811843752861023, + "learning_rate": 3.0143975824638887e-05, + "loss": 0.6502, + "step": 2782 + }, + { + "epoch": 0.6413920258123992, + "grad_norm": 0.19356852769851685, + "learning_rate": 3.0109722230658633e-05, + "loss": 0.6544, + "step": 2783 + }, + { + "epoch": 0.6416224936621341, + "grad_norm": 0.18267957866191864, + "learning_rate": 3.0075479723041804e-05, + "loss": 0.6531, + "step": 2784 + }, + { + "epoch": 0.6418529615118691, + "grad_norm": 0.18512317538261414, + "learning_rate": 3.0041248320874343e-05, + "loss": 0.6616, + "step": 2785 + }, + { + "epoch": 0.642083429361604, + "grad_norm": 0.17581506073474884, + "learning_rate": 3.0007028043236013e-05, + "loss": 0.6548, + "step": 2786 + }, + { + "epoch": 0.642313897211339, + "grad_norm": 0.17305217683315277, + "learning_rate": 2.9972818909200396e-05, + "loss": 0.6558, + "step": 2787 + }, + { + "epoch": 0.6425443650610739, + "grad_norm": 0.1679084450006485, + "learning_rate": 2.9938620937834804e-05, + "loss": 0.6515, + "step": 2788 + }, + { + "epoch": 0.642774832910809, + "grad_norm": 0.16354484856128693, + "learning_rate": 2.9904434148200395e-05, + "loss": 0.6591, + "step": 2789 + }, + { + "epoch": 0.6430053007605439, + "grad_norm": 0.171888068318367, + "learning_rate": 2.9870258559352067e-05, + "loss": 0.6477, + "step": 2790 + }, + { + "epoch": 0.6432357686102789, + "grad_norm": 0.1731024980545044, + "learning_rate": 2.9836094190338448e-05, + "loss": 0.6491, + "step": 2791 + }, + { + "epoch": 0.6434662364600138, + "grad_norm": 0.1673286110162735, + "learning_rate": 2.9801941060201944e-05, + "loss": 0.6474, + "step": 2792 + }, + { + "epoch": 0.6436967043097488, + "grad_norm": 0.18038569390773773, + "learning_rate": 2.976779918797873e-05, + "loss": 0.6492, + "step": 2793 + }, + { + "epoch": 0.6439271721594838, + "grad_norm": 0.1763104796409607, + "learning_rate": 2.973366859269861e-05, + "loss": 0.6451, + "step": 2794 + }, + { + "epoch": 0.6441576400092187, + "grad_norm": 0.17095611989498138, + "learning_rate": 2.9699549293385176e-05, + "loss": 0.6561, + "step": 2795 + }, + { + "epoch": 0.6443881078589537, + "grad_norm": 0.16650767624378204, + "learning_rate": 2.966544130905574e-05, + "loss": 0.665, + "step": 2796 + }, + { + "epoch": 0.6446185757086886, + "grad_norm": 0.1730603128671646, + "learning_rate": 2.9631344658721215e-05, + "loss": 0.6566, + "step": 2797 + }, + { + "epoch": 0.6448490435584237, + "grad_norm": 0.17055529356002808, + "learning_rate": 2.9597259361386305e-05, + "loss": 0.6559, + "step": 2798 + }, + { + "epoch": 0.6450795114081586, + "grad_norm": 0.246446430683136, + "learning_rate": 2.9563185436049323e-05, + "loss": 0.659, + "step": 2799 + }, + { + "epoch": 0.6453099792578936, + "grad_norm": 0.16206693649291992, + "learning_rate": 2.9529122901702244e-05, + "loss": 0.6473, + "step": 2800 + }, + { + "epoch": 0.6455404471076285, + "grad_norm": 0.17527292668819427, + "learning_rate": 2.9495071777330717e-05, + "loss": 0.6558, + "step": 2801 + }, + { + "epoch": 0.6457709149573635, + "grad_norm": 0.16923247277736664, + "learning_rate": 2.946103208191403e-05, + "loss": 0.6567, + "step": 2802 + }, + { + "epoch": 0.6460013828070984, + "grad_norm": 0.18780246376991272, + "learning_rate": 2.9427003834425075e-05, + "loss": 0.6511, + "step": 2803 + }, + { + "epoch": 0.6462318506568334, + "grad_norm": 0.18644435703754425, + "learning_rate": 2.939298705383039e-05, + "loss": 0.6495, + "step": 2804 + }, + { + "epoch": 0.6464623185065683, + "grad_norm": 0.1641910821199417, + "learning_rate": 2.935898175909012e-05, + "loss": 0.65, + "step": 2805 + }, + { + "epoch": 0.6466927863563033, + "grad_norm": 0.18704113364219666, + "learning_rate": 2.9324987969157974e-05, + "loss": 0.6479, + "step": 2806 + }, + { + "epoch": 0.6469232542060382, + "grad_norm": 0.18336251378059387, + "learning_rate": 2.9291005702981288e-05, + "loss": 0.6506, + "step": 2807 + }, + { + "epoch": 0.6471537220557733, + "grad_norm": 0.18807761371135712, + "learning_rate": 2.9257034979500986e-05, + "loss": 0.6477, + "step": 2808 + }, + { + "epoch": 0.6473841899055082, + "grad_norm": 0.18197950720787048, + "learning_rate": 2.922307581765149e-05, + "loss": 0.6592, + "step": 2809 + }, + { + "epoch": 0.6476146577552432, + "grad_norm": 0.20037148892879486, + "learning_rate": 2.9189128236360852e-05, + "loss": 0.6513, + "step": 2810 + }, + { + "epoch": 0.6478451256049781, + "grad_norm": 0.17539845407009125, + "learning_rate": 2.915519225455065e-05, + "loss": 0.646, + "step": 2811 + }, + { + "epoch": 0.6480755934547131, + "grad_norm": 0.19264990091323853, + "learning_rate": 2.9121267891135952e-05, + "loss": 0.6494, + "step": 2812 + }, + { + "epoch": 0.648306061304448, + "grad_norm": 0.1686212420463562, + "learning_rate": 2.908735516502537e-05, + "loss": 0.6511, + "step": 2813 + }, + { + "epoch": 0.648536529154183, + "grad_norm": 0.1705242097377777, + "learning_rate": 2.905345409512112e-05, + "loss": 0.6491, + "step": 2814 + }, + { + "epoch": 0.6487669970039179, + "grad_norm": 0.1819016933441162, + "learning_rate": 2.9019564700318768e-05, + "loss": 0.6421, + "step": 2815 + }, + { + "epoch": 0.648997464853653, + "grad_norm": 0.17861327528953552, + "learning_rate": 2.8985686999507488e-05, + "loss": 0.6532, + "step": 2816 + }, + { + "epoch": 0.6492279327033879, + "grad_norm": 0.2011132538318634, + "learning_rate": 2.89518210115699e-05, + "loss": 0.6426, + "step": 2817 + }, + { + "epoch": 0.6494584005531229, + "grad_norm": 0.16986913979053497, + "learning_rate": 2.8917966755382048e-05, + "loss": 0.6544, + "step": 2818 + }, + { + "epoch": 0.6496888684028578, + "grad_norm": 0.19294098019599915, + "learning_rate": 2.8884124249813526e-05, + "loss": 0.655, + "step": 2819 + }, + { + "epoch": 0.6499193362525928, + "grad_norm": 0.18572697043418884, + "learning_rate": 2.885029351372735e-05, + "loss": 0.6563, + "step": 2820 + }, + { + "epoch": 0.6501498041023277, + "grad_norm": 0.19467751681804657, + "learning_rate": 2.881647456597991e-05, + "loss": 0.6606, + "step": 2821 + }, + { + "epoch": 0.6503802719520627, + "grad_norm": 0.18023499846458435, + "learning_rate": 2.8782667425421096e-05, + "loss": 0.6527, + "step": 2822 + }, + { + "epoch": 0.6506107398017976, + "grad_norm": 0.19630929827690125, + "learning_rate": 2.87488721108942e-05, + "loss": 0.6475, + "step": 2823 + }, + { + "epoch": 0.6508412076515326, + "grad_norm": 0.17777912318706512, + "learning_rate": 2.871508864123592e-05, + "loss": 0.6543, + "step": 2824 + }, + { + "epoch": 0.6510716755012675, + "grad_norm": 0.20080679655075073, + "learning_rate": 2.868131703527635e-05, + "loss": 0.6556, + "step": 2825 + }, + { + "epoch": 0.6513021433510026, + "grad_norm": 0.1675284504890442, + "learning_rate": 2.864755731183899e-05, + "loss": 0.6482, + "step": 2826 + }, + { + "epoch": 0.6515326112007375, + "grad_norm": 0.1921423077583313, + "learning_rate": 2.8613809489740662e-05, + "loss": 0.6511, + "step": 2827 + }, + { + "epoch": 0.6517630790504725, + "grad_norm": 0.17145875096321106, + "learning_rate": 2.8580073587791596e-05, + "loss": 0.6484, + "step": 2828 + }, + { + "epoch": 0.6519935469002074, + "grad_norm": 0.19045400619506836, + "learning_rate": 2.8546349624795404e-05, + "loss": 0.6555, + "step": 2829 + }, + { + "epoch": 0.6522240147499424, + "grad_norm": 0.18223097920417786, + "learning_rate": 2.8512637619549022e-05, + "loss": 0.643, + "step": 2830 + }, + { + "epoch": 0.6524544825996773, + "grad_norm": 0.17837011814117432, + "learning_rate": 2.847893759084267e-05, + "loss": 0.653, + "step": 2831 + }, + { + "epoch": 0.6526849504494123, + "grad_norm": 0.17700053751468658, + "learning_rate": 2.8445249557459953e-05, + "loss": 0.6537, + "step": 2832 + }, + { + "epoch": 0.6529154182991472, + "grad_norm": 0.17131763696670532, + "learning_rate": 2.8411573538177772e-05, + "loss": 0.6515, + "step": 2833 + }, + { + "epoch": 0.6531458861488822, + "grad_norm": 0.1803225427865982, + "learning_rate": 2.837790955176634e-05, + "loss": 0.6526, + "step": 2834 + }, + { + "epoch": 0.6533763539986172, + "grad_norm": 0.17444370687007904, + "learning_rate": 2.8344257616989144e-05, + "loss": 0.6564, + "step": 2835 + }, + { + "epoch": 0.6536068218483522, + "grad_norm": 0.18952690064907074, + "learning_rate": 2.8310617752602996e-05, + "loss": 0.6535, + "step": 2836 + }, + { + "epoch": 0.6538372896980871, + "grad_norm": 0.16605597734451294, + "learning_rate": 2.8276989977357894e-05, + "loss": 0.6439, + "step": 2837 + }, + { + "epoch": 0.6540677575478221, + "grad_norm": 0.19188542664051056, + "learning_rate": 2.8243374309997166e-05, + "loss": 0.654, + "step": 2838 + }, + { + "epoch": 0.654298225397557, + "grad_norm": 0.17647671699523926, + "learning_rate": 2.8209770769257437e-05, + "loss": 0.643, + "step": 2839 + }, + { + "epoch": 0.654528693247292, + "grad_norm": 0.19883349537849426, + "learning_rate": 2.8176179373868443e-05, + "loss": 0.662, + "step": 2840 + }, + { + "epoch": 0.6547591610970269, + "grad_norm": 0.17992442846298218, + "learning_rate": 2.8142600142553254e-05, + "loss": 0.6503, + "step": 2841 + }, + { + "epoch": 0.6549896289467619, + "grad_norm": 0.19443680346012115, + "learning_rate": 2.8109033094028126e-05, + "loss": 0.6492, + "step": 2842 + }, + { + "epoch": 0.6552200967964968, + "grad_norm": 0.18997105956077576, + "learning_rate": 2.8075478247002518e-05, + "loss": 0.6544, + "step": 2843 + }, + { + "epoch": 0.6554505646462319, + "grad_norm": 0.2033330202102661, + "learning_rate": 2.8041935620179105e-05, + "loss": 0.6607, + "step": 2844 + }, + { + "epoch": 0.6556810324959668, + "grad_norm": 0.18512104451656342, + "learning_rate": 2.8008405232253752e-05, + "loss": 0.6491, + "step": 2845 + }, + { + "epoch": 0.6559115003457018, + "grad_norm": 0.1780814528465271, + "learning_rate": 2.7974887101915458e-05, + "loss": 0.6497, + "step": 2846 + }, + { + "epoch": 0.6561419681954367, + "grad_norm": 0.20027370750904083, + "learning_rate": 2.7941381247846453e-05, + "loss": 0.6509, + "step": 2847 + }, + { + "epoch": 0.6563724360451717, + "grad_norm": 0.1773580014705658, + "learning_rate": 2.7907887688722085e-05, + "loss": 0.6508, + "step": 2848 + }, + { + "epoch": 0.6566029038949066, + "grad_norm": 0.20906241238117218, + "learning_rate": 2.7874406443210866e-05, + "loss": 0.6531, + "step": 2849 + }, + { + "epoch": 0.6568333717446416, + "grad_norm": 0.17152054607868195, + "learning_rate": 2.784093752997443e-05, + "loss": 0.6519, + "step": 2850 + }, + { + "epoch": 0.6570638395943765, + "grad_norm": 0.20871195197105408, + "learning_rate": 2.7807480967667576e-05, + "loss": 0.6517, + "step": 2851 + }, + { + "epoch": 0.6572943074441115, + "grad_norm": 0.18561682105064392, + "learning_rate": 2.7774036774938138e-05, + "loss": 0.6447, + "step": 2852 + }, + { + "epoch": 0.6575247752938465, + "grad_norm": 0.19581186771392822, + "learning_rate": 2.774060497042712e-05, + "loss": 0.6593, + "step": 2853 + }, + { + "epoch": 0.6577552431435815, + "grad_norm": 0.1915295422077179, + "learning_rate": 2.7707185572768656e-05, + "loss": 0.6545, + "step": 2854 + }, + { + "epoch": 0.6579857109933165, + "grad_norm": 0.19746430218219757, + "learning_rate": 2.7673778600589862e-05, + "loss": 0.6521, + "step": 2855 + }, + { + "epoch": 0.6582161788430514, + "grad_norm": 0.17735208570957184, + "learning_rate": 2.7640384072510994e-05, + "loss": 0.6554, + "step": 2856 + }, + { + "epoch": 0.6584466466927864, + "grad_norm": 0.19774432480335236, + "learning_rate": 2.7607002007145377e-05, + "loss": 0.6499, + "step": 2857 + }, + { + "epoch": 0.6586771145425213, + "grad_norm": 0.17123498022556305, + "learning_rate": 2.7573632423099355e-05, + "loss": 0.6539, + "step": 2858 + }, + { + "epoch": 0.6589075823922563, + "grad_norm": 0.19462165236473083, + "learning_rate": 2.7540275338972343e-05, + "loss": 0.6533, + "step": 2859 + }, + { + "epoch": 0.6591380502419912, + "grad_norm": 0.1761290431022644, + "learning_rate": 2.75069307733568e-05, + "loss": 0.6443, + "step": 2860 + }, + { + "epoch": 0.6593685180917263, + "grad_norm": 0.17915204167366028, + "learning_rate": 2.7473598744838146e-05, + "loss": 0.6526, + "step": 2861 + }, + { + "epoch": 0.6595989859414612, + "grad_norm": 0.170929417014122, + "learning_rate": 2.744027927199486e-05, + "loss": 0.6531, + "step": 2862 + }, + { + "epoch": 0.6598294537911962, + "grad_norm": 0.18236766755580902, + "learning_rate": 2.7406972373398443e-05, + "loss": 0.6508, + "step": 2863 + }, + { + "epoch": 0.6600599216409311, + "grad_norm": 0.16858653724193573, + "learning_rate": 2.737367806761334e-05, + "loss": 0.6535, + "step": 2864 + }, + { + "epoch": 0.6602903894906661, + "grad_norm": 0.1613864302635193, + "learning_rate": 2.7340396373196996e-05, + "loss": 0.6503, + "step": 2865 + }, + { + "epoch": 0.660520857340401, + "grad_norm": 0.17449095845222473, + "learning_rate": 2.7307127308699865e-05, + "loss": 0.6549, + "step": 2866 + }, + { + "epoch": 0.660751325190136, + "grad_norm": 0.16756795346736908, + "learning_rate": 2.7273870892665253e-05, + "loss": 0.6488, + "step": 2867 + }, + { + "epoch": 0.6609817930398709, + "grad_norm": 0.16483251750469208, + "learning_rate": 2.724062714362955e-05, + "loss": 0.6564, + "step": 2868 + }, + { + "epoch": 0.6612122608896059, + "grad_norm": 0.16283339262008667, + "learning_rate": 2.7207396080122028e-05, + "loss": 0.655, + "step": 2869 + }, + { + "epoch": 0.6614427287393408, + "grad_norm": 0.16669294238090515, + "learning_rate": 2.717417772066484e-05, + "loss": 0.6508, + "step": 2870 + }, + { + "epoch": 0.6616731965890759, + "grad_norm": 0.18657532334327698, + "learning_rate": 2.7140972083773124e-05, + "loss": 0.6463, + "step": 2871 + }, + { + "epoch": 0.6619036644388108, + "grad_norm": 0.17012335360050201, + "learning_rate": 2.7107779187954908e-05, + "loss": 0.6552, + "step": 2872 + }, + { + "epoch": 0.6621341322885458, + "grad_norm": 0.17444707453250885, + "learning_rate": 2.7074599051711108e-05, + "loss": 0.6454, + "step": 2873 + }, + { + "epoch": 0.6623646001382807, + "grad_norm": 0.17587541043758392, + "learning_rate": 2.704143169353554e-05, + "loss": 0.6565, + "step": 2874 + }, + { + "epoch": 0.6625950679880157, + "grad_norm": 0.16666634380817413, + "learning_rate": 2.7008277131914916e-05, + "loss": 0.6483, + "step": 2875 + }, + { + "epoch": 0.6628255358377506, + "grad_norm": 0.16995498538017273, + "learning_rate": 2.6975135385328743e-05, + "loss": 0.6459, + "step": 2876 + }, + { + "epoch": 0.6630560036874856, + "grad_norm": 0.17786766588687897, + "learning_rate": 2.6942006472249447e-05, + "loss": 0.6482, + "step": 2877 + }, + { + "epoch": 0.6632864715372205, + "grad_norm": 0.15976811945438385, + "learning_rate": 2.690889041114234e-05, + "loss": 0.6426, + "step": 2878 + }, + { + "epoch": 0.6635169393869556, + "grad_norm": 0.17242498695850372, + "learning_rate": 2.6875787220465463e-05, + "loss": 0.6514, + "step": 2879 + }, + { + "epoch": 0.6637474072366905, + "grad_norm": 0.18254664540290833, + "learning_rate": 2.6842696918669768e-05, + "loss": 0.6541, + "step": 2880 + }, + { + "epoch": 0.6639778750864255, + "grad_norm": 0.18394513428211212, + "learning_rate": 2.6809619524199004e-05, + "loss": 0.6491, + "step": 2881 + }, + { + "epoch": 0.6642083429361604, + "grad_norm": 0.17171119153499603, + "learning_rate": 2.677655505548966e-05, + "loss": 0.6435, + "step": 2882 + }, + { + "epoch": 0.6644388107858954, + "grad_norm": 0.17666159570217133, + "learning_rate": 2.6743503530971138e-05, + "loss": 0.6427, + "step": 2883 + }, + { + "epoch": 0.6646692786356303, + "grad_norm": 0.18384835124015808, + "learning_rate": 2.6710464969065563e-05, + "loss": 0.6508, + "step": 2884 + }, + { + "epoch": 0.6648997464853653, + "grad_norm": 0.16794157028198242, + "learning_rate": 2.6677439388187796e-05, + "loss": 0.6547, + "step": 2885 + }, + { + "epoch": 0.6651302143351002, + "grad_norm": 0.17705510556697845, + "learning_rate": 2.6644426806745526e-05, + "loss": 0.6458, + "step": 2886 + }, + { + "epoch": 0.6653606821848352, + "grad_norm": 0.17769016325473785, + "learning_rate": 2.6611427243139166e-05, + "loss": 0.6518, + "step": 2887 + }, + { + "epoch": 0.6655911500345701, + "grad_norm": 0.16915744543075562, + "learning_rate": 2.6578440715761894e-05, + "loss": 0.65, + "step": 2888 + }, + { + "epoch": 0.6658216178843052, + "grad_norm": 0.17128948867321014, + "learning_rate": 2.65454672429996e-05, + "loss": 0.6476, + "step": 2889 + }, + { + "epoch": 0.6660520857340401, + "grad_norm": 0.16499844193458557, + "learning_rate": 2.6512506843230922e-05, + "loss": 0.661, + "step": 2890 + }, + { + "epoch": 0.6662825535837751, + "grad_norm": 0.1791979968547821, + "learning_rate": 2.6479559534827168e-05, + "loss": 0.6513, + "step": 2891 + }, + { + "epoch": 0.66651302143351, + "grad_norm": 0.16432271897792816, + "learning_rate": 2.6446625336152364e-05, + "loss": 0.6536, + "step": 2892 + }, + { + "epoch": 0.666743489283245, + "grad_norm": 0.17049454152584076, + "learning_rate": 2.641370426556331e-05, + "loss": 0.6504, + "step": 2893 + }, + { + "epoch": 0.6669739571329799, + "grad_norm": 0.17584246397018433, + "learning_rate": 2.6380796341409364e-05, + "loss": 0.6444, + "step": 2894 + }, + { + "epoch": 0.6672044249827149, + "grad_norm": 0.1704198718070984, + "learning_rate": 2.6347901582032623e-05, + "loss": 0.6566, + "step": 2895 + }, + { + "epoch": 0.6674348928324498, + "grad_norm": 0.1673116832971573, + "learning_rate": 2.6315020005767843e-05, + "loss": 0.6508, + "step": 2896 + }, + { + "epoch": 0.6676653606821848, + "grad_norm": 0.16023887693881989, + "learning_rate": 2.628215163094242e-05, + "loss": 0.6522, + "step": 2897 + }, + { + "epoch": 0.6678958285319198, + "grad_norm": 0.16509434580802917, + "learning_rate": 2.6249296475876407e-05, + "loss": 0.6573, + "step": 2898 + }, + { + "epoch": 0.6681262963816548, + "grad_norm": 0.17283135652542114, + "learning_rate": 2.6216454558882486e-05, + "loss": 0.6567, + "step": 2899 + }, + { + "epoch": 0.6683567642313897, + "grad_norm": 0.15373359620571136, + "learning_rate": 2.6183625898265918e-05, + "loss": 0.6509, + "step": 2900 + }, + { + "epoch": 0.6685872320811247, + "grad_norm": 0.17244744300842285, + "learning_rate": 2.6150810512324637e-05, + "loss": 0.6472, + "step": 2901 + }, + { + "epoch": 0.6688176999308596, + "grad_norm": 0.17626379430294037, + "learning_rate": 2.6118008419349148e-05, + "loss": 0.6575, + "step": 2902 + }, + { + "epoch": 0.6690481677805946, + "grad_norm": 0.16538438200950623, + "learning_rate": 2.6085219637622544e-05, + "loss": 0.6507, + "step": 2903 + }, + { + "epoch": 0.6692786356303295, + "grad_norm": 0.18583250045776367, + "learning_rate": 2.605244418542051e-05, + "loss": 0.6585, + "step": 2904 + }, + { + "epoch": 0.6695091034800645, + "grad_norm": 0.17038129270076752, + "learning_rate": 2.601968208101132e-05, + "loss": 0.6591, + "step": 2905 + }, + { + "epoch": 0.6697395713297994, + "grad_norm": 0.17771609127521515, + "learning_rate": 2.5986933342655717e-05, + "loss": 0.6568, + "step": 2906 + }, + { + "epoch": 0.6699700391795345, + "grad_norm": 0.17149491608142853, + "learning_rate": 2.5954197988607133e-05, + "loss": 0.6535, + "step": 2907 + }, + { + "epoch": 0.6702005070292694, + "grad_norm": 0.18247468769550323, + "learning_rate": 2.5921476037111448e-05, + "loss": 0.6533, + "step": 2908 + }, + { + "epoch": 0.6704309748790044, + "grad_norm": 0.16240952908992767, + "learning_rate": 2.5888767506407075e-05, + "loss": 0.6556, + "step": 2909 + }, + { + "epoch": 0.6706614427287393, + "grad_norm": 0.18257564306259155, + "learning_rate": 2.5856072414724974e-05, + "loss": 0.6539, + "step": 2910 + }, + { + "epoch": 0.6708919105784743, + "grad_norm": 0.16340118646621704, + "learning_rate": 2.5823390780288604e-05, + "loss": 0.6605, + "step": 2911 + }, + { + "epoch": 0.6711223784282092, + "grad_norm": 0.18919028341770172, + "learning_rate": 2.5790722621313924e-05, + "loss": 0.6558, + "step": 2912 + }, + { + "epoch": 0.6713528462779442, + "grad_norm": 0.1639140248298645, + "learning_rate": 2.575806795600938e-05, + "loss": 0.6472, + "step": 2913 + }, + { + "epoch": 0.6715833141276792, + "grad_norm": 0.18610194325447083, + "learning_rate": 2.5725426802575925e-05, + "loss": 0.6476, + "step": 2914 + }, + { + "epoch": 0.6718137819774141, + "grad_norm": 0.18196745216846466, + "learning_rate": 2.5692799179206906e-05, + "loss": 0.6472, + "step": 2915 + }, + { + "epoch": 0.6720442498271492, + "grad_norm": 0.1704276204109192, + "learning_rate": 2.5660185104088207e-05, + "loss": 0.6442, + "step": 2916 + }, + { + "epoch": 0.6722747176768841, + "grad_norm": 0.1694510579109192, + "learning_rate": 2.5627584595398125e-05, + "loss": 0.6528, + "step": 2917 + }, + { + "epoch": 0.6725051855266191, + "grad_norm": 0.16790097951889038, + "learning_rate": 2.5594997671307397e-05, + "loss": 0.6555, + "step": 2918 + }, + { + "epoch": 0.672735653376354, + "grad_norm": 0.17335295677185059, + "learning_rate": 2.5562424349979198e-05, + "loss": 0.6553, + "step": 2919 + }, + { + "epoch": 0.672966121226089, + "grad_norm": 0.1660207062959671, + "learning_rate": 2.552986464956913e-05, + "loss": 0.648, + "step": 2920 + }, + { + "epoch": 0.6731965890758239, + "grad_norm": 0.1704881191253662, + "learning_rate": 2.5497318588225137e-05, + "loss": 0.6468, + "step": 2921 + }, + { + "epoch": 0.6734270569255589, + "grad_norm": 0.17709209024906158, + "learning_rate": 2.5464786184087665e-05, + "loss": 0.6567, + "step": 2922 + }, + { + "epoch": 0.6736575247752938, + "grad_norm": 0.16337838768959045, + "learning_rate": 2.5432267455289503e-05, + "loss": 0.6459, + "step": 2923 + }, + { + "epoch": 0.6738879926250289, + "grad_norm": 0.1824062317609787, + "learning_rate": 2.5399762419955764e-05, + "loss": 0.6545, + "step": 2924 + }, + { + "epoch": 0.6741184604747638, + "grad_norm": 0.16532284021377563, + "learning_rate": 2.5367271096203994e-05, + "loss": 0.6471, + "step": 2925 + }, + { + "epoch": 0.6743489283244988, + "grad_norm": 0.18639203906059265, + "learning_rate": 2.5334793502144077e-05, + "loss": 0.6585, + "step": 2926 + }, + { + "epoch": 0.6745793961742337, + "grad_norm": 0.1666010320186615, + "learning_rate": 2.5302329655878244e-05, + "loss": 0.656, + "step": 2927 + }, + { + "epoch": 0.6748098640239687, + "grad_norm": 0.17926305532455444, + "learning_rate": 2.5269879575501072e-05, + "loss": 0.6496, + "step": 2928 + }, + { + "epoch": 0.6750403318737036, + "grad_norm": 0.1780381202697754, + "learning_rate": 2.523744327909947e-05, + "loss": 0.6528, + "step": 2929 + }, + { + "epoch": 0.6752707997234386, + "grad_norm": 0.17732974886894226, + "learning_rate": 2.520502078475261e-05, + "loss": 0.6516, + "step": 2930 + }, + { + "epoch": 0.6755012675731735, + "grad_norm": 0.17732085287570953, + "learning_rate": 2.5172612110532012e-05, + "loss": 0.6591, + "step": 2931 + }, + { + "epoch": 0.6757317354229085, + "grad_norm": 0.18524208664894104, + "learning_rate": 2.5140217274501555e-05, + "loss": 0.648, + "step": 2932 + }, + { + "epoch": 0.6759622032726434, + "grad_norm": 0.18076317012310028, + "learning_rate": 2.510783629471728e-05, + "loss": 0.6565, + "step": 2933 + }, + { + "epoch": 0.6761926711223785, + "grad_norm": 0.1893717348575592, + "learning_rate": 2.5075469189227597e-05, + "loss": 0.6421, + "step": 2934 + }, + { + "epoch": 0.6764231389721134, + "grad_norm": 0.17758077383041382, + "learning_rate": 2.5043115976073167e-05, + "loss": 0.646, + "step": 2935 + }, + { + "epoch": 0.6766536068218484, + "grad_norm": 0.17987452447414398, + "learning_rate": 2.5010776673286834e-05, + "loss": 0.6401, + "step": 2936 + }, + { + "epoch": 0.6768840746715833, + "grad_norm": 0.1678553968667984, + "learning_rate": 2.497845129889381e-05, + "loss": 0.647, + "step": 2937 + }, + { + "epoch": 0.6771145425213183, + "grad_norm": 0.1824767142534256, + "learning_rate": 2.4946139870911482e-05, + "loss": 0.6409, + "step": 2938 + }, + { + "epoch": 0.6773450103710532, + "grad_norm": 0.1831919103860855, + "learning_rate": 2.491384240734943e-05, + "loss": 0.6454, + "step": 2939 + }, + { + "epoch": 0.6775754782207882, + "grad_norm": 0.17052872478961945, + "learning_rate": 2.48815589262095e-05, + "loss": 0.6512, + "step": 2940 + }, + { + "epoch": 0.6778059460705231, + "grad_norm": 0.17829042673110962, + "learning_rate": 2.4849289445485723e-05, + "loss": 0.6466, + "step": 2941 + }, + { + "epoch": 0.6780364139202582, + "grad_norm": 0.170830637216568, + "learning_rate": 2.4817033983164345e-05, + "loss": 0.6609, + "step": 2942 + }, + { + "epoch": 0.6782668817699931, + "grad_norm": 0.16118223965168, + "learning_rate": 2.478479255722378e-05, + "loss": 0.6512, + "step": 2943 + }, + { + "epoch": 0.6784973496197281, + "grad_norm": 0.1625395268201828, + "learning_rate": 2.4752565185634645e-05, + "loss": 0.6502, + "step": 2944 + }, + { + "epoch": 0.678727817469463, + "grad_norm": 0.17522037029266357, + "learning_rate": 2.472035188635967e-05, + "loss": 0.6458, + "step": 2945 + }, + { + "epoch": 0.678958285319198, + "grad_norm": 0.16705302894115448, + "learning_rate": 2.468815267735377e-05, + "loss": 0.6545, + "step": 2946 + }, + { + "epoch": 0.6791887531689329, + "grad_norm": 0.1653483510017395, + "learning_rate": 2.4655967576564064e-05, + "loss": 0.6473, + "step": 2947 + }, + { + "epoch": 0.6794192210186679, + "grad_norm": 0.16879500448703766, + "learning_rate": 2.4623796601929712e-05, + "loss": 0.652, + "step": 2948 + }, + { + "epoch": 0.6796496888684028, + "grad_norm": 0.16743235290050507, + "learning_rate": 2.4591639771382043e-05, + "loss": 0.6524, + "step": 2949 + }, + { + "epoch": 0.6798801567181378, + "grad_norm": 0.17148716747760773, + "learning_rate": 2.4559497102844514e-05, + "loss": 0.6468, + "step": 2950 + }, + { + "epoch": 0.6801106245678727, + "grad_norm": 0.15919731557369232, + "learning_rate": 2.452736861423268e-05, + "loss": 0.6566, + "step": 2951 + }, + { + "epoch": 0.6803410924176078, + "grad_norm": 0.17846500873565674, + "learning_rate": 2.449525432345418e-05, + "loss": 0.6603, + "step": 2952 + }, + { + "epoch": 0.6805715602673427, + "grad_norm": 0.16970974206924438, + "learning_rate": 2.4463154248408777e-05, + "loss": 0.6434, + "step": 2953 + }, + { + "epoch": 0.6808020281170777, + "grad_norm": 0.1619907021522522, + "learning_rate": 2.4431068406988238e-05, + "loss": 0.6526, + "step": 2954 + }, + { + "epoch": 0.6810324959668126, + "grad_norm": 0.1664748191833496, + "learning_rate": 2.439899681707646e-05, + "loss": 0.648, + "step": 2955 + }, + { + "epoch": 0.6812629638165476, + "grad_norm": 0.16397912800312042, + "learning_rate": 2.4366939496549378e-05, + "loss": 0.6482, + "step": 2956 + }, + { + "epoch": 0.6814934316662825, + "grad_norm": 0.16301095485687256, + "learning_rate": 2.4334896463274976e-05, + "loss": 0.6452, + "step": 2957 + }, + { + "epoch": 0.6817238995160175, + "grad_norm": 0.16809646785259247, + "learning_rate": 2.430286773511327e-05, + "loss": 0.6518, + "step": 2958 + }, + { + "epoch": 0.6819543673657524, + "grad_norm": 0.15831275284290314, + "learning_rate": 2.4270853329916304e-05, + "loss": 0.6463, + "step": 2959 + }, + { + "epoch": 0.6821848352154875, + "grad_norm": 0.1629604548215866, + "learning_rate": 2.4238853265528143e-05, + "loss": 0.6586, + "step": 2960 + }, + { + "epoch": 0.6824153030652224, + "grad_norm": 0.1589616984128952, + "learning_rate": 2.4206867559784847e-05, + "loss": 0.6457, + "step": 2961 + }, + { + "epoch": 0.6826457709149574, + "grad_norm": 0.17117534577846527, + "learning_rate": 2.417489623051448e-05, + "loss": 0.6551, + "step": 2962 + }, + { + "epoch": 0.6828762387646923, + "grad_norm": 0.1638975292444229, + "learning_rate": 2.4142939295537126e-05, + "loss": 0.649, + "step": 2963 + }, + { + "epoch": 0.6831067066144273, + "grad_norm": 0.1592676341533661, + "learning_rate": 2.411099677266476e-05, + "loss": 0.6549, + "step": 2964 + }, + { + "epoch": 0.6833371744641622, + "grad_norm": 0.1702510565519333, + "learning_rate": 2.407906867970141e-05, + "loss": 0.644, + "step": 2965 + }, + { + "epoch": 0.6835676423138972, + "grad_norm": 0.17177124321460724, + "learning_rate": 2.404715503444302e-05, + "loss": 0.6515, + "step": 2966 + }, + { + "epoch": 0.6837981101636321, + "grad_norm": 0.17113865911960602, + "learning_rate": 2.4015255854677488e-05, + "loss": 0.6572, + "step": 2967 + }, + { + "epoch": 0.6840285780133671, + "grad_norm": 0.17294318974018097, + "learning_rate": 2.398337115818466e-05, + "loss": 0.6504, + "step": 2968 + }, + { + "epoch": 0.684259045863102, + "grad_norm": 0.16647367179393768, + "learning_rate": 2.3951500962736312e-05, + "loss": 0.6593, + "step": 2969 + }, + { + "epoch": 0.6844895137128371, + "grad_norm": 0.16758325695991516, + "learning_rate": 2.391964528609609e-05, + "loss": 0.6422, + "step": 2970 + }, + { + "epoch": 0.684719981562572, + "grad_norm": 0.18237681686878204, + "learning_rate": 2.388780414601959e-05, + "loss": 0.6479, + "step": 2971 + }, + { + "epoch": 0.684950449412307, + "grad_norm": 0.16424010694026947, + "learning_rate": 2.3855977560254338e-05, + "loss": 0.6441, + "step": 2972 + }, + { + "epoch": 0.685180917262042, + "grad_norm": 0.1714371293783188, + "learning_rate": 2.3824165546539673e-05, + "loss": 0.6534, + "step": 2973 + }, + { + "epoch": 0.6854113851117769, + "grad_norm": 0.17393812537193298, + "learning_rate": 2.3792368122606856e-05, + "loss": 0.65, + "step": 2974 + }, + { + "epoch": 0.6856418529615119, + "grad_norm": 0.18081031739711761, + "learning_rate": 2.3760585306179012e-05, + "loss": 0.6493, + "step": 2975 + }, + { + "epoch": 0.6858723208112468, + "grad_norm": 0.16579000651836395, + "learning_rate": 2.3728817114971118e-05, + "loss": 0.6448, + "step": 2976 + }, + { + "epoch": 0.6861027886609818, + "grad_norm": 0.17078308761119843, + "learning_rate": 2.3697063566689998e-05, + "loss": 0.6526, + "step": 2977 + }, + { + "epoch": 0.6863332565107167, + "grad_norm": 0.16518385708332062, + "learning_rate": 2.3665324679034344e-05, + "loss": 0.6374, + "step": 2978 + }, + { + "epoch": 0.6865637243604518, + "grad_norm": 0.16557030379772186, + "learning_rate": 2.3633600469694606e-05, + "loss": 0.6436, + "step": 2979 + }, + { + "epoch": 0.6867941922101867, + "grad_norm": 0.17224299907684326, + "learning_rate": 2.3601890956353118e-05, + "loss": 0.6588, + "step": 2980 + }, + { + "epoch": 0.6870246600599217, + "grad_norm": 0.17414404451847076, + "learning_rate": 2.3570196156684005e-05, + "loss": 0.6473, + "step": 2981 + }, + { + "epoch": 0.6872551279096566, + "grad_norm": 0.16957977414131165, + "learning_rate": 2.353851608835318e-05, + "loss": 0.6469, + "step": 2982 + }, + { + "epoch": 0.6874855957593916, + "grad_norm": 0.16680394113063812, + "learning_rate": 2.3506850769018363e-05, + "loss": 0.648, + "step": 2983 + }, + { + "epoch": 0.6877160636091265, + "grad_norm": 0.1786261647939682, + "learning_rate": 2.3475200216329052e-05, + "loss": 0.6516, + "step": 2984 + }, + { + "epoch": 0.6879465314588615, + "grad_norm": 0.17838843166828156, + "learning_rate": 2.3443564447926447e-05, + "loss": 0.6538, + "step": 2985 + }, + { + "epoch": 0.6881769993085964, + "grad_norm": 0.16465331614017487, + "learning_rate": 2.3411943481443637e-05, + "loss": 0.6512, + "step": 2986 + }, + { + "epoch": 0.6884074671583315, + "grad_norm": 0.16617000102996826, + "learning_rate": 2.3380337334505374e-05, + "loss": 0.6529, + "step": 2987 + }, + { + "epoch": 0.6886379350080664, + "grad_norm": 0.16464616358280182, + "learning_rate": 2.3348746024728142e-05, + "loss": 0.6582, + "step": 2988 + }, + { + "epoch": 0.6888684028578014, + "grad_norm": 0.1769178956747055, + "learning_rate": 2.3317169569720187e-05, + "loss": 0.6525, + "step": 2989 + }, + { + "epoch": 0.6890988707075363, + "grad_norm": 0.16962438821792603, + "learning_rate": 2.328560798708147e-05, + "loss": 0.65, + "step": 2990 + }, + { + "epoch": 0.6893293385572713, + "grad_norm": 0.16247595846652985, + "learning_rate": 2.3254061294403663e-05, + "loss": 0.6484, + "step": 2991 + }, + { + "epoch": 0.6895598064070062, + "grad_norm": 0.17956919968128204, + "learning_rate": 2.322252950927013e-05, + "loss": 0.6464, + "step": 2992 + }, + { + "epoch": 0.6897902742567412, + "grad_norm": 0.17907127737998962, + "learning_rate": 2.3191012649255956e-05, + "loss": 0.653, + "step": 2993 + }, + { + "epoch": 0.6900207421064761, + "grad_norm": 0.16521626710891724, + "learning_rate": 2.3159510731927842e-05, + "loss": 0.6548, + "step": 2994 + }, + { + "epoch": 0.6902512099562111, + "grad_norm": 0.18841508030891418, + "learning_rate": 2.3128023774844194e-05, + "loss": 0.6502, + "step": 2995 + }, + { + "epoch": 0.690481677805946, + "grad_norm": 0.17101146280765533, + "learning_rate": 2.3096551795555155e-05, + "loss": 0.6417, + "step": 2996 + }, + { + "epoch": 0.6907121456556811, + "grad_norm": 0.19493061304092407, + "learning_rate": 2.3065094811602383e-05, + "loss": 0.6547, + "step": 2997 + }, + { + "epoch": 0.690942613505416, + "grad_norm": 0.17434021830558777, + "learning_rate": 2.3033652840519272e-05, + "loss": 0.6511, + "step": 2998 + }, + { + "epoch": 0.691173081355151, + "grad_norm": 0.17021694779396057, + "learning_rate": 2.300222589983084e-05, + "loss": 0.6542, + "step": 2999 + }, + { + "epoch": 0.6914035492048859, + "grad_norm": 0.1962118148803711, + "learning_rate": 2.297081400705365e-05, + "loss": 0.6532, + "step": 3000 + }, + { + "epoch": 0.6916340170546209, + "grad_norm": 0.17823426425457, + "learning_rate": 2.293941717969599e-05, + "loss": 0.6437, + "step": 3001 + }, + { + "epoch": 0.6918644849043558, + "grad_norm": 0.1798575520515442, + "learning_rate": 2.2908035435257706e-05, + "loss": 0.6528, + "step": 3002 + }, + { + "epoch": 0.6920949527540908, + "grad_norm": 0.1800197809934616, + "learning_rate": 2.2876668791230193e-05, + "loss": 0.6527, + "step": 3003 + }, + { + "epoch": 0.6923254206038257, + "grad_norm": 0.17531868815422058, + "learning_rate": 2.2845317265096474e-05, + "loss": 0.651, + "step": 3004 + }, + { + "epoch": 0.6925558884535608, + "grad_norm": 0.19876468181610107, + "learning_rate": 2.2813980874331146e-05, + "loss": 0.6485, + "step": 3005 + }, + { + "epoch": 0.6927863563032957, + "grad_norm": 0.16582679748535156, + "learning_rate": 2.2782659636400355e-05, + "loss": 0.6532, + "step": 3006 + }, + { + "epoch": 0.6930168241530307, + "grad_norm": 0.18158891797065735, + "learning_rate": 2.2751353568761813e-05, + "loss": 0.6509, + "step": 3007 + }, + { + "epoch": 0.6932472920027656, + "grad_norm": 0.17845842242240906, + "learning_rate": 2.272006268886479e-05, + "loss": 0.6502, + "step": 3008 + }, + { + "epoch": 0.6934777598525006, + "grad_norm": 0.17071473598480225, + "learning_rate": 2.2688787014150027e-05, + "loss": 0.6491, + "step": 3009 + }, + { + "epoch": 0.6937082277022355, + "grad_norm": 0.17339655756950378, + "learning_rate": 2.2657526562049836e-05, + "loss": 0.6563, + "step": 3010 + }, + { + "epoch": 0.6939386955519705, + "grad_norm": 0.16644999384880066, + "learning_rate": 2.2626281349988103e-05, + "loss": 0.6527, + "step": 3011 + }, + { + "epoch": 0.6941691634017054, + "grad_norm": 0.16771528124809265, + "learning_rate": 2.25950513953801e-05, + "loss": 0.6455, + "step": 3012 + }, + { + "epoch": 0.6943996312514404, + "grad_norm": 0.17318162322044373, + "learning_rate": 2.2563836715632676e-05, + "loss": 0.6634, + "step": 3013 + }, + { + "epoch": 0.6946300991011753, + "grad_norm": 0.15834879875183105, + "learning_rate": 2.253263732814414e-05, + "loss": 0.6532, + "step": 3014 + }, + { + "epoch": 0.6948605669509104, + "grad_norm": 0.1758836805820465, + "learning_rate": 2.2501453250304283e-05, + "loss": 0.6399, + "step": 3015 + }, + { + "epoch": 0.6950910348006453, + "grad_norm": 0.1595585197210312, + "learning_rate": 2.2470284499494364e-05, + "loss": 0.6525, + "step": 3016 + }, + { + "epoch": 0.6953215026503803, + "grad_norm": 0.1615588665008545, + "learning_rate": 2.2439131093087113e-05, + "loss": 0.643, + "step": 3017 + }, + { + "epoch": 0.6955519705001152, + "grad_norm": 0.16813978552818298, + "learning_rate": 2.2407993048446656e-05, + "loss": 0.6448, + "step": 3018 + }, + { + "epoch": 0.6957824383498502, + "grad_norm": 0.16986919939517975, + "learning_rate": 2.2376870382928607e-05, + "loss": 0.6473, + "step": 3019 + }, + { + "epoch": 0.6960129061995851, + "grad_norm": 0.17389391362667084, + "learning_rate": 2.2345763113879996e-05, + "loss": 0.6468, + "step": 3020 + }, + { + "epoch": 0.6962433740493201, + "grad_norm": 0.17148233950138092, + "learning_rate": 2.2314671258639263e-05, + "loss": 0.6585, + "step": 3021 + }, + { + "epoch": 0.696473841899055, + "grad_norm": 0.18123234808444977, + "learning_rate": 2.228359483453627e-05, + "loss": 0.6487, + "step": 3022 + }, + { + "epoch": 0.69670430974879, + "grad_norm": 0.16465289890766144, + "learning_rate": 2.2252533858892277e-05, + "loss": 0.6497, + "step": 3023 + }, + { + "epoch": 0.696934777598525, + "grad_norm": 0.1766558438539505, + "learning_rate": 2.2221488349019903e-05, + "loss": 0.6505, + "step": 3024 + }, + { + "epoch": 0.69716524544826, + "grad_norm": 0.16725115478038788, + "learning_rate": 2.2190458322223163e-05, + "loss": 0.6475, + "step": 3025 + }, + { + "epoch": 0.6973957132979949, + "grad_norm": 0.17288514971733093, + "learning_rate": 2.2159443795797497e-05, + "loss": 0.6519, + "step": 3026 + }, + { + "epoch": 0.6976261811477299, + "grad_norm": 0.165741428732872, + "learning_rate": 2.2128444787029618e-05, + "loss": 0.6421, + "step": 3027 + }, + { + "epoch": 0.6978566489974648, + "grad_norm": 0.17508132755756378, + "learning_rate": 2.209746131319764e-05, + "loss": 0.6495, + "step": 3028 + }, + { + "epoch": 0.6980871168471998, + "grad_norm": 0.1722734570503235, + "learning_rate": 2.2066493391570996e-05, + "loss": 0.6473, + "step": 3029 + }, + { + "epoch": 0.6983175846969347, + "grad_norm": 0.16845974326133728, + "learning_rate": 2.2035541039410483e-05, + "loss": 0.6449, + "step": 3030 + }, + { + "epoch": 0.6985480525466697, + "grad_norm": 0.18813900649547577, + "learning_rate": 2.200460427396819e-05, + "loss": 0.647, + "step": 3031 + }, + { + "epoch": 0.6987785203964048, + "grad_norm": 0.18029405176639557, + "learning_rate": 2.1973683112487544e-05, + "loss": 0.6461, + "step": 3032 + }, + { + "epoch": 0.6990089882461397, + "grad_norm": 0.18963518738746643, + "learning_rate": 2.194277757220322e-05, + "loss": 0.6387, + "step": 3033 + }, + { + "epoch": 0.6992394560958747, + "grad_norm": 0.17914853990077972, + "learning_rate": 2.191188767034125e-05, + "loss": 0.642, + "step": 3034 + }, + { + "epoch": 0.6994699239456096, + "grad_norm": 0.20282970368862152, + "learning_rate": 2.1881013424118922e-05, + "loss": 0.6574, + "step": 3035 + }, + { + "epoch": 0.6997003917953446, + "grad_norm": 0.17879828810691833, + "learning_rate": 2.18501548507448e-05, + "loss": 0.6485, + "step": 3036 + }, + { + "epoch": 0.6999308596450795, + "grad_norm": 0.18236006796360016, + "learning_rate": 2.1819311967418697e-05, + "loss": 0.6525, + "step": 3037 + }, + { + "epoch": 0.7001613274948145, + "grad_norm": 0.18954585492610931, + "learning_rate": 2.1788484791331738e-05, + "loss": 0.6411, + "step": 3038 + }, + { + "epoch": 0.7003917953445494, + "grad_norm": 0.19611108303070068, + "learning_rate": 2.1757673339666178e-05, + "loss": 0.6474, + "step": 3039 + }, + { + "epoch": 0.7006222631942844, + "grad_norm": 0.1711956113576889, + "learning_rate": 2.172687762959565e-05, + "loss": 0.6458, + "step": 3040 + }, + { + "epoch": 0.7008527310440194, + "grad_norm": 0.1677607148885727, + "learning_rate": 2.169609767828493e-05, + "loss": 0.6419, + "step": 3041 + }, + { + "epoch": 0.7010831988937544, + "grad_norm": 0.19762775301933289, + "learning_rate": 2.1665333502889994e-05, + "loss": 0.6508, + "step": 3042 + }, + { + "epoch": 0.7013136667434893, + "grad_norm": 0.1680401712656021, + "learning_rate": 2.1634585120558078e-05, + "loss": 0.6456, + "step": 3043 + }, + { + "epoch": 0.7015441345932243, + "grad_norm": 0.1744004786014557, + "learning_rate": 2.1603852548427582e-05, + "loss": 0.6496, + "step": 3044 + }, + { + "epoch": 0.7017746024429592, + "grad_norm": 0.179367795586586, + "learning_rate": 2.1573135803628114e-05, + "loss": 0.6438, + "step": 3045 + }, + { + "epoch": 0.7020050702926942, + "grad_norm": 0.16695451736450195, + "learning_rate": 2.154243490328044e-05, + "loss": 0.6508, + "step": 3046 + }, + { + "epoch": 0.7022355381424291, + "grad_norm": 0.18172797560691833, + "learning_rate": 2.1511749864496534e-05, + "loss": 0.6414, + "step": 3047 + }, + { + "epoch": 0.7024660059921641, + "grad_norm": 0.17104046046733856, + "learning_rate": 2.148108070437945e-05, + "loss": 0.649, + "step": 3048 + }, + { + "epoch": 0.702696473841899, + "grad_norm": 0.17437295615673065, + "learning_rate": 2.1450427440023456e-05, + "loss": 0.6541, + "step": 3049 + }, + { + "epoch": 0.702926941691634, + "grad_norm": 0.1672581136226654, + "learning_rate": 2.1419790088513998e-05, + "loss": 0.6469, + "step": 3050 + }, + { + "epoch": 0.703157409541369, + "grad_norm": 0.17412829399108887, + "learning_rate": 2.138916866692754e-05, + "loss": 0.6396, + "step": 3051 + }, + { + "epoch": 0.703387877391104, + "grad_norm": 0.1715768724679947, + "learning_rate": 2.1358563192331747e-05, + "loss": 0.6532, + "step": 3052 + }, + { + "epoch": 0.7036183452408389, + "grad_norm": 0.16486719250679016, + "learning_rate": 2.1327973681785397e-05, + "loss": 0.653, + "step": 3053 + }, + { + "epoch": 0.7038488130905739, + "grad_norm": 0.1680760681629181, + "learning_rate": 2.1297400152338286e-05, + "loss": 0.6564, + "step": 3054 + }, + { + "epoch": 0.7040792809403088, + "grad_norm": 0.16449017822742462, + "learning_rate": 2.1266842621031434e-05, + "loss": 0.6408, + "step": 3055 + }, + { + "epoch": 0.7043097487900438, + "grad_norm": 0.1720770299434662, + "learning_rate": 2.1236301104896866e-05, + "loss": 0.6612, + "step": 3056 + }, + { + "epoch": 0.7045402166397787, + "grad_norm": 0.15701450407505035, + "learning_rate": 2.1205775620957652e-05, + "loss": 0.6421, + "step": 3057 + }, + { + "epoch": 0.7047706844895137, + "grad_norm": 0.17185088992118835, + "learning_rate": 2.1175266186227987e-05, + "loss": 0.6487, + "step": 3058 + }, + { + "epoch": 0.7050011523392486, + "grad_norm": 0.18234078586101532, + "learning_rate": 2.1144772817713103e-05, + "loss": 0.6472, + "step": 3059 + }, + { + "epoch": 0.7052316201889837, + "grad_norm": 0.16597682237625122, + "learning_rate": 2.1114295532409263e-05, + "loss": 0.6511, + "step": 3060 + }, + { + "epoch": 0.7054620880387186, + "grad_norm": 0.17775195837020874, + "learning_rate": 2.1083834347303772e-05, + "loss": 0.6451, + "step": 3061 + }, + { + "epoch": 0.7056925558884536, + "grad_norm": 0.1636880785226822, + "learning_rate": 2.1053389279374987e-05, + "loss": 0.6505, + "step": 3062 + }, + { + "epoch": 0.7059230237381885, + "grad_norm": 0.16783006489276886, + "learning_rate": 2.1022960345592223e-05, + "loss": 0.6394, + "step": 3063 + }, + { + "epoch": 0.7061534915879235, + "grad_norm": 0.17598304152488708, + "learning_rate": 2.0992547562915838e-05, + "loss": 0.6499, + "step": 3064 + }, + { + "epoch": 0.7063839594376584, + "grad_norm": 0.16829919815063477, + "learning_rate": 2.096215094829723e-05, + "loss": 0.6499, + "step": 3065 + }, + { + "epoch": 0.7066144272873934, + "grad_norm": 0.1717023104429245, + "learning_rate": 2.0931770518678707e-05, + "loss": 0.6479, + "step": 3066 + }, + { + "epoch": 0.7068448951371283, + "grad_norm": 0.17776085436344147, + "learning_rate": 2.0901406290993598e-05, + "loss": 0.6604, + "step": 3067 + }, + { + "epoch": 0.7070753629868634, + "grad_norm": 0.17955102026462555, + "learning_rate": 2.087105828216619e-05, + "loss": 0.6479, + "step": 3068 + }, + { + "epoch": 0.7073058308365983, + "grad_norm": 0.17219555377960205, + "learning_rate": 2.0840726509111748e-05, + "loss": 0.6486, + "step": 3069 + }, + { + "epoch": 0.7075362986863333, + "grad_norm": 0.16756534576416016, + "learning_rate": 2.081041098873646e-05, + "loss": 0.636, + "step": 3070 + }, + { + "epoch": 0.7077667665360682, + "grad_norm": 0.17784403264522552, + "learning_rate": 2.0780111737937497e-05, + "loss": 0.6599, + "step": 3071 + }, + { + "epoch": 0.7079972343858032, + "grad_norm": 0.16979734599590302, + "learning_rate": 2.0749828773602898e-05, + "loss": 0.6566, + "step": 3072 + }, + { + "epoch": 0.7082277022355381, + "grad_norm": 0.18243227899074554, + "learning_rate": 2.0719562112611675e-05, + "loss": 0.6452, + "step": 3073 + }, + { + "epoch": 0.7084581700852731, + "grad_norm": 0.16543544828891754, + "learning_rate": 2.0689311771833737e-05, + "loss": 0.6489, + "step": 3074 + }, + { + "epoch": 0.708688637935008, + "grad_norm": 0.17076393961906433, + "learning_rate": 2.0659077768129898e-05, + "loss": 0.6472, + "step": 3075 + }, + { + "epoch": 0.708919105784743, + "grad_norm": 0.17454828321933746, + "learning_rate": 2.0628860118351874e-05, + "loss": 0.6464, + "step": 3076 + }, + { + "epoch": 0.709149573634478, + "grad_norm": 0.16081586480140686, + "learning_rate": 2.0598658839342266e-05, + "loss": 0.6488, + "step": 3077 + }, + { + "epoch": 0.709380041484213, + "grad_norm": 0.17376914620399475, + "learning_rate": 2.0568473947934498e-05, + "loss": 0.6416, + "step": 3078 + }, + { + "epoch": 0.7096105093339479, + "grad_norm": 0.17064812779426575, + "learning_rate": 2.0538305460952945e-05, + "loss": 0.6464, + "step": 3079 + }, + { + "epoch": 0.7098409771836829, + "grad_norm": 0.18025130033493042, + "learning_rate": 2.050815339521281e-05, + "loss": 0.6501, + "step": 3080 + }, + { + "epoch": 0.7100714450334178, + "grad_norm": 0.16442714631557465, + "learning_rate": 2.0478017767520087e-05, + "loss": 0.647, + "step": 3081 + }, + { + "epoch": 0.7103019128831528, + "grad_norm": 0.1766822636127472, + "learning_rate": 2.0447898594671667e-05, + "loss": 0.6456, + "step": 3082 + }, + { + "epoch": 0.7105323807328877, + "grad_norm": 0.17419351637363434, + "learning_rate": 2.0417795893455265e-05, + "loss": 0.6503, + "step": 3083 + }, + { + "epoch": 0.7107628485826227, + "grad_norm": 0.1673542559146881, + "learning_rate": 2.0387709680649397e-05, + "loss": 0.6526, + "step": 3084 + }, + { + "epoch": 0.7109933164323576, + "grad_norm": 0.1788138598203659, + "learning_rate": 2.0357639973023396e-05, + "loss": 0.6441, + "step": 3085 + }, + { + "epoch": 0.7112237842820927, + "grad_norm": 0.16873040795326233, + "learning_rate": 2.032758678733741e-05, + "loss": 0.6467, + "step": 3086 + }, + { + "epoch": 0.7114542521318276, + "grad_norm": 0.17903602123260498, + "learning_rate": 2.0297550140342338e-05, + "loss": 0.647, + "step": 3087 + }, + { + "epoch": 0.7116847199815626, + "grad_norm": 0.15698370337486267, + "learning_rate": 2.0267530048779896e-05, + "loss": 0.6465, + "step": 3088 + }, + { + "epoch": 0.7119151878312975, + "grad_norm": 0.15955950319766998, + "learning_rate": 2.023752652938256e-05, + "loss": 0.6401, + "step": 3089 + }, + { + "epoch": 0.7121456556810325, + "grad_norm": 0.17143939435482025, + "learning_rate": 2.020753959887358e-05, + "loss": 0.6558, + "step": 3090 + }, + { + "epoch": 0.7123761235307675, + "grad_norm": 0.16940918564796448, + "learning_rate": 2.0177569273966945e-05, + "loss": 0.6542, + "step": 3091 + }, + { + "epoch": 0.7126065913805024, + "grad_norm": 0.1974901705980301, + "learning_rate": 2.01476155713674e-05, + "loss": 0.6397, + "step": 3092 + }, + { + "epoch": 0.7128370592302374, + "grad_norm": 0.16214625537395477, + "learning_rate": 2.0117678507770416e-05, + "loss": 0.6492, + "step": 3093 + }, + { + "epoch": 0.7130675270799723, + "grad_norm": 0.19105158746242523, + "learning_rate": 2.0087758099862192e-05, + "loss": 0.6497, + "step": 3094 + }, + { + "epoch": 0.7132979949297074, + "grad_norm": 0.17240196466445923, + "learning_rate": 2.0057854364319646e-05, + "loss": 0.6336, + "step": 3095 + }, + { + "epoch": 0.7135284627794423, + "grad_norm": 0.17806990444660187, + "learning_rate": 2.0027967317810426e-05, + "loss": 0.6529, + "step": 3096 + }, + { + "epoch": 0.7137589306291773, + "grad_norm": 0.1702287495136261, + "learning_rate": 1.9998096976992812e-05, + "loss": 0.6559, + "step": 3097 + }, + { + "epoch": 0.7139893984789122, + "grad_norm": 0.17224186658859253, + "learning_rate": 1.9968243358515837e-05, + "loss": 0.639, + "step": 3098 + }, + { + "epoch": 0.7142198663286472, + "grad_norm": 0.18311919271945953, + "learning_rate": 1.9938406479019183e-05, + "loss": 0.6348, + "step": 3099 + }, + { + "epoch": 0.7144503341783821, + "grad_norm": 0.1758183240890503, + "learning_rate": 1.9908586355133223e-05, + "loss": 0.6486, + "step": 3100 + }, + { + "epoch": 0.7146808020281171, + "grad_norm": 0.1789383888244629, + "learning_rate": 1.9878783003478975e-05, + "loss": 0.649, + "step": 3101 + }, + { + "epoch": 0.714911269877852, + "grad_norm": 0.18306902050971985, + "learning_rate": 1.9848996440668123e-05, + "loss": 0.6387, + "step": 3102 + }, + { + "epoch": 0.715141737727587, + "grad_norm": 0.1843525767326355, + "learning_rate": 1.981922668330293e-05, + "loss": 0.6538, + "step": 3103 + }, + { + "epoch": 0.715372205577322, + "grad_norm": 0.16771547496318817, + "learning_rate": 1.9789473747976412e-05, + "loss": 0.6386, + "step": 3104 + }, + { + "epoch": 0.715602673427057, + "grad_norm": 0.19470912218093872, + "learning_rate": 1.975973765127212e-05, + "loss": 0.6467, + "step": 3105 + }, + { + "epoch": 0.7158331412767919, + "grad_norm": 0.16712939739227295, + "learning_rate": 1.9730018409764218e-05, + "loss": 0.646, + "step": 3106 + }, + { + "epoch": 0.7160636091265269, + "grad_norm": 0.18347413837909698, + "learning_rate": 1.9700316040017515e-05, + "loss": 0.6378, + "step": 3107 + }, + { + "epoch": 0.7162940769762618, + "grad_norm": 0.18481610715389252, + "learning_rate": 1.967063055858739e-05, + "loss": 0.6492, + "step": 3108 + }, + { + "epoch": 0.7165245448259968, + "grad_norm": 0.15738435089588165, + "learning_rate": 1.9640961982019825e-05, + "loss": 0.6417, + "step": 3109 + }, + { + "epoch": 0.7167550126757317, + "grad_norm": 0.17788545787334442, + "learning_rate": 1.9611310326851373e-05, + "loss": 0.6416, + "step": 3110 + }, + { + "epoch": 0.7169854805254667, + "grad_norm": 0.1677306592464447, + "learning_rate": 1.9581675609609173e-05, + "loss": 0.6449, + "step": 3111 + }, + { + "epoch": 0.7172159483752016, + "grad_norm": 0.17086228728294373, + "learning_rate": 1.9552057846810866e-05, + "loss": 0.6473, + "step": 3112 + }, + { + "epoch": 0.7174464162249367, + "grad_norm": 0.1743098348379135, + "learning_rate": 1.9522457054964683e-05, + "loss": 0.6488, + "step": 3113 + }, + { + "epoch": 0.7176768840746716, + "grad_norm": 0.1679782271385193, + "learning_rate": 1.949287325056945e-05, + "loss": 0.6525, + "step": 3114 + }, + { + "epoch": 0.7179073519244066, + "grad_norm": 0.15719559788703918, + "learning_rate": 1.9463306450114416e-05, + "loss": 0.6499, + "step": 3115 + }, + { + "epoch": 0.7181378197741415, + "grad_norm": 0.16834014654159546, + "learning_rate": 1.9433756670079423e-05, + "loss": 0.6547, + "step": 3116 + }, + { + "epoch": 0.7183682876238765, + "grad_norm": 0.17471790313720703, + "learning_rate": 1.9404223926934828e-05, + "loss": 0.6499, + "step": 3117 + }, + { + "epoch": 0.7185987554736114, + "grad_norm": 0.16939441859722137, + "learning_rate": 1.9374708237141413e-05, + "loss": 0.6544, + "step": 3118 + }, + { + "epoch": 0.7188292233233464, + "grad_norm": 0.17127317190170288, + "learning_rate": 1.9345209617150577e-05, + "loss": 0.6403, + "step": 3119 + }, + { + "epoch": 0.7190596911730813, + "grad_norm": 0.16861754655838013, + "learning_rate": 1.9315728083404145e-05, + "loss": 0.6465, + "step": 3120 + }, + { + "epoch": 0.7192901590228163, + "grad_norm": 0.17622019350528717, + "learning_rate": 1.9286263652334368e-05, + "loss": 0.6353, + "step": 3121 + }, + { + "epoch": 0.7195206268725512, + "grad_norm": 0.16617770493030548, + "learning_rate": 1.925681634036404e-05, + "loss": 0.6502, + "step": 3122 + }, + { + "epoch": 0.7197510947222863, + "grad_norm": 0.16550083458423615, + "learning_rate": 1.922738616390639e-05, + "loss": 0.6483, + "step": 3123 + }, + { + "epoch": 0.7199815625720212, + "grad_norm": 0.15915584564208984, + "learning_rate": 1.9197973139365083e-05, + "loss": 0.6441, + "step": 3124 + }, + { + "epoch": 0.7202120304217562, + "grad_norm": 0.16623058915138245, + "learning_rate": 1.9168577283134232e-05, + "loss": 0.6493, + "step": 3125 + }, + { + "epoch": 0.7204424982714911, + "grad_norm": 0.16962306201457977, + "learning_rate": 1.9139198611598404e-05, + "loss": 0.6456, + "step": 3126 + }, + { + "epoch": 0.7206729661212261, + "grad_norm": 0.1750573068857193, + "learning_rate": 1.910983714113253e-05, + "loss": 0.644, + "step": 3127 + }, + { + "epoch": 0.720903433970961, + "grad_norm": 0.17163607478141785, + "learning_rate": 1.9080492888101993e-05, + "loss": 0.641, + "step": 3128 + }, + { + "epoch": 0.721133901820696, + "grad_norm": 0.6170444488525391, + "learning_rate": 1.9051165868862615e-05, + "loss": 0.6669, + "step": 3129 + }, + { + "epoch": 0.7213643696704309, + "grad_norm": 0.17929589748382568, + "learning_rate": 1.9021856099760533e-05, + "loss": 0.6479, + "step": 3130 + }, + { + "epoch": 0.721594837520166, + "grad_norm": 0.16149266064167023, + "learning_rate": 1.8992563597132323e-05, + "loss": 0.6498, + "step": 3131 + }, + { + "epoch": 0.7218253053699009, + "grad_norm": 0.16591522097587585, + "learning_rate": 1.8963288377304916e-05, + "loss": 0.6493, + "step": 3132 + }, + { + "epoch": 0.7220557732196359, + "grad_norm": 0.1631591022014618, + "learning_rate": 1.8934030456595625e-05, + "loss": 0.6517, + "step": 3133 + }, + { + "epoch": 0.7222862410693708, + "grad_norm": 0.1634044647216797, + "learning_rate": 1.890478985131211e-05, + "loss": 0.6413, + "step": 3134 + }, + { + "epoch": 0.7225167089191058, + "grad_norm": 0.1616399884223938, + "learning_rate": 1.88755665777524e-05, + "loss": 0.6432, + "step": 3135 + }, + { + "epoch": 0.7227471767688407, + "grad_norm": 0.16510508954524994, + "learning_rate": 1.8846360652204816e-05, + "loss": 0.6428, + "step": 3136 + }, + { + "epoch": 0.7229776446185757, + "grad_norm": 0.15748094022274017, + "learning_rate": 1.881717209094805e-05, + "loss": 0.6463, + "step": 3137 + }, + { + "epoch": 0.7232081124683106, + "grad_norm": 0.17742668092250824, + "learning_rate": 1.8788000910251103e-05, + "loss": 0.6451, + "step": 3138 + }, + { + "epoch": 0.7234385803180456, + "grad_norm": 0.16781218349933624, + "learning_rate": 1.8758847126373303e-05, + "loss": 0.6499, + "step": 3139 + }, + { + "epoch": 0.7236690481677805, + "grad_norm": 0.1616116166114807, + "learning_rate": 1.8729710755564257e-05, + "loss": 0.6435, + "step": 3140 + }, + { + "epoch": 0.7238995160175156, + "grad_norm": 0.1795063465833664, + "learning_rate": 1.8700591814063905e-05, + "loss": 0.6404, + "step": 3141 + }, + { + "epoch": 0.7241299838672505, + "grad_norm": 0.16432838141918182, + "learning_rate": 1.867149031810241e-05, + "loss": 0.641, + "step": 3142 + }, + { + "epoch": 0.7243604517169855, + "grad_norm": 0.1743660271167755, + "learning_rate": 1.864240628390024e-05, + "loss": 0.6405, + "step": 3143 + }, + { + "epoch": 0.7245909195667204, + "grad_norm": 0.17047765851020813, + "learning_rate": 1.8613339727668194e-05, + "loss": 0.6471, + "step": 3144 + }, + { + "epoch": 0.7248213874164554, + "grad_norm": 0.15893389284610748, + "learning_rate": 1.8584290665607228e-05, + "loss": 0.6404, + "step": 3145 + }, + { + "epoch": 0.7250518552661903, + "grad_norm": 0.20478779077529907, + "learning_rate": 1.8555259113908597e-05, + "loss": 0.6507, + "step": 3146 + }, + { + "epoch": 0.7252823231159253, + "grad_norm": 0.17662855982780457, + "learning_rate": 1.85262450887538e-05, + "loss": 0.6537, + "step": 3147 + }, + { + "epoch": 0.7255127909656602, + "grad_norm": 0.17941661179065704, + "learning_rate": 1.849724860631456e-05, + "loss": 0.6383, + "step": 3148 + }, + { + "epoch": 0.7257432588153953, + "grad_norm": 0.15811479091644287, + "learning_rate": 1.846826968275281e-05, + "loss": 0.6488, + "step": 3149 + }, + { + "epoch": 0.7259737266651302, + "grad_norm": 0.1792788952589035, + "learning_rate": 1.843930833422073e-05, + "loss": 0.6522, + "step": 3150 + }, + { + "epoch": 0.7262041945148652, + "grad_norm": 0.16518691182136536, + "learning_rate": 1.8410364576860646e-05, + "loss": 0.6368, + "step": 3151 + }, + { + "epoch": 0.7264346623646002, + "grad_norm": 0.16625241935253143, + "learning_rate": 1.838143842680513e-05, + "loss": 0.6339, + "step": 3152 + }, + { + "epoch": 0.7266651302143351, + "grad_norm": 0.17410995066165924, + "learning_rate": 1.8352529900176923e-05, + "loss": 0.6421, + "step": 3153 + }, + { + "epoch": 0.7268955980640701, + "grad_norm": 0.16226623952388763, + "learning_rate": 1.832363901308895e-05, + "loss": 0.6515, + "step": 3154 + }, + { + "epoch": 0.727126065913805, + "grad_norm": 0.16737419366836548, + "learning_rate": 1.8294765781644285e-05, + "loss": 0.639, + "step": 3155 + }, + { + "epoch": 0.72735653376354, + "grad_norm": 0.1635529100894928, + "learning_rate": 1.8265910221936206e-05, + "loss": 0.6342, + "step": 3156 + }, + { + "epoch": 0.7275870016132749, + "grad_norm": 0.17363202571868896, + "learning_rate": 1.823707235004805e-05, + "loss": 0.6494, + "step": 3157 + }, + { + "epoch": 0.72781746946301, + "grad_norm": 0.1653600037097931, + "learning_rate": 1.8208252182053403e-05, + "loss": 0.6471, + "step": 3158 + }, + { + "epoch": 0.7280479373127449, + "grad_norm": 0.1543000489473343, + "learning_rate": 1.8179449734015948e-05, + "loss": 0.6399, + "step": 3159 + }, + { + "epoch": 0.7282784051624799, + "grad_norm": 0.1642274111509323, + "learning_rate": 1.8150665021989426e-05, + "loss": 0.6496, + "step": 3160 + }, + { + "epoch": 0.7285088730122148, + "grad_norm": 0.16651754081249237, + "learning_rate": 1.812189806201778e-05, + "loss": 0.6451, + "step": 3161 + }, + { + "epoch": 0.7287393408619498, + "grad_norm": 0.1647575944662094, + "learning_rate": 1.8093148870135e-05, + "loss": 0.6464, + "step": 3162 + }, + { + "epoch": 0.7289698087116847, + "grad_norm": 0.16285358369350433, + "learning_rate": 1.8064417462365226e-05, + "loss": 0.6495, + "step": 3163 + }, + { + "epoch": 0.7292002765614197, + "grad_norm": 0.18453556299209595, + "learning_rate": 1.8035703854722623e-05, + "loss": 0.6457, + "step": 3164 + }, + { + "epoch": 0.7294307444111546, + "grad_norm": 0.16734904050827026, + "learning_rate": 1.800700806321151e-05, + "loss": 0.6464, + "step": 3165 + }, + { + "epoch": 0.7296612122608896, + "grad_norm": 0.167644202709198, + "learning_rate": 1.7978330103826184e-05, + "loss": 0.6494, + "step": 3166 + }, + { + "epoch": 0.7298916801106246, + "grad_norm": 0.17859847843647003, + "learning_rate": 1.7949669992551053e-05, + "loss": 0.6481, + "step": 3167 + }, + { + "epoch": 0.7301221479603596, + "grad_norm": 0.16591614484786987, + "learning_rate": 1.792102774536063e-05, + "loss": 0.6398, + "step": 3168 + }, + { + "epoch": 0.7303526158100945, + "grad_norm": 0.15973828732967377, + "learning_rate": 1.7892403378219364e-05, + "loss": 0.6475, + "step": 3169 + }, + { + "epoch": 0.7305830836598295, + "grad_norm": 0.16776901483535767, + "learning_rate": 1.786379690708181e-05, + "loss": 0.6442, + "step": 3170 + }, + { + "epoch": 0.7308135515095644, + "grad_norm": 0.15858832001686096, + "learning_rate": 1.7835208347892535e-05, + "loss": 0.6528, + "step": 3171 + }, + { + "epoch": 0.7310440193592994, + "grad_norm": 0.16048070788383484, + "learning_rate": 1.7806637716586073e-05, + "loss": 0.6393, + "step": 3172 + }, + { + "epoch": 0.7312744872090343, + "grad_norm": 0.1663476973772049, + "learning_rate": 1.777808502908706e-05, + "loss": 0.6438, + "step": 3173 + }, + { + "epoch": 0.7315049550587693, + "grad_norm": 0.1595594882965088, + "learning_rate": 1.7749550301310074e-05, + "loss": 0.6347, + "step": 3174 + }, + { + "epoch": 0.7317354229085042, + "grad_norm": 0.16028767824172974, + "learning_rate": 1.7721033549159655e-05, + "loss": 0.6495, + "step": 3175 + }, + { + "epoch": 0.7319658907582393, + "grad_norm": 0.16454143822193146, + "learning_rate": 1.7692534788530374e-05, + "loss": 0.653, + "step": 3176 + }, + { + "epoch": 0.7321963586079742, + "grad_norm": 0.16430748999118805, + "learning_rate": 1.7664054035306756e-05, + "loss": 0.6383, + "step": 3177 + }, + { + "epoch": 0.7324268264577092, + "grad_norm": 0.16365505754947662, + "learning_rate": 1.7635591305363292e-05, + "loss": 0.6416, + "step": 3178 + }, + { + "epoch": 0.7326572943074441, + "grad_norm": 0.1731487661600113, + "learning_rate": 1.7607146614564418e-05, + "loss": 0.6551, + "step": 3179 + }, + { + "epoch": 0.7328877621571791, + "grad_norm": 0.1614767163991928, + "learning_rate": 1.7578719978764545e-05, + "loss": 0.6446, + "step": 3180 + }, + { + "epoch": 0.733118230006914, + "grad_norm": 0.16245704889297485, + "learning_rate": 1.755031141380796e-05, + "loss": 0.6451, + "step": 3181 + }, + { + "epoch": 0.733348697856649, + "grad_norm": 0.15642160177230835, + "learning_rate": 1.7521920935528917e-05, + "loss": 0.6422, + "step": 3182 + }, + { + "epoch": 0.7335791657063839, + "grad_norm": 0.162491574883461, + "learning_rate": 1.749354855975164e-05, + "loss": 0.6495, + "step": 3183 + }, + { + "epoch": 0.733809633556119, + "grad_norm": 0.15852685272693634, + "learning_rate": 1.746519430229015e-05, + "loss": 0.6452, + "step": 3184 + }, + { + "epoch": 0.7340401014058539, + "grad_norm": 0.16387516260147095, + "learning_rate": 1.7436858178948457e-05, + "loss": 0.6464, + "step": 3185 + }, + { + "epoch": 0.7342705692555889, + "grad_norm": 0.15998558700084686, + "learning_rate": 1.7408540205520436e-05, + "loss": 0.648, + "step": 3186 + }, + { + "epoch": 0.7345010371053238, + "grad_norm": 0.17251843214035034, + "learning_rate": 1.7380240397789836e-05, + "loss": 0.6492, + "step": 3187 + }, + { + "epoch": 0.7347315049550588, + "grad_norm": 0.1601380854845047, + "learning_rate": 1.7351958771530298e-05, + "loss": 0.6511, + "step": 3188 + }, + { + "epoch": 0.7349619728047937, + "grad_norm": 0.15078677237033844, + "learning_rate": 1.7323695342505342e-05, + "loss": 0.6492, + "step": 3189 + }, + { + "epoch": 0.7351924406545287, + "grad_norm": 0.16088485717773438, + "learning_rate": 1.729545012646828e-05, + "loss": 0.6434, + "step": 3190 + }, + { + "epoch": 0.7354229085042636, + "grad_norm": 0.17706431448459625, + "learning_rate": 1.7267223139162342e-05, + "loss": 0.6491, + "step": 3191 + }, + { + "epoch": 0.7356533763539986, + "grad_norm": 0.15498369932174683, + "learning_rate": 1.7239014396320574e-05, + "loss": 0.646, + "step": 3192 + }, + { + "epoch": 0.7358838442037335, + "grad_norm": 0.1685081273317337, + "learning_rate": 1.7210823913665852e-05, + "loss": 0.6454, + "step": 3193 + }, + { + "epoch": 0.7361143120534686, + "grad_norm": 0.16466312110424042, + "learning_rate": 1.718265170691087e-05, + "loss": 0.6512, + "step": 3194 + }, + { + "epoch": 0.7363447799032035, + "grad_norm": 0.1716887503862381, + "learning_rate": 1.7154497791758157e-05, + "loss": 0.6552, + "step": 3195 + }, + { + "epoch": 0.7365752477529385, + "grad_norm": 0.17006585001945496, + "learning_rate": 1.7126362183899986e-05, + "loss": 0.6402, + "step": 3196 + }, + { + "epoch": 0.7368057156026734, + "grad_norm": 0.15960989892482758, + "learning_rate": 1.7098244899018512e-05, + "loss": 0.6478, + "step": 3197 + }, + { + "epoch": 0.7370361834524084, + "grad_norm": 0.16518181562423706, + "learning_rate": 1.707014595278564e-05, + "loss": 0.6585, + "step": 3198 + }, + { + "epoch": 0.7372666513021433, + "grad_norm": 0.15796171128749847, + "learning_rate": 1.7042065360863007e-05, + "loss": 0.6493, + "step": 3199 + }, + { + "epoch": 0.7374971191518783, + "grad_norm": 0.1603865772485733, + "learning_rate": 1.7014003138902092e-05, + "loss": 0.6361, + "step": 3200 + }, + { + "epoch": 0.7377275870016132, + "grad_norm": 0.1591259092092514, + "learning_rate": 1.698595930254409e-05, + "loss": 0.6409, + "step": 3201 + }, + { + "epoch": 0.7379580548513482, + "grad_norm": 0.16630783677101135, + "learning_rate": 1.6957933867419966e-05, + "loss": 0.6435, + "step": 3202 + }, + { + "epoch": 0.7381885227010831, + "grad_norm": 0.17076456546783447, + "learning_rate": 1.6929926849150428e-05, + "loss": 0.6506, + "step": 3203 + }, + { + "epoch": 0.7384189905508182, + "grad_norm": 0.16689413785934448, + "learning_rate": 1.6901938263345934e-05, + "loss": 0.6521, + "step": 3204 + }, + { + "epoch": 0.7386494584005531, + "grad_norm": 0.16302435100078583, + "learning_rate": 1.687396812560661e-05, + "loss": 0.6446, + "step": 3205 + }, + { + "epoch": 0.7388799262502881, + "grad_norm": 0.1644594967365265, + "learning_rate": 1.6846016451522362e-05, + "loss": 0.6389, + "step": 3206 + }, + { + "epoch": 0.739110394100023, + "grad_norm": 0.16946260631084442, + "learning_rate": 1.681808325667278e-05, + "loss": 0.6493, + "step": 3207 + }, + { + "epoch": 0.739340861949758, + "grad_norm": 0.16244158148765564, + "learning_rate": 1.6790168556627156e-05, + "loss": 0.6507, + "step": 3208 + }, + { + "epoch": 0.7395713297994929, + "grad_norm": 0.18265312910079956, + "learning_rate": 1.6762272366944472e-05, + "loss": 0.6404, + "step": 3209 + }, + { + "epoch": 0.7398017976492279, + "grad_norm": 0.16505154967308044, + "learning_rate": 1.673439470317341e-05, + "loss": 0.6464, + "step": 3210 + }, + { + "epoch": 0.740032265498963, + "grad_norm": 0.1631191074848175, + "learning_rate": 1.6706535580852267e-05, + "loss": 0.6385, + "step": 3211 + }, + { + "epoch": 0.7402627333486979, + "grad_norm": 0.17808805406093597, + "learning_rate": 1.66786950155091e-05, + "loss": 0.6458, + "step": 3212 + }, + { + "epoch": 0.7404932011984329, + "grad_norm": 0.16241349279880524, + "learning_rate": 1.6650873022661563e-05, + "loss": 0.6487, + "step": 3213 + }, + { + "epoch": 0.7407236690481678, + "grad_norm": 0.1839081048965454, + "learning_rate": 1.662306961781694e-05, + "loss": 0.6478, + "step": 3214 + }, + { + "epoch": 0.7409541368979028, + "grad_norm": 0.16764585673809052, + "learning_rate": 1.6595284816472195e-05, + "loss": 0.6424, + "step": 3215 + }, + { + "epoch": 0.7411846047476377, + "grad_norm": 0.15650008618831635, + "learning_rate": 1.6567518634113916e-05, + "loss": 0.6455, + "step": 3216 + }, + { + "epoch": 0.7414150725973727, + "grad_norm": 0.17013835906982422, + "learning_rate": 1.65397710862183e-05, + "loss": 0.6422, + "step": 3217 + }, + { + "epoch": 0.7416455404471076, + "grad_norm": 0.15841804444789886, + "learning_rate": 1.6512042188251164e-05, + "loss": 0.6402, + "step": 3218 + }, + { + "epoch": 0.7418760082968426, + "grad_norm": 0.1625175029039383, + "learning_rate": 1.6484331955667947e-05, + "loss": 0.6491, + "step": 3219 + }, + { + "epoch": 0.7421064761465775, + "grad_norm": 0.17238567769527435, + "learning_rate": 1.6456640403913638e-05, + "loss": 0.6331, + "step": 3220 + }, + { + "epoch": 0.7423369439963126, + "grad_norm": 0.15374347567558289, + "learning_rate": 1.642896754842284e-05, + "loss": 0.656, + "step": 3221 + }, + { + "epoch": 0.7425674118460475, + "grad_norm": 0.17570993304252625, + "learning_rate": 1.640131340461978e-05, + "loss": 0.641, + "step": 3222 + }, + { + "epoch": 0.7427978796957825, + "grad_norm": 0.15289098024368286, + "learning_rate": 1.6373677987918195e-05, + "loss": 0.6542, + "step": 3223 + }, + { + "epoch": 0.7430283475455174, + "grad_norm": 0.16534923017024994, + "learning_rate": 1.634606131372139e-05, + "loss": 0.6461, + "step": 3224 + }, + { + "epoch": 0.7432588153952524, + "grad_norm": 0.1682467758655548, + "learning_rate": 1.631846339742224e-05, + "loss": 0.65, + "step": 3225 + }, + { + "epoch": 0.7434892832449873, + "grad_norm": 0.15141364932060242, + "learning_rate": 1.629088425440317e-05, + "loss": 0.6427, + "step": 3226 + }, + { + "epoch": 0.7437197510947223, + "grad_norm": 0.16520178318023682, + "learning_rate": 1.6263323900036126e-05, + "loss": 0.6427, + "step": 3227 + }, + { + "epoch": 0.7439502189444572, + "grad_norm": 0.1559877246618271, + "learning_rate": 1.6235782349682592e-05, + "loss": 0.6452, + "step": 3228 + }, + { + "epoch": 0.7441806867941922, + "grad_norm": 0.15489576756954193, + "learning_rate": 1.6208259618693583e-05, + "loss": 0.6445, + "step": 3229 + }, + { + "epoch": 0.7444111546439272, + "grad_norm": 0.1593122035264969, + "learning_rate": 1.618075572240957e-05, + "loss": 0.6506, + "step": 3230 + }, + { + "epoch": 0.7446416224936622, + "grad_norm": 0.15990042686462402, + "learning_rate": 1.615327067616057e-05, + "loss": 0.6488, + "step": 3231 + }, + { + "epoch": 0.7448720903433971, + "grad_norm": 0.15468232333660126, + "learning_rate": 1.612580449526614e-05, + "loss": 0.6389, + "step": 3232 + }, + { + "epoch": 0.7451025581931321, + "grad_norm": 0.15509067475795746, + "learning_rate": 1.6098357195035212e-05, + "loss": 0.6439, + "step": 3233 + }, + { + "epoch": 0.745333026042867, + "grad_norm": 0.17395319044589996, + "learning_rate": 1.6070928790766275e-05, + "loss": 0.638, + "step": 3234 + }, + { + "epoch": 0.745563493892602, + "grad_norm": 0.151578888297081, + "learning_rate": 1.6043519297747285e-05, + "loss": 0.6367, + "step": 3235 + }, + { + "epoch": 0.7457939617423369, + "grad_norm": 0.15744104981422424, + "learning_rate": 1.6016128731255575e-05, + "loss": 0.6429, + "step": 3236 + }, + { + "epoch": 0.7460244295920719, + "grad_norm": 0.17480209469795227, + "learning_rate": 1.5988757106558043e-05, + "loss": 0.6509, + "step": 3237 + }, + { + "epoch": 0.7462548974418068, + "grad_norm": 0.16368581354618073, + "learning_rate": 1.5961404438910976e-05, + "loss": 0.6411, + "step": 3238 + }, + { + "epoch": 0.7464853652915419, + "grad_norm": 0.15094834566116333, + "learning_rate": 1.5934070743560065e-05, + "loss": 0.64, + "step": 3239 + }, + { + "epoch": 0.7467158331412768, + "grad_norm": 0.16196765005588531, + "learning_rate": 1.590675603574046e-05, + "loss": 0.6455, + "step": 3240 + }, + { + "epoch": 0.7469463009910118, + "grad_norm": 0.17574729025363922, + "learning_rate": 1.5879460330676743e-05, + "loss": 0.6494, + "step": 3241 + }, + { + "epoch": 0.7471767688407467, + "grad_norm": 0.1642741709947586, + "learning_rate": 1.5852183643582868e-05, + "loss": 0.6417, + "step": 3242 + }, + { + "epoch": 0.7474072366904817, + "grad_norm": 0.1686857044696808, + "learning_rate": 1.5824925989662216e-05, + "loss": 0.6415, + "step": 3243 + }, + { + "epoch": 0.7476377045402166, + "grad_norm": 0.17271287739276886, + "learning_rate": 1.5797687384107558e-05, + "loss": 0.6514, + "step": 3244 + }, + { + "epoch": 0.7478681723899516, + "grad_norm": 0.16826404631137848, + "learning_rate": 1.577046784210101e-05, + "loss": 0.6418, + "step": 3245 + }, + { + "epoch": 0.7480986402396865, + "grad_norm": 0.5926986932754517, + "learning_rate": 1.574326737881409e-05, + "loss": 0.6596, + "step": 3246 + }, + { + "epoch": 0.7483291080894215, + "grad_norm": 0.16710159182548523, + "learning_rate": 1.571608600940774e-05, + "loss": 0.6324, + "step": 3247 + }, + { + "epoch": 0.7485595759391565, + "grad_norm": 0.16913191974163055, + "learning_rate": 1.568892374903214e-05, + "loss": 0.6437, + "step": 3248 + }, + { + "epoch": 0.7487900437888915, + "grad_norm": 0.21093370020389557, + "learning_rate": 1.566178061282691e-05, + "loss": 0.6346, + "step": 3249 + }, + { + "epoch": 0.7490205116386264, + "grad_norm": 0.15978707373142242, + "learning_rate": 1.5634656615920974e-05, + "loss": 0.6546, + "step": 3250 + }, + { + "epoch": 0.7492509794883614, + "grad_norm": 0.17602089047431946, + "learning_rate": 1.56075517734326e-05, + "loss": 0.6429, + "step": 3251 + }, + { + "epoch": 0.7494814473380963, + "grad_norm": 0.16324278712272644, + "learning_rate": 1.558046610046938e-05, + "loss": 0.6439, + "step": 3252 + }, + { + "epoch": 0.7497119151878313, + "grad_norm": 0.1663910150527954, + "learning_rate": 1.5553399612128234e-05, + "loss": 0.6506, + "step": 3253 + }, + { + "epoch": 0.7499423830375662, + "grad_norm": 0.17245832085609436, + "learning_rate": 1.5526352323495336e-05, + "loss": 0.6397, + "step": 3254 + }, + { + "epoch": 0.7501728508873012, + "grad_norm": 0.1658082902431488, + "learning_rate": 1.549932424964622e-05, + "loss": 0.6503, + "step": 3255 + }, + { + "epoch": 0.7504033187370361, + "grad_norm": 0.16782449185848236, + "learning_rate": 1.547231540564567e-05, + "loss": 0.6483, + "step": 3256 + }, + { + "epoch": 0.7506337865867712, + "grad_norm": 0.16004249453544617, + "learning_rate": 1.5445325806547782e-05, + "loss": 0.6485, + "step": 3257 + }, + { + "epoch": 0.7508642544365061, + "grad_norm": 0.1629590541124344, + "learning_rate": 1.5418355467395906e-05, + "loss": 0.6478, + "step": 3258 + }, + { + "epoch": 0.7510947222862411, + "grad_norm": 0.1687597781419754, + "learning_rate": 1.5391404403222676e-05, + "loss": 0.6482, + "step": 3259 + }, + { + "epoch": 0.751325190135976, + "grad_norm": 0.16578345000743866, + "learning_rate": 1.536447262904994e-05, + "loss": 0.648, + "step": 3260 + }, + { + "epoch": 0.751555657985711, + "grad_norm": 0.15404678881168365, + "learning_rate": 1.533756015988882e-05, + "loss": 0.6436, + "step": 3261 + }, + { + "epoch": 0.7517861258354459, + "grad_norm": 0.18324708938598633, + "learning_rate": 1.5310667010739726e-05, + "loss": 0.6466, + "step": 3262 + }, + { + "epoch": 0.7520165936851809, + "grad_norm": 0.16745442152023315, + "learning_rate": 1.5283793196592212e-05, + "loss": 0.6459, + "step": 3263 + }, + { + "epoch": 0.7522470615349158, + "grad_norm": 0.16149534285068512, + "learning_rate": 1.5256938732425107e-05, + "loss": 0.6426, + "step": 3264 + }, + { + "epoch": 0.7524775293846508, + "grad_norm": 0.14993827044963837, + "learning_rate": 1.5230103633206449e-05, + "loss": 0.6334, + "step": 3265 + }, + { + "epoch": 0.7527079972343858, + "grad_norm": 0.15830254554748535, + "learning_rate": 1.5203287913893478e-05, + "loss": 0.6382, + "step": 3266 + }, + { + "epoch": 0.7529384650841208, + "grad_norm": 0.15330778062343597, + "learning_rate": 1.5176491589432628e-05, + "loss": 0.6551, + "step": 3267 + }, + { + "epoch": 0.7531689329338557, + "grad_norm": 0.15092666447162628, + "learning_rate": 1.5149714674759546e-05, + "loss": 0.6435, + "step": 3268 + }, + { + "epoch": 0.7533994007835907, + "grad_norm": 0.16142503917217255, + "learning_rate": 1.5122957184799007e-05, + "loss": 0.6423, + "step": 3269 + }, + { + "epoch": 0.7536298686333257, + "grad_norm": 0.15477819740772247, + "learning_rate": 1.5096219134465017e-05, + "loss": 0.6452, + "step": 3270 + }, + { + "epoch": 0.7538603364830606, + "grad_norm": 0.16160400211811066, + "learning_rate": 1.5069500538660713e-05, + "loss": 0.6451, + "step": 3271 + }, + { + "epoch": 0.7540908043327956, + "grad_norm": 0.17091768980026245, + "learning_rate": 1.5042801412278412e-05, + "loss": 0.6416, + "step": 3272 + }, + { + "epoch": 0.7543212721825305, + "grad_norm": 0.1987750232219696, + "learning_rate": 1.5016121770199553e-05, + "loss": 0.64, + "step": 3273 + }, + { + "epoch": 0.7545517400322655, + "grad_norm": 0.16774295270442963, + "learning_rate": 1.4989461627294755e-05, + "loss": 0.639, + "step": 3274 + }, + { + "epoch": 0.7547822078820005, + "grad_norm": 0.1618596464395523, + "learning_rate": 1.4962820998423683e-05, + "loss": 0.6448, + "step": 3275 + }, + { + "epoch": 0.7550126757317355, + "grad_norm": 0.1566198170185089, + "learning_rate": 1.4936199898435238e-05, + "loss": 0.642, + "step": 3276 + }, + { + "epoch": 0.7552431435814704, + "grad_norm": 0.17685432732105255, + "learning_rate": 1.4909598342167385e-05, + "loss": 0.6434, + "step": 3277 + }, + { + "epoch": 0.7554736114312054, + "grad_norm": 0.15080119669437408, + "learning_rate": 1.488301634444716e-05, + "loss": 0.6513, + "step": 3278 + }, + { + "epoch": 0.7557040792809403, + "grad_norm": 0.15888544917106628, + "learning_rate": 1.485645392009074e-05, + "loss": 0.6462, + "step": 3279 + }, + { + "epoch": 0.7559345471306753, + "grad_norm": 0.16779987514019012, + "learning_rate": 1.4829911083903386e-05, + "loss": 0.6443, + "step": 3280 + }, + { + "epoch": 0.7561650149804102, + "grad_norm": 0.15703189373016357, + "learning_rate": 1.4803387850679445e-05, + "loss": 0.6497, + "step": 3281 + }, + { + "epoch": 0.7563954828301452, + "grad_norm": 0.1584496796131134, + "learning_rate": 1.477688423520232e-05, + "loss": 0.6466, + "step": 3282 + }, + { + "epoch": 0.7566259506798801, + "grad_norm": 0.16846555471420288, + "learning_rate": 1.4750400252244511e-05, + "loss": 0.6452, + "step": 3283 + }, + { + "epoch": 0.7568564185296152, + "grad_norm": 0.1615644246339798, + "learning_rate": 1.4723935916567522e-05, + "loss": 0.6491, + "step": 3284 + }, + { + "epoch": 0.7570868863793501, + "grad_norm": 0.15736454725265503, + "learning_rate": 1.469749124292194e-05, + "loss": 0.6436, + "step": 3285 + }, + { + "epoch": 0.7573173542290851, + "grad_norm": 0.15067139267921448, + "learning_rate": 1.4671066246047438e-05, + "loss": 0.6353, + "step": 3286 + }, + { + "epoch": 0.75754782207882, + "grad_norm": 0.16569077968597412, + "learning_rate": 1.4644660940672627e-05, + "loss": 0.6491, + "step": 3287 + }, + { + "epoch": 0.757778289928555, + "grad_norm": 0.15051403641700745, + "learning_rate": 1.461827534151521e-05, + "loss": 0.6394, + "step": 3288 + }, + { + "epoch": 0.7580087577782899, + "grad_norm": 0.15739889442920685, + "learning_rate": 1.4591909463281894e-05, + "loss": 0.6364, + "step": 3289 + }, + { + "epoch": 0.7582392256280249, + "grad_norm": 0.16223600506782532, + "learning_rate": 1.4565563320668346e-05, + "loss": 0.6472, + "step": 3290 + }, + { + "epoch": 0.7584696934777598, + "grad_norm": 0.159046471118927, + "learning_rate": 1.4539236928359318e-05, + "loss": 0.654, + "step": 3291 + }, + { + "epoch": 0.7587001613274948, + "grad_norm": 0.16086021065711975, + "learning_rate": 1.451293030102851e-05, + "loss": 0.6456, + "step": 3292 + }, + { + "epoch": 0.7589306291772298, + "grad_norm": 0.15306276082992554, + "learning_rate": 1.4486643453338571e-05, + "loss": 0.656, + "step": 3293 + }, + { + "epoch": 0.7591610970269648, + "grad_norm": 0.16488851606845856, + "learning_rate": 1.4460376399941184e-05, + "loss": 0.6519, + "step": 3294 + }, + { + "epoch": 0.7593915648766997, + "grad_norm": 0.16087326407432556, + "learning_rate": 1.443412915547696e-05, + "loss": 0.6482, + "step": 3295 + }, + { + "epoch": 0.7596220327264347, + "grad_norm": 0.15743768215179443, + "learning_rate": 1.4407901734575496e-05, + "loss": 0.6472, + "step": 3296 + }, + { + "epoch": 0.7598525005761696, + "grad_norm": 0.16286884248256683, + "learning_rate": 1.4381694151855318e-05, + "loss": 0.6464, + "step": 3297 + }, + { + "epoch": 0.7600829684259046, + "grad_norm": 0.15299223363399506, + "learning_rate": 1.4355506421923926e-05, + "loss": 0.6535, + "step": 3298 + }, + { + "epoch": 0.7603134362756395, + "grad_norm": 0.16617640852928162, + "learning_rate": 1.4329338559377691e-05, + "loss": 0.6402, + "step": 3299 + }, + { + "epoch": 0.7605439041253745, + "grad_norm": 0.17199209332466125, + "learning_rate": 1.4303190578801967e-05, + "loss": 0.6406, + "step": 3300 + }, + { + "epoch": 0.7607743719751094, + "grad_norm": 0.1543896347284317, + "learning_rate": 1.4277062494771044e-05, + "loss": 0.646, + "step": 3301 + }, + { + "epoch": 0.7610048398248445, + "grad_norm": 0.16141347587108612, + "learning_rate": 1.4250954321848043e-05, + "loss": 0.6484, + "step": 3302 + }, + { + "epoch": 0.7612353076745794, + "grad_norm": 0.17056810855865479, + "learning_rate": 1.4224866074585053e-05, + "loss": 0.641, + "step": 3303 + }, + { + "epoch": 0.7614657755243144, + "grad_norm": 0.17288394272327423, + "learning_rate": 1.4198797767523037e-05, + "loss": 0.6421, + "step": 3304 + }, + { + "epoch": 0.7616962433740493, + "grad_norm": 0.16343657672405243, + "learning_rate": 1.4172749415191844e-05, + "loss": 0.6313, + "step": 3305 + }, + { + "epoch": 0.7619267112237843, + "grad_norm": 0.16413342952728271, + "learning_rate": 1.41467210321102e-05, + "loss": 0.6415, + "step": 3306 + }, + { + "epoch": 0.7621571790735192, + "grad_norm": 0.17422489821910858, + "learning_rate": 1.412071263278571e-05, + "loss": 0.6417, + "step": 3307 + }, + { + "epoch": 0.7623876469232542, + "grad_norm": 0.16758742928504944, + "learning_rate": 1.4094724231714812e-05, + "loss": 0.6429, + "step": 3308 + }, + { + "epoch": 0.7626181147729891, + "grad_norm": 0.16555525362491608, + "learning_rate": 1.406875584338282e-05, + "loss": 0.641, + "step": 3309 + }, + { + "epoch": 0.7628485826227241, + "grad_norm": 0.168875589966774, + "learning_rate": 1.4042807482263904e-05, + "loss": 0.6468, + "step": 3310 + }, + { + "epoch": 0.763079050472459, + "grad_norm": 0.16183854639530182, + "learning_rate": 1.4016879162821044e-05, + "loss": 0.644, + "step": 3311 + }, + { + "epoch": 0.7633095183221941, + "grad_norm": 0.15983159840106964, + "learning_rate": 1.3990970899506072e-05, + "loss": 0.6481, + "step": 3312 + }, + { + "epoch": 0.763539986171929, + "grad_norm": 0.1633015125989914, + "learning_rate": 1.3965082706759646e-05, + "loss": 0.642, + "step": 3313 + }, + { + "epoch": 0.763770454021664, + "grad_norm": 0.15642528235912323, + "learning_rate": 1.3939214599011174e-05, + "loss": 0.6431, + "step": 3314 + }, + { + "epoch": 0.7640009218713989, + "grad_norm": 0.15615011751651764, + "learning_rate": 1.3913366590678966e-05, + "loss": 0.6502, + "step": 3315 + }, + { + "epoch": 0.7642313897211339, + "grad_norm": 0.15808658301830292, + "learning_rate": 1.3887538696170089e-05, + "loss": 0.6522, + "step": 3316 + }, + { + "epoch": 0.7644618575708688, + "grad_norm": 0.15133537352085114, + "learning_rate": 1.3861730929880346e-05, + "loss": 0.6448, + "step": 3317 + }, + { + "epoch": 0.7646923254206038, + "grad_norm": 0.152401402592659, + "learning_rate": 1.3835943306194393e-05, + "loss": 0.6524, + "step": 3318 + }, + { + "epoch": 0.7649227932703387, + "grad_norm": 0.15733560919761658, + "learning_rate": 1.3810175839485628e-05, + "loss": 0.6479, + "step": 3319 + }, + { + "epoch": 0.7651532611200738, + "grad_norm": 0.16682898998260498, + "learning_rate": 1.3784428544116218e-05, + "loss": 0.6391, + "step": 3320 + }, + { + "epoch": 0.7653837289698087, + "grad_norm": 0.14624935388565063, + "learning_rate": 1.3758701434437088e-05, + "loss": 0.6499, + "step": 3321 + }, + { + "epoch": 0.7656141968195437, + "grad_norm": 0.1629732847213745, + "learning_rate": 1.3732994524787935e-05, + "loss": 0.6372, + "step": 3322 + }, + { + "epoch": 0.7658446646692786, + "grad_norm": 0.1721165031194687, + "learning_rate": 1.370730782949713e-05, + "loss": 0.6376, + "step": 3323 + }, + { + "epoch": 0.7660751325190136, + "grad_norm": 0.1540745496749878, + "learning_rate": 1.3681641362881842e-05, + "loss": 0.6442, + "step": 3324 + }, + { + "epoch": 0.7663056003687485, + "grad_norm": 0.1577741801738739, + "learning_rate": 1.365599513924794e-05, + "loss": 0.6475, + "step": 3325 + }, + { + "epoch": 0.7665360682184835, + "grad_norm": 0.15906648337841034, + "learning_rate": 1.3630369172890017e-05, + "loss": 0.6426, + "step": 3326 + }, + { + "epoch": 0.7667665360682184, + "grad_norm": 0.14891204237937927, + "learning_rate": 1.3604763478091375e-05, + "loss": 0.6394, + "step": 3327 + }, + { + "epoch": 0.7669970039179534, + "grad_norm": 0.1566314697265625, + "learning_rate": 1.3579178069124021e-05, + "loss": 0.6483, + "step": 3328 + }, + { + "epoch": 0.7672274717676885, + "grad_norm": 0.16238120198249817, + "learning_rate": 1.3553612960248607e-05, + "loss": 0.6396, + "step": 3329 + }, + { + "epoch": 0.7674579396174234, + "grad_norm": 0.16790124773979187, + "learning_rate": 1.3528068165714552e-05, + "loss": 0.6512, + "step": 3330 + }, + { + "epoch": 0.7676884074671584, + "grad_norm": 0.15930506587028503, + "learning_rate": 1.3502543699759917e-05, + "loss": 0.6483, + "step": 3331 + }, + { + "epoch": 0.7679188753168933, + "grad_norm": 0.15827548503875732, + "learning_rate": 1.34770395766114e-05, + "loss": 0.6445, + "step": 3332 + }, + { + "epoch": 0.7681493431666283, + "grad_norm": 0.16675326228141785, + "learning_rate": 1.3451555810484389e-05, + "loss": 0.6528, + "step": 3333 + }, + { + "epoch": 0.7683798110163632, + "grad_norm": 0.16960427165031433, + "learning_rate": 1.3426092415582936e-05, + "loss": 0.6565, + "step": 3334 + }, + { + "epoch": 0.7686102788660982, + "grad_norm": 0.1545518934726715, + "learning_rate": 1.3400649406099719e-05, + "loss": 0.6461, + "step": 3335 + }, + { + "epoch": 0.7688407467158331, + "grad_norm": 0.1682869791984558, + "learning_rate": 1.337522679621606e-05, + "loss": 0.6409, + "step": 3336 + }, + { + "epoch": 0.7690712145655682, + "grad_norm": 0.1694023311138153, + "learning_rate": 1.3349824600101934e-05, + "loss": 0.6443, + "step": 3337 + }, + { + "epoch": 0.7693016824153031, + "grad_norm": 0.15705664455890656, + "learning_rate": 1.3324442831915878e-05, + "loss": 0.6416, + "step": 3338 + }, + { + "epoch": 0.7695321502650381, + "grad_norm": 0.16872623562812805, + "learning_rate": 1.3299081505805088e-05, + "loss": 0.6481, + "step": 3339 + }, + { + "epoch": 0.769762618114773, + "grad_norm": 0.15904702246189117, + "learning_rate": 1.3273740635905397e-05, + "loss": 0.6493, + "step": 3340 + }, + { + "epoch": 0.769993085964508, + "grad_norm": 0.165695458650589, + "learning_rate": 1.3248420236341147e-05, + "loss": 0.6417, + "step": 3341 + }, + { + "epoch": 0.7702235538142429, + "grad_norm": 0.16277405619621277, + "learning_rate": 1.3223120321225351e-05, + "loss": 0.644, + "step": 3342 + }, + { + "epoch": 0.7704540216639779, + "grad_norm": 0.15501618385314941, + "learning_rate": 1.319784090465958e-05, + "loss": 0.6462, + "step": 3343 + }, + { + "epoch": 0.7706844895137128, + "grad_norm": 0.16321119666099548, + "learning_rate": 1.317258200073393e-05, + "loss": 0.6355, + "step": 3344 + }, + { + "epoch": 0.7709149573634478, + "grad_norm": 0.1589282751083374, + "learning_rate": 1.314734362352716e-05, + "loss": 0.6329, + "step": 3345 + }, + { + "epoch": 0.7711454252131827, + "grad_norm": 0.15726275742053986, + "learning_rate": 1.3122125787106521e-05, + "loss": 0.632, + "step": 3346 + }, + { + "epoch": 0.7713758930629178, + "grad_norm": 0.17059746384620667, + "learning_rate": 1.3096928505527811e-05, + "loss": 0.6396, + "step": 3347 + }, + { + "epoch": 0.7716063609126527, + "grad_norm": 0.15518136322498322, + "learning_rate": 1.3071751792835402e-05, + "loss": 0.651, + "step": 3348 + }, + { + "epoch": 0.7718368287623877, + "grad_norm": 0.16409626603126526, + "learning_rate": 1.3046595663062188e-05, + "loss": 0.6485, + "step": 3349 + }, + { + "epoch": 0.7720672966121226, + "grad_norm": 0.15897217392921448, + "learning_rate": 1.3021460130229596e-05, + "loss": 0.6549, + "step": 3350 + }, + { + "epoch": 0.7722977644618576, + "grad_norm": 0.15244118869304657, + "learning_rate": 1.2996345208347565e-05, + "loss": 0.6445, + "step": 3351 + }, + { + "epoch": 0.7725282323115925, + "grad_norm": 0.15054090321063995, + "learning_rate": 1.2971250911414567e-05, + "loss": 0.6433, + "step": 3352 + }, + { + "epoch": 0.7727587001613275, + "grad_norm": 0.16574056446552277, + "learning_rate": 1.2946177253417525e-05, + "loss": 0.6394, + "step": 3353 + }, + { + "epoch": 0.7729891680110624, + "grad_norm": 0.16119171679019928, + "learning_rate": 1.2921124248331901e-05, + "loss": 0.6408, + "step": 3354 + }, + { + "epoch": 0.7732196358607974, + "grad_norm": 0.15423963963985443, + "learning_rate": 1.2896091910121666e-05, + "loss": 0.6454, + "step": 3355 + }, + { + "epoch": 0.7734501037105324, + "grad_norm": 0.15891356766223907, + "learning_rate": 1.2871080252739247e-05, + "loss": 0.6464, + "step": 3356 + }, + { + "epoch": 0.7736805715602674, + "grad_norm": 0.15453371405601501, + "learning_rate": 1.2846089290125507e-05, + "loss": 0.6365, + "step": 3357 + }, + { + "epoch": 0.7739110394100023, + "grad_norm": 0.15373654663562775, + "learning_rate": 1.2821119036209827e-05, + "loss": 0.6484, + "step": 3358 + }, + { + "epoch": 0.7741415072597373, + "grad_norm": 0.15394984185695648, + "learning_rate": 1.2796169504910028e-05, + "loss": 0.6376, + "step": 3359 + }, + { + "epoch": 0.7743719751094722, + "grad_norm": 0.1592058390378952, + "learning_rate": 1.2771240710132375e-05, + "loss": 0.642, + "step": 3360 + }, + { + "epoch": 0.7746024429592072, + "grad_norm": 0.1550869345664978, + "learning_rate": 1.2746332665771587e-05, + "loss": 0.6512, + "step": 3361 + }, + { + "epoch": 0.7748329108089421, + "grad_norm": 0.16978594660758972, + "learning_rate": 1.2721445385710818e-05, + "loss": 0.6474, + "step": 3362 + }, + { + "epoch": 0.7750633786586771, + "grad_norm": 0.16240744292736053, + "learning_rate": 1.2696578883821614e-05, + "loss": 0.6442, + "step": 3363 + }, + { + "epoch": 0.775293846508412, + "grad_norm": 0.14979274570941925, + "learning_rate": 1.2671733173963968e-05, + "loss": 0.6478, + "step": 3364 + }, + { + "epoch": 0.7755243143581471, + "grad_norm": 0.16255247592926025, + "learning_rate": 1.2646908269986318e-05, + "loss": 0.6445, + "step": 3365 + }, + { + "epoch": 0.775754782207882, + "grad_norm": 0.15680548548698425, + "learning_rate": 1.2622104185725441e-05, + "loss": 0.6426, + "step": 3366 + }, + { + "epoch": 0.775985250057617, + "grad_norm": 0.15975458920001984, + "learning_rate": 1.259732093500654e-05, + "loss": 0.6461, + "step": 3367 + }, + { + "epoch": 0.7762157179073519, + "grad_norm": 0.15226735174655914, + "learning_rate": 1.2572558531643208e-05, + "loss": 0.6403, + "step": 3368 + }, + { + "epoch": 0.7764461857570869, + "grad_norm": 0.1590985804796219, + "learning_rate": 1.2547816989437416e-05, + "loss": 0.645, + "step": 3369 + }, + { + "epoch": 0.7766766536068218, + "grad_norm": 0.1611194908618927, + "learning_rate": 1.2523096322179501e-05, + "loss": 0.6515, + "step": 3370 + }, + { + "epoch": 0.7769071214565568, + "grad_norm": 0.16382546722888947, + "learning_rate": 1.2498396543648195e-05, + "loss": 0.6497, + "step": 3371 + }, + { + "epoch": 0.7771375893062917, + "grad_norm": 0.15387794375419617, + "learning_rate": 1.2473717667610519e-05, + "loss": 0.6458, + "step": 3372 + }, + { + "epoch": 0.7773680571560267, + "grad_norm": 0.15444159507751465, + "learning_rate": 1.2449059707821904e-05, + "loss": 0.6327, + "step": 3373 + }, + { + "epoch": 0.7775985250057617, + "grad_norm": 0.16839702427387238, + "learning_rate": 1.2424422678026116e-05, + "loss": 0.6397, + "step": 3374 + }, + { + "epoch": 0.7778289928554967, + "grad_norm": 0.16093666851520538, + "learning_rate": 1.2399806591955227e-05, + "loss": 0.6428, + "step": 3375 + }, + { + "epoch": 0.7780594607052316, + "grad_norm": 0.1614423543214798, + "learning_rate": 1.2375211463329666e-05, + "loss": 0.6453, + "step": 3376 + }, + { + "epoch": 0.7782899285549666, + "grad_norm": 0.1571039855480194, + "learning_rate": 1.2350637305858176e-05, + "loss": 0.6286, + "step": 3377 + }, + { + "epoch": 0.7785203964047015, + "grad_norm": 0.1751880794763565, + "learning_rate": 1.2326084133237774e-05, + "loss": 0.6371, + "step": 3378 + }, + { + "epoch": 0.7787508642544365, + "grad_norm": 0.16066774725914001, + "learning_rate": 1.2301551959153813e-05, + "loss": 0.6501, + "step": 3379 + }, + { + "epoch": 0.7789813321041714, + "grad_norm": 0.16211430728435516, + "learning_rate": 1.2277040797279976e-05, + "loss": 0.6437, + "step": 3380 + }, + { + "epoch": 0.7792117999539064, + "grad_norm": 0.16450825333595276, + "learning_rate": 1.2252550661278156e-05, + "loss": 0.6409, + "step": 3381 + }, + { + "epoch": 0.7794422678036413, + "grad_norm": 0.16443844139575958, + "learning_rate": 1.2228081564798583e-05, + "loss": 0.6472, + "step": 3382 + }, + { + "epoch": 0.7796727356533764, + "grad_norm": 0.17358103394508362, + "learning_rate": 1.2203633521479735e-05, + "loss": 0.6389, + "step": 3383 + }, + { + "epoch": 0.7799032035031113, + "grad_norm": 0.16539937257766724, + "learning_rate": 1.2179206544948374e-05, + "loss": 0.6404, + "step": 3384 + }, + { + "epoch": 0.7801336713528463, + "grad_norm": 0.15908871591091156, + "learning_rate": 1.2154800648819508e-05, + "loss": 0.6396, + "step": 3385 + }, + { + "epoch": 0.7803641392025812, + "grad_norm": 0.17083421349525452, + "learning_rate": 1.2130415846696414e-05, + "loss": 0.6408, + "step": 3386 + }, + { + "epoch": 0.7805946070523162, + "grad_norm": 0.17554426193237305, + "learning_rate": 1.2106052152170561e-05, + "loss": 0.6402, + "step": 3387 + }, + { + "epoch": 0.7808250749020512, + "grad_norm": 0.16431263089179993, + "learning_rate": 1.2081709578821709e-05, + "loss": 0.6394, + "step": 3388 + }, + { + "epoch": 0.7810555427517861, + "grad_norm": 0.16662830114364624, + "learning_rate": 1.2057388140217818e-05, + "loss": 0.6456, + "step": 3389 + }, + { + "epoch": 0.7812860106015211, + "grad_norm": 0.16625608503818512, + "learning_rate": 1.2033087849915076e-05, + "loss": 0.6474, + "step": 3390 + }, + { + "epoch": 0.781516478451256, + "grad_norm": 0.1618298441171646, + "learning_rate": 1.2008808721457881e-05, + "loss": 0.642, + "step": 3391 + }, + { + "epoch": 0.7817469463009911, + "grad_norm": 0.1568860411643982, + "learning_rate": 1.1984550768378856e-05, + "loss": 0.6519, + "step": 3392 + }, + { + "epoch": 0.781977414150726, + "grad_norm": 0.15564152598381042, + "learning_rate": 1.1960314004198752e-05, + "loss": 0.6391, + "step": 3393 + }, + { + "epoch": 0.782207882000461, + "grad_norm": 0.16593000292778015, + "learning_rate": 1.1936098442426608e-05, + "loss": 0.6472, + "step": 3394 + }, + { + "epoch": 0.7824383498501959, + "grad_norm": 0.1633983701467514, + "learning_rate": 1.1911904096559589e-05, + "loss": 0.6497, + "step": 3395 + }, + { + "epoch": 0.7826688176999309, + "grad_norm": 0.15584073960781097, + "learning_rate": 1.1887730980083023e-05, + "loss": 0.6437, + "step": 3396 + }, + { + "epoch": 0.7828992855496658, + "grad_norm": 0.15618832409381866, + "learning_rate": 1.1863579106470434e-05, + "loss": 0.6372, + "step": 3397 + }, + { + "epoch": 0.7831297533994008, + "grad_norm": 0.16305984556674957, + "learning_rate": 1.1839448489183503e-05, + "loss": 0.6409, + "step": 3398 + }, + { + "epoch": 0.7833602212491357, + "grad_norm": 0.1538400650024414, + "learning_rate": 1.181533914167205e-05, + "loss": 0.6457, + "step": 3399 + }, + { + "epoch": 0.7835906890988708, + "grad_norm": 0.15274596214294434, + "learning_rate": 1.1791251077374043e-05, + "loss": 0.6412, + "step": 3400 + }, + { + "epoch": 0.7838211569486057, + "grad_norm": 0.1625516712665558, + "learning_rate": 1.1767184309715618e-05, + "loss": 0.6418, + "step": 3401 + }, + { + "epoch": 0.7840516247983407, + "grad_norm": 0.14723870158195496, + "learning_rate": 1.1743138852110969e-05, + "loss": 0.6439, + "step": 3402 + }, + { + "epoch": 0.7842820926480756, + "grad_norm": 0.15598557889461517, + "learning_rate": 1.1719114717962476e-05, + "loss": 0.638, + "step": 3403 + }, + { + "epoch": 0.7845125604978106, + "grad_norm": 0.16605372726917267, + "learning_rate": 1.169511192066064e-05, + "loss": 0.6421, + "step": 3404 + }, + { + "epoch": 0.7847430283475455, + "grad_norm": 0.1595214158296585, + "learning_rate": 1.1671130473584013e-05, + "loss": 0.6449, + "step": 3405 + }, + { + "epoch": 0.7849734961972805, + "grad_norm": 0.15899643301963806, + "learning_rate": 1.1647170390099283e-05, + "loss": 0.6373, + "step": 3406 + }, + { + "epoch": 0.7852039640470154, + "grad_norm": 0.16188432276248932, + "learning_rate": 1.1623231683561247e-05, + "loss": 0.6418, + "step": 3407 + }, + { + "epoch": 0.7854344318967504, + "grad_norm": 0.16148293018341064, + "learning_rate": 1.1599314367312725e-05, + "loss": 0.6438, + "step": 3408 + }, + { + "epoch": 0.7856648997464853, + "grad_norm": 0.14987516403198242, + "learning_rate": 1.157541845468469e-05, + "loss": 0.6383, + "step": 3409 + }, + { + "epoch": 0.7858953675962204, + "grad_norm": 0.160349503159523, + "learning_rate": 1.1551543958996148e-05, + "loss": 0.6452, + "step": 3410 + }, + { + "epoch": 0.7861258354459553, + "grad_norm": 0.1663265824317932, + "learning_rate": 1.1527690893554156e-05, + "loss": 0.6423, + "step": 3411 + }, + { + "epoch": 0.7863563032956903, + "grad_norm": 0.15916943550109863, + "learning_rate": 1.1503859271653839e-05, + "loss": 0.6423, + "step": 3412 + }, + { + "epoch": 0.7865867711454252, + "grad_norm": 0.16647377610206604, + "learning_rate": 1.1480049106578377e-05, + "loss": 0.652, + "step": 3413 + }, + { + "epoch": 0.7868172389951602, + "grad_norm": 0.15861909091472626, + "learning_rate": 1.1456260411598984e-05, + "loss": 0.6446, + "step": 3414 + }, + { + "epoch": 0.7870477068448951, + "grad_norm": 0.15791212022304535, + "learning_rate": 1.143249319997491e-05, + "loss": 0.6436, + "step": 3415 + }, + { + "epoch": 0.7872781746946301, + "grad_norm": 0.17296838760375977, + "learning_rate": 1.1408747484953442e-05, + "loss": 0.6361, + "step": 3416 + }, + { + "epoch": 0.787508642544365, + "grad_norm": 0.16724546253681183, + "learning_rate": 1.138502327976984e-05, + "loss": 0.6409, + "step": 3417 + }, + { + "epoch": 0.7877391103941, + "grad_norm": 0.1602526307106018, + "learning_rate": 1.1361320597647407e-05, + "loss": 0.636, + "step": 3418 + }, + { + "epoch": 0.787969578243835, + "grad_norm": 0.15437106788158417, + "learning_rate": 1.1337639451797494e-05, + "loss": 0.6457, + "step": 3419 + }, + { + "epoch": 0.78820004609357, + "grad_norm": 0.15734903514385223, + "learning_rate": 1.1313979855419359e-05, + "loss": 0.6389, + "step": 3420 + }, + { + "epoch": 0.7884305139433049, + "grad_norm": 0.16602694988250732, + "learning_rate": 1.1290341821700313e-05, + "loss": 0.6401, + "step": 3421 + }, + { + "epoch": 0.7886609817930399, + "grad_norm": 0.1661614030599594, + "learning_rate": 1.1266725363815623e-05, + "loss": 0.655, + "step": 3422 + }, + { + "epoch": 0.7888914496427748, + "grad_norm": 0.1512596160173416, + "learning_rate": 1.1243130494928533e-05, + "loss": 0.6383, + "step": 3423 + }, + { + "epoch": 0.7891219174925098, + "grad_norm": 0.14931842684745789, + "learning_rate": 1.1219557228190258e-05, + "loss": 0.6418, + "step": 3424 + }, + { + "epoch": 0.7893523853422447, + "grad_norm": 0.1699947863817215, + "learning_rate": 1.1196005576739993e-05, + "loss": 0.6467, + "step": 3425 + }, + { + "epoch": 0.7895828531919797, + "grad_norm": 0.1627425104379654, + "learning_rate": 1.1172475553704826e-05, + "loss": 0.6307, + "step": 3426 + }, + { + "epoch": 0.7898133210417146, + "grad_norm": 0.17060711979866028, + "learning_rate": 1.1148967172199848e-05, + "loss": 0.652, + "step": 3427 + }, + { + "epoch": 0.7900437888914497, + "grad_norm": 0.16025568544864655, + "learning_rate": 1.1125480445328057e-05, + "loss": 0.6434, + "step": 3428 + }, + { + "epoch": 0.7902742567411846, + "grad_norm": 0.1679077297449112, + "learning_rate": 1.1102015386180404e-05, + "loss": 0.6428, + "step": 3429 + }, + { + "epoch": 0.7905047245909196, + "grad_norm": 0.1602405458688736, + "learning_rate": 1.1078572007835735e-05, + "loss": 0.6387, + "step": 3430 + }, + { + "epoch": 0.7907351924406545, + "grad_norm": 0.14988547563552856, + "learning_rate": 1.1055150323360852e-05, + "loss": 0.6398, + "step": 3431 + }, + { + "epoch": 0.7909656602903895, + "grad_norm": 0.16209550201892853, + "learning_rate": 1.10317503458104e-05, + "loss": 0.6433, + "step": 3432 + }, + { + "epoch": 0.7911961281401244, + "grad_norm": 0.16234451532363892, + "learning_rate": 1.1008372088226992e-05, + "loss": 0.6415, + "step": 3433 + }, + { + "epoch": 0.7914265959898594, + "grad_norm": 0.14663785696029663, + "learning_rate": 1.098501556364112e-05, + "loss": 0.6472, + "step": 3434 + }, + { + "epoch": 0.7916570638395943, + "grad_norm": 0.15545038878917694, + "learning_rate": 1.0961680785071116e-05, + "loss": 0.6441, + "step": 3435 + }, + { + "epoch": 0.7918875316893293, + "grad_norm": 0.17046032845973969, + "learning_rate": 1.0938367765523244e-05, + "loss": 0.6392, + "step": 3436 + }, + { + "epoch": 0.7921179995390643, + "grad_norm": 0.15083369612693787, + "learning_rate": 1.0915076517991617e-05, + "loss": 0.6356, + "step": 3437 + }, + { + "epoch": 0.7923484673887993, + "grad_norm": 0.169847771525383, + "learning_rate": 1.0891807055458226e-05, + "loss": 0.6369, + "step": 3438 + }, + { + "epoch": 0.7925789352385342, + "grad_norm": 0.16202600300312042, + "learning_rate": 1.0868559390892902e-05, + "loss": 0.6494, + "step": 3439 + }, + { + "epoch": 0.7928094030882692, + "grad_norm": 0.1544782817363739, + "learning_rate": 1.0845333537253349e-05, + "loss": 0.6499, + "step": 3440 + }, + { + "epoch": 0.7930398709380041, + "grad_norm": 0.15906357765197754, + "learning_rate": 1.082212950748508e-05, + "loss": 0.6448, + "step": 3441 + }, + { + "epoch": 0.7932703387877391, + "grad_norm": 0.1550484299659729, + "learning_rate": 1.0798947314521468e-05, + "loss": 0.6422, + "step": 3442 + }, + { + "epoch": 0.793500806637474, + "grad_norm": 0.15442262589931488, + "learning_rate": 1.0775786971283725e-05, + "loss": 0.64, + "step": 3443 + }, + { + "epoch": 0.793731274487209, + "grad_norm": 0.1611274778842926, + "learning_rate": 1.0752648490680856e-05, + "loss": 0.642, + "step": 3444 + }, + { + "epoch": 0.7939617423369439, + "grad_norm": 0.15969206392765045, + "learning_rate": 1.07295318856097e-05, + "loss": 0.641, + "step": 3445 + }, + { + "epoch": 0.794192210186679, + "grad_norm": 0.15087677538394928, + "learning_rate": 1.0706437168954913e-05, + "loss": 0.6345, + "step": 3446 + }, + { + "epoch": 0.7944226780364139, + "grad_norm": 0.1505325436592102, + "learning_rate": 1.0683364353588898e-05, + "loss": 0.639, + "step": 3447 + }, + { + "epoch": 0.7946531458861489, + "grad_norm": 0.16190530359745026, + "learning_rate": 1.0660313452371922e-05, + "loss": 0.6442, + "step": 3448 + }, + { + "epoch": 0.7948836137358839, + "grad_norm": 0.15547344088554382, + "learning_rate": 1.063728447815201e-05, + "loss": 0.6475, + "step": 3449 + }, + { + "epoch": 0.7951140815856188, + "grad_norm": 0.1529662311077118, + "learning_rate": 1.0614277443764925e-05, + "loss": 0.642, + "step": 3450 + }, + { + "epoch": 0.7953445494353538, + "grad_norm": 0.15717142820358276, + "learning_rate": 1.0591292362034255e-05, + "loss": 0.6335, + "step": 3451 + }, + { + "epoch": 0.7955750172850887, + "grad_norm": 0.15520958602428436, + "learning_rate": 1.0568329245771336e-05, + "loss": 0.6368, + "step": 3452 + }, + { + "epoch": 0.7958054851348237, + "grad_norm": 0.16604743897914886, + "learning_rate": 1.054538810777525e-05, + "loss": 0.6456, + "step": 3453 + }, + { + "epoch": 0.7960359529845586, + "grad_norm": 0.15997354686260223, + "learning_rate": 1.0522468960832842e-05, + "loss": 0.6438, + "step": 3454 + }, + { + "epoch": 0.7962664208342937, + "grad_norm": 0.1553625613451004, + "learning_rate": 1.0499571817718707e-05, + "loss": 0.6391, + "step": 3455 + }, + { + "epoch": 0.7964968886840286, + "grad_norm": 0.2511269748210907, + "learning_rate": 1.0476696691195138e-05, + "loss": 0.6387, + "step": 3456 + }, + { + "epoch": 0.7967273565337636, + "grad_norm": 0.14997459948062897, + "learning_rate": 1.0453843594012175e-05, + "loss": 0.6303, + "step": 3457 + }, + { + "epoch": 0.7969578243834985, + "grad_norm": 0.14934763312339783, + "learning_rate": 1.0431012538907631e-05, + "loss": 0.6377, + "step": 3458 + }, + { + "epoch": 0.7971882922332335, + "grad_norm": 0.15451250970363617, + "learning_rate": 1.0408203538606948e-05, + "loss": 0.6484, + "step": 3459 + }, + { + "epoch": 0.7974187600829684, + "grad_norm": 0.15496329963207245, + "learning_rate": 1.0385416605823323e-05, + "loss": 0.6356, + "step": 3460 + }, + { + "epoch": 0.7976492279327034, + "grad_norm": 0.15835978090763092, + "learning_rate": 1.0362651753257668e-05, + "loss": 0.6446, + "step": 3461 + }, + { + "epoch": 0.7978796957824383, + "grad_norm": 0.16143862903118134, + "learning_rate": 1.0339908993598518e-05, + "loss": 0.6367, + "step": 3462 + }, + { + "epoch": 0.7981101636321734, + "grad_norm": 0.1462894082069397, + "learning_rate": 1.0317188339522188e-05, + "loss": 0.647, + "step": 3463 + }, + { + "epoch": 0.7983406314819083, + "grad_norm": 0.14987273514270782, + "learning_rate": 1.029448980369262e-05, + "loss": 0.6468, + "step": 3464 + }, + { + "epoch": 0.7985710993316433, + "grad_norm": 0.1489580124616623, + "learning_rate": 1.0271813398761405e-05, + "loss": 0.6474, + "step": 3465 + }, + { + "epoch": 0.7988015671813782, + "grad_norm": 0.1617879867553711, + "learning_rate": 1.0249159137367842e-05, + "loss": 0.6429, + "step": 3466 + }, + { + "epoch": 0.7990320350311132, + "grad_norm": 0.1757936030626297, + "learning_rate": 1.0226527032138878e-05, + "loss": 0.6353, + "step": 3467 + }, + { + "epoch": 0.7992625028808481, + "grad_norm": 0.15790215134620667, + "learning_rate": 1.0203917095689097e-05, + "loss": 0.6442, + "step": 3468 + }, + { + "epoch": 0.7994929707305831, + "grad_norm": 0.155640110373497, + "learning_rate": 1.0181329340620743e-05, + "loss": 0.643, + "step": 3469 + }, + { + "epoch": 0.799723438580318, + "grad_norm": 0.16087128221988678, + "learning_rate": 1.0158763779523695e-05, + "loss": 0.6401, + "step": 3470 + }, + { + "epoch": 0.799953906430053, + "grad_norm": 0.16374313831329346, + "learning_rate": 1.0136220424975435e-05, + "loss": 0.6386, + "step": 3471 + }, + { + "epoch": 0.800184374279788, + "grad_norm": 0.16088031232357025, + "learning_rate": 1.011369928954108e-05, + "loss": 0.6392, + "step": 3472 + }, + { + "epoch": 0.800414842129523, + "grad_norm": 0.1503087878227234, + "learning_rate": 1.0091200385773408e-05, + "loss": 0.6402, + "step": 3473 + }, + { + "epoch": 0.8006453099792579, + "grad_norm": 0.16075649857521057, + "learning_rate": 1.0068723726212742e-05, + "loss": 0.6463, + "step": 3474 + }, + { + "epoch": 0.8008757778289929, + "grad_norm": 0.16603581607341766, + "learning_rate": 1.0046269323387036e-05, + "loss": 0.643, + "step": 3475 + }, + { + "epoch": 0.8011062456787278, + "grad_norm": 0.15744850039482117, + "learning_rate": 1.0023837189811835e-05, + "loss": 0.6437, + "step": 3476 + }, + { + "epoch": 0.8013367135284628, + "grad_norm": 0.16175509989261627, + "learning_rate": 1.0001427337990277e-05, + "loss": 0.6434, + "step": 3477 + }, + { + "epoch": 0.8015671813781977, + "grad_norm": 0.17071296274662018, + "learning_rate": 9.979039780413068e-06, + "loss": 0.6449, + "step": 3478 + }, + { + "epoch": 0.8017976492279327, + "grad_norm": 0.15305480360984802, + "learning_rate": 9.956674529558518e-06, + "loss": 0.6427, + "step": 3479 + }, + { + "epoch": 0.8020281170776676, + "grad_norm": 0.16541430354118347, + "learning_rate": 9.934331597892448e-06, + "loss": 0.6406, + "step": 3480 + }, + { + "epoch": 0.8022585849274027, + "grad_norm": 0.15421070158481598, + "learning_rate": 9.912010997868287e-06, + "loss": 0.6473, + "step": 3481 + }, + { + "epoch": 0.8024890527771376, + "grad_norm": 0.15333889424800873, + "learning_rate": 9.889712741926998e-06, + "loss": 0.6405, + "step": 3482 + }, + { + "epoch": 0.8027195206268726, + "grad_norm": 0.15218707919120789, + "learning_rate": 9.867436842497103e-06, + "loss": 0.632, + "step": 3483 + }, + { + "epoch": 0.8029499884766075, + "grad_norm": 0.15634161233901978, + "learning_rate": 9.845183311994637e-06, + "loss": 0.6378, + "step": 3484 + }, + { + "epoch": 0.8031804563263425, + "grad_norm": 0.157097727060318, + "learning_rate": 9.822952162823201e-06, + "loss": 0.6448, + "step": 3485 + }, + { + "epoch": 0.8034109241760774, + "grad_norm": 0.15480422973632812, + "learning_rate": 9.800743407373896e-06, + "loss": 0.6354, + "step": 3486 + }, + { + "epoch": 0.8036413920258124, + "grad_norm": 0.15486778318881989, + "learning_rate": 9.778557058025356e-06, + "loss": 0.6309, + "step": 3487 + }, + { + "epoch": 0.8038718598755473, + "grad_norm": 0.15796728432178497, + "learning_rate": 9.756393127143709e-06, + "loss": 0.6334, + "step": 3488 + }, + { + "epoch": 0.8041023277252823, + "grad_norm": 0.15152916312217712, + "learning_rate": 9.734251627082613e-06, + "loss": 0.6429, + "step": 3489 + }, + { + "epoch": 0.8043327955750172, + "grad_norm": 0.1551668345928192, + "learning_rate": 9.71213257018319e-06, + "loss": 0.6454, + "step": 3490 + }, + { + "epoch": 0.8045632634247523, + "grad_norm": 0.16736368834972382, + "learning_rate": 9.69003596877408e-06, + "loss": 0.6362, + "step": 3491 + }, + { + "epoch": 0.8047937312744872, + "grad_norm": 0.14687344431877136, + "learning_rate": 9.667961835171402e-06, + "loss": 0.6366, + "step": 3492 + }, + { + "epoch": 0.8050241991242222, + "grad_norm": 0.1573791354894638, + "learning_rate": 9.645910181678741e-06, + "loss": 0.6338, + "step": 3493 + }, + { + "epoch": 0.8052546669739571, + "grad_norm": 0.1550455242395401, + "learning_rate": 9.62388102058716e-06, + "loss": 0.6381, + "step": 3494 + }, + { + "epoch": 0.8054851348236921, + "grad_norm": 0.15506303310394287, + "learning_rate": 9.601874364175206e-06, + "loss": 0.6391, + "step": 3495 + }, + { + "epoch": 0.805715602673427, + "grad_norm": 0.15035480260849, + "learning_rate": 9.579890224708826e-06, + "loss": 0.6455, + "step": 3496 + }, + { + "epoch": 0.805946070523162, + "grad_norm": 0.1552959680557251, + "learning_rate": 9.557928614441458e-06, + "loss": 0.641, + "step": 3497 + }, + { + "epoch": 0.8061765383728969, + "grad_norm": 0.17842282354831696, + "learning_rate": 9.535989545614016e-06, + "loss": 0.6346, + "step": 3498 + }, + { + "epoch": 0.806407006222632, + "grad_norm": 0.15210920572280884, + "learning_rate": 9.514073030454762e-06, + "loss": 0.6414, + "step": 3499 + }, + { + "epoch": 0.8066374740723669, + "grad_norm": 0.16027803719043732, + "learning_rate": 9.49217908117946e-06, + "loss": 0.6539, + "step": 3500 + }, + { + "epoch": 0.8068679419221019, + "grad_norm": 0.1576579064130783, + "learning_rate": 9.470307709991267e-06, + "loss": 0.6397, + "step": 3501 + }, + { + "epoch": 0.8070984097718368, + "grad_norm": 0.15845666825771332, + "learning_rate": 9.448458929080756e-06, + "loss": 0.6395, + "step": 3502 + }, + { + "epoch": 0.8073288776215718, + "grad_norm": 0.15758542716503143, + "learning_rate": 9.426632750625918e-06, + "loss": 0.6437, + "step": 3503 + }, + { + "epoch": 0.8075593454713067, + "grad_norm": 0.15715070068836212, + "learning_rate": 9.404829186792152e-06, + "loss": 0.6347, + "step": 3504 + }, + { + "epoch": 0.8077898133210417, + "grad_norm": 0.15392762422561646, + "learning_rate": 9.383048249732217e-06, + "loss": 0.6407, + "step": 3505 + }, + { + "epoch": 0.8080202811707766, + "grad_norm": 0.15316665172576904, + "learning_rate": 9.3612899515863e-06, + "loss": 0.6394, + "step": 3506 + }, + { + "epoch": 0.8082507490205116, + "grad_norm": 0.15614305436611176, + "learning_rate": 9.339554304481951e-06, + "loss": 0.6348, + "step": 3507 + }, + { + "epoch": 0.8084812168702467, + "grad_norm": 0.16622847318649292, + "learning_rate": 9.317841320534092e-06, + "loss": 0.6378, + "step": 3508 + }, + { + "epoch": 0.8087116847199816, + "grad_norm": 0.1508837789297104, + "learning_rate": 9.296151011845034e-06, + "loss": 0.6428, + "step": 3509 + }, + { + "epoch": 0.8089421525697166, + "grad_norm": 0.15079739689826965, + "learning_rate": 9.27448339050443e-06, + "loss": 0.6462, + "step": 3510 + }, + { + "epoch": 0.8091726204194515, + "grad_norm": 0.15400461852550507, + "learning_rate": 9.252838468589265e-06, + "loss": 0.6382, + "step": 3511 + }, + { + "epoch": 0.8094030882691865, + "grad_norm": 0.16339372098445892, + "learning_rate": 9.231216258163939e-06, + "loss": 0.6392, + "step": 3512 + }, + { + "epoch": 0.8096335561189214, + "grad_norm": 0.15596559643745422, + "learning_rate": 9.209616771280139e-06, + "loss": 0.6341, + "step": 3513 + }, + { + "epoch": 0.8098640239686564, + "grad_norm": 0.1570686399936676, + "learning_rate": 9.18804001997689e-06, + "loss": 0.6424, + "step": 3514 + }, + { + "epoch": 0.8100944918183913, + "grad_norm": 0.1516939401626587, + "learning_rate": 9.166486016280562e-06, + "loss": 0.6523, + "step": 3515 + }, + { + "epoch": 0.8103249596681263, + "grad_norm": 0.15133339166641235, + "learning_rate": 9.14495477220484e-06, + "loss": 0.6376, + "step": 3516 + }, + { + "epoch": 0.8105554275178612, + "grad_norm": 0.1549098789691925, + "learning_rate": 9.12344629975072e-06, + "loss": 0.6424, + "step": 3517 + }, + { + "epoch": 0.8107858953675963, + "grad_norm": 0.1573808342218399, + "learning_rate": 9.101960610906519e-06, + "loss": 0.6441, + "step": 3518 + }, + { + "epoch": 0.8110163632173312, + "grad_norm": 0.16286353766918182, + "learning_rate": 9.08049771764784e-06, + "loss": 0.639, + "step": 3519 + }, + { + "epoch": 0.8112468310670662, + "grad_norm": 0.15851277112960815, + "learning_rate": 9.059057631937567e-06, + "loss": 0.642, + "step": 3520 + }, + { + "epoch": 0.8114772989168011, + "grad_norm": 0.15502890944480896, + "learning_rate": 9.037640365725897e-06, + "loss": 0.6326, + "step": 3521 + }, + { + "epoch": 0.8117077667665361, + "grad_norm": 0.15994207561016083, + "learning_rate": 9.01624593095033e-06, + "loss": 0.6448, + "step": 3522 + }, + { + "epoch": 0.811938234616271, + "grad_norm": 0.1541290283203125, + "learning_rate": 8.994874339535569e-06, + "loss": 0.6481, + "step": 3523 + }, + { + "epoch": 0.812168702466006, + "grad_norm": 0.14977748692035675, + "learning_rate": 8.973525603393645e-06, + "loss": 0.6415, + "step": 3524 + }, + { + "epoch": 0.8123991703157409, + "grad_norm": 0.15313945710659027, + "learning_rate": 8.952199734423843e-06, + "loss": 0.6383, + "step": 3525 + }, + { + "epoch": 0.812629638165476, + "grad_norm": 0.15545286238193512, + "learning_rate": 8.930896744512652e-06, + "loss": 0.6438, + "step": 3526 + }, + { + "epoch": 0.8128601060152109, + "grad_norm": 0.15130577981472015, + "learning_rate": 8.909616645533886e-06, + "loss": 0.6466, + "step": 3527 + }, + { + "epoch": 0.8130905738649459, + "grad_norm": 0.15956194698810577, + "learning_rate": 8.888359449348555e-06, + "loss": 0.6353, + "step": 3528 + }, + { + "epoch": 0.8133210417146808, + "grad_norm": 0.15058216452598572, + "learning_rate": 8.867125167804896e-06, + "loss": 0.6374, + "step": 3529 + }, + { + "epoch": 0.8135515095644158, + "grad_norm": 0.14475852251052856, + "learning_rate": 8.845913812738394e-06, + "loss": 0.6442, + "step": 3530 + }, + { + "epoch": 0.8137819774141507, + "grad_norm": 0.14834396541118622, + "learning_rate": 8.824725395971745e-06, + "loss": 0.6469, + "step": 3531 + }, + { + "epoch": 0.8140124452638857, + "grad_norm": 0.15428373217582703, + "learning_rate": 8.803559929314869e-06, + "loss": 0.6378, + "step": 3532 + }, + { + "epoch": 0.8142429131136206, + "grad_norm": 0.15115933120250702, + "learning_rate": 8.782417424564893e-06, + "loss": 0.6415, + "step": 3533 + }, + { + "epoch": 0.8144733809633556, + "grad_norm": 0.14465634524822235, + "learning_rate": 8.761297893506149e-06, + "loss": 0.6432, + "step": 3534 + }, + { + "epoch": 0.8147038488130905, + "grad_norm": 0.15278743207454681, + "learning_rate": 8.740201347910132e-06, + "loss": 0.6305, + "step": 3535 + }, + { + "epoch": 0.8149343166628256, + "grad_norm": 0.15596990287303925, + "learning_rate": 8.719127799535547e-06, + "loss": 0.6451, + "step": 3536 + }, + { + "epoch": 0.8151647845125605, + "grad_norm": 0.15721940994262695, + "learning_rate": 8.698077260128329e-06, + "loss": 0.636, + "step": 3537 + }, + { + "epoch": 0.8153952523622955, + "grad_norm": 0.15862002968788147, + "learning_rate": 8.677049741421506e-06, + "loss": 0.643, + "step": 3538 + }, + { + "epoch": 0.8156257202120304, + "grad_norm": 0.15502506494522095, + "learning_rate": 8.656045255135314e-06, + "loss": 0.641, + "step": 3539 + }, + { + "epoch": 0.8158561880617654, + "grad_norm": 0.1488857865333557, + "learning_rate": 8.635063812977156e-06, + "loss": 0.6389, + "step": 3540 + }, + { + "epoch": 0.8160866559115003, + "grad_norm": 0.1511356681585312, + "learning_rate": 8.61410542664159e-06, + "loss": 0.6383, + "step": 3541 + }, + { + "epoch": 0.8163171237612353, + "grad_norm": 0.15279202163219452, + "learning_rate": 8.593170107810312e-06, + "loss": 0.633, + "step": 3542 + }, + { + "epoch": 0.8165475916109702, + "grad_norm": 0.1624889373779297, + "learning_rate": 8.572257868152172e-06, + "loss": 0.6456, + "step": 3543 + }, + { + "epoch": 0.8167780594607053, + "grad_norm": 0.15917298197746277, + "learning_rate": 8.551368719323139e-06, + "loss": 0.6379, + "step": 3544 + }, + { + "epoch": 0.8170085273104402, + "grad_norm": 0.16097372770309448, + "learning_rate": 8.530502672966328e-06, + "loss": 0.6475, + "step": 3545 + }, + { + "epoch": 0.8172389951601752, + "grad_norm": 0.14943112432956696, + "learning_rate": 8.509659740711973e-06, + "loss": 0.646, + "step": 3546 + }, + { + "epoch": 0.8174694630099101, + "grad_norm": 0.15080755949020386, + "learning_rate": 8.488839934177422e-06, + "loss": 0.6347, + "step": 3547 + }, + { + "epoch": 0.8176999308596451, + "grad_norm": 0.16167816519737244, + "learning_rate": 8.46804326496714e-06, + "loss": 0.6356, + "step": 3548 + }, + { + "epoch": 0.81793039870938, + "grad_norm": 0.15981729328632355, + "learning_rate": 8.447269744672703e-06, + "loss": 0.6338, + "step": 3549 + }, + { + "epoch": 0.818160866559115, + "grad_norm": 0.1530756950378418, + "learning_rate": 8.426519384872733e-06, + "loss": 0.6392, + "step": 3550 + }, + { + "epoch": 0.8183913344088499, + "grad_norm": 0.1670352965593338, + "learning_rate": 8.405792197133022e-06, + "loss": 0.6457, + "step": 3551 + }, + { + "epoch": 0.8186218022585849, + "grad_norm": 0.1554369479417801, + "learning_rate": 8.385088193006407e-06, + "loss": 0.6376, + "step": 3552 + }, + { + "epoch": 0.8188522701083198, + "grad_norm": 0.15348638594150543, + "learning_rate": 8.364407384032775e-06, + "loss": 0.6355, + "step": 3553 + }, + { + "epoch": 0.8190827379580549, + "grad_norm": 0.15805724263191223, + "learning_rate": 8.343749781739125e-06, + "loss": 0.6443, + "step": 3554 + }, + { + "epoch": 0.8193132058077898, + "grad_norm": 0.15739752352237701, + "learning_rate": 8.323115397639513e-06, + "loss": 0.6445, + "step": 3555 + }, + { + "epoch": 0.8195436736575248, + "grad_norm": 0.16260041296482086, + "learning_rate": 8.302504243235043e-06, + "loss": 0.6431, + "step": 3556 + }, + { + "epoch": 0.8197741415072597, + "grad_norm": 0.14841555058956146, + "learning_rate": 8.281916330013889e-06, + "loss": 0.6307, + "step": 3557 + }, + { + "epoch": 0.8200046093569947, + "grad_norm": 0.15895935893058777, + "learning_rate": 8.261351669451256e-06, + "loss": 0.6315, + "step": 3558 + }, + { + "epoch": 0.8202350772067296, + "grad_norm": 0.14655686914920807, + "learning_rate": 8.240810273009381e-06, + "loss": 0.6334, + "step": 3559 + }, + { + "epoch": 0.8204655450564646, + "grad_norm": 0.1525675356388092, + "learning_rate": 8.220292152137554e-06, + "loss": 0.6474, + "step": 3560 + }, + { + "epoch": 0.8206960129061995, + "grad_norm": 0.15452326834201813, + "learning_rate": 8.199797318272085e-06, + "loss": 0.6402, + "step": 3561 + }, + { + "epoch": 0.8209264807559346, + "grad_norm": 0.15377068519592285, + "learning_rate": 8.179325782836295e-06, + "loss": 0.6416, + "step": 3562 + }, + { + "epoch": 0.8211569486056695, + "grad_norm": 0.15333302319049835, + "learning_rate": 8.158877557240529e-06, + "loss": 0.6398, + "step": 3563 + }, + { + "epoch": 0.8213874164554045, + "grad_norm": 0.14731299877166748, + "learning_rate": 8.138452652882156e-06, + "loss": 0.6459, + "step": 3564 + }, + { + "epoch": 0.8216178843051394, + "grad_norm": 0.15332041680812836, + "learning_rate": 8.118051081145484e-06, + "loss": 0.6408, + "step": 3565 + }, + { + "epoch": 0.8218483521548744, + "grad_norm": 0.15887399017810822, + "learning_rate": 8.097672853401894e-06, + "loss": 0.643, + "step": 3566 + }, + { + "epoch": 0.8220788200046094, + "grad_norm": 0.14689992368221283, + "learning_rate": 8.07731798100973e-06, + "loss": 0.6382, + "step": 3567 + }, + { + "epoch": 0.8223092878543443, + "grad_norm": 0.15063437819480896, + "learning_rate": 8.056986475314283e-06, + "loss": 0.6366, + "step": 3568 + }, + { + "epoch": 0.8225397557040793, + "grad_norm": 0.15097613632678986, + "learning_rate": 8.036678347647853e-06, + "loss": 0.6489, + "step": 3569 + }, + { + "epoch": 0.8227702235538142, + "grad_norm": 0.14924323558807373, + "learning_rate": 8.016393609329703e-06, + "loss": 0.6397, + "step": 3570 + }, + { + "epoch": 0.8230006914035493, + "grad_norm": 0.14785458147525787, + "learning_rate": 7.996132271666062e-06, + "loss": 0.6405, + "step": 3571 + }, + { + "epoch": 0.8232311592532842, + "grad_norm": 0.1523989886045456, + "learning_rate": 7.975894345950114e-06, + "loss": 0.6478, + "step": 3572 + }, + { + "epoch": 0.8234616271030192, + "grad_norm": 0.152211531996727, + "learning_rate": 7.955679843462005e-06, + "loss": 0.6379, + "step": 3573 + }, + { + "epoch": 0.8236920949527541, + "grad_norm": 0.14990997314453125, + "learning_rate": 7.935488775468791e-06, + "loss": 0.6435, + "step": 3574 + }, + { + "epoch": 0.8239225628024891, + "grad_norm": 0.1513289213180542, + "learning_rate": 7.915321153224487e-06, + "loss": 0.6429, + "step": 3575 + }, + { + "epoch": 0.824153030652224, + "grad_norm": 0.150692880153656, + "learning_rate": 7.895176987970071e-06, + "loss": 0.6436, + "step": 3576 + }, + { + "epoch": 0.824383498501959, + "grad_norm": 0.14454329013824463, + "learning_rate": 7.875056290933391e-06, + "loss": 0.6294, + "step": 3577 + }, + { + "epoch": 0.8246139663516939, + "grad_norm": 0.14884406328201294, + "learning_rate": 7.85495907332925e-06, + "loss": 0.639, + "step": 3578 + }, + { + "epoch": 0.8248444342014289, + "grad_norm": 0.15509894490242004, + "learning_rate": 7.83488534635936e-06, + "loss": 0.6402, + "step": 3579 + }, + { + "epoch": 0.8250749020511638, + "grad_norm": 0.15603148937225342, + "learning_rate": 7.814835121212305e-06, + "loss": 0.6336, + "step": 3580 + }, + { + "epoch": 0.8253053699008989, + "grad_norm": 0.1463710367679596, + "learning_rate": 7.794808409063636e-06, + "loss": 0.6454, + "step": 3581 + }, + { + "epoch": 0.8255358377506338, + "grad_norm": 0.15044938027858734, + "learning_rate": 7.774805221075764e-06, + "loss": 0.6402, + "step": 3582 + }, + { + "epoch": 0.8257663056003688, + "grad_norm": 0.1514042317867279, + "learning_rate": 7.754825568397955e-06, + "loss": 0.6413, + "step": 3583 + }, + { + "epoch": 0.8259967734501037, + "grad_norm": 0.15234024822711945, + "learning_rate": 7.7348694621664e-06, + "loss": 0.6444, + "step": 3584 + }, + { + "epoch": 0.8262272412998387, + "grad_norm": 0.14573800563812256, + "learning_rate": 7.71493691350416e-06, + "loss": 0.6449, + "step": 3585 + }, + { + "epoch": 0.8264577091495736, + "grad_norm": 0.14887578785419464, + "learning_rate": 7.695027933521154e-06, + "loss": 0.6321, + "step": 3586 + }, + { + "epoch": 0.8266881769993086, + "grad_norm": 0.15681131184101105, + "learning_rate": 7.675142533314172e-06, + "loss": 0.6428, + "step": 3587 + }, + { + "epoch": 0.8269186448490435, + "grad_norm": 0.15508340299129486, + "learning_rate": 7.655280723966868e-06, + "loss": 0.6383, + "step": 3588 + }, + { + "epoch": 0.8271491126987786, + "grad_norm": 0.15364862978458405, + "learning_rate": 7.635442516549719e-06, + "loss": 0.636, + "step": 3589 + }, + { + "epoch": 0.8273795805485135, + "grad_norm": 0.1485067903995514, + "learning_rate": 7.615627922120056e-06, + "loss": 0.6457, + "step": 3590 + }, + { + "epoch": 0.8276100483982485, + "grad_norm": 0.14897732436656952, + "learning_rate": 7.5958369517221075e-06, + "loss": 0.6437, + "step": 3591 + }, + { + "epoch": 0.8278405162479834, + "grad_norm": 0.1528944969177246, + "learning_rate": 7.576069616386838e-06, + "loss": 0.6447, + "step": 3592 + }, + { + "epoch": 0.8280709840977184, + "grad_norm": 0.15931028127670288, + "learning_rate": 7.556325927132102e-06, + "loss": 0.6421, + "step": 3593 + }, + { + "epoch": 0.8283014519474533, + "grad_norm": 0.15149252116680145, + "learning_rate": 7.536605894962562e-06, + "loss": 0.6398, + "step": 3594 + }, + { + "epoch": 0.8285319197971883, + "grad_norm": 0.14745914936065674, + "learning_rate": 7.5169095308696865e-06, + "loss": 0.65, + "step": 3595 + }, + { + "epoch": 0.8287623876469232, + "grad_norm": 0.1486995816230774, + "learning_rate": 7.497236845831751e-06, + "loss": 0.6366, + "step": 3596 + }, + { + "epoch": 0.8289928554966582, + "grad_norm": 0.14624127745628357, + "learning_rate": 7.4775878508138606e-06, + "loss": 0.6405, + "step": 3597 + }, + { + "epoch": 0.8292233233463931, + "grad_norm": 0.1482323855161667, + "learning_rate": 7.457962556767866e-06, + "loss": 0.6405, + "step": 3598 + }, + { + "epoch": 0.8294537911961282, + "grad_norm": 0.14901240170001984, + "learning_rate": 7.438360974632441e-06, + "loss": 0.6402, + "step": 3599 + }, + { + "epoch": 0.8296842590458631, + "grad_norm": 0.15405641496181488, + "learning_rate": 7.418783115333045e-06, + "loss": 0.6439, + "step": 3600 + }, + { + "epoch": 0.8299147268955981, + "grad_norm": 0.14017413556575775, + "learning_rate": 7.399228989781898e-06, + "loss": 0.6411, + "step": 3601 + }, + { + "epoch": 0.830145194745333, + "grad_norm": 0.14487187564373016, + "learning_rate": 7.3796986088780105e-06, + "loss": 0.6315, + "step": 3602 + }, + { + "epoch": 0.830375662595068, + "grad_norm": 0.15042652189731598, + "learning_rate": 7.360191983507153e-06, + "loss": 0.6384, + "step": 3603 + }, + { + "epoch": 0.8306061304448029, + "grad_norm": 0.15280528366565704, + "learning_rate": 7.340709124541817e-06, + "loss": 0.6446, + "step": 3604 + }, + { + "epoch": 0.8308365982945379, + "grad_norm": 0.1426979899406433, + "learning_rate": 7.321250042841316e-06, + "loss": 0.6448, + "step": 3605 + }, + { + "epoch": 0.8310670661442728, + "grad_norm": 0.14999282360076904, + "learning_rate": 7.301814749251679e-06, + "loss": 0.6387, + "step": 3606 + }, + { + "epoch": 0.8312975339940079, + "grad_norm": 0.151020810008049, + "learning_rate": 7.2824032546056354e-06, + "loss": 0.6489, + "step": 3607 + }, + { + "epoch": 0.8315280018437428, + "grad_norm": 0.15103960037231445, + "learning_rate": 7.2630155697227146e-06, + "loss": 0.6448, + "step": 3608 + }, + { + "epoch": 0.8317584696934778, + "grad_norm": 0.14460238814353943, + "learning_rate": 7.243651705409132e-06, + "loss": 0.6409, + "step": 3609 + }, + { + "epoch": 0.8319889375432127, + "grad_norm": 0.14994823932647705, + "learning_rate": 7.2243116724578565e-06, + "loss": 0.6336, + "step": 3610 + }, + { + "epoch": 0.8322194053929477, + "grad_norm": 0.15003249049186707, + "learning_rate": 7.2049954816485465e-06, + "loss": 0.6397, + "step": 3611 + }, + { + "epoch": 0.8324498732426826, + "grad_norm": 0.15217861533164978, + "learning_rate": 7.185703143747596e-06, + "loss": 0.6425, + "step": 3612 + }, + { + "epoch": 0.8326803410924176, + "grad_norm": 0.1463039368391037, + "learning_rate": 7.166434669508071e-06, + "loss": 0.6341, + "step": 3613 + }, + { + "epoch": 0.8329108089421525, + "grad_norm": 0.1474071443080902, + "learning_rate": 7.147190069669768e-06, + "loss": 0.6397, + "step": 3614 + }, + { + "epoch": 0.8331412767918875, + "grad_norm": 0.14856012165546417, + "learning_rate": 7.127969354959163e-06, + "loss": 0.6444, + "step": 3615 + }, + { + "epoch": 0.8333717446416224, + "grad_norm": 0.147612527012825, + "learning_rate": 7.1087725360894195e-06, + "loss": 0.6347, + "step": 3616 + }, + { + "epoch": 0.8336022124913575, + "grad_norm": 0.15374769270420074, + "learning_rate": 7.08959962376039e-06, + "loss": 0.6435, + "step": 3617 + }, + { + "epoch": 0.8338326803410924, + "grad_norm": 0.14860646426677704, + "learning_rate": 7.070450628658592e-06, + "loss": 0.646, + "step": 3618 + }, + { + "epoch": 0.8340631481908274, + "grad_norm": 0.1480385959148407, + "learning_rate": 7.051325561457217e-06, + "loss": 0.6384, + "step": 3619 + }, + { + "epoch": 0.8342936160405623, + "grad_norm": 0.1520072966814041, + "learning_rate": 7.0322244328161144e-06, + "loss": 0.6458, + "step": 3620 + }, + { + "epoch": 0.8345240838902973, + "grad_norm": 0.14834314584732056, + "learning_rate": 7.013147253381797e-06, + "loss": 0.6341, + "step": 3621 + }, + { + "epoch": 0.8347545517400322, + "grad_norm": 0.14721745252609253, + "learning_rate": 6.99409403378744e-06, + "loss": 0.6417, + "step": 3622 + }, + { + "epoch": 0.8349850195897672, + "grad_norm": 0.1415952891111374, + "learning_rate": 6.975064784652829e-06, + "loss": 0.6443, + "step": 3623 + }, + { + "epoch": 0.8352154874395021, + "grad_norm": 0.14462348818778992, + "learning_rate": 6.9560595165844175e-06, + "loss": 0.633, + "step": 3624 + }, + { + "epoch": 0.8354459552892372, + "grad_norm": 0.1447225958108902, + "learning_rate": 6.937078240175287e-06, + "loss": 0.6442, + "step": 3625 + }, + { + "epoch": 0.8356764231389722, + "grad_norm": 0.1561073660850525, + "learning_rate": 6.918120966005148e-06, + "loss": 0.6468, + "step": 3626 + }, + { + "epoch": 0.8359068909887071, + "grad_norm": 0.1541508585214615, + "learning_rate": 6.899187704640325e-06, + "loss": 0.6415, + "step": 3627 + }, + { + "epoch": 0.8361373588384421, + "grad_norm": 0.15150463581085205, + "learning_rate": 6.880278466633783e-06, + "loss": 0.6413, + "step": 3628 + }, + { + "epoch": 0.836367826688177, + "grad_norm": 0.14670462906360626, + "learning_rate": 6.861393262525035e-06, + "loss": 0.6488, + "step": 3629 + }, + { + "epoch": 0.836598294537912, + "grad_norm": 0.14568495750427246, + "learning_rate": 6.842532102840277e-06, + "loss": 0.6333, + "step": 3630 + }, + { + "epoch": 0.8368287623876469, + "grad_norm": 0.15337829291820526, + "learning_rate": 6.823694998092273e-06, + "loss": 0.6408, + "step": 3631 + }, + { + "epoch": 0.8370592302373819, + "grad_norm": 0.14713677763938904, + "learning_rate": 6.80488195878034e-06, + "loss": 0.6367, + "step": 3632 + }, + { + "epoch": 0.8372896980871168, + "grad_norm": 0.14200221002101898, + "learning_rate": 6.786092995390436e-06, + "loss": 0.6303, + "step": 3633 + }, + { + "epoch": 0.8375201659368519, + "grad_norm": 0.15284350514411926, + "learning_rate": 6.7673281183950665e-06, + "loss": 0.6438, + "step": 3634 + }, + { + "epoch": 0.8377506337865868, + "grad_norm": 0.147255539894104, + "learning_rate": 6.748587338253337e-06, + "loss": 0.643, + "step": 3635 + }, + { + "epoch": 0.8379811016363218, + "grad_norm": 0.14654171466827393, + "learning_rate": 6.729870665410898e-06, + "loss": 0.6404, + "step": 3636 + }, + { + "epoch": 0.8382115694860567, + "grad_norm": 0.15386711061000824, + "learning_rate": 6.711178110299993e-06, + "loss": 0.6394, + "step": 3637 + }, + { + "epoch": 0.8384420373357917, + "grad_norm": 0.15187443792819977, + "learning_rate": 6.692509683339371e-06, + "loss": 0.6411, + "step": 3638 + }, + { + "epoch": 0.8386725051855266, + "grad_norm": 0.14688163995742798, + "learning_rate": 6.673865394934376e-06, + "loss": 0.6341, + "step": 3639 + }, + { + "epoch": 0.8389029730352616, + "grad_norm": 0.15309026837348938, + "learning_rate": 6.655245255476911e-06, + "loss": 0.6373, + "step": 3640 + }, + { + "epoch": 0.8391334408849965, + "grad_norm": 0.15117040276527405, + "learning_rate": 6.6366492753453695e-06, + "loss": 0.6554, + "step": 3641 + }, + { + "epoch": 0.8393639087347315, + "grad_norm": 0.15723955631256104, + "learning_rate": 6.61807746490471e-06, + "loss": 0.6349, + "step": 3642 + }, + { + "epoch": 0.8395943765844665, + "grad_norm": 0.15363402664661407, + "learning_rate": 6.59952983450643e-06, + "loss": 0.641, + "step": 3643 + }, + { + "epoch": 0.8398248444342015, + "grad_norm": 0.14676257967948914, + "learning_rate": 6.581006394488493e-06, + "loss": 0.6338, + "step": 3644 + }, + { + "epoch": 0.8400553122839364, + "grad_norm": 0.14586247503757477, + "learning_rate": 6.562507155175457e-06, + "loss": 0.6316, + "step": 3645 + }, + { + "epoch": 0.8402857801336714, + "grad_norm": 0.1537342369556427, + "learning_rate": 6.544032126878358e-06, + "loss": 0.6407, + "step": 3646 + }, + { + "epoch": 0.8405162479834063, + "grad_norm": 0.14989496767520905, + "learning_rate": 6.525581319894703e-06, + "loss": 0.6309, + "step": 3647 + }, + { + "epoch": 0.8407467158331413, + "grad_norm": 0.15396440029144287, + "learning_rate": 6.507154744508548e-06, + "loss": 0.6486, + "step": 3648 + }, + { + "epoch": 0.8409771836828762, + "grad_norm": 0.14274229109287262, + "learning_rate": 6.488752410990417e-06, + "loss": 0.6309, + "step": 3649 + }, + { + "epoch": 0.8412076515326112, + "grad_norm": 0.14844496548175812, + "learning_rate": 6.470374329597334e-06, + "loss": 0.6343, + "step": 3650 + }, + { + "epoch": 0.8414381193823461, + "grad_norm": 0.15259870886802673, + "learning_rate": 6.452020510572798e-06, + "loss": 0.6426, + "step": 3651 + }, + { + "epoch": 0.8416685872320812, + "grad_norm": 0.1534135341644287, + "learning_rate": 6.433690964146799e-06, + "loss": 0.6334, + "step": 3652 + }, + { + "epoch": 0.8418990550818161, + "grad_norm": 0.14689849317073822, + "learning_rate": 6.415385700535764e-06, + "loss": 0.6429, + "step": 3653 + }, + { + "epoch": 0.8421295229315511, + "grad_norm": 0.14878277480602264, + "learning_rate": 6.39710472994261e-06, + "loss": 0.6417, + "step": 3654 + }, + { + "epoch": 0.842359990781286, + "grad_norm": 0.1506912261247635, + "learning_rate": 6.378848062556741e-06, + "loss": 0.6453, + "step": 3655 + }, + { + "epoch": 0.842590458631021, + "grad_norm": 0.15941332280635834, + "learning_rate": 6.360615708553952e-06, + "loss": 0.6397, + "step": 3656 + }, + { + "epoch": 0.8428209264807559, + "grad_norm": 0.15402281284332275, + "learning_rate": 6.342407678096534e-06, + "loss": 0.6413, + "step": 3657 + }, + { + "epoch": 0.8430513943304909, + "grad_norm": 0.14595894515514374, + "learning_rate": 6.324223981333199e-06, + "loss": 0.6448, + "step": 3658 + }, + { + "epoch": 0.8432818621802258, + "grad_norm": 0.16061711311340332, + "learning_rate": 6.3060646283991106e-06, + "loss": 0.646, + "step": 3659 + }, + { + "epoch": 0.8435123300299608, + "grad_norm": 0.16146208345890045, + "learning_rate": 6.287929629415856e-06, + "loss": 0.6377, + "step": 3660 + }, + { + "epoch": 0.8437427978796957, + "grad_norm": 0.15115495026111603, + "learning_rate": 6.269818994491455e-06, + "loss": 0.6467, + "step": 3661 + }, + { + "epoch": 0.8439732657294308, + "grad_norm": 0.13822171092033386, + "learning_rate": 6.251732733720323e-06, + "loss": 0.639, + "step": 3662 + }, + { + "epoch": 0.8442037335791657, + "grad_norm": 0.14701151847839355, + "learning_rate": 6.23367085718331e-06, + "loss": 0.6439, + "step": 3663 + }, + { + "epoch": 0.8444342014289007, + "grad_norm": 0.1508350670337677, + "learning_rate": 6.215633374947683e-06, + "loss": 0.6387, + "step": 3664 + }, + { + "epoch": 0.8446646692786356, + "grad_norm": 0.15383285284042358, + "learning_rate": 6.197620297067097e-06, + "loss": 0.6405, + "step": 3665 + }, + { + "epoch": 0.8448951371283706, + "grad_norm": 0.1498384326696396, + "learning_rate": 6.179631633581612e-06, + "loss": 0.6461, + "step": 3666 + }, + { + "epoch": 0.8451256049781055, + "grad_norm": 0.15178535878658295, + "learning_rate": 6.1616673945176836e-06, + "loss": 0.6454, + "step": 3667 + }, + { + "epoch": 0.8453560728278405, + "grad_norm": 0.15506048500537872, + "learning_rate": 6.143727589888126e-06, + "loss": 0.6449, + "step": 3668 + }, + { + "epoch": 0.8455865406775754, + "grad_norm": 0.1453298181295395, + "learning_rate": 6.125812229692162e-06, + "loss": 0.6417, + "step": 3669 + }, + { + "epoch": 0.8458170085273105, + "grad_norm": 0.1514149308204651, + "learning_rate": 6.107921323915411e-06, + "loss": 0.6433, + "step": 3670 + }, + { + "epoch": 0.8460474763770454, + "grad_norm": 0.14425182342529297, + "learning_rate": 6.0900548825298e-06, + "loss": 0.6461, + "step": 3671 + }, + { + "epoch": 0.8462779442267804, + "grad_norm": 0.1488070785999298, + "learning_rate": 6.072212915493669e-06, + "loss": 0.6473, + "step": 3672 + }, + { + "epoch": 0.8465084120765153, + "grad_norm": 0.14930947124958038, + "learning_rate": 6.054395432751703e-06, + "loss": 0.6418, + "step": 3673 + }, + { + "epoch": 0.8467388799262503, + "grad_norm": 0.15142978727817535, + "learning_rate": 6.036602444234935e-06, + "loss": 0.638, + "step": 3674 + }, + { + "epoch": 0.8469693477759852, + "grad_norm": 0.13954748213291168, + "learning_rate": 6.018833959860753e-06, + "loss": 0.6415, + "step": 3675 + }, + { + "epoch": 0.8471998156257202, + "grad_norm": 0.14654095470905304, + "learning_rate": 6.001089989532893e-06, + "loss": 0.6398, + "step": 3676 + }, + { + "epoch": 0.8474302834754551, + "grad_norm": 0.14327792823314667, + "learning_rate": 5.9833705431413975e-06, + "loss": 0.6393, + "step": 3677 + }, + { + "epoch": 0.8476607513251901, + "grad_norm": 0.15117095410823822, + "learning_rate": 5.965675630562672e-06, + "loss": 0.6375, + "step": 3678 + }, + { + "epoch": 0.847891219174925, + "grad_norm": 0.14697414636611938, + "learning_rate": 5.948005261659434e-06, + "loss": 0.6385, + "step": 3679 + }, + { + "epoch": 0.8481216870246601, + "grad_norm": 0.14706385135650635, + "learning_rate": 5.930359446280726e-06, + "loss": 0.6391, + "step": 3680 + }, + { + "epoch": 0.848352154874395, + "grad_norm": 0.1475333720445633, + "learning_rate": 5.912738194261902e-06, + "loss": 0.636, + "step": 3681 + }, + { + "epoch": 0.84858262272413, + "grad_norm": 0.15489518642425537, + "learning_rate": 5.895141515424629e-06, + "loss": 0.6409, + "step": 3682 + }, + { + "epoch": 0.8488130905738649, + "grad_norm": 0.1442023515701294, + "learning_rate": 5.87756941957685e-06, + "loss": 0.6374, + "step": 3683 + }, + { + "epoch": 0.8490435584235999, + "grad_norm": 0.14666606485843658, + "learning_rate": 5.860021916512859e-06, + "loss": 0.6388, + "step": 3684 + }, + { + "epoch": 0.8492740262733348, + "grad_norm": 0.1419772058725357, + "learning_rate": 5.842499016013209e-06, + "loss": 0.6369, + "step": 3685 + }, + { + "epoch": 0.8495044941230698, + "grad_norm": 0.14429770410060883, + "learning_rate": 5.8250007278447205e-06, + "loss": 0.637, + "step": 3686 + }, + { + "epoch": 0.8497349619728048, + "grad_norm": 0.15203547477722168, + "learning_rate": 5.807527061760543e-06, + "loss": 0.6346, + "step": 3687 + }, + { + "epoch": 0.8499654298225398, + "grad_norm": 0.14554864168167114, + "learning_rate": 5.790078027500068e-06, + "loss": 0.6423, + "step": 3688 + }, + { + "epoch": 0.8501958976722748, + "grad_norm": 0.15379323065280914, + "learning_rate": 5.772653634788971e-06, + "loss": 0.645, + "step": 3689 + }, + { + "epoch": 0.8504263655220097, + "grad_norm": 0.15001268684864044, + "learning_rate": 5.755253893339185e-06, + "loss": 0.6349, + "step": 3690 + }, + { + "epoch": 0.8506568333717447, + "grad_norm": 0.15877242386341095, + "learning_rate": 5.737878812848929e-06, + "loss": 0.6364, + "step": 3691 + }, + { + "epoch": 0.8508873012214796, + "grad_norm": 0.15002503991127014, + "learning_rate": 5.720528403002634e-06, + "loss": 0.6319, + "step": 3692 + }, + { + "epoch": 0.8511177690712146, + "grad_norm": 0.1416318565607071, + "learning_rate": 5.703202673470992e-06, + "loss": 0.6432, + "step": 3693 + }, + { + "epoch": 0.8513482369209495, + "grad_norm": 0.15145903825759888, + "learning_rate": 5.685901633910989e-06, + "loss": 0.6373, + "step": 3694 + }, + { + "epoch": 0.8515787047706845, + "grad_norm": 0.1428060680627823, + "learning_rate": 5.668625293965774e-06, + "loss": 0.6411, + "step": 3695 + }, + { + "epoch": 0.8518091726204194, + "grad_norm": 0.1449238806962967, + "learning_rate": 5.6513736632647695e-06, + "loss": 0.6353, + "step": 3696 + }, + { + "epoch": 0.8520396404701545, + "grad_norm": 0.15147364139556885, + "learning_rate": 5.634146751423647e-06, + "loss": 0.6433, + "step": 3697 + }, + { + "epoch": 0.8522701083198894, + "grad_norm": 0.14954763650894165, + "learning_rate": 5.616944568044225e-06, + "loss": 0.6279, + "step": 3698 + }, + { + "epoch": 0.8525005761696244, + "grad_norm": 0.14778868854045868, + "learning_rate": 5.599767122714627e-06, + "loss": 0.6368, + "step": 3699 + }, + { + "epoch": 0.8527310440193593, + "grad_norm": 0.14133252203464508, + "learning_rate": 5.582614425009153e-06, + "loss": 0.6416, + "step": 3700 + }, + { + "epoch": 0.8529615118690943, + "grad_norm": 0.15429161489009857, + "learning_rate": 5.565486484488275e-06, + "loss": 0.6418, + "step": 3701 + }, + { + "epoch": 0.8531919797188292, + "grad_norm": 0.1481894999742508, + "learning_rate": 5.548383310698707e-06, + "loss": 0.635, + "step": 3702 + }, + { + "epoch": 0.8534224475685642, + "grad_norm": 0.14938104152679443, + "learning_rate": 5.531304913173357e-06, + "loss": 0.6503, + "step": 3703 + }, + { + "epoch": 0.8536529154182991, + "grad_norm": 0.14781972765922546, + "learning_rate": 5.514251301431306e-06, + "loss": 0.6352, + "step": 3704 + }, + { + "epoch": 0.8538833832680341, + "grad_norm": 0.1436912715435028, + "learning_rate": 5.497222484977826e-06, + "loss": 0.6331, + "step": 3705 + }, + { + "epoch": 0.854113851117769, + "grad_norm": 0.14232562482357025, + "learning_rate": 5.480218473304388e-06, + "loss": 0.6355, + "step": 3706 + }, + { + "epoch": 0.8543443189675041, + "grad_norm": 0.1478080451488495, + "learning_rate": 5.4632392758885985e-06, + "loss": 0.6371, + "step": 3707 + }, + { + "epoch": 0.854574786817239, + "grad_norm": 0.14770181477069855, + "learning_rate": 5.446284902194249e-06, + "loss": 0.6365, + "step": 3708 + }, + { + "epoch": 0.854805254666974, + "grad_norm": 0.14597497880458832, + "learning_rate": 5.429355361671335e-06, + "loss": 0.641, + "step": 3709 + }, + { + "epoch": 0.8550357225167089, + "grad_norm": 0.1463644802570343, + "learning_rate": 5.412450663755941e-06, + "loss": 0.6343, + "step": 3710 + }, + { + "epoch": 0.8552661903664439, + "grad_norm": 0.14388582110404968, + "learning_rate": 5.39557081787036e-06, + "loss": 0.6475, + "step": 3711 + }, + { + "epoch": 0.8554966582161788, + "grad_norm": 0.1477576494216919, + "learning_rate": 5.378715833423004e-06, + "loss": 0.6357, + "step": 3712 + }, + { + "epoch": 0.8557271260659138, + "grad_norm": 0.15977248549461365, + "learning_rate": 5.36188571980844e-06, + "loss": 0.6389, + "step": 3713 + }, + { + "epoch": 0.8559575939156487, + "grad_norm": 0.14352811872959137, + "learning_rate": 5.3450804864073665e-06, + "loss": 0.6389, + "step": 3714 + }, + { + "epoch": 0.8561880617653838, + "grad_norm": 0.13531458377838135, + "learning_rate": 5.328300142586629e-06, + "loss": 0.6385, + "step": 3715 + }, + { + "epoch": 0.8564185296151187, + "grad_norm": 0.15272001922130585, + "learning_rate": 5.311544697699172e-06, + "loss": 0.635, + "step": 3716 + }, + { + "epoch": 0.8566489974648537, + "grad_norm": 0.15149231255054474, + "learning_rate": 5.294814161084083e-06, + "loss": 0.634, + "step": 3717 + }, + { + "epoch": 0.8568794653145886, + "grad_norm": 0.13878074288368225, + "learning_rate": 5.278108542066562e-06, + "loss": 0.6344, + "step": 3718 + }, + { + "epoch": 0.8571099331643236, + "grad_norm": 0.143055722117424, + "learning_rate": 5.261427849957928e-06, + "loss": 0.6459, + "step": 3719 + }, + { + "epoch": 0.8573404010140585, + "grad_norm": 0.14342109858989716, + "learning_rate": 5.244772094055589e-06, + "loss": 0.6344, + "step": 3720 + }, + { + "epoch": 0.8575708688637935, + "grad_norm": 0.1506274938583374, + "learning_rate": 5.228141283643073e-06, + "loss": 0.6441, + "step": 3721 + }, + { + "epoch": 0.8578013367135284, + "grad_norm": 0.13922744989395142, + "learning_rate": 5.211535427989972e-06, + "loss": 0.6469, + "step": 3722 + }, + { + "epoch": 0.8580318045632634, + "grad_norm": 0.13600236177444458, + "learning_rate": 5.194954536352021e-06, + "loss": 0.6365, + "step": 3723 + }, + { + "epoch": 0.8582622724129984, + "grad_norm": 0.140496164560318, + "learning_rate": 5.178398617971003e-06, + "loss": 0.6401, + "step": 3724 + }, + { + "epoch": 0.8584927402627334, + "grad_norm": 0.14220167696475983, + "learning_rate": 5.161867682074773e-06, + "loss": 0.6382, + "step": 3725 + }, + { + "epoch": 0.8587232081124683, + "grad_norm": 0.14413121342658997, + "learning_rate": 5.145361737877291e-06, + "loss": 0.6251, + "step": 3726 + }, + { + "epoch": 0.8589536759622033, + "grad_norm": 0.14119820296764374, + "learning_rate": 5.128880794578572e-06, + "loss": 0.6368, + "step": 3727 + }, + { + "epoch": 0.8591841438119382, + "grad_norm": 0.14928089082241058, + "learning_rate": 5.112424861364701e-06, + "loss": 0.6405, + "step": 3728 + }, + { + "epoch": 0.8594146116616732, + "grad_norm": 0.1459737867116928, + "learning_rate": 5.095993947407818e-06, + "loss": 0.6333, + "step": 3729 + }, + { + "epoch": 0.8596450795114081, + "grad_norm": 0.1493435502052307, + "learning_rate": 5.079588061866125e-06, + "loss": 0.635, + "step": 3730 + }, + { + "epoch": 0.8598755473611431, + "grad_norm": 0.14926058053970337, + "learning_rate": 5.0632072138838584e-06, + "loss": 0.6535, + "step": 3731 + }, + { + "epoch": 0.860106015210878, + "grad_norm": 0.14926740527153015, + "learning_rate": 5.046851412591314e-06, + "loss": 0.6416, + "step": 3732 + }, + { + "epoch": 0.860336483060613, + "grad_norm": 0.148090198636055, + "learning_rate": 5.030520667104821e-06, + "loss": 0.637, + "step": 3733 + }, + { + "epoch": 0.860566950910348, + "grad_norm": 0.14074482023715973, + "learning_rate": 5.01421498652675e-06, + "loss": 0.6391, + "step": 3734 + }, + { + "epoch": 0.860797418760083, + "grad_norm": 0.14252926409244537, + "learning_rate": 4.997934379945491e-06, + "loss": 0.638, + "step": 3735 + }, + { + "epoch": 0.8610278866098179, + "grad_norm": 0.14792828261852264, + "learning_rate": 4.981678856435479e-06, + "loss": 0.6389, + "step": 3736 + }, + { + "epoch": 0.8612583544595529, + "grad_norm": 0.15630586445331573, + "learning_rate": 4.965448425057118e-06, + "loss": 0.6465, + "step": 3737 + }, + { + "epoch": 0.8614888223092878, + "grad_norm": 0.14542563259601593, + "learning_rate": 4.949243094856892e-06, + "loss": 0.64, + "step": 3738 + }, + { + "epoch": 0.8617192901590228, + "grad_norm": 0.1466866284608841, + "learning_rate": 4.933062874867267e-06, + "loss": 0.6441, + "step": 3739 + }, + { + "epoch": 0.8619497580087577, + "grad_norm": 0.14459478855133057, + "learning_rate": 4.916907774106683e-06, + "loss": 0.6341, + "step": 3740 + }, + { + "epoch": 0.8621802258584927, + "grad_norm": 0.152002215385437, + "learning_rate": 4.90077780157962e-06, + "loss": 0.6486, + "step": 3741 + }, + { + "epoch": 0.8624106937082276, + "grad_norm": 0.14948220551013947, + "learning_rate": 4.884672966276538e-06, + "loss": 0.6398, + "step": 3742 + }, + { + "epoch": 0.8626411615579627, + "grad_norm": 0.14576594531536102, + "learning_rate": 4.868593277173878e-06, + "loss": 0.6509, + "step": 3743 + }, + { + "epoch": 0.8628716294076976, + "grad_norm": 0.1429005265235901, + "learning_rate": 4.852538743234081e-06, + "loss": 0.6391, + "step": 3744 + }, + { + "epoch": 0.8631020972574326, + "grad_norm": 0.145622119307518, + "learning_rate": 4.836509373405568e-06, + "loss": 0.6404, + "step": 3745 + }, + { + "epoch": 0.8633325651071676, + "grad_norm": 0.1396528035402298, + "learning_rate": 4.820505176622697e-06, + "loss": 0.6299, + "step": 3746 + }, + { + "epoch": 0.8635630329569025, + "grad_norm": 0.14236707985401154, + "learning_rate": 4.804526161805833e-06, + "loss": 0.6399, + "step": 3747 + }, + { + "epoch": 0.8637935008066375, + "grad_norm": 0.14210587739944458, + "learning_rate": 4.788572337861313e-06, + "loss": 0.6344, + "step": 3748 + }, + { + "epoch": 0.8640239686563724, + "grad_norm": 0.1455959975719452, + "learning_rate": 4.772643713681413e-06, + "loss": 0.6388, + "step": 3749 + }, + { + "epoch": 0.8642544365061074, + "grad_norm": 0.14806394279003143, + "learning_rate": 4.756740298144346e-06, + "loss": 0.6385, + "step": 3750 + }, + { + "epoch": 0.8644849043558424, + "grad_norm": 0.13723453879356384, + "learning_rate": 4.740862100114307e-06, + "loss": 0.6385, + "step": 3751 + }, + { + "epoch": 0.8647153722055774, + "grad_norm": 0.15280765295028687, + "learning_rate": 4.725009128441421e-06, + "loss": 0.6454, + "step": 3752 + }, + { + "epoch": 0.8649458400553123, + "grad_norm": 0.14481478929519653, + "learning_rate": 4.709181391961753e-06, + "loss": 0.6413, + "step": 3753 + }, + { + "epoch": 0.8651763079050473, + "grad_norm": 0.14188295602798462, + "learning_rate": 4.693378899497303e-06, + "loss": 0.6528, + "step": 3754 + }, + { + "epoch": 0.8654067757547822, + "grad_norm": 0.1486392468214035, + "learning_rate": 4.6776016598560124e-06, + "loss": 0.6364, + "step": 3755 + }, + { + "epoch": 0.8656372436045172, + "grad_norm": 0.1420852243900299, + "learning_rate": 4.6618496818317145e-06, + "loss": 0.6371, + "step": 3756 + }, + { + "epoch": 0.8658677114542521, + "grad_norm": 0.1463935822248459, + "learning_rate": 4.646122974204187e-06, + "loss": 0.6368, + "step": 3757 + }, + { + "epoch": 0.8660981793039871, + "grad_norm": 0.14199309051036835, + "learning_rate": 4.630421545739144e-06, + "loss": 0.6407, + "step": 3758 + }, + { + "epoch": 0.866328647153722, + "grad_norm": 0.13984660804271698, + "learning_rate": 4.6147454051881585e-06, + "loss": 0.6389, + "step": 3759 + }, + { + "epoch": 0.8665591150034571, + "grad_norm": 0.14572177827358246, + "learning_rate": 4.5990945612887415e-06, + "loss": 0.6353, + "step": 3760 + }, + { + "epoch": 0.866789582853192, + "grad_norm": 0.14711903035640717, + "learning_rate": 4.583469022764314e-06, + "loss": 0.6408, + "step": 3761 + }, + { + "epoch": 0.867020050702927, + "grad_norm": 0.14538122713565826, + "learning_rate": 4.567868798324143e-06, + "loss": 0.6373, + "step": 3762 + }, + { + "epoch": 0.8672505185526619, + "grad_norm": 0.14264172315597534, + "learning_rate": 4.552293896663451e-06, + "loss": 0.6357, + "step": 3763 + }, + { + "epoch": 0.8674809864023969, + "grad_norm": 0.14745338261127472, + "learning_rate": 4.536744326463304e-06, + "loss": 0.6415, + "step": 3764 + }, + { + "epoch": 0.8677114542521318, + "grad_norm": 0.1417376846075058, + "learning_rate": 4.521220096390655e-06, + "loss": 0.6299, + "step": 3765 + }, + { + "epoch": 0.8679419221018668, + "grad_norm": 0.13753369450569153, + "learning_rate": 4.505721215098335e-06, + "loss": 0.6364, + "step": 3766 + }, + { + "epoch": 0.8681723899516017, + "grad_norm": 0.14812487363815308, + "learning_rate": 4.490247691225058e-06, + "loss": 0.6494, + "step": 3767 + }, + { + "epoch": 0.8684028578013367, + "grad_norm": 0.14765094220638275, + "learning_rate": 4.4747995333953855e-06, + "loss": 0.6363, + "step": 3768 + }, + { + "epoch": 0.8686333256510717, + "grad_norm": 0.14122313261032104, + "learning_rate": 4.459376750219757e-06, + "loss": 0.6319, + "step": 3769 + }, + { + "epoch": 0.8688637935008067, + "grad_norm": 0.13697822391986847, + "learning_rate": 4.443979350294463e-06, + "loss": 0.6328, + "step": 3770 + }, + { + "epoch": 0.8690942613505416, + "grad_norm": 0.1433285027742386, + "learning_rate": 4.428607342201635e-06, + "loss": 0.643, + "step": 3771 + }, + { + "epoch": 0.8693247292002766, + "grad_norm": 0.13931584358215332, + "learning_rate": 4.4132607345092555e-06, + "loss": 0.6451, + "step": 3772 + }, + { + "epoch": 0.8695551970500115, + "grad_norm": 0.1486472487449646, + "learning_rate": 4.397939535771189e-06, + "loss": 0.6352, + "step": 3773 + }, + { + "epoch": 0.8697856648997465, + "grad_norm": 0.14353391528129578, + "learning_rate": 4.382643754527072e-06, + "loss": 0.6599, + "step": 3774 + }, + { + "epoch": 0.8700161327494814, + "grad_norm": 0.14892083406448364, + "learning_rate": 4.36737339930241e-06, + "loss": 0.6315, + "step": 3775 + }, + { + "epoch": 0.8702466005992164, + "grad_norm": 0.1448989063501358, + "learning_rate": 4.352128478608541e-06, + "loss": 0.6396, + "step": 3776 + }, + { + "epoch": 0.8704770684489513, + "grad_norm": 0.1473805457353592, + "learning_rate": 4.3369090009426185e-06, + "loss": 0.6484, + "step": 3777 + }, + { + "epoch": 0.8707075362986864, + "grad_norm": 0.14433009922504425, + "learning_rate": 4.321714974787605e-06, + "loss": 0.6427, + "step": 3778 + }, + { + "epoch": 0.8709380041484213, + "grad_norm": 0.14538750052452087, + "learning_rate": 4.306546408612306e-06, + "loss": 0.6443, + "step": 3779 + }, + { + "epoch": 0.8711684719981563, + "grad_norm": 0.13906364142894745, + "learning_rate": 4.291403310871284e-06, + "loss": 0.6368, + "step": 3780 + }, + { + "epoch": 0.8713989398478912, + "grad_norm": 0.15096677839756012, + "learning_rate": 4.276285690004961e-06, + "loss": 0.6428, + "step": 3781 + }, + { + "epoch": 0.8716294076976262, + "grad_norm": 0.14877794682979584, + "learning_rate": 4.26119355443953e-06, + "loss": 0.6374, + "step": 3782 + }, + { + "epoch": 0.8718598755473611, + "grad_norm": 0.14585071802139282, + "learning_rate": 4.24612691258699e-06, + "loss": 0.6383, + "step": 3783 + }, + { + "epoch": 0.8720903433970961, + "grad_norm": 0.14376723766326904, + "learning_rate": 4.231085772845117e-06, + "loss": 0.6423, + "step": 3784 + }, + { + "epoch": 0.872320811246831, + "grad_norm": 0.1447305977344513, + "learning_rate": 4.2160701435974945e-06, + "loss": 0.6369, + "step": 3785 + }, + { + "epoch": 0.872551279096566, + "grad_norm": 0.14197836816310883, + "learning_rate": 4.201080033213456e-06, + "loss": 0.6373, + "step": 3786 + }, + { + "epoch": 0.872781746946301, + "grad_norm": 0.14709553122520447, + "learning_rate": 4.186115450048128e-06, + "loss": 0.6385, + "step": 3787 + }, + { + "epoch": 0.873012214796036, + "grad_norm": 0.1427144557237625, + "learning_rate": 4.171176402442445e-06, + "loss": 0.6398, + "step": 3788 + }, + { + "epoch": 0.8732426826457709, + "grad_norm": 0.14940115809440613, + "learning_rate": 4.156262898723034e-06, + "loss": 0.6333, + "step": 3789 + }, + { + "epoch": 0.8734731504955059, + "grad_norm": 0.1392568051815033, + "learning_rate": 4.141374947202336e-06, + "loss": 0.6419, + "step": 3790 + }, + { + "epoch": 0.8737036183452408, + "grad_norm": 0.1471542865037918, + "learning_rate": 4.1265125561785465e-06, + "loss": 0.6299, + "step": 3791 + }, + { + "epoch": 0.8739340861949758, + "grad_norm": 0.14388377964496613, + "learning_rate": 4.1116757339355995e-06, + "loss": 0.6431, + "step": 3792 + }, + { + "epoch": 0.8741645540447107, + "grad_norm": 0.1433270126581192, + "learning_rate": 4.0968644887431795e-06, + "loss": 0.6342, + "step": 3793 + }, + { + "epoch": 0.8743950218944457, + "grad_norm": 0.13722054660320282, + "learning_rate": 4.082078828856733e-06, + "loss": 0.6373, + "step": 3794 + }, + { + "epoch": 0.8746254897441806, + "grad_norm": 0.14640505611896515, + "learning_rate": 4.0673187625174195e-06, + "loss": 0.6395, + "step": 3795 + }, + { + "epoch": 0.8748559575939157, + "grad_norm": 0.1450256109237671, + "learning_rate": 4.052584297952145e-06, + "loss": 0.6384, + "step": 3796 + }, + { + "epoch": 0.8750864254436506, + "grad_norm": 0.14310050010681152, + "learning_rate": 4.037875443373546e-06, + "loss": 0.6361, + "step": 3797 + }, + { + "epoch": 0.8753168932933856, + "grad_norm": 0.142232283949852, + "learning_rate": 4.023192206979992e-06, + "loss": 0.6378, + "step": 3798 + }, + { + "epoch": 0.8755473611431205, + "grad_norm": 0.14268440008163452, + "learning_rate": 4.008534596955565e-06, + "loss": 0.6358, + "step": 3799 + }, + { + "epoch": 0.8757778289928555, + "grad_norm": 0.14434240758419037, + "learning_rate": 3.9939026214700695e-06, + "loss": 0.6394, + "step": 3800 + }, + { + "epoch": 0.8760082968425904, + "grad_norm": 0.14181116223335266, + "learning_rate": 3.979296288678996e-06, + "loss": 0.6414, + "step": 3801 + }, + { + "epoch": 0.8762387646923254, + "grad_norm": 0.14815394580364227, + "learning_rate": 3.964715606723585e-06, + "loss": 0.6415, + "step": 3802 + }, + { + "epoch": 0.8764692325420603, + "grad_norm": 0.14605048298835754, + "learning_rate": 3.950160583730761e-06, + "loss": 0.6353, + "step": 3803 + }, + { + "epoch": 0.8766997003917953, + "grad_norm": 0.14189860224723816, + "learning_rate": 3.93563122781313e-06, + "loss": 0.6441, + "step": 3804 + }, + { + "epoch": 0.8769301682415304, + "grad_norm": 0.14701896905899048, + "learning_rate": 3.921127547069014e-06, + "loss": 0.6457, + "step": 3805 + }, + { + "epoch": 0.8771606360912653, + "grad_norm": 0.141061931848526, + "learning_rate": 3.906649549582414e-06, + "loss": 0.6324, + "step": 3806 + }, + { + "epoch": 0.8773911039410003, + "grad_norm": 0.14271359145641327, + "learning_rate": 3.8921972434230185e-06, + "loss": 0.6447, + "step": 3807 + }, + { + "epoch": 0.8776215717907352, + "grad_norm": 0.1448899656534195, + "learning_rate": 3.8777706366462e-06, + "loss": 0.6401, + "step": 3808 + }, + { + "epoch": 0.8778520396404702, + "grad_norm": 0.13925962150096893, + "learning_rate": 3.863369737293005e-06, + "loss": 0.6411, + "step": 3809 + }, + { + "epoch": 0.8780825074902051, + "grad_norm": 0.1407134234905243, + "learning_rate": 3.848994553390134e-06, + "loss": 0.6445, + "step": 3810 + }, + { + "epoch": 0.8783129753399401, + "grad_norm": 0.1439376175403595, + "learning_rate": 3.834645092949973e-06, + "loss": 0.6367, + "step": 3811 + }, + { + "epoch": 0.878543443189675, + "grad_norm": 0.14084570109844208, + "learning_rate": 3.8203213639705915e-06, + "loss": 0.6304, + "step": 3812 + }, + { + "epoch": 0.87877391103941, + "grad_norm": 0.14097703993320465, + "learning_rate": 3.8060233744356633e-06, + "loss": 0.638, + "step": 3813 + }, + { + "epoch": 0.879004378889145, + "grad_norm": 0.1431102752685547, + "learning_rate": 3.7917511323145584e-06, + "loss": 0.6446, + "step": 3814 + }, + { + "epoch": 0.87923484673888, + "grad_norm": 0.14385107159614563, + "learning_rate": 3.7775046455622855e-06, + "loss": 0.6343, + "step": 3815 + }, + { + "epoch": 0.8794653145886149, + "grad_norm": 0.14835472404956818, + "learning_rate": 3.7632839221194706e-06, + "loss": 0.6442, + "step": 3816 + }, + { + "epoch": 0.8796957824383499, + "grad_norm": 0.1427791565656662, + "learning_rate": 3.749088969912429e-06, + "loss": 0.6434, + "step": 3817 + }, + { + "epoch": 0.8799262502880848, + "grad_norm": 0.13777343928813934, + "learning_rate": 3.734919796853087e-06, + "loss": 0.6434, + "step": 3818 + }, + { + "epoch": 0.8801567181378198, + "grad_norm": 0.15572573244571686, + "learning_rate": 3.720776410838983e-06, + "loss": 0.6361, + "step": 3819 + }, + { + "epoch": 0.8803871859875547, + "grad_norm": 0.14312800765037537, + "learning_rate": 3.7066588197533115e-06, + "loss": 0.6331, + "step": 3820 + }, + { + "epoch": 0.8806176538372897, + "grad_norm": 0.14728614687919617, + "learning_rate": 3.6925670314648775e-06, + "loss": 0.6391, + "step": 3821 + }, + { + "epoch": 0.8808481216870246, + "grad_norm": 0.14271116256713867, + "learning_rate": 3.6785010538281093e-06, + "loss": 0.6359, + "step": 3822 + }, + { + "epoch": 0.8810785895367597, + "grad_norm": 0.14498567581176758, + "learning_rate": 3.664460894683036e-06, + "loss": 0.6383, + "step": 3823 + }, + { + "epoch": 0.8813090573864946, + "grad_norm": 0.14503896236419678, + "learning_rate": 3.650446561855325e-06, + "loss": 0.6293, + "step": 3824 + }, + { + "epoch": 0.8815395252362296, + "grad_norm": 0.14249297976493835, + "learning_rate": 3.6364580631562063e-06, + "loss": 0.6329, + "step": 3825 + }, + { + "epoch": 0.8817699930859645, + "grad_norm": 0.14754772186279297, + "learning_rate": 3.622495406382531e-06, + "loss": 0.6412, + "step": 3826 + }, + { + "epoch": 0.8820004609356995, + "grad_norm": 0.17180697619915009, + "learning_rate": 3.6085585993167805e-06, + "loss": 0.6378, + "step": 3827 + }, + { + "epoch": 0.8822309287854344, + "grad_norm": 0.14113938808441162, + "learning_rate": 3.594647649726962e-06, + "loss": 0.6425, + "step": 3828 + }, + { + "epoch": 0.8824613966351694, + "grad_norm": 0.1466577649116516, + "learning_rate": 3.5807625653667243e-06, + "loss": 0.6484, + "step": 3829 + }, + { + "epoch": 0.8826918644849043, + "grad_norm": 0.14053227007389069, + "learning_rate": 3.566903353975276e-06, + "loss": 0.6409, + "step": 3830 + }, + { + "epoch": 0.8829223323346393, + "grad_norm": 0.14258506894111633, + "learning_rate": 3.553070023277405e-06, + "loss": 0.6373, + "step": 3831 + }, + { + "epoch": 0.8831528001843743, + "grad_norm": 0.1399742215871811, + "learning_rate": 3.5392625809834823e-06, + "loss": 0.6461, + "step": 3832 + }, + { + "epoch": 0.8833832680341093, + "grad_norm": 0.13795405626296997, + "learning_rate": 3.525481034789446e-06, + "loss": 0.6329, + "step": 3833 + }, + { + "epoch": 0.8836137358838442, + "grad_norm": 0.14222726225852966, + "learning_rate": 3.5117253923767967e-06, + "loss": 0.6454, + "step": 3834 + }, + { + "epoch": 0.8838442037335792, + "grad_norm": 0.14100506901741028, + "learning_rate": 3.4979956614125953e-06, + "loss": 0.6318, + "step": 3835 + }, + { + "epoch": 0.8840746715833141, + "grad_norm": 0.13950549066066742, + "learning_rate": 3.4842918495494646e-06, + "loss": 0.6348, + "step": 3836 + }, + { + "epoch": 0.8843051394330491, + "grad_norm": 0.17627927660942078, + "learning_rate": 3.4706139644255897e-06, + "loss": 0.639, + "step": 3837 + }, + { + "epoch": 0.884535607282784, + "grad_norm": 0.1371656060218811, + "learning_rate": 3.4569620136646886e-06, + "loss": 0.64, + "step": 3838 + }, + { + "epoch": 0.884766075132519, + "grad_norm": 0.13862690329551697, + "learning_rate": 3.4433360048760357e-06, + "loss": 0.6335, + "step": 3839 + }, + { + "epoch": 0.8849965429822539, + "grad_norm": 0.14692457020282745, + "learning_rate": 3.4297359456544276e-06, + "loss": 0.629, + "step": 3840 + }, + { + "epoch": 0.885227010831989, + "grad_norm": 0.13511526584625244, + "learning_rate": 3.4161618435802233e-06, + "loss": 0.6377, + "step": 3841 + }, + { + "epoch": 0.8854574786817239, + "grad_norm": 0.14511561393737793, + "learning_rate": 3.4026137062193097e-06, + "loss": 0.645, + "step": 3842 + }, + { + "epoch": 0.8856879465314589, + "grad_norm": 0.1404416412115097, + "learning_rate": 3.389091541123074e-06, + "loss": 0.6393, + "step": 3843 + }, + { + "epoch": 0.8859184143811938, + "grad_norm": 0.14536434412002563, + "learning_rate": 3.375595355828454e-06, + "loss": 0.6376, + "step": 3844 + }, + { + "epoch": 0.8861488822309288, + "grad_norm": 0.14059731364250183, + "learning_rate": 3.362125157857904e-06, + "loss": 0.6335, + "step": 3845 + }, + { + "epoch": 0.8863793500806637, + "grad_norm": 0.14244748651981354, + "learning_rate": 3.34868095471938e-06, + "loss": 0.6391, + "step": 3846 + }, + { + "epoch": 0.8866098179303987, + "grad_norm": 0.13621114194393158, + "learning_rate": 3.335262753906371e-06, + "loss": 0.6436, + "step": 3847 + }, + { + "epoch": 0.8868402857801336, + "grad_norm": 0.14168091118335724, + "learning_rate": 3.3218705628978554e-06, + "loss": 0.6382, + "step": 3848 + }, + { + "epoch": 0.8870707536298686, + "grad_norm": 0.14067873358726501, + "learning_rate": 3.3085043891583125e-06, + "loss": 0.6332, + "step": 3849 + }, + { + "epoch": 0.8873012214796036, + "grad_norm": 0.14555051922798157, + "learning_rate": 3.295164240137727e-06, + "loss": 0.6424, + "step": 3850 + }, + { + "epoch": 0.8875316893293386, + "grad_norm": 0.14118191599845886, + "learning_rate": 3.2818501232715794e-06, + "loss": 0.6465, + "step": 3851 + }, + { + "epoch": 0.8877621571790735, + "grad_norm": 0.13933710753917694, + "learning_rate": 3.268562045980844e-06, + "loss": 0.6429, + "step": 3852 + }, + { + "epoch": 0.8879926250288085, + "grad_norm": 0.1409381926059723, + "learning_rate": 3.2553000156719747e-06, + "loss": 0.6372, + "step": 3853 + }, + { + "epoch": 0.8882230928785434, + "grad_norm": 0.13979056477546692, + "learning_rate": 3.242064039736914e-06, + "loss": 0.6342, + "step": 3854 + }, + { + "epoch": 0.8884535607282784, + "grad_norm": 0.14258621633052826, + "learning_rate": 3.2288541255530545e-06, + "loss": 0.6412, + "step": 3855 + }, + { + "epoch": 0.8886840285780133, + "grad_norm": 0.1503862887620926, + "learning_rate": 3.215670280483307e-06, + "loss": 0.6461, + "step": 3856 + }, + { + "epoch": 0.8889144964277483, + "grad_norm": 0.14357925951480865, + "learning_rate": 3.202512511876038e-06, + "loss": 0.6381, + "step": 3857 + }, + { + "epoch": 0.8891449642774832, + "grad_norm": 0.1408717930316925, + "learning_rate": 3.189380827065047e-06, + "loss": 0.6363, + "step": 3858 + }, + { + "epoch": 0.8893754321272183, + "grad_norm": 0.14140430092811584, + "learning_rate": 3.1762752333696297e-06, + "loss": 0.6376, + "step": 3859 + }, + { + "epoch": 0.8896058999769532, + "grad_norm": 0.14138497412204742, + "learning_rate": 3.163195738094532e-06, + "loss": 0.6396, + "step": 3860 + }, + { + "epoch": 0.8898363678266882, + "grad_norm": 0.14000347256660461, + "learning_rate": 3.150142348529955e-06, + "loss": 0.6386, + "step": 3861 + }, + { + "epoch": 0.8900668356764231, + "grad_norm": 0.14265434443950653, + "learning_rate": 3.1371150719515354e-06, + "loss": 0.6347, + "step": 3862 + }, + { + "epoch": 0.8902973035261581, + "grad_norm": 0.14289766550064087, + "learning_rate": 3.1241139156203746e-06, + "loss": 0.6392, + "step": 3863 + }, + { + "epoch": 0.8905277713758931, + "grad_norm": 0.14235663414001465, + "learning_rate": 3.111138886782994e-06, + "loss": 0.6365, + "step": 3864 + }, + { + "epoch": 0.890758239225628, + "grad_norm": 0.1410188376903534, + "learning_rate": 3.0981899926713574e-06, + "loss": 0.6476, + "step": 3865 + }, + { + "epoch": 0.890988707075363, + "grad_norm": 0.1419987976551056, + "learning_rate": 3.0852672405028984e-06, + "loss": 0.6372, + "step": 3866 + }, + { + "epoch": 0.891219174925098, + "grad_norm": 0.14122051000595093, + "learning_rate": 3.072370637480415e-06, + "loss": 0.6286, + "step": 3867 + }, + { + "epoch": 0.891449642774833, + "grad_norm": 0.13950307667255402, + "learning_rate": 3.059500190792186e-06, + "loss": 0.6365, + "step": 3868 + }, + { + "epoch": 0.8916801106245679, + "grad_norm": 0.1390613466501236, + "learning_rate": 3.0466559076118837e-06, + "loss": 0.639, + "step": 3869 + }, + { + "epoch": 0.8919105784743029, + "grad_norm": 0.14016972482204437, + "learning_rate": 3.0338377950985875e-06, + "loss": 0.6386, + "step": 3870 + }, + { + "epoch": 0.8921410463240378, + "grad_norm": 0.14392688870429993, + "learning_rate": 3.0210458603968263e-06, + "loss": 0.6441, + "step": 3871 + }, + { + "epoch": 0.8923715141737728, + "grad_norm": 0.13809806108474731, + "learning_rate": 3.0082801106365187e-06, + "loss": 0.6364, + "step": 3872 + }, + { + "epoch": 0.8926019820235077, + "grad_norm": 0.14696134626865387, + "learning_rate": 2.995540552932974e-06, + "loss": 0.6413, + "step": 3873 + }, + { + "epoch": 0.8928324498732427, + "grad_norm": 0.14118346571922302, + "learning_rate": 2.982827194386917e-06, + "loss": 0.6405, + "step": 3874 + }, + { + "epoch": 0.8930629177229776, + "grad_norm": 0.14888674020767212, + "learning_rate": 2.9701400420844737e-06, + "loss": 0.6413, + "step": 3875 + }, + { + "epoch": 0.8932933855727127, + "grad_norm": 0.14854340255260468, + "learning_rate": 2.9574791030971604e-06, + "loss": 0.6442, + "step": 3876 + }, + { + "epoch": 0.8935238534224476, + "grad_norm": 0.1380283385515213, + "learning_rate": 2.944844384481871e-06, + "loss": 0.6342, + "step": 3877 + }, + { + "epoch": 0.8937543212721826, + "grad_norm": 0.13916511833667755, + "learning_rate": 2.9322358932809157e-06, + "loss": 0.6426, + "step": 3878 + }, + { + "epoch": 0.8939847891219175, + "grad_norm": 0.1366412490606308, + "learning_rate": 2.919653636521935e-06, + "loss": 0.6418, + "step": 3879 + }, + { + "epoch": 0.8942152569716525, + "grad_norm": 0.13759376108646393, + "learning_rate": 2.907097621217986e-06, + "loss": 0.6409, + "step": 3880 + }, + { + "epoch": 0.8944457248213874, + "grad_norm": 0.1421494036912918, + "learning_rate": 2.894567854367508e-06, + "loss": 0.6452, + "step": 3881 + }, + { + "epoch": 0.8946761926711224, + "grad_norm": 0.13909095525741577, + "learning_rate": 2.8820643429542825e-06, + "loss": 0.6435, + "step": 3882 + }, + { + "epoch": 0.8949066605208573, + "grad_norm": 0.1391996145248413, + "learning_rate": 2.8695870939474624e-06, + "loss": 0.6409, + "step": 3883 + }, + { + "epoch": 0.8951371283705923, + "grad_norm": 0.13931603729724884, + "learning_rate": 2.857136114301562e-06, + "loss": 0.6327, + "step": 3884 + }, + { + "epoch": 0.8953675962203272, + "grad_norm": 0.1370822936296463, + "learning_rate": 2.844711410956469e-06, + "loss": 0.642, + "step": 3885 + }, + { + "epoch": 0.8955980640700623, + "grad_norm": 0.14227797091007233, + "learning_rate": 2.83231299083741e-06, + "loss": 0.6278, + "step": 3886 + }, + { + "epoch": 0.8958285319197972, + "grad_norm": 0.1375594437122345, + "learning_rate": 2.8199408608549695e-06, + "loss": 0.6328, + "step": 3887 + }, + { + "epoch": 0.8960589997695322, + "grad_norm": 0.14295582473278046, + "learning_rate": 2.8075950279050855e-06, + "loss": 0.646, + "step": 3888 + }, + { + "epoch": 0.8962894676192671, + "grad_norm": 0.13916680216789246, + "learning_rate": 2.7952754988690046e-06, + "loss": 0.647, + "step": 3889 + }, + { + "epoch": 0.8965199354690021, + "grad_norm": 0.1418418288230896, + "learning_rate": 2.782982280613344e-06, + "loss": 0.6377, + "step": 3890 + }, + { + "epoch": 0.896750403318737, + "grad_norm": 0.1433732807636261, + "learning_rate": 2.770715379990069e-06, + "loss": 0.6426, + "step": 3891 + }, + { + "epoch": 0.896980871168472, + "grad_norm": 0.1423547863960266, + "learning_rate": 2.7584748038364303e-06, + "loss": 0.635, + "step": 3892 + }, + { + "epoch": 0.8972113390182069, + "grad_norm": 0.1434488594532013, + "learning_rate": 2.7462605589750443e-06, + "loss": 0.6382, + "step": 3893 + }, + { + "epoch": 0.897441806867942, + "grad_norm": 0.14350055158138275, + "learning_rate": 2.734072652213837e-06, + "loss": 0.6414, + "step": 3894 + }, + { + "epoch": 0.8976722747176769, + "grad_norm": 0.13868124783039093, + "learning_rate": 2.7219110903460523e-06, + "loss": 0.6353, + "step": 3895 + }, + { + "epoch": 0.8979027425674119, + "grad_norm": 0.13998155295848846, + "learning_rate": 2.7097758801502506e-06, + "loss": 0.6369, + "step": 3896 + }, + { + "epoch": 0.8981332104171468, + "grad_norm": 0.1414719521999359, + "learning_rate": 2.6976670283903215e-06, + "loss": 0.6281, + "step": 3897 + }, + { + "epoch": 0.8983636782668818, + "grad_norm": 0.13889610767364502, + "learning_rate": 2.685584541815428e-06, + "loss": 0.6447, + "step": 3898 + }, + { + "epoch": 0.8985941461166167, + "grad_norm": 0.14231473207473755, + "learning_rate": 2.6735284271600657e-06, + "loss": 0.6355, + "step": 3899 + }, + { + "epoch": 0.8988246139663517, + "grad_norm": 0.1469811350107193, + "learning_rate": 2.6614986911440264e-06, + "loss": 0.6374, + "step": 3900 + }, + { + "epoch": 0.8990550818160866, + "grad_norm": 0.13894438743591309, + "learning_rate": 2.6494953404723965e-06, + "loss": 0.6362, + "step": 3901 + }, + { + "epoch": 0.8992855496658216, + "grad_norm": 0.1419995129108429, + "learning_rate": 2.637518381835552e-06, + "loss": 0.6354, + "step": 3902 + }, + { + "epoch": 0.8995160175155565, + "grad_norm": 0.14471600949764252, + "learning_rate": 2.625567821909175e-06, + "loss": 0.6406, + "step": 3903 + }, + { + "epoch": 0.8997464853652916, + "grad_norm": 0.1410285234451294, + "learning_rate": 2.6136436673541986e-06, + "loss": 0.6399, + "step": 3904 + }, + { + "epoch": 0.8999769532150265, + "grad_norm": 0.14543531835079193, + "learning_rate": 2.601745924816862e-06, + "loss": 0.6349, + "step": 3905 + }, + { + "epoch": 0.9002074210647615, + "grad_norm": 0.14382287859916687, + "learning_rate": 2.58987460092871e-06, + "loss": 0.6331, + "step": 3906 + }, + { + "epoch": 0.9004378889144964, + "grad_norm": 0.14252907037734985, + "learning_rate": 2.5780297023065057e-06, + "loss": 0.6367, + "step": 3907 + }, + { + "epoch": 0.9006683567642314, + "grad_norm": 0.1392444670200348, + "learning_rate": 2.5662112355523183e-06, + "loss": 0.6349, + "step": 3908 + }, + { + "epoch": 0.9008988246139663, + "grad_norm": 0.13947899639606476, + "learning_rate": 2.5544192072534835e-06, + "loss": 0.6431, + "step": 3909 + }, + { + "epoch": 0.9011292924637013, + "grad_norm": 0.14244791865348816, + "learning_rate": 2.542653623982588e-06, + "loss": 0.6393, + "step": 3910 + }, + { + "epoch": 0.9013597603134362, + "grad_norm": 0.1407129168510437, + "learning_rate": 2.530914492297487e-06, + "loss": 0.637, + "step": 3911 + }, + { + "epoch": 0.9015902281631712, + "grad_norm": 0.13949137926101685, + "learning_rate": 2.519201818741301e-06, + "loss": 0.6359, + "step": 3912 + }, + { + "epoch": 0.9018206960129062, + "grad_norm": 0.13872455060482025, + "learning_rate": 2.507515609842376e-06, + "loss": 0.6415, + "step": 3913 + }, + { + "epoch": 0.9020511638626412, + "grad_norm": 0.13934583961963654, + "learning_rate": 2.495855872114333e-06, + "loss": 0.6298, + "step": 3914 + }, + { + "epoch": 0.9022816317123761, + "grad_norm": 0.13912633061408997, + "learning_rate": 2.4842226120560255e-06, + "loss": 0.6321, + "step": 3915 + }, + { + "epoch": 0.9025120995621111, + "grad_norm": 0.1382398009300232, + "learning_rate": 2.4726158361515593e-06, + "loss": 0.6343, + "step": 3916 + }, + { + "epoch": 0.902742567411846, + "grad_norm": 0.14063194394111633, + "learning_rate": 2.461035550870272e-06, + "loss": 0.6505, + "step": 3917 + }, + { + "epoch": 0.902973035261581, + "grad_norm": 0.14703617990016937, + "learning_rate": 2.4494817626667442e-06, + "loss": 0.64, + "step": 3918 + }, + { + "epoch": 0.9032035031113159, + "grad_norm": 0.14321446418762207, + "learning_rate": 2.437954477980753e-06, + "loss": 0.6332, + "step": 3919 + }, + { + "epoch": 0.9034339709610509, + "grad_norm": 0.14084553718566895, + "learning_rate": 2.426453703237358e-06, + "loss": 0.6363, + "step": 3920 + }, + { + "epoch": 0.9036644388107858, + "grad_norm": 0.14175479114055634, + "learning_rate": 2.41497944484681e-06, + "loss": 0.6319, + "step": 3921 + }, + { + "epoch": 0.9038949066605209, + "grad_norm": 0.13902176916599274, + "learning_rate": 2.4035317092045763e-06, + "loss": 0.6345, + "step": 3922 + }, + { + "epoch": 0.9041253745102559, + "grad_norm": 0.1377190500497818, + "learning_rate": 2.3921105026913527e-06, + "loss": 0.6291, + "step": 3923 + }, + { + "epoch": 0.9043558423599908, + "grad_norm": 0.13996082544326782, + "learning_rate": 2.380715831673047e-06, + "loss": 0.6375, + "step": 3924 + }, + { + "epoch": 0.9045863102097258, + "grad_norm": 0.1362973004579544, + "learning_rate": 2.369347702500774e-06, + "loss": 0.6454, + "step": 3925 + }, + { + "epoch": 0.9048167780594607, + "grad_norm": 0.13769546151161194, + "learning_rate": 2.3580061215108585e-06, + "loss": 0.635, + "step": 3926 + }, + { + "epoch": 0.9050472459091957, + "grad_norm": 0.13530834019184113, + "learning_rate": 2.3466910950248332e-06, + "loss": 0.634, + "step": 3927 + }, + { + "epoch": 0.9052777137589306, + "grad_norm": 0.14218156039714813, + "learning_rate": 2.3354026293494034e-06, + "loss": 0.6334, + "step": 3928 + }, + { + "epoch": 0.9055081816086656, + "grad_norm": 0.13567234575748444, + "learning_rate": 2.324140730776497e-06, + "loss": 0.6383, + "step": 3929 + }, + { + "epoch": 0.9057386494584005, + "grad_norm": 0.14312051236629486, + "learning_rate": 2.3129054055832376e-06, + "loss": 0.6454, + "step": 3930 + }, + { + "epoch": 0.9059691173081356, + "grad_norm": 0.13520599901676178, + "learning_rate": 2.3016966600319154e-06, + "loss": 0.6408, + "step": 3931 + }, + { + "epoch": 0.9061995851578705, + "grad_norm": 0.13904651999473572, + "learning_rate": 2.290514500370011e-06, + "loss": 0.6371, + "step": 3932 + }, + { + "epoch": 0.9064300530076055, + "grad_norm": 0.14134728908538818, + "learning_rate": 2.2793589328302056e-06, + "loss": 0.641, + "step": 3933 + }, + { + "epoch": 0.9066605208573404, + "grad_norm": 0.13760726153850555, + "learning_rate": 2.268229963630325e-06, + "loss": 0.6408, + "step": 3934 + }, + { + "epoch": 0.9068909887070754, + "grad_norm": 0.1370038092136383, + "learning_rate": 2.2571275989734076e-06, + "loss": 0.6401, + "step": 3935 + }, + { + "epoch": 0.9071214565568103, + "grad_norm": 0.13965949416160583, + "learning_rate": 2.2460518450476474e-06, + "loss": 0.6404, + "step": 3936 + }, + { + "epoch": 0.9073519244065453, + "grad_norm": 0.1396677941083908, + "learning_rate": 2.2350027080263845e-06, + "loss": 0.6266, + "step": 3937 + }, + { + "epoch": 0.9075823922562802, + "grad_norm": 0.13712947070598602, + "learning_rate": 2.223980194068159e-06, + "loss": 0.6334, + "step": 3938 + }, + { + "epoch": 0.9078128601060153, + "grad_norm": 0.1409660130739212, + "learning_rate": 2.212984309316646e-06, + "loss": 0.6407, + "step": 3939 + }, + { + "epoch": 0.9080433279557502, + "grad_norm": 0.14090897142887115, + "learning_rate": 2.2020150599006916e-06, + "loss": 0.6377, + "step": 3940 + }, + { + "epoch": 0.9082737958054852, + "grad_norm": 0.1404808610677719, + "learning_rate": 2.191072451934295e-06, + "loss": 0.6421, + "step": 3941 + }, + { + "epoch": 0.9085042636552201, + "grad_norm": 0.1363547444343567, + "learning_rate": 2.180156491516605e-06, + "loss": 0.6403, + "step": 3942 + }, + { + "epoch": 0.9087347315049551, + "grad_norm": 0.13701926171779633, + "learning_rate": 2.1692671847319048e-06, + "loss": 0.6341, + "step": 3943 + }, + { + "epoch": 0.90896519935469, + "grad_norm": 0.19568116962909698, + "learning_rate": 2.1584045376496385e-06, + "loss": 0.6294, + "step": 3944 + }, + { + "epoch": 0.909195667204425, + "grad_norm": 0.13929463922977448, + "learning_rate": 2.147568556324392e-06, + "loss": 0.6354, + "step": 3945 + }, + { + "epoch": 0.9094261350541599, + "grad_norm": 0.14168402552604675, + "learning_rate": 2.136759246795872e-06, + "loss": 0.6367, + "step": 3946 + }, + { + "epoch": 0.9096566029038949, + "grad_norm": 0.13594500720500946, + "learning_rate": 2.125976615088926e-06, + "loss": 0.6406, + "step": 3947 + }, + { + "epoch": 0.9098870707536298, + "grad_norm": 0.14707335829734802, + "learning_rate": 2.1152206672135465e-06, + "loss": 0.6409, + "step": 3948 + }, + { + "epoch": 0.9101175386033649, + "grad_norm": 0.13740447163581848, + "learning_rate": 2.104491409164827e-06, + "loss": 0.6325, + "step": 3949 + }, + { + "epoch": 0.9103480064530998, + "grad_norm": 0.1381852775812149, + "learning_rate": 2.0937888469230115e-06, + "loss": 0.6356, + "step": 3950 + }, + { + "epoch": 0.9105784743028348, + "grad_norm": 0.13955402374267578, + "learning_rate": 2.083112986453445e-06, + "loss": 0.6406, + "step": 3951 + }, + { + "epoch": 0.9108089421525697, + "grad_norm": 0.14137601852416992, + "learning_rate": 2.072463833706595e-06, + "loss": 0.6395, + "step": 3952 + }, + { + "epoch": 0.9110394100023047, + "grad_norm": 0.14015664160251617, + "learning_rate": 2.061841394618036e-06, + "loss": 0.6381, + "step": 3953 + }, + { + "epoch": 0.9112698778520396, + "grad_norm": 0.1479710191488266, + "learning_rate": 2.0512456751084763e-06, + "loss": 0.6534, + "step": 3954 + }, + { + "epoch": 0.9115003457017746, + "grad_norm": 0.14452481269836426, + "learning_rate": 2.040676681083703e-06, + "loss": 0.6372, + "step": 3955 + }, + { + "epoch": 0.9117308135515095, + "grad_norm": 0.1414709836244583, + "learning_rate": 2.030134418434626e-06, + "loss": 0.6353, + "step": 3956 + }, + { + "epoch": 0.9119612814012446, + "grad_norm": 0.1382758766412735, + "learning_rate": 2.0196188930372563e-06, + "loss": 0.6351, + "step": 3957 + }, + { + "epoch": 0.9121917492509795, + "grad_norm": 0.14365412294864655, + "learning_rate": 2.0091301107526774e-06, + "loss": 0.6343, + "step": 3958 + }, + { + "epoch": 0.9124222171007145, + "grad_norm": 0.13868463039398193, + "learning_rate": 1.998668077427096e-06, + "loss": 0.6444, + "step": 3959 + }, + { + "epoch": 0.9126526849504494, + "grad_norm": 0.14234106242656708, + "learning_rate": 1.9882327988918038e-06, + "loss": 0.6295, + "step": 3960 + }, + { + "epoch": 0.9128831528001844, + "grad_norm": 0.16360850632190704, + "learning_rate": 1.977824280963164e-06, + "loss": 0.6482, + "step": 3961 + }, + { + "epoch": 0.9131136206499193, + "grad_norm": 0.13877460360527039, + "learning_rate": 1.967442529442637e-06, + "loss": 0.641, + "step": 3962 + }, + { + "epoch": 0.9133440884996543, + "grad_norm": 0.13784171640872955, + "learning_rate": 1.957087550116765e-06, + "loss": 0.6391, + "step": 3963 + }, + { + "epoch": 0.9135745563493892, + "grad_norm": 0.1386898308992386, + "learning_rate": 1.94675934875716e-06, + "loss": 0.6399, + "step": 3964 + }, + { + "epoch": 0.9138050241991242, + "grad_norm": 0.14157116413116455, + "learning_rate": 1.936457931120522e-06, + "loss": 0.6383, + "step": 3965 + }, + { + "epoch": 0.9140354920488591, + "grad_norm": 0.13940860331058502, + "learning_rate": 1.9261833029486088e-06, + "loss": 0.6441, + "step": 3966 + }, + { + "epoch": 0.9142659598985942, + "grad_norm": 0.13591444492340088, + "learning_rate": 1.9159354699682497e-06, + "loss": 0.6382, + "step": 3967 + }, + { + "epoch": 0.9144964277483291, + "grad_norm": 0.14465990662574768, + "learning_rate": 1.9057144378913427e-06, + "loss": 0.6278, + "step": 3968 + }, + { + "epoch": 0.9147268955980641, + "grad_norm": 0.13699522614479065, + "learning_rate": 1.895520212414842e-06, + "loss": 0.6409, + "step": 3969 + }, + { + "epoch": 0.914957363447799, + "grad_norm": 0.14070484042167664, + "learning_rate": 1.8853527992207742e-06, + "loss": 0.6438, + "step": 3970 + }, + { + "epoch": 0.915187831297534, + "grad_norm": 0.1463005095720291, + "learning_rate": 1.875212203976201e-06, + "loss": 0.6356, + "step": 3971 + }, + { + "epoch": 0.9154182991472689, + "grad_norm": 0.1436576545238495, + "learning_rate": 1.8650984323332566e-06, + "loss": 0.6466, + "step": 3972 + }, + { + "epoch": 0.9156487669970039, + "grad_norm": 0.13998855650424957, + "learning_rate": 1.8550114899290983e-06, + "loss": 0.6379, + "step": 3973 + }, + { + "epoch": 0.9158792348467388, + "grad_norm": 0.13579371571540833, + "learning_rate": 1.8449513823859622e-06, + "loss": 0.6415, + "step": 3974 + }, + { + "epoch": 0.9161097026964738, + "grad_norm": 0.14221090078353882, + "learning_rate": 1.8349181153111073e-06, + "loss": 0.6376, + "step": 3975 + }, + { + "epoch": 0.9163401705462088, + "grad_norm": 0.136023610830307, + "learning_rate": 1.8249116942968325e-06, + "loss": 0.6349, + "step": 3976 + }, + { + "epoch": 0.9165706383959438, + "grad_norm": 0.1378968209028244, + "learning_rate": 1.8149321249204765e-06, + "loss": 0.6463, + "step": 3977 + }, + { + "epoch": 0.9168011062456787, + "grad_norm": 0.13700707256793976, + "learning_rate": 1.8049794127444119e-06, + "loss": 0.6424, + "step": 3978 + }, + { + "epoch": 0.9170315740954137, + "grad_norm": 0.1433679163455963, + "learning_rate": 1.7950535633160403e-06, + "loss": 0.6281, + "step": 3979 + }, + { + "epoch": 0.9172620419451486, + "grad_norm": 0.14099539816379547, + "learning_rate": 1.7851545821677973e-06, + "loss": 0.6381, + "step": 3980 + }, + { + "epoch": 0.9174925097948836, + "grad_norm": 0.13756389915943146, + "learning_rate": 1.7752824748171415e-06, + "loss": 0.6286, + "step": 3981 + }, + { + "epoch": 0.9177229776446185, + "grad_norm": 0.13646264374256134, + "learning_rate": 1.7654372467665325e-06, + "loss": 0.626, + "step": 3982 + }, + { + "epoch": 0.9179534454943535, + "grad_norm": 0.1390201449394226, + "learning_rate": 1.7556189035034644e-06, + "loss": 0.6433, + "step": 3983 + }, + { + "epoch": 0.9181839133440886, + "grad_norm": 0.1405726969242096, + "learning_rate": 1.7458274505004702e-06, + "loss": 0.6378, + "step": 3984 + }, + { + "epoch": 0.9184143811938235, + "grad_norm": 0.13295190036296844, + "learning_rate": 1.7360628932150512e-06, + "loss": 0.635, + "step": 3985 + }, + { + "epoch": 0.9186448490435585, + "grad_norm": 0.13831880688667297, + "learning_rate": 1.7263252370897377e-06, + "loss": 0.6451, + "step": 3986 + }, + { + "epoch": 0.9188753168932934, + "grad_norm": 0.1404305100440979, + "learning_rate": 1.7166144875520763e-06, + "loss": 0.6413, + "step": 3987 + }, + { + "epoch": 0.9191057847430284, + "grad_norm": 0.13879506289958954, + "learning_rate": 1.7069306500145875e-06, + "loss": 0.6319, + "step": 3988 + }, + { + "epoch": 0.9193362525927633, + "grad_norm": 0.13712599873542786, + "learning_rate": 1.6972737298748266e-06, + "loss": 0.6433, + "step": 3989 + }, + { + "epoch": 0.9195667204424983, + "grad_norm": 0.13845130801200867, + "learning_rate": 1.6876437325153261e-06, + "loss": 0.6356, + "step": 3990 + }, + { + "epoch": 0.9197971882922332, + "grad_norm": 0.13963979482650757, + "learning_rate": 1.6780406633036095e-06, + "loss": 0.6461, + "step": 3991 + }, + { + "epoch": 0.9200276561419682, + "grad_norm": 0.1397661715745926, + "learning_rate": 1.6684645275922007e-06, + "loss": 0.6437, + "step": 3992 + }, + { + "epoch": 0.9202581239917031, + "grad_norm": 0.13445524871349335, + "learning_rate": 1.6589153307186078e-06, + "loss": 0.6337, + "step": 3993 + }, + { + "epoch": 0.9204885918414382, + "grad_norm": 0.14014729857444763, + "learning_rate": 1.6493930780053235e-06, + "loss": 0.6396, + "step": 3994 + }, + { + "epoch": 0.9207190596911731, + "grad_norm": 0.14015179872512817, + "learning_rate": 1.6398977747598243e-06, + "loss": 0.6377, + "step": 3995 + }, + { + "epoch": 0.9209495275409081, + "grad_norm": 0.13548614084720612, + "learning_rate": 1.6304294262745656e-06, + "loss": 0.635, + "step": 3996 + }, + { + "epoch": 0.921179995390643, + "grad_norm": 0.1403760462999344, + "learning_rate": 1.6209880378269705e-06, + "loss": 0.6384, + "step": 3997 + }, + { + "epoch": 0.921410463240378, + "grad_norm": 0.1437310427427292, + "learning_rate": 1.6115736146794402e-06, + "loss": 0.634, + "step": 3998 + }, + { + "epoch": 0.9216409310901129, + "grad_norm": 0.1346195787191391, + "learning_rate": 1.6021861620793666e-06, + "loss": 0.6337, + "step": 3999 + }, + { + "epoch": 0.9218713989398479, + "grad_norm": 0.13712115585803986, + "learning_rate": 1.5928256852590751e-06, + "loss": 0.6279, + "step": 4000 + }, + { + "epoch": 0.9221018667895828, + "grad_norm": 0.13652116060256958, + "learning_rate": 1.5834921894358701e-06, + "loss": 0.644, + "step": 4001 + }, + { + "epoch": 0.9223323346393179, + "grad_norm": 0.13506993651390076, + "learning_rate": 1.574185679812029e-06, + "loss": 0.6371, + "step": 4002 + }, + { + "epoch": 0.9225628024890528, + "grad_norm": 0.13801835477352142, + "learning_rate": 1.564906161574764e-06, + "loss": 0.6401, + "step": 4003 + }, + { + "epoch": 0.9227932703387878, + "grad_norm": 0.13854144513607025, + "learning_rate": 1.555653639896265e-06, + "loss": 0.6415, + "step": 4004 + }, + { + "epoch": 0.9230237381885227, + "grad_norm": 0.13430531322956085, + "learning_rate": 1.5464281199336683e-06, + "loss": 0.6296, + "step": 4005 + }, + { + "epoch": 0.9232542060382577, + "grad_norm": 0.13435374200344086, + "learning_rate": 1.5372296068290493e-06, + "loss": 0.6304, + "step": 4006 + }, + { + "epoch": 0.9234846738879926, + "grad_norm": 0.13501442968845367, + "learning_rate": 1.5280581057094346e-06, + "loss": 0.6355, + "step": 4007 + }, + { + "epoch": 0.9237151417377276, + "grad_norm": 0.13476891815662384, + "learning_rate": 1.518913621686807e-06, + "loss": 0.6398, + "step": 4008 + }, + { + "epoch": 0.9239456095874625, + "grad_norm": 0.1361880600452423, + "learning_rate": 1.5097961598580845e-06, + "loss": 0.6492, + "step": 4009 + }, + { + "epoch": 0.9241760774371975, + "grad_norm": 0.13403740525245667, + "learning_rate": 1.5007057253051127e-06, + "loss": 0.6257, + "step": 4010 + }, + { + "epoch": 0.9244065452869324, + "grad_norm": 0.13556121289730072, + "learning_rate": 1.4916423230946885e-06, + "loss": 0.6413, + "step": 4011 + }, + { + "epoch": 0.9246370131366675, + "grad_norm": 0.13702526688575745, + "learning_rate": 1.4826059582785324e-06, + "loss": 0.6427, + "step": 4012 + }, + { + "epoch": 0.9248674809864024, + "grad_norm": 0.13721799850463867, + "learning_rate": 1.473596635893293e-06, + "loss": 0.6494, + "step": 4013 + }, + { + "epoch": 0.9250979488361374, + "grad_norm": 0.1371307075023651, + "learning_rate": 1.464614360960559e-06, + "loss": 0.6482, + "step": 4014 + }, + { + "epoch": 0.9253284166858723, + "grad_norm": 0.1363903433084488, + "learning_rate": 1.4556591384868367e-06, + "loss": 0.6343, + "step": 4015 + }, + { + "epoch": 0.9255588845356073, + "grad_norm": 0.14017435908317566, + "learning_rate": 1.4467309734635393e-06, + "loss": 0.6399, + "step": 4016 + }, + { + "epoch": 0.9257893523853422, + "grad_norm": 0.13649635016918182, + "learning_rate": 1.437829870867019e-06, + "loss": 0.6372, + "step": 4017 + }, + { + "epoch": 0.9260198202350772, + "grad_norm": 0.13363580405712128, + "learning_rate": 1.4289558356585353e-06, + "loss": 0.6357, + "step": 4018 + }, + { + "epoch": 0.9262502880848121, + "grad_norm": 0.1385689377784729, + "learning_rate": 1.4201088727842648e-06, + "loss": 0.6344, + "step": 4019 + }, + { + "epoch": 0.9264807559345472, + "grad_norm": 0.13566775619983673, + "learning_rate": 1.411288987175291e-06, + "loss": 0.646, + "step": 4020 + }, + { + "epoch": 0.9267112237842821, + "grad_norm": 0.13955044746398926, + "learning_rate": 1.4024961837476092e-06, + "loss": 0.6411, + "step": 4021 + }, + { + "epoch": 0.9269416916340171, + "grad_norm": 0.13665254414081573, + "learning_rate": 1.39373046740211e-06, + "loss": 0.6417, + "step": 4022 + }, + { + "epoch": 0.927172159483752, + "grad_norm": 0.13750925660133362, + "learning_rate": 1.384991843024591e-06, + "loss": 0.6351, + "step": 4023 + }, + { + "epoch": 0.927402627333487, + "grad_norm": 0.1418118178844452, + "learning_rate": 1.3762803154857729e-06, + "loss": 0.6324, + "step": 4024 + }, + { + "epoch": 0.9276330951832219, + "grad_norm": 0.13481222093105316, + "learning_rate": 1.3675958896412267e-06, + "loss": 0.6325, + "step": 4025 + }, + { + "epoch": 0.9278635630329569, + "grad_norm": 0.1390499770641327, + "learning_rate": 1.3589385703314529e-06, + "loss": 0.6369, + "step": 4026 + }, + { + "epoch": 0.9280940308826918, + "grad_norm": 0.13821762800216675, + "learning_rate": 1.3503083623818412e-06, + "loss": 0.6386, + "step": 4027 + }, + { + "epoch": 0.9283244987324268, + "grad_norm": 0.141762837767601, + "learning_rate": 1.34170527060265e-06, + "loss": 0.6422, + "step": 4028 + }, + { + "epoch": 0.9285549665821617, + "grad_norm": 0.1358966827392578, + "learning_rate": 1.3331292997890377e-06, + "loss": 0.6365, + "step": 4029 + }, + { + "epoch": 0.9287854344318968, + "grad_norm": 0.13388477265834808, + "learning_rate": 1.3245804547210582e-06, + "loss": 0.6495, + "step": 4030 + }, + { + "epoch": 0.9290159022816317, + "grad_norm": 0.1355268657207489, + "learning_rate": 1.3160587401636171e-06, + "loss": 0.6362, + "step": 4031 + }, + { + "epoch": 0.9292463701313667, + "grad_norm": 0.1358853578567505, + "learning_rate": 1.3075641608665202e-06, + "loss": 0.6367, + "step": 4032 + }, + { + "epoch": 0.9294768379811016, + "grad_norm": 0.1385107785463333, + "learning_rate": 1.2990967215644412e-06, + "loss": 0.6408, + "step": 4033 + }, + { + "epoch": 0.9297073058308366, + "grad_norm": 0.1369771957397461, + "learning_rate": 1.2906564269769217e-06, + "loss": 0.6382, + "step": 4034 + }, + { + "epoch": 0.9299377736805715, + "grad_norm": 0.14050287008285522, + "learning_rate": 1.282243281808393e-06, + "loss": 0.6412, + "step": 4035 + }, + { + "epoch": 0.9301682415303065, + "grad_norm": 0.1366300880908966, + "learning_rate": 1.2738572907481315e-06, + "loss": 0.6419, + "step": 4036 + }, + { + "epoch": 0.9303987093800414, + "grad_norm": 0.1353761851787567, + "learning_rate": 1.2654984584702766e-06, + "loss": 0.6393, + "step": 4037 + }, + { + "epoch": 0.9306291772297764, + "grad_norm": 0.18008558452129364, + "learning_rate": 1.2571667896338624e-06, + "loss": 0.6354, + "step": 4038 + }, + { + "epoch": 0.9308596450795114, + "grad_norm": 0.13674607872962952, + "learning_rate": 1.2488622888827517e-06, + "loss": 0.6426, + "step": 4039 + }, + { + "epoch": 0.9310901129292464, + "grad_norm": 0.13963568210601807, + "learning_rate": 1.24058496084567e-06, + "loss": 0.6333, + "step": 4040 + }, + { + "epoch": 0.9313205807789813, + "grad_norm": 0.1362845003604889, + "learning_rate": 1.2323348101362043e-06, + "loss": 0.6435, + "step": 4041 + }, + { + "epoch": 0.9315510486287163, + "grad_norm": 0.1407993733882904, + "learning_rate": 1.224111841352793e-06, + "loss": 0.6403, + "step": 4042 + }, + { + "epoch": 0.9317815164784513, + "grad_norm": 0.13926169276237488, + "learning_rate": 1.2159160590787143e-06, + "loss": 0.6412, + "step": 4043 + }, + { + "epoch": 0.9320119843281862, + "grad_norm": 0.13823188841342926, + "learning_rate": 1.2077474678821088e-06, + "loss": 0.6452, + "step": 4044 + }, + { + "epoch": 0.9322424521779212, + "grad_norm": 0.1374104917049408, + "learning_rate": 1.1996060723159508e-06, + "loss": 0.6343, + "step": 4045 + }, + { + "epoch": 0.9324729200276561, + "grad_norm": 0.13301199674606323, + "learning_rate": 1.1914918769180606e-06, + "loss": 0.6373, + "step": 4046 + }, + { + "epoch": 0.9327033878773912, + "grad_norm": 0.136099711060524, + "learning_rate": 1.1834048862110814e-06, + "loss": 0.6435, + "step": 4047 + }, + { + "epoch": 0.9329338557271261, + "grad_norm": 0.140198215842247, + "learning_rate": 1.17534510470253e-06, + "loss": 0.6392, + "step": 4048 + }, + { + "epoch": 0.9331643235768611, + "grad_norm": 0.1340055614709854, + "learning_rate": 1.1673125368847238e-06, + "loss": 0.6425, + "step": 4049 + }, + { + "epoch": 0.933394791426596, + "grad_norm": 0.14197507500648499, + "learning_rate": 1.1593071872348204e-06, + "loss": 0.6369, + "step": 4050 + }, + { + "epoch": 0.933625259276331, + "grad_norm": 0.13812677562236786, + "learning_rate": 1.1513290602148174e-06, + "loss": 0.6403, + "step": 4051 + }, + { + "epoch": 0.9338557271260659, + "grad_norm": 0.1445024460554123, + "learning_rate": 1.1433781602715189e-06, + "loss": 0.6346, + "step": 4052 + }, + { + "epoch": 0.9340861949758009, + "grad_norm": 0.13985450565814972, + "learning_rate": 1.1354544918365795e-06, + "loss": 0.643, + "step": 4053 + }, + { + "epoch": 0.9343166628255358, + "grad_norm": 0.13591651618480682, + "learning_rate": 1.1275580593264611e-06, + "loss": 0.6331, + "step": 4054 + }, + { + "epoch": 0.9345471306752708, + "grad_norm": 0.13689643144607544, + "learning_rate": 1.1196888671424377e-06, + "loss": 0.6402, + "step": 4055 + }, + { + "epoch": 0.9347775985250057, + "grad_norm": 0.13568592071533203, + "learning_rate": 1.111846919670606e-06, + "loss": 0.6353, + "step": 4056 + }, + { + "epoch": 0.9350080663747408, + "grad_norm": 0.13714367151260376, + "learning_rate": 1.1040322212818922e-06, + "loss": 0.636, + "step": 4057 + }, + { + "epoch": 0.9352385342244757, + "grad_norm": 0.13835318386554718, + "learning_rate": 1.096244776332006e-06, + "loss": 0.6393, + "step": 4058 + }, + { + "epoch": 0.9354690020742107, + "grad_norm": 0.14133679866790771, + "learning_rate": 1.0884845891614925e-06, + "loss": 0.6446, + "step": 4059 + }, + { + "epoch": 0.9356994699239456, + "grad_norm": 0.13992494344711304, + "learning_rate": 1.0807516640956972e-06, + "loss": 0.6326, + "step": 4060 + }, + { + "epoch": 0.9359299377736806, + "grad_norm": 0.13520614802837372, + "learning_rate": 1.0730460054447612e-06, + "loss": 0.6325, + "step": 4061 + }, + { + "epoch": 0.9361604056234155, + "grad_norm": 0.13457529246807098, + "learning_rate": 1.065367617503621e-06, + "loss": 0.634, + "step": 4062 + }, + { + "epoch": 0.9363908734731505, + "grad_norm": 0.1412680298089981, + "learning_rate": 1.057716504552053e-06, + "loss": 0.6391, + "step": 4063 + }, + { + "epoch": 0.9366213413228854, + "grad_norm": 0.13816601037979126, + "learning_rate": 1.0500926708545855e-06, + "loss": 0.6334, + "step": 4064 + }, + { + "epoch": 0.9368518091726205, + "grad_norm": 0.138847216963768, + "learning_rate": 1.0424961206605632e-06, + "loss": 0.6358, + "step": 4065 + }, + { + "epoch": 0.9370822770223554, + "grad_norm": 0.13748426735401154, + "learning_rate": 1.0349268582041161e-06, + "loss": 0.6406, + "step": 4066 + }, + { + "epoch": 0.9373127448720904, + "grad_norm": 0.13643766939640045, + "learning_rate": 1.0273848877041802e-06, + "loss": 0.6331, + "step": 4067 + }, + { + "epoch": 0.9375432127218253, + "grad_norm": 0.1338902860879898, + "learning_rate": 1.0198702133644656e-06, + "loss": 0.639, + "step": 4068 + }, + { + "epoch": 0.9377736805715603, + "grad_norm": 0.13689804077148438, + "learning_rate": 1.0123828393734714e-06, + "loss": 0.6414, + "step": 4069 + }, + { + "epoch": 0.9380041484212952, + "grad_norm": 0.14404194056987762, + "learning_rate": 1.0049227699044762e-06, + "loss": 0.6427, + "step": 4070 + }, + { + "epoch": 0.9382346162710302, + "grad_norm": 0.1352960616350174, + "learning_rate": 9.974900091155425e-07, + "loss": 0.6478, + "step": 4071 + }, + { + "epoch": 0.9384650841207651, + "grad_norm": 0.1446170061826706, + "learning_rate": 9.90084561149518e-07, + "loss": 0.6336, + "step": 4072 + }, + { + "epoch": 0.9386955519705001, + "grad_norm": 0.14939038455486298, + "learning_rate": 9.827064301340228e-07, + "loss": 0.6342, + "step": 4073 + }, + { + "epoch": 0.938926019820235, + "grad_norm": 0.1382802277803421, + "learning_rate": 9.75355620181445e-07, + "loss": 0.6345, + "step": 4074 + }, + { + "epoch": 0.9391564876699701, + "grad_norm": 0.1343778669834137, + "learning_rate": 9.680321353889576e-07, + "loss": 0.6438, + "step": 4075 + }, + { + "epoch": 0.939386955519705, + "grad_norm": 0.13062545657157898, + "learning_rate": 9.607359798384785e-07, + "loss": 0.6332, + "step": 4076 + }, + { + "epoch": 0.93961742336944, + "grad_norm": 0.1372269243001938, + "learning_rate": 9.534671575967213e-07, + "loss": 0.6382, + "step": 4077 + }, + { + "epoch": 0.9398478912191749, + "grad_norm": 0.13971804082393646, + "learning_rate": 9.46225672715162e-07, + "loss": 0.6306, + "step": 4078 + }, + { + "epoch": 0.9400783590689099, + "grad_norm": 0.13551059365272522, + "learning_rate": 9.390115292300162e-07, + "loss": 0.6438, + "step": 4079 + }, + { + "epoch": 0.9403088269186448, + "grad_norm": 0.13725043833255768, + "learning_rate": 9.318247311622785e-07, + "loss": 0.6437, + "step": 4080 + }, + { + "epoch": 0.9405392947683798, + "grad_norm": 0.13926532864570618, + "learning_rate": 9.246652825176949e-07, + "loss": 0.6427, + "step": 4081 + }, + { + "epoch": 0.9407697626181147, + "grad_norm": 0.14660166203975677, + "learning_rate": 9.175331872867732e-07, + "loss": 0.6462, + "step": 4082 + }, + { + "epoch": 0.9410002304678498, + "grad_norm": 0.13765586912631989, + "learning_rate": 9.104284494447779e-07, + "loss": 0.6419, + "step": 4083 + }, + { + "epoch": 0.9412306983175847, + "grad_norm": 0.13550062477588654, + "learning_rate": 9.033510729517136e-07, + "loss": 0.6526, + "step": 4084 + }, + { + "epoch": 0.9414611661673197, + "grad_norm": 0.20040088891983032, + "learning_rate": 8.96301061752336e-07, + "loss": 0.6438, + "step": 4085 + }, + { + "epoch": 0.9416916340170546, + "grad_norm": 0.13654790818691254, + "learning_rate": 8.892784197761572e-07, + "loss": 0.629, + "step": 4086 + }, + { + "epoch": 0.9419221018667896, + "grad_norm": 0.13696341216564178, + "learning_rate": 8.822831509374297e-07, + "loss": 0.6424, + "step": 4087 + }, + { + "epoch": 0.9421525697165245, + "grad_norm": 0.13563202321529388, + "learning_rate": 8.753152591351455e-07, + "loss": 0.642, + "step": 4088 + }, + { + "epoch": 0.9423830375662595, + "grad_norm": 0.13841907680034637, + "learning_rate": 8.683747482530424e-07, + "loss": 0.6395, + "step": 4089 + }, + { + "epoch": 0.9426135054159944, + "grad_norm": 0.13629648089408875, + "learning_rate": 8.614616221595983e-07, + "loss": 0.6394, + "step": 4090 + }, + { + "epoch": 0.9428439732657294, + "grad_norm": 0.13527196645736694, + "learning_rate": 8.545758847080143e-07, + "loss": 0.6384, + "step": 4091 + }, + { + "epoch": 0.9430744411154643, + "grad_norm": 0.13530802726745605, + "learning_rate": 8.47717539736237e-07, + "loss": 0.6322, + "step": 4092 + }, + { + "epoch": 0.9433049089651994, + "grad_norm": 0.1388748586177826, + "learning_rate": 8.408865910669583e-07, + "loss": 0.6369, + "step": 4093 + }, + { + "epoch": 0.9435353768149343, + "grad_norm": 0.13674351572990417, + "learning_rate": 8.340830425075663e-07, + "loss": 0.6384, + "step": 4094 + }, + { + "epoch": 0.9437658446646693, + "grad_norm": 0.13656572997570038, + "learning_rate": 8.273068978501996e-07, + "loss": 0.6455, + "step": 4095 + }, + { + "epoch": 0.9439963125144042, + "grad_norm": 0.13279734551906586, + "learning_rate": 8.205581608717261e-07, + "loss": 0.633, + "step": 4096 + }, + { + "epoch": 0.9442267803641392, + "grad_norm": 0.14017152786254883, + "learning_rate": 8.138368353337255e-07, + "loss": 0.6338, + "step": 4097 + }, + { + "epoch": 0.9444572482138741, + "grad_norm": 0.13749238848686218, + "learning_rate": 8.071429249825013e-07, + "loss": 0.6366, + "step": 4098 + }, + { + "epoch": 0.9446877160636091, + "grad_norm": 0.1360204666852951, + "learning_rate": 8.004764335490856e-07, + "loss": 0.6405, + "step": 4099 + }, + { + "epoch": 0.944918183913344, + "grad_norm": 0.14251859486103058, + "learning_rate": 7.938373647492115e-07, + "loss": 0.6426, + "step": 4100 + }, + { + "epoch": 0.945148651763079, + "grad_norm": 0.13560590147972107, + "learning_rate": 7.872257222833357e-07, + "loss": 0.6376, + "step": 4101 + }, + { + "epoch": 0.9453791196128141, + "grad_norm": 0.1355055868625641, + "learning_rate": 7.806415098366438e-07, + "loss": 0.6379, + "step": 4102 + }, + { + "epoch": 0.945609587462549, + "grad_norm": 0.13701874017715454, + "learning_rate": 7.74084731079e-07, + "loss": 0.6298, + "step": 4103 + }, + { + "epoch": 0.945840055312284, + "grad_norm": 0.13840676844120026, + "learning_rate": 7.67555389665009e-07, + "loss": 0.629, + "step": 4104 + }, + { + "epoch": 0.9460705231620189, + "grad_norm": 0.13396546244621277, + "learning_rate": 7.61053489233965e-07, + "loss": 0.6323, + "step": 4105 + }, + { + "epoch": 0.9463009910117539, + "grad_norm": 0.1397581547498703, + "learning_rate": 7.545790334098579e-07, + "loss": 0.6427, + "step": 4106 + }, + { + "epoch": 0.9465314588614888, + "grad_norm": 0.13133475184440613, + "learning_rate": 7.481320258014124e-07, + "loss": 0.6424, + "step": 4107 + }, + { + "epoch": 0.9467619267112238, + "grad_norm": 0.13664543628692627, + "learning_rate": 7.417124700020373e-07, + "loss": 0.6419, + "step": 4108 + }, + { + "epoch": 0.9469923945609587, + "grad_norm": 0.13687404990196228, + "learning_rate": 7.353203695898203e-07, + "loss": 0.631, + "step": 4109 + }, + { + "epoch": 0.9472228624106938, + "grad_norm": 0.13796484470367432, + "learning_rate": 7.289557281275782e-07, + "loss": 0.6416, + "step": 4110 + }, + { + "epoch": 0.9474533302604287, + "grad_norm": 0.13329102098941803, + "learning_rate": 7.226185491628069e-07, + "loss": 0.6261, + "step": 4111 + }, + { + "epoch": 0.9476837981101637, + "grad_norm": 0.13241469860076904, + "learning_rate": 7.163088362276971e-07, + "loss": 0.6364, + "step": 4112 + }, + { + "epoch": 0.9479142659598986, + "grad_norm": 0.13166265189647675, + "learning_rate": 7.100265928391303e-07, + "loss": 0.6389, + "step": 4113 + }, + { + "epoch": 0.9481447338096336, + "grad_norm": 0.13284790515899658, + "learning_rate": 7.037718224986833e-07, + "loss": 0.6334, + "step": 4114 + }, + { + "epoch": 0.9483752016593685, + "grad_norm": 0.13939420878887177, + "learning_rate": 6.975445286926063e-07, + "loss": 0.6227, + "step": 4115 + }, + { + "epoch": 0.9486056695091035, + "grad_norm": 0.13803811371326447, + "learning_rate": 6.913447148918506e-07, + "loss": 0.6405, + "step": 4116 + }, + { + "epoch": 0.9488361373588384, + "grad_norm": 0.13367731869220734, + "learning_rate": 6.851723845520408e-07, + "loss": 0.6311, + "step": 4117 + }, + { + "epoch": 0.9490666052085734, + "grad_norm": 0.13658930361270905, + "learning_rate": 6.790275411134861e-07, + "loss": 0.6459, + "step": 4118 + }, + { + "epoch": 0.9492970730583083, + "grad_norm": 0.13773563504219055, + "learning_rate": 6.729101880011746e-07, + "loss": 0.6382, + "step": 4119 + }, + { + "epoch": 0.9495275409080434, + "grad_norm": 0.13418744504451752, + "learning_rate": 6.668203286247732e-07, + "loss": 0.6321, + "step": 4120 + }, + { + "epoch": 0.9497580087577783, + "grad_norm": 0.13385219871997833, + "learning_rate": 6.607579663786223e-07, + "loss": 0.6343, + "step": 4121 + }, + { + "epoch": 0.9499884766075133, + "grad_norm": 0.13633115589618683, + "learning_rate": 6.547231046417357e-07, + "loss": 0.6265, + "step": 4122 + }, + { + "epoch": 0.9502189444572482, + "grad_norm": 0.13809369504451752, + "learning_rate": 6.48715746777806e-07, + "loss": 0.6444, + "step": 4123 + }, + { + "epoch": 0.9504494123069832, + "grad_norm": 0.13296149671077728, + "learning_rate": 6.42735896135177e-07, + "loss": 0.6357, + "step": 4124 + }, + { + "epoch": 0.9506798801567181, + "grad_norm": 0.13449513912200928, + "learning_rate": 6.367835560468938e-07, + "loss": 0.6401, + "step": 4125 + }, + { + "epoch": 0.9509103480064531, + "grad_norm": 0.13610585033893585, + "learning_rate": 6.308587298306301e-07, + "loss": 0.6376, + "step": 4126 + }, + { + "epoch": 0.951140815856188, + "grad_norm": 0.13235846161842346, + "learning_rate": 6.2496142078875e-07, + "loss": 0.6397, + "step": 4127 + }, + { + "epoch": 0.951371283705923, + "grad_norm": 0.13302072882652283, + "learning_rate": 6.19091632208274e-07, + "loss": 0.634, + "step": 4128 + }, + { + "epoch": 0.951601751555658, + "grad_norm": 0.13481658697128296, + "learning_rate": 6.132493673608797e-07, + "loss": 0.6375, + "step": 4129 + }, + { + "epoch": 0.951832219405393, + "grad_norm": 0.139765664935112, + "learning_rate": 6.074346295028955e-07, + "loss": 0.6346, + "step": 4130 + }, + { + "epoch": 0.9520626872551279, + "grad_norm": 0.13490067422389984, + "learning_rate": 6.016474218753288e-07, + "loss": 0.6376, + "step": 4131 + }, + { + "epoch": 0.9522931551048629, + "grad_norm": 0.13335686922073364, + "learning_rate": 5.958877477038327e-07, + "loss": 0.6462, + "step": 4132 + }, + { + "epoch": 0.9525236229545978, + "grad_norm": 0.1309853196144104, + "learning_rate": 5.901556101987005e-07, + "loss": 0.632, + "step": 4133 + }, + { + "epoch": 0.9527540908043328, + "grad_norm": 0.13043320178985596, + "learning_rate": 5.84451012554893e-07, + "loss": 0.6474, + "step": 4134 + }, + { + "epoch": 0.9529845586540677, + "grad_norm": 0.13605879247188568, + "learning_rate": 5.787739579520113e-07, + "loss": 0.6376, + "step": 4135 + }, + { + "epoch": 0.9532150265038027, + "grad_norm": 0.13562782108783722, + "learning_rate": 5.731244495543186e-07, + "loss": 0.6292, + "step": 4136 + }, + { + "epoch": 0.9534454943535376, + "grad_norm": 0.1376948058605194, + "learning_rate": 5.675024905107129e-07, + "loss": 0.641, + "step": 4137 + }, + { + "epoch": 0.9536759622032727, + "grad_norm": 0.13430991768836975, + "learning_rate": 5.619080839547375e-07, + "loss": 0.6371, + "step": 4138 + }, + { + "epoch": 0.9539064300530076, + "grad_norm": 0.1342536360025406, + "learning_rate": 5.563412330045758e-07, + "loss": 0.6447, + "step": 4139 + }, + { + "epoch": 0.9541368979027426, + "grad_norm": 0.13564327359199524, + "learning_rate": 5.508019407630572e-07, + "loss": 0.6422, + "step": 4140 + }, + { + "epoch": 0.9543673657524775, + "grad_norm": 0.1371186226606369, + "learning_rate": 5.452902103176616e-07, + "loss": 0.6401, + "step": 4141 + }, + { + "epoch": 0.9545978336022125, + "grad_norm": 0.13882949948310852, + "learning_rate": 5.398060447404818e-07, + "loss": 0.6294, + "step": 4142 + }, + { + "epoch": 0.9548283014519474, + "grad_norm": 0.14043927192687988, + "learning_rate": 5.343494470882671e-07, + "loss": 0.6451, + "step": 4143 + }, + { + "epoch": 0.9550587693016824, + "grad_norm": 0.13908624649047852, + "learning_rate": 5.289204204023957e-07, + "loss": 0.6357, + "step": 4144 + }, + { + "epoch": 0.9552892371514173, + "grad_norm": 0.13900399208068848, + "learning_rate": 5.235189677088692e-07, + "loss": 0.6432, + "step": 4145 + }, + { + "epoch": 0.9555197050011524, + "grad_norm": 0.13600599765777588, + "learning_rate": 5.18145092018335e-07, + "loss": 0.6311, + "step": 4146 + }, + { + "epoch": 0.9557501728508873, + "grad_norm": 0.1335585117340088, + "learning_rate": 5.127987963260583e-07, + "loss": 0.6355, + "step": 4147 + }, + { + "epoch": 0.9559806407006223, + "grad_norm": 0.1321658492088318, + "learning_rate": 5.074800836119442e-07, + "loss": 0.6348, + "step": 4148 + }, + { + "epoch": 0.9562111085503572, + "grad_norm": 0.13960915803909302, + "learning_rate": 5.021889568404991e-07, + "loss": 0.6346, + "step": 4149 + }, + { + "epoch": 0.9564415764000922, + "grad_norm": 0.13837899267673492, + "learning_rate": 4.969254189608863e-07, + "loss": 0.6393, + "step": 4150 + }, + { + "epoch": 0.9566720442498271, + "grad_norm": 0.13521605730056763, + "learning_rate": 4.916894729068644e-07, + "loss": 0.6338, + "step": 4151 + }, + { + "epoch": 0.9569025120995621, + "grad_norm": 0.13600218296051025, + "learning_rate": 4.864811215968324e-07, + "loss": 0.6392, + "step": 4152 + }, + { + "epoch": 0.957132979949297, + "grad_norm": 0.1411590576171875, + "learning_rate": 4.813003679337957e-07, + "loss": 0.6352, + "step": 4153 + }, + { + "epoch": 0.957363447799032, + "grad_norm": 0.13098092377185822, + "learning_rate": 4.761472148053836e-07, + "loss": 0.637, + "step": 4154 + }, + { + "epoch": 0.957593915648767, + "grad_norm": 0.13644741475582123, + "learning_rate": 4.710216650838317e-07, + "loss": 0.6348, + "step": 4155 + }, + { + "epoch": 0.957824383498502, + "grad_norm": 0.13410955667495728, + "learning_rate": 4.6592372162601037e-07, + "loss": 0.6394, + "step": 4156 + }, + { + "epoch": 0.9580548513482369, + "grad_norm": 0.13613279163837433, + "learning_rate": 4.608533872733911e-07, + "loss": 0.642, + "step": 4157 + }, + { + "epoch": 0.9582853191979719, + "grad_norm": 0.13792695105075836, + "learning_rate": 4.558106648520466e-07, + "loss": 0.6296, + "step": 4158 + }, + { + "epoch": 0.9585157870477068, + "grad_norm": 0.14079563319683075, + "learning_rate": 4.5079555717267854e-07, + "loss": 0.649, + "step": 4159 + }, + { + "epoch": 0.9587462548974418, + "grad_norm": 0.13565976917743683, + "learning_rate": 4.4580806703057865e-07, + "loss": 0.6424, + "step": 4160 + }, + { + "epoch": 0.9589767227471768, + "grad_norm": 0.13566036522388458, + "learning_rate": 4.408481972056622e-07, + "loss": 0.6388, + "step": 4161 + }, + { + "epoch": 0.9592071905969117, + "grad_norm": 0.1372612714767456, + "learning_rate": 4.3591595046243994e-07, + "loss": 0.6375, + "step": 4162 + }, + { + "epoch": 0.9594376584466467, + "grad_norm": 0.13461288809776306, + "learning_rate": 4.3101132955002396e-07, + "loss": 0.635, + "step": 4163 + }, + { + "epoch": 0.9596681262963817, + "grad_norm": 0.1348971575498581, + "learning_rate": 4.2613433720213316e-07, + "loss": 0.6376, + "step": 4164 + }, + { + "epoch": 0.9598985941461167, + "grad_norm": 0.13625575602054596, + "learning_rate": 4.212849761370874e-07, + "loss": 0.6393, + "step": 4165 + }, + { + "epoch": 0.9601290619958516, + "grad_norm": 0.13304929435253143, + "learning_rate": 4.16463249057808e-07, + "loss": 0.6365, + "step": 4166 + }, + { + "epoch": 0.9603595298455866, + "grad_norm": 0.13466964662075043, + "learning_rate": 4.116691586518062e-07, + "loss": 0.6374, + "step": 4167 + }, + { + "epoch": 0.9605899976953215, + "grad_norm": 0.1362065076828003, + "learning_rate": 4.0690270759119464e-07, + "loss": 0.6364, + "step": 4168 + }, + { + "epoch": 0.9608204655450565, + "grad_norm": 0.13293537497520447, + "learning_rate": 4.021638985326759e-07, + "loss": 0.6401, + "step": 4169 + }, + { + "epoch": 0.9610509333947914, + "grad_norm": 0.13475392758846283, + "learning_rate": 3.974527341175427e-07, + "loss": 0.6275, + "step": 4170 + }, + { + "epoch": 0.9612814012445264, + "grad_norm": 0.13077649474143982, + "learning_rate": 3.9276921697169455e-07, + "loss": 0.6325, + "step": 4171 + }, + { + "epoch": 0.9615118690942613, + "grad_norm": 0.13671356439590454, + "learning_rate": 3.8811334970561553e-07, + "loss": 0.6435, + "step": 4172 + }, + { + "epoch": 0.9617423369439964, + "grad_norm": 0.1355324536561966, + "learning_rate": 3.834851349143631e-07, + "loss": 0.6361, + "step": 4173 + }, + { + "epoch": 0.9619728047937313, + "grad_norm": 0.13314592838287354, + "learning_rate": 3.78884575177596e-07, + "loss": 0.6362, + "step": 4174 + }, + { + "epoch": 0.9622032726434663, + "grad_norm": 0.14188657701015472, + "learning_rate": 3.743116730595575e-07, + "loss": 0.6382, + "step": 4175 + }, + { + "epoch": 0.9624337404932012, + "grad_norm": 0.133900985121727, + "learning_rate": 3.697664311090754e-07, + "loss": 0.6374, + "step": 4176 + }, + { + "epoch": 0.9626642083429362, + "grad_norm": 0.1333308219909668, + "learning_rate": 3.6524885185955647e-07, + "loss": 0.6408, + "step": 4177 + }, + { + "epoch": 0.9628946761926711, + "grad_norm": 0.13134267926216125, + "learning_rate": 3.6075893782899217e-07, + "loss": 0.632, + "step": 4178 + }, + { + "epoch": 0.9631251440424061, + "grad_norm": 0.13069699704647064, + "learning_rate": 3.5629669151994725e-07, + "loss": 0.6364, + "step": 4179 + }, + { + "epoch": 0.963355611892141, + "grad_norm": 0.1325685828924179, + "learning_rate": 3.518621154195767e-07, + "loss": 0.6371, + "step": 4180 + }, + { + "epoch": 0.963586079741876, + "grad_norm": 0.13809578120708466, + "learning_rate": 3.4745521199960884e-07, + "loss": 0.6378, + "step": 4181 + }, + { + "epoch": 0.963816547591611, + "grad_norm": 0.13212169706821442, + "learning_rate": 3.4307598371633445e-07, + "loss": 0.6327, + "step": 4182 + }, + { + "epoch": 0.964047015441346, + "grad_norm": 0.1352756917476654, + "learning_rate": 3.387244330106454e-07, + "loss": 0.6443, + "step": 4183 + }, + { + "epoch": 0.9642774832910809, + "grad_norm": 0.1354503035545349, + "learning_rate": 3.3440056230797933e-07, + "loss": 0.6386, + "step": 4184 + }, + { + "epoch": 0.9645079511408159, + "grad_norm": 0.13389207422733307, + "learning_rate": 3.30104374018364e-07, + "loss": 0.6328, + "step": 4185 + }, + { + "epoch": 0.9647384189905508, + "grad_norm": 0.13191154599189758, + "learning_rate": 3.2583587053638955e-07, + "loss": 0.6486, + "step": 4186 + }, + { + "epoch": 0.9649688868402858, + "grad_norm": 0.1348099559545517, + "learning_rate": 3.2159505424122495e-07, + "loss": 0.6407, + "step": 4187 + }, + { + "epoch": 0.9651993546900207, + "grad_norm": 0.135834202170372, + "learning_rate": 3.1738192749658503e-07, + "loss": 0.6347, + "step": 4188 + }, + { + "epoch": 0.9654298225397557, + "grad_norm": 0.13616180419921875, + "learning_rate": 3.131964926507747e-07, + "loss": 0.6374, + "step": 4189 + }, + { + "epoch": 0.9656602903894906, + "grad_norm": 0.13497985899448395, + "learning_rate": 3.0903875203665556e-07, + "loss": 0.6316, + "step": 4190 + }, + { + "epoch": 0.9658907582392257, + "grad_norm": 0.13490994274616241, + "learning_rate": 3.049087079716462e-07, + "loss": 0.639, + "step": 4191 + }, + { + "epoch": 0.9661212260889606, + "grad_norm": 0.13283471763134003, + "learning_rate": 3.0080636275774397e-07, + "loss": 0.6342, + "step": 4192 + }, + { + "epoch": 0.9663516939386956, + "grad_norm": 0.1357053518295288, + "learning_rate": 2.967317186814922e-07, + "loss": 0.6337, + "step": 4193 + }, + { + "epoch": 0.9665821617884305, + "grad_norm": 0.13123416900634766, + "learning_rate": 2.926847780139907e-07, + "loss": 0.6394, + "step": 4194 + }, + { + "epoch": 0.9668126296381655, + "grad_norm": 0.13529904186725616, + "learning_rate": 2.8866554301091866e-07, + "loss": 0.6258, + "step": 4195 + }, + { + "epoch": 0.9670430974879004, + "grad_norm": 0.13570614159107208, + "learning_rate": 2.846740159125061e-07, + "loss": 0.6341, + "step": 4196 + }, + { + "epoch": 0.9672735653376354, + "grad_norm": 0.13483721017837524, + "learning_rate": 2.807101989435179e-07, + "loss": 0.6394, + "step": 4197 + }, + { + "epoch": 0.9675040331873703, + "grad_norm": 0.13158808648586273, + "learning_rate": 2.767740943133035e-07, + "loss": 0.6317, + "step": 4198 + }, + { + "epoch": 0.9677345010371053, + "grad_norm": 0.135604590177536, + "learning_rate": 2.7286570421574677e-07, + "loss": 0.6362, + "step": 4199 + }, + { + "epoch": 0.9679649688868402, + "grad_norm": 0.13495594263076782, + "learning_rate": 2.6898503082929406e-07, + "loss": 0.6278, + "step": 4200 + }, + { + "epoch": 0.9681954367365753, + "grad_norm": 0.13914699852466583, + "learning_rate": 2.6513207631693184e-07, + "loss": 0.6368, + "step": 4201 + }, + { + "epoch": 0.9684259045863102, + "grad_norm": 0.1364966183900833, + "learning_rate": 2.6130684282621995e-07, + "loss": 0.6334, + "step": 4202 + }, + { + "epoch": 0.9686563724360452, + "grad_norm": 0.1355530023574829, + "learning_rate": 2.575093324892364e-07, + "loss": 0.637, + "step": 4203 + }, + { + "epoch": 0.9688868402857801, + "grad_norm": 0.1372985690832138, + "learning_rate": 2.5373954742263227e-07, + "loss": 0.6267, + "step": 4204 + }, + { + "epoch": 0.9691173081355151, + "grad_norm": 0.1355789601802826, + "learning_rate": 2.4999748972758805e-07, + "loss": 0.6418, + "step": 4205 + }, + { + "epoch": 0.96934777598525, + "grad_norm": 0.13223963975906372, + "learning_rate": 2.462831614898409e-07, + "loss": 0.6339, + "step": 4206 + }, + { + "epoch": 0.969578243834985, + "grad_norm": 0.1328386813402176, + "learning_rate": 2.42596564779668e-07, + "loss": 0.6352, + "step": 4207 + }, + { + "epoch": 0.9698087116847199, + "grad_norm": 0.13580402731895447, + "learning_rate": 2.3893770165189235e-07, + "loss": 0.634, + "step": 4208 + }, + { + "epoch": 0.970039179534455, + "grad_norm": 0.13300515711307526, + "learning_rate": 2.3530657414586598e-07, + "loss": 0.6409, + "step": 4209 + }, + { + "epoch": 0.9702696473841899, + "grad_norm": 0.13167451322078705, + "learning_rate": 2.317031842855033e-07, + "loss": 0.6353, + "step": 4210 + }, + { + "epoch": 0.9705001152339249, + "grad_norm": 0.13538329303264618, + "learning_rate": 2.281275340792477e-07, + "loss": 0.6342, + "step": 4211 + }, + { + "epoch": 0.9707305830836598, + "grad_norm": 0.1796192079782486, + "learning_rate": 2.2457962552007162e-07, + "loss": 0.6347, + "step": 4212 + }, + { + "epoch": 0.9709610509333948, + "grad_norm": 0.13724969327449799, + "learning_rate": 2.2105946058549876e-07, + "loss": 0.634, + "step": 4213 + }, + { + "epoch": 0.9711915187831297, + "grad_norm": 0.13621461391448975, + "learning_rate": 2.1756704123758742e-07, + "loss": 0.641, + "step": 4214 + }, + { + "epoch": 0.9714219866328647, + "grad_norm": 0.13638457655906677, + "learning_rate": 2.141023694229305e-07, + "loss": 0.6366, + "step": 4215 + }, + { + "epoch": 0.9716524544825996, + "grad_norm": 0.1362111121416092, + "learning_rate": 2.1066544707264435e-07, + "loss": 0.6483, + "step": 4216 + }, + { + "epoch": 0.9718829223323346, + "grad_norm": 0.13554121553897858, + "learning_rate": 2.0725627610239107e-07, + "loss": 0.6455, + "step": 4217 + }, + { + "epoch": 0.9721133901820695, + "grad_norm": 0.13165734708309174, + "learning_rate": 2.038748584123562e-07, + "loss": 0.6362, + "step": 4218 + }, + { + "epoch": 0.9723438580318046, + "grad_norm": 0.13413430750370026, + "learning_rate": 2.0052119588727103e-07, + "loss": 0.6415, + "step": 4219 + }, + { + "epoch": 0.9725743258815395, + "grad_norm": 0.13295698165893555, + "learning_rate": 1.9719529039637919e-07, + "loss": 0.6413, + "step": 4220 + }, + { + "epoch": 0.9728047937312745, + "grad_norm": 0.1375885158777237, + "learning_rate": 1.9389714379346446e-07, + "loss": 0.6374, + "step": 4221 + }, + { + "epoch": 0.9730352615810095, + "grad_norm": 0.1361759752035141, + "learning_rate": 1.906267579168286e-07, + "loss": 0.6349, + "step": 4222 + }, + { + "epoch": 0.9732657294307444, + "grad_norm": 0.13398383557796478, + "learning_rate": 1.8738413458931347e-07, + "loss": 0.6336, + "step": 4223 + }, + { + "epoch": 0.9734961972804794, + "grad_norm": 0.13173305988311768, + "learning_rate": 1.8416927561827336e-07, + "loss": 0.6304, + "step": 4224 + }, + { + "epoch": 0.9737266651302143, + "grad_norm": 0.13570670783519745, + "learning_rate": 1.8098218279559708e-07, + "loss": 0.6381, + "step": 4225 + }, + { + "epoch": 0.9739571329799493, + "grad_norm": 0.13517099618911743, + "learning_rate": 1.7782285789769147e-07, + "loss": 0.6361, + "step": 4226 + }, + { + "epoch": 0.9741876008296843, + "grad_norm": 0.1324930042028427, + "learning_rate": 1.7469130268549238e-07, + "loss": 0.6377, + "step": 4227 + }, + { + "epoch": 0.9744180686794193, + "grad_norm": 0.1336393505334854, + "learning_rate": 1.7158751890444803e-07, + "loss": 0.6356, + "step": 4228 + }, + { + "epoch": 0.9746485365291542, + "grad_norm": 0.13054290413856506, + "learning_rate": 1.6851150828453566e-07, + "loss": 0.64, + "step": 4229 + }, + { + "epoch": 0.9748790043788892, + "grad_norm": 0.130965456366539, + "learning_rate": 1.6546327254025052e-07, + "loss": 0.6411, + "step": 4230 + }, + { + "epoch": 0.9751094722286241, + "grad_norm": 0.13424457609653473, + "learning_rate": 1.6244281337060574e-07, + "loss": 0.6407, + "step": 4231 + }, + { + "epoch": 0.9753399400783591, + "grad_norm": 0.1327114850282669, + "learning_rate": 1.5945013245913799e-07, + "loss": 0.646, + "step": 4232 + }, + { + "epoch": 0.975570407928094, + "grad_norm": 0.13304896652698517, + "learning_rate": 1.5648523147388516e-07, + "loss": 0.6394, + "step": 4233 + }, + { + "epoch": 0.975800875777829, + "grad_norm": 0.1386221945285797, + "learning_rate": 1.5354811206741427e-07, + "loss": 0.645, + "step": 4234 + }, + { + "epoch": 0.9760313436275639, + "grad_norm": 0.13558200001716614, + "learning_rate": 1.5063877587681019e-07, + "loss": 0.6303, + "step": 4235 + }, + { + "epoch": 0.976261811477299, + "grad_norm": 0.13395509123802185, + "learning_rate": 1.4775722452366468e-07, + "loss": 0.6372, + "step": 4236 + }, + { + "epoch": 0.9764922793270339, + "grad_norm": 0.1313483864068985, + "learning_rate": 1.4490345961408746e-07, + "loss": 0.634, + "step": 4237 + }, + { + "epoch": 0.9767227471767689, + "grad_norm": 0.13300201296806335, + "learning_rate": 1.4207748273868948e-07, + "loss": 0.6448, + "step": 4238 + }, + { + "epoch": 0.9769532150265038, + "grad_norm": 0.1316831409931183, + "learning_rate": 1.3927929547261632e-07, + "loss": 0.6384, + "step": 4239 + }, + { + "epoch": 0.9771836828762388, + "grad_norm": 0.13369691371917725, + "learning_rate": 1.365088993755037e-07, + "loss": 0.6355, + "step": 4240 + }, + { + "epoch": 0.9774141507259737, + "grad_norm": 0.13882844150066376, + "learning_rate": 1.337662959914998e-07, + "loss": 0.6482, + "step": 4241 + }, + { + "epoch": 0.9776446185757087, + "grad_norm": 0.13777481019496918, + "learning_rate": 1.3105148684927072e-07, + "loss": 0.6451, + "step": 4242 + }, + { + "epoch": 0.9778750864254436, + "grad_norm": 0.1354648768901825, + "learning_rate": 1.283644734619893e-07, + "loss": 0.6485, + "step": 4243 + }, + { + "epoch": 0.9781055542751786, + "grad_norm": 0.13480645418167114, + "learning_rate": 1.257052573273243e-07, + "loss": 0.6384, + "step": 4244 + }, + { + "epoch": 0.9783360221249136, + "grad_norm": 0.1347130984067917, + "learning_rate": 1.2307383992746225e-07, + "loss": 0.6335, + "step": 4245 + }, + { + "epoch": 0.9785664899746486, + "grad_norm": 0.13522972166538239, + "learning_rate": 1.2047022272909102e-07, + "loss": 0.6338, + "step": 4246 + }, + { + "epoch": 0.9787969578243835, + "grad_norm": 0.137363463640213, + "learning_rate": 1.1789440718341093e-07, + "loss": 0.6306, + "step": 4247 + }, + { + "epoch": 0.9790274256741185, + "grad_norm": 0.13565847277641296, + "learning_rate": 1.1534639472611242e-07, + "loss": 0.6334, + "step": 4248 + }, + { + "epoch": 0.9792578935238534, + "grad_norm": 0.1334226429462433, + "learning_rate": 1.1282618677739831e-07, + "loss": 0.6332, + "step": 4249 + }, + { + "epoch": 0.9794883613735884, + "grad_norm": 0.13705900311470032, + "learning_rate": 1.1033378474197276e-07, + "loss": 0.6332, + "step": 4250 + }, + { + "epoch": 0.9797188292233233, + "grad_norm": 0.1340656876564026, + "learning_rate": 1.0786919000903562e-07, + "loss": 0.6424, + "step": 4251 + }, + { + "epoch": 0.9799492970730583, + "grad_norm": 0.13730952143669128, + "learning_rate": 1.054324039523047e-07, + "loss": 0.6377, + "step": 4252 + }, + { + "epoch": 0.9801797649227932, + "grad_norm": 0.1353830099105835, + "learning_rate": 1.0302342792997688e-07, + "loss": 0.6389, + "step": 4253 + }, + { + "epoch": 0.9804102327725283, + "grad_norm": 0.13165293633937836, + "learning_rate": 1.0064226328476145e-07, + "loss": 0.6366, + "step": 4254 + }, + { + "epoch": 0.9806407006222632, + "grad_norm": 0.12924884259700775, + "learning_rate": 9.828891134385786e-08, + "loss": 0.6363, + "step": 4255 + }, + { + "epoch": 0.9808711684719982, + "grad_norm": 0.13422457873821259, + "learning_rate": 9.596337341897243e-08, + "loss": 0.635, + "step": 4256 + }, + { + "epoch": 0.9811016363217331, + "grad_norm": 0.1396060436964035, + "learning_rate": 9.366565080630163e-08, + "loss": 0.6328, + "step": 4257 + }, + { + "epoch": 0.9813321041714681, + "grad_norm": 0.1325884759426117, + "learning_rate": 9.139574478654322e-08, + "loss": 0.6278, + "step": 4258 + }, + { + "epoch": 0.981562572021203, + "grad_norm": 0.13009661436080933, + "learning_rate": 8.915365662488518e-08, + "loss": 0.6329, + "step": 4259 + }, + { + "epoch": 0.981793039870938, + "grad_norm": 0.13182280957698822, + "learning_rate": 8.693938757101672e-08, + "loss": 0.6293, + "step": 4260 + }, + { + "epoch": 0.9820235077206729, + "grad_norm": 0.13452081382274628, + "learning_rate": 8.475293885911173e-08, + "loss": 0.6298, + "step": 4261 + }, + { + "epoch": 0.982253975570408, + "grad_norm": 0.13344185054302216, + "learning_rate": 8.259431170785647e-08, + "loss": 0.6335, + "step": 4262 + }, + { + "epoch": 0.9824844434201429, + "grad_norm": 0.1369807869195938, + "learning_rate": 8.046350732041075e-08, + "loss": 0.632, + "step": 4263 + }, + { + "epoch": 0.9827149112698779, + "grad_norm": 0.13430048525333405, + "learning_rate": 7.836052688443007e-08, + "loss": 0.6378, + "step": 4264 + }, + { + "epoch": 0.9829453791196128, + "grad_norm": 0.13494747877120972, + "learning_rate": 7.628537157207128e-08, + "loss": 0.6394, + "step": 4265 + }, + { + "epoch": 0.9831758469693478, + "grad_norm": 0.13669085502624512, + "learning_rate": 7.423804253997579e-08, + "loss": 0.6392, + "step": 4266 + }, + { + "epoch": 0.9834063148190827, + "grad_norm": 0.13560469448566437, + "learning_rate": 7.221854092926971e-08, + "loss": 0.6522, + "step": 4267 + }, + { + "epoch": 0.9836367826688177, + "grad_norm": 0.13016438484191895, + "learning_rate": 7.022686786558042e-08, + "loss": 0.6298, + "step": 4268 + }, + { + "epoch": 0.9838672505185526, + "grad_norm": 0.13416080176830292, + "learning_rate": 6.826302445901989e-08, + "loss": 0.6433, + "step": 4269 + }, + { + "epoch": 0.9840977183682876, + "grad_norm": 0.13181835412979126, + "learning_rate": 6.632701180418476e-08, + "loss": 0.6342, + "step": 4270 + }, + { + "epoch": 0.9843281862180225, + "grad_norm": 0.13344010710716248, + "learning_rate": 6.441883098015633e-08, + "loss": 0.6337, + "step": 4271 + }, + { + "epoch": 0.9845586540677576, + "grad_norm": 0.13779869675636292, + "learning_rate": 6.253848305052268e-08, + "loss": 0.6446, + "step": 4272 + }, + { + "epoch": 0.9847891219174925, + "grad_norm": 0.13841082155704498, + "learning_rate": 6.06859690633288e-08, + "loss": 0.6393, + "step": 4273 + }, + { + "epoch": 0.9850195897672275, + "grad_norm": 0.13432055711746216, + "learning_rate": 5.886129005113206e-08, + "loss": 0.6352, + "step": 4274 + }, + { + "epoch": 0.9852500576169624, + "grad_norm": 0.1333775520324707, + "learning_rate": 5.706444703096336e-08, + "loss": 0.6288, + "step": 4275 + }, + { + "epoch": 0.9854805254666974, + "grad_norm": 0.13299840688705444, + "learning_rate": 5.5295441004332704e-08, + "loss": 0.6288, + "step": 4276 + }, + { + "epoch": 0.9857109933164323, + "grad_norm": 0.1346500962972641, + "learning_rate": 5.355427295725135e-08, + "loss": 0.6369, + "step": 4277 + }, + { + "epoch": 0.9859414611661673, + "grad_norm": 0.13148584961891174, + "learning_rate": 5.184094386019855e-08, + "loss": 0.6326, + "step": 4278 + }, + { + "epoch": 0.9861719290159022, + "grad_norm": 0.1373184323310852, + "learning_rate": 5.0155454668149304e-08, + "loss": 0.6345, + "step": 4279 + }, + { + "epoch": 0.9864023968656372, + "grad_norm": 0.1355026662349701, + "learning_rate": 4.849780632054657e-08, + "loss": 0.6454, + "step": 4280 + }, + { + "epoch": 0.9866328647153723, + "grad_norm": 0.13367265462875366, + "learning_rate": 4.6867999741323496e-08, + "loss": 0.6447, + "step": 4281 + }, + { + "epoch": 0.9868633325651072, + "grad_norm": 0.134071946144104, + "learning_rate": 4.5266035838903434e-08, + "loss": 0.6395, + "step": 4282 + }, + { + "epoch": 0.9870938004148422, + "grad_norm": 0.1350136399269104, + "learning_rate": 4.3691915506177686e-08, + "loss": 0.6453, + "step": 4283 + }, + { + "epoch": 0.9873242682645771, + "grad_norm": 0.13533277809619904, + "learning_rate": 4.21456396205222e-08, + "loss": 0.6347, + "step": 4284 + }, + { + "epoch": 0.9875547361143121, + "grad_norm": 0.13973018527030945, + "learning_rate": 4.062720904379757e-08, + "loss": 0.6393, + "step": 4285 + }, + { + "epoch": 0.987785203964047, + "grad_norm": 0.13376709818840027, + "learning_rate": 3.913662462233791e-08, + "loss": 0.6352, + "step": 4286 + }, + { + "epoch": 0.988015671813782, + "grad_norm": 0.13301095366477966, + "learning_rate": 3.767388718696197e-08, + "loss": 0.6446, + "step": 4287 + }, + { + "epoch": 0.9882461396635169, + "grad_norm": 0.13637404143810272, + "learning_rate": 3.6238997552956456e-08, + "loss": 0.6414, + "step": 4288 + }, + { + "epoch": 0.988476607513252, + "grad_norm": 0.13000015914440155, + "learning_rate": 3.483195652010385e-08, + "loss": 0.6326, + "step": 4289 + }, + { + "epoch": 0.9887070753629869, + "grad_norm": 0.1287553757429123, + "learning_rate": 3.3452764872649036e-08, + "loss": 0.6359, + "step": 4290 + }, + { + "epoch": 0.9889375432127219, + "grad_norm": 0.13659948110580444, + "learning_rate": 3.210142337932709e-08, + "loss": 0.6306, + "step": 4291 + }, + { + "epoch": 0.9891680110624568, + "grad_norm": 0.13787119090557098, + "learning_rate": 3.0777932793335516e-08, + "loss": 0.6243, + "step": 4292 + }, + { + "epoch": 0.9893984789121918, + "grad_norm": 0.13213881850242615, + "learning_rate": 2.948229385236201e-08, + "loss": 0.6381, + "step": 4293 + }, + { + "epoch": 0.9896289467619267, + "grad_norm": 0.13811156153678894, + "learning_rate": 2.8214507278556678e-08, + "loss": 0.6412, + "step": 4294 + }, + { + "epoch": 0.9898594146116617, + "grad_norm": 0.1367308348417282, + "learning_rate": 2.6974573778565383e-08, + "loss": 0.6411, + "step": 4295 + }, + { + "epoch": 0.9900898824613966, + "grad_norm": 0.1384420096874237, + "learning_rate": 2.5762494043485296e-08, + "loss": 0.6355, + "step": 4296 + }, + { + "epoch": 0.9903203503111316, + "grad_norm": 0.13190501928329468, + "learning_rate": 2.4578268748909338e-08, + "loss": 0.6296, + "step": 4297 + }, + { + "epoch": 0.9905508181608665, + "grad_norm": 0.13569803535938263, + "learning_rate": 2.3421898554892852e-08, + "loss": 0.6342, + "step": 4298 + }, + { + "epoch": 0.9907812860106016, + "grad_norm": 0.13529643416404724, + "learning_rate": 2.229338410597026e-08, + "loss": 0.6311, + "step": 4299 + }, + { + "epoch": 0.9910117538603365, + "grad_norm": 0.13517087697982788, + "learning_rate": 2.1192726031143974e-08, + "loss": 0.6434, + "step": 4300 + }, + { + "epoch": 0.9912422217100715, + "grad_norm": 0.13566431403160095, + "learning_rate": 2.0119924943901025e-08, + "loss": 0.6371, + "step": 4301 + }, + { + "epoch": 0.9914726895598064, + "grad_norm": 0.1320231556892395, + "learning_rate": 1.9074981442185336e-08, + "loss": 0.6346, + "step": 4302 + }, + { + "epoch": 0.9917031574095414, + "grad_norm": 0.13202235102653503, + "learning_rate": 1.8057896108436558e-08, + "loss": 0.6322, + "step": 4303 + }, + { + "epoch": 0.9919336252592763, + "grad_norm": 0.13193956017494202, + "learning_rate": 1.7068669509545665e-08, + "loss": 0.6393, + "step": 4304 + }, + { + "epoch": 0.9921640931090113, + "grad_norm": 0.13464799523353577, + "learning_rate": 1.6107302196882724e-08, + "loss": 0.6363, + "step": 4305 + }, + { + "epoch": 0.9923945609587462, + "grad_norm": 0.13650983572006226, + "learning_rate": 1.5173794706291324e-08, + "loss": 0.6332, + "step": 4306 + }, + { + "epoch": 0.9926250288084812, + "grad_norm": 0.1360529065132141, + "learning_rate": 1.4268147558088585e-08, + "loss": 0.6251, + "step": 4307 + }, + { + "epoch": 0.9928554966582162, + "grad_norm": 0.1333739310503006, + "learning_rate": 1.3390361257059614e-08, + "loss": 0.6345, + "step": 4308 + }, + { + "epoch": 0.9930859645079512, + "grad_norm": 0.13426019251346588, + "learning_rate": 1.2540436292463043e-08, + "loss": 0.6416, + "step": 4309 + }, + { + "epoch": 0.9933164323576861, + "grad_norm": 0.13250836730003357, + "learning_rate": 1.1718373138019933e-08, + "loss": 0.6278, + "step": 4310 + }, + { + "epoch": 0.9935469002074211, + "grad_norm": 0.13704052567481995, + "learning_rate": 1.0924172251941534e-08, + "loss": 0.6428, + "step": 4311 + }, + { + "epoch": 0.993777368057156, + "grad_norm": 0.13682594895362854, + "learning_rate": 1.0157834076879313e-08, + "loss": 0.6392, + "step": 4312 + }, + { + "epoch": 0.994007835906891, + "grad_norm": 0.13360513746738434, + "learning_rate": 9.419359039986032e-09, + "loss": 0.6351, + "step": 4313 + }, + { + "epoch": 0.9942383037566259, + "grad_norm": 0.1286047101020813, + "learning_rate": 8.70874755286577e-09, + "loss": 0.6381, + "step": 4314 + }, + { + "epoch": 0.9944687716063609, + "grad_norm": 0.13860009610652924, + "learning_rate": 8.026000011596146e-09, + "loss": 0.642, + "step": 4315 + }, + { + "epoch": 0.9946992394560958, + "grad_norm": 0.1374213993549347, + "learning_rate": 7.371116796717203e-09, + "loss": 0.6356, + "step": 4316 + }, + { + "epoch": 0.9949297073058309, + "grad_norm": 0.13426125049591064, + "learning_rate": 6.74409827325917e-09, + "loss": 0.6325, + "step": 4317 + }, + { + "epoch": 0.9951601751555658, + "grad_norm": 0.13406482338905334, + "learning_rate": 6.144944790692497e-09, + "loss": 0.6422, + "step": 4318 + }, + { + "epoch": 0.9953906430053008, + "grad_norm": 0.1396821290254593, + "learning_rate": 5.573656682977824e-09, + "loss": 0.6443, + "step": 4319 + }, + { + "epoch": 0.9956211108550357, + "grad_norm": 0.12982842326164246, + "learning_rate": 5.030234268543765e-09, + "loss": 0.6245, + "step": 4320 + }, + { + "epoch": 0.9958515787047707, + "grad_norm": 0.1334078311920166, + "learning_rate": 4.514677850270266e-09, + "loss": 0.6401, + "step": 4321 + }, + { + "epoch": 0.9960820465545056, + "grad_norm": 0.13134385645389557, + "learning_rate": 4.0269877155219016e-09, + "loss": 0.6331, + "step": 4322 + }, + { + "epoch": 0.9963125144042406, + "grad_norm": 0.1397440880537033, + "learning_rate": 3.567164136120127e-09, + "loss": 0.6252, + "step": 4323 + }, + { + "epoch": 0.9965429822539755, + "grad_norm": 0.13442939519882202, + "learning_rate": 3.1352073683654783e-09, + "loss": 0.635, + "step": 4324 + }, + { + "epoch": 0.9967734501037105, + "grad_norm": 0.13672947883605957, + "learning_rate": 2.7311176530209203e-09, + "loss": 0.6365, + "step": 4325 + }, + { + "epoch": 0.9970039179534455, + "grad_norm": 0.13293464481830597, + "learning_rate": 2.3548952153118476e-09, + "loss": 0.6337, + "step": 4326 + }, + { + "epoch": 0.9972343858031805, + "grad_norm": 0.1351083517074585, + "learning_rate": 2.0065402649371845e-09, + "loss": 0.633, + "step": 4327 + }, + { + "epoch": 0.9974648536529154, + "grad_norm": 0.13411957025527954, + "learning_rate": 1.6860529960638361e-09, + "loss": 0.6376, + "step": 4328 + }, + { + "epoch": 0.9976953215026504, + "grad_norm": 0.1313239187002182, + "learning_rate": 1.3934335873155846e-09, + "loss": 0.6344, + "step": 4329 + }, + { + "epoch": 0.9979257893523853, + "grad_norm": 0.13476663827896118, + "learning_rate": 1.1286822018008458e-09, + "loss": 0.6432, + "step": 4330 + }, + { + "epoch": 0.9981562572021203, + "grad_norm": 0.13341675698757172, + "learning_rate": 8.917989870849131e-10, + "loss": 0.6288, + "step": 4331 + }, + { + "epoch": 0.9983867250518552, + "grad_norm": 0.13334329426288605, + "learning_rate": 6.827840751955083e-10, + "loss": 0.6403, + "step": 4332 + }, + { + "epoch": 0.9986171929015902, + "grad_norm": 0.135373055934906, + "learning_rate": 5.016375826394359e-10, + "loss": 0.6377, + "step": 4333 + }, + { + "epoch": 0.9988476607513251, + "grad_norm": 0.13281653821468353, + "learning_rate": 3.4835961037482655e-10, + "loss": 0.6261, + "step": 4334 + }, + { + "epoch": 0.9990781286010602, + "grad_norm": 0.1364908516407013, + "learning_rate": 2.2295024383889306e-10, + "loss": 0.6386, + "step": 4335 + }, + { + "epoch": 0.9993085964507951, + "grad_norm": 0.1349654197692871, + "learning_rate": 1.2540955293682822e-10, + "loss": 0.6366, + "step": 4336 + }, + { + "epoch": 0.9995390643005301, + "grad_norm": 0.14018984138965607, + "learning_rate": 5.573759202515127e-11, + "loss": 0.6342, + "step": 4337 + }, + { + "epoch": 0.999769532150265, + "grad_norm": 0.1340036541223526, + "learning_rate": 1.3934399950565891e-11, + "loss": 0.6367, + "step": 4338 + }, + { + "epoch": 1.0, + "grad_norm": 0.13604800403118134, + "learning_rate": 0.0, + "loss": 0.6404, + "step": 4339 + }, + { + "epoch": 1.0, + "step": 4339, + "total_flos": 1.6526087271915035e+20, + "train_loss": 0.7063895864925178, + "train_runtime": 30118.2929, + "train_samples_per_second": 589.978, + "train_steps_per_second": 0.144 + } + ], + "logging_steps": 1.0, + "max_steps": 4339, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6526087271915035e+20, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}