diff --git "a/checkpoint-1628/trainer_state.json" "b/checkpoint-1628/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1628/trainer_state.json" @@ -0,0 +1,5731 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 1628, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002457002457002457, + "grad_norm": 4.019700050354004, + "learning_rate": 2.4390243902439027e-06, + "loss": 6.2086, + "step": 2 + }, + { + "epoch": 0.004914004914004914, + "grad_norm": 4.2945332527160645, + "learning_rate": 4.8780487804878055e-06, + "loss": 6.1853, + "step": 4 + }, + { + "epoch": 0.007371007371007371, + "grad_norm": 3.930802583694458, + "learning_rate": 7.317073170731707e-06, + "loss": 6.38, + "step": 6 + }, + { + "epoch": 0.009828009828009828, + "grad_norm": 3.6977317333221436, + "learning_rate": 9.756097560975611e-06, + "loss": 6.1601, + "step": 8 + }, + { + "epoch": 0.012285012285012284, + "grad_norm": 25.07744789123535, + "learning_rate": 1.2195121951219513e-05, + "loss": 6.5514, + "step": 10 + }, + { + "epoch": 0.014742014742014743, + "grad_norm": 3.566401720046997, + "learning_rate": 1.4634146341463415e-05, + "loss": 5.6854, + "step": 12 + }, + { + "epoch": 0.0171990171990172, + "grad_norm": 24.404541015625, + "learning_rate": 1.707317073170732e-05, + "loss": 5.8838, + "step": 14 + }, + { + "epoch": 0.019656019656019656, + "grad_norm": 3.1777548789978027, + "learning_rate": 1.9512195121951222e-05, + "loss": 5.3108, + "step": 16 + }, + { + "epoch": 0.022113022113022112, + "grad_norm": 2.769148349761963, + "learning_rate": 2.1951219512195124e-05, + "loss": 4.8014, + "step": 18 + }, + { + "epoch": 0.02457002457002457, + "grad_norm": 3.1796083450317383, + "learning_rate": 2.4390243902439026e-05, + "loss": 5.3203, + "step": 20 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 3.617638349533081, + "learning_rate": 2.682926829268293e-05, + "loss": 5.5772, + "step": 22 + }, + { + "epoch": 0.029484029484029485, + "grad_norm": 7.00071382522583, + "learning_rate": 2.926829268292683e-05, + "loss": 5.0532, + "step": 24 + }, + { + "epoch": 0.03194103194103194, + "grad_norm": 3.52091908454895, + "learning_rate": 3.170731707317073e-05, + "loss": 5.1329, + "step": 26 + }, + { + "epoch": 0.0343980343980344, + "grad_norm": 3.3309226036071777, + "learning_rate": 3.414634146341464e-05, + "loss": 4.6793, + "step": 28 + }, + { + "epoch": 0.036855036855036855, + "grad_norm": 2.865582227706909, + "learning_rate": 3.6585365853658535e-05, + "loss": 4.5865, + "step": 30 + }, + { + "epoch": 0.03931203931203931, + "grad_norm": 3.092682123184204, + "learning_rate": 3.9024390243902444e-05, + "loss": 4.4673, + "step": 32 + }, + { + "epoch": 0.04176904176904177, + "grad_norm": 2.843824863433838, + "learning_rate": 4.146341463414634e-05, + "loss": 4.4617, + "step": 34 + }, + { + "epoch": 0.044226044226044224, + "grad_norm": 3.029207229614258, + "learning_rate": 4.390243902439025e-05, + "loss": 4.394, + "step": 36 + }, + { + "epoch": 0.04668304668304668, + "grad_norm": 2.6937527656555176, + "learning_rate": 4.634146341463415e-05, + "loss": 4.1312, + "step": 38 + }, + { + "epoch": 0.04914004914004914, + "grad_norm": 205.6438751220703, + "learning_rate": 4.878048780487805e-05, + "loss": 4.1717, + "step": 40 + }, + { + "epoch": 0.051597051597051594, + "grad_norm": 2.583071708679199, + "learning_rate": 5.121951219512195e-05, + "loss": 4.2337, + "step": 42 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 2.7350614070892334, + "learning_rate": 5.365853658536586e-05, + "loss": 4.2223, + "step": 44 + }, + { + "epoch": 0.056511056511056514, + "grad_norm": 2.6410160064697266, + "learning_rate": 5.6097560975609764e-05, + "loss": 4.0693, + "step": 46 + }, + { + "epoch": 0.05896805896805897, + "grad_norm": 2.716932535171509, + "learning_rate": 5.853658536585366e-05, + "loss": 4.0604, + "step": 48 + }, + { + "epoch": 0.06142506142506143, + "grad_norm": 2.662912368774414, + "learning_rate": 6.097560975609756e-05, + "loss": 4.106, + "step": 50 + }, + { + "epoch": 0.06388206388206388, + "grad_norm": 2.6940219402313232, + "learning_rate": 6.341463414634146e-05, + "loss": 3.9797, + "step": 52 + }, + { + "epoch": 0.06633906633906633, + "grad_norm": 2.474919319152832, + "learning_rate": 6.585365853658538e-05, + "loss": 3.9617, + "step": 54 + }, + { + "epoch": 0.0687960687960688, + "grad_norm": 3.2239887714385986, + "learning_rate": 6.829268292682928e-05, + "loss": 3.8198, + "step": 56 + }, + { + "epoch": 0.07125307125307126, + "grad_norm": 2.245703935623169, + "learning_rate": 7.073170731707317e-05, + "loss": 3.7654, + "step": 58 + }, + { + "epoch": 0.07371007371007371, + "grad_norm": 2.289674758911133, + "learning_rate": 7.317073170731707e-05, + "loss": 3.9226, + "step": 60 + }, + { + "epoch": 0.07616707616707617, + "grad_norm": 2.497066020965576, + "learning_rate": 7.560975609756099e-05, + "loss": 3.8096, + "step": 62 + }, + { + "epoch": 0.07862407862407862, + "grad_norm": 2.301783800125122, + "learning_rate": 7.804878048780489e-05, + "loss": 3.7816, + "step": 64 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 2.323812484741211, + "learning_rate": 8.048780487804879e-05, + "loss": 3.852, + "step": 66 + }, + { + "epoch": 0.08353808353808354, + "grad_norm": 2.500802755355835, + "learning_rate": 8.292682926829268e-05, + "loss": 3.6985, + "step": 68 + }, + { + "epoch": 0.085995085995086, + "grad_norm": 2.634605646133423, + "learning_rate": 8.53658536585366e-05, + "loss": 3.6143, + "step": 70 + }, + { + "epoch": 0.08845208845208845, + "grad_norm": 2.6327457427978516, + "learning_rate": 8.78048780487805e-05, + "loss": 3.6925, + "step": 72 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 2.969693899154663, + "learning_rate": 9.02439024390244e-05, + "loss": 3.7667, + "step": 74 + }, + { + "epoch": 0.09336609336609336, + "grad_norm": 2.198855400085449, + "learning_rate": 9.26829268292683e-05, + "loss": 3.4533, + "step": 76 + }, + { + "epoch": 0.09582309582309582, + "grad_norm": 2.361680030822754, + "learning_rate": 9.51219512195122e-05, + "loss": 3.592, + "step": 78 + }, + { + "epoch": 0.09828009828009827, + "grad_norm": 2.2100822925567627, + "learning_rate": 9.75609756097561e-05, + "loss": 3.6584, + "step": 80 + }, + { + "epoch": 0.10073710073710074, + "grad_norm": 2.6485509872436523, + "learning_rate": 0.0001, + "loss": 3.7041, + "step": 82 + }, + { + "epoch": 0.10319410319410319, + "grad_norm": 2.305530548095703, + "learning_rate": 9.999958706645134e-05, + "loss": 3.4613, + "step": 84 + }, + { + "epoch": 0.10565110565110565, + "grad_norm": 2.2900867462158203, + "learning_rate": 9.999834827262588e-05, + "loss": 3.5832, + "step": 86 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 2.804309368133545, + "learning_rate": 9.999628363898526e-05, + "loss": 3.6276, + "step": 88 + }, + { + "epoch": 0.11056511056511056, + "grad_norm": 2.169964075088501, + "learning_rate": 9.999339319963168e-05, + "loss": 3.6038, + "step": 90 + }, + { + "epoch": 0.11302211302211303, + "grad_norm": 1.9803478717803955, + "learning_rate": 9.998967700230757e-05, + "loss": 3.6076, + "step": 92 + }, + { + "epoch": 0.11547911547911548, + "grad_norm": 2.2166459560394287, + "learning_rate": 9.998513510839458e-05, + "loss": 3.5336, + "step": 94 + }, + { + "epoch": 0.11793611793611794, + "grad_norm": 2.4060072898864746, + "learning_rate": 9.997976759291276e-05, + "loss": 3.4553, + "step": 96 + }, + { + "epoch": 0.12039312039312039, + "grad_norm": 2.0433080196380615, + "learning_rate": 9.997357454451919e-05, + "loss": 3.5237, + "step": 98 + }, + { + "epoch": 0.12285012285012285, + "grad_norm": 14.874445915222168, + "learning_rate": 9.996655606550656e-05, + "loss": 3.5126, + "step": 100 + }, + { + "epoch": 0.12530712530712532, + "grad_norm": 2.1172244548797607, + "learning_rate": 9.99587122718015e-05, + "loss": 3.6411, + "step": 102 + }, + { + "epoch": 0.12776412776412777, + "grad_norm": 2.2137668132781982, + "learning_rate": 9.995004329296263e-05, + "loss": 3.7789, + "step": 104 + }, + { + "epoch": 0.13022113022113022, + "grad_norm": 2.1240861415863037, + "learning_rate": 9.994054927217842e-05, + "loss": 3.5804, + "step": 106 + }, + { + "epoch": 0.13267813267813267, + "grad_norm": 2.106127977371216, + "learning_rate": 9.993023036626488e-05, + "loss": 3.5019, + "step": 108 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 2.0348684787750244, + "learning_rate": 9.99190867456629e-05, + "loss": 3.5321, + "step": 110 + }, + { + "epoch": 0.1375921375921376, + "grad_norm": 2.0568437576293945, + "learning_rate": 9.990711859443546e-05, + "loss": 3.3903, + "step": 112 + }, + { + "epoch": 0.14004914004914004, + "grad_norm": 2.1058425903320312, + "learning_rate": 9.989432611026464e-05, + "loss": 3.2349, + "step": 114 + }, + { + "epoch": 0.14250614250614252, + "grad_norm": 2.5870158672332764, + "learning_rate": 9.988070950444823e-05, + "loss": 3.3838, + "step": 116 + }, + { + "epoch": 0.14496314496314497, + "grad_norm": 1.9624409675598145, + "learning_rate": 9.986626900189641e-05, + "loss": 3.4498, + "step": 118 + }, + { + "epoch": 0.14742014742014742, + "grad_norm": 2.063462972640991, + "learning_rate": 9.985100484112785e-05, + "loss": 3.3871, + "step": 120 + }, + { + "epoch": 0.14987714987714987, + "grad_norm": 2.190028667449951, + "learning_rate": 9.983491727426598e-05, + "loss": 3.3708, + "step": 122 + }, + { + "epoch": 0.15233415233415235, + "grad_norm": 1.8697468042373657, + "learning_rate": 9.981800656703457e-05, + "loss": 3.5375, + "step": 124 + }, + { + "epoch": 0.1547911547911548, + "grad_norm": 2.1959290504455566, + "learning_rate": 9.980027299875358e-05, + "loss": 3.4274, + "step": 126 + }, + { + "epoch": 0.15724815724815724, + "grad_norm": 1.9716213941574097, + "learning_rate": 9.978171686233445e-05, + "loss": 3.2983, + "step": 128 + }, + { + "epoch": 0.1597051597051597, + "grad_norm": 2.224968910217285, + "learning_rate": 9.97623384642752e-05, + "loss": 3.3369, + "step": 130 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 1.896340012550354, + "learning_rate": 9.974213812465547e-05, + "loss": 3.4335, + "step": 132 + }, + { + "epoch": 0.16461916461916462, + "grad_norm": 2.3304531574249268, + "learning_rate": 9.972111617713116e-05, + "loss": 3.2502, + "step": 134 + }, + { + "epoch": 0.16707616707616707, + "grad_norm": 2.0420844554901123, + "learning_rate": 9.969927296892898e-05, + "loss": 3.4312, + "step": 136 + }, + { + "epoch": 0.16953316953316952, + "grad_norm": 2.021531105041504, + "learning_rate": 9.967660886084066e-05, + "loss": 3.2728, + "step": 138 + }, + { + "epoch": 0.171990171990172, + "grad_norm": 2.0841152667999268, + "learning_rate": 9.965312422721704e-05, + "loss": 3.1727, + "step": 140 + }, + { + "epoch": 0.17444717444717445, + "grad_norm": 2.6040847301483154, + "learning_rate": 9.962881945596184e-05, + "loss": 3.3739, + "step": 142 + }, + { + "epoch": 0.1769041769041769, + "grad_norm": 15.763100624084473, + "learning_rate": 9.960369494852525e-05, + "loss": 3.1624, + "step": 144 + }, + { + "epoch": 0.17936117936117937, + "grad_norm": 2.2386856079101562, + "learning_rate": 9.95777511198974e-05, + "loss": 3.2584, + "step": 146 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 2.857926607131958, + "learning_rate": 9.955098839860133e-05, + "loss": 3.4066, + "step": 148 + }, + { + "epoch": 0.18427518427518427, + "grad_norm": 2.4381282329559326, + "learning_rate": 9.952340722668609e-05, + "loss": 3.2631, + "step": 150 + }, + { + "epoch": 0.18673218673218672, + "grad_norm": 2.13820481300354, + "learning_rate": 9.949500805971932e-05, + "loss": 3.3381, + "step": 152 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 2.0036275386810303, + "learning_rate": 9.946579136677978e-05, + "loss": 3.3253, + "step": 154 + }, + { + "epoch": 0.19164619164619165, + "grad_norm": 2.3847107887268066, + "learning_rate": 9.943575763044955e-05, + "loss": 3.1658, + "step": 156 + }, + { + "epoch": 0.1941031941031941, + "grad_norm": 1.883772373199463, + "learning_rate": 9.940490734680614e-05, + "loss": 3.1963, + "step": 158 + }, + { + "epoch": 0.19656019656019655, + "grad_norm": 1.8654661178588867, + "learning_rate": 9.937324102541423e-05, + "loss": 3.2319, + "step": 160 + }, + { + "epoch": 0.19901719901719903, + "grad_norm": 1.8905117511749268, + "learning_rate": 9.93407591893173e-05, + "loss": 3.3699, + "step": 162 + }, + { + "epoch": 0.20147420147420148, + "grad_norm": 1.8974312543869019, + "learning_rate": 9.930746237502892e-05, + "loss": 3.2576, + "step": 164 + }, + { + "epoch": 0.20393120393120392, + "grad_norm": 1.9088815450668335, + "learning_rate": 9.927335113252396e-05, + "loss": 3.3929, + "step": 166 + }, + { + "epoch": 0.20638820638820637, + "grad_norm": 1.8603475093841553, + "learning_rate": 9.923842602522949e-05, + "loss": 3.2274, + "step": 168 + }, + { + "epoch": 0.20884520884520885, + "grad_norm": 1.8516125679016113, + "learning_rate": 9.920268763001542e-05, + "loss": 3.1596, + "step": 170 + }, + { + "epoch": 0.2113022113022113, + "grad_norm": 1.9465285539627075, + "learning_rate": 9.916613653718509e-05, + "loss": 3.2853, + "step": 172 + }, + { + "epoch": 0.21375921375921375, + "grad_norm": 1.9433836936950684, + "learning_rate": 9.912877335046535e-05, + "loss": 3.1986, + "step": 174 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 1.9821792840957642, + "learning_rate": 9.909059868699678e-05, + "loss": 3.1233, + "step": 176 + }, + { + "epoch": 0.21867321867321868, + "grad_norm": 1.904826283454895, + "learning_rate": 9.905161317732331e-05, + "loss": 3.221, + "step": 178 + }, + { + "epoch": 0.22113022113022113, + "grad_norm": 1.9415316581726074, + "learning_rate": 9.901181746538196e-05, + "loss": 3.1107, + "step": 180 + }, + { + "epoch": 0.22358722358722358, + "grad_norm": 1.8300875425338745, + "learning_rate": 9.897121220849208e-05, + "loss": 3.1114, + "step": 182 + }, + { + "epoch": 0.22604422604422605, + "grad_norm": 1.972747802734375, + "learning_rate": 9.892979807734462e-05, + "loss": 3.1652, + "step": 184 + }, + { + "epoch": 0.2285012285012285, + "grad_norm": 1.9549387693405151, + "learning_rate": 9.888757575599093e-05, + "loss": 3.2123, + "step": 186 + }, + { + "epoch": 0.23095823095823095, + "grad_norm": 1.856306552886963, + "learning_rate": 9.884454594183154e-05, + "loss": 3.235, + "step": 188 + }, + { + "epoch": 0.2334152334152334, + "grad_norm": 1.743513584136963, + "learning_rate": 9.880070934560458e-05, + "loss": 3.1647, + "step": 190 + }, + { + "epoch": 0.23587223587223588, + "grad_norm": 1.884634256362915, + "learning_rate": 9.875606669137412e-05, + "loss": 3.1963, + "step": 192 + }, + { + "epoch": 0.23832923832923833, + "grad_norm": 1.966579556465149, + "learning_rate": 9.871061871651815e-05, + "loss": 2.9626, + "step": 194 + }, + { + "epoch": 0.24078624078624078, + "grad_norm": 1.7920762300491333, + "learning_rate": 9.866436617171638e-05, + "loss": 3.3341, + "step": 196 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 1.9862432479858398, + "learning_rate": 9.861730982093793e-05, + "loss": 3.2039, + "step": 198 + }, + { + "epoch": 0.2457002457002457, + "grad_norm": 1.7585490942001343, + "learning_rate": 9.856945044142865e-05, + "loss": 3.1915, + "step": 200 + }, + { + "epoch": 0.24815724815724816, + "grad_norm": 1.7467507123947144, + "learning_rate": 9.852078882369827e-05, + "loss": 3.1105, + "step": 202 + }, + { + "epoch": 0.25061425061425063, + "grad_norm": 1.9189780950546265, + "learning_rate": 9.847132577150733e-05, + "loss": 3.1065, + "step": 204 + }, + { + "epoch": 0.25307125307125306, + "grad_norm": 1.8920857906341553, + "learning_rate": 9.842106210185403e-05, + "loss": 3.1573, + "step": 206 + }, + { + "epoch": 0.25552825552825553, + "grad_norm": 1.7997002601623535, + "learning_rate": 9.836999864496057e-05, + "loss": 3.0638, + "step": 208 + }, + { + "epoch": 0.257985257985258, + "grad_norm": 1.8181427717208862, + "learning_rate": 9.831813624425952e-05, + "loss": 3.166, + "step": 210 + }, + { + "epoch": 0.26044226044226043, + "grad_norm": 1.7454043626785278, + "learning_rate": 9.82654757563799e-05, + "loss": 2.9648, + "step": 212 + }, + { + "epoch": 0.2628992628992629, + "grad_norm": 1.9787805080413818, + "learning_rate": 9.821201805113298e-05, + "loss": 3.0446, + "step": 214 + }, + { + "epoch": 0.26535626535626533, + "grad_norm": 1.8889926671981812, + "learning_rate": 9.815776401149796e-05, + "loss": 3.1998, + "step": 216 + }, + { + "epoch": 0.2678132678132678, + "grad_norm": 1.9100897312164307, + "learning_rate": 9.810271453360738e-05, + "loss": 3.0167, + "step": 218 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 1.72272527217865, + "learning_rate": 9.804687052673229e-05, + "loss": 3.0516, + "step": 220 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 1.7406550645828247, + "learning_rate": 9.799023291326722e-05, + "loss": 3.1492, + "step": 222 + }, + { + "epoch": 0.2751842751842752, + "grad_norm": 1.7723503112792969, + "learning_rate": 9.793280262871502e-05, + "loss": 3.1282, + "step": 224 + }, + { + "epoch": 0.27764127764127766, + "grad_norm": 1.7942124605178833, + "learning_rate": 9.787458062167134e-05, + "loss": 3.0463, + "step": 226 + }, + { + "epoch": 0.2800982800982801, + "grad_norm": 1.8124969005584717, + "learning_rate": 9.781556785380899e-05, + "loss": 3.295, + "step": 228 + }, + { + "epoch": 0.28255528255528256, + "grad_norm": 1.7787127494812012, + "learning_rate": 9.775576529986199e-05, + "loss": 3.3415, + "step": 230 + }, + { + "epoch": 0.28501228501228504, + "grad_norm": 1.967888355255127, + "learning_rate": 9.769517394760962e-05, + "loss": 2.9891, + "step": 232 + }, + { + "epoch": 0.28746928746928746, + "grad_norm": 1.9067909717559814, + "learning_rate": 9.763379479785995e-05, + "loss": 3.1963, + "step": 234 + }, + { + "epoch": 0.28992628992628994, + "grad_norm": 1.8959527015686035, + "learning_rate": 9.757162886443336e-05, + "loss": 3.0906, + "step": 236 + }, + { + "epoch": 0.29238329238329236, + "grad_norm": 1.756986141204834, + "learning_rate": 9.750867717414586e-05, + "loss": 3.1538, + "step": 238 + }, + { + "epoch": 0.29484029484029484, + "grad_norm": 1.7642282247543335, + "learning_rate": 9.744494076679205e-05, + "loss": 3.0886, + "step": 240 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 1.8129557371139526, + "learning_rate": 9.738042069512795e-05, + "loss": 3.2687, + "step": 242 + }, + { + "epoch": 0.29975429975429974, + "grad_norm": 1.6599668264389038, + "learning_rate": 9.731511802485364e-05, + "loss": 3.0165, + "step": 244 + }, + { + "epoch": 0.3022113022113022, + "grad_norm": 1.7890832424163818, + "learning_rate": 9.724903383459566e-05, + "loss": 3.1224, + "step": 246 + }, + { + "epoch": 0.3046683046683047, + "grad_norm": 1.901267170906067, + "learning_rate": 9.718216921588919e-05, + "loss": 2.9377, + "step": 248 + }, + { + "epoch": 0.3071253071253071, + "grad_norm": 1.8047585487365723, + "learning_rate": 9.711452527315998e-05, + "loss": 3.2727, + "step": 250 + }, + { + "epoch": 0.3095823095823096, + "grad_norm": 1.9266289472579956, + "learning_rate": 9.704610312370617e-05, + "loss": 3.0795, + "step": 252 + }, + { + "epoch": 0.31203931203931207, + "grad_norm": 1.8360109329223633, + "learning_rate": 9.697690389767981e-05, + "loss": 3.1791, + "step": 254 + }, + { + "epoch": 0.3144963144963145, + "grad_norm": 1.7929311990737915, + "learning_rate": 9.690692873806816e-05, + "loss": 3.1926, + "step": 256 + }, + { + "epoch": 0.31695331695331697, + "grad_norm": 1.7981804609298706, + "learning_rate": 9.683617880067489e-05, + "loss": 3.2032, + "step": 258 + }, + { + "epoch": 0.3194103194103194, + "grad_norm": 1.8404676914215088, + "learning_rate": 9.676465525410088e-05, + "loss": 3.1692, + "step": 260 + }, + { + "epoch": 0.32186732186732187, + "grad_norm": 1.7135553359985352, + "learning_rate": 9.669235927972502e-05, + "loss": 3.2311, + "step": 262 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 1.8173389434814453, + "learning_rate": 9.661929207168463e-05, + "loss": 2.938, + "step": 264 + }, + { + "epoch": 0.32678132678132676, + "grad_norm": 1.8518667221069336, + "learning_rate": 9.654545483685578e-05, + "loss": 2.9593, + "step": 266 + }, + { + "epoch": 0.32923832923832924, + "grad_norm": 1.7250562906265259, + "learning_rate": 9.647084879483332e-05, + "loss": 3.2025, + "step": 268 + }, + { + "epoch": 0.3316953316953317, + "grad_norm": 1.7107222080230713, + "learning_rate": 9.639547517791076e-05, + "loss": 3.0682, + "step": 270 + }, + { + "epoch": 0.33415233415233414, + "grad_norm": 1.6773762702941895, + "learning_rate": 9.631933523105991e-05, + "loss": 2.99, + "step": 272 + }, + { + "epoch": 0.3366093366093366, + "grad_norm": 1.758596420288086, + "learning_rate": 9.624243021191029e-05, + "loss": 2.9892, + "step": 274 + }, + { + "epoch": 0.33906633906633904, + "grad_norm": 1.8285568952560425, + "learning_rate": 9.61647613907284e-05, + "loss": 3.0917, + "step": 276 + }, + { + "epoch": 0.3415233415233415, + "grad_norm": 1.8026080131530762, + "learning_rate": 9.608633005039675e-05, + "loss": 3.0702, + "step": 278 + }, + { + "epoch": 0.343980343980344, + "grad_norm": 11.102814674377441, + "learning_rate": 9.600713748639258e-05, + "loss": 3.1327, + "step": 280 + }, + { + "epoch": 0.3464373464373464, + "grad_norm": 1.9404098987579346, + "learning_rate": 9.592718500676656e-05, + "loss": 3.0348, + "step": 282 + }, + { + "epoch": 0.3488943488943489, + "grad_norm": 1.7170964479446411, + "learning_rate": 9.584647393212113e-05, + "loss": 3.0137, + "step": 284 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 2.194800615310669, + "learning_rate": 9.576500559558869e-05, + "loss": 3.0657, + "step": 286 + }, + { + "epoch": 0.3538083538083538, + "grad_norm": 2.2819526195526123, + "learning_rate": 9.568278134280966e-05, + "loss": 2.9922, + "step": 288 + }, + { + "epoch": 0.35626535626535627, + "grad_norm": 1.764850378036499, + "learning_rate": 9.55998025319101e-05, + "loss": 3.0742, + "step": 290 + }, + { + "epoch": 0.35872235872235875, + "grad_norm": 1.675075650215149, + "learning_rate": 9.551607053347942e-05, + "loss": 3.14, + "step": 292 + }, + { + "epoch": 0.36117936117936117, + "grad_norm": 1.730245590209961, + "learning_rate": 9.543158673054767e-05, + "loss": 2.9738, + "step": 294 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.6660993099212646, + "learning_rate": 9.534635251856267e-05, + "loss": 2.7211, + "step": 296 + }, + { + "epoch": 0.36609336609336607, + "grad_norm": 1.6620466709136963, + "learning_rate": 9.526036930536712e-05, + "loss": 2.9747, + "step": 298 + }, + { + "epoch": 0.36855036855036855, + "grad_norm": 1.6838281154632568, + "learning_rate": 9.517363851117512e-05, + "loss": 3.0342, + "step": 300 + }, + { + "epoch": 0.371007371007371, + "grad_norm": 3.0981221199035645, + "learning_rate": 9.508616156854883e-05, + "loss": 3.1151, + "step": 302 + }, + { + "epoch": 0.37346437346437344, + "grad_norm": 1.954408049583435, + "learning_rate": 9.499793992237485e-05, + "loss": 2.7723, + "step": 304 + }, + { + "epoch": 0.3759213759213759, + "grad_norm": 1.7550610303878784, + "learning_rate": 9.490897502984028e-05, + "loss": 2.9576, + "step": 306 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 1.854491114616394, + "learning_rate": 9.481926836040866e-05, + "loss": 2.9745, + "step": 308 + }, + { + "epoch": 0.3808353808353808, + "grad_norm": 1.6988489627838135, + "learning_rate": 9.472882139579572e-05, + "loss": 3.1205, + "step": 310 + }, + { + "epoch": 0.3832923832923833, + "grad_norm": 1.682507038116455, + "learning_rate": 9.463763562994491e-05, + "loss": 3.0174, + "step": 312 + }, + { + "epoch": 0.3857493857493858, + "grad_norm": 1.7135891914367676, + "learning_rate": 9.454571256900272e-05, + "loss": 2.8791, + "step": 314 + }, + { + "epoch": 0.3882063882063882, + "grad_norm": 1.6604820489883423, + "learning_rate": 9.445305373129375e-05, + "loss": 2.9726, + "step": 316 + }, + { + "epoch": 0.3906633906633907, + "grad_norm": 1.8734002113342285, + "learning_rate": 9.435966064729574e-05, + "loss": 3.2479, + "step": 318 + }, + { + "epoch": 0.3931203931203931, + "grad_norm": 2.204241991043091, + "learning_rate": 9.426553485961415e-05, + "loss": 3.092, + "step": 320 + }, + { + "epoch": 0.3955773955773956, + "grad_norm": 1.7275304794311523, + "learning_rate": 9.417067792295684e-05, + "loss": 2.9288, + "step": 322 + }, + { + "epoch": 0.39803439803439805, + "grad_norm": 1.720664381980896, + "learning_rate": 9.407509140410826e-05, + "loss": 2.9628, + "step": 324 + }, + { + "epoch": 0.4004914004914005, + "grad_norm": 1.6769790649414062, + "learning_rate": 9.397877688190362e-05, + "loss": 2.8215, + "step": 326 + }, + { + "epoch": 0.40294840294840295, + "grad_norm": 2.712700843811035, + "learning_rate": 9.388173594720281e-05, + "loss": 2.9122, + "step": 328 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 1.8222036361694336, + "learning_rate": 9.378397020286417e-05, + "loss": 3.0608, + "step": 330 + }, + { + "epoch": 0.40786240786240785, + "grad_norm": 1.794284462928772, + "learning_rate": 9.368548126371788e-05, + "loss": 2.994, + "step": 332 + }, + { + "epoch": 0.4103194103194103, + "grad_norm": 1.6459068059921265, + "learning_rate": 9.358627075653946e-05, + "loss": 2.9682, + "step": 334 + }, + { + "epoch": 0.41277641277641275, + "grad_norm": 1.6979544162750244, + "learning_rate": 9.348634032002277e-05, + "loss": 3.0303, + "step": 336 + }, + { + "epoch": 0.4152334152334152, + "grad_norm": 1.724778175354004, + "learning_rate": 9.338569160475299e-05, + "loss": 2.7861, + "step": 338 + }, + { + "epoch": 0.4176904176904177, + "grad_norm": 1.737809658050537, + "learning_rate": 9.328432627317938e-05, + "loss": 2.9365, + "step": 340 + }, + { + "epoch": 0.4201474201474201, + "grad_norm": 1.6635717153549194, + "learning_rate": 9.318224599958778e-05, + "loss": 2.9818, + "step": 342 + }, + { + "epoch": 0.4226044226044226, + "grad_norm": 1.696031093597412, + "learning_rate": 9.307945247007299e-05, + "loss": 3.0177, + "step": 344 + }, + { + "epoch": 0.4250614250614251, + "grad_norm": 1.7607593536376953, + "learning_rate": 9.297594738251086e-05, + "loss": 3.0018, + "step": 346 + }, + { + "epoch": 0.4275184275184275, + "grad_norm": 1.6658580303192139, + "learning_rate": 9.287173244653032e-05, + "loss": 2.9692, + "step": 348 + }, + { + "epoch": 0.42997542997543, + "grad_norm": 1.7650336027145386, + "learning_rate": 9.276680938348512e-05, + "loss": 2.9828, + "step": 350 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 1.6512577533721924, + "learning_rate": 9.266117992642536e-05, + "loss": 2.9751, + "step": 352 + }, + { + "epoch": 0.4348894348894349, + "grad_norm": 1.7206581830978394, + "learning_rate": 9.25548458200689e-05, + "loss": 2.948, + "step": 354 + }, + { + "epoch": 0.43734643734643736, + "grad_norm": 1.7989583015441895, + "learning_rate": 9.244780882077254e-05, + "loss": 2.6979, + "step": 356 + }, + { + "epoch": 0.4398034398034398, + "grad_norm": 1.7424843311309814, + "learning_rate": 9.2340070696503e-05, + "loss": 3.0316, + "step": 358 + }, + { + "epoch": 0.44226044226044225, + "grad_norm": 1.726643443107605, + "learning_rate": 9.223163322680772e-05, + "loss": 2.9751, + "step": 360 + }, + { + "epoch": 0.44471744471744473, + "grad_norm": 1.670101523399353, + "learning_rate": 9.212249820278545e-05, + "loss": 2.8993, + "step": 362 + }, + { + "epoch": 0.44717444717444715, + "grad_norm": 1.6568448543548584, + "learning_rate": 9.201266742705672e-05, + "loss": 3.1154, + "step": 364 + }, + { + "epoch": 0.44963144963144963, + "grad_norm": 1.776298999786377, + "learning_rate": 9.190214271373398e-05, + "loss": 2.8871, + "step": 366 + }, + { + "epoch": 0.4520884520884521, + "grad_norm": 1.67928147315979, + "learning_rate": 9.179092588839178e-05, + "loss": 2.7472, + "step": 368 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 1.657324194908142, + "learning_rate": 9.167901878803638e-05, + "loss": 2.8401, + "step": 370 + }, + { + "epoch": 0.457002457002457, + "grad_norm": 1.7573364973068237, + "learning_rate": 9.156642326107565e-05, + "loss": 2.91, + "step": 372 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 1.6229298114776611, + "learning_rate": 9.145314116728841e-05, + "loss": 3.0171, + "step": 374 + }, + { + "epoch": 0.4619164619164619, + "grad_norm": 1.7045820951461792, + "learning_rate": 9.133917437779375e-05, + "loss": 2.9076, + "step": 376 + }, + { + "epoch": 0.4643734643734644, + "grad_norm": 1.5423696041107178, + "learning_rate": 9.12245247750201e-05, + "loss": 2.9438, + "step": 378 + }, + { + "epoch": 0.4668304668304668, + "grad_norm": 1.6251273155212402, + "learning_rate": 9.110919425267415e-05, + "loss": 2.998, + "step": 380 + }, + { + "epoch": 0.4692874692874693, + "grad_norm": 1.7428491115570068, + "learning_rate": 9.099318471570957e-05, + "loss": 2.9253, + "step": 382 + }, + { + "epoch": 0.47174447174447176, + "grad_norm": 1.7721710205078125, + "learning_rate": 9.087649808029554e-05, + "loss": 2.8592, + "step": 384 + }, + { + "epoch": 0.4742014742014742, + "grad_norm": 1.8491050004959106, + "learning_rate": 9.075913627378513e-05, + "loss": 2.994, + "step": 386 + }, + { + "epoch": 0.47665847665847666, + "grad_norm": 1.6777352094650269, + "learning_rate": 9.064110123468345e-05, + "loss": 2.9855, + "step": 388 + }, + { + "epoch": 0.47911547911547914, + "grad_norm": 1.717166781425476, + "learning_rate": 9.052239491261559e-05, + "loss": 3.0357, + "step": 390 + }, + { + "epoch": 0.48157248157248156, + "grad_norm": 1.6935992240905762, + "learning_rate": 9.040301926829445e-05, + "loss": 3.0939, + "step": 392 + }, + { + "epoch": 0.48402948402948404, + "grad_norm": 1.7372186183929443, + "learning_rate": 9.028297627348835e-05, + "loss": 2.7582, + "step": 394 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 1.6863627433776855, + "learning_rate": 9.016226791098851e-05, + "loss": 3.0831, + "step": 396 + }, + { + "epoch": 0.48894348894348894, + "grad_norm": 1.5812780857086182, + "learning_rate": 9.004089617457625e-05, + "loss": 2.8814, + "step": 398 + }, + { + "epoch": 0.4914004914004914, + "grad_norm": 1.6280958652496338, + "learning_rate": 8.991886306899002e-05, + "loss": 3.0318, + "step": 400 + }, + { + "epoch": 0.49385749385749383, + "grad_norm": 1.703506588935852, + "learning_rate": 8.979617060989234e-05, + "loss": 3.1265, + "step": 402 + }, + { + "epoch": 0.4963144963144963, + "grad_norm": 1.596043348312378, + "learning_rate": 8.967282082383652e-05, + "loss": 3.0872, + "step": 404 + }, + { + "epoch": 0.4987714987714988, + "grad_norm": 1.8576503992080688, + "learning_rate": 8.954881574823317e-05, + "loss": 2.9286, + "step": 406 + }, + { + "epoch": 0.5012285012285013, + "grad_norm": 1.8761273622512817, + "learning_rate": 8.942415743131651e-05, + "loss": 2.8426, + "step": 408 + }, + { + "epoch": 0.5036855036855037, + "grad_norm": 1.6055220365524292, + "learning_rate": 8.92988479321106e-05, + "loss": 2.7407, + "step": 410 + }, + { + "epoch": 0.5061425061425061, + "grad_norm": 1.5771170854568481, + "learning_rate": 8.917288932039529e-05, + "loss": 3.0324, + "step": 412 + }, + { + "epoch": 0.5085995085995086, + "grad_norm": 1.5361248254776, + "learning_rate": 8.904628367667202e-05, + "loss": 2.901, + "step": 414 + }, + { + "epoch": 0.5110565110565111, + "grad_norm": 1.662143588066101, + "learning_rate": 8.891903309212952e-05, + "loss": 2.8503, + "step": 416 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 1.6230075359344482, + "learning_rate": 8.87911396686092e-05, + "loss": 2.7839, + "step": 418 + }, + { + "epoch": 0.515970515970516, + "grad_norm": 1.7319310903549194, + "learning_rate": 8.866260551857045e-05, + "loss": 2.9908, + "step": 420 + }, + { + "epoch": 0.5184275184275184, + "grad_norm": 1.6889699697494507, + "learning_rate": 8.853343276505581e-05, + "loss": 2.7828, + "step": 422 + }, + { + "epoch": 0.5208845208845209, + "grad_norm": 11.279559135437012, + "learning_rate": 8.840362354165581e-05, + "loss": 2.8784, + "step": 424 + }, + { + "epoch": 0.5233415233415234, + "grad_norm": 1.7862719297409058, + "learning_rate": 8.827317999247378e-05, + "loss": 2.9174, + "step": 426 + }, + { + "epoch": 0.5257985257985258, + "grad_norm": 1.6923807859420776, + "learning_rate": 8.81421042720904e-05, + "loss": 3.0122, + "step": 428 + }, + { + "epoch": 0.5282555282555282, + "grad_norm": 29.5804386138916, + "learning_rate": 8.801039854552821e-05, + "loss": 2.9432, + "step": 430 + }, + { + "epoch": 0.5307125307125307, + "grad_norm": 1.7361953258514404, + "learning_rate": 8.787806498821571e-05, + "loss": 2.881, + "step": 432 + }, + { + "epoch": 0.5331695331695332, + "grad_norm": 1.7489663362503052, + "learning_rate": 8.774510578595153e-05, + "loss": 2.9763, + "step": 434 + }, + { + "epoch": 0.5356265356265356, + "grad_norm": 1.5390799045562744, + "learning_rate": 8.761152313486824e-05, + "loss": 2.9691, + "step": 436 + }, + { + "epoch": 0.538083538083538, + "grad_norm": 1.8254520893096924, + "learning_rate": 8.747731924139622e-05, + "loss": 2.7518, + "step": 438 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 1.6369190216064453, + "learning_rate": 8.734249632222702e-05, + "loss": 2.9397, + "step": 440 + }, + { + "epoch": 0.542997542997543, + "grad_norm": 1.5464283227920532, + "learning_rate": 8.720705660427692e-05, + "loss": 2.85, + "step": 442 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.9427014589309692, + "learning_rate": 8.707100232465007e-05, + "loss": 2.8669, + "step": 444 + }, + { + "epoch": 0.547911547911548, + "grad_norm": 1.6004284620285034, + "learning_rate": 8.69343357306015e-05, + "loss": 2.8216, + "step": 446 + }, + { + "epoch": 0.5503685503685504, + "grad_norm": 1.615379810333252, + "learning_rate": 8.67970590795001e-05, + "loss": 2.6078, + "step": 448 + }, + { + "epoch": 0.5528255528255528, + "grad_norm": 1.7116811275482178, + "learning_rate": 8.665917463879125e-05, + "loss": 3.1718, + "step": 450 + }, + { + "epoch": 0.5552825552825553, + "grad_norm": 1.7578415870666504, + "learning_rate": 8.65206846859594e-05, + "loss": 2.9675, + "step": 452 + }, + { + "epoch": 0.5577395577395577, + "grad_norm": 1.6257036924362183, + "learning_rate": 8.638159150849046e-05, + "loss": 3.0276, + "step": 454 + }, + { + "epoch": 0.5601965601965602, + "grad_norm": 1.6307907104492188, + "learning_rate": 8.6241897403834e-05, + "loss": 2.984, + "step": 456 + }, + { + "epoch": 0.5626535626535627, + "grad_norm": 1.628524661064148, + "learning_rate": 8.610160467936533e-05, + "loss": 2.9575, + "step": 458 + }, + { + "epoch": 0.5651105651105651, + "grad_norm": 1.656004786491394, + "learning_rate": 8.596071565234733e-05, + "loss": 3.0361, + "step": 460 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 1.5190882682800293, + "learning_rate": 8.581923264989228e-05, + "loss": 2.8684, + "step": 462 + }, + { + "epoch": 0.5700245700245701, + "grad_norm": 1.6448097229003906, + "learning_rate": 8.567715800892326e-05, + "loss": 3.0113, + "step": 464 + }, + { + "epoch": 0.5724815724815725, + "grad_norm": 1.7265264987945557, + "learning_rate": 8.553449407613572e-05, + "loss": 2.9945, + "step": 466 + }, + { + "epoch": 0.5749385749385749, + "grad_norm": 7.519404411315918, + "learning_rate": 8.539124320795862e-05, + "loss": 3.0108, + "step": 468 + }, + { + "epoch": 0.5773955773955773, + "grad_norm": 1.586311936378479, + "learning_rate": 8.524740777051555e-05, + "loss": 2.7747, + "step": 470 + }, + { + "epoch": 0.5798525798525799, + "grad_norm": 1.7437552213668823, + "learning_rate": 8.510299013958558e-05, + "loss": 3.0769, + "step": 472 + }, + { + "epoch": 0.5823095823095823, + "grad_norm": 1.757537603378296, + "learning_rate": 8.495799270056412e-05, + "loss": 2.9696, + "step": 474 + }, + { + "epoch": 0.5847665847665847, + "grad_norm": 1.5627938508987427, + "learning_rate": 8.481241784842344e-05, + "loss": 2.7725, + "step": 476 + }, + { + "epoch": 0.5872235872235873, + "grad_norm": 1.6397974491119385, + "learning_rate": 8.466626798767318e-05, + "loss": 3.0094, + "step": 478 + }, + { + "epoch": 0.5896805896805897, + "grad_norm": 1.6290757656097412, + "learning_rate": 8.451954553232055e-05, + "loss": 3.0959, + "step": 480 + }, + { + "epoch": 0.5921375921375921, + "grad_norm": 1.628265619277954, + "learning_rate": 8.437225290583051e-05, + "loss": 2.7791, + "step": 482 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 1.6443836688995361, + "learning_rate": 8.422439254108576e-05, + "loss": 2.9753, + "step": 484 + }, + { + "epoch": 0.597051597051597, + "grad_norm": 1.617613673210144, + "learning_rate": 8.407596688034648e-05, + "loss": 3.0032, + "step": 486 + }, + { + "epoch": 0.5995085995085995, + "grad_norm": 1.6962013244628906, + "learning_rate": 8.392697837521007e-05, + "loss": 2.845, + "step": 488 + }, + { + "epoch": 0.601965601965602, + "grad_norm": 1.5888972282409668, + "learning_rate": 8.37774294865706e-05, + "loss": 2.7606, + "step": 490 + }, + { + "epoch": 0.6044226044226044, + "grad_norm": 1.5973700284957886, + "learning_rate": 8.362732268457824e-05, + "loss": 3.1137, + "step": 492 + }, + { + "epoch": 0.6068796068796068, + "grad_norm": 27.63825035095215, + "learning_rate": 8.347666044859833e-05, + "loss": 2.8452, + "step": 494 + }, + { + "epoch": 0.6093366093366094, + "grad_norm": 1.592497706413269, + "learning_rate": 8.332544526717057e-05, + "loss": 2.8386, + "step": 496 + }, + { + "epoch": 0.6117936117936118, + "grad_norm": 1.5605237483978271, + "learning_rate": 8.317367963796778e-05, + "loss": 2.9155, + "step": 498 + }, + { + "epoch": 0.6142506142506142, + "grad_norm": 1.5933369398117065, + "learning_rate": 8.30213660677548e-05, + "loss": 2.8719, + "step": 500 + }, + { + "epoch": 0.6167076167076168, + "grad_norm": 1.649196743965149, + "learning_rate": 8.286850707234691e-05, + "loss": 2.8277, + "step": 502 + }, + { + "epoch": 0.6191646191646192, + "grad_norm": 1.671309232711792, + "learning_rate": 8.271510517656845e-05, + "loss": 2.8637, + "step": 504 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 1.6627384424209595, + "learning_rate": 8.256116291421094e-05, + "loss": 3.0589, + "step": 506 + }, + { + "epoch": 0.6240786240786241, + "grad_norm": 1.6332685947418213, + "learning_rate": 8.24066828279914e-05, + "loss": 2.8909, + "step": 508 + }, + { + "epoch": 0.6265356265356266, + "grad_norm": 1.5245287418365479, + "learning_rate": 8.225166746951023e-05, + "loss": 3.0116, + "step": 510 + }, + { + "epoch": 0.628992628992629, + "grad_norm": 1.6836432218551636, + "learning_rate": 8.209611939920912e-05, + "loss": 2.6435, + "step": 512 + }, + { + "epoch": 0.6314496314496314, + "grad_norm": 1.5706799030303955, + "learning_rate": 8.194004118632873e-05, + "loss": 2.8328, + "step": 514 + }, + { + "epoch": 0.6339066339066339, + "grad_norm": 1.6746190786361694, + "learning_rate": 8.178343540886626e-05, + "loss": 2.8675, + "step": 516 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 1.629543662071228, + "learning_rate": 8.162630465353292e-05, + "loss": 2.7241, + "step": 518 + }, + { + "epoch": 0.6388206388206388, + "grad_norm": 1.6499792337417603, + "learning_rate": 8.146865151571108e-05, + "loss": 2.8229, + "step": 520 + }, + { + "epoch": 0.6412776412776413, + "grad_norm": 1.7184644937515259, + "learning_rate": 8.131047859941156e-05, + "loss": 2.77, + "step": 522 + }, + { + "epoch": 0.6437346437346437, + "grad_norm": 4.247711181640625, + "learning_rate": 8.11517885172305e-05, + "loss": 2.9487, + "step": 524 + }, + { + "epoch": 0.6461916461916462, + "grad_norm": 1.6026562452316284, + "learning_rate": 8.099258389030624e-05, + "loss": 2.6282, + "step": 526 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 1.6910419464111328, + "learning_rate": 8.083286734827605e-05, + "loss": 2.8437, + "step": 528 + }, + { + "epoch": 0.6511056511056511, + "grad_norm": 2.3516945838928223, + "learning_rate": 8.067264152923268e-05, + "loss": 2.8406, + "step": 530 + }, + { + "epoch": 0.6535626535626535, + "grad_norm": 1.7951064109802246, + "learning_rate": 8.051190907968076e-05, + "loss": 2.7437, + "step": 532 + }, + { + "epoch": 0.6560196560196561, + "grad_norm": 1.6154842376708984, + "learning_rate": 8.035067265449312e-05, + "loss": 2.7701, + "step": 534 + }, + { + "epoch": 0.6584766584766585, + "grad_norm": 1.5919100046157837, + "learning_rate": 8.018893491686692e-05, + "loss": 2.8796, + "step": 536 + }, + { + "epoch": 0.6609336609336609, + "grad_norm": 1.6625549793243408, + "learning_rate": 8.00266985382797e-05, + "loss": 2.8874, + "step": 538 + }, + { + "epoch": 0.6633906633906634, + "grad_norm": 1.5272475481033325, + "learning_rate": 7.986396619844519e-05, + "loss": 2.8395, + "step": 540 + }, + { + "epoch": 0.6658476658476659, + "grad_norm": 1.6629564762115479, + "learning_rate": 7.970074058526908e-05, + "loss": 2.8472, + "step": 542 + }, + { + "epoch": 0.6683046683046683, + "grad_norm": 1.5899837017059326, + "learning_rate": 7.953702439480468e-05, + "loss": 2.5589, + "step": 544 + }, + { + "epoch": 0.6707616707616708, + "grad_norm": 1.6535660028457642, + "learning_rate": 7.937282033120825e-05, + "loss": 3.002, + "step": 546 + }, + { + "epoch": 0.6732186732186732, + "grad_norm": 1.7113556861877441, + "learning_rate": 7.920813110669445e-05, + "loss": 2.921, + "step": 548 + }, + { + "epoch": 0.6756756756756757, + "grad_norm": 1.6539169549942017, + "learning_rate": 7.904295944149157e-05, + "loss": 2.7657, + "step": 550 + }, + { + "epoch": 0.6781326781326781, + "grad_norm": 1.499321699142456, + "learning_rate": 7.887730806379641e-05, + "loss": 2.5954, + "step": 552 + }, + { + "epoch": 0.6805896805896806, + "grad_norm": 1.5506938695907593, + "learning_rate": 7.871117970972948e-05, + "loss": 2.9977, + "step": 554 + }, + { + "epoch": 0.683046683046683, + "grad_norm": 1.60562264919281, + "learning_rate": 7.854457712328957e-05, + "loss": 2.8725, + "step": 556 + }, + { + "epoch": 0.6855036855036855, + "grad_norm": 1.5586713552474976, + "learning_rate": 7.837750305630862e-05, + "loss": 2.7757, + "step": 558 + }, + { + "epoch": 0.687960687960688, + "grad_norm": 1.4952479600906372, + "learning_rate": 7.820996026840607e-05, + "loss": 2.8109, + "step": 560 + }, + { + "epoch": 0.6904176904176904, + "grad_norm": 1.5375022888183594, + "learning_rate": 7.804195152694347e-05, + "loss": 2.8958, + "step": 562 + }, + { + "epoch": 0.6928746928746928, + "grad_norm": 1.5429644584655762, + "learning_rate": 7.787347960697863e-05, + "loss": 2.848, + "step": 564 + }, + { + "epoch": 0.6953316953316954, + "grad_norm": 1.5820600986480713, + "learning_rate": 7.77045472912199e-05, + "loss": 2.7349, + "step": 566 + }, + { + "epoch": 0.6977886977886978, + "grad_norm": 1.6391702890396118, + "learning_rate": 7.753515736998007e-05, + "loss": 2.7536, + "step": 568 + }, + { + "epoch": 0.7002457002457002, + "grad_norm": 1.8837157487869263, + "learning_rate": 7.736531264113041e-05, + "loss": 2.5788, + "step": 570 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 1.6547040939331055, + "learning_rate": 7.719501591005436e-05, + "loss": 2.9089, + "step": 572 + }, + { + "epoch": 0.7051597051597052, + "grad_norm": 1.6191688776016235, + "learning_rate": 7.702426998960129e-05, + "loss": 2.7423, + "step": 574 + }, + { + "epoch": 0.7076167076167076, + "grad_norm": 1.63784658908844, + "learning_rate": 7.685307770003993e-05, + "loss": 2.9405, + "step": 576 + }, + { + "epoch": 0.7100737100737101, + "grad_norm": 1.7041395902633667, + "learning_rate": 7.668144186901189e-05, + "loss": 2.7419, + "step": 578 + }, + { + "epoch": 0.7125307125307125, + "grad_norm": 1.626767635345459, + "learning_rate": 7.650936533148485e-05, + "loss": 2.917, + "step": 580 + }, + { + "epoch": 0.714987714987715, + "grad_norm": 1.5805187225341797, + "learning_rate": 7.633685092970584e-05, + "loss": 2.9039, + "step": 582 + }, + { + "epoch": 0.7174447174447175, + "grad_norm": 1.5578582286834717, + "learning_rate": 7.616390151315422e-05, + "loss": 2.7313, + "step": 584 + }, + { + "epoch": 0.7199017199017199, + "grad_norm": 1.5582646131515503, + "learning_rate": 7.599051993849467e-05, + "loss": 2.4935, + "step": 586 + }, + { + "epoch": 0.7223587223587223, + "grad_norm": 1.6257047653198242, + "learning_rate": 7.58167090695299e-05, + "loss": 2.9326, + "step": 588 + }, + { + "epoch": 0.7248157248157249, + "grad_norm": 1.534566879272461, + "learning_rate": 7.56424717771535e-05, + "loss": 2.7481, + "step": 590 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 4.033563137054443, + "learning_rate": 7.546781093930238e-05, + "loss": 2.6916, + "step": 592 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 1.7314389944076538, + "learning_rate": 7.529272944090935e-05, + "loss": 2.8562, + "step": 594 + }, + { + "epoch": 0.7321867321867321, + "grad_norm": 1.5590124130249023, + "learning_rate": 7.511723017385538e-05, + "loss": 2.6231, + "step": 596 + }, + { + "epoch": 0.7346437346437347, + "grad_norm": 1.5711228847503662, + "learning_rate": 7.494131603692187e-05, + "loss": 2.7342, + "step": 598 + }, + { + "epoch": 0.7371007371007371, + "grad_norm": 1.6933467388153076, + "learning_rate": 7.476498993574277e-05, + "loss": 2.8793, + "step": 600 + }, + { + "epoch": 0.7395577395577395, + "grad_norm": 1.652563214302063, + "learning_rate": 7.45882547827566e-05, + "loss": 2.8555, + "step": 602 + }, + { + "epoch": 0.742014742014742, + "grad_norm": 1.5554332733154297, + "learning_rate": 7.441111349715832e-05, + "loss": 2.7455, + "step": 604 + }, + { + "epoch": 0.7444717444717445, + "grad_norm": 1.5497958660125732, + "learning_rate": 7.423356900485108e-05, + "loss": 2.7962, + "step": 606 + }, + { + "epoch": 0.7469287469287469, + "grad_norm": 2.072221040725708, + "learning_rate": 7.405562423839801e-05, + "loss": 2.7827, + "step": 608 + }, + { + "epoch": 0.7493857493857494, + "grad_norm": 1.6481581926345825, + "learning_rate": 7.387728213697365e-05, + "loss": 2.7402, + "step": 610 + }, + { + "epoch": 0.7518427518427518, + "grad_norm": 1.605137825012207, + "learning_rate": 7.369854564631548e-05, + "loss": 2.9223, + "step": 612 + }, + { + "epoch": 0.7542997542997543, + "grad_norm": 1.502097725868225, + "learning_rate": 7.351941771867523e-05, + "loss": 2.811, + "step": 614 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 1.4896032810211182, + "learning_rate": 7.333990131277013e-05, + "loss": 2.7072, + "step": 616 + }, + { + "epoch": 0.7592137592137592, + "grad_norm": 1.5070993900299072, + "learning_rate": 7.315999939373404e-05, + "loss": 2.6862, + "step": 618 + }, + { + "epoch": 0.7616707616707616, + "grad_norm": 1.5684906244277954, + "learning_rate": 7.297971493306848e-05, + "loss": 2.9692, + "step": 620 + }, + { + "epoch": 0.7641277641277642, + "grad_norm": 1.6148993968963623, + "learning_rate": 7.279905090859352e-05, + "loss": 2.6739, + "step": 622 + }, + { + "epoch": 0.7665847665847666, + "grad_norm": 1.8809860944747925, + "learning_rate": 7.261801030439864e-05, + "loss": 2.6416, + "step": 624 + }, + { + "epoch": 0.769041769041769, + "grad_norm": 1.9866212606430054, + "learning_rate": 7.243659611079343e-05, + "loss": 2.6516, + "step": 626 + }, + { + "epoch": 0.7714987714987716, + "grad_norm": 1.5821627378463745, + "learning_rate": 7.225481132425812e-05, + "loss": 2.5458, + "step": 628 + }, + { + "epoch": 0.773955773955774, + "grad_norm": 2.8795077800750732, + "learning_rate": 7.20726589473942e-05, + "loss": 2.7961, + "step": 630 + }, + { + "epoch": 0.7764127764127764, + "grad_norm": 1.6154601573944092, + "learning_rate": 7.189014198887478e-05, + "loss": 2.9093, + "step": 632 + }, + { + "epoch": 0.7788697788697788, + "grad_norm": 1.677990436553955, + "learning_rate": 7.170726346339488e-05, + "loss": 2.888, + "step": 634 + }, + { + "epoch": 0.7813267813267813, + "grad_norm": 2.146972417831421, + "learning_rate": 7.15240263916216e-05, + "loss": 2.7926, + "step": 636 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 1.5269049406051636, + "learning_rate": 7.134043380014436e-05, + "loss": 2.6542, + "step": 638 + }, + { + "epoch": 0.7862407862407862, + "grad_norm": 1.5493632555007935, + "learning_rate": 7.115648872142475e-05, + "loss": 2.5847, + "step": 640 + }, + { + "epoch": 0.7886977886977887, + "grad_norm": 1.7266803979873657, + "learning_rate": 7.097219419374652e-05, + "loss": 2.8569, + "step": 642 + }, + { + "epoch": 0.7911547911547911, + "grad_norm": 1.5810487270355225, + "learning_rate": 7.078755326116542e-05, + "loss": 2.5271, + "step": 644 + }, + { + "epoch": 0.7936117936117936, + "grad_norm": 1.8903851509094238, + "learning_rate": 7.060256897345888e-05, + "loss": 2.8828, + "step": 646 + }, + { + "epoch": 0.7960687960687961, + "grad_norm": 1.6093307733535767, + "learning_rate": 7.041724438607563e-05, + "loss": 2.9436, + "step": 648 + }, + { + "epoch": 0.7985257985257985, + "grad_norm": 1.629585862159729, + "learning_rate": 7.023158256008521e-05, + "loss": 2.9666, + "step": 650 + }, + { + "epoch": 0.800982800982801, + "grad_norm": 1.62311851978302, + "learning_rate": 7.004558656212753e-05, + "loss": 2.8761, + "step": 652 + }, + { + "epoch": 0.8034398034398035, + "grad_norm": 1.5945847034454346, + "learning_rate": 6.985925946436213e-05, + "loss": 2.8419, + "step": 654 + }, + { + "epoch": 0.8058968058968059, + "grad_norm": 1.5463379621505737, + "learning_rate": 6.967260434441729e-05, + "loss": 2.7639, + "step": 656 + }, + { + "epoch": 0.8083538083538083, + "grad_norm": 3.601102590560913, + "learning_rate": 6.948562428533955e-05, + "loss": 2.8573, + "step": 658 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 1.59696364402771, + "learning_rate": 6.929832237554241e-05, + "loss": 2.8708, + "step": 660 + }, + { + "epoch": 0.8132678132678133, + "grad_norm": 1.577546238899231, + "learning_rate": 6.911070170875562e-05, + "loss": 2.8815, + "step": 662 + }, + { + "epoch": 0.8157248157248157, + "grad_norm": 1.4518203735351562, + "learning_rate": 6.892276538397384e-05, + "loss": 2.7745, + "step": 664 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 1.5612823963165283, + "learning_rate": 6.873451650540566e-05, + "loss": 2.6452, + "step": 666 + }, + { + "epoch": 0.8206388206388207, + "grad_norm": 1.5459407567977905, + "learning_rate": 6.854595818242213e-05, + "loss": 2.8471, + "step": 668 + }, + { + "epoch": 0.8230958230958231, + "grad_norm": 1.6408060789108276, + "learning_rate": 6.835709352950557e-05, + "loss": 2.8203, + "step": 670 + }, + { + "epoch": 0.8255528255528255, + "grad_norm": 1.6304421424865723, + "learning_rate": 6.816792566619806e-05, + "loss": 2.6816, + "step": 672 + }, + { + "epoch": 0.828009828009828, + "grad_norm": 1.6135345697402954, + "learning_rate": 6.797845771704983e-05, + "loss": 2.7026, + "step": 674 + }, + { + "epoch": 0.8304668304668305, + "grad_norm": 1.6707392930984497, + "learning_rate": 6.778869281156784e-05, + "loss": 2.8929, + "step": 676 + }, + { + "epoch": 0.8329238329238329, + "grad_norm": 1.621004581451416, + "learning_rate": 6.759863408416386e-05, + "loss": 2.7963, + "step": 678 + }, + { + "epoch": 0.8353808353808354, + "grad_norm": 1.5636838674545288, + "learning_rate": 6.740828467410294e-05, + "loss": 2.9369, + "step": 680 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 1.6135865449905396, + "learning_rate": 6.721764772545135e-05, + "loss": 2.5881, + "step": 682 + }, + { + "epoch": 0.8402948402948403, + "grad_norm": 1.6814947128295898, + "learning_rate": 6.702672638702475e-05, + "loss": 2.8039, + "step": 684 + }, + { + "epoch": 0.8427518427518428, + "grad_norm": 1.5667632818222046, + "learning_rate": 6.68355238123362e-05, + "loss": 2.8448, + "step": 686 + }, + { + "epoch": 0.8452088452088452, + "grad_norm": 1.6321567296981812, + "learning_rate": 6.664404315954397e-05, + "loss": 2.7062, + "step": 688 + }, + { + "epoch": 0.8476658476658476, + "grad_norm": 1.4741226434707642, + "learning_rate": 6.64522875913995e-05, + "loss": 2.8175, + "step": 690 + }, + { + "epoch": 0.8501228501228502, + "grad_norm": 1.583012580871582, + "learning_rate": 6.626026027519509e-05, + "loss": 2.7698, + "step": 692 + }, + { + "epoch": 0.8525798525798526, + "grad_norm": 1.650833010673523, + "learning_rate": 6.606796438271156e-05, + "loss": 2.7542, + "step": 694 + }, + { + "epoch": 0.855036855036855, + "grad_norm": 1.736618161201477, + "learning_rate": 6.587540309016592e-05, + "loss": 2.5914, + "step": 696 + }, + { + "epoch": 0.8574938574938575, + "grad_norm": 1.5331878662109375, + "learning_rate": 6.568257957815893e-05, + "loss": 2.7587, + "step": 698 + }, + { + "epoch": 0.85995085995086, + "grad_norm": 2.8066611289978027, + "learning_rate": 6.54894970316224e-05, + "loss": 2.5376, + "step": 700 + }, + { + "epoch": 0.8624078624078624, + "grad_norm": 1.8552104234695435, + "learning_rate": 6.529615863976684e-05, + "loss": 2.8213, + "step": 702 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 7.016737937927246, + "learning_rate": 6.510256759602857e-05, + "loss": 2.763, + "step": 704 + }, + { + "epoch": 0.8673218673218673, + "grad_norm": 8.383584022521973, + "learning_rate": 6.4908727098017e-05, + "loss": 2.8134, + "step": 706 + }, + { + "epoch": 0.8697788697788698, + "grad_norm": 2.5546131134033203, + "learning_rate": 6.4714640347462e-05, + "loss": 2.5981, + "step": 708 + }, + { + "epoch": 0.8722358722358723, + "grad_norm": 1.4681930541992188, + "learning_rate": 6.452031055016073e-05, + "loss": 2.6272, + "step": 710 + }, + { + "epoch": 0.8746928746928747, + "grad_norm": 1.540204405784607, + "learning_rate": 6.432574091592494e-05, + "loss": 2.6063, + "step": 712 + }, + { + "epoch": 0.8771498771498771, + "grad_norm": 2.637376308441162, + "learning_rate": 6.41309346585278e-05, + "loss": 2.931, + "step": 714 + }, + { + "epoch": 0.8796068796068796, + "grad_norm": 1.6025326251983643, + "learning_rate": 6.393589499565088e-05, + "loss": 2.6114, + "step": 716 + }, + { + "epoch": 0.8820638820638821, + "grad_norm": 1.6360759735107422, + "learning_rate": 6.374062514883099e-05, + "loss": 2.8207, + "step": 718 + }, + { + "epoch": 0.8845208845208845, + "grad_norm": 1.5794533491134644, + "learning_rate": 6.354512834340695e-05, + "loss": 2.6199, + "step": 720 + }, + { + "epoch": 0.8869778869778869, + "grad_norm": 1.4947880506515503, + "learning_rate": 6.334940780846634e-05, + "loss": 2.6621, + "step": 722 + }, + { + "epoch": 0.8894348894348895, + "grad_norm": 1.632408618927002, + "learning_rate": 6.315346677679218e-05, + "loss": 2.8979, + "step": 724 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 1.636464238166809, + "learning_rate": 6.295730848480947e-05, + "loss": 2.714, + "step": 726 + }, + { + "epoch": 0.8943488943488943, + "grad_norm": 1.55814790725708, + "learning_rate": 6.276093617253182e-05, + "loss": 2.7157, + "step": 728 + }, + { + "epoch": 0.8968058968058968, + "grad_norm": 1.559779405593872, + "learning_rate": 6.256435308350786e-05, + "loss": 2.6694, + "step": 730 + }, + { + "epoch": 0.8992628992628993, + "grad_norm": 1.5911952257156372, + "learning_rate": 6.236756246476765e-05, + "loss": 2.8256, + "step": 732 + }, + { + "epoch": 0.9017199017199017, + "grad_norm": 1.542946457862854, + "learning_rate": 6.217056756676917e-05, + "loss": 2.5519, + "step": 734 + }, + { + "epoch": 0.9041769041769042, + "grad_norm": 1.6930584907531738, + "learning_rate": 6.197337164334453e-05, + "loss": 2.8785, + "step": 736 + }, + { + "epoch": 0.9066339066339066, + "grad_norm": 1.5558332204818726, + "learning_rate": 6.177597795164616e-05, + "loss": 2.6596, + "step": 738 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 1.4722115993499756, + "learning_rate": 6.157838975209323e-05, + "loss": 2.5197, + "step": 740 + }, + { + "epoch": 0.9115479115479116, + "grad_norm": 1.4730453491210938, + "learning_rate": 6.138061030831755e-05, + "loss": 2.8153, + "step": 742 + }, + { + "epoch": 0.914004914004914, + "grad_norm": 1.6478359699249268, + "learning_rate": 6.118264288710988e-05, + "loss": 2.7405, + "step": 744 + }, + { + "epoch": 0.9164619164619164, + "grad_norm": 1.4796196222305298, + "learning_rate": 6.098449075836575e-05, + "loss": 2.753, + "step": 746 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 1.9191288948059082, + "learning_rate": 6.0786157195031653e-05, + "loss": 2.6252, + "step": 748 + }, + { + "epoch": 0.9213759213759214, + "grad_norm": 1.553387999534607, + "learning_rate": 6.058764547305088e-05, + "loss": 2.8045, + "step": 750 + }, + { + "epoch": 0.9238329238329238, + "grad_norm": 1.4824281930923462, + "learning_rate": 6.038895887130942e-05, + "loss": 2.7996, + "step": 752 + }, + { + "epoch": 0.9262899262899262, + "grad_norm": 1.5677618980407715, + "learning_rate": 6.019010067158181e-05, + "loss": 2.891, + "step": 754 + }, + { + "epoch": 0.9287469287469288, + "grad_norm": 1.6281031370162964, + "learning_rate": 5.9991074158476935e-05, + "loss": 2.7762, + "step": 756 + }, + { + "epoch": 0.9312039312039312, + "grad_norm": 1.5614237785339355, + "learning_rate": 5.9791882619383766e-05, + "loss": 2.726, + "step": 758 + }, + { + "epoch": 0.9336609336609336, + "grad_norm": 1.446755290031433, + "learning_rate": 5.959252934441707e-05, + "loss": 2.5548, + "step": 760 + }, + { + "epoch": 0.9361179361179361, + "grad_norm": 1.5004243850708008, + "learning_rate": 5.939301762636307e-05, + "loss": 2.8673, + "step": 762 + }, + { + "epoch": 0.9385749385749386, + "grad_norm": 8.901308059692383, + "learning_rate": 5.9193350760625014e-05, + "loss": 2.6982, + "step": 764 + }, + { + "epoch": 0.941031941031941, + "grad_norm": 1.6001726388931274, + "learning_rate": 5.8993532045168795e-05, + "loss": 2.6233, + "step": 766 + }, + { + "epoch": 0.9434889434889435, + "grad_norm": 1.5054491758346558, + "learning_rate": 5.879356478046849e-05, + "loss": 2.7115, + "step": 768 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 1.4467328786849976, + "learning_rate": 5.8593452269451775e-05, + "loss": 2.6884, + "step": 770 + }, + { + "epoch": 0.9484029484029484, + "grad_norm": 1.4677785634994507, + "learning_rate": 5.839319781744542e-05, + "loss": 2.845, + "step": 772 + }, + { + "epoch": 0.9508599508599509, + "grad_norm": 1.5005624294281006, + "learning_rate": 5.81928047321207e-05, + "loss": 2.6341, + "step": 774 + }, + { + "epoch": 0.9533169533169533, + "grad_norm": 1.468900442123413, + "learning_rate": 5.79922763234387e-05, + "loss": 2.6078, + "step": 776 + }, + { + "epoch": 0.9557739557739557, + "grad_norm": 1.5839577913284302, + "learning_rate": 5.779161590359573e-05, + "loss": 2.6946, + "step": 778 + }, + { + "epoch": 0.9582309582309583, + "grad_norm": 1.5110023021697998, + "learning_rate": 5.7590826786968576e-05, + "loss": 2.6369, + "step": 780 + }, + { + "epoch": 0.9606879606879607, + "grad_norm": 1.4900883436203003, + "learning_rate": 5.738991229005972e-05, + "loss": 2.5128, + "step": 782 + }, + { + "epoch": 0.9631449631449631, + "grad_norm": 1.5296499729156494, + "learning_rate": 5.7188875731442605e-05, + "loss": 2.5457, + "step": 784 + }, + { + "epoch": 0.9656019656019657, + "grad_norm": 1.5462408065795898, + "learning_rate": 5.6987720431706826e-05, + "loss": 2.5989, + "step": 786 + }, + { + "epoch": 0.9680589680589681, + "grad_norm": 1.476729154586792, + "learning_rate": 5.678644971340326e-05, + "loss": 2.9144, + "step": 788 + }, + { + "epoch": 0.9705159705159705, + "grad_norm": 1.6275049448013306, + "learning_rate": 5.658506690098916e-05, + "loss": 2.7696, + "step": 790 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 1.4835902452468872, + "learning_rate": 5.638357532077331e-05, + "loss": 2.5136, + "step": 792 + }, + { + "epoch": 0.9754299754299754, + "grad_norm": 1.526121973991394, + "learning_rate": 5.6181978300861046e-05, + "loss": 2.4791, + "step": 794 + }, + { + "epoch": 0.9778869778869779, + "grad_norm": 1.4545388221740723, + "learning_rate": 5.598027917109929e-05, + "loss": 2.5994, + "step": 796 + }, + { + "epoch": 0.9803439803439803, + "grad_norm": 2.1908226013183594, + "learning_rate": 5.577848126302152e-05, + "loss": 2.8895, + "step": 798 + }, + { + "epoch": 0.9828009828009828, + "grad_norm": 1.504654049873352, + "learning_rate": 5.55765879097928e-05, + "loss": 2.7705, + "step": 800 + }, + { + "epoch": 0.9852579852579852, + "grad_norm": 1.6015645265579224, + "learning_rate": 5.5374602446154665e-05, + "loss": 2.845, + "step": 802 + }, + { + "epoch": 0.9877149877149877, + "grad_norm": 1.4529505968093872, + "learning_rate": 5.517252820837011e-05, + "loss": 2.5013, + "step": 804 + }, + { + "epoch": 0.9901719901719902, + "grad_norm": 1.4882707595825195, + "learning_rate": 5.49703685341684e-05, + "loss": 2.7789, + "step": 806 + }, + { + "epoch": 0.9926289926289926, + "grad_norm": 1.589077353477478, + "learning_rate": 5.4768126762690034e-05, + "loss": 2.6236, + "step": 808 + }, + { + "epoch": 0.995085995085995, + "grad_norm": 1.4018021821975708, + "learning_rate": 5.456580623443145e-05, + "loss": 2.6779, + "step": 810 + }, + { + "epoch": 0.9975429975429976, + "grad_norm": 1.4764320850372314, + "learning_rate": 5.436341029119004e-05, + "loss": 2.7036, + "step": 812 + }, + { + "epoch": 1.0, + "grad_norm": 4.599942684173584, + "learning_rate": 5.416094227600881e-05, + "loss": 3.0131, + "step": 814 + }, + { + "epoch": 1.0024570024570025, + "grad_norm": 1.4779109954833984, + "learning_rate": 5.395840553312117e-05, + "loss": 2.3572, + "step": 816 + }, + { + "epoch": 1.0049140049140048, + "grad_norm": 1.549442172050476, + "learning_rate": 5.375580340789579e-05, + "loss": 2.3533, + "step": 818 + }, + { + "epoch": 1.0073710073710074, + "grad_norm": 1.3227856159210205, + "learning_rate": 5.355313924678123e-05, + "loss": 2.1369, + "step": 820 + }, + { + "epoch": 1.00982800982801, + "grad_norm": 1.521327257156372, + "learning_rate": 5.3350416397250735e-05, + "loss": 2.3716, + "step": 822 + }, + { + "epoch": 1.0122850122850122, + "grad_norm": 1.627074122428894, + "learning_rate": 5.314763820774695e-05, + "loss": 2.1913, + "step": 824 + }, + { + "epoch": 1.0147420147420148, + "grad_norm": 1.4463263750076294, + "learning_rate": 5.2944808027626566e-05, + "loss": 2.1152, + "step": 826 + }, + { + "epoch": 1.0171990171990173, + "grad_norm": 1.678314447402954, + "learning_rate": 5.2741929207105034e-05, + "loss": 2.3256, + "step": 828 + }, + { + "epoch": 1.0196560196560196, + "grad_norm": 1.4523744583129883, + "learning_rate": 5.253900509720118e-05, + "loss": 2.24, + "step": 830 + }, + { + "epoch": 1.0221130221130221, + "grad_norm": 1.408027172088623, + "learning_rate": 5.2336039049681926e-05, + "loss": 2.1322, + "step": 832 + }, + { + "epoch": 1.0245700245700247, + "grad_norm": 1.3995742797851562, + "learning_rate": 5.213303441700691e-05, + "loss": 2.1241, + "step": 834 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 1.534173607826233, + "learning_rate": 5.192999455227302e-05, + "loss": 2.2434, + "step": 836 + }, + { + "epoch": 1.0294840294840295, + "grad_norm": 1.461700201034546, + "learning_rate": 5.172692280915915e-05, + "loss": 2.1441, + "step": 838 + }, + { + "epoch": 1.031941031941032, + "grad_norm": 1.5100444555282593, + "learning_rate": 5.152382254187076e-05, + "loss": 2.0964, + "step": 840 + }, + { + "epoch": 1.0343980343980343, + "grad_norm": 1.459774374961853, + "learning_rate": 5.132069710508435e-05, + "loss": 2.1989, + "step": 842 + }, + { + "epoch": 1.0368550368550369, + "grad_norm": 1.457028865814209, + "learning_rate": 5.1117549853892254e-05, + "loss": 2.4655, + "step": 844 + }, + { + "epoch": 1.0393120393120394, + "grad_norm": 1.5125858783721924, + "learning_rate": 5.091438414374709e-05, + "loss": 2.098, + "step": 846 + }, + { + "epoch": 1.0417690417690417, + "grad_norm": 1.5458678007125854, + "learning_rate": 5.0711203330406334e-05, + "loss": 2.3988, + "step": 848 + }, + { + "epoch": 1.0442260442260443, + "grad_norm": 1.5320196151733398, + "learning_rate": 5.050801076987699e-05, + "loss": 2.1486, + "step": 850 + }, + { + "epoch": 1.0466830466830466, + "grad_norm": 1.5283520221710205, + "learning_rate": 5.0304809818360046e-05, + "loss": 2.531, + "step": 852 + }, + { + "epoch": 1.049140049140049, + "grad_norm": 1.4647068977355957, + "learning_rate": 5.01016038321951e-05, + "loss": 2.1089, + "step": 854 + }, + { + "epoch": 1.0515970515970516, + "grad_norm": 1.6575099229812622, + "learning_rate": 4.989839616780492e-05, + "loss": 2.1275, + "step": 856 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 1.459604024887085, + "learning_rate": 4.969519018163996e-05, + "loss": 2.3404, + "step": 858 + }, + { + "epoch": 1.0565110565110565, + "grad_norm": 1.6122591495513916, + "learning_rate": 4.949198923012302e-05, + "loss": 2.4887, + "step": 860 + }, + { + "epoch": 1.058968058968059, + "grad_norm": 1.424753189086914, + "learning_rate": 4.9288796669593684e-05, + "loss": 2.285, + "step": 862 + }, + { + "epoch": 1.0614250614250613, + "grad_norm": 1.4969350099563599, + "learning_rate": 4.908561585625293e-05, + "loss": 2.3444, + "step": 864 + }, + { + "epoch": 1.0638820638820639, + "grad_norm": 1.4562941789627075, + "learning_rate": 4.888245014610775e-05, + "loss": 2.2335, + "step": 866 + }, + { + "epoch": 1.0663390663390664, + "grad_norm": 1.4847339391708374, + "learning_rate": 4.8679302894915664e-05, + "loss": 2.1127, + "step": 868 + }, + { + "epoch": 1.0687960687960687, + "grad_norm": 1.5414636135101318, + "learning_rate": 4.8476177458129254e-05, + "loss": 2.3521, + "step": 870 + }, + { + "epoch": 1.0712530712530712, + "grad_norm": 1.515874981880188, + "learning_rate": 4.827307719084084e-05, + "loss": 2.096, + "step": 872 + }, + { + "epoch": 1.0737100737100738, + "grad_norm": 1.5700652599334717, + "learning_rate": 4.807000544772698e-05, + "loss": 2.2585, + "step": 874 + }, + { + "epoch": 1.076167076167076, + "grad_norm": 2.0547258853912354, + "learning_rate": 4.786696558299311e-05, + "loss": 2.2072, + "step": 876 + }, + { + "epoch": 1.0786240786240786, + "grad_norm": 1.5897148847579956, + "learning_rate": 4.7663960950318085e-05, + "loss": 2.3934, + "step": 878 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 1.4570897817611694, + "learning_rate": 4.7460994902798834e-05, + "loss": 2.3312, + "step": 880 + }, + { + "epoch": 1.0835380835380835, + "grad_norm": 1.6115684509277344, + "learning_rate": 4.725807079289498e-05, + "loss": 2.2842, + "step": 882 + }, + { + "epoch": 1.085995085995086, + "grad_norm": 1.5855250358581543, + "learning_rate": 4.705519197237344e-05, + "loss": 2.2557, + "step": 884 + }, + { + "epoch": 1.0884520884520885, + "grad_norm": 1.5922023057937622, + "learning_rate": 4.685236179225304e-05, + "loss": 2.227, + "step": 886 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 2.3632519245147705, + "learning_rate": 4.664958360274928e-05, + "loss": 2.3233, + "step": 888 + }, + { + "epoch": 1.0933660933660934, + "grad_norm": 1.5227491855621338, + "learning_rate": 4.64468607532188e-05, + "loss": 2.3709, + "step": 890 + }, + { + "epoch": 1.095823095823096, + "grad_norm": 1.5834206342697144, + "learning_rate": 4.624419659210423e-05, + "loss": 2.2404, + "step": 892 + }, + { + "epoch": 1.0982800982800982, + "grad_norm": 1.4218862056732178, + "learning_rate": 4.604159446687884e-05, + "loss": 2.2469, + "step": 894 + }, + { + "epoch": 1.1007371007371007, + "grad_norm": 1.5175880193710327, + "learning_rate": 4.58390577239912e-05, + "loss": 2.4355, + "step": 896 + }, + { + "epoch": 1.1031941031941033, + "grad_norm": 1.523693323135376, + "learning_rate": 4.563658970880996e-05, + "loss": 2.2409, + "step": 898 + }, + { + "epoch": 1.1056511056511056, + "grad_norm": 1.5392237901687622, + "learning_rate": 4.543419376556855e-05, + "loss": 2.1449, + "step": 900 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 1.3254413604736328, + "learning_rate": 4.523187323731e-05, + "loss": 2.0661, + "step": 902 + }, + { + "epoch": 1.1105651105651106, + "grad_norm": 2.804891586303711, + "learning_rate": 4.502963146583161e-05, + "loss": 2.2777, + "step": 904 + }, + { + "epoch": 1.113022113022113, + "grad_norm": 1.5989583730697632, + "learning_rate": 4.48274717916299e-05, + "loss": 2.2534, + "step": 906 + }, + { + "epoch": 1.1154791154791155, + "grad_norm": 1.4605116844177246, + "learning_rate": 4.462539755384534e-05, + "loss": 2.3944, + "step": 908 + }, + { + "epoch": 1.117936117936118, + "grad_norm": 1.5217384099960327, + "learning_rate": 4.442341209020722e-05, + "loss": 2.223, + "step": 910 + }, + { + "epoch": 1.1203931203931203, + "grad_norm": 1.5808994770050049, + "learning_rate": 4.4221518736978484e-05, + "loss": 2.3372, + "step": 912 + }, + { + "epoch": 1.1228501228501229, + "grad_norm": 1.3402371406555176, + "learning_rate": 4.4019720828900726e-05, + "loss": 2.2651, + "step": 914 + }, + { + "epoch": 1.1253071253071254, + "grad_norm": 1.541191577911377, + "learning_rate": 4.3818021699138966e-05, + "loss": 2.2003, + "step": 916 + }, + { + "epoch": 1.1277641277641277, + "grad_norm": 1.8297321796417236, + "learning_rate": 4.3616424679226705e-05, + "loss": 2.1892, + "step": 918 + }, + { + "epoch": 1.1302211302211302, + "grad_norm": 1.6328003406524658, + "learning_rate": 4.3414933099010854e-05, + "loss": 2.2373, + "step": 920 + }, + { + "epoch": 1.1326781326781328, + "grad_norm": 1.6647323369979858, + "learning_rate": 4.3213550286596754e-05, + "loss": 2.2735, + "step": 922 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 1.477112889289856, + "learning_rate": 4.3012279568293165e-05, + "loss": 2.4351, + "step": 924 + }, + { + "epoch": 1.1375921375921376, + "grad_norm": 1.490395188331604, + "learning_rate": 4.2811124268557406e-05, + "loss": 2.3081, + "step": 926 + }, + { + "epoch": 1.1400491400491402, + "grad_norm": 1.5305448770523071, + "learning_rate": 4.26100877099403e-05, + "loss": 2.2579, + "step": 928 + }, + { + "epoch": 1.1425061425061425, + "grad_norm": 1.6744205951690674, + "learning_rate": 4.2409173213031436e-05, + "loss": 2.2845, + "step": 930 + }, + { + "epoch": 1.144963144963145, + "grad_norm": 1.5053201913833618, + "learning_rate": 4.220838409640428e-05, + "loss": 2.4011, + "step": 932 + }, + { + "epoch": 1.1474201474201475, + "grad_norm": 1.486065149307251, + "learning_rate": 4.20077236765613e-05, + "loss": 2.4018, + "step": 934 + }, + { + "epoch": 1.1498771498771498, + "grad_norm": 1.5150314569473267, + "learning_rate": 4.180719526787931e-05, + "loss": 2.2454, + "step": 936 + }, + { + "epoch": 1.1523341523341524, + "grad_norm": 1.4720960855484009, + "learning_rate": 4.160680218255458e-05, + "loss": 2.1771, + "step": 938 + }, + { + "epoch": 1.154791154791155, + "grad_norm": 1.4490469694137573, + "learning_rate": 4.140654773054824e-05, + "loss": 2.3108, + "step": 940 + }, + { + "epoch": 1.1572481572481572, + "grad_norm": 1.4559823274612427, + "learning_rate": 4.120643521953152e-05, + "loss": 2.1008, + "step": 942 + }, + { + "epoch": 1.1597051597051597, + "grad_norm": 1.4725589752197266, + "learning_rate": 4.1006467954831216e-05, + "loss": 2.1791, + "step": 944 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 1.5066269636154175, + "learning_rate": 4.0806649239375005e-05, + "loss": 2.281, + "step": 946 + }, + { + "epoch": 1.1646191646191646, + "grad_norm": 1.642109990119934, + "learning_rate": 4.060698237363695e-05, + "loss": 2.1889, + "step": 948 + }, + { + "epoch": 1.1670761670761671, + "grad_norm": 1.5414645671844482, + "learning_rate": 4.0407470655582935e-05, + "loss": 2.3946, + "step": 950 + }, + { + "epoch": 1.1695331695331694, + "grad_norm": 1.4948586225509644, + "learning_rate": 4.020811738061625e-05, + "loss": 2.0528, + "step": 952 + }, + { + "epoch": 1.171990171990172, + "grad_norm": 1.5673713684082031, + "learning_rate": 4.0008925841523083e-05, + "loss": 2.3508, + "step": 954 + }, + { + "epoch": 1.1744471744471745, + "grad_norm": 1.4704508781433105, + "learning_rate": 3.980989932841821e-05, + "loss": 2.1674, + "step": 956 + }, + { + "epoch": 1.1769041769041768, + "grad_norm": 1.4955954551696777, + "learning_rate": 3.9611041128690586e-05, + "loss": 2.3911, + "step": 958 + }, + { + "epoch": 1.1793611793611793, + "grad_norm": 1.5151094198226929, + "learning_rate": 3.941235452694913e-05, + "loss": 2.0934, + "step": 960 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 1.5122382640838623, + "learning_rate": 3.921384280496835e-05, + "loss": 2.1734, + "step": 962 + }, + { + "epoch": 1.1842751842751842, + "grad_norm": 1.5011439323425293, + "learning_rate": 3.9015509241634265e-05, + "loss": 2.3088, + "step": 964 + }, + { + "epoch": 1.1867321867321867, + "grad_norm": 1.4409561157226562, + "learning_rate": 3.8817357112890154e-05, + "loss": 2.1152, + "step": 966 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 1.5257965326309204, + "learning_rate": 3.861938969168245e-05, + "loss": 2.3275, + "step": 968 + }, + { + "epoch": 1.1916461916461916, + "grad_norm": 1.519891619682312, + "learning_rate": 3.842161024790679e-05, + "loss": 2.054, + "step": 970 + }, + { + "epoch": 1.194103194103194, + "grad_norm": 1.4816685914993286, + "learning_rate": 3.8224022048353844e-05, + "loss": 2.0573, + "step": 972 + }, + { + "epoch": 1.1965601965601966, + "grad_norm": 1.5991785526275635, + "learning_rate": 3.802662835665549e-05, + "loss": 2.4364, + "step": 974 + }, + { + "epoch": 1.199017199017199, + "grad_norm": 1.5157305002212524, + "learning_rate": 3.782943243323083e-05, + "loss": 2.2804, + "step": 976 + }, + { + "epoch": 1.2014742014742015, + "grad_norm": 1.4873096942901611, + "learning_rate": 3.763243753523237e-05, + "loss": 2.1844, + "step": 978 + }, + { + "epoch": 1.203931203931204, + "grad_norm": 1.4316937923431396, + "learning_rate": 3.743564691649216e-05, + "loss": 2.2101, + "step": 980 + }, + { + "epoch": 1.2063882063882063, + "grad_norm": 1.577020287513733, + "learning_rate": 3.723906382746819e-05, + "loss": 2.3495, + "step": 982 + }, + { + "epoch": 1.2088452088452089, + "grad_norm": 1.49762761592865, + "learning_rate": 3.704269151519053e-05, + "loss": 2.261, + "step": 984 + }, + { + "epoch": 1.2113022113022114, + "grad_norm": 1.6478139162063599, + "learning_rate": 3.6846533223207836e-05, + "loss": 2.195, + "step": 986 + }, + { + "epoch": 1.2137592137592137, + "grad_norm": 1.467128872871399, + "learning_rate": 3.665059219153366e-05, + "loss": 2.1655, + "step": 988 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 1.4964045286178589, + "learning_rate": 3.645487165659305e-05, + "loss": 2.0594, + "step": 990 + }, + { + "epoch": 1.2186732186732188, + "grad_norm": 3.7448129653930664, + "learning_rate": 3.6259374851169025e-05, + "loss": 2.2382, + "step": 992 + }, + { + "epoch": 1.221130221130221, + "grad_norm": 1.5176564455032349, + "learning_rate": 3.606410500434913e-05, + "loss": 2.1226, + "step": 994 + }, + { + "epoch": 1.2235872235872236, + "grad_norm": 1.6118485927581787, + "learning_rate": 3.58690653414722e-05, + "loss": 2.2326, + "step": 996 + }, + { + "epoch": 1.2260442260442261, + "grad_norm": 1.5225954055786133, + "learning_rate": 3.567425908407507e-05, + "loss": 1.9996, + "step": 998 + }, + { + "epoch": 1.2285012285012284, + "grad_norm": 1.5977141857147217, + "learning_rate": 3.547968944983927e-05, + "loss": 2.3153, + "step": 1000 + }, + { + "epoch": 1.230958230958231, + "grad_norm": 1.789543628692627, + "learning_rate": 3.528535965253802e-05, + "loss": 2.1118, + "step": 1002 + }, + { + "epoch": 1.2334152334152333, + "grad_norm": 1.474496841430664, + "learning_rate": 3.5091272901983014e-05, + "loss": 2.0499, + "step": 1004 + }, + { + "epoch": 1.2358722358722358, + "grad_norm": 1.514814853668213, + "learning_rate": 3.4897432403971456e-05, + "loss": 2.0693, + "step": 1006 + }, + { + "epoch": 1.2383292383292384, + "grad_norm": 1.5560026168823242, + "learning_rate": 3.470384136023316e-05, + "loss": 2.2738, + "step": 1008 + }, + { + "epoch": 1.2407862407862407, + "grad_norm": 1.4812030792236328, + "learning_rate": 3.45105029683776e-05, + "loss": 2.1405, + "step": 1010 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 3.473381757736206, + "learning_rate": 3.431742042184108e-05, + "loss": 2.0961, + "step": 1012 + }, + { + "epoch": 1.2457002457002457, + "grad_norm": 1.422070026397705, + "learning_rate": 3.412459690983408e-05, + "loss": 1.8186, + "step": 1014 + }, + { + "epoch": 1.248157248157248, + "grad_norm": 1.6386499404907227, + "learning_rate": 3.393203561728845e-05, + "loss": 2.3395, + "step": 1016 + }, + { + "epoch": 1.2506142506142506, + "grad_norm": 1.6020177602767944, + "learning_rate": 3.373973972480493e-05, + "loss": 2.0407, + "step": 1018 + }, + { + "epoch": 1.253071253071253, + "grad_norm": 1.4519404172897339, + "learning_rate": 3.354771240860052e-05, + "loss": 2.1731, + "step": 1020 + }, + { + "epoch": 1.2555282555282554, + "grad_norm": 1.552148461341858, + "learning_rate": 3.335595684045604e-05, + "loss": 2.2207, + "step": 1022 + }, + { + "epoch": 1.257985257985258, + "grad_norm": 1.5464601516723633, + "learning_rate": 3.316447618766381e-05, + "loss": 2.0184, + "step": 1024 + }, + { + "epoch": 1.2604422604422605, + "grad_norm": 1.5165269374847412, + "learning_rate": 3.297327361297525e-05, + "loss": 2.1262, + "step": 1026 + }, + { + "epoch": 1.2628992628992628, + "grad_norm": 1.4744261503219604, + "learning_rate": 3.278235227454865e-05, + "loss": 2.2121, + "step": 1028 + }, + { + "epoch": 1.2653562653562653, + "grad_norm": 1.7227579355239868, + "learning_rate": 3.259171532589707e-05, + "loss": 2.0901, + "step": 1030 + }, + { + "epoch": 1.2678132678132679, + "grad_norm": 1.4957431554794312, + "learning_rate": 3.240136591583615e-05, + "loss": 2.3082, + "step": 1032 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 1.4819146394729614, + "learning_rate": 3.221130718843218e-05, + "loss": 1.9536, + "step": 1034 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 1.5298371315002441, + "learning_rate": 3.202154228295018e-05, + "loss": 2.3408, + "step": 1036 + }, + { + "epoch": 1.2751842751842752, + "grad_norm": 2.7232489585876465, + "learning_rate": 3.1832074333801954e-05, + "loss": 2.1727, + "step": 1038 + }, + { + "epoch": 1.2776412776412776, + "grad_norm": 1.6206049919128418, + "learning_rate": 3.164290647049443e-05, + "loss": 2.2159, + "step": 1040 + }, + { + "epoch": 1.28009828009828, + "grad_norm": 1.588446855545044, + "learning_rate": 3.14540418175779e-05, + "loss": 2.3948, + "step": 1042 + }, + { + "epoch": 1.2825552825552826, + "grad_norm": 1.722697377204895, + "learning_rate": 3.1265483494594363e-05, + "loss": 2.2415, + "step": 1044 + }, + { + "epoch": 1.285012285012285, + "grad_norm": 1.59923255443573, + "learning_rate": 3.1077234616026165e-05, + "loss": 2.2489, + "step": 1046 + }, + { + "epoch": 1.2874692874692875, + "grad_norm": 1.4835443496704102, + "learning_rate": 3.08892982912444e-05, + "loss": 2.2028, + "step": 1048 + }, + { + "epoch": 1.28992628992629, + "grad_norm": 1.577497959136963, + "learning_rate": 3.070167762445759e-05, + "loss": 2.3542, + "step": 1050 + }, + { + "epoch": 1.2923832923832923, + "grad_norm": 1.4639438390731812, + "learning_rate": 3.051437571466047e-05, + "loss": 2.0745, + "step": 1052 + }, + { + "epoch": 1.2948402948402948, + "grad_norm": 1.7099831104278564, + "learning_rate": 3.03273956555827e-05, + "loss": 2.3836, + "step": 1054 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 1.5152896642684937, + "learning_rate": 3.01407405356379e-05, + "loss": 2.1901, + "step": 1056 + }, + { + "epoch": 1.2997542997542997, + "grad_norm": 1.6085963249206543, + "learning_rate": 2.9954413437872464e-05, + "loss": 2.3145, + "step": 1058 + }, + { + "epoch": 1.3022113022113022, + "grad_norm": 1.545065999031067, + "learning_rate": 2.9768417439914792e-05, + "loss": 2.2041, + "step": 1060 + }, + { + "epoch": 1.3046683046683047, + "grad_norm": 1.6498991250991821, + "learning_rate": 2.9582755613924385e-05, + "loss": 2.2438, + "step": 1062 + }, + { + "epoch": 1.307125307125307, + "grad_norm": 1.4869440793991089, + "learning_rate": 2.939743102654113e-05, + "loss": 2.2388, + "step": 1064 + }, + { + "epoch": 1.3095823095823096, + "grad_norm": 1.5121225118637085, + "learning_rate": 2.9212446738834576e-05, + "loss": 2.127, + "step": 1066 + }, + { + "epoch": 1.3120393120393121, + "grad_norm": 1.4147547483444214, + "learning_rate": 2.9027805806253505e-05, + "loss": 2.1123, + "step": 1068 + }, + { + "epoch": 1.3144963144963144, + "grad_norm": 1.5282615423202515, + "learning_rate": 2.8843511278575265e-05, + "loss": 2.0531, + "step": 1070 + }, + { + "epoch": 1.316953316953317, + "grad_norm": 1.6827208995819092, + "learning_rate": 2.865956619985565e-05, + "loss": 2.1956, + "step": 1072 + }, + { + "epoch": 1.3194103194103195, + "grad_norm": 1.513061285018921, + "learning_rate": 2.8475973608378405e-05, + "loss": 2.2247, + "step": 1074 + }, + { + "epoch": 1.3218673218673218, + "grad_norm": 1.5746976137161255, + "learning_rate": 2.8292736536605146e-05, + "loss": 2.3413, + "step": 1076 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 1.4488451480865479, + "learning_rate": 2.810985801112521e-05, + "loss": 2.167, + "step": 1078 + }, + { + "epoch": 1.3267813267813269, + "grad_norm": 1.6251620054244995, + "learning_rate": 2.79273410526058e-05, + "loss": 2.0743, + "step": 1080 + }, + { + "epoch": 1.3292383292383292, + "grad_norm": 1.4623501300811768, + "learning_rate": 2.774518867574189e-05, + "loss": 2.1771, + "step": 1082 + }, + { + "epoch": 1.3316953316953317, + "grad_norm": 6.961853504180908, + "learning_rate": 2.7563403889206585e-05, + "loss": 2.1211, + "step": 1084 + }, + { + "epoch": 1.3341523341523343, + "grad_norm": 1.794399380683899, + "learning_rate": 2.7381989695601362e-05, + "loss": 2.0229, + "step": 1086 + }, + { + "epoch": 1.3366093366093366, + "grad_norm": 1.4306811094284058, + "learning_rate": 2.7200949091406496e-05, + "loss": 2.0289, + "step": 1088 + }, + { + "epoch": 1.339066339066339, + "grad_norm": 1.4475698471069336, + "learning_rate": 2.702028506693153e-05, + "loss": 2.073, + "step": 1090 + }, + { + "epoch": 1.3415233415233416, + "grad_norm": 1.458843469619751, + "learning_rate": 2.684000060626597e-05, + "loss": 1.9602, + "step": 1092 + }, + { + "epoch": 1.343980343980344, + "grad_norm": 1.5635676383972168, + "learning_rate": 2.6660098687229902e-05, + "loss": 2.042, + "step": 1094 + }, + { + "epoch": 1.3464373464373465, + "grad_norm": 1.535068392753601, + "learning_rate": 2.6480582281324784e-05, + "loss": 2.1949, + "step": 1096 + }, + { + "epoch": 1.348894348894349, + "grad_norm": 1.5624455213546753, + "learning_rate": 2.630145435368453e-05, + "loss": 2.2688, + "step": 1098 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 1.5718533992767334, + "learning_rate": 2.612271786302636e-05, + "loss": 2.241, + "step": 1100 + }, + { + "epoch": 1.3538083538083538, + "grad_norm": 1.5356695652008057, + "learning_rate": 2.5944375761601987e-05, + "loss": 2.1428, + "step": 1102 + }, + { + "epoch": 1.3562653562653564, + "grad_norm": 1.47645902633667, + "learning_rate": 2.5766430995148916e-05, + "loss": 1.9967, + "step": 1104 + }, + { + "epoch": 1.3587223587223587, + "grad_norm": 1.5116705894470215, + "learning_rate": 2.5588886502841692e-05, + "loss": 2.3021, + "step": 1106 + }, + { + "epoch": 1.3611793611793612, + "grad_norm": 1.5203925371170044, + "learning_rate": 2.541174521724341e-05, + "loss": 2.2395, + "step": 1108 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 1.5775456428527832, + "learning_rate": 2.5235010064257236e-05, + "loss": 2.3145, + "step": 1110 + }, + { + "epoch": 1.366093366093366, + "grad_norm": 1.4771641492843628, + "learning_rate": 2.5058683963078143e-05, + "loss": 2.2134, + "step": 1112 + }, + { + "epoch": 1.3685503685503686, + "grad_norm": 1.5161223411560059, + "learning_rate": 2.488276982614462e-05, + "loss": 2.2528, + "step": 1114 + }, + { + "epoch": 1.3710073710073711, + "grad_norm": 1.511385202407837, + "learning_rate": 2.470727055909065e-05, + "loss": 2.2802, + "step": 1116 + }, + { + "epoch": 1.3734643734643734, + "grad_norm": 1.7755464315414429, + "learning_rate": 2.4532189060697624e-05, + "loss": 2.097, + "step": 1118 + }, + { + "epoch": 1.375921375921376, + "grad_norm": 1.5679901838302612, + "learning_rate": 2.4357528222846517e-05, + "loss": 2.3338, + "step": 1120 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 1.4639720916748047, + "learning_rate": 2.4183290930470116e-05, + "loss": 2.1668, + "step": 1122 + }, + { + "epoch": 1.3808353808353808, + "grad_norm": 1.484370470046997, + "learning_rate": 2.4009480061505352e-05, + "loss": 2.2835, + "step": 1124 + }, + { + "epoch": 1.3832923832923834, + "grad_norm": 1.4937642812728882, + "learning_rate": 2.3836098486845788e-05, + "loss": 2.0235, + "step": 1126 + }, + { + "epoch": 1.3857493857493859, + "grad_norm": 1.546862244606018, + "learning_rate": 2.366314907029416e-05, + "loss": 2.4075, + "step": 1128 + }, + { + "epoch": 1.3882063882063882, + "grad_norm": 1.4853980541229248, + "learning_rate": 2.3490634668515154e-05, + "loss": 2.0339, + "step": 1130 + }, + { + "epoch": 1.3906633906633907, + "grad_norm": 1.4826372861862183, + "learning_rate": 2.331855813098813e-05, + "loss": 2.2912, + "step": 1132 + }, + { + "epoch": 1.393120393120393, + "grad_norm": 1.503356695175171, + "learning_rate": 2.3146922299960076e-05, + "loss": 2.2172, + "step": 1134 + }, + { + "epoch": 1.3955773955773956, + "grad_norm": 3.102303981781006, + "learning_rate": 2.297573001039873e-05, + "loss": 2.0942, + "step": 1136 + }, + { + "epoch": 1.398034398034398, + "grad_norm": 1.4009228944778442, + "learning_rate": 2.280498408994565e-05, + "loss": 2.0301, + "step": 1138 + }, + { + "epoch": 1.4004914004914004, + "grad_norm": 1.4026343822479248, + "learning_rate": 2.2634687358869594e-05, + "loss": 1.9603, + "step": 1140 + }, + { + "epoch": 1.402948402948403, + "grad_norm": 1.588887095451355, + "learning_rate": 2.2464842630019932e-05, + "loss": 2.3436, + "step": 1142 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 1.4971401691436768, + "learning_rate": 2.22954527087801e-05, + "loss": 2.2901, + "step": 1144 + }, + { + "epoch": 1.4078624078624078, + "grad_norm": 1.4828369617462158, + "learning_rate": 2.2126520393021367e-05, + "loss": 1.9858, + "step": 1146 + }, + { + "epoch": 1.4103194103194103, + "grad_norm": 1.565244436264038, + "learning_rate": 2.195804847305654e-05, + "loss": 2.188, + "step": 1148 + }, + { + "epoch": 1.4127764127764126, + "grad_norm": 1.5699490308761597, + "learning_rate": 2.1790039731593947e-05, + "loss": 2.2779, + "step": 1150 + }, + { + "epoch": 1.4152334152334152, + "grad_norm": 1.5688115358352661, + "learning_rate": 2.1622496943691388e-05, + "loss": 2.0154, + "step": 1152 + }, + { + "epoch": 1.4176904176904177, + "grad_norm": 1.543989658355713, + "learning_rate": 2.1455422876710425e-05, + "loss": 2.0484, + "step": 1154 + }, + { + "epoch": 1.42014742014742, + "grad_norm": 2.775977611541748, + "learning_rate": 2.1288820290270534e-05, + "loss": 2.0981, + "step": 1156 + }, + { + "epoch": 1.4226044226044225, + "grad_norm": 1.5382612943649292, + "learning_rate": 2.1122691936203598e-05, + "loss": 2.274, + "step": 1158 + }, + { + "epoch": 1.425061425061425, + "grad_norm": 1.5721843242645264, + "learning_rate": 2.0957040558508462e-05, + "loss": 2.176, + "step": 1160 + }, + { + "epoch": 1.4275184275184274, + "grad_norm": 1.4855724573135376, + "learning_rate": 2.079186889330556e-05, + "loss": 1.9464, + "step": 1162 + }, + { + "epoch": 1.42997542997543, + "grad_norm": 1.5165448188781738, + "learning_rate": 2.062717966879178e-05, + "loss": 2.083, + "step": 1164 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 1.4863147735595703, + "learning_rate": 2.046297560519533e-05, + "loss": 2.1314, + "step": 1166 + }, + { + "epoch": 1.4348894348894348, + "grad_norm": 1.5833560228347778, + "learning_rate": 2.0299259414730914e-05, + "loss": 2.1081, + "step": 1168 + }, + { + "epoch": 1.4373464373464373, + "grad_norm": 1.500045895576477, + "learning_rate": 2.0136033801554822e-05, + "loss": 2.2103, + "step": 1170 + }, + { + "epoch": 1.4398034398034398, + "grad_norm": 2.045681953430176, + "learning_rate": 1.9973301461720313e-05, + "loss": 2.4128, + "step": 1172 + }, + { + "epoch": 1.4422604422604421, + "grad_norm": 1.5052695274353027, + "learning_rate": 1.9811065083133097e-05, + "loss": 2.2454, + "step": 1174 + }, + { + "epoch": 1.4447174447174447, + "grad_norm": 1.5001286268234253, + "learning_rate": 1.9649327345506906e-05, + "loss": 2.2998, + "step": 1176 + }, + { + "epoch": 1.4471744471744472, + "grad_norm": 1.5142590999603271, + "learning_rate": 1.9488090920319247e-05, + "loss": 2.2612, + "step": 1178 + }, + { + "epoch": 1.4496314496314495, + "grad_norm": 1.5888320207595825, + "learning_rate": 1.9327358470767325e-05, + "loss": 2.426, + "step": 1180 + }, + { + "epoch": 1.452088452088452, + "grad_norm": 1.4589530229568481, + "learning_rate": 1.916713265172395e-05, + "loss": 2.2698, + "step": 1182 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 1.3764737844467163, + "learning_rate": 1.900741610969377e-05, + "loss": 2.1888, + "step": 1184 + }, + { + "epoch": 1.457002457002457, + "grad_norm": 1.4318088293075562, + "learning_rate": 1.884821148276952e-05, + "loss": 2.0345, + "step": 1186 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 1.5526138544082642, + "learning_rate": 1.8689521400588463e-05, + "loss": 2.0367, + "step": 1188 + }, + { + "epoch": 1.461916461916462, + "grad_norm": 1.4541133642196655, + "learning_rate": 1.8531348484288924e-05, + "loss": 2.0453, + "step": 1190 + }, + { + "epoch": 1.4643734643734643, + "grad_norm": 1.8645811080932617, + "learning_rate": 1.83736953464671e-05, + "loss": 1.8959, + "step": 1192 + }, + { + "epoch": 1.4668304668304668, + "grad_norm": 2.281449794769287, + "learning_rate": 1.8216564591133746e-05, + "loss": 2.1153, + "step": 1194 + }, + { + "epoch": 1.4692874692874693, + "grad_norm": 1.5182552337646484, + "learning_rate": 1.8059958813671286e-05, + "loss": 2.3368, + "step": 1196 + }, + { + "epoch": 1.4717444717444716, + "grad_norm": 1.4919682741165161, + "learning_rate": 1.790388060079089e-05, + "loss": 2.3376, + "step": 1198 + }, + { + "epoch": 1.4742014742014742, + "grad_norm": 1.5697439908981323, + "learning_rate": 1.774833253048978e-05, + "loss": 2.1936, + "step": 1200 + }, + { + "epoch": 1.4766584766584767, + "grad_norm": 1.6530416011810303, + "learning_rate": 1.75933171720086e-05, + "loss": 2.5177, + "step": 1202 + }, + { + "epoch": 1.479115479115479, + "grad_norm": 1.4762450456619263, + "learning_rate": 1.743883708578906e-05, + "loss": 2.0809, + "step": 1204 + }, + { + "epoch": 1.4815724815724816, + "grad_norm": 2.0405399799346924, + "learning_rate": 1.7284894823431568e-05, + "loss": 2.2776, + "step": 1206 + }, + { + "epoch": 1.484029484029484, + "grad_norm": 2.369046449661255, + "learning_rate": 1.7131492927653098e-05, + "loss": 2.3183, + "step": 1208 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 1.5134222507476807, + "learning_rate": 1.6978633932245218e-05, + "loss": 2.2988, + "step": 1210 + }, + { + "epoch": 1.488943488943489, + "grad_norm": 1.6127910614013672, + "learning_rate": 1.6826320362032232e-05, + "loss": 2.2625, + "step": 1212 + }, + { + "epoch": 1.4914004914004915, + "grad_norm": 1.4955569505691528, + "learning_rate": 1.6674554732829455e-05, + "loss": 2.1632, + "step": 1214 + }, + { + "epoch": 1.4938574938574938, + "grad_norm": 1.5238553285598755, + "learning_rate": 1.6523339551401672e-05, + "loss": 2.1019, + "step": 1216 + }, + { + "epoch": 1.4963144963144963, + "grad_norm": 1.5292433500289917, + "learning_rate": 1.6372677315421768e-05, + "loss": 2.1687, + "step": 1218 + }, + { + "epoch": 1.4987714987714988, + "grad_norm": 1.5058326721191406, + "learning_rate": 1.62225705134294e-05, + "loss": 2.0533, + "step": 1220 + }, + { + "epoch": 1.5012285012285012, + "grad_norm": 1.4159148931503296, + "learning_rate": 1.6073021624789934e-05, + "loss": 2.1105, + "step": 1222 + }, + { + "epoch": 1.5036855036855037, + "grad_norm": 1.4431846141815186, + "learning_rate": 1.5924033119653543e-05, + "loss": 2.0021, + "step": 1224 + }, + { + "epoch": 1.5061425061425062, + "grad_norm": 1.5581754446029663, + "learning_rate": 1.577560745891426e-05, + "loss": 2.188, + "step": 1226 + }, + { + "epoch": 1.5085995085995085, + "grad_norm": 1.6353570222854614, + "learning_rate": 1.5627747094169482e-05, + "loss": 2.2272, + "step": 1228 + }, + { + "epoch": 1.511056511056511, + "grad_norm": 1.7231024503707886, + "learning_rate": 1.548045446767945e-05, + "loss": 1.9323, + "step": 1230 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 1.4902280569076538, + "learning_rate": 1.533373201232682e-05, + "loss": 2.2629, + "step": 1232 + }, + { + "epoch": 1.515970515970516, + "grad_norm": 1.4090461730957031, + "learning_rate": 1.5187582151576562e-05, + "loss": 2.0648, + "step": 1234 + }, + { + "epoch": 1.5184275184275184, + "grad_norm": 1.5375722646713257, + "learning_rate": 1.5042007299435895e-05, + "loss": 2.2276, + "step": 1236 + }, + { + "epoch": 1.520884520884521, + "grad_norm": 1.5067601203918457, + "learning_rate": 1.4897009860414441e-05, + "loss": 1.9903, + "step": 1238 + }, + { + "epoch": 1.5233415233415233, + "grad_norm": 1.5371257066726685, + "learning_rate": 1.4752592229484463e-05, + "loss": 2.1906, + "step": 1240 + }, + { + "epoch": 1.5257985257985258, + "grad_norm": 1.5708671808242798, + "learning_rate": 1.4608756792041377e-05, + "loss": 2.1423, + "step": 1242 + }, + { + "epoch": 1.5282555282555284, + "grad_norm": 1.5206773281097412, + "learning_rate": 1.4465505923864281e-05, + "loss": 2.1689, + "step": 1244 + }, + { + "epoch": 1.5307125307125307, + "grad_norm": 1.469963550567627, + "learning_rate": 1.4322841991076746e-05, + "loss": 2.1178, + "step": 1246 + }, + { + "epoch": 1.5331695331695332, + "grad_norm": 1.5863155126571655, + "learning_rate": 1.4180767350107733e-05, + "loss": 2.3579, + "step": 1248 + }, + { + "epoch": 1.5356265356265357, + "grad_norm": 2.27736234664917, + "learning_rate": 1.403928434765267e-05, + "loss": 2.1562, + "step": 1250 + }, + { + "epoch": 1.538083538083538, + "grad_norm": 1.516559362411499, + "learning_rate": 1.3898395320634688e-05, + "loss": 2.1235, + "step": 1252 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 1.5483007431030273, + "learning_rate": 1.3758102596166007e-05, + "loss": 2.2266, + "step": 1254 + }, + { + "epoch": 1.542997542997543, + "grad_norm": 1.4693801403045654, + "learning_rate": 1.3618408491509555e-05, + "loss": 2.1205, + "step": 1256 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 1.5313624143600464, + "learning_rate": 1.3479315314040619e-05, + "loss": 2.1975, + "step": 1258 + }, + { + "epoch": 1.547911547911548, + "grad_norm": 1.5125219821929932, + "learning_rate": 1.3340825361208758e-05, + "loss": 2.0635, + "step": 1260 + }, + { + "epoch": 1.5503685503685505, + "grad_norm": 1.4909876585006714, + "learning_rate": 1.3202940920499918e-05, + "loss": 2.1662, + "step": 1262 + }, + { + "epoch": 1.5528255528255528, + "grad_norm": 1.4954192638397217, + "learning_rate": 1.306566426939852e-05, + "loss": 2.0689, + "step": 1264 + }, + { + "epoch": 1.5552825552825553, + "grad_norm": 1.4638853073120117, + "learning_rate": 1.2928997675349941e-05, + "loss": 2.0633, + "step": 1266 + }, + { + "epoch": 1.5577395577395579, + "grad_norm": 1.4579492807388306, + "learning_rate": 1.2792943395723083e-05, + "loss": 2.1987, + "step": 1268 + }, + { + "epoch": 1.5601965601965602, + "grad_norm": 1.4410840272903442, + "learning_rate": 1.2657503677772991e-05, + "loss": 2.1861, + "step": 1270 + }, + { + "epoch": 1.5626535626535627, + "grad_norm": 1.6172857284545898, + "learning_rate": 1.2522680758603788e-05, + "loss": 2.0121, + "step": 1272 + }, + { + "epoch": 1.5651105651105652, + "grad_norm": 1.506893515586853, + "learning_rate": 1.238847686513177e-05, + "loss": 2.1572, + "step": 1274 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 1.5652704238891602, + "learning_rate": 1.2254894214048496e-05, + "loss": 2.1499, + "step": 1276 + }, + { + "epoch": 1.57002457002457, + "grad_norm": 1.5030075311660767, + "learning_rate": 1.2121935011784286e-05, + "loss": 1.8415, + "step": 1278 + }, + { + "epoch": 1.5724815724815726, + "grad_norm": 1.4392123222351074, + "learning_rate": 1.1989601454471788e-05, + "loss": 2.2703, + "step": 1280 + }, + { + "epoch": 1.574938574938575, + "grad_norm": 1.44460129737854, + "learning_rate": 1.1857895727909596e-05, + "loss": 2.0363, + "step": 1282 + }, + { + "epoch": 1.5773955773955772, + "grad_norm": 1.4928911924362183, + "learning_rate": 1.172682000752624e-05, + "loss": 2.1526, + "step": 1284 + }, + { + "epoch": 1.57985257985258, + "grad_norm": 1.761731505393982, + "learning_rate": 1.1596376458344189e-05, + "loss": 2.2225, + "step": 1286 + }, + { + "epoch": 1.5823095823095823, + "grad_norm": 1.4366579055786133, + "learning_rate": 1.1466567234944203e-05, + "loss": 2.1126, + "step": 1288 + }, + { + "epoch": 1.5847665847665846, + "grad_norm": 1.4728341102600098, + "learning_rate": 1.1337394481429564e-05, + "loss": 2.0328, + "step": 1290 + }, + { + "epoch": 1.5872235872235874, + "grad_norm": 1.4520374536514282, + "learning_rate": 1.120886033139082e-05, + "loss": 2.0009, + "step": 1292 + }, + { + "epoch": 1.5896805896805897, + "grad_norm": 1.4696489572525024, + "learning_rate": 1.1080966907870494e-05, + "loss": 2.1204, + "step": 1294 + }, + { + "epoch": 1.592137592137592, + "grad_norm": 1.540980339050293, + "learning_rate": 1.0953716323327989e-05, + "loss": 2.216, + "step": 1296 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 1.5559231042861938, + "learning_rate": 1.0827110679604712e-05, + "loss": 2.2384, + "step": 1298 + }, + { + "epoch": 1.597051597051597, + "grad_norm": 1.4445903301239014, + "learning_rate": 1.0701152067889408e-05, + "loss": 2.2694, + "step": 1300 + }, + { + "epoch": 1.5995085995085994, + "grad_norm": 1.5548192262649536, + "learning_rate": 1.0575842568683497e-05, + "loss": 2.1511, + "step": 1302 + }, + { + "epoch": 1.6019656019656021, + "grad_norm": 1.985463261604309, + "learning_rate": 1.0451184251766832e-05, + "loss": 2.0459, + "step": 1304 + }, + { + "epoch": 1.6044226044226044, + "grad_norm": 1.5023545026779175, + "learning_rate": 1.0327179176163482e-05, + "loss": 2.2092, + "step": 1306 + }, + { + "epoch": 1.6068796068796067, + "grad_norm": 1.5086380243301392, + "learning_rate": 1.0203829390107673e-05, + "loss": 2.1328, + "step": 1308 + }, + { + "epoch": 1.6093366093366095, + "grad_norm": 1.5008928775787354, + "learning_rate": 1.0081136931009982e-05, + "loss": 2.0552, + "step": 1310 + }, + { + "epoch": 1.6117936117936118, + "grad_norm": 1.5161304473876953, + "learning_rate": 9.959103825423743e-06, + "loss": 2.3019, + "step": 1312 + }, + { + "epoch": 1.6142506142506141, + "grad_norm": 1.4673563241958618, + "learning_rate": 9.837732089011492e-06, + "loss": 2.2056, + "step": 1314 + }, + { + "epoch": 1.6167076167076169, + "grad_norm": 1.519995927810669, + "learning_rate": 9.717023726511653e-06, + "loss": 2.0393, + "step": 1316 + }, + { + "epoch": 1.6191646191646192, + "grad_norm": 1.503371238708496, + "learning_rate": 9.596980731705574e-06, + "loss": 2.005, + "step": 1318 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 1.4858044385910034, + "learning_rate": 9.477605087384428e-06, + "loss": 2.1498, + "step": 1320 + }, + { + "epoch": 1.6240786240786242, + "grad_norm": 1.4308788776397705, + "learning_rate": 9.358898765316554e-06, + "loss": 1.9853, + "step": 1322 + }, + { + "epoch": 1.6265356265356266, + "grad_norm": 1.4703772068023682, + "learning_rate": 9.24086372621486e-06, + "loss": 2.0003, + "step": 1324 + }, + { + "epoch": 1.6289926289926289, + "grad_norm": 1.5121406316757202, + "learning_rate": 9.123501919704475e-06, + "loss": 2.2549, + "step": 1326 + }, + { + "epoch": 1.6314496314496314, + "grad_norm": 1.723720908164978, + "learning_rate": 9.006815284290443e-06, + "loss": 2.0871, + "step": 1328 + }, + { + "epoch": 1.633906633906634, + "grad_norm": 1.5584907531738281, + "learning_rate": 8.890805747325865e-06, + "loss": 2.1639, + "step": 1330 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 1.5387362241744995, + "learning_rate": 8.775475224979907e-06, + "loss": 2.224, + "step": 1332 + }, + { + "epoch": 1.6388206388206388, + "grad_norm": 1.4272263050079346, + "learning_rate": 8.660825622206253e-06, + "loss": 1.927, + "step": 1334 + }, + { + "epoch": 1.6412776412776413, + "grad_norm": 1.4782159328460693, + "learning_rate": 8.546858832711585e-06, + "loss": 2.3164, + "step": 1336 + }, + { + "epoch": 1.6437346437346436, + "grad_norm": 1.5216556787490845, + "learning_rate": 8.433576738924353e-06, + "loss": 2.0712, + "step": 1338 + }, + { + "epoch": 1.6461916461916462, + "grad_norm": 1.5300792455673218, + "learning_rate": 8.320981211963646e-06, + "loss": 2.0898, + "step": 1340 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 1.5001579523086548, + "learning_rate": 8.209074111608239e-06, + "loss": 2.1395, + "step": 1342 + }, + { + "epoch": 1.651105651105651, + "grad_norm": 1.440273642539978, + "learning_rate": 8.097857286266014e-06, + "loss": 1.9906, + "step": 1344 + }, + { + "epoch": 1.6535626535626535, + "grad_norm": 1.5727773904800415, + "learning_rate": 7.987332572943295e-06, + "loss": 1.9092, + "step": 1346 + }, + { + "epoch": 1.656019656019656, + "grad_norm": 1.5014179944992065, + "learning_rate": 7.877501797214553e-06, + "loss": 2.1114, + "step": 1348 + }, + { + "epoch": 1.6584766584766584, + "grad_norm": 1.5219117403030396, + "learning_rate": 7.768366773192286e-06, + "loss": 2.079, + "step": 1350 + }, + { + "epoch": 1.660933660933661, + "grad_norm": 2.714843511581421, + "learning_rate": 7.659929303497015e-06, + "loss": 2.1555, + "step": 1352 + }, + { + "epoch": 1.6633906633906634, + "grad_norm": 1.7876430749893188, + "learning_rate": 7.552191179227464e-06, + "loss": 2.1865, + "step": 1354 + }, + { + "epoch": 1.6658476658476657, + "grad_norm": 1.4951201677322388, + "learning_rate": 7.445154179931102e-06, + "loss": 2.2055, + "step": 1356 + }, + { + "epoch": 1.6683046683046683, + "grad_norm": 1.429513692855835, + "learning_rate": 7.3388200735746486e-06, + "loss": 2.0879, + "step": 1358 + }, + { + "epoch": 1.6707616707616708, + "grad_norm": 1.5509145259857178, + "learning_rate": 7.23319061651489e-06, + "loss": 2.1114, + "step": 1360 + }, + { + "epoch": 1.6732186732186731, + "grad_norm": 1.5352951288223267, + "learning_rate": 7.128267553469675e-06, + "loss": 1.9494, + "step": 1362 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 1.449985384941101, + "learning_rate": 7.024052617489157e-06, + "loss": 2.2586, + "step": 1364 + }, + { + "epoch": 1.6781326781326782, + "grad_norm": 1.4463045597076416, + "learning_rate": 6.920547529927018e-06, + "loss": 1.925, + "step": 1366 + }, + { + "epoch": 1.6805896805896805, + "grad_norm": 1.5020941495895386, + "learning_rate": 6.817754000412219e-06, + "loss": 2.0315, + "step": 1368 + }, + { + "epoch": 1.683046683046683, + "grad_norm": 1.5441267490386963, + "learning_rate": 6.715673726820626e-06, + "loss": 2.2866, + "step": 1370 + }, + { + "epoch": 1.6855036855036856, + "grad_norm": 2.12288236618042, + "learning_rate": 6.614308395247021e-06, + "loss": 2.2449, + "step": 1372 + }, + { + "epoch": 1.6879606879606879, + "grad_norm": 1.4717684984207153, + "learning_rate": 6.513659679977235e-06, + "loss": 1.955, + "step": 1374 + }, + { + "epoch": 1.6904176904176904, + "grad_norm": 1.5139594078063965, + "learning_rate": 6.413729243460542e-06, + "loss": 2.2489, + "step": 1376 + }, + { + "epoch": 1.692874692874693, + "grad_norm": 1.4395731687545776, + "learning_rate": 6.314518736282132e-06, + "loss": 2.115, + "step": 1378 + }, + { + "epoch": 1.6953316953316953, + "grad_norm": 1.4303041696548462, + "learning_rate": 6.216029797135842e-06, + "loss": 2.1616, + "step": 1380 + }, + { + "epoch": 1.6977886977886978, + "grad_norm": 1.965557336807251, + "learning_rate": 6.118264052797185e-06, + "loss": 2.1315, + "step": 1382 + }, + { + "epoch": 1.7002457002457003, + "grad_norm": 1.4882440567016602, + "learning_rate": 6.021223118096387e-06, + "loss": 2.1687, + "step": 1384 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 1.5187492370605469, + "learning_rate": 5.9249085958917425e-06, + "loss": 2.0725, + "step": 1386 + }, + { + "epoch": 1.7051597051597052, + "grad_norm": 1.4788079261779785, + "learning_rate": 5.829322077043159e-06, + "loss": 2.0433, + "step": 1388 + }, + { + "epoch": 1.7076167076167077, + "grad_norm": 1.5279337167739868, + "learning_rate": 5.734465140385864e-06, + "loss": 1.9352, + "step": 1390 + }, + { + "epoch": 1.71007371007371, + "grad_norm": 1.5428900718688965, + "learning_rate": 5.640339352704277e-06, + "loss": 2.3354, + "step": 1392 + }, + { + "epoch": 1.7125307125307125, + "grad_norm": 1.9930230379104614, + "learning_rate": 5.54694626870626e-06, + "loss": 2.2981, + "step": 1394 + }, + { + "epoch": 1.714987714987715, + "grad_norm": 1.617288589477539, + "learning_rate": 5.454287430997296e-06, + "loss": 1.9582, + "step": 1396 + }, + { + "epoch": 1.7174447174447174, + "grad_norm": 1.4917547702789307, + "learning_rate": 5.36236437005509e-06, + "loss": 2.2466, + "step": 1398 + }, + { + "epoch": 1.71990171990172, + "grad_norm": 1.4349709749221802, + "learning_rate": 5.271178604204285e-06, + "loss": 1.9704, + "step": 1400 + }, + { + "epoch": 1.7223587223587224, + "grad_norm": 1.4842623472213745, + "learning_rate": 5.180731639591352e-06, + "loss": 2.2308, + "step": 1402 + }, + { + "epoch": 1.7248157248157248, + "grad_norm": 1.4840158224105835, + "learning_rate": 5.09102497015973e-06, + "loss": 1.9857, + "step": 1404 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 1.553490400314331, + "learning_rate": 5.002060077625159e-06, + "loss": 2.0217, + "step": 1406 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 1.4331109523773193, + "learning_rate": 4.913838431451184e-06, + "loss": 2.1303, + "step": 1408 + }, + { + "epoch": 1.7321867321867321, + "grad_norm": 1.5888655185699463, + "learning_rate": 4.826361488824898e-06, + "loss": 2.2863, + "step": 1410 + }, + { + "epoch": 1.7346437346437347, + "grad_norm": 1.5078940391540527, + "learning_rate": 4.739630694632879e-06, + "loss": 2.2173, + "step": 1412 + }, + { + "epoch": 1.7371007371007372, + "grad_norm": 1.4708291292190552, + "learning_rate": 4.6536474814373235e-06, + "loss": 2.1078, + "step": 1414 + }, + { + "epoch": 1.7395577395577395, + "grad_norm": 2.0618860721588135, + "learning_rate": 4.5684132694523485e-06, + "loss": 1.9959, + "step": 1416 + }, + { + "epoch": 1.742014742014742, + "grad_norm": 1.889565110206604, + "learning_rate": 4.483929466520592e-06, + "loss": 2.0008, + "step": 1418 + }, + { + "epoch": 1.7444717444717446, + "grad_norm": 1.6665345430374146, + "learning_rate": 4.400197468089906e-06, + "loss": 2.1364, + "step": 1420 + }, + { + "epoch": 1.746928746928747, + "grad_norm": 1.6015936136245728, + "learning_rate": 4.317218657190347e-06, + "loss": 2.2864, + "step": 1422 + }, + { + "epoch": 1.7493857493857494, + "grad_norm": 1.4520282745361328, + "learning_rate": 4.234994404411297e-06, + "loss": 2.0385, + "step": 1424 + }, + { + "epoch": 1.751842751842752, + "grad_norm": 1.4684919118881226, + "learning_rate": 4.153526067878877e-06, + "loss": 2.1212, + "step": 1426 + }, + { + "epoch": 1.7542997542997543, + "grad_norm": 1.5177769660949707, + "learning_rate": 4.072814993233442e-06, + "loss": 2.1327, + "step": 1428 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 1.605587124824524, + "learning_rate": 3.99286251360742e-06, + "loss": 2.2521, + "step": 1430 + }, + { + "epoch": 1.7592137592137593, + "grad_norm": 1.4767132997512817, + "learning_rate": 3.913669949603249e-06, + "loss": 2.0502, + "step": 1432 + }, + { + "epoch": 1.7616707616707616, + "grad_norm": 1.4715032577514648, + "learning_rate": 3.835238609271597e-06, + "loss": 2.0457, + "step": 1434 + }, + { + "epoch": 1.7641277641277642, + "grad_norm": 1.7920334339141846, + "learning_rate": 3.7575697880897155e-06, + "loss": 2.2316, + "step": 1436 + }, + { + "epoch": 1.7665847665847667, + "grad_norm": 1.5475815534591675, + "learning_rate": 3.680664768940101e-06, + "loss": 2.112, + "step": 1438 + }, + { + "epoch": 1.769041769041769, + "grad_norm": 1.5356806516647339, + "learning_rate": 3.6045248220892447e-06, + "loss": 2.1446, + "step": 1440 + }, + { + "epoch": 1.7714987714987716, + "grad_norm": 1.443109393119812, + "learning_rate": 3.5291512051666863e-06, + "loss": 2.303, + "step": 1442 + }, + { + "epoch": 1.773955773955774, + "grad_norm": 1.486393690109253, + "learning_rate": 3.454545163144224e-06, + "loss": 2.0364, + "step": 1444 + }, + { + "epoch": 1.7764127764127764, + "grad_norm": 1.4741051197052002, + "learning_rate": 3.380707928315374e-06, + "loss": 2.1089, + "step": 1446 + }, + { + "epoch": 1.7788697788697787, + "grad_norm": 1.5320779085159302, + "learning_rate": 3.3076407202749903e-06, + "loss": 2.1228, + "step": 1448 + }, + { + "epoch": 1.7813267813267815, + "grad_norm": 1.587825059890747, + "learning_rate": 3.235344745899116e-06, + "loss": 2.2439, + "step": 1450 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 1.4482508897781372, + "learning_rate": 3.163821199325112e-06, + "loss": 2.0064, + "step": 1452 + }, + { + "epoch": 1.786240786240786, + "grad_norm": 17.515474319458008, + "learning_rate": 3.093071261931835e-06, + "loss": 2.0661, + "step": 1454 + }, + { + "epoch": 1.7886977886977888, + "grad_norm": 1.4586145877838135, + "learning_rate": 3.023096102320194e-06, + "loss": 2.0865, + "step": 1456 + }, + { + "epoch": 1.7911547911547911, + "grad_norm": 1.4751797914505005, + "learning_rate": 2.9538968762938377e-06, + "loss": 1.9135, + "step": 1458 + }, + { + "epoch": 1.7936117936117935, + "grad_norm": 1.4772670269012451, + "learning_rate": 2.8854747268400315e-06, + "loss": 2.06, + "step": 1460 + }, + { + "epoch": 1.7960687960687962, + "grad_norm": 1.4039340019226074, + "learning_rate": 2.8178307841108197e-06, + "loss": 1.9852, + "step": 1462 + }, + { + "epoch": 1.7985257985257985, + "grad_norm": 1.6266793012619019, + "learning_rate": 2.750966165404345e-06, + "loss": 2.1592, + "step": 1464 + }, + { + "epoch": 1.8009828009828008, + "grad_norm": 1.5819138288497925, + "learning_rate": 2.6848819751463693e-06, + "loss": 2.2393, + "step": 1466 + }, + { + "epoch": 1.8034398034398036, + "grad_norm": 2.642188787460327, + "learning_rate": 2.619579304872061e-06, + "loss": 2.2103, + "step": 1468 + }, + { + "epoch": 1.805896805896806, + "grad_norm": 1.4835178852081299, + "learning_rate": 2.5550592332079514e-06, + "loss": 1.9957, + "step": 1470 + }, + { + "epoch": 1.8083538083538082, + "grad_norm": 1.474769115447998, + "learning_rate": 2.4913228258541365e-06, + "loss": 2.0459, + "step": 1472 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 1.502580165863037, + "learning_rate": 2.428371135566637e-06, + "loss": 2.0931, + "step": 1474 + }, + { + "epoch": 1.8132678132678133, + "grad_norm": 1.4862070083618164, + "learning_rate": 2.366205202140065e-06, + "loss": 1.9705, + "step": 1476 + }, + { + "epoch": 1.8157248157248156, + "grad_norm": 1.452000617980957, + "learning_rate": 2.304826052390385e-06, + "loss": 1.9833, + "step": 1478 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 1.50684654712677, + "learning_rate": 2.244234700138015e-06, + "loss": 2.1596, + "step": 1480 + }, + { + "epoch": 1.8206388206388207, + "grad_norm": 1.4713008403778076, + "learning_rate": 2.184432146191029e-06, + "loss": 1.9812, + "step": 1482 + }, + { + "epoch": 1.823095823095823, + "grad_norm": 1.507290244102478, + "learning_rate": 2.1254193783286634e-06, + "loss": 1.9974, + "step": 1484 + }, + { + "epoch": 1.8255528255528255, + "grad_norm": 1.4416630268096924, + "learning_rate": 2.067197371284979e-06, + "loss": 2.1458, + "step": 1486 + }, + { + "epoch": 1.828009828009828, + "grad_norm": 1.4889832735061646, + "learning_rate": 2.0097670867327835e-06, + "loss": 2.1121, + "step": 1488 + }, + { + "epoch": 1.8304668304668303, + "grad_norm": 1.449535846710205, + "learning_rate": 1.953129473267723e-06, + "loss": 1.9937, + "step": 1490 + }, + { + "epoch": 1.8329238329238329, + "grad_norm": 1.6293812990188599, + "learning_rate": 1.897285466392623e-06, + "loss": 1.9339, + "step": 1492 + }, + { + "epoch": 1.8353808353808354, + "grad_norm": 3.697316884994507, + "learning_rate": 1.8422359885020447e-06, + "loss": 2.2421, + "step": 1494 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 1.4677774906158447, + "learning_rate": 1.7879819488670346e-06, + "loss": 2.1215, + "step": 1496 + }, + { + "epoch": 1.8402948402948403, + "grad_norm": 1.4397891759872437, + "learning_rate": 1.734524243620117e-06, + "loss": 2.2512, + "step": 1498 + }, + { + "epoch": 1.8427518427518428, + "grad_norm": 1.4745187759399414, + "learning_rate": 1.681863755740487e-06, + "loss": 1.998, + "step": 1500 + }, + { + "epoch": 1.845208845208845, + "grad_norm": 1.507273554801941, + "learning_rate": 1.63000135503944e-06, + "loss": 2.2467, + "step": 1502 + }, + { + "epoch": 1.8476658476658476, + "grad_norm": 1.47614586353302, + "learning_rate": 1.5789378981459769e-06, + "loss": 2.1659, + "step": 1504 + }, + { + "epoch": 1.8501228501228502, + "grad_norm": 1.68035888671875, + "learning_rate": 1.528674228492677e-06, + "loss": 2.4466, + "step": 1506 + }, + { + "epoch": 1.8525798525798525, + "grad_norm": 1.9172157049179077, + "learning_rate": 1.4792111763017536e-06, + "loss": 1.9358, + "step": 1508 + }, + { + "epoch": 1.855036855036855, + "grad_norm": 1.452444314956665, + "learning_rate": 1.4305495585713592e-06, + "loss": 1.9827, + "step": 1510 + }, + { + "epoch": 1.8574938574938575, + "grad_norm": 1.3575811386108398, + "learning_rate": 1.3826901790620684e-06, + "loss": 1.8126, + "step": 1512 + }, + { + "epoch": 1.8599508599508598, + "grad_norm": 1.5134092569351196, + "learning_rate": 1.3356338282836278e-06, + "loss": 2.0454, + "step": 1514 + }, + { + "epoch": 1.8624078624078624, + "grad_norm": 1.457497477531433, + "learning_rate": 1.2893812834818663e-06, + "loss": 2.0123, + "step": 1516 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 1.401816487312317, + "learning_rate": 1.243933308625883e-06, + "loss": 2.1347, + "step": 1518 + }, + { + "epoch": 1.8673218673218672, + "grad_norm": 1.4954553842544556, + "learning_rate": 1.199290654395424e-06, + "loss": 2.1775, + "step": 1520 + }, + { + "epoch": 1.8697788697788698, + "grad_norm": 1.5043278932571411, + "learning_rate": 1.1554540581684704e-06, + "loss": 2.1036, + "step": 1522 + }, + { + "epoch": 1.8722358722358723, + "grad_norm": 1.4515742063522339, + "learning_rate": 1.1124242440090694e-06, + "loss": 2.0966, + "step": 1524 + }, + { + "epoch": 1.8746928746928746, + "grad_norm": 1.4932043552398682, + "learning_rate": 1.0702019226553783e-06, + "loss": 2.016, + "step": 1526 + }, + { + "epoch": 1.8771498771498771, + "grad_norm": 1.4192034006118774, + "learning_rate": 1.028787791507918e-06, + "loss": 1.9939, + "step": 1528 + }, + { + "epoch": 1.8796068796068797, + "grad_norm": 1.5298486948013306, + "learning_rate": 9.88182534618054e-07, + "loss": 2.2519, + "step": 1530 + }, + { + "epoch": 1.882063882063882, + "grad_norm": 1.411036729812622, + "learning_rate": 9.483868226767001e-07, + "loss": 1.94, + "step": 1532 + }, + { + "epoch": 1.8845208845208845, + "grad_norm": 1.3987832069396973, + "learning_rate": 9.09401313003233e-07, + "loss": 2.0146, + "step": 1534 + }, + { + "epoch": 1.886977886977887, + "grad_norm": 1.4715187549591064, + "learning_rate": 8.71226649534651e-07, + "loss": 2.0628, + "step": 1536 + }, + { + "epoch": 1.8894348894348894, + "grad_norm": 1.5129356384277344, + "learning_rate": 8.338634628149211e-07, + "loss": 2.1729, + "step": 1538 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 1.5162420272827148, + "learning_rate": 7.973123699845819e-07, + "loss": 2.1926, + "step": 1540 + }, + { + "epoch": 1.8943488943488944, + "grad_norm": 1.4543683528900146, + "learning_rate": 7.615739747705298e-07, + "loss": 2.0132, + "step": 1542 + }, + { + "epoch": 1.8968058968058967, + "grad_norm": 2.944164514541626, + "learning_rate": 7.266488674760486e-07, + "loss": 2.133, + "step": 1544 + }, + { + "epoch": 1.8992628992628993, + "grad_norm": 2.392069101333618, + "learning_rate": 6.925376249710958e-07, + "loss": 2.0738, + "step": 1546 + }, + { + "epoch": 1.9017199017199018, + "grad_norm": 1.4587106704711914, + "learning_rate": 6.592408106827152e-07, + "loss": 2.0429, + "step": 1548 + }, + { + "epoch": 1.904176904176904, + "grad_norm": 1.498578667640686, + "learning_rate": 6.267589745857727e-07, + "loss": 1.9591, + "step": 1550 + }, + { + "epoch": 1.9066339066339066, + "grad_norm": 1.5612292289733887, + "learning_rate": 5.950926531938683e-07, + "loss": 2.1256, + "step": 1552 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 1.4940868616104126, + "learning_rate": 5.642423695504607e-07, + "loss": 2.1534, + "step": 1554 + }, + { + "epoch": 1.9115479115479115, + "grad_norm": 1.5300830602645874, + "learning_rate": 5.342086332202345e-07, + "loss": 2.066, + "step": 1556 + }, + { + "epoch": 1.914004914004914, + "grad_norm": 2.079641819000244, + "learning_rate": 5.049919402806802e-07, + "loss": 2.2201, + "step": 1558 + }, + { + "epoch": 1.9164619164619165, + "grad_norm": 1.442713975906372, + "learning_rate": 4.7659277331391084e-07, + "loss": 2.1985, + "step": 1560 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 1.5434784889221191, + "learning_rate": 4.4901160139866337e-07, + "loss": 2.0088, + "step": 1562 + }, + { + "epoch": 1.9213759213759214, + "grad_norm": 1.5334758758544922, + "learning_rate": 4.222488801025992e-07, + "loss": 2.0176, + "step": 1564 + }, + { + "epoch": 1.923832923832924, + "grad_norm": 1.47463858127594, + "learning_rate": 3.9630505147473796e-07, + "loss": 2.0804, + "step": 1566 + }, + { + "epoch": 1.9262899262899262, + "grad_norm": 1.490317463874817, + "learning_rate": 3.7118054403816326e-07, + "loss": 2.1926, + "step": 1568 + }, + { + "epoch": 1.9287469287469288, + "grad_norm": 1.566702961921692, + "learning_rate": 3.4687577278295634e-07, + "loss": 2.1292, + "step": 1570 + }, + { + "epoch": 1.9312039312039313, + "grad_norm": 1.4762805700302124, + "learning_rate": 3.233911391593347e-07, + "loss": 2.0333, + "step": 1572 + }, + { + "epoch": 1.9336609336609336, + "grad_norm": 1.5117943286895752, + "learning_rate": 3.007270310710242e-07, + "loss": 2.1872, + "step": 1574 + }, + { + "epoch": 1.9361179361179361, + "grad_norm": 1.4353746175765991, + "learning_rate": 2.788838228688473e-07, + "loss": 2.0186, + "step": 1576 + }, + { + "epoch": 1.9385749385749387, + "grad_norm": 1.470143437385559, + "learning_rate": 2.578618753445394e-07, + "loss": 2.0722, + "step": 1578 + }, + { + "epoch": 1.941031941031941, + "grad_norm": 1.5072541236877441, + "learning_rate": 2.3766153572480353e-07, + "loss": 1.9765, + "step": 1580 + }, + { + "epoch": 1.9434889434889435, + "grad_norm": 1.4271113872528076, + "learning_rate": 2.1828313766555364e-07, + "loss": 2.0097, + "step": 1582 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 1.8777159452438354, + "learning_rate": 1.9972700124641364e-07, + "loss": 2.2184, + "step": 1584 + }, + { + "epoch": 1.9484029484029484, + "grad_norm": 1.3978275060653687, + "learning_rate": 1.8199343296543824e-07, + "loss": 1.8914, + "step": 1586 + }, + { + "epoch": 1.950859950859951, + "grad_norm": 1.4834709167480469, + "learning_rate": 1.6508272573403373e-07, + "loss": 2.1887, + "step": 1588 + }, + { + "epoch": 1.9533169533169534, + "grad_norm": 1.7158738374710083, + "learning_rate": 1.4899515887213944e-07, + "loss": 2.3368, + "step": 1590 + }, + { + "epoch": 1.9557739557739557, + "grad_norm": 1.4961293935775757, + "learning_rate": 1.3373099810359834e-07, + "loss": 2.1679, + "step": 1592 + }, + { + "epoch": 1.9582309582309583, + "grad_norm": 1.476669430732727, + "learning_rate": 1.1929049555176597e-07, + "loss": 1.7831, + "step": 1594 + }, + { + "epoch": 1.9606879606879608, + "grad_norm": 1.3573822975158691, + "learning_rate": 1.0567388973536929e-07, + "loss": 1.9332, + "step": 1596 + }, + { + "epoch": 1.9631449631449631, + "grad_norm": 1.5783758163452148, + "learning_rate": 9.288140556453773e-08, + "loss": 2.1729, + "step": 1598 + }, + { + "epoch": 1.9656019656019657, + "grad_norm": 1.6986812353134155, + "learning_rate": 8.091325433710606e-08, + "loss": 2.0146, + "step": 1600 + }, + { + "epoch": 1.9680589680589682, + "grad_norm": 1.50715172290802, + "learning_rate": 6.976963373512834e-08, + "loss": 2.0288, + "step": 1602 + }, + { + "epoch": 1.9705159705159705, + "grad_norm": 1.4856563806533813, + "learning_rate": 5.9450727821586115e-08, + "loss": 2.0767, + "step": 1604 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 1.494751214981079, + "learning_rate": 4.99567070373852e-08, + "loss": 2.2026, + "step": 1606 + }, + { + "epoch": 1.9754299754299756, + "grad_norm": 1.4676605463027954, + "learning_rate": 4.128772819850801e-08, + "loss": 2.0638, + "step": 1608 + }, + { + "epoch": 1.9778869778869779, + "grad_norm": 1.5908865928649902, + "learning_rate": 3.344393449344341e-08, + "loss": 2.1933, + "step": 1610 + }, + { + "epoch": 1.9803439803439802, + "grad_norm": 1.5087192058563232, + "learning_rate": 2.642545548081632e-08, + "loss": 1.9326, + "step": 1612 + }, + { + "epoch": 1.982800982800983, + "grad_norm": 1.4953945875167847, + "learning_rate": 2.0232407087245055e-08, + "loss": 2.1202, + "step": 1614 + }, + { + "epoch": 1.9852579852579852, + "grad_norm": 1.494329571723938, + "learning_rate": 1.4864891605420595e-08, + "loss": 2.1431, + "step": 1616 + }, + { + "epoch": 1.9877149877149876, + "grad_norm": 1.470465064048767, + "learning_rate": 1.0322997692441272e-08, + "loss": 2.0489, + "step": 1618 + }, + { + "epoch": 1.9901719901719903, + "grad_norm": 1.5035438537597656, + "learning_rate": 6.606800368313959e-09, + "loss": 2.1744, + "step": 1620 + }, + { + "epoch": 1.9926289926289926, + "grad_norm": 1.4546782970428467, + "learning_rate": 3.7163610147494808e-09, + "loss": 2.0479, + "step": 1622 + }, + { + "epoch": 1.995085995085995, + "grad_norm": 1.541097640991211, + "learning_rate": 1.6517273741134543e-09, + "loss": 2.0689, + "step": 1624 + }, + { + "epoch": 1.9975429975429977, + "grad_norm": 1.5054864883422852, + "learning_rate": 4.129335486713348e-10, + "loss": 2.069, + "step": 1626 + }, + { + "epoch": 2.0, + "grad_norm": 4.215028762817383, + "learning_rate": 0.0, + "loss": 2.1469, + "step": 1628 + } + ], + "logging_steps": 2, + "max_steps": 1628, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 814, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.7527904208355328e+17, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}