{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 814, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002457002457002457, "grad_norm": 4.019700050354004, "learning_rate": 2.4390243902439027e-06, "loss": 6.2086, "step": 2 }, { "epoch": 0.004914004914004914, "grad_norm": 4.2945332527160645, "learning_rate": 4.8780487804878055e-06, "loss": 6.1853, "step": 4 }, { "epoch": 0.007371007371007371, "grad_norm": 3.930802583694458, "learning_rate": 7.317073170731707e-06, "loss": 6.38, "step": 6 }, { "epoch": 0.009828009828009828, "grad_norm": 3.6977317333221436, "learning_rate": 9.756097560975611e-06, "loss": 6.1601, "step": 8 }, { "epoch": 0.012285012285012284, "grad_norm": 25.07744789123535, "learning_rate": 1.2195121951219513e-05, "loss": 6.5514, "step": 10 }, { "epoch": 0.014742014742014743, "grad_norm": 3.566401720046997, "learning_rate": 1.4634146341463415e-05, "loss": 5.6854, "step": 12 }, { "epoch": 0.0171990171990172, "grad_norm": 24.404541015625, "learning_rate": 1.707317073170732e-05, "loss": 5.8838, "step": 14 }, { "epoch": 0.019656019656019656, "grad_norm": 3.1777548789978027, "learning_rate": 1.9512195121951222e-05, "loss": 5.3108, "step": 16 }, { "epoch": 0.022113022113022112, "grad_norm": 2.769148349761963, "learning_rate": 2.1951219512195124e-05, "loss": 4.8014, "step": 18 }, { "epoch": 0.02457002457002457, "grad_norm": 3.1796083450317383, "learning_rate": 2.4390243902439026e-05, "loss": 5.3203, "step": 20 }, { "epoch": 0.02702702702702703, "grad_norm": 3.617638349533081, "learning_rate": 2.682926829268293e-05, "loss": 5.5772, "step": 22 }, { "epoch": 0.029484029484029485, "grad_norm": 7.00071382522583, "learning_rate": 2.926829268292683e-05, "loss": 5.0532, "step": 24 }, { "epoch": 0.03194103194103194, "grad_norm": 3.52091908454895, "learning_rate": 3.170731707317073e-05, "loss": 5.1329, "step": 26 }, { "epoch": 0.0343980343980344, "grad_norm": 3.3309226036071777, "learning_rate": 3.414634146341464e-05, "loss": 4.6793, "step": 28 }, { "epoch": 0.036855036855036855, "grad_norm": 2.865582227706909, "learning_rate": 3.6585365853658535e-05, "loss": 4.5865, "step": 30 }, { "epoch": 0.03931203931203931, "grad_norm": 3.092682123184204, "learning_rate": 3.9024390243902444e-05, "loss": 4.4673, "step": 32 }, { "epoch": 0.04176904176904177, "grad_norm": 2.843824863433838, "learning_rate": 4.146341463414634e-05, "loss": 4.4617, "step": 34 }, { "epoch": 0.044226044226044224, "grad_norm": 3.029207229614258, "learning_rate": 4.390243902439025e-05, "loss": 4.394, "step": 36 }, { "epoch": 0.04668304668304668, "grad_norm": 2.6937527656555176, "learning_rate": 4.634146341463415e-05, "loss": 4.1312, "step": 38 }, { "epoch": 0.04914004914004914, "grad_norm": 205.6438751220703, "learning_rate": 4.878048780487805e-05, "loss": 4.1717, "step": 40 }, { "epoch": 0.051597051597051594, "grad_norm": 2.583071708679199, "learning_rate": 5.121951219512195e-05, "loss": 4.2337, "step": 42 }, { "epoch": 0.05405405405405406, "grad_norm": 2.7350614070892334, "learning_rate": 5.365853658536586e-05, "loss": 4.2223, "step": 44 }, { "epoch": 0.056511056511056514, "grad_norm": 2.6410160064697266, "learning_rate": 5.6097560975609764e-05, "loss": 4.0693, "step": 46 }, { "epoch": 0.05896805896805897, "grad_norm": 2.716932535171509, "learning_rate": 5.853658536585366e-05, "loss": 4.0604, "step": 48 }, { "epoch": 0.06142506142506143, "grad_norm": 2.662912368774414, "learning_rate": 6.097560975609756e-05, "loss": 4.106, "step": 50 }, { "epoch": 0.06388206388206388, "grad_norm": 2.6940219402313232, "learning_rate": 6.341463414634146e-05, "loss": 3.9797, "step": 52 }, { "epoch": 0.06633906633906633, "grad_norm": 2.474919319152832, "learning_rate": 6.585365853658538e-05, "loss": 3.9617, "step": 54 }, { "epoch": 0.0687960687960688, "grad_norm": 3.2239887714385986, "learning_rate": 6.829268292682928e-05, "loss": 3.8198, "step": 56 }, { "epoch": 0.07125307125307126, "grad_norm": 2.245703935623169, "learning_rate": 7.073170731707317e-05, "loss": 3.7654, "step": 58 }, { "epoch": 0.07371007371007371, "grad_norm": 2.289674758911133, "learning_rate": 7.317073170731707e-05, "loss": 3.9226, "step": 60 }, { "epoch": 0.07616707616707617, "grad_norm": 2.497066020965576, "learning_rate": 7.560975609756099e-05, "loss": 3.8096, "step": 62 }, { "epoch": 0.07862407862407862, "grad_norm": 2.301783800125122, "learning_rate": 7.804878048780489e-05, "loss": 3.7816, "step": 64 }, { "epoch": 0.08108108108108109, "grad_norm": 2.323812484741211, "learning_rate": 8.048780487804879e-05, "loss": 3.852, "step": 66 }, { "epoch": 0.08353808353808354, "grad_norm": 2.500802755355835, "learning_rate": 8.292682926829268e-05, "loss": 3.6985, "step": 68 }, { "epoch": 0.085995085995086, "grad_norm": 2.634605646133423, "learning_rate": 8.53658536585366e-05, "loss": 3.6143, "step": 70 }, { "epoch": 0.08845208845208845, "grad_norm": 2.6327457427978516, "learning_rate": 8.78048780487805e-05, "loss": 3.6925, "step": 72 }, { "epoch": 0.09090909090909091, "grad_norm": 2.969693899154663, "learning_rate": 9.02439024390244e-05, "loss": 3.7667, "step": 74 }, { "epoch": 0.09336609336609336, "grad_norm": 2.198855400085449, "learning_rate": 9.26829268292683e-05, "loss": 3.4533, "step": 76 }, { "epoch": 0.09582309582309582, "grad_norm": 2.361680030822754, "learning_rate": 9.51219512195122e-05, "loss": 3.592, "step": 78 }, { "epoch": 0.09828009828009827, "grad_norm": 2.2100822925567627, "learning_rate": 9.75609756097561e-05, "loss": 3.6584, "step": 80 }, { "epoch": 0.10073710073710074, "grad_norm": 2.6485509872436523, "learning_rate": 0.0001, "loss": 3.7041, "step": 82 }, { "epoch": 0.10319410319410319, "grad_norm": 2.305530548095703, "learning_rate": 9.999958706645134e-05, "loss": 3.4613, "step": 84 }, { "epoch": 0.10565110565110565, "grad_norm": 2.2900867462158203, "learning_rate": 9.999834827262588e-05, "loss": 3.5832, "step": 86 }, { "epoch": 0.10810810810810811, "grad_norm": 2.804309368133545, "learning_rate": 9.999628363898526e-05, "loss": 3.6276, "step": 88 }, { "epoch": 0.11056511056511056, "grad_norm": 2.169964075088501, "learning_rate": 9.999339319963168e-05, "loss": 3.6038, "step": 90 }, { "epoch": 0.11302211302211303, "grad_norm": 1.9803478717803955, "learning_rate": 9.998967700230757e-05, "loss": 3.6076, "step": 92 }, { "epoch": 0.11547911547911548, "grad_norm": 2.2166459560394287, "learning_rate": 9.998513510839458e-05, "loss": 3.5336, "step": 94 }, { "epoch": 0.11793611793611794, "grad_norm": 2.4060072898864746, "learning_rate": 9.997976759291276e-05, "loss": 3.4553, "step": 96 }, { "epoch": 0.12039312039312039, "grad_norm": 2.0433080196380615, "learning_rate": 9.997357454451919e-05, "loss": 3.5237, "step": 98 }, { "epoch": 0.12285012285012285, "grad_norm": 14.874445915222168, "learning_rate": 9.996655606550656e-05, "loss": 3.5126, "step": 100 }, { "epoch": 0.12530712530712532, "grad_norm": 2.1172244548797607, "learning_rate": 9.99587122718015e-05, "loss": 3.6411, "step": 102 }, { "epoch": 0.12776412776412777, "grad_norm": 2.2137668132781982, "learning_rate": 9.995004329296263e-05, "loss": 3.7789, "step": 104 }, { "epoch": 0.13022113022113022, "grad_norm": 2.1240861415863037, "learning_rate": 9.994054927217842e-05, "loss": 3.5804, "step": 106 }, { "epoch": 0.13267813267813267, "grad_norm": 2.106127977371216, "learning_rate": 9.993023036626488e-05, "loss": 3.5019, "step": 108 }, { "epoch": 0.13513513513513514, "grad_norm": 2.0348684787750244, "learning_rate": 9.99190867456629e-05, "loss": 3.5321, "step": 110 }, { "epoch": 0.1375921375921376, "grad_norm": 2.0568437576293945, "learning_rate": 9.990711859443546e-05, "loss": 3.3903, "step": 112 }, { "epoch": 0.14004914004914004, "grad_norm": 2.1058425903320312, "learning_rate": 9.989432611026464e-05, "loss": 3.2349, "step": 114 }, { "epoch": 0.14250614250614252, "grad_norm": 2.5870158672332764, "learning_rate": 9.988070950444823e-05, "loss": 3.3838, "step": 116 }, { "epoch": 0.14496314496314497, "grad_norm": 1.9624409675598145, "learning_rate": 9.986626900189641e-05, "loss": 3.4498, "step": 118 }, { "epoch": 0.14742014742014742, "grad_norm": 2.063462972640991, "learning_rate": 9.985100484112785e-05, "loss": 3.3871, "step": 120 }, { "epoch": 0.14987714987714987, "grad_norm": 2.190028667449951, "learning_rate": 9.983491727426598e-05, "loss": 3.3708, "step": 122 }, { "epoch": 0.15233415233415235, "grad_norm": 1.8697468042373657, "learning_rate": 9.981800656703457e-05, "loss": 3.5375, "step": 124 }, { "epoch": 0.1547911547911548, "grad_norm": 2.1959290504455566, "learning_rate": 9.980027299875358e-05, "loss": 3.4274, "step": 126 }, { "epoch": 0.15724815724815724, "grad_norm": 1.9716213941574097, "learning_rate": 9.978171686233445e-05, "loss": 3.2983, "step": 128 }, { "epoch": 0.1597051597051597, "grad_norm": 2.224968910217285, "learning_rate": 9.97623384642752e-05, "loss": 3.3369, "step": 130 }, { "epoch": 0.16216216216216217, "grad_norm": 1.896340012550354, "learning_rate": 9.974213812465547e-05, "loss": 3.4335, "step": 132 }, { "epoch": 0.16461916461916462, "grad_norm": 2.3304531574249268, "learning_rate": 9.972111617713116e-05, "loss": 3.2502, "step": 134 }, { "epoch": 0.16707616707616707, "grad_norm": 2.0420844554901123, "learning_rate": 9.969927296892898e-05, "loss": 3.4312, "step": 136 }, { "epoch": 0.16953316953316952, "grad_norm": 2.021531105041504, "learning_rate": 9.967660886084066e-05, "loss": 3.2728, "step": 138 }, { "epoch": 0.171990171990172, "grad_norm": 2.0841152667999268, "learning_rate": 9.965312422721704e-05, "loss": 3.1727, "step": 140 }, { "epoch": 0.17444717444717445, "grad_norm": 2.6040847301483154, "learning_rate": 9.962881945596184e-05, "loss": 3.3739, "step": 142 }, { "epoch": 0.1769041769041769, "grad_norm": 15.763100624084473, "learning_rate": 9.960369494852525e-05, "loss": 3.1624, "step": 144 }, { "epoch": 0.17936117936117937, "grad_norm": 2.2386856079101562, "learning_rate": 9.95777511198974e-05, "loss": 3.2584, "step": 146 }, { "epoch": 0.18181818181818182, "grad_norm": 2.857926607131958, "learning_rate": 9.955098839860133e-05, "loss": 3.4066, "step": 148 }, { "epoch": 0.18427518427518427, "grad_norm": 2.4381282329559326, "learning_rate": 9.952340722668609e-05, "loss": 3.2631, "step": 150 }, { "epoch": 0.18673218673218672, "grad_norm": 2.13820481300354, "learning_rate": 9.949500805971932e-05, "loss": 3.3381, "step": 152 }, { "epoch": 0.1891891891891892, "grad_norm": 2.0036275386810303, "learning_rate": 9.946579136677978e-05, "loss": 3.3253, "step": 154 }, { "epoch": 0.19164619164619165, "grad_norm": 2.3847107887268066, "learning_rate": 9.943575763044955e-05, "loss": 3.1658, "step": 156 }, { "epoch": 0.1941031941031941, "grad_norm": 1.883772373199463, "learning_rate": 9.940490734680614e-05, "loss": 3.1963, "step": 158 }, { "epoch": 0.19656019656019655, "grad_norm": 1.8654661178588867, "learning_rate": 9.937324102541423e-05, "loss": 3.2319, "step": 160 }, { "epoch": 0.19901719901719903, "grad_norm": 1.8905117511749268, "learning_rate": 9.93407591893173e-05, "loss": 3.3699, "step": 162 }, { "epoch": 0.20147420147420148, "grad_norm": 1.8974312543869019, "learning_rate": 9.930746237502892e-05, "loss": 3.2576, "step": 164 }, { "epoch": 0.20393120393120392, "grad_norm": 1.9088815450668335, "learning_rate": 9.927335113252396e-05, "loss": 3.3929, "step": 166 }, { "epoch": 0.20638820638820637, "grad_norm": 1.8603475093841553, "learning_rate": 9.923842602522949e-05, "loss": 3.2274, "step": 168 }, { "epoch": 0.20884520884520885, "grad_norm": 1.8516125679016113, "learning_rate": 9.920268763001542e-05, "loss": 3.1596, "step": 170 }, { "epoch": 0.2113022113022113, "grad_norm": 1.9465285539627075, "learning_rate": 9.916613653718509e-05, "loss": 3.2853, "step": 172 }, { "epoch": 0.21375921375921375, "grad_norm": 1.9433836936950684, "learning_rate": 9.912877335046535e-05, "loss": 3.1986, "step": 174 }, { "epoch": 0.21621621621621623, "grad_norm": 1.9821792840957642, "learning_rate": 9.909059868699678e-05, "loss": 3.1233, "step": 176 }, { "epoch": 0.21867321867321868, "grad_norm": 1.904826283454895, "learning_rate": 9.905161317732331e-05, "loss": 3.221, "step": 178 }, { "epoch": 0.22113022113022113, "grad_norm": 1.9415316581726074, "learning_rate": 9.901181746538196e-05, "loss": 3.1107, "step": 180 }, { "epoch": 0.22358722358722358, "grad_norm": 1.8300875425338745, "learning_rate": 9.897121220849208e-05, "loss": 3.1114, "step": 182 }, { "epoch": 0.22604422604422605, "grad_norm": 1.972747802734375, "learning_rate": 9.892979807734462e-05, "loss": 3.1652, "step": 184 }, { "epoch": 0.2285012285012285, "grad_norm": 1.9549387693405151, "learning_rate": 9.888757575599093e-05, "loss": 3.2123, "step": 186 }, { "epoch": 0.23095823095823095, "grad_norm": 1.856306552886963, "learning_rate": 9.884454594183154e-05, "loss": 3.235, "step": 188 }, { "epoch": 0.2334152334152334, "grad_norm": 1.743513584136963, "learning_rate": 9.880070934560458e-05, "loss": 3.1647, "step": 190 }, { "epoch": 0.23587223587223588, "grad_norm": 1.884634256362915, "learning_rate": 9.875606669137412e-05, "loss": 3.1963, "step": 192 }, { "epoch": 0.23832923832923833, "grad_norm": 1.966579556465149, "learning_rate": 9.871061871651815e-05, "loss": 2.9626, "step": 194 }, { "epoch": 0.24078624078624078, "grad_norm": 1.7920762300491333, "learning_rate": 9.866436617171638e-05, "loss": 3.3341, "step": 196 }, { "epoch": 0.24324324324324326, "grad_norm": 1.9862432479858398, "learning_rate": 9.861730982093793e-05, "loss": 3.2039, "step": 198 }, { "epoch": 0.2457002457002457, "grad_norm": 1.7585490942001343, "learning_rate": 9.856945044142865e-05, "loss": 3.1915, "step": 200 }, { "epoch": 0.24815724815724816, "grad_norm": 1.7467507123947144, "learning_rate": 9.852078882369827e-05, "loss": 3.1105, "step": 202 }, { "epoch": 0.25061425061425063, "grad_norm": 1.9189780950546265, "learning_rate": 9.847132577150733e-05, "loss": 3.1065, "step": 204 }, { "epoch": 0.25307125307125306, "grad_norm": 1.8920857906341553, "learning_rate": 9.842106210185403e-05, "loss": 3.1573, "step": 206 }, { "epoch": 0.25552825552825553, "grad_norm": 1.7997002601623535, "learning_rate": 9.836999864496057e-05, "loss": 3.0638, "step": 208 }, { "epoch": 0.257985257985258, "grad_norm": 1.8181427717208862, "learning_rate": 9.831813624425952e-05, "loss": 3.166, "step": 210 }, { "epoch": 0.26044226044226043, "grad_norm": 1.7454043626785278, "learning_rate": 9.82654757563799e-05, "loss": 2.9648, "step": 212 }, { "epoch": 0.2628992628992629, "grad_norm": 1.9787805080413818, "learning_rate": 9.821201805113298e-05, "loss": 3.0446, "step": 214 }, { "epoch": 0.26535626535626533, "grad_norm": 1.8889926671981812, "learning_rate": 9.815776401149796e-05, "loss": 3.1998, "step": 216 }, { "epoch": 0.2678132678132678, "grad_norm": 1.9100897312164307, "learning_rate": 9.810271453360738e-05, "loss": 3.0167, "step": 218 }, { "epoch": 0.2702702702702703, "grad_norm": 1.72272527217865, "learning_rate": 9.804687052673229e-05, "loss": 3.0516, "step": 220 }, { "epoch": 0.2727272727272727, "grad_norm": 1.7406550645828247, "learning_rate": 9.799023291326722e-05, "loss": 3.1492, "step": 222 }, { "epoch": 0.2751842751842752, "grad_norm": 1.7723503112792969, "learning_rate": 9.793280262871502e-05, "loss": 3.1282, "step": 224 }, { "epoch": 0.27764127764127766, "grad_norm": 1.7942124605178833, "learning_rate": 9.787458062167134e-05, "loss": 3.0463, "step": 226 }, { "epoch": 0.2800982800982801, "grad_norm": 1.8124969005584717, "learning_rate": 9.781556785380899e-05, "loss": 3.295, "step": 228 }, { "epoch": 0.28255528255528256, "grad_norm": 1.7787127494812012, "learning_rate": 9.775576529986199e-05, "loss": 3.3415, "step": 230 }, { "epoch": 0.28501228501228504, "grad_norm": 1.967888355255127, "learning_rate": 9.769517394760962e-05, "loss": 2.9891, "step": 232 }, { "epoch": 0.28746928746928746, "grad_norm": 1.9067909717559814, "learning_rate": 9.763379479785995e-05, "loss": 3.1963, "step": 234 }, { "epoch": 0.28992628992628994, "grad_norm": 1.8959527015686035, "learning_rate": 9.757162886443336e-05, "loss": 3.0906, "step": 236 }, { "epoch": 0.29238329238329236, "grad_norm": 1.756986141204834, "learning_rate": 9.750867717414586e-05, "loss": 3.1538, "step": 238 }, { "epoch": 0.29484029484029484, "grad_norm": 1.7642282247543335, "learning_rate": 9.744494076679205e-05, "loss": 3.0886, "step": 240 }, { "epoch": 0.2972972972972973, "grad_norm": 1.8129557371139526, "learning_rate": 9.738042069512795e-05, "loss": 3.2687, "step": 242 }, { "epoch": 0.29975429975429974, "grad_norm": 1.6599668264389038, "learning_rate": 9.731511802485364e-05, "loss": 3.0165, "step": 244 }, { "epoch": 0.3022113022113022, "grad_norm": 1.7890832424163818, "learning_rate": 9.724903383459566e-05, "loss": 3.1224, "step": 246 }, { "epoch": 0.3046683046683047, "grad_norm": 1.901267170906067, "learning_rate": 9.718216921588919e-05, "loss": 2.9377, "step": 248 }, { "epoch": 0.3071253071253071, "grad_norm": 1.8047585487365723, "learning_rate": 9.711452527315998e-05, "loss": 3.2727, "step": 250 }, { "epoch": 0.3095823095823096, "grad_norm": 1.9266289472579956, "learning_rate": 9.704610312370617e-05, "loss": 3.0795, "step": 252 }, { "epoch": 0.31203931203931207, "grad_norm": 1.8360109329223633, "learning_rate": 9.697690389767981e-05, "loss": 3.1791, "step": 254 }, { "epoch": 0.3144963144963145, "grad_norm": 1.7929311990737915, "learning_rate": 9.690692873806816e-05, "loss": 3.1926, "step": 256 }, { "epoch": 0.31695331695331697, "grad_norm": 1.7981804609298706, "learning_rate": 9.683617880067489e-05, "loss": 3.2032, "step": 258 }, { "epoch": 0.3194103194103194, "grad_norm": 1.8404676914215088, "learning_rate": 9.676465525410088e-05, "loss": 3.1692, "step": 260 }, { "epoch": 0.32186732186732187, "grad_norm": 1.7135553359985352, "learning_rate": 9.669235927972502e-05, "loss": 3.2311, "step": 262 }, { "epoch": 0.32432432432432434, "grad_norm": 1.8173389434814453, "learning_rate": 9.661929207168463e-05, "loss": 2.938, "step": 264 }, { "epoch": 0.32678132678132676, "grad_norm": 1.8518667221069336, "learning_rate": 9.654545483685578e-05, "loss": 2.9593, "step": 266 }, { "epoch": 0.32923832923832924, "grad_norm": 1.7250562906265259, "learning_rate": 9.647084879483332e-05, "loss": 3.2025, "step": 268 }, { "epoch": 0.3316953316953317, "grad_norm": 1.7107222080230713, "learning_rate": 9.639547517791076e-05, "loss": 3.0682, "step": 270 }, { "epoch": 0.33415233415233414, "grad_norm": 1.6773762702941895, "learning_rate": 9.631933523105991e-05, "loss": 2.99, "step": 272 }, { "epoch": 0.3366093366093366, "grad_norm": 1.758596420288086, "learning_rate": 9.624243021191029e-05, "loss": 2.9892, "step": 274 }, { "epoch": 0.33906633906633904, "grad_norm": 1.8285568952560425, "learning_rate": 9.61647613907284e-05, "loss": 3.0917, "step": 276 }, { "epoch": 0.3415233415233415, "grad_norm": 1.8026080131530762, "learning_rate": 9.608633005039675e-05, "loss": 3.0702, "step": 278 }, { "epoch": 0.343980343980344, "grad_norm": 11.102814674377441, "learning_rate": 9.600713748639258e-05, "loss": 3.1327, "step": 280 }, { "epoch": 0.3464373464373464, "grad_norm": 1.9404098987579346, "learning_rate": 9.592718500676656e-05, "loss": 3.0348, "step": 282 }, { "epoch": 0.3488943488943489, "grad_norm": 1.7170964479446411, "learning_rate": 9.584647393212113e-05, "loss": 3.0137, "step": 284 }, { "epoch": 0.35135135135135137, "grad_norm": 2.194800615310669, "learning_rate": 9.576500559558869e-05, "loss": 3.0657, "step": 286 }, { "epoch": 0.3538083538083538, "grad_norm": 2.2819526195526123, "learning_rate": 9.568278134280966e-05, "loss": 2.9922, "step": 288 }, { "epoch": 0.35626535626535627, "grad_norm": 1.764850378036499, "learning_rate": 9.55998025319101e-05, "loss": 3.0742, "step": 290 }, { "epoch": 0.35872235872235875, "grad_norm": 1.675075650215149, "learning_rate": 9.551607053347942e-05, "loss": 3.14, "step": 292 }, { "epoch": 0.36117936117936117, "grad_norm": 1.730245590209961, "learning_rate": 9.543158673054767e-05, "loss": 2.9738, "step": 294 }, { "epoch": 0.36363636363636365, "grad_norm": 1.6660993099212646, "learning_rate": 9.534635251856267e-05, "loss": 2.7211, "step": 296 }, { "epoch": 0.36609336609336607, "grad_norm": 1.6620466709136963, "learning_rate": 9.526036930536712e-05, "loss": 2.9747, "step": 298 }, { "epoch": 0.36855036855036855, "grad_norm": 1.6838281154632568, "learning_rate": 9.517363851117512e-05, "loss": 3.0342, "step": 300 }, { "epoch": 0.371007371007371, "grad_norm": 3.0981221199035645, "learning_rate": 9.508616156854883e-05, "loss": 3.1151, "step": 302 }, { "epoch": 0.37346437346437344, "grad_norm": 1.954408049583435, "learning_rate": 9.499793992237485e-05, "loss": 2.7723, "step": 304 }, { "epoch": 0.3759213759213759, "grad_norm": 1.7550610303878784, "learning_rate": 9.490897502984028e-05, "loss": 2.9576, "step": 306 }, { "epoch": 0.3783783783783784, "grad_norm": 1.854491114616394, "learning_rate": 9.481926836040866e-05, "loss": 2.9745, "step": 308 }, { "epoch": 0.3808353808353808, "grad_norm": 1.6988489627838135, "learning_rate": 9.472882139579572e-05, "loss": 3.1205, "step": 310 }, { "epoch": 0.3832923832923833, "grad_norm": 1.682507038116455, "learning_rate": 9.463763562994491e-05, "loss": 3.0174, "step": 312 }, { "epoch": 0.3857493857493858, "grad_norm": 1.7135891914367676, "learning_rate": 9.454571256900272e-05, "loss": 2.8791, "step": 314 }, { "epoch": 0.3882063882063882, "grad_norm": 1.6604820489883423, "learning_rate": 9.445305373129375e-05, "loss": 2.9726, "step": 316 }, { "epoch": 0.3906633906633907, "grad_norm": 1.8734002113342285, "learning_rate": 9.435966064729574e-05, "loss": 3.2479, "step": 318 }, { "epoch": 0.3931203931203931, "grad_norm": 2.204241991043091, "learning_rate": 9.426553485961415e-05, "loss": 3.092, "step": 320 }, { "epoch": 0.3955773955773956, "grad_norm": 1.7275304794311523, "learning_rate": 9.417067792295684e-05, "loss": 2.9288, "step": 322 }, { "epoch": 0.39803439803439805, "grad_norm": 1.720664381980896, "learning_rate": 9.407509140410826e-05, "loss": 2.9628, "step": 324 }, { "epoch": 0.4004914004914005, "grad_norm": 1.6769790649414062, "learning_rate": 9.397877688190362e-05, "loss": 2.8215, "step": 326 }, { "epoch": 0.40294840294840295, "grad_norm": 2.712700843811035, "learning_rate": 9.388173594720281e-05, "loss": 2.9122, "step": 328 }, { "epoch": 0.40540540540540543, "grad_norm": 1.8222036361694336, "learning_rate": 9.378397020286417e-05, "loss": 3.0608, "step": 330 }, { "epoch": 0.40786240786240785, "grad_norm": 1.794284462928772, "learning_rate": 9.368548126371788e-05, "loss": 2.994, "step": 332 }, { "epoch": 0.4103194103194103, "grad_norm": 1.6459068059921265, "learning_rate": 9.358627075653946e-05, "loss": 2.9682, "step": 334 }, { "epoch": 0.41277641277641275, "grad_norm": 1.6979544162750244, "learning_rate": 9.348634032002277e-05, "loss": 3.0303, "step": 336 }, { "epoch": 0.4152334152334152, "grad_norm": 1.724778175354004, "learning_rate": 9.338569160475299e-05, "loss": 2.7861, "step": 338 }, { "epoch": 0.4176904176904177, "grad_norm": 1.737809658050537, "learning_rate": 9.328432627317938e-05, "loss": 2.9365, "step": 340 }, { "epoch": 0.4201474201474201, "grad_norm": 1.6635717153549194, "learning_rate": 9.318224599958778e-05, "loss": 2.9818, "step": 342 }, { "epoch": 0.4226044226044226, "grad_norm": 1.696031093597412, "learning_rate": 9.307945247007299e-05, "loss": 3.0177, "step": 344 }, { "epoch": 0.4250614250614251, "grad_norm": 1.7607593536376953, "learning_rate": 9.297594738251086e-05, "loss": 3.0018, "step": 346 }, { "epoch": 0.4275184275184275, "grad_norm": 1.6658580303192139, "learning_rate": 9.287173244653032e-05, "loss": 2.9692, "step": 348 }, { "epoch": 0.42997542997543, "grad_norm": 1.7650336027145386, "learning_rate": 9.276680938348512e-05, "loss": 2.9828, "step": 350 }, { "epoch": 0.43243243243243246, "grad_norm": 1.6512577533721924, "learning_rate": 9.266117992642536e-05, "loss": 2.9751, "step": 352 }, { "epoch": 0.4348894348894349, "grad_norm": 1.7206581830978394, "learning_rate": 9.25548458200689e-05, "loss": 2.948, "step": 354 }, { "epoch": 0.43734643734643736, "grad_norm": 1.7989583015441895, "learning_rate": 9.244780882077254e-05, "loss": 2.6979, "step": 356 }, { "epoch": 0.4398034398034398, "grad_norm": 1.7424843311309814, "learning_rate": 9.2340070696503e-05, "loss": 3.0316, "step": 358 }, { "epoch": 0.44226044226044225, "grad_norm": 1.726643443107605, "learning_rate": 9.223163322680772e-05, "loss": 2.9751, "step": 360 }, { "epoch": 0.44471744471744473, "grad_norm": 1.670101523399353, "learning_rate": 9.212249820278545e-05, "loss": 2.8993, "step": 362 }, { "epoch": 0.44717444717444715, "grad_norm": 1.6568448543548584, "learning_rate": 9.201266742705672e-05, "loss": 3.1154, "step": 364 }, { "epoch": 0.44963144963144963, "grad_norm": 1.776298999786377, "learning_rate": 9.190214271373398e-05, "loss": 2.8871, "step": 366 }, { "epoch": 0.4520884520884521, "grad_norm": 1.67928147315979, "learning_rate": 9.179092588839178e-05, "loss": 2.7472, "step": 368 }, { "epoch": 0.45454545454545453, "grad_norm": 1.657324194908142, "learning_rate": 9.167901878803638e-05, "loss": 2.8401, "step": 370 }, { "epoch": 0.457002457002457, "grad_norm": 1.7573364973068237, "learning_rate": 9.156642326107565e-05, "loss": 2.91, "step": 372 }, { "epoch": 0.4594594594594595, "grad_norm": 1.6229298114776611, "learning_rate": 9.145314116728841e-05, "loss": 3.0171, "step": 374 }, { "epoch": 0.4619164619164619, "grad_norm": 1.7045820951461792, "learning_rate": 9.133917437779375e-05, "loss": 2.9076, "step": 376 }, { "epoch": 0.4643734643734644, "grad_norm": 1.5423696041107178, "learning_rate": 9.12245247750201e-05, "loss": 2.9438, "step": 378 }, { "epoch": 0.4668304668304668, "grad_norm": 1.6251273155212402, "learning_rate": 9.110919425267415e-05, "loss": 2.998, "step": 380 }, { "epoch": 0.4692874692874693, "grad_norm": 1.7428491115570068, "learning_rate": 9.099318471570957e-05, "loss": 2.9253, "step": 382 }, { "epoch": 0.47174447174447176, "grad_norm": 1.7721710205078125, "learning_rate": 9.087649808029554e-05, "loss": 2.8592, "step": 384 }, { "epoch": 0.4742014742014742, "grad_norm": 1.8491050004959106, "learning_rate": 9.075913627378513e-05, "loss": 2.994, "step": 386 }, { "epoch": 0.47665847665847666, "grad_norm": 1.6777352094650269, "learning_rate": 9.064110123468345e-05, "loss": 2.9855, "step": 388 }, { "epoch": 0.47911547911547914, "grad_norm": 1.717166781425476, "learning_rate": 9.052239491261559e-05, "loss": 3.0357, "step": 390 }, { "epoch": 0.48157248157248156, "grad_norm": 1.6935992240905762, "learning_rate": 9.040301926829445e-05, "loss": 3.0939, "step": 392 }, { "epoch": 0.48402948402948404, "grad_norm": 1.7372186183929443, "learning_rate": 9.028297627348835e-05, "loss": 2.7582, "step": 394 }, { "epoch": 0.4864864864864865, "grad_norm": 1.6863627433776855, "learning_rate": 9.016226791098851e-05, "loss": 3.0831, "step": 396 }, { "epoch": 0.48894348894348894, "grad_norm": 1.5812780857086182, "learning_rate": 9.004089617457625e-05, "loss": 2.8814, "step": 398 }, { "epoch": 0.4914004914004914, "grad_norm": 1.6280958652496338, "learning_rate": 8.991886306899002e-05, "loss": 3.0318, "step": 400 }, { "epoch": 0.49385749385749383, "grad_norm": 1.703506588935852, "learning_rate": 8.979617060989234e-05, "loss": 3.1265, "step": 402 }, { "epoch": 0.4963144963144963, "grad_norm": 1.596043348312378, "learning_rate": 8.967282082383652e-05, "loss": 3.0872, "step": 404 }, { "epoch": 0.4987714987714988, "grad_norm": 1.8576503992080688, "learning_rate": 8.954881574823317e-05, "loss": 2.9286, "step": 406 }, { "epoch": 0.5012285012285013, "grad_norm": 1.8761273622512817, "learning_rate": 8.942415743131651e-05, "loss": 2.8426, "step": 408 }, { "epoch": 0.5036855036855037, "grad_norm": 1.6055220365524292, "learning_rate": 8.92988479321106e-05, "loss": 2.7407, "step": 410 }, { "epoch": 0.5061425061425061, "grad_norm": 1.5771170854568481, "learning_rate": 8.917288932039529e-05, "loss": 3.0324, "step": 412 }, { "epoch": 0.5085995085995086, "grad_norm": 1.5361248254776, "learning_rate": 8.904628367667202e-05, "loss": 2.901, "step": 414 }, { "epoch": 0.5110565110565111, "grad_norm": 1.662143588066101, "learning_rate": 8.891903309212952e-05, "loss": 2.8503, "step": 416 }, { "epoch": 0.5135135135135135, "grad_norm": 1.6230075359344482, "learning_rate": 8.87911396686092e-05, "loss": 2.7839, "step": 418 }, { "epoch": 0.515970515970516, "grad_norm": 1.7319310903549194, "learning_rate": 8.866260551857045e-05, "loss": 2.9908, "step": 420 }, { "epoch": 0.5184275184275184, "grad_norm": 1.6889699697494507, "learning_rate": 8.853343276505581e-05, "loss": 2.7828, "step": 422 }, { "epoch": 0.5208845208845209, "grad_norm": 11.279559135437012, "learning_rate": 8.840362354165581e-05, "loss": 2.8784, "step": 424 }, { "epoch": 0.5233415233415234, "grad_norm": 1.7862719297409058, "learning_rate": 8.827317999247378e-05, "loss": 2.9174, "step": 426 }, { "epoch": 0.5257985257985258, "grad_norm": 1.6923807859420776, "learning_rate": 8.81421042720904e-05, "loss": 3.0122, "step": 428 }, { "epoch": 0.5282555282555282, "grad_norm": 29.5804386138916, "learning_rate": 8.801039854552821e-05, "loss": 2.9432, "step": 430 }, { "epoch": 0.5307125307125307, "grad_norm": 1.7361953258514404, "learning_rate": 8.787806498821571e-05, "loss": 2.881, "step": 432 }, { "epoch": 0.5331695331695332, "grad_norm": 1.7489663362503052, "learning_rate": 8.774510578595153e-05, "loss": 2.9763, "step": 434 }, { "epoch": 0.5356265356265356, "grad_norm": 1.5390799045562744, "learning_rate": 8.761152313486824e-05, "loss": 2.9691, "step": 436 }, { "epoch": 0.538083538083538, "grad_norm": 1.8254520893096924, "learning_rate": 8.747731924139622e-05, "loss": 2.7518, "step": 438 }, { "epoch": 0.5405405405405406, "grad_norm": 1.6369190216064453, "learning_rate": 8.734249632222702e-05, "loss": 2.9397, "step": 440 }, { "epoch": 0.542997542997543, "grad_norm": 1.5464283227920532, "learning_rate": 8.720705660427692e-05, "loss": 2.85, "step": 442 }, { "epoch": 0.5454545454545454, "grad_norm": 1.9427014589309692, "learning_rate": 8.707100232465007e-05, "loss": 2.8669, "step": 444 }, { "epoch": 0.547911547911548, "grad_norm": 1.6004284620285034, "learning_rate": 8.69343357306015e-05, "loss": 2.8216, "step": 446 }, { "epoch": 0.5503685503685504, "grad_norm": 1.615379810333252, "learning_rate": 8.67970590795001e-05, "loss": 2.6078, "step": 448 }, { "epoch": 0.5528255528255528, "grad_norm": 1.7116811275482178, "learning_rate": 8.665917463879125e-05, "loss": 3.1718, "step": 450 }, { "epoch": 0.5552825552825553, "grad_norm": 1.7578415870666504, "learning_rate": 8.65206846859594e-05, "loss": 2.9675, "step": 452 }, { "epoch": 0.5577395577395577, "grad_norm": 1.6257036924362183, "learning_rate": 8.638159150849046e-05, "loss": 3.0276, "step": 454 }, { "epoch": 0.5601965601965602, "grad_norm": 1.6307907104492188, "learning_rate": 8.6241897403834e-05, "loss": 2.984, "step": 456 }, { "epoch": 0.5626535626535627, "grad_norm": 1.628524661064148, "learning_rate": 8.610160467936533e-05, "loss": 2.9575, "step": 458 }, { "epoch": 0.5651105651105651, "grad_norm": 1.656004786491394, "learning_rate": 8.596071565234733e-05, "loss": 3.0361, "step": 460 }, { "epoch": 0.5675675675675675, "grad_norm": 1.5190882682800293, "learning_rate": 8.581923264989228e-05, "loss": 2.8684, "step": 462 }, { "epoch": 0.5700245700245701, "grad_norm": 1.6448097229003906, "learning_rate": 8.567715800892326e-05, "loss": 3.0113, "step": 464 }, { "epoch": 0.5724815724815725, "grad_norm": 1.7265264987945557, "learning_rate": 8.553449407613572e-05, "loss": 2.9945, "step": 466 }, { "epoch": 0.5749385749385749, "grad_norm": 7.519404411315918, "learning_rate": 8.539124320795862e-05, "loss": 3.0108, "step": 468 }, { "epoch": 0.5773955773955773, "grad_norm": 1.586311936378479, "learning_rate": 8.524740777051555e-05, "loss": 2.7747, "step": 470 }, { "epoch": 0.5798525798525799, "grad_norm": 1.7437552213668823, "learning_rate": 8.510299013958558e-05, "loss": 3.0769, "step": 472 }, { "epoch": 0.5823095823095823, "grad_norm": 1.757537603378296, "learning_rate": 8.495799270056412e-05, "loss": 2.9696, "step": 474 }, { "epoch": 0.5847665847665847, "grad_norm": 1.5627938508987427, "learning_rate": 8.481241784842344e-05, "loss": 2.7725, "step": 476 }, { "epoch": 0.5872235872235873, "grad_norm": 1.6397974491119385, "learning_rate": 8.466626798767318e-05, "loss": 3.0094, "step": 478 }, { "epoch": 0.5896805896805897, "grad_norm": 1.6290757656097412, "learning_rate": 8.451954553232055e-05, "loss": 3.0959, "step": 480 }, { "epoch": 0.5921375921375921, "grad_norm": 1.628265619277954, "learning_rate": 8.437225290583051e-05, "loss": 2.7791, "step": 482 }, { "epoch": 0.5945945945945946, "grad_norm": 1.6443836688995361, "learning_rate": 8.422439254108576e-05, "loss": 2.9753, "step": 484 }, { "epoch": 0.597051597051597, "grad_norm": 1.617613673210144, "learning_rate": 8.407596688034648e-05, "loss": 3.0032, "step": 486 }, { "epoch": 0.5995085995085995, "grad_norm": 1.6962013244628906, "learning_rate": 8.392697837521007e-05, "loss": 2.845, "step": 488 }, { "epoch": 0.601965601965602, "grad_norm": 1.5888972282409668, "learning_rate": 8.37774294865706e-05, "loss": 2.7606, "step": 490 }, { "epoch": 0.6044226044226044, "grad_norm": 1.5973700284957886, "learning_rate": 8.362732268457824e-05, "loss": 3.1137, "step": 492 }, { "epoch": 0.6068796068796068, "grad_norm": 27.63825035095215, "learning_rate": 8.347666044859833e-05, "loss": 2.8452, "step": 494 }, { "epoch": 0.6093366093366094, "grad_norm": 1.592497706413269, "learning_rate": 8.332544526717057e-05, "loss": 2.8386, "step": 496 }, { "epoch": 0.6117936117936118, "grad_norm": 1.5605237483978271, "learning_rate": 8.317367963796778e-05, "loss": 2.9155, "step": 498 }, { "epoch": 0.6142506142506142, "grad_norm": 1.5933369398117065, "learning_rate": 8.30213660677548e-05, "loss": 2.8719, "step": 500 }, { "epoch": 0.6167076167076168, "grad_norm": 1.649196743965149, "learning_rate": 8.286850707234691e-05, "loss": 2.8277, "step": 502 }, { "epoch": 0.6191646191646192, "grad_norm": 1.671309232711792, "learning_rate": 8.271510517656845e-05, "loss": 2.8637, "step": 504 }, { "epoch": 0.6216216216216216, "grad_norm": 1.6627384424209595, "learning_rate": 8.256116291421094e-05, "loss": 3.0589, "step": 506 }, { "epoch": 0.6240786240786241, "grad_norm": 1.6332685947418213, "learning_rate": 8.24066828279914e-05, "loss": 2.8909, "step": 508 }, { "epoch": 0.6265356265356266, "grad_norm": 1.5245287418365479, "learning_rate": 8.225166746951023e-05, "loss": 3.0116, "step": 510 }, { "epoch": 0.628992628992629, "grad_norm": 1.6836432218551636, "learning_rate": 8.209611939920912e-05, "loss": 2.6435, "step": 512 }, { "epoch": 0.6314496314496314, "grad_norm": 1.5706799030303955, "learning_rate": 8.194004118632873e-05, "loss": 2.8328, "step": 514 }, { "epoch": 0.6339066339066339, "grad_norm": 1.6746190786361694, "learning_rate": 8.178343540886626e-05, "loss": 2.8675, "step": 516 }, { "epoch": 0.6363636363636364, "grad_norm": 1.629543662071228, "learning_rate": 8.162630465353292e-05, "loss": 2.7241, "step": 518 }, { "epoch": 0.6388206388206388, "grad_norm": 1.6499792337417603, "learning_rate": 8.146865151571108e-05, "loss": 2.8229, "step": 520 }, { "epoch": 0.6412776412776413, "grad_norm": 1.7184644937515259, "learning_rate": 8.131047859941156e-05, "loss": 2.77, "step": 522 }, { "epoch": 0.6437346437346437, "grad_norm": 4.247711181640625, "learning_rate": 8.11517885172305e-05, "loss": 2.9487, "step": 524 }, { "epoch": 0.6461916461916462, "grad_norm": 1.6026562452316284, "learning_rate": 8.099258389030624e-05, "loss": 2.6282, "step": 526 }, { "epoch": 0.6486486486486487, "grad_norm": 1.6910419464111328, "learning_rate": 8.083286734827605e-05, "loss": 2.8437, "step": 528 }, { "epoch": 0.6511056511056511, "grad_norm": 2.3516945838928223, "learning_rate": 8.067264152923268e-05, "loss": 2.8406, "step": 530 }, { "epoch": 0.6535626535626535, "grad_norm": 1.7951064109802246, "learning_rate": 8.051190907968076e-05, "loss": 2.7437, "step": 532 }, { "epoch": 0.6560196560196561, "grad_norm": 1.6154842376708984, "learning_rate": 8.035067265449312e-05, "loss": 2.7701, "step": 534 }, { "epoch": 0.6584766584766585, "grad_norm": 1.5919100046157837, "learning_rate": 8.018893491686692e-05, "loss": 2.8796, "step": 536 }, { "epoch": 0.6609336609336609, "grad_norm": 1.6625549793243408, "learning_rate": 8.00266985382797e-05, "loss": 2.8874, "step": 538 }, { "epoch": 0.6633906633906634, "grad_norm": 1.5272475481033325, "learning_rate": 7.986396619844519e-05, "loss": 2.8395, "step": 540 }, { "epoch": 0.6658476658476659, "grad_norm": 1.6629564762115479, "learning_rate": 7.970074058526908e-05, "loss": 2.8472, "step": 542 }, { "epoch": 0.6683046683046683, "grad_norm": 1.5899837017059326, "learning_rate": 7.953702439480468e-05, "loss": 2.5589, "step": 544 }, { "epoch": 0.6707616707616708, "grad_norm": 1.6535660028457642, "learning_rate": 7.937282033120825e-05, "loss": 3.002, "step": 546 }, { "epoch": 0.6732186732186732, "grad_norm": 1.7113556861877441, "learning_rate": 7.920813110669445e-05, "loss": 2.921, "step": 548 }, { "epoch": 0.6756756756756757, "grad_norm": 1.6539169549942017, "learning_rate": 7.904295944149157e-05, "loss": 2.7657, "step": 550 }, { "epoch": 0.6781326781326781, "grad_norm": 1.499321699142456, "learning_rate": 7.887730806379641e-05, "loss": 2.5954, "step": 552 }, { "epoch": 0.6805896805896806, "grad_norm": 1.5506938695907593, "learning_rate": 7.871117970972948e-05, "loss": 2.9977, "step": 554 }, { "epoch": 0.683046683046683, "grad_norm": 1.60562264919281, "learning_rate": 7.854457712328957e-05, "loss": 2.8725, "step": 556 }, { "epoch": 0.6855036855036855, "grad_norm": 1.5586713552474976, "learning_rate": 7.837750305630862e-05, "loss": 2.7757, "step": 558 }, { "epoch": 0.687960687960688, "grad_norm": 1.4952479600906372, "learning_rate": 7.820996026840607e-05, "loss": 2.8109, "step": 560 }, { "epoch": 0.6904176904176904, "grad_norm": 1.5375022888183594, "learning_rate": 7.804195152694347e-05, "loss": 2.8958, "step": 562 }, { "epoch": 0.6928746928746928, "grad_norm": 1.5429644584655762, "learning_rate": 7.787347960697863e-05, "loss": 2.848, "step": 564 }, { "epoch": 0.6953316953316954, "grad_norm": 1.5820600986480713, "learning_rate": 7.77045472912199e-05, "loss": 2.7349, "step": 566 }, { "epoch": 0.6977886977886978, "grad_norm": 1.6391702890396118, "learning_rate": 7.753515736998007e-05, "loss": 2.7536, "step": 568 }, { "epoch": 0.7002457002457002, "grad_norm": 1.8837157487869263, "learning_rate": 7.736531264113041e-05, "loss": 2.5788, "step": 570 }, { "epoch": 0.7027027027027027, "grad_norm": 1.6547040939331055, "learning_rate": 7.719501591005436e-05, "loss": 2.9089, "step": 572 }, { "epoch": 0.7051597051597052, "grad_norm": 1.6191688776016235, "learning_rate": 7.702426998960129e-05, "loss": 2.7423, "step": 574 }, { "epoch": 0.7076167076167076, "grad_norm": 1.63784658908844, "learning_rate": 7.685307770003993e-05, "loss": 2.9405, "step": 576 }, { "epoch": 0.7100737100737101, "grad_norm": 1.7041395902633667, "learning_rate": 7.668144186901189e-05, "loss": 2.7419, "step": 578 }, { "epoch": 0.7125307125307125, "grad_norm": 1.626767635345459, "learning_rate": 7.650936533148485e-05, "loss": 2.917, "step": 580 }, { "epoch": 0.714987714987715, "grad_norm": 1.5805187225341797, "learning_rate": 7.633685092970584e-05, "loss": 2.9039, "step": 582 }, { "epoch": 0.7174447174447175, "grad_norm": 1.5578582286834717, "learning_rate": 7.616390151315422e-05, "loss": 2.7313, "step": 584 }, { "epoch": 0.7199017199017199, "grad_norm": 1.5582646131515503, "learning_rate": 7.599051993849467e-05, "loss": 2.4935, "step": 586 }, { "epoch": 0.7223587223587223, "grad_norm": 1.6257047653198242, "learning_rate": 7.58167090695299e-05, "loss": 2.9326, "step": 588 }, { "epoch": 0.7248157248157249, "grad_norm": 1.534566879272461, "learning_rate": 7.56424717771535e-05, "loss": 2.7481, "step": 590 }, { "epoch": 0.7272727272727273, "grad_norm": 4.033563137054443, "learning_rate": 7.546781093930238e-05, "loss": 2.6916, "step": 592 }, { "epoch": 0.7297297297297297, "grad_norm": 1.7314389944076538, "learning_rate": 7.529272944090935e-05, "loss": 2.8562, "step": 594 }, { "epoch": 0.7321867321867321, "grad_norm": 1.5590124130249023, "learning_rate": 7.511723017385538e-05, "loss": 2.6231, "step": 596 }, { "epoch": 0.7346437346437347, "grad_norm": 1.5711228847503662, "learning_rate": 7.494131603692187e-05, "loss": 2.7342, "step": 598 }, { "epoch": 0.7371007371007371, "grad_norm": 1.6933467388153076, "learning_rate": 7.476498993574277e-05, "loss": 2.8793, "step": 600 }, { "epoch": 0.7395577395577395, "grad_norm": 1.652563214302063, "learning_rate": 7.45882547827566e-05, "loss": 2.8555, "step": 602 }, { "epoch": 0.742014742014742, "grad_norm": 1.5554332733154297, "learning_rate": 7.441111349715832e-05, "loss": 2.7455, "step": 604 }, { "epoch": 0.7444717444717445, "grad_norm": 1.5497958660125732, "learning_rate": 7.423356900485108e-05, "loss": 2.7962, "step": 606 }, { "epoch": 0.7469287469287469, "grad_norm": 2.072221040725708, "learning_rate": 7.405562423839801e-05, "loss": 2.7827, "step": 608 }, { "epoch": 0.7493857493857494, "grad_norm": 1.6481581926345825, "learning_rate": 7.387728213697365e-05, "loss": 2.7402, "step": 610 }, { "epoch": 0.7518427518427518, "grad_norm": 1.605137825012207, "learning_rate": 7.369854564631548e-05, "loss": 2.9223, "step": 612 }, { "epoch": 0.7542997542997543, "grad_norm": 1.502097725868225, "learning_rate": 7.351941771867523e-05, "loss": 2.811, "step": 614 }, { "epoch": 0.7567567567567568, "grad_norm": 1.4896032810211182, "learning_rate": 7.333990131277013e-05, "loss": 2.7072, "step": 616 }, { "epoch": 0.7592137592137592, "grad_norm": 1.5070993900299072, "learning_rate": 7.315999939373404e-05, "loss": 2.6862, "step": 618 }, { "epoch": 0.7616707616707616, "grad_norm": 1.5684906244277954, "learning_rate": 7.297971493306848e-05, "loss": 2.9692, "step": 620 }, { "epoch": 0.7641277641277642, "grad_norm": 1.6148993968963623, "learning_rate": 7.279905090859352e-05, "loss": 2.6739, "step": 622 }, { "epoch": 0.7665847665847666, "grad_norm": 1.8809860944747925, "learning_rate": 7.261801030439864e-05, "loss": 2.6416, "step": 624 }, { "epoch": 0.769041769041769, "grad_norm": 1.9866212606430054, "learning_rate": 7.243659611079343e-05, "loss": 2.6516, "step": 626 }, { "epoch": 0.7714987714987716, "grad_norm": 1.5821627378463745, "learning_rate": 7.225481132425812e-05, "loss": 2.5458, "step": 628 }, { "epoch": 0.773955773955774, "grad_norm": 2.8795077800750732, "learning_rate": 7.20726589473942e-05, "loss": 2.7961, "step": 630 }, { "epoch": 0.7764127764127764, "grad_norm": 1.6154601573944092, "learning_rate": 7.189014198887478e-05, "loss": 2.9093, "step": 632 }, { "epoch": 0.7788697788697788, "grad_norm": 1.677990436553955, "learning_rate": 7.170726346339488e-05, "loss": 2.888, "step": 634 }, { "epoch": 0.7813267813267813, "grad_norm": 2.146972417831421, "learning_rate": 7.15240263916216e-05, "loss": 2.7926, "step": 636 }, { "epoch": 0.7837837837837838, "grad_norm": 1.5269049406051636, "learning_rate": 7.134043380014436e-05, "loss": 2.6542, "step": 638 }, { "epoch": 0.7862407862407862, "grad_norm": 1.5493632555007935, "learning_rate": 7.115648872142475e-05, "loss": 2.5847, "step": 640 }, { "epoch": 0.7886977886977887, "grad_norm": 1.7266803979873657, "learning_rate": 7.097219419374652e-05, "loss": 2.8569, "step": 642 }, { "epoch": 0.7911547911547911, "grad_norm": 1.5810487270355225, "learning_rate": 7.078755326116542e-05, "loss": 2.5271, "step": 644 }, { "epoch": 0.7936117936117936, "grad_norm": 1.8903851509094238, "learning_rate": 7.060256897345888e-05, "loss": 2.8828, "step": 646 }, { "epoch": 0.7960687960687961, "grad_norm": 1.6093307733535767, "learning_rate": 7.041724438607563e-05, "loss": 2.9436, "step": 648 }, { "epoch": 0.7985257985257985, "grad_norm": 1.629585862159729, "learning_rate": 7.023158256008521e-05, "loss": 2.9666, "step": 650 }, { "epoch": 0.800982800982801, "grad_norm": 1.62311851978302, "learning_rate": 7.004558656212753e-05, "loss": 2.8761, "step": 652 }, { "epoch": 0.8034398034398035, "grad_norm": 1.5945847034454346, "learning_rate": 6.985925946436213e-05, "loss": 2.8419, "step": 654 }, { "epoch": 0.8058968058968059, "grad_norm": 1.5463379621505737, "learning_rate": 6.967260434441729e-05, "loss": 2.7639, "step": 656 }, { "epoch": 0.8083538083538083, "grad_norm": 3.601102590560913, "learning_rate": 6.948562428533955e-05, "loss": 2.8573, "step": 658 }, { "epoch": 0.8108108108108109, "grad_norm": 1.59696364402771, "learning_rate": 6.929832237554241e-05, "loss": 2.8708, "step": 660 }, { "epoch": 0.8132678132678133, "grad_norm": 1.577546238899231, "learning_rate": 6.911070170875562e-05, "loss": 2.8815, "step": 662 }, { "epoch": 0.8157248157248157, "grad_norm": 1.4518203735351562, "learning_rate": 6.892276538397384e-05, "loss": 2.7745, "step": 664 }, { "epoch": 0.8181818181818182, "grad_norm": 1.5612823963165283, "learning_rate": 6.873451650540566e-05, "loss": 2.6452, "step": 666 }, { "epoch": 0.8206388206388207, "grad_norm": 1.5459407567977905, "learning_rate": 6.854595818242213e-05, "loss": 2.8471, "step": 668 }, { "epoch": 0.8230958230958231, "grad_norm": 1.6408060789108276, "learning_rate": 6.835709352950557e-05, "loss": 2.8203, "step": 670 }, { "epoch": 0.8255528255528255, "grad_norm": 1.6304421424865723, "learning_rate": 6.816792566619806e-05, "loss": 2.6816, "step": 672 }, { "epoch": 0.828009828009828, "grad_norm": 1.6135345697402954, "learning_rate": 6.797845771704983e-05, "loss": 2.7026, "step": 674 }, { "epoch": 0.8304668304668305, "grad_norm": 1.6707392930984497, "learning_rate": 6.778869281156784e-05, "loss": 2.8929, "step": 676 }, { "epoch": 0.8329238329238329, "grad_norm": 1.621004581451416, "learning_rate": 6.759863408416386e-05, "loss": 2.7963, "step": 678 }, { "epoch": 0.8353808353808354, "grad_norm": 1.5636838674545288, "learning_rate": 6.740828467410294e-05, "loss": 2.9369, "step": 680 }, { "epoch": 0.8378378378378378, "grad_norm": 1.6135865449905396, "learning_rate": 6.721764772545135e-05, "loss": 2.5881, "step": 682 }, { "epoch": 0.8402948402948403, "grad_norm": 1.6814947128295898, "learning_rate": 6.702672638702475e-05, "loss": 2.8039, "step": 684 }, { "epoch": 0.8427518427518428, "grad_norm": 1.5667632818222046, "learning_rate": 6.68355238123362e-05, "loss": 2.8448, "step": 686 }, { "epoch": 0.8452088452088452, "grad_norm": 1.6321567296981812, "learning_rate": 6.664404315954397e-05, "loss": 2.7062, "step": 688 }, { "epoch": 0.8476658476658476, "grad_norm": 1.4741226434707642, "learning_rate": 6.64522875913995e-05, "loss": 2.8175, "step": 690 }, { "epoch": 0.8501228501228502, "grad_norm": 1.583012580871582, "learning_rate": 6.626026027519509e-05, "loss": 2.7698, "step": 692 }, { "epoch": 0.8525798525798526, "grad_norm": 1.650833010673523, "learning_rate": 6.606796438271156e-05, "loss": 2.7542, "step": 694 }, { "epoch": 0.855036855036855, "grad_norm": 1.736618161201477, "learning_rate": 6.587540309016592e-05, "loss": 2.5914, "step": 696 }, { "epoch": 0.8574938574938575, "grad_norm": 1.5331878662109375, "learning_rate": 6.568257957815893e-05, "loss": 2.7587, "step": 698 }, { "epoch": 0.85995085995086, "grad_norm": 2.8066611289978027, "learning_rate": 6.54894970316224e-05, "loss": 2.5376, "step": 700 }, { "epoch": 0.8624078624078624, "grad_norm": 1.8552104234695435, "learning_rate": 6.529615863976684e-05, "loss": 2.8213, "step": 702 }, { "epoch": 0.8648648648648649, "grad_norm": 7.016737937927246, "learning_rate": 6.510256759602857e-05, "loss": 2.763, "step": 704 }, { "epoch": 0.8673218673218673, "grad_norm": 8.383584022521973, "learning_rate": 6.4908727098017e-05, "loss": 2.8134, "step": 706 }, { "epoch": 0.8697788697788698, "grad_norm": 2.5546131134033203, "learning_rate": 6.4714640347462e-05, "loss": 2.5981, "step": 708 }, { "epoch": 0.8722358722358723, "grad_norm": 1.4681930541992188, "learning_rate": 6.452031055016073e-05, "loss": 2.6272, "step": 710 }, { "epoch": 0.8746928746928747, "grad_norm": 1.540204405784607, "learning_rate": 6.432574091592494e-05, "loss": 2.6063, "step": 712 }, { "epoch": 0.8771498771498771, "grad_norm": 2.637376308441162, "learning_rate": 6.41309346585278e-05, "loss": 2.931, "step": 714 }, { "epoch": 0.8796068796068796, "grad_norm": 1.6025326251983643, "learning_rate": 6.393589499565088e-05, "loss": 2.6114, "step": 716 }, { "epoch": 0.8820638820638821, "grad_norm": 1.6360759735107422, "learning_rate": 6.374062514883099e-05, "loss": 2.8207, "step": 718 }, { "epoch": 0.8845208845208845, "grad_norm": 1.5794533491134644, "learning_rate": 6.354512834340695e-05, "loss": 2.6199, "step": 720 }, { "epoch": 0.8869778869778869, "grad_norm": 1.4947880506515503, "learning_rate": 6.334940780846634e-05, "loss": 2.6621, "step": 722 }, { "epoch": 0.8894348894348895, "grad_norm": 1.632408618927002, "learning_rate": 6.315346677679218e-05, "loss": 2.8979, "step": 724 }, { "epoch": 0.8918918918918919, "grad_norm": 1.636464238166809, "learning_rate": 6.295730848480947e-05, "loss": 2.714, "step": 726 }, { "epoch": 0.8943488943488943, "grad_norm": 1.55814790725708, "learning_rate": 6.276093617253182e-05, "loss": 2.7157, "step": 728 }, { "epoch": 0.8968058968058968, "grad_norm": 1.559779405593872, "learning_rate": 6.256435308350786e-05, "loss": 2.6694, "step": 730 }, { "epoch": 0.8992628992628993, "grad_norm": 1.5911952257156372, "learning_rate": 6.236756246476765e-05, "loss": 2.8256, "step": 732 }, { "epoch": 0.9017199017199017, "grad_norm": 1.542946457862854, "learning_rate": 6.217056756676917e-05, "loss": 2.5519, "step": 734 }, { "epoch": 0.9041769041769042, "grad_norm": 1.6930584907531738, "learning_rate": 6.197337164334453e-05, "loss": 2.8785, "step": 736 }, { "epoch": 0.9066339066339066, "grad_norm": 1.5558332204818726, "learning_rate": 6.177597795164616e-05, "loss": 2.6596, "step": 738 }, { "epoch": 0.9090909090909091, "grad_norm": 1.4722115993499756, "learning_rate": 6.157838975209323e-05, "loss": 2.5197, "step": 740 }, { "epoch": 0.9115479115479116, "grad_norm": 1.4730453491210938, "learning_rate": 6.138061030831755e-05, "loss": 2.8153, "step": 742 }, { "epoch": 0.914004914004914, "grad_norm": 1.6478359699249268, "learning_rate": 6.118264288710988e-05, "loss": 2.7405, "step": 744 }, { "epoch": 0.9164619164619164, "grad_norm": 1.4796196222305298, "learning_rate": 6.098449075836575e-05, "loss": 2.753, "step": 746 }, { "epoch": 0.918918918918919, "grad_norm": 1.9191288948059082, "learning_rate": 6.0786157195031653e-05, "loss": 2.6252, "step": 748 }, { "epoch": 0.9213759213759214, "grad_norm": 1.553387999534607, "learning_rate": 6.058764547305088e-05, "loss": 2.8045, "step": 750 }, { "epoch": 0.9238329238329238, "grad_norm": 1.4824281930923462, "learning_rate": 6.038895887130942e-05, "loss": 2.7996, "step": 752 }, { "epoch": 0.9262899262899262, "grad_norm": 1.5677618980407715, "learning_rate": 6.019010067158181e-05, "loss": 2.891, "step": 754 }, { "epoch": 0.9287469287469288, "grad_norm": 1.6281031370162964, "learning_rate": 5.9991074158476935e-05, "loss": 2.7762, "step": 756 }, { "epoch": 0.9312039312039312, "grad_norm": 1.5614237785339355, "learning_rate": 5.9791882619383766e-05, "loss": 2.726, "step": 758 }, { "epoch": 0.9336609336609336, "grad_norm": 1.446755290031433, "learning_rate": 5.959252934441707e-05, "loss": 2.5548, "step": 760 }, { "epoch": 0.9361179361179361, "grad_norm": 1.5004243850708008, "learning_rate": 5.939301762636307e-05, "loss": 2.8673, "step": 762 }, { "epoch": 0.9385749385749386, "grad_norm": 8.901308059692383, "learning_rate": 5.9193350760625014e-05, "loss": 2.6982, "step": 764 }, { "epoch": 0.941031941031941, "grad_norm": 1.6001726388931274, "learning_rate": 5.8993532045168795e-05, "loss": 2.6233, "step": 766 }, { "epoch": 0.9434889434889435, "grad_norm": 1.5054491758346558, "learning_rate": 5.879356478046849e-05, "loss": 2.7115, "step": 768 }, { "epoch": 0.9459459459459459, "grad_norm": 1.4467328786849976, "learning_rate": 5.8593452269451775e-05, "loss": 2.6884, "step": 770 }, { "epoch": 0.9484029484029484, "grad_norm": 1.4677785634994507, "learning_rate": 5.839319781744542e-05, "loss": 2.845, "step": 772 }, { "epoch": 0.9508599508599509, "grad_norm": 1.5005624294281006, "learning_rate": 5.81928047321207e-05, "loss": 2.6341, "step": 774 }, { "epoch": 0.9533169533169533, "grad_norm": 1.468900442123413, "learning_rate": 5.79922763234387e-05, "loss": 2.6078, "step": 776 }, { "epoch": 0.9557739557739557, "grad_norm": 1.5839577913284302, "learning_rate": 5.779161590359573e-05, "loss": 2.6946, "step": 778 }, { "epoch": 0.9582309582309583, "grad_norm": 1.5110023021697998, "learning_rate": 5.7590826786968576e-05, "loss": 2.6369, "step": 780 }, { "epoch": 0.9606879606879607, "grad_norm": 1.4900883436203003, "learning_rate": 5.738991229005972e-05, "loss": 2.5128, "step": 782 }, { "epoch": 0.9631449631449631, "grad_norm": 1.5296499729156494, "learning_rate": 5.7188875731442605e-05, "loss": 2.5457, "step": 784 }, { "epoch": 0.9656019656019657, "grad_norm": 1.5462408065795898, "learning_rate": 5.6987720431706826e-05, "loss": 2.5989, "step": 786 }, { "epoch": 0.9680589680589681, "grad_norm": 1.476729154586792, "learning_rate": 5.678644971340326e-05, "loss": 2.9144, "step": 788 }, { "epoch": 0.9705159705159705, "grad_norm": 1.6275049448013306, "learning_rate": 5.658506690098916e-05, "loss": 2.7696, "step": 790 }, { "epoch": 0.972972972972973, "grad_norm": 1.4835902452468872, "learning_rate": 5.638357532077331e-05, "loss": 2.5136, "step": 792 }, { "epoch": 0.9754299754299754, "grad_norm": 1.526121973991394, "learning_rate": 5.6181978300861046e-05, "loss": 2.4791, "step": 794 }, { "epoch": 0.9778869778869779, "grad_norm": 1.4545388221740723, "learning_rate": 5.598027917109929e-05, "loss": 2.5994, "step": 796 }, { "epoch": 0.9803439803439803, "grad_norm": 2.1908226013183594, "learning_rate": 5.577848126302152e-05, "loss": 2.8895, "step": 798 }, { "epoch": 0.9828009828009828, "grad_norm": 1.504654049873352, "learning_rate": 5.55765879097928e-05, "loss": 2.7705, "step": 800 }, { "epoch": 0.9852579852579852, "grad_norm": 1.6015645265579224, "learning_rate": 5.5374602446154665e-05, "loss": 2.845, "step": 802 }, { "epoch": 0.9877149877149877, "grad_norm": 1.4529505968093872, "learning_rate": 5.517252820837011e-05, "loss": 2.5013, "step": 804 }, { "epoch": 0.9901719901719902, "grad_norm": 1.4882707595825195, "learning_rate": 5.49703685341684e-05, "loss": 2.7789, "step": 806 }, { "epoch": 0.9926289926289926, "grad_norm": 1.589077353477478, "learning_rate": 5.4768126762690034e-05, "loss": 2.6236, "step": 808 }, { "epoch": 0.995085995085995, "grad_norm": 1.4018021821975708, "learning_rate": 5.456580623443145e-05, "loss": 2.6779, "step": 810 }, { "epoch": 0.9975429975429976, "grad_norm": 1.4764320850372314, "learning_rate": 5.436341029119004e-05, "loss": 2.7036, "step": 812 }, { "epoch": 1.0, "grad_norm": 4.599942684173584, "learning_rate": 5.416094227600881e-05, "loss": 3.0131, "step": 814 } ], "logging_steps": 2, "max_steps": 1628, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 814, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3763952104177664e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }