diff --git "a/checkpoint-2992/trainer_state.json" "b/checkpoint-2992/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2992/trainer_state.json" @@ -0,0 +1,20978 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 500, + "global_step": 2992, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013377926421404682, + "grad_norm": 4.130218448113739, + "learning_rate": 0.0, + "loss": 1.6972, + "step": 1 + }, + { + "epoch": 0.0026755852842809363, + "grad_norm": 4.130768032132593, + "learning_rate": 2.6737967914438503e-08, + "loss": 1.5664, + "step": 2 + }, + { + "epoch": 0.004013377926421404, + "grad_norm": 4.2690376912914685, + "learning_rate": 5.3475935828877005e-08, + "loss": 1.6254, + "step": 3 + }, + { + "epoch": 0.005351170568561873, + "grad_norm": 4.392283010443138, + "learning_rate": 8.021390374331552e-08, + "loss": 1.6071, + "step": 4 + }, + { + "epoch": 0.006688963210702341, + "grad_norm": 3.911112169163038, + "learning_rate": 1.0695187165775401e-07, + "loss": 1.5569, + "step": 5 + }, + { + "epoch": 0.008026755852842809, + "grad_norm": 4.532416445203919, + "learning_rate": 1.3368983957219251e-07, + "loss": 1.5842, + "step": 6 + }, + { + "epoch": 0.009364548494983277, + "grad_norm": 4.14532331367055, + "learning_rate": 1.6042780748663104e-07, + "loss": 1.6447, + "step": 7 + }, + { + "epoch": 0.010702341137123745, + "grad_norm": 4.220442870402265, + "learning_rate": 1.8716577540106952e-07, + "loss": 1.5102, + "step": 8 + }, + { + "epoch": 0.012040133779264214, + "grad_norm": 4.61452547363993, + "learning_rate": 2.1390374331550802e-07, + "loss": 1.5931, + "step": 9 + }, + { + "epoch": 0.013377926421404682, + "grad_norm": 3.754431853721134, + "learning_rate": 2.4064171122994655e-07, + "loss": 1.6803, + "step": 10 + }, + { + "epoch": 0.01471571906354515, + "grad_norm": 4.089595872893903, + "learning_rate": 2.6737967914438503e-07, + "loss": 1.5065, + "step": 11 + }, + { + "epoch": 0.016053511705685617, + "grad_norm": 4.366718796889029, + "learning_rate": 2.9411764705882356e-07, + "loss": 1.7901, + "step": 12 + }, + { + "epoch": 0.017391304347826087, + "grad_norm": 4.568811780145197, + "learning_rate": 3.208556149732621e-07, + "loss": 1.6723, + "step": 13 + }, + { + "epoch": 0.018729096989966554, + "grad_norm": 4.207380485937817, + "learning_rate": 3.4759358288770056e-07, + "loss": 1.4679, + "step": 14 + }, + { + "epoch": 0.020066889632107024, + "grad_norm": 4.323033511441801, + "learning_rate": 3.7433155080213904e-07, + "loss": 1.6148, + "step": 15 + }, + { + "epoch": 0.02140468227424749, + "grad_norm": 3.7793298822011603, + "learning_rate": 4.0106951871657757e-07, + "loss": 1.5828, + "step": 16 + }, + { + "epoch": 0.02274247491638796, + "grad_norm": 4.328519169073789, + "learning_rate": 4.2780748663101604e-07, + "loss": 1.5925, + "step": 17 + }, + { + "epoch": 0.024080267558528427, + "grad_norm": 3.957603673536805, + "learning_rate": 4.5454545454545457e-07, + "loss": 1.669, + "step": 18 + }, + { + "epoch": 0.025418060200668897, + "grad_norm": 3.789163336944083, + "learning_rate": 4.812834224598931e-07, + "loss": 1.5194, + "step": 19 + }, + { + "epoch": 0.026755852842809364, + "grad_norm": 3.8030809315659115, + "learning_rate": 5.080213903743316e-07, + "loss": 1.767, + "step": 20 + }, + { + "epoch": 0.028093645484949834, + "grad_norm": 3.160938354602823, + "learning_rate": 5.347593582887701e-07, + "loss": 1.5973, + 
"step": 21 + }, + { + "epoch": 0.0294314381270903, + "grad_norm": 3.691718574389292, + "learning_rate": 5.614973262032086e-07, + "loss": 1.5087, + "step": 22 + }, + { + "epoch": 0.03076923076923077, + "grad_norm": 3.6237857595469323, + "learning_rate": 5.882352941176471e-07, + "loss": 1.5299, + "step": 23 + }, + { + "epoch": 0.032107023411371234, + "grad_norm": 3.3775241798650115, + "learning_rate": 6.149732620320856e-07, + "loss": 1.6244, + "step": 24 + }, + { + "epoch": 0.033444816053511704, + "grad_norm": 3.660788359686034, + "learning_rate": 6.417112299465242e-07, + "loss": 1.7999, + "step": 25 + }, + { + "epoch": 0.034782608695652174, + "grad_norm": 2.9989654941121846, + "learning_rate": 6.684491978609627e-07, + "loss": 1.3574, + "step": 26 + }, + { + "epoch": 0.036120401337792644, + "grad_norm": 2.6268410756693945, + "learning_rate": 6.951871657754011e-07, + "loss": 1.434, + "step": 27 + }, + { + "epoch": 0.03745819397993311, + "grad_norm": 2.924580892022415, + "learning_rate": 7.219251336898397e-07, + "loss": 1.7444, + "step": 28 + }, + { + "epoch": 0.03879598662207358, + "grad_norm": 2.5150770737541452, + "learning_rate": 7.486631016042781e-07, + "loss": 1.4548, + "step": 29 + }, + { + "epoch": 0.04013377926421405, + "grad_norm": 2.4144632935126658, + "learning_rate": 7.754010695187167e-07, + "loss": 1.5844, + "step": 30 + }, + { + "epoch": 0.04147157190635452, + "grad_norm": 2.486606431590659, + "learning_rate": 8.021390374331551e-07, + "loss": 1.4603, + "step": 31 + }, + { + "epoch": 0.04280936454849498, + "grad_norm": 2.4478764224957343, + "learning_rate": 8.288770053475937e-07, + "loss": 1.6182, + "step": 32 + }, + { + "epoch": 0.04414715719063545, + "grad_norm": 2.101457139865831, + "learning_rate": 8.556149732620321e-07, + "loss": 1.4975, + "step": 33 + }, + { + "epoch": 0.04548494983277592, + "grad_norm": 2.304634468503368, + "learning_rate": 8.823529411764707e-07, + "loss": 1.5347, + "step": 34 + }, + { + "epoch": 0.046822742474916385, + "grad_norm": 2.273419553175906, + "learning_rate": 9.090909090909091e-07, + "loss": 1.411, + "step": 35 + }, + { + "epoch": 0.048160535117056855, + "grad_norm": 1.9949533472956398, + "learning_rate": 9.358288770053477e-07, + "loss": 1.5803, + "step": 36 + }, + { + "epoch": 0.049498327759197325, + "grad_norm": 2.035383399323819, + "learning_rate": 9.625668449197862e-07, + "loss": 1.5353, + "step": 37 + }, + { + "epoch": 0.050836120401337795, + "grad_norm": 2.763447018128038, + "learning_rate": 9.893048128342248e-07, + "loss": 1.6342, + "step": 38 + }, + { + "epoch": 0.05217391304347826, + "grad_norm": 2.4052969081534723, + "learning_rate": 1.0160427807486633e-06, + "loss": 1.6106, + "step": 39 + }, + { + "epoch": 0.05351170568561873, + "grad_norm": 2.479041343731303, + "learning_rate": 1.0427807486631017e-06, + "loss": 1.4333, + "step": 40 + }, + { + "epoch": 0.0548494983277592, + "grad_norm": 2.4851231508538514, + "learning_rate": 1.0695187165775401e-06, + "loss": 1.4797, + "step": 41 + }, + { + "epoch": 0.05618729096989967, + "grad_norm": 2.173781201870592, + "learning_rate": 1.0962566844919787e-06, + "loss": 1.4322, + "step": 42 + }, + { + "epoch": 0.05752508361204013, + "grad_norm": 2.4718858192500566, + "learning_rate": 1.1229946524064172e-06, + "loss": 1.3844, + "step": 43 + }, + { + "epoch": 0.0588628762541806, + "grad_norm": 2.1734105698888704, + "learning_rate": 1.1497326203208558e-06, + "loss": 1.5656, + "step": 44 + }, + { + "epoch": 0.06020066889632107, + "grad_norm": 2.051418247760723, + "learning_rate": 1.1764705882352942e-06, 
+ "loss": 1.5057, + "step": 45 + }, + { + "epoch": 0.06153846153846154, + "grad_norm": 1.7296000044413613, + "learning_rate": 1.2032085561497326e-06, + "loss": 1.529, + "step": 46 + }, + { + "epoch": 0.06287625418060201, + "grad_norm": 2.108996385651272, + "learning_rate": 1.2299465240641713e-06, + "loss": 1.6555, + "step": 47 + }, + { + "epoch": 0.06421404682274247, + "grad_norm": 1.6286587302951325, + "learning_rate": 1.2566844919786097e-06, + "loss": 1.4173, + "step": 48 + }, + { + "epoch": 0.06555183946488294, + "grad_norm": 1.6478996568782027, + "learning_rate": 1.2834224598930483e-06, + "loss": 1.5247, + "step": 49 + }, + { + "epoch": 0.06688963210702341, + "grad_norm": 1.6805378746602497, + "learning_rate": 1.3101604278074868e-06, + "loss": 1.6756, + "step": 50 + }, + { + "epoch": 0.06822742474916388, + "grad_norm": 1.541787106949026, + "learning_rate": 1.3368983957219254e-06, + "loss": 1.7708, + "step": 51 + }, + { + "epoch": 0.06956521739130435, + "grad_norm": 1.6627110847669206, + "learning_rate": 1.3636363636363636e-06, + "loss": 1.3618, + "step": 52 + }, + { + "epoch": 0.07090301003344482, + "grad_norm": 1.4363175712516445, + "learning_rate": 1.3903743315508022e-06, + "loss": 1.6554, + "step": 53 + }, + { + "epoch": 0.07224080267558529, + "grad_norm": 1.5376493590985185, + "learning_rate": 1.4171122994652409e-06, + "loss": 1.6018, + "step": 54 + }, + { + "epoch": 0.07357859531772576, + "grad_norm": 1.423921528103638, + "learning_rate": 1.4438502673796793e-06, + "loss": 1.6879, + "step": 55 + }, + { + "epoch": 0.07491638795986622, + "grad_norm": 1.8355529159781494, + "learning_rate": 1.4705882352941177e-06, + "loss": 1.4992, + "step": 56 + }, + { + "epoch": 0.07625418060200669, + "grad_norm": 1.2536815011762341, + "learning_rate": 1.4973262032085562e-06, + "loss": 1.4658, + "step": 57 + }, + { + "epoch": 0.07759197324414716, + "grad_norm": 1.318438172766236, + "learning_rate": 1.5240641711229948e-06, + "loss": 1.4126, + "step": 58 + }, + { + "epoch": 0.07892976588628763, + "grad_norm": 1.3928279129778949, + "learning_rate": 1.5508021390374334e-06, + "loss": 1.5644, + "step": 59 + }, + { + "epoch": 0.0802675585284281, + "grad_norm": 1.304310659701626, + "learning_rate": 1.5775401069518716e-06, + "loss": 1.528, + "step": 60 + }, + { + "epoch": 0.08160535117056857, + "grad_norm": 1.345091652426695, + "learning_rate": 1.6042780748663103e-06, + "loss": 1.3906, + "step": 61 + }, + { + "epoch": 0.08294314381270904, + "grad_norm": 1.2601939021504467, + "learning_rate": 1.631016042780749e-06, + "loss": 1.486, + "step": 62 + }, + { + "epoch": 0.08428093645484949, + "grad_norm": 1.1578962029347895, + "learning_rate": 1.6577540106951873e-06, + "loss": 1.5569, + "step": 63 + }, + { + "epoch": 0.08561872909698996, + "grad_norm": 1.1517358911835358, + "learning_rate": 1.684491978609626e-06, + "loss": 1.3867, + "step": 64 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 1.238602904141402, + "learning_rate": 1.7112299465240642e-06, + "loss": 1.5432, + "step": 65 + }, + { + "epoch": 0.0882943143812709, + "grad_norm": 1.050302863223541, + "learning_rate": 1.7379679144385028e-06, + "loss": 1.3296, + "step": 66 + }, + { + "epoch": 0.08963210702341137, + "grad_norm": 1.3131706878649532, + "learning_rate": 1.7647058823529414e-06, + "loss": 1.4986, + "step": 67 + }, + { + "epoch": 0.09096989966555184, + "grad_norm": 1.1396883909404865, + "learning_rate": 1.7914438502673799e-06, + "loss": 1.3727, + "step": 68 + }, + { + "epoch": 0.09230769230769231, + "grad_norm": 1.2760796772326428, + 
"learning_rate": 1.8181818181818183e-06, + "loss": 1.5836, + "step": 69 + }, + { + "epoch": 0.09364548494983277, + "grad_norm": 1.0373571955456051, + "learning_rate": 1.8449197860962567e-06, + "loss": 1.3909, + "step": 70 + }, + { + "epoch": 0.09498327759197324, + "grad_norm": 1.1075328764917243, + "learning_rate": 1.8716577540106954e-06, + "loss": 1.303, + "step": 71 + }, + { + "epoch": 0.09632107023411371, + "grad_norm": 1.319244106351142, + "learning_rate": 1.898395721925134e-06, + "loss": 1.6075, + "step": 72 + }, + { + "epoch": 0.09765886287625418, + "grad_norm": 1.082078060618253, + "learning_rate": 1.9251336898395724e-06, + "loss": 1.2409, + "step": 73 + }, + { + "epoch": 0.09899665551839465, + "grad_norm": 1.0878616966940067, + "learning_rate": 1.951871657754011e-06, + "loss": 1.4239, + "step": 74 + }, + { + "epoch": 0.10033444816053512, + "grad_norm": 1.0517123694983512, + "learning_rate": 1.9786096256684497e-06, + "loss": 1.4607, + "step": 75 + }, + { + "epoch": 0.10167224080267559, + "grad_norm": 1.0272741662288747, + "learning_rate": 2.0053475935828877e-06, + "loss": 1.4925, + "step": 76 + }, + { + "epoch": 0.10301003344481606, + "grad_norm": 1.1127374795693439, + "learning_rate": 2.0320855614973265e-06, + "loss": 1.4997, + "step": 77 + }, + { + "epoch": 0.10434782608695652, + "grad_norm": 1.3807887389901958, + "learning_rate": 2.058823529411765e-06, + "loss": 1.4594, + "step": 78 + }, + { + "epoch": 0.10568561872909699, + "grad_norm": 1.1257379387638473, + "learning_rate": 2.0855614973262034e-06, + "loss": 1.3871, + "step": 79 + }, + { + "epoch": 0.10702341137123746, + "grad_norm": 1.1962192512365752, + "learning_rate": 2.112299465240642e-06, + "loss": 1.3023, + "step": 80 + }, + { + "epoch": 0.10836120401337793, + "grad_norm": 1.1653582819414467, + "learning_rate": 2.1390374331550802e-06, + "loss": 1.4151, + "step": 81 + }, + { + "epoch": 0.1096989966555184, + "grad_norm": 1.1487637962890191, + "learning_rate": 2.165775401069519e-06, + "loss": 1.3692, + "step": 82 + }, + { + "epoch": 0.11103678929765887, + "grad_norm": 0.9961273795208875, + "learning_rate": 2.1925133689839575e-06, + "loss": 1.4592, + "step": 83 + }, + { + "epoch": 0.11237458193979934, + "grad_norm": 0.9469105261486475, + "learning_rate": 2.219251336898396e-06, + "loss": 1.3149, + "step": 84 + }, + { + "epoch": 0.11371237458193979, + "grad_norm": 1.1612438930012847, + "learning_rate": 2.2459893048128343e-06, + "loss": 1.4604, + "step": 85 + }, + { + "epoch": 0.11505016722408026, + "grad_norm": 1.1823119585101949, + "learning_rate": 2.2727272727272728e-06, + "loss": 1.6066, + "step": 86 + }, + { + "epoch": 0.11638795986622073, + "grad_norm": 1.0790306132338785, + "learning_rate": 2.2994652406417116e-06, + "loss": 1.1938, + "step": 87 + }, + { + "epoch": 0.1177257525083612, + "grad_norm": 1.0688075199086942, + "learning_rate": 2.32620320855615e-06, + "loss": 1.4067, + "step": 88 + }, + { + "epoch": 0.11906354515050167, + "grad_norm": 1.1535108669978373, + "learning_rate": 2.3529411764705885e-06, + "loss": 1.2511, + "step": 89 + }, + { + "epoch": 0.12040133779264214, + "grad_norm": 1.0079983375893522, + "learning_rate": 2.379679144385027e-06, + "loss": 1.2918, + "step": 90 + }, + { + "epoch": 0.12173913043478261, + "grad_norm": 1.3688448443207042, + "learning_rate": 2.4064171122994653e-06, + "loss": 1.4254, + "step": 91 + }, + { + "epoch": 0.12307692307692308, + "grad_norm": 1.046889569108616, + "learning_rate": 2.433155080213904e-06, + "loss": 1.3493, + "step": 92 + }, + { + "epoch": 0.12441471571906354, + 
"grad_norm": 1.0073035967350052, + "learning_rate": 2.4598930481283426e-06, + "loss": 1.427, + "step": 93 + }, + { + "epoch": 0.12575250836120402, + "grad_norm": 1.062256258940436, + "learning_rate": 2.486631016042781e-06, + "loss": 1.5043, + "step": 94 + }, + { + "epoch": 0.12709030100334448, + "grad_norm": 1.181432275707845, + "learning_rate": 2.5133689839572194e-06, + "loss": 1.3468, + "step": 95 + }, + { + "epoch": 0.12842809364548494, + "grad_norm": 0.8961822277608905, + "learning_rate": 2.5401069518716583e-06, + "loss": 1.2682, + "step": 96 + }, + { + "epoch": 0.12976588628762542, + "grad_norm": 1.2775796020291708, + "learning_rate": 2.5668449197860967e-06, + "loss": 1.2532, + "step": 97 + }, + { + "epoch": 0.13110367892976588, + "grad_norm": 0.9769330838236839, + "learning_rate": 2.5935828877005347e-06, + "loss": 1.2817, + "step": 98 + }, + { + "epoch": 0.13244147157190636, + "grad_norm": 0.8581715203341259, + "learning_rate": 2.6203208556149735e-06, + "loss": 1.3021, + "step": 99 + }, + { + "epoch": 0.13377926421404682, + "grad_norm": 1.111744983472255, + "learning_rate": 2.647058823529412e-06, + "loss": 1.3473, + "step": 100 + }, + { + "epoch": 0.1351170568561873, + "grad_norm": 1.1169297718767288, + "learning_rate": 2.673796791443851e-06, + "loss": 1.496, + "step": 101 + }, + { + "epoch": 0.13645484949832776, + "grad_norm": 1.0944624518645307, + "learning_rate": 2.7005347593582892e-06, + "loss": 1.2951, + "step": 102 + }, + { + "epoch": 0.13779264214046824, + "grad_norm": 0.9945716463486808, + "learning_rate": 2.7272727272727272e-06, + "loss": 1.5734, + "step": 103 + }, + { + "epoch": 0.1391304347826087, + "grad_norm": 1.0614456546141133, + "learning_rate": 2.754010695187166e-06, + "loss": 1.5195, + "step": 104 + }, + { + "epoch": 0.14046822742474915, + "grad_norm": 0.9682729380333608, + "learning_rate": 2.7807486631016045e-06, + "loss": 1.5648, + "step": 105 + }, + { + "epoch": 0.14180602006688964, + "grad_norm": 0.9320712805992005, + "learning_rate": 2.807486631016043e-06, + "loss": 1.2919, + "step": 106 + }, + { + "epoch": 0.1431438127090301, + "grad_norm": 1.103397757825478, + "learning_rate": 2.8342245989304818e-06, + "loss": 1.3494, + "step": 107 + }, + { + "epoch": 0.14448160535117058, + "grad_norm": 1.076785866593652, + "learning_rate": 2.8609625668449198e-06, + "loss": 1.6155, + "step": 108 + }, + { + "epoch": 0.14581939799331103, + "grad_norm": 1.1164863375288008, + "learning_rate": 2.8877005347593586e-06, + "loss": 1.7089, + "step": 109 + }, + { + "epoch": 0.14715719063545152, + "grad_norm": 1.0619820340060873, + "learning_rate": 2.914438502673797e-06, + "loss": 1.3017, + "step": 110 + }, + { + "epoch": 0.14849498327759197, + "grad_norm": 1.0001056978751732, + "learning_rate": 2.9411764705882355e-06, + "loss": 1.4229, + "step": 111 + }, + { + "epoch": 0.14983277591973243, + "grad_norm": 1.1084412703011362, + "learning_rate": 2.9679144385026743e-06, + "loss": 1.5925, + "step": 112 + }, + { + "epoch": 0.15117056856187291, + "grad_norm": 1.0081396051255507, + "learning_rate": 2.9946524064171123e-06, + "loss": 1.27, + "step": 113 + }, + { + "epoch": 0.15250836120401337, + "grad_norm": 0.8740008225571271, + "learning_rate": 3.0213903743315507e-06, + "loss": 1.4101, + "step": 114 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.9524154172136331, + "learning_rate": 3.0481283422459896e-06, + "loss": 1.288, + "step": 115 + }, + { + "epoch": 0.1551839464882943, + "grad_norm": 0.9415897375343164, + "learning_rate": 3.074866310160428e-06, + "loss": 1.4442, + "step": 116 + 
}, + { + "epoch": 0.1565217391304348, + "grad_norm": 0.9225208397780869, + "learning_rate": 3.101604278074867e-06, + "loss": 1.3028, + "step": 117 + }, + { + "epoch": 0.15785953177257525, + "grad_norm": 1.1285083741089659, + "learning_rate": 3.128342245989305e-06, + "loss": 1.3319, + "step": 118 + }, + { + "epoch": 0.1591973244147157, + "grad_norm": 0.9190996239870372, + "learning_rate": 3.1550802139037433e-06, + "loss": 1.4224, + "step": 119 + }, + { + "epoch": 0.1605351170568562, + "grad_norm": 1.1671901001164469, + "learning_rate": 3.181818181818182e-06, + "loss": 1.2356, + "step": 120 + }, + { + "epoch": 0.16187290969899665, + "grad_norm": 1.0344162139797675, + "learning_rate": 3.2085561497326205e-06, + "loss": 1.5509, + "step": 121 + }, + { + "epoch": 0.16321070234113713, + "grad_norm": 0.9215823914779945, + "learning_rate": 3.2352941176470594e-06, + "loss": 1.2885, + "step": 122 + }, + { + "epoch": 0.1645484949832776, + "grad_norm": 1.0782705919385471, + "learning_rate": 3.262032085561498e-06, + "loss": 1.4401, + "step": 123 + }, + { + "epoch": 0.16588628762541807, + "grad_norm": 0.891874035516688, + "learning_rate": 3.288770053475936e-06, + "loss": 1.5143, + "step": 124 + }, + { + "epoch": 0.16722408026755853, + "grad_norm": 1.061205824224354, + "learning_rate": 3.3155080213903747e-06, + "loss": 1.2111, + "step": 125 + }, + { + "epoch": 0.16856187290969898, + "grad_norm": 1.2979234271673241, + "learning_rate": 3.342245989304813e-06, + "loss": 1.3839, + "step": 126 + }, + { + "epoch": 0.16989966555183947, + "grad_norm": 1.0640131153829553, + "learning_rate": 3.368983957219252e-06, + "loss": 1.6252, + "step": 127 + }, + { + "epoch": 0.17123745819397992, + "grad_norm": 0.9615759362125401, + "learning_rate": 3.3957219251336904e-06, + "loss": 1.4546, + "step": 128 + }, + { + "epoch": 0.1725752508361204, + "grad_norm": 0.899750685998711, + "learning_rate": 3.4224598930481284e-06, + "loss": 1.5047, + "step": 129 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 0.8297649188410355, + "learning_rate": 3.449197860962567e-06, + "loss": 1.4403, + "step": 130 + }, + { + "epoch": 0.17525083612040135, + "grad_norm": 0.9185754704276508, + "learning_rate": 3.4759358288770056e-06, + "loss": 1.471, + "step": 131 + }, + { + "epoch": 0.1765886287625418, + "grad_norm": 1.00520478387991, + "learning_rate": 3.5026737967914445e-06, + "loss": 1.3642, + "step": 132 + }, + { + "epoch": 0.17792642140468226, + "grad_norm": 0.9774566085646007, + "learning_rate": 3.529411764705883e-06, + "loss": 1.3422, + "step": 133 + }, + { + "epoch": 0.17926421404682275, + "grad_norm": 0.8492114954517735, + "learning_rate": 3.556149732620321e-06, + "loss": 1.2884, + "step": 134 + }, + { + "epoch": 0.1806020066889632, + "grad_norm": 1.0245591241122465, + "learning_rate": 3.5828877005347597e-06, + "loss": 1.1566, + "step": 135 + }, + { + "epoch": 0.18193979933110369, + "grad_norm": 0.8510142319477291, + "learning_rate": 3.609625668449198e-06, + "loss": 1.243, + "step": 136 + }, + { + "epoch": 0.18327759197324414, + "grad_norm": 1.182405747033026, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.5858, + "step": 137 + }, + { + "epoch": 0.18461538461538463, + "grad_norm": 1.0290035751724027, + "learning_rate": 3.6631016042780754e-06, + "loss": 1.7362, + "step": 138 + }, + { + "epoch": 0.18595317725752508, + "grad_norm": 0.9544694951272571, + "learning_rate": 3.6898395721925134e-06, + "loss": 1.3692, + "step": 139 + }, + { + "epoch": 0.18729096989966554, + "grad_norm": 0.89624018142819, + "learning_rate": 
3.716577540106952e-06, + "loss": 1.3631, + "step": 140 + }, + { + "epoch": 0.18862876254180602, + "grad_norm": 0.8940845870235314, + "learning_rate": 3.7433155080213907e-06, + "loss": 1.1923, + "step": 141 + }, + { + "epoch": 0.18996655518394648, + "grad_norm": 0.8104601669423683, + "learning_rate": 3.770053475935829e-06, + "loss": 1.2478, + "step": 142 + }, + { + "epoch": 0.19130434782608696, + "grad_norm": 1.2508262497480958, + "learning_rate": 3.796791443850268e-06, + "loss": 1.3621, + "step": 143 + }, + { + "epoch": 0.19264214046822742, + "grad_norm": 1.1090525435466025, + "learning_rate": 3.8235294117647055e-06, + "loss": 1.3968, + "step": 144 + }, + { + "epoch": 0.1939799331103679, + "grad_norm": 0.9082339520398943, + "learning_rate": 3.850267379679145e-06, + "loss": 1.5215, + "step": 145 + }, + { + "epoch": 0.19531772575250836, + "grad_norm": 1.11025129864654, + "learning_rate": 3.877005347593583e-06, + "loss": 1.435, + "step": 146 + }, + { + "epoch": 0.19665551839464884, + "grad_norm": 0.9939048098623043, + "learning_rate": 3.903743315508022e-06, + "loss": 1.3707, + "step": 147 + }, + { + "epoch": 0.1979933110367893, + "grad_norm": 0.9698785949144207, + "learning_rate": 3.93048128342246e-06, + "loss": 1.3076, + "step": 148 + }, + { + "epoch": 0.19933110367892976, + "grad_norm": 1.049432862173099, + "learning_rate": 3.957219251336899e-06, + "loss": 1.4713, + "step": 149 + }, + { + "epoch": 0.20066889632107024, + "grad_norm": 1.1487690827564274, + "learning_rate": 3.983957219251337e-06, + "loss": 1.5249, + "step": 150 + }, + { + "epoch": 0.2020066889632107, + "grad_norm": 0.8697593357690977, + "learning_rate": 4.010695187165775e-06, + "loss": 1.4901, + "step": 151 + }, + { + "epoch": 0.20334448160535118, + "grad_norm": 1.1721048677054824, + "learning_rate": 4.037433155080215e-06, + "loss": 1.3921, + "step": 152 + }, + { + "epoch": 0.20468227424749164, + "grad_norm": 0.9560169597289735, + "learning_rate": 4.064171122994653e-06, + "loss": 1.6577, + "step": 153 + }, + { + "epoch": 0.20602006688963212, + "grad_norm": 0.9930761547078468, + "learning_rate": 4.0909090909090915e-06, + "loss": 1.5119, + "step": 154 + }, + { + "epoch": 0.20735785953177258, + "grad_norm": 1.572341864408476, + "learning_rate": 4.11764705882353e-06, + "loss": 1.2178, + "step": 155 + }, + { + "epoch": 0.20869565217391303, + "grad_norm": 1.1293428399771321, + "learning_rate": 4.144385026737968e-06, + "loss": 1.3351, + "step": 156 + }, + { + "epoch": 0.21003344481605352, + "grad_norm": 0.9739166860971282, + "learning_rate": 4.171122994652407e-06, + "loss": 1.2054, + "step": 157 + }, + { + "epoch": 0.21137123745819397, + "grad_norm": 1.0881471768257145, + "learning_rate": 4.197860962566845e-06, + "loss": 1.2596, + "step": 158 + }, + { + "epoch": 0.21270903010033446, + "grad_norm": 1.0336412364265317, + "learning_rate": 4.224598930481284e-06, + "loss": 1.3533, + "step": 159 + }, + { + "epoch": 0.2140468227424749, + "grad_norm": 1.0129673438482856, + "learning_rate": 4.251336898395722e-06, + "loss": 1.4104, + "step": 160 + }, + { + "epoch": 0.2153846153846154, + "grad_norm": 1.082265336644719, + "learning_rate": 4.2780748663101604e-06, + "loss": 1.3657, + "step": 161 + }, + { + "epoch": 0.21672240802675585, + "grad_norm": 1.0664538651662654, + "learning_rate": 4.304812834224599e-06, + "loss": 1.4207, + "step": 162 + }, + { + "epoch": 0.2180602006688963, + "grad_norm": 0.9389581663450862, + "learning_rate": 4.331550802139038e-06, + "loss": 1.0296, + "step": 163 + }, + { + "epoch": 0.2193979933110368, + "grad_norm": 
1.2992972332287922, + "learning_rate": 4.3582887700534766e-06, + "loss": 1.5601, + "step": 164 + }, + { + "epoch": 0.22073578595317725, + "grad_norm": 1.0654091978085092, + "learning_rate": 4.385026737967915e-06, + "loss": 1.3527, + "step": 165 + }, + { + "epoch": 0.22207357859531773, + "grad_norm": 0.9496570463095897, + "learning_rate": 4.411764705882353e-06, + "loss": 1.4246, + "step": 166 + }, + { + "epoch": 0.2234113712374582, + "grad_norm": 1.24124498625348, + "learning_rate": 4.438502673796792e-06, + "loss": 1.3753, + "step": 167 + }, + { + "epoch": 0.22474916387959867, + "grad_norm": 0.8818510149913092, + "learning_rate": 4.46524064171123e-06, + "loss": 1.2146, + "step": 168 + }, + { + "epoch": 0.22608695652173913, + "grad_norm": 0.8224434112664454, + "learning_rate": 4.491978609625669e-06, + "loss": 1.2166, + "step": 169 + }, + { + "epoch": 0.22742474916387959, + "grad_norm": 0.9696981188636683, + "learning_rate": 4.518716577540107e-06, + "loss": 1.2845, + "step": 170 + }, + { + "epoch": 0.22876254180602007, + "grad_norm": 1.0255274238363419, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.3232, + "step": 171 + }, + { + "epoch": 0.23010033444816053, + "grad_norm": 1.035490685844757, + "learning_rate": 4.572192513368984e-06, + "loss": 1.3782, + "step": 172 + }, + { + "epoch": 0.231438127090301, + "grad_norm": 1.1520733579135656, + "learning_rate": 4.598930481283423e-06, + "loss": 1.3859, + "step": 173 + }, + { + "epoch": 0.23277591973244147, + "grad_norm": 1.0585730806144442, + "learning_rate": 4.625668449197862e-06, + "loss": 1.3288, + "step": 174 + }, + { + "epoch": 0.23411371237458195, + "grad_norm": 0.9422470906676385, + "learning_rate": 4.6524064171123e-06, + "loss": 1.279, + "step": 175 + }, + { + "epoch": 0.2354515050167224, + "grad_norm": 0.9594006411149355, + "learning_rate": 4.6791443850267385e-06, + "loss": 1.4266, + "step": 176 + }, + { + "epoch": 0.23678929765886286, + "grad_norm": 1.1609185473134782, + "learning_rate": 4.705882352941177e-06, + "loss": 1.4222, + "step": 177 + }, + { + "epoch": 0.23812709030100335, + "grad_norm": 1.0310204783428252, + "learning_rate": 4.732620320855615e-06, + "loss": 1.386, + "step": 178 + }, + { + "epoch": 0.2394648829431438, + "grad_norm": 1.0217597147605564, + "learning_rate": 4.759358288770054e-06, + "loss": 1.3439, + "step": 179 + }, + { + "epoch": 0.2408026755852843, + "grad_norm": 0.8769791571362656, + "learning_rate": 4.786096256684493e-06, + "loss": 1.474, + "step": 180 + }, + { + "epoch": 0.24214046822742474, + "grad_norm": 1.0274648195375464, + "learning_rate": 4.812834224598931e-06, + "loss": 1.3571, + "step": 181 + }, + { + "epoch": 0.24347826086956523, + "grad_norm": 0.9167736367746951, + "learning_rate": 4.839572192513369e-06, + "loss": 1.2036, + "step": 182 + }, + { + "epoch": 0.24481605351170568, + "grad_norm": 1.0420094870948453, + "learning_rate": 4.866310160427808e-06, + "loss": 1.5992, + "step": 183 + }, + { + "epoch": 0.24615384615384617, + "grad_norm": 0.8836095419909563, + "learning_rate": 4.893048128342247e-06, + "loss": 1.5475, + "step": 184 + }, + { + "epoch": 0.24749163879598662, + "grad_norm": 1.2633642043623206, + "learning_rate": 4.919786096256685e-06, + "loss": 1.3457, + "step": 185 + }, + { + "epoch": 0.24882943143812708, + "grad_norm": 1.1364954220008037, + "learning_rate": 4.9465240641711236e-06, + "loss": 1.2977, + "step": 186 + }, + { + "epoch": 0.25016722408026754, + "grad_norm": 1.0794189150654416, + "learning_rate": 4.973262032085562e-06, + "loss": 1.1869, + "step": 187 + }, + { + "epoch": 
0.25150501672240805, + "grad_norm": 1.046851568477193, + "learning_rate": 5e-06, + "loss": 1.3839, + "step": 188 + }, + { + "epoch": 0.2528428093645485, + "grad_norm": 1.0042511548048567, + "learning_rate": 5.026737967914439e-06, + "loss": 1.3756, + "step": 189 + }, + { + "epoch": 0.25418060200668896, + "grad_norm": 0.9939922407682784, + "learning_rate": 5.053475935828877e-06, + "loss": 1.4505, + "step": 190 + }, + { + "epoch": 0.2555183946488294, + "grad_norm": 0.9715309151881161, + "learning_rate": 5.0802139037433165e-06, + "loss": 1.2653, + "step": 191 + }, + { + "epoch": 0.2568561872909699, + "grad_norm": 1.063672235525638, + "learning_rate": 5.106951871657755e-06, + "loss": 1.357, + "step": 192 + }, + { + "epoch": 0.2581939799331104, + "grad_norm": 0.9710523591040529, + "learning_rate": 5.133689839572193e-06, + "loss": 1.337, + "step": 193 + }, + { + "epoch": 0.25953177257525084, + "grad_norm": 0.9509882513345431, + "learning_rate": 5.160427807486631e-06, + "loss": 1.2419, + "step": 194 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 1.0033531375064206, + "learning_rate": 5.187165775401069e-06, + "loss": 1.4376, + "step": 195 + }, + { + "epoch": 0.26220735785953175, + "grad_norm": 0.9421886275243602, + "learning_rate": 5.213903743315508e-06, + "loss": 1.5029, + "step": 196 + }, + { + "epoch": 0.26354515050167227, + "grad_norm": 1.072887295978468, + "learning_rate": 5.240641711229947e-06, + "loss": 1.3459, + "step": 197 + }, + { + "epoch": 0.2648829431438127, + "grad_norm": 0.8146343842910847, + "learning_rate": 5.2673796791443855e-06, + "loss": 1.2312, + "step": 198 + }, + { + "epoch": 0.2662207357859532, + "grad_norm": 1.028765037484271, + "learning_rate": 5.294117647058824e-06, + "loss": 1.3271, + "step": 199 + }, + { + "epoch": 0.26755852842809363, + "grad_norm": 0.9663926992907028, + "learning_rate": 5.320855614973262e-06, + "loss": 1.385, + "step": 200 + }, + { + "epoch": 0.2688963210702341, + "grad_norm": 1.2246322577474746, + "learning_rate": 5.347593582887702e-06, + "loss": 1.5505, + "step": 201 + }, + { + "epoch": 0.2702341137123746, + "grad_norm": 1.0319971040570286, + "learning_rate": 5.37433155080214e-06, + "loss": 1.4229, + "step": 202 + }, + { + "epoch": 0.27157190635451506, + "grad_norm": 1.0392210494668102, + "learning_rate": 5.4010695187165785e-06, + "loss": 1.2406, + "step": 203 + }, + { + "epoch": 0.2729096989966555, + "grad_norm": 0.8670784376746984, + "learning_rate": 5.427807486631016e-06, + "loss": 1.3523, + "step": 204 + }, + { + "epoch": 0.27424749163879597, + "grad_norm": 0.9766068352778432, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.2654, + "step": 205 + }, + { + "epoch": 0.2755852842809365, + "grad_norm": 0.9810457507623077, + "learning_rate": 5.481283422459893e-06, + "loss": 1.264, + "step": 206 + }, + { + "epoch": 0.27692307692307694, + "grad_norm": 0.904333259305881, + "learning_rate": 5.508021390374332e-06, + "loss": 1.394, + "step": 207 + }, + { + "epoch": 0.2782608695652174, + "grad_norm": 1.142913837360352, + "learning_rate": 5.5347593582887706e-06, + "loss": 1.1669, + "step": 208 + }, + { + "epoch": 0.27959866220735785, + "grad_norm": 1.1118677327140036, + "learning_rate": 5.561497326203209e-06, + "loss": 1.6196, + "step": 209 + }, + { + "epoch": 0.2809364548494983, + "grad_norm": 1.0252300195350812, + "learning_rate": 5.588235294117647e-06, + "loss": 1.3763, + "step": 210 + }, + { + "epoch": 0.2822742474916388, + "grad_norm": 1.0987403204527393, + "learning_rate": 5.614973262032086e-06, + "loss": 1.3365, + "step": 211 + }, + { + 
"epoch": 0.2836120401337793, + "grad_norm": 0.9920050248201312, + "learning_rate": 5.641711229946525e-06, + "loss": 1.3293, + "step": 212 + }, + { + "epoch": 0.28494983277591973, + "grad_norm": 0.9649354985996215, + "learning_rate": 5.6684491978609635e-06, + "loss": 1.411, + "step": 213 + }, + { + "epoch": 0.2862876254180602, + "grad_norm": 1.133805734786604, + "learning_rate": 5.695187165775401e-06, + "loss": 1.3241, + "step": 214 + }, + { + "epoch": 0.28762541806020064, + "grad_norm": 1.2181248732898289, + "learning_rate": 5.7219251336898395e-06, + "loss": 1.4046, + "step": 215 + }, + { + "epoch": 0.28896321070234116, + "grad_norm": 1.024351270510013, + "learning_rate": 5.748663101604278e-06, + "loss": 1.3937, + "step": 216 + }, + { + "epoch": 0.2903010033444816, + "grad_norm": 1.2318635253299581, + "learning_rate": 5.775401069518717e-06, + "loss": 1.2779, + "step": 217 + }, + { + "epoch": 0.29163879598662207, + "grad_norm": 1.2280987034866617, + "learning_rate": 5.802139037433156e-06, + "loss": 1.2685, + "step": 218 + }, + { + "epoch": 0.2929765886287625, + "grad_norm": 0.9698500985683687, + "learning_rate": 5.828877005347594e-06, + "loss": 1.3348, + "step": 219 + }, + { + "epoch": 0.29431438127090304, + "grad_norm": 1.0735453730276354, + "learning_rate": 5.8556149732620325e-06, + "loss": 1.1674, + "step": 220 + }, + { + "epoch": 0.2956521739130435, + "grad_norm": 1.0670400819343935, + "learning_rate": 5.882352941176471e-06, + "loss": 1.3719, + "step": 221 + }, + { + "epoch": 0.29698996655518395, + "grad_norm": 0.9195631835924983, + "learning_rate": 5.90909090909091e-06, + "loss": 1.3579, + "step": 222 + }, + { + "epoch": 0.2983277591973244, + "grad_norm": 1.34565918113007, + "learning_rate": 5.935828877005349e-06, + "loss": 1.0646, + "step": 223 + }, + { + "epoch": 0.29966555183946486, + "grad_norm": 1.1072083577728888, + "learning_rate": 5.962566844919787e-06, + "loss": 1.2214, + "step": 224 + }, + { + "epoch": 0.3010033444816054, + "grad_norm": 1.0380891375252423, + "learning_rate": 5.989304812834225e-06, + "loss": 1.4218, + "step": 225 + }, + { + "epoch": 0.30234113712374583, + "grad_norm": 0.8809726071839706, + "learning_rate": 6.016042780748663e-06, + "loss": 1.0521, + "step": 226 + }, + { + "epoch": 0.3036789297658863, + "grad_norm": 1.116896332469975, + "learning_rate": 6.0427807486631015e-06, + "loss": 1.2609, + "step": 227 + }, + { + "epoch": 0.30501672240802674, + "grad_norm": 1.0006202226334993, + "learning_rate": 6.069518716577541e-06, + "loss": 1.2325, + "step": 228 + }, + { + "epoch": 0.3063545150501672, + "grad_norm": 0.9350263650849192, + "learning_rate": 6.096256684491979e-06, + "loss": 1.2312, + "step": 229 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 1.0460436874941725, + "learning_rate": 6.122994652406418e-06, + "loss": 1.2583, + "step": 230 + }, + { + "epoch": 0.30903010033444817, + "grad_norm": 1.0876704196938816, + "learning_rate": 6.149732620320856e-06, + "loss": 1.4895, + "step": 231 + }, + { + "epoch": 0.3103678929765886, + "grad_norm": 0.9175497232292475, + "learning_rate": 6.176470588235295e-06, + "loss": 1.2283, + "step": 232 + }, + { + "epoch": 0.3117056856187291, + "grad_norm": 1.1765076792417404, + "learning_rate": 6.203208556149734e-06, + "loss": 1.2861, + "step": 233 + }, + { + "epoch": 0.3130434782608696, + "grad_norm": 1.0848327507952489, + "learning_rate": 6.229946524064172e-06, + "loss": 1.1205, + "step": 234 + }, + { + "epoch": 0.31438127090301005, + "grad_norm": 0.9313386018445499, + "learning_rate": 6.25668449197861e-06, + "loss": 
1.2987, + "step": 235 + }, + { + "epoch": 0.3157190635451505, + "grad_norm": 1.025423556457117, + "learning_rate": 6.283422459893048e-06, + "loss": 1.3579, + "step": 236 + }, + { + "epoch": 0.31705685618729096, + "grad_norm": 0.9756911706630799, + "learning_rate": 6.3101604278074865e-06, + "loss": 1.4273, + "step": 237 + }, + { + "epoch": 0.3183946488294314, + "grad_norm": 1.0869158544708264, + "learning_rate": 6.336898395721926e-06, + "loss": 1.4063, + "step": 238 + }, + { + "epoch": 0.3197324414715719, + "grad_norm": 1.075266795121673, + "learning_rate": 6.363636363636364e-06, + "loss": 1.4018, + "step": 239 + }, + { + "epoch": 0.3210702341137124, + "grad_norm": 0.9472496287801255, + "learning_rate": 6.390374331550803e-06, + "loss": 1.2395, + "step": 240 + }, + { + "epoch": 0.32240802675585284, + "grad_norm": 1.3731882561442197, + "learning_rate": 6.417112299465241e-06, + "loss": 1.3803, + "step": 241 + }, + { + "epoch": 0.3237458193979933, + "grad_norm": 1.021324563608375, + "learning_rate": 6.4438502673796795e-06, + "loss": 1.2902, + "step": 242 + }, + { + "epoch": 0.3250836120401338, + "grad_norm": 0.9307933642968367, + "learning_rate": 6.470588235294119e-06, + "loss": 1.1745, + "step": 243 + }, + { + "epoch": 0.32642140468227426, + "grad_norm": 1.0746894807316532, + "learning_rate": 6.497326203208557e-06, + "loss": 1.2324, + "step": 244 + }, + { + "epoch": 0.3277591973244147, + "grad_norm": 0.9192963638227226, + "learning_rate": 6.524064171122996e-06, + "loss": 1.2197, + "step": 245 + }, + { + "epoch": 0.3290969899665552, + "grad_norm": 1.277061353409073, + "learning_rate": 6.550802139037433e-06, + "loss": 1.3765, + "step": 246 + }, + { + "epoch": 0.33043478260869563, + "grad_norm": 1.0454043316989905, + "learning_rate": 6.577540106951872e-06, + "loss": 1.2133, + "step": 247 + }, + { + "epoch": 0.33177257525083614, + "grad_norm": 1.1110640142045864, + "learning_rate": 6.60427807486631e-06, + "loss": 1.5621, + "step": 248 + }, + { + "epoch": 0.3331103678929766, + "grad_norm": 0.8724188151051414, + "learning_rate": 6.631016042780749e-06, + "loss": 1.2794, + "step": 249 + }, + { + "epoch": 0.33444816053511706, + "grad_norm": 1.1150904150212262, + "learning_rate": 6.657754010695188e-06, + "loss": 1.2461, + "step": 250 + }, + { + "epoch": 0.3357859531772575, + "grad_norm": 1.1116547266235468, + "learning_rate": 6.684491978609626e-06, + "loss": 1.3525, + "step": 251 + }, + { + "epoch": 0.33712374581939797, + "grad_norm": 1.1179172500307484, + "learning_rate": 6.711229946524065e-06, + "loss": 1.4727, + "step": 252 + }, + { + "epoch": 0.3384615384615385, + "grad_norm": 0.9676646460575367, + "learning_rate": 6.737967914438504e-06, + "loss": 1.3965, + "step": 253 + }, + { + "epoch": 0.33979933110367894, + "grad_norm": 0.9225430448625723, + "learning_rate": 6.764705882352942e-06, + "loss": 1.1864, + "step": 254 + }, + { + "epoch": 0.3411371237458194, + "grad_norm": 1.0018353272749858, + "learning_rate": 6.791443850267381e-06, + "loss": 1.4202, + "step": 255 + }, + { + "epoch": 0.34247491638795985, + "grad_norm": 1.0253809591859149, + "learning_rate": 6.818181818181818e-06, + "loss": 1.4186, + "step": 256 + }, + { + "epoch": 0.34381270903010036, + "grad_norm": 0.8990574671133639, + "learning_rate": 6.844919786096257e-06, + "loss": 1.3766, + "step": 257 + }, + { + "epoch": 0.3451505016722408, + "grad_norm": 0.9591770064266341, + "learning_rate": 6.871657754010695e-06, + "loss": 1.3059, + "step": 258 + }, + { + "epoch": 0.3464882943143813, + "grad_norm": 0.9262218154461306, + "learning_rate": 
6.898395721925134e-06, + "loss": 1.1316, + "step": 259 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 0.9919441450085155, + "learning_rate": 6.925133689839573e-06, + "loss": 1.4031, + "step": 260 + }, + { + "epoch": 0.3491638795986622, + "grad_norm": 0.9326581450453234, + "learning_rate": 6.951871657754011e-06, + "loss": 1.2842, + "step": 261 + }, + { + "epoch": 0.3505016722408027, + "grad_norm": 1.2252041314540254, + "learning_rate": 6.97860962566845e-06, + "loss": 1.3071, + "step": 262 + }, + { + "epoch": 0.35183946488294315, + "grad_norm": 0.9679444231958054, + "learning_rate": 7.005347593582889e-06, + "loss": 1.4833, + "step": 263 + }, + { + "epoch": 0.3531772575250836, + "grad_norm": 1.0330410177187401, + "learning_rate": 7.032085561497327e-06, + "loss": 1.4715, + "step": 264 + }, + { + "epoch": 0.35451505016722407, + "grad_norm": 1.0254912333349466, + "learning_rate": 7.058823529411766e-06, + "loss": 1.1503, + "step": 265 + }, + { + "epoch": 0.3558528428093645, + "grad_norm": 0.9945792202058218, + "learning_rate": 7.085561497326203e-06, + "loss": 1.3157, + "step": 266 + }, + { + "epoch": 0.35719063545150503, + "grad_norm": 0.848056069073794, + "learning_rate": 7.112299465240642e-06, + "loss": 1.2576, + "step": 267 + }, + { + "epoch": 0.3585284280936455, + "grad_norm": 1.2487263571693172, + "learning_rate": 7.13903743315508e-06, + "loss": 1.44, + "step": 268 + }, + { + "epoch": 0.35986622073578595, + "grad_norm": 0.9743595243732323, + "learning_rate": 7.1657754010695195e-06, + "loss": 1.1386, + "step": 269 + }, + { + "epoch": 0.3612040133779264, + "grad_norm": 1.2579266000272613, + "learning_rate": 7.192513368983958e-06, + "loss": 1.2316, + "step": 270 + }, + { + "epoch": 0.3625418060200669, + "grad_norm": 1.2518553136533634, + "learning_rate": 7.219251336898396e-06, + "loss": 1.466, + "step": 271 + }, + { + "epoch": 0.36387959866220737, + "grad_norm": 1.0249362077663402, + "learning_rate": 7.245989304812835e-06, + "loss": 1.3646, + "step": 272 + }, + { + "epoch": 0.3652173913043478, + "grad_norm": 1.063712855427759, + "learning_rate": 7.272727272727273e-06, + "loss": 1.4051, + "step": 273 + }, + { + "epoch": 0.3665551839464883, + "grad_norm": 0.9751991824419184, + "learning_rate": 7.2994652406417124e-06, + "loss": 1.4226, + "step": 274 + }, + { + "epoch": 0.36789297658862874, + "grad_norm": 1.2347737948470199, + "learning_rate": 7.326203208556151e-06, + "loss": 1.3293, + "step": 275 + }, + { + "epoch": 0.36923076923076925, + "grad_norm": 1.1422954411620567, + "learning_rate": 7.352941176470589e-06, + "loss": 1.4302, + "step": 276 + }, + { + "epoch": 0.3705685618729097, + "grad_norm": 1.0656558473783477, + "learning_rate": 7.379679144385027e-06, + "loss": 1.2831, + "step": 277 + }, + { + "epoch": 0.37190635451505016, + "grad_norm": 1.1175023290544226, + "learning_rate": 7.406417112299465e-06, + "loss": 1.2767, + "step": 278 + }, + { + "epoch": 0.3732441471571906, + "grad_norm": 1.2747460411445615, + "learning_rate": 7.433155080213904e-06, + "loss": 1.2882, + "step": 279 + }, + { + "epoch": 0.3745819397993311, + "grad_norm": 1.04710132315656, + "learning_rate": 7.459893048128343e-06, + "loss": 1.5069, + "step": 280 + }, + { + "epoch": 0.3759197324414716, + "grad_norm": 0.964991457048075, + "learning_rate": 7.486631016042781e-06, + "loss": 1.1532, + "step": 281 + }, + { + "epoch": 0.37725752508361204, + "grad_norm": 1.00254668425242, + "learning_rate": 7.51336898395722e-06, + "loss": 1.2553, + "step": 282 + }, + { + "epoch": 0.3785953177257525, + "grad_norm": 
1.2057360132848884, + "learning_rate": 7.540106951871658e-06, + "loss": 1.4891, + "step": 283 + }, + { + "epoch": 0.37993311036789296, + "grad_norm": 1.077304830471415, + "learning_rate": 7.5668449197860975e-06, + "loss": 1.3974, + "step": 284 + }, + { + "epoch": 0.38127090301003347, + "grad_norm": 1.0487231494743792, + "learning_rate": 7.593582887700536e-06, + "loss": 1.275, + "step": 285 + }, + { + "epoch": 0.3826086956521739, + "grad_norm": 1.1369296537183577, + "learning_rate": 7.620320855614974e-06, + "loss": 1.5237, + "step": 286 + }, + { + "epoch": 0.3839464882943144, + "grad_norm": 1.0102773355724113, + "learning_rate": 7.647058823529411e-06, + "loss": 1.2813, + "step": 287 + }, + { + "epoch": 0.38528428093645484, + "grad_norm": 1.1883556343665362, + "learning_rate": 7.67379679144385e-06, + "loss": 1.4339, + "step": 288 + }, + { + "epoch": 0.3866220735785953, + "grad_norm": 1.0832771868484499, + "learning_rate": 7.70053475935829e-06, + "loss": 1.3905, + "step": 289 + }, + { + "epoch": 0.3879598662207358, + "grad_norm": 1.105240173070673, + "learning_rate": 7.727272727272727e-06, + "loss": 1.4702, + "step": 290 + }, + { + "epoch": 0.38929765886287626, + "grad_norm": 1.0915397775002422, + "learning_rate": 7.754010695187166e-06, + "loss": 1.3256, + "step": 291 + }, + { + "epoch": 0.3906354515050167, + "grad_norm": 1.1233297197771008, + "learning_rate": 7.780748663101606e-06, + "loss": 1.5248, + "step": 292 + }, + { + "epoch": 0.3919732441471572, + "grad_norm": 1.1012421471322587, + "learning_rate": 7.807486631016043e-06, + "loss": 1.3055, + "step": 293 + }, + { + "epoch": 0.3933110367892977, + "grad_norm": 1.1036160006476297, + "learning_rate": 7.834224598930483e-06, + "loss": 1.3978, + "step": 294 + }, + { + "epoch": 0.39464882943143814, + "grad_norm": 0.9687587990299631, + "learning_rate": 7.86096256684492e-06, + "loss": 1.328, + "step": 295 + }, + { + "epoch": 0.3959866220735786, + "grad_norm": 0.9661364406143786, + "learning_rate": 7.88770053475936e-06, + "loss": 1.1659, + "step": 296 + }, + { + "epoch": 0.39732441471571905, + "grad_norm": 0.8182746775560444, + "learning_rate": 7.914438502673799e-06, + "loss": 1.2153, + "step": 297 + }, + { + "epoch": 0.3986622073578595, + "grad_norm": 1.221244413635622, + "learning_rate": 7.941176470588236e-06, + "loss": 1.2195, + "step": 298 + }, + { + "epoch": 0.4, + "grad_norm": 1.1561643361656462, + "learning_rate": 7.967914438502674e-06, + "loss": 1.3828, + "step": 299 + }, + { + "epoch": 0.4013377926421405, + "grad_norm": 1.0972474904275535, + "learning_rate": 7.994652406417113e-06, + "loss": 1.5024, + "step": 300 + }, + { + "epoch": 0.40267558528428093, + "grad_norm": 1.0433172516237257, + "learning_rate": 8.02139037433155e-06, + "loss": 1.45, + "step": 301 + }, + { + "epoch": 0.4040133779264214, + "grad_norm": 1.3422931676044443, + "learning_rate": 8.04812834224599e-06, + "loss": 1.2317, + "step": 302 + }, + { + "epoch": 0.40535117056856185, + "grad_norm": 0.9505984867013394, + "learning_rate": 8.07486631016043e-06, + "loss": 1.21, + "step": 303 + }, + { + "epoch": 0.40668896321070236, + "grad_norm": 0.9118301798933891, + "learning_rate": 8.101604278074867e-06, + "loss": 1.3208, + "step": 304 + }, + { + "epoch": 0.4080267558528428, + "grad_norm": 1.1288817007638254, + "learning_rate": 8.128342245989306e-06, + "loss": 1.2965, + "step": 305 + }, + { + "epoch": 0.40936454849498327, + "grad_norm": 1.140125229948653, + "learning_rate": 8.155080213903744e-06, + "loss": 1.2516, + "step": 306 + }, + { + "epoch": 0.4107023411371237, + "grad_norm": 
0.9357185573604134, + "learning_rate": 8.181818181818183e-06, + "loss": 1.4462, + "step": 307 + }, + { + "epoch": 0.41204013377926424, + "grad_norm": 0.9781701314640479, + "learning_rate": 8.20855614973262e-06, + "loss": 1.1222, + "step": 308 + }, + { + "epoch": 0.4133779264214047, + "grad_norm": 0.9291741694487226, + "learning_rate": 8.23529411764706e-06, + "loss": 1.3339, + "step": 309 + }, + { + "epoch": 0.41471571906354515, + "grad_norm": 0.9067137564793069, + "learning_rate": 8.262032085561497e-06, + "loss": 1.139, + "step": 310 + }, + { + "epoch": 0.4160535117056856, + "grad_norm": 0.9693389983771075, + "learning_rate": 8.288770053475937e-06, + "loss": 1.135, + "step": 311 + }, + { + "epoch": 0.41739130434782606, + "grad_norm": 0.9399963307284136, + "learning_rate": 8.315508021390374e-06, + "loss": 1.2576, + "step": 312 + }, + { + "epoch": 0.4187290969899666, + "grad_norm": 1.1716202538973304, + "learning_rate": 8.342245989304813e-06, + "loss": 1.3787, + "step": 313 + }, + { + "epoch": 0.42006688963210703, + "grad_norm": 1.174675924292417, + "learning_rate": 8.368983957219253e-06, + "loss": 1.5278, + "step": 314 + }, + { + "epoch": 0.4214046822742475, + "grad_norm": 1.0738366439172033, + "learning_rate": 8.39572192513369e-06, + "loss": 1.2706, + "step": 315 + }, + { + "epoch": 0.42274247491638794, + "grad_norm": 0.9838798705987866, + "learning_rate": 8.42245989304813e-06, + "loss": 1.4889, + "step": 316 + }, + { + "epoch": 0.4240802675585284, + "grad_norm": 1.014511868117561, + "learning_rate": 8.449197860962567e-06, + "loss": 1.3783, + "step": 317 + }, + { + "epoch": 0.4254180602006689, + "grad_norm": 0.9813140683200884, + "learning_rate": 8.475935828877005e-06, + "loss": 1.5264, + "step": 318 + }, + { + "epoch": 0.42675585284280937, + "grad_norm": 0.9687185460025312, + "learning_rate": 8.502673796791444e-06, + "loss": 1.1525, + "step": 319 + }, + { + "epoch": 0.4280936454849498, + "grad_norm": 1.2477580719663948, + "learning_rate": 8.529411764705883e-06, + "loss": 1.4372, + "step": 320 + }, + { + "epoch": 0.4294314381270903, + "grad_norm": 1.1154842657428934, + "learning_rate": 8.556149732620321e-06, + "loss": 0.934, + "step": 321 + }, + { + "epoch": 0.4307692307692308, + "grad_norm": 1.9052435578219111, + "learning_rate": 8.58288770053476e-06, + "loss": 1.6066, + "step": 322 + }, + { + "epoch": 0.43210702341137125, + "grad_norm": 1.102907223544303, + "learning_rate": 8.609625668449198e-06, + "loss": 1.5432, + "step": 323 + }, + { + "epoch": 0.4334448160535117, + "grad_norm": 1.0383871927802637, + "learning_rate": 8.636363636363637e-06, + "loss": 1.3859, + "step": 324 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 0.9839100369429172, + "learning_rate": 8.663101604278076e-06, + "loss": 1.234, + "step": 325 + }, + { + "epoch": 0.4361204013377926, + "grad_norm": 1.068395380747385, + "learning_rate": 8.689839572192514e-06, + "loss": 1.2672, + "step": 326 + }, + { + "epoch": 0.43745819397993313, + "grad_norm": 1.1101633076194743, + "learning_rate": 8.716577540106953e-06, + "loss": 1.193, + "step": 327 + }, + { + "epoch": 0.4387959866220736, + "grad_norm": 0.9202376623485748, + "learning_rate": 8.743315508021392e-06, + "loss": 1.3291, + "step": 328 + }, + { + "epoch": 0.44013377926421404, + "grad_norm": 1.3938387332055933, + "learning_rate": 8.77005347593583e-06, + "loss": 1.49, + "step": 329 + }, + { + "epoch": 0.4414715719063545, + "grad_norm": 1.2053625169940356, + "learning_rate": 8.796791443850268e-06, + "loss": 1.3975, + "step": 330 + }, + { + "epoch": 0.442809364548495, + 
"grad_norm": 1.174383874450084, + "learning_rate": 8.823529411764707e-06, + "loss": 1.3322, + "step": 331 + }, + { + "epoch": 0.44414715719063547, + "grad_norm": 1.3155256698136732, + "learning_rate": 8.850267379679144e-06, + "loss": 1.2882, + "step": 332 + }, + { + "epoch": 0.4454849498327759, + "grad_norm": 0.9899033453310961, + "learning_rate": 8.877005347593584e-06, + "loss": 1.3443, + "step": 333 + }, + { + "epoch": 0.4468227424749164, + "grad_norm": 1.1404461041761027, + "learning_rate": 8.903743315508023e-06, + "loss": 1.2951, + "step": 334 + }, + { + "epoch": 0.44816053511705684, + "grad_norm": 1.0586844859955677, + "learning_rate": 8.93048128342246e-06, + "loss": 1.3718, + "step": 335 + }, + { + "epoch": 0.44949832775919735, + "grad_norm": 1.1285788080776542, + "learning_rate": 8.9572192513369e-06, + "loss": 1.4138, + "step": 336 + }, + { + "epoch": 0.4508361204013378, + "grad_norm": 1.0446260557656064, + "learning_rate": 8.983957219251337e-06, + "loss": 1.243, + "step": 337 + }, + { + "epoch": 0.45217391304347826, + "grad_norm": 1.2499054027278211, + "learning_rate": 9.010695187165777e-06, + "loss": 1.109, + "step": 338 + }, + { + "epoch": 0.4535117056856187, + "grad_norm": 0.9002517332888438, + "learning_rate": 9.037433155080214e-06, + "loss": 1.1362, + "step": 339 + }, + { + "epoch": 0.45484949832775917, + "grad_norm": 1.0124731135263707, + "learning_rate": 9.064171122994653e-06, + "loss": 1.173, + "step": 340 + }, + { + "epoch": 0.4561872909698997, + "grad_norm": 1.0864331677972565, + "learning_rate": 9.090909090909091e-06, + "loss": 1.3109, + "step": 341 + }, + { + "epoch": 0.45752508361204014, + "grad_norm": 1.3730941724006802, + "learning_rate": 9.11764705882353e-06, + "loss": 1.2797, + "step": 342 + }, + { + "epoch": 0.4588628762541806, + "grad_norm": 1.2636897697609841, + "learning_rate": 9.144385026737968e-06, + "loss": 1.3901, + "step": 343 + }, + { + "epoch": 0.46020066889632105, + "grad_norm": 1.3681046625731847, + "learning_rate": 9.171122994652407e-06, + "loss": 1.3197, + "step": 344 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 1.0823108856487789, + "learning_rate": 9.197860962566846e-06, + "loss": 1.2606, + "step": 345 + }, + { + "epoch": 0.462876254180602, + "grad_norm": 0.9846125258720843, + "learning_rate": 9.224598930481284e-06, + "loss": 1.2947, + "step": 346 + }, + { + "epoch": 0.4642140468227425, + "grad_norm": 1.1602343910479296, + "learning_rate": 9.251336898395723e-06, + "loss": 1.2496, + "step": 347 + }, + { + "epoch": 0.46555183946488293, + "grad_norm": 1.1891200679795988, + "learning_rate": 9.278074866310161e-06, + "loss": 1.4306, + "step": 348 + }, + { + "epoch": 0.4668896321070234, + "grad_norm": 1.1168951306798085, + "learning_rate": 9.3048128342246e-06, + "loss": 1.3251, + "step": 349 + }, + { + "epoch": 0.4682274247491639, + "grad_norm": 1.1968370877286512, + "learning_rate": 9.331550802139038e-06, + "loss": 1.4089, + "step": 350 + }, + { + "epoch": 0.46956521739130436, + "grad_norm": 1.2507407320646036, + "learning_rate": 9.358288770053477e-06, + "loss": 1.2798, + "step": 351 + }, + { + "epoch": 0.4709030100334448, + "grad_norm": 1.0545483148085961, + "learning_rate": 9.385026737967915e-06, + "loss": 1.5891, + "step": 352 + }, + { + "epoch": 0.47224080267558527, + "grad_norm": 1.0428240415412426, + "learning_rate": 9.411764705882354e-06, + "loss": 1.2006, + "step": 353 + }, + { + "epoch": 0.4735785953177257, + "grad_norm": 1.527841538177909, + "learning_rate": 9.438502673796791e-06, + "loss": 1.1208, + "step": 354 + }, + { + "epoch": 
0.47491638795986624, + "grad_norm": 1.3459056868368857, + "learning_rate": 9.46524064171123e-06, + "loss": 1.4354, + "step": 355 + }, + { + "epoch": 0.4762541806020067, + "grad_norm": 1.1407494010639587, + "learning_rate": 9.49197860962567e-06, + "loss": 1.241, + "step": 356 + }, + { + "epoch": 0.47759197324414715, + "grad_norm": 1.1690331844619897, + "learning_rate": 9.518716577540108e-06, + "loss": 1.2104, + "step": 357 + }, + { + "epoch": 0.4789297658862876, + "grad_norm": 1.0527162683092417, + "learning_rate": 9.545454545454547e-06, + "loss": 1.2076, + "step": 358 + }, + { + "epoch": 0.4802675585284281, + "grad_norm": 1.2524177864607835, + "learning_rate": 9.572192513368986e-06, + "loss": 1.142, + "step": 359 + }, + { + "epoch": 0.4816053511705686, + "grad_norm": 1.1904778080345975, + "learning_rate": 9.598930481283422e-06, + "loss": 1.243, + "step": 360 + }, + { + "epoch": 0.48294314381270903, + "grad_norm": 1.1960570052837391, + "learning_rate": 9.625668449197861e-06, + "loss": 1.4298, + "step": 361 + }, + { + "epoch": 0.4842809364548495, + "grad_norm": 0.9404139975643057, + "learning_rate": 9.6524064171123e-06, + "loss": 1.0769, + "step": 362 + }, + { + "epoch": 0.48561872909698994, + "grad_norm": 1.0312161435719314, + "learning_rate": 9.679144385026738e-06, + "loss": 1.3026, + "step": 363 + }, + { + "epoch": 0.48695652173913045, + "grad_norm": 1.1047076137640803, + "learning_rate": 9.705882352941177e-06, + "loss": 1.3505, + "step": 364 + }, + { + "epoch": 0.4882943143812709, + "grad_norm": 1.0694686264079072, + "learning_rate": 9.732620320855617e-06, + "loss": 1.2595, + "step": 365 + }, + { + "epoch": 0.48963210702341137, + "grad_norm": 1.0735804966902511, + "learning_rate": 9.759358288770054e-06, + "loss": 1.3839, + "step": 366 + }, + { + "epoch": 0.4909698996655518, + "grad_norm": 1.3190164617739641, + "learning_rate": 9.786096256684493e-06, + "loss": 1.3338, + "step": 367 + }, + { + "epoch": 0.49230769230769234, + "grad_norm": 1.0341987546055667, + "learning_rate": 9.812834224598931e-06, + "loss": 1.0503, + "step": 368 + }, + { + "epoch": 0.4936454849498328, + "grad_norm": 0.9306735813501108, + "learning_rate": 9.83957219251337e-06, + "loss": 1.2448, + "step": 369 + }, + { + "epoch": 0.49498327759197325, + "grad_norm": 1.1161248923499716, + "learning_rate": 9.866310160427808e-06, + "loss": 1.3898, + "step": 370 + }, + { + "epoch": 0.4963210702341137, + "grad_norm": 1.0405681793337183, + "learning_rate": 9.893048128342247e-06, + "loss": 1.2628, + "step": 371 + }, + { + "epoch": 0.49765886287625416, + "grad_norm": 1.3134555191242052, + "learning_rate": 9.919786096256685e-06, + "loss": 1.1887, + "step": 372 + }, + { + "epoch": 0.49899665551839467, + "grad_norm": 1.1809112028656026, + "learning_rate": 9.946524064171124e-06, + "loss": 1.319, + "step": 373 + }, + { + "epoch": 0.5003344481605351, + "grad_norm": 1.1235861396875366, + "learning_rate": 9.973262032085562e-06, + "loss": 1.2173, + "step": 374 + }, + { + "epoch": 0.5016722408026756, + "grad_norm": 1.010150851354676, + "learning_rate": 1e-05, + "loss": 1.4349, + "step": 375 + }, + { + "epoch": 0.5030100334448161, + "grad_norm": 1.1656605483879519, + "learning_rate": 9.999997822232566e-06, + "loss": 1.286, + "step": 376 + }, + { + "epoch": 0.5043478260869565, + "grad_norm": 1.0978585358619815, + "learning_rate": 9.99999128893216e-06, + "loss": 1.3702, + "step": 377 + }, + { + "epoch": 0.505685618729097, + "grad_norm": 1.1243266620020984, + "learning_rate": 9.999980400104472e-06, + "loss": 1.4088, + "step": 378 + }, + { + 
"epoch": 0.5070234113712374, + "grad_norm": 1.2680163792958743, + "learning_rate": 9.99996515575899e-06, + "loss": 1.1623, + "step": 379 + }, + { + "epoch": 0.5083612040133779, + "grad_norm": 1.1200588161547043, + "learning_rate": 9.99994555590899e-06, + "loss": 1.4072, + "step": 380 + }, + { + "epoch": 0.5096989966555184, + "grad_norm": 1.1496093043838669, + "learning_rate": 9.99992160057155e-06, + "loss": 1.0761, + "step": 381 + }, + { + "epoch": 0.5110367892976588, + "grad_norm": 1.0327061311595507, + "learning_rate": 9.999893289767533e-06, + "loss": 1.078, + "step": 382 + }, + { + "epoch": 0.5123745819397993, + "grad_norm": 1.1354531867457962, + "learning_rate": 9.999860623521604e-06, + "loss": 1.4271, + "step": 383 + }, + { + "epoch": 0.5137123745819397, + "grad_norm": 0.9481821179292578, + "learning_rate": 9.999823601862217e-06, + "loss": 1.0913, + "step": 384 + }, + { + "epoch": 0.5150501672240803, + "grad_norm": 1.0757693312477359, + "learning_rate": 9.999782224821624e-06, + "loss": 1.3748, + "step": 385 + }, + { + "epoch": 0.5163879598662208, + "grad_norm": 1.0743664007833102, + "learning_rate": 9.999736492435867e-06, + "loss": 1.3429, + "step": 386 + }, + { + "epoch": 0.5177257525083612, + "grad_norm": 0.8664195395884678, + "learning_rate": 9.999686404744782e-06, + "loss": 1.3607, + "step": 387 + }, + { + "epoch": 0.5190635451505017, + "grad_norm": 1.0818460600162818, + "learning_rate": 9.999631961792006e-06, + "loss": 1.6752, + "step": 388 + }, + { + "epoch": 0.5204013377926422, + "grad_norm": 1.319558968072271, + "learning_rate": 9.99957316362496e-06, + "loss": 1.2743, + "step": 389 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.9776634025592558, + "learning_rate": 9.999510010294864e-06, + "loss": 1.4404, + "step": 390 + }, + { + "epoch": 0.5230769230769231, + "grad_norm": 1.0342595188769228, + "learning_rate": 9.999442501856736e-06, + "loss": 1.2664, + "step": 391 + }, + { + "epoch": 0.5244147157190635, + "grad_norm": 1.1120529651895847, + "learning_rate": 9.999370638369377e-06, + "loss": 1.0912, + "step": 392 + }, + { + "epoch": 0.525752508361204, + "grad_norm": 1.0917171608281049, + "learning_rate": 9.999294419895389e-06, + "loss": 1.1951, + "step": 393 + }, + { + "epoch": 0.5270903010033445, + "grad_norm": 1.169868919552183, + "learning_rate": 9.99921384650117e-06, + "loss": 1.0812, + "step": 394 + }, + { + "epoch": 0.5284280936454849, + "grad_norm": 1.4541208793540792, + "learning_rate": 9.999128918256904e-06, + "loss": 1.3994, + "step": 395 + }, + { + "epoch": 0.5297658862876254, + "grad_norm": 1.0897250929209976, + "learning_rate": 9.999039635236576e-06, + "loss": 1.1402, + "step": 396 + }, + { + "epoch": 0.5311036789297658, + "grad_norm": 1.0684748765617633, + "learning_rate": 9.998945997517957e-06, + "loss": 1.2486, + "step": 397 + }, + { + "epoch": 0.5324414715719064, + "grad_norm": 0.9297657839245542, + "learning_rate": 9.99884800518262e-06, + "loss": 1.2062, + "step": 398 + }, + { + "epoch": 0.5337792642140469, + "grad_norm": 1.067490368908779, + "learning_rate": 9.998745658315924e-06, + "loss": 1.3403, + "step": 399 + }, + { + "epoch": 0.5351170568561873, + "grad_norm": 1.0590593920136884, + "learning_rate": 9.998638957007024e-06, + "loss": 1.3893, + "step": 400 + }, + { + "epoch": 0.5364548494983278, + "grad_norm": 1.1548601214006393, + "learning_rate": 9.998527901348869e-06, + "loss": 1.3423, + "step": 401 + }, + { + "epoch": 0.5377926421404682, + "grad_norm": 1.0681997258554845, + "learning_rate": 9.998412491438201e-06, + "loss": 1.1359, + "step": 402 
+ }, + { + "epoch": 0.5391304347826087, + "grad_norm": 1.2630207039875703, + "learning_rate": 9.998292727375554e-06, + "loss": 1.4682, + "step": 403 + }, + { + "epoch": 0.5404682274247492, + "grad_norm": 1.1355959037203591, + "learning_rate": 9.998168609265254e-06, + "loss": 1.4012, + "step": 404 + }, + { + "epoch": 0.5418060200668896, + "grad_norm": 1.0395107922306437, + "learning_rate": 9.998040137215423e-06, + "loss": 1.3927, + "step": 405 + }, + { + "epoch": 0.5431438127090301, + "grad_norm": 0.9866467573749679, + "learning_rate": 9.997907311337973e-06, + "loss": 1.2175, + "step": 406 + }, + { + "epoch": 0.5444816053511705, + "grad_norm": 1.1848222943083444, + "learning_rate": 9.99777013174861e-06, + "loss": 1.2749, + "step": 407 + }, + { + "epoch": 0.545819397993311, + "grad_norm": 0.9880823216590806, + "learning_rate": 9.99762859856683e-06, + "loss": 1.1475, + "step": 408 + }, + { + "epoch": 0.5471571906354515, + "grad_norm": 1.13436479840815, + "learning_rate": 9.997482711915926e-06, + "loss": 1.3453, + "step": 409 + }, + { + "epoch": 0.5484949832775919, + "grad_norm": 1.3198193739292239, + "learning_rate": 9.997332471922981e-06, + "loss": 1.2699, + "step": 410 + }, + { + "epoch": 0.5498327759197325, + "grad_norm": 1.0643598443915665, + "learning_rate": 9.99717787871887e-06, + "loss": 1.4254, + "step": 411 + }, + { + "epoch": 0.551170568561873, + "grad_norm": 1.1493035724627947, + "learning_rate": 9.997018932438256e-06, + "loss": 1.154, + "step": 412 + }, + { + "epoch": 0.5525083612040134, + "grad_norm": 0.9357315944981988, + "learning_rate": 9.996855633219605e-06, + "loss": 1.2665, + "step": 413 + }, + { + "epoch": 0.5538461538461539, + "grad_norm": 1.1816114564429636, + "learning_rate": 9.996687981205163e-06, + "loss": 1.5169, + "step": 414 + }, + { + "epoch": 0.5551839464882943, + "grad_norm": 1.1787252222018036, + "learning_rate": 9.996515976540974e-06, + "loss": 1.3434, + "step": 415 + }, + { + "epoch": 0.5565217391304348, + "grad_norm": 0.926080619202031, + "learning_rate": 9.996339619376876e-06, + "loss": 1.2244, + "step": 416 + }, + { + "epoch": 0.5578595317725753, + "grad_norm": 1.0605312035834678, + "learning_rate": 9.99615890986649e-06, + "loss": 1.3243, + "step": 417 + }, + { + "epoch": 0.5591973244147157, + "grad_norm": 1.0006472860332245, + "learning_rate": 9.995973848167234e-06, + "loss": 1.4952, + "step": 418 + }, + { + "epoch": 0.5605351170568562, + "grad_norm": 0.9551452938503893, + "learning_rate": 9.99578443444032e-06, + "loss": 1.2175, + "step": 419 + }, + { + "epoch": 0.5618729096989966, + "grad_norm": 1.1468159018040656, + "learning_rate": 9.995590668850745e-06, + "loss": 1.3423, + "step": 420 + }, + { + "epoch": 0.5632107023411371, + "grad_norm": 1.0935327819124128, + "learning_rate": 9.9953925515673e-06, + "loss": 1.3247, + "step": 421 + }, + { + "epoch": 0.5645484949832776, + "grad_norm": 1.0639779523372608, + "learning_rate": 9.995190082762566e-06, + "loss": 1.3689, + "step": 422 + }, + { + "epoch": 0.565886287625418, + "grad_norm": 1.132589496909307, + "learning_rate": 9.994983262612916e-06, + "loss": 1.2894, + "step": 423 + }, + { + "epoch": 0.5672240802675586, + "grad_norm": 1.2261564583370035, + "learning_rate": 9.99477209129851e-06, + "loss": 1.4269, + "step": 424 + }, + { + "epoch": 0.568561872909699, + "grad_norm": 1.15120233673902, + "learning_rate": 9.994556569003305e-06, + "loss": 1.4992, + "step": 425 + }, + { + "epoch": 0.5698996655518395, + "grad_norm": 1.1767859733054233, + "learning_rate": 9.994336695915041e-06, + "loss": 1.4299, + "step": 
426 + }, + { + "epoch": 0.57123745819398, + "grad_norm": 0.9217145196495236, + "learning_rate": 9.99411247222525e-06, + "loss": 1.1147, + "step": 427 + }, + { + "epoch": 0.5725752508361204, + "grad_norm": 1.1674796924560404, + "learning_rate": 9.993883898129259e-06, + "loss": 1.3883, + "step": 428 + }, + { + "epoch": 0.5739130434782609, + "grad_norm": 1.1205443690497903, + "learning_rate": 9.993650973826177e-06, + "loss": 1.3154, + "step": 429 + }, + { + "epoch": 0.5752508361204013, + "grad_norm": 1.229457937498444, + "learning_rate": 9.993413699518906e-06, + "loss": 1.3437, + "step": 430 + }, + { + "epoch": 0.5765886287625418, + "grad_norm": 1.1812222632116642, + "learning_rate": 9.99317207541414e-06, + "loss": 1.5636, + "step": 431 + }, + { + "epoch": 0.5779264214046823, + "grad_norm": 1.1892641991559765, + "learning_rate": 9.992926101722355e-06, + "loss": 1.3576, + "step": 432 + }, + { + "epoch": 0.5792642140468227, + "grad_norm": 1.1615910177307236, + "learning_rate": 9.992675778657824e-06, + "loss": 1.374, + "step": 433 + }, + { + "epoch": 0.5806020066889632, + "grad_norm": 0.9751237528409985, + "learning_rate": 9.992421106438606e-06, + "loss": 1.2839, + "step": 434 + }, + { + "epoch": 0.5819397993311036, + "grad_norm": 1.048157533636061, + "learning_rate": 9.992162085286543e-06, + "loss": 1.2566, + "step": 435 + }, + { + "epoch": 0.5832775919732441, + "grad_norm": 1.0081933603037847, + "learning_rate": 9.991898715427274e-06, + "loss": 1.2556, + "step": 436 + }, + { + "epoch": 0.5846153846153846, + "grad_norm": 1.1999429967074817, + "learning_rate": 9.991630997090222e-06, + "loss": 1.366, + "step": 437 + }, + { + "epoch": 0.585953177257525, + "grad_norm": 1.0203355344115692, + "learning_rate": 9.991358930508599e-06, + "loss": 1.1353, + "step": 438 + }, + { + "epoch": 0.5872909698996656, + "grad_norm": 1.0269535581553713, + "learning_rate": 9.991082515919402e-06, + "loss": 1.2899, + "step": 439 + }, + { + "epoch": 0.5886287625418061, + "grad_norm": 1.0253559685559788, + "learning_rate": 9.990801753563418e-06, + "loss": 1.4366, + "step": 440 + }, + { + "epoch": 0.5899665551839465, + "grad_norm": 1.1729941858237294, + "learning_rate": 9.990516643685222e-06, + "loss": 1.3545, + "step": 441 + }, + { + "epoch": 0.591304347826087, + "grad_norm": 0.9630379692770136, + "learning_rate": 9.990227186533174e-06, + "loss": 1.1949, + "step": 442 + }, + { + "epoch": 0.5926421404682274, + "grad_norm": 0.9907418734286796, + "learning_rate": 9.989933382359423e-06, + "loss": 1.2892, + "step": 443 + }, + { + "epoch": 0.5939799331103679, + "grad_norm": 1.0636681837548423, + "learning_rate": 9.989635231419903e-06, + "loss": 1.3391, + "step": 444 + }, + { + "epoch": 0.5953177257525084, + "grad_norm": 1.0553822413917073, + "learning_rate": 9.989332733974337e-06, + "loss": 1.1596, + "step": 445 + }, + { + "epoch": 0.5966555183946488, + "grad_norm": 1.1381240211859476, + "learning_rate": 9.989025890286233e-06, + "loss": 1.4715, + "step": 446 + }, + { + "epoch": 0.5979933110367893, + "grad_norm": 1.1960179450553898, + "learning_rate": 9.988714700622882e-06, + "loss": 1.2919, + "step": 447 + }, + { + "epoch": 0.5993311036789297, + "grad_norm": 0.9225606949682303, + "learning_rate": 9.988399165255365e-06, + "loss": 1.2626, + "step": 448 + }, + { + "epoch": 0.6006688963210702, + "grad_norm": 1.1804064863925816, + "learning_rate": 9.988079284458547e-06, + "loss": 1.2725, + "step": 449 + }, + { + "epoch": 0.6020066889632107, + "grad_norm": 1.1985983576362949, + "learning_rate": 9.987755058511079e-06, + "loss": 
1.2762, + "step": 450 + }, + { + "epoch": 0.6033444816053511, + "grad_norm": 1.0131240110883692, + "learning_rate": 9.987426487695396e-06, + "loss": 1.2179, + "step": 451 + }, + { + "epoch": 0.6046822742474917, + "grad_norm": 1.4829485252144634, + "learning_rate": 9.987093572297716e-06, + "loss": 1.3693, + "step": 452 + }, + { + "epoch": 0.6060200668896321, + "grad_norm": 1.2747451749489627, + "learning_rate": 9.986756312608048e-06, + "loss": 1.3304, + "step": 453 + }, + { + "epoch": 0.6073578595317726, + "grad_norm": 0.9179755325475105, + "learning_rate": 9.98641470892018e-06, + "loss": 1.2412, + "step": 454 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 1.0279237457942552, + "learning_rate": 9.986068761531681e-06, + "loss": 1.1253, + "step": 455 + }, + { + "epoch": 0.6100334448160535, + "grad_norm": 1.163279415519898, + "learning_rate": 9.985718470743916e-06, + "loss": 1.3655, + "step": 456 + }, + { + "epoch": 0.611371237458194, + "grad_norm": 0.9950219721597383, + "learning_rate": 9.985363836862021e-06, + "loss": 1.1779, + "step": 457 + }, + { + "epoch": 0.6127090301003344, + "grad_norm": 1.1101874749463359, + "learning_rate": 9.98500486019492e-06, + "loss": 1.4073, + "step": 458 + }, + { + "epoch": 0.6140468227424749, + "grad_norm": 1.1604325090714505, + "learning_rate": 9.98464154105532e-06, + "loss": 1.2796, + "step": 459 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 1.1000300751821481, + "learning_rate": 9.984273879759713e-06, + "loss": 1.2403, + "step": 460 + }, + { + "epoch": 0.6167224080267558, + "grad_norm": 0.9777475150242518, + "learning_rate": 9.983901876628369e-06, + "loss": 1.3506, + "step": 461 + }, + { + "epoch": 0.6180602006688963, + "grad_norm": 1.0330053472113943, + "learning_rate": 9.983525531985343e-06, + "loss": 1.2697, + "step": 462 + }, + { + "epoch": 0.6193979933110368, + "grad_norm": 1.2020931602983405, + "learning_rate": 9.983144846158472e-06, + "loss": 1.378, + "step": 463 + }, + { + "epoch": 0.6207357859531772, + "grad_norm": 1.1195897026791057, + "learning_rate": 9.982759819479375e-06, + "loss": 1.252, + "step": 464 + }, + { + "epoch": 0.6220735785953178, + "grad_norm": 1.235852835708849, + "learning_rate": 9.982370452283451e-06, + "loss": 1.4758, + "step": 465 + }, + { + "epoch": 0.6234113712374582, + "grad_norm": 1.0694458681284376, + "learning_rate": 9.981976744909878e-06, + "loss": 1.538, + "step": 466 + }, + { + "epoch": 0.6247491638795987, + "grad_norm": 1.065602324805708, + "learning_rate": 9.98157869770162e-06, + "loss": 1.5467, + "step": 467 + }, + { + "epoch": 0.6260869565217392, + "grad_norm": 0.9527364425283569, + "learning_rate": 9.981176311005419e-06, + "loss": 1.0447, + "step": 468 + }, + { + "epoch": 0.6274247491638796, + "grad_norm": 1.302699034009706, + "learning_rate": 9.980769585171795e-06, + "loss": 1.3239, + "step": 469 + }, + { + "epoch": 0.6287625418060201, + "grad_norm": 0.9907125326861103, + "learning_rate": 9.980358520555048e-06, + "loss": 1.409, + "step": 470 + }, + { + "epoch": 0.6301003344481605, + "grad_norm": 1.0971825913522288, + "learning_rate": 9.979943117513265e-06, + "loss": 1.3173, + "step": 471 + }, + { + "epoch": 0.631438127090301, + "grad_norm": 1.0043529030196814, + "learning_rate": 9.9795233764083e-06, + "loss": 1.3376, + "step": 472 + }, + { + "epoch": 0.6327759197324415, + "grad_norm": 1.052406983475307, + "learning_rate": 9.979099297605798e-06, + "loss": 1.3805, + "step": 473 + }, + { + "epoch": 0.6341137123745819, + "grad_norm": 1.0370824734907576, + "learning_rate": 9.978670881475173e-06, + 
"loss": 1.3802, + "step": 474 + }, + { + "epoch": 0.6354515050167224, + "grad_norm": 1.0531230329288617, + "learning_rate": 9.978238128389623e-06, + "loss": 1.2639, + "step": 475 + }, + { + "epoch": 0.6367892976588628, + "grad_norm": 0.9786967982883796, + "learning_rate": 9.977801038726123e-06, + "loss": 1.0538, + "step": 476 + }, + { + "epoch": 0.6381270903010033, + "grad_norm": 1.05167352707507, + "learning_rate": 9.977359612865424e-06, + "loss": 1.2906, + "step": 477 + }, + { + "epoch": 0.6394648829431439, + "grad_norm": 1.0901744798690292, + "learning_rate": 9.976913851192053e-06, + "loss": 1.2782, + "step": 478 + }, + { + "epoch": 0.6408026755852843, + "grad_norm": 1.0123797234136187, + "learning_rate": 9.976463754094321e-06, + "loss": 1.2375, + "step": 479 + }, + { + "epoch": 0.6421404682274248, + "grad_norm": 0.9984436755224965, + "learning_rate": 9.976009321964306e-06, + "loss": 1.1978, + "step": 480 + }, + { + "epoch": 0.6434782608695652, + "grad_norm": 1.0897978553174903, + "learning_rate": 9.97555055519787e-06, + "loss": 1.4029, + "step": 481 + }, + { + "epoch": 0.6448160535117057, + "grad_norm": 1.1707929350792625, + "learning_rate": 9.975087454194645e-06, + "loss": 1.4053, + "step": 482 + }, + { + "epoch": 0.6461538461538462, + "grad_norm": 1.3204845923359476, + "learning_rate": 9.974620019358046e-06, + "loss": 1.6028, + "step": 483 + }, + { + "epoch": 0.6474916387959866, + "grad_norm": 1.2089585910703213, + "learning_rate": 9.974148251095253e-06, + "loss": 1.28, + "step": 484 + }, + { + "epoch": 0.6488294314381271, + "grad_norm": 1.2533077935845505, + "learning_rate": 9.973672149817232e-06, + "loss": 1.2873, + "step": 485 + }, + { + "epoch": 0.6501672240802676, + "grad_norm": 0.9732708945018871, + "learning_rate": 9.973191715938715e-06, + "loss": 1.2073, + "step": 486 + }, + { + "epoch": 0.651505016722408, + "grad_norm": 1.0235984673096665, + "learning_rate": 9.972706949878212e-06, + "loss": 1.3958, + "step": 487 + }, + { + "epoch": 0.6528428093645485, + "grad_norm": 1.1207618354969364, + "learning_rate": 9.972217852058006e-06, + "loss": 1.3954, + "step": 488 + }, + { + "epoch": 0.6541806020066889, + "grad_norm": 0.9712501134730327, + "learning_rate": 9.971724422904154e-06, + "loss": 1.2134, + "step": 489 + }, + { + "epoch": 0.6555183946488294, + "grad_norm": 0.9622738504849393, + "learning_rate": 9.971226662846485e-06, + "loss": 1.1892, + "step": 490 + }, + { + "epoch": 0.65685618729097, + "grad_norm": 0.9695736496405435, + "learning_rate": 9.970724572318602e-06, + "loss": 1.2766, + "step": 491 + }, + { + "epoch": 0.6581939799331104, + "grad_norm": 1.2010044827868913, + "learning_rate": 9.97021815175788e-06, + "loss": 1.2962, + "step": 492 + }, + { + "epoch": 0.6595317725752509, + "grad_norm": 0.9163060879128939, + "learning_rate": 9.969707401605464e-06, + "loss": 1.4051, + "step": 493 + }, + { + "epoch": 0.6608695652173913, + "grad_norm": 0.9482197578926375, + "learning_rate": 9.969192322306271e-06, + "loss": 1.1303, + "step": 494 + }, + { + "epoch": 0.6622073578595318, + "grad_norm": 1.0056736828267627, + "learning_rate": 9.968672914308995e-06, + "loss": 1.3657, + "step": 495 + }, + { + "epoch": 0.6635451505016723, + "grad_norm": 1.0863969649540868, + "learning_rate": 9.96814917806609e-06, + "loss": 1.0947, + "step": 496 + }, + { + "epoch": 0.6648829431438127, + "grad_norm": 1.0498577935937559, + "learning_rate": 9.96762111403379e-06, + "loss": 1.3732, + "step": 497 + }, + { + "epoch": 0.6662207357859532, + "grad_norm": 1.015530395327257, + "learning_rate": 
9.967088722672094e-06, + "loss": 1.3385, + "step": 498 + }, + { + "epoch": 0.6675585284280936, + "grad_norm": 1.0506291571613753, + "learning_rate": 9.966552004444772e-06, + "loss": 1.3236, + "step": 499 + }, + { + "epoch": 0.6688963210702341, + "grad_norm": 0.9059203087142448, + "learning_rate": 9.966010959819363e-06, + "loss": 1.2673, + "step": 500 + }, + { + "epoch": 0.6702341137123746, + "grad_norm": 1.0086794362547984, + "learning_rate": 9.965465589267176e-06, + "loss": 1.3319, + "step": 501 + }, + { + "epoch": 0.671571906354515, + "grad_norm": 0.9439227689571684, + "learning_rate": 9.964915893263285e-06, + "loss": 1.228, + "step": 502 + }, + { + "epoch": 0.6729096989966555, + "grad_norm": 0.8913715199281573, + "learning_rate": 9.964361872286534e-06, + "loss": 1.0525, + "step": 503 + }, + { + "epoch": 0.6742474916387959, + "grad_norm": 1.0417089711771321, + "learning_rate": 9.963803526819537e-06, + "loss": 1.2443, + "step": 504 + }, + { + "epoch": 0.6755852842809364, + "grad_norm": 1.254409894006796, + "learning_rate": 9.963240857348671e-06, + "loss": 1.3699, + "step": 505 + }, + { + "epoch": 0.676923076923077, + "grad_norm": 1.0316992280719044, + "learning_rate": 9.962673864364081e-06, + "loss": 1.1456, + "step": 506 + }, + { + "epoch": 0.6782608695652174, + "grad_norm": 0.859208393751457, + "learning_rate": 9.96210254835968e-06, + "loss": 1.3191, + "step": 507 + }, + { + "epoch": 0.6795986622073579, + "grad_norm": 0.9206981735733913, + "learning_rate": 9.961526909833143e-06, + "loss": 1.1054, + "step": 508 + }, + { + "epoch": 0.6809364548494983, + "grad_norm": 1.310073214825605, + "learning_rate": 9.960946949285915e-06, + "loss": 1.3174, + "step": 509 + }, + { + "epoch": 0.6822742474916388, + "grad_norm": 0.9749073979279647, + "learning_rate": 9.960362667223202e-06, + "loss": 1.2621, + "step": 510 + }, + { + "epoch": 0.6836120401337793, + "grad_norm": 1.1336341276843758, + "learning_rate": 9.959774064153977e-06, + "loss": 1.1527, + "step": 511 + }, + { + "epoch": 0.6849498327759197, + "grad_norm": 0.9222503225780725, + "learning_rate": 9.959181140590977e-06, + "loss": 1.0212, + "step": 512 + }, + { + "epoch": 0.6862876254180602, + "grad_norm": 0.9756159369666614, + "learning_rate": 9.9585838970507e-06, + "loss": 1.236, + "step": 513 + }, + { + "epoch": 0.6876254180602007, + "grad_norm": 1.2181382484229888, + "learning_rate": 9.95798233405341e-06, + "loss": 1.376, + "step": 514 + }, + { + "epoch": 0.6889632107023411, + "grad_norm": 1.098343352999825, + "learning_rate": 9.957376452123133e-06, + "loss": 1.0681, + "step": 515 + }, + { + "epoch": 0.6903010033444816, + "grad_norm": 1.003418365013215, + "learning_rate": 9.956766251787657e-06, + "loss": 1.3253, + "step": 516 + }, + { + "epoch": 0.691638795986622, + "grad_norm": 1.0429214404501754, + "learning_rate": 9.956151733578533e-06, + "loss": 1.3839, + "step": 517 + }, + { + "epoch": 0.6929765886287625, + "grad_norm": 1.1508278594848527, + "learning_rate": 9.955532898031069e-06, + "loss": 1.3728, + "step": 518 + }, + { + "epoch": 0.6943143812709031, + "grad_norm": 1.1259204928507576, + "learning_rate": 9.954909745684339e-06, + "loss": 1.5027, + "step": 519 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 1.0376522315177514, + "learning_rate": 9.954282277081173e-06, + "loss": 1.3145, + "step": 520 + }, + { + "epoch": 0.696989966555184, + "grad_norm": 0.9115171801166071, + "learning_rate": 9.953650492768167e-06, + "loss": 1.2695, + "step": 521 + }, + { + "epoch": 0.6983277591973244, + "grad_norm": 1.0140828490643332, + 
"learning_rate": 9.95301439329567e-06, + "loss": 1.2478, + "step": 522 + }, + { + "epoch": 0.6996655518394649, + "grad_norm": 1.1750909653360533, + "learning_rate": 9.952373979217795e-06, + "loss": 1.3377, + "step": 523 + }, + { + "epoch": 0.7010033444816054, + "grad_norm": 0.9948984306922928, + "learning_rate": 9.951729251092408e-06, + "loss": 1.2224, + "step": 524 + }, + { + "epoch": 0.7023411371237458, + "grad_norm": 0.9756589723986113, + "learning_rate": 9.951080209481138e-06, + "loss": 1.5203, + "step": 525 + }, + { + "epoch": 0.7036789297658863, + "grad_norm": 1.3629022247231284, + "learning_rate": 9.950426854949371e-06, + "loss": 1.2922, + "step": 526 + }, + { + "epoch": 0.7050167224080267, + "grad_norm": 1.0481841629235165, + "learning_rate": 9.949769188066247e-06, + "loss": 1.5235, + "step": 527 + }, + { + "epoch": 0.7063545150501672, + "grad_norm": 1.029435505481308, + "learning_rate": 9.949107209404664e-06, + "loss": 1.6635, + "step": 528 + }, + { + "epoch": 0.7076923076923077, + "grad_norm": 1.2525223292758443, + "learning_rate": 9.948440919541277e-06, + "loss": 1.4505, + "step": 529 + }, + { + "epoch": 0.7090301003344481, + "grad_norm": 1.3312368668433607, + "learning_rate": 9.947770319056496e-06, + "loss": 1.2059, + "step": 530 + }, + { + "epoch": 0.7103678929765886, + "grad_norm": 1.5297250169624774, + "learning_rate": 9.947095408534483e-06, + "loss": 1.3333, + "step": 531 + }, + { + "epoch": 0.711705685618729, + "grad_norm": 1.2483245104540257, + "learning_rate": 9.946416188563163e-06, + "loss": 1.2722, + "step": 532 + }, + { + "epoch": 0.7130434782608696, + "grad_norm": 1.2893681812039872, + "learning_rate": 9.945732659734204e-06, + "loss": 1.5186, + "step": 533 + }, + { + "epoch": 0.7143812709030101, + "grad_norm": 1.0213163650509205, + "learning_rate": 9.945044822643033e-06, + "loss": 1.4491, + "step": 534 + }, + { + "epoch": 0.7157190635451505, + "grad_norm": 1.1222498792315954, + "learning_rate": 9.944352677888833e-06, + "loss": 1.3213, + "step": 535 + }, + { + "epoch": 0.717056856187291, + "grad_norm": 0.9644540270199444, + "learning_rate": 9.943656226074534e-06, + "loss": 1.284, + "step": 536 + }, + { + "epoch": 0.7183946488294315, + "grad_norm": 1.0854925438590666, + "learning_rate": 9.94295546780682e-06, + "loss": 1.3287, + "step": 537 + }, + { + "epoch": 0.7197324414715719, + "grad_norm": 1.4152678476705796, + "learning_rate": 9.942250403696126e-06, + "loss": 1.3362, + "step": 538 + }, + { + "epoch": 0.7210702341137124, + "grad_norm": 0.9527637019227764, + "learning_rate": 9.94154103435664e-06, + "loss": 1.3821, + "step": 539 + }, + { + "epoch": 0.7224080267558528, + "grad_norm": 0.9399869468868102, + "learning_rate": 9.940827360406297e-06, + "loss": 1.3431, + "step": 540 + }, + { + "epoch": 0.7237458193979933, + "grad_norm": 1.0207544425211497, + "learning_rate": 9.940109382466785e-06, + "loss": 1.2374, + "step": 541 + }, + { + "epoch": 0.7250836120401338, + "grad_norm": 1.0431620775408554, + "learning_rate": 9.939387101163538e-06, + "loss": 1.2428, + "step": 542 + }, + { + "epoch": 0.7264214046822742, + "grad_norm": 0.8681608289273625, + "learning_rate": 9.93866051712574e-06, + "loss": 1.194, + "step": 543 + }, + { + "epoch": 0.7277591973244147, + "grad_norm": 1.0682156658430866, + "learning_rate": 9.937929630986324e-06, + "loss": 1.3021, + "step": 544 + }, + { + "epoch": 0.7290969899665551, + "grad_norm": 0.9564341892592291, + "learning_rate": 9.937194443381972e-06, + "loss": 1.1525, + "step": 545 + }, + { + "epoch": 0.7304347826086957, + "grad_norm": 
1.2357756828227056, + "learning_rate": 9.936454954953108e-06, + "loss": 1.4221, + "step": 546 + }, + { + "epoch": 0.7317725752508362, + "grad_norm": 1.0600656411238274, + "learning_rate": 9.935711166343909e-06, + "loss": 1.3114, + "step": 547 + }, + { + "epoch": 0.7331103678929766, + "grad_norm": 0.9192223056142138, + "learning_rate": 9.934963078202289e-06, + "loss": 1.0569, + "step": 548 + }, + { + "epoch": 0.7344481605351171, + "grad_norm": 1.0721879437110524, + "learning_rate": 9.934210691179918e-06, + "loss": 1.3024, + "step": 549 + }, + { + "epoch": 0.7357859531772575, + "grad_norm": 0.9665252701120022, + "learning_rate": 9.933454005932204e-06, + "loss": 1.267, + "step": 550 + }, + { + "epoch": 0.737123745819398, + "grad_norm": 0.8891978309716932, + "learning_rate": 9.932693023118299e-06, + "loss": 1.2008, + "step": 551 + }, + { + "epoch": 0.7384615384615385, + "grad_norm": 0.9395848392915587, + "learning_rate": 9.931927743401102e-06, + "loss": 1.3322, + "step": 552 + }, + { + "epoch": 0.7397993311036789, + "grad_norm": 1.1836899988845726, + "learning_rate": 9.931158167447254e-06, + "loss": 1.3947, + "step": 553 + }, + { + "epoch": 0.7411371237458194, + "grad_norm": 1.027991988078457, + "learning_rate": 9.930384295927137e-06, + "loss": 1.4411, + "step": 554 + }, + { + "epoch": 0.7424749163879598, + "grad_norm": 0.9635784293406117, + "learning_rate": 9.929606129514875e-06, + "loss": 1.2771, + "step": 555 + }, + { + "epoch": 0.7438127090301003, + "grad_norm": 0.8807495788321378, + "learning_rate": 9.928823668888337e-06, + "loss": 1.1726, + "step": 556 + }, + { + "epoch": 0.7451505016722408, + "grad_norm": 0.9808281945327478, + "learning_rate": 9.928036914729129e-06, + "loss": 1.3806, + "step": 557 + }, + { + "epoch": 0.7464882943143812, + "grad_norm": 1.053309084680161, + "learning_rate": 9.927245867722596e-06, + "loss": 1.3468, + "step": 558 + }, + { + "epoch": 0.7478260869565218, + "grad_norm": 1.1810484750244898, + "learning_rate": 9.926450528557828e-06, + "loss": 1.3709, + "step": 559 + }, + { + "epoch": 0.7491638795986622, + "grad_norm": 1.0856262773041774, + "learning_rate": 9.925650897927646e-06, + "loss": 1.2514, + "step": 560 + }, + { + "epoch": 0.7505016722408027, + "grad_norm": 1.0142329444196578, + "learning_rate": 9.924846976528618e-06, + "loss": 1.2385, + "step": 561 + }, + { + "epoch": 0.7518394648829432, + "grad_norm": 0.9340905381910848, + "learning_rate": 9.924038765061042e-06, + "loss": 1.241, + "step": 562 + }, + { + "epoch": 0.7531772575250836, + "grad_norm": 0.9697533733473317, + "learning_rate": 9.923226264228958e-06, + "loss": 1.1214, + "step": 563 + }, + { + "epoch": 0.7545150501672241, + "grad_norm": 1.0810175940854412, + "learning_rate": 9.922409474740142e-06, + "loss": 1.3351, + "step": 564 + }, + { + "epoch": 0.7558528428093646, + "grad_norm": 1.0171360124073874, + "learning_rate": 9.921588397306105e-06, + "loss": 1.1633, + "step": 565 + }, + { + "epoch": 0.757190635451505, + "grad_norm": 0.9859051779097874, + "learning_rate": 9.920763032642094e-06, + "loss": 1.2517, + "step": 566 + }, + { + "epoch": 0.7585284280936455, + "grad_norm": 1.2635841749285441, + "learning_rate": 9.919933381467088e-06, + "loss": 1.3028, + "step": 567 + }, + { + "epoch": 0.7598662207357859, + "grad_norm": 0.9178962102311397, + "learning_rate": 9.919099444503804e-06, + "loss": 1.1243, + "step": 568 + }, + { + "epoch": 0.7612040133779264, + "grad_norm": 1.0907287478737506, + "learning_rate": 9.918261222478687e-06, + "loss": 1.296, + "step": 569 + }, + { + "epoch": 0.7625418060200669, 
+ "grad_norm": 1.0909807257520592, + "learning_rate": 9.91741871612192e-06, + "loss": 1.1316, + "step": 570 + }, + { + "epoch": 0.7638795986622073, + "grad_norm": 0.9060898567643482, + "learning_rate": 9.916571926167417e-06, + "loss": 1.2373, + "step": 571 + }, + { + "epoch": 0.7652173913043478, + "grad_norm": 0.8626260696037537, + "learning_rate": 9.915720853352821e-06, + "loss": 1.0971, + "step": 572 + }, + { + "epoch": 0.7665551839464882, + "grad_norm": 1.2134291951770706, + "learning_rate": 9.91486549841951e-06, + "loss": 1.2124, + "step": 573 + }, + { + "epoch": 0.7678929765886288, + "grad_norm": 0.9931356431517179, + "learning_rate": 9.914005862112587e-06, + "loss": 1.3749, + "step": 574 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.9759813468921607, + "learning_rate": 9.913141945180888e-06, + "loss": 1.2687, + "step": 575 + }, + { + "epoch": 0.7705685618729097, + "grad_norm": 0.9278527248764286, + "learning_rate": 9.912273748376976e-06, + "loss": 1.3477, + "step": 576 + }, + { + "epoch": 0.7719063545150502, + "grad_norm": 1.171956920926958, + "learning_rate": 9.911401272457145e-06, + "loss": 1.229, + "step": 577 + }, + { + "epoch": 0.7732441471571906, + "grad_norm": 1.1168088933650642, + "learning_rate": 9.910524518181416e-06, + "loss": 1.3586, + "step": 578 + }, + { + "epoch": 0.7745819397993311, + "grad_norm": 1.0213367406262686, + "learning_rate": 9.909643486313533e-06, + "loss": 1.3034, + "step": 579 + }, + { + "epoch": 0.7759197324414716, + "grad_norm": 0.957281353689765, + "learning_rate": 9.908758177620972e-06, + "loss": 1.3214, + "step": 580 + }, + { + "epoch": 0.777257525083612, + "grad_norm": 1.0495271038288003, + "learning_rate": 9.907868592874927e-06, + "loss": 1.1207, + "step": 581 + }, + { + "epoch": 0.7785953177257525, + "grad_norm": 1.1068542459169866, + "learning_rate": 9.906974732850327e-06, + "loss": 1.284, + "step": 582 + }, + { + "epoch": 0.7799331103678929, + "grad_norm": 1.15604514645504, + "learning_rate": 9.906076598325815e-06, + "loss": 1.275, + "step": 583 + }, + { + "epoch": 0.7812709030100334, + "grad_norm": 1.1351017559908314, + "learning_rate": 9.905174190083763e-06, + "loss": 1.3985, + "step": 584 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 1.005481361144937, + "learning_rate": 9.904267508910269e-06, + "loss": 1.2334, + "step": 585 + }, + { + "epoch": 0.7839464882943143, + "grad_norm": 0.9982761421625572, + "learning_rate": 9.903356555595143e-06, + "loss": 1.3486, + "step": 586 + }, + { + "epoch": 0.7852842809364549, + "grad_norm": 1.1584856839837012, + "learning_rate": 9.90244133093193e-06, + "loss": 1.318, + "step": 587 + }, + { + "epoch": 0.7866220735785954, + "grad_norm": 1.114455775051334, + "learning_rate": 9.901521835717882e-06, + "loss": 1.0816, + "step": 588 + }, + { + "epoch": 0.7879598662207358, + "grad_norm": 1.099730783963084, + "learning_rate": 9.900598070753981e-06, + "loss": 1.1339, + "step": 589 + }, + { + "epoch": 0.7892976588628763, + "grad_norm": 0.8557475165868178, + "learning_rate": 9.899670036844926e-06, + "loss": 1.0908, + "step": 590 + }, + { + "epoch": 0.7906354515050167, + "grad_norm": 0.9651745852981312, + "learning_rate": 9.898737734799134e-06, + "loss": 1.3149, + "step": 591 + }, + { + "epoch": 0.7919732441471572, + "grad_norm": 0.9378406708378495, + "learning_rate": 9.897801165428736e-06, + "loss": 1.1758, + "step": 592 + }, + { + "epoch": 0.7933110367892977, + "grad_norm": 1.2017351924217488, + "learning_rate": 9.896860329549585e-06, + "loss": 1.3506, + "step": 593 + }, + { + "epoch": 
0.7946488294314381, + "grad_norm": 1.1994876150548386, + "learning_rate": 9.895915227981254e-06, + "loss": 1.4715, + "step": 594 + }, + { + "epoch": 0.7959866220735786, + "grad_norm": 1.0615493353435002, + "learning_rate": 9.894965861547023e-06, + "loss": 1.3893, + "step": 595 + }, + { + "epoch": 0.797324414715719, + "grad_norm": 1.130331703835116, + "learning_rate": 9.894012231073895e-06, + "loss": 1.2918, + "step": 596 + }, + { + "epoch": 0.7986622073578595, + "grad_norm": 0.9920141126606498, + "learning_rate": 9.89305433739258e-06, + "loss": 1.2065, + "step": 597 + }, + { + "epoch": 0.8, + "grad_norm": 1.3670023229764092, + "learning_rate": 9.892092181337512e-06, + "loss": 1.1605, + "step": 598 + }, + { + "epoch": 0.8013377926421404, + "grad_norm": 1.0035506638749367, + "learning_rate": 9.891125763746824e-06, + "loss": 1.0728, + "step": 599 + }, + { + "epoch": 0.802675585284281, + "grad_norm": 1.004908750054442, + "learning_rate": 9.890155085462376e-06, + "loss": 1.511, + "step": 600 + }, + { + "epoch": 0.8040133779264214, + "grad_norm": 0.934315090840971, + "learning_rate": 9.889180147329731e-06, + "loss": 1.3682, + "step": 601 + }, + { + "epoch": 0.8053511705685619, + "grad_norm": 1.160299776766074, + "learning_rate": 9.888200950198162e-06, + "loss": 1.2191, + "step": 602 + }, + { + "epoch": 0.8066889632107024, + "grad_norm": 1.0987177481020471, + "learning_rate": 9.887217494920655e-06, + "loss": 1.3398, + "step": 603 + }, + { + "epoch": 0.8080267558528428, + "grad_norm": 1.0402474858408706, + "learning_rate": 9.886229782353907e-06, + "loss": 1.1631, + "step": 604 + }, + { + "epoch": 0.8093645484949833, + "grad_norm": 1.0923792374313679, + "learning_rate": 9.88523781335832e-06, + "loss": 1.2853, + "step": 605 + }, + { + "epoch": 0.8107023411371237, + "grad_norm": 1.045848903163111, + "learning_rate": 9.884241588798004e-06, + "loss": 1.3018, + "step": 606 + }, + { + "epoch": 0.8120401337792642, + "grad_norm": 0.8977571466137961, + "learning_rate": 9.88324110954078e-06, + "loss": 1.2543, + "step": 607 + }, + { + "epoch": 0.8133779264214047, + "grad_norm": 0.8619902171479032, + "learning_rate": 9.88223637645817e-06, + "loss": 1.4028, + "step": 608 + }, + { + "epoch": 0.8147157190635451, + "grad_norm": 0.9049490352037889, + "learning_rate": 9.881227390425404e-06, + "loss": 1.2081, + "step": 609 + }, + { + "epoch": 0.8160535117056856, + "grad_norm": 0.9681660157051899, + "learning_rate": 9.880214152321417e-06, + "loss": 1.365, + "step": 610 + }, + { + "epoch": 0.8173913043478261, + "grad_norm": 1.1357486402380326, + "learning_rate": 9.879196663028847e-06, + "loss": 1.3814, + "step": 611 + }, + { + "epoch": 0.8187290969899665, + "grad_norm": 1.0227849025626665, + "learning_rate": 9.87817492343404e-06, + "loss": 1.5482, + "step": 612 + }, + { + "epoch": 0.820066889632107, + "grad_norm": 1.084049451110017, + "learning_rate": 9.877148934427037e-06, + "loss": 1.2472, + "step": 613 + }, + { + "epoch": 0.8214046822742475, + "grad_norm": 0.8851485732820816, + "learning_rate": 9.876118696901585e-06, + "loss": 1.1615, + "step": 614 + }, + { + "epoch": 0.822742474916388, + "grad_norm": 1.1079037586998346, + "learning_rate": 9.875084211755127e-06, + "loss": 1.5063, + "step": 615 + }, + { + "epoch": 0.8240802675585285, + "grad_norm": 1.002297488490659, + "learning_rate": 9.874045479888819e-06, + "loss": 1.3397, + "step": 616 + }, + { + "epoch": 0.8254180602006689, + "grad_norm": 1.0375910517711284, + "learning_rate": 9.873002502207502e-06, + "loss": 1.4683, + "step": 617 + }, + { + "epoch": 
0.8267558528428094, + "grad_norm": 1.452735502198677, + "learning_rate": 9.871955279619721e-06, + "loss": 1.3028, + "step": 618 + }, + { + "epoch": 0.8280936454849498, + "grad_norm": 1.0218369464655834, + "learning_rate": 9.87090381303772e-06, + "loss": 1.3315, + "step": 619 + }, + { + "epoch": 0.8294314381270903, + "grad_norm": 1.1672918323728783, + "learning_rate": 9.86984810337744e-06, + "loss": 1.5013, + "step": 620 + }, + { + "epoch": 0.8307692307692308, + "grad_norm": 1.0803161519420936, + "learning_rate": 9.868788151558513e-06, + "loss": 1.2891, + "step": 621 + }, + { + "epoch": 0.8321070234113712, + "grad_norm": 1.139627054789444, + "learning_rate": 9.867723958504275e-06, + "loss": 1.3427, + "step": 622 + }, + { + "epoch": 0.8334448160535117, + "grad_norm": 0.9487483555655597, + "learning_rate": 9.86665552514175e-06, + "loss": 1.3179, + "step": 623 + }, + { + "epoch": 0.8347826086956521, + "grad_norm": 1.0666768665767643, + "learning_rate": 9.865582852401659e-06, + "loss": 1.3598, + "step": 624 + }, + { + "epoch": 0.8361204013377926, + "grad_norm": 0.8370961723005313, + "learning_rate": 9.86450594121841e-06, + "loss": 1.0332, + "step": 625 + }, + { + "epoch": 0.8374581939799332, + "grad_norm": 1.2333255320216736, + "learning_rate": 9.863424792530114e-06, + "loss": 1.2916, + "step": 626 + }, + { + "epoch": 0.8387959866220736, + "grad_norm": 0.9384107305615497, + "learning_rate": 9.862339407278564e-06, + "loss": 1.1353, + "step": 627 + }, + { + "epoch": 0.8401337792642141, + "grad_norm": 1.0343072650262393, + "learning_rate": 9.861249786409248e-06, + "loss": 1.1969, + "step": 628 + }, + { + "epoch": 0.8414715719063545, + "grad_norm": 1.2442990175067865, + "learning_rate": 9.860155930871341e-06, + "loss": 1.457, + "step": 629 + }, + { + "epoch": 0.842809364548495, + "grad_norm": 1.1058298785134566, + "learning_rate": 9.859057841617709e-06, + "loss": 1.1552, + "step": 630 + }, + { + "epoch": 0.8441471571906355, + "grad_norm": 1.0493576642485076, + "learning_rate": 9.857955519604906e-06, + "loss": 1.4781, + "step": 631 + }, + { + "epoch": 0.8454849498327759, + "grad_norm": 1.229717984984398, + "learning_rate": 9.856848965793168e-06, + "loss": 1.2105, + "step": 632 + }, + { + "epoch": 0.8468227424749164, + "grad_norm": 1.161281640489081, + "learning_rate": 9.855738181146427e-06, + "loss": 1.2858, + "step": 633 + }, + { + "epoch": 0.8481605351170568, + "grad_norm": 1.0295030296630892, + "learning_rate": 9.854623166632296e-06, + "loss": 1.2835, + "step": 634 + }, + { + "epoch": 0.8494983277591973, + "grad_norm": 0.9335370517029412, + "learning_rate": 9.853503923222066e-06, + "loss": 1.5089, + "step": 635 + }, + { + "epoch": 0.8508361204013378, + "grad_norm": 1.0427256956430393, + "learning_rate": 9.852380451890723e-06, + "loss": 1.3125, + "step": 636 + }, + { + "epoch": 0.8521739130434782, + "grad_norm": 1.0317803360765194, + "learning_rate": 9.851252753616928e-06, + "loss": 1.1397, + "step": 637 + }, + { + "epoch": 0.8535117056856187, + "grad_norm": 1.062434027736285, + "learning_rate": 9.850120829383027e-06, + "loss": 1.2692, + "step": 638 + }, + { + "epoch": 0.8548494983277592, + "grad_norm": 1.1789277679608807, + "learning_rate": 9.848984680175049e-06, + "loss": 1.3828, + "step": 639 + }, + { + "epoch": 0.8561872909698997, + "grad_norm": 1.046997439273385, + "learning_rate": 9.847844306982698e-06, + "loss": 1.2104, + "step": 640 + }, + { + "epoch": 0.8575250836120402, + "grad_norm": 0.8172308685243332, + "learning_rate": 9.846699710799365e-06, + "loss": 1.2598, + "step": 641 + }, + { + 
"epoch": 0.8588628762541806, + "grad_norm": 1.0994813920696158, + "learning_rate": 9.845550892622113e-06, + "loss": 1.4065, + "step": 642 + }, + { + "epoch": 0.8602006688963211, + "grad_norm": 1.3094394021575002, + "learning_rate": 9.844397853451687e-06, + "loss": 1.3789, + "step": 643 + }, + { + "epoch": 0.8615384615384616, + "grad_norm": 1.021856888389773, + "learning_rate": 9.843240594292507e-06, + "loss": 1.0711, + "step": 644 + }, + { + "epoch": 0.862876254180602, + "grad_norm": 1.3033824948608186, + "learning_rate": 9.84207911615267e-06, + "loss": 1.2091, + "step": 645 + }, + { + "epoch": 0.8642140468227425, + "grad_norm": 1.0745673029749538, + "learning_rate": 9.840913420043945e-06, + "loss": 1.5013, + "step": 646 + }, + { + "epoch": 0.8655518394648829, + "grad_norm": 0.9297027526909145, + "learning_rate": 9.839743506981783e-06, + "loss": 1.2537, + "step": 647 + }, + { + "epoch": 0.8668896321070234, + "grad_norm": 0.9981553814788461, + "learning_rate": 9.838569377985298e-06, + "loss": 1.3096, + "step": 648 + }, + { + "epoch": 0.8682274247491639, + "grad_norm": 0.9918289920076159, + "learning_rate": 9.837391034077286e-06, + "loss": 1.2783, + "step": 649 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 0.9993028216317441, + "learning_rate": 9.836208476284208e-06, + "loss": 1.3, + "step": 650 + }, + { + "epoch": 0.8709030100334448, + "grad_norm": 0.9605914397329968, + "learning_rate": 9.835021705636201e-06, + "loss": 1.1916, + "step": 651 + }, + { + "epoch": 0.8722408026755852, + "grad_norm": 1.1224688447704763, + "learning_rate": 9.833830723167067e-06, + "loss": 1.4842, + "step": 652 + }, + { + "epoch": 0.8735785953177257, + "grad_norm": 1.0272375569718981, + "learning_rate": 9.83263552991428e-06, + "loss": 1.2653, + "step": 653 + }, + { + "epoch": 0.8749163879598663, + "grad_norm": 1.0379913153866371, + "learning_rate": 9.83143612691898e-06, + "loss": 1.4023, + "step": 654 + }, + { + "epoch": 0.8762541806020067, + "grad_norm": 0.9466840145479339, + "learning_rate": 9.830232515225976e-06, + "loss": 1.2231, + "step": 655 + }, + { + "epoch": 0.8775919732441472, + "grad_norm": 1.1166113879991955, + "learning_rate": 9.829024695883746e-06, + "loss": 1.34, + "step": 656 + }, + { + "epoch": 0.8789297658862876, + "grad_norm": 1.078204110296972, + "learning_rate": 9.827812669944423e-06, + "loss": 1.1903, + "step": 657 + }, + { + "epoch": 0.8802675585284281, + "grad_norm": 0.9906702803257798, + "learning_rate": 9.826596438463818e-06, + "loss": 1.3139, + "step": 658 + }, + { + "epoch": 0.8816053511705686, + "grad_norm": 0.9515594213507171, + "learning_rate": 9.825376002501393e-06, + "loss": 1.3307, + "step": 659 + }, + { + "epoch": 0.882943143812709, + "grad_norm": 1.0756286959720103, + "learning_rate": 9.824151363120283e-06, + "loss": 1.3487, + "step": 660 + }, + { + "epoch": 0.8842809364548495, + "grad_norm": 1.1464604655313775, + "learning_rate": 9.822922521387277e-06, + "loss": 1.1596, + "step": 661 + }, + { + "epoch": 0.88561872909699, + "grad_norm": 0.9645177378370682, + "learning_rate": 9.821689478372827e-06, + "loss": 1.2449, + "step": 662 + }, + { + "epoch": 0.8869565217391304, + "grad_norm": 1.0261685074375548, + "learning_rate": 9.82045223515105e-06, + "loss": 1.2088, + "step": 663 + }, + { + "epoch": 0.8882943143812709, + "grad_norm": 1.1042621704002435, + "learning_rate": 9.819210792799711e-06, + "loss": 1.3256, + "step": 664 + }, + { + "epoch": 0.8896321070234113, + "grad_norm": 1.1702095796584522, + "learning_rate": 9.817965152400244e-06, + "loss": 1.3725, + "step": 665 + }, 
+ { + "epoch": 0.8909698996655518, + "grad_norm": 1.026255648136877, + "learning_rate": 9.816715315037733e-06, + "loss": 1.4135, + "step": 666 + }, + { + "epoch": 0.8923076923076924, + "grad_norm": 0.9307706239139668, + "learning_rate": 9.815461281800917e-06, + "loss": 1.4073, + "step": 667 + }, + { + "epoch": 0.8936454849498328, + "grad_norm": 0.9587787631135158, + "learning_rate": 9.814203053782201e-06, + "loss": 1.2218, + "step": 668 + }, + { + "epoch": 0.8949832775919733, + "grad_norm": 1.0910511038929538, + "learning_rate": 9.812940632077629e-06, + "loss": 1.1341, + "step": 669 + }, + { + "epoch": 0.8963210702341137, + "grad_norm": 1.1854104448944038, + "learning_rate": 9.811674017786908e-06, + "loss": 1.3252, + "step": 670 + }, + { + "epoch": 0.8976588628762542, + "grad_norm": 0.997738066944622, + "learning_rate": 9.810403212013395e-06, + "loss": 1.1927, + "step": 671 + }, + { + "epoch": 0.8989966555183947, + "grad_norm": 1.0481368030578422, + "learning_rate": 9.809128215864096e-06, + "loss": 1.2145, + "step": 672 + }, + { + "epoch": 0.9003344481605351, + "grad_norm": 1.1791451088608362, + "learning_rate": 9.807849030449671e-06, + "loss": 1.4395, + "step": 673 + }, + { + "epoch": 0.9016722408026756, + "grad_norm": 1.3288111858928215, + "learning_rate": 9.806565656884426e-06, + "loss": 1.1313, + "step": 674 + }, + { + "epoch": 0.903010033444816, + "grad_norm": 1.0771637279576842, + "learning_rate": 9.805278096286318e-06, + "loss": 1.2143, + "step": 675 + }, + { + "epoch": 0.9043478260869565, + "grad_norm": 0.9230530154095179, + "learning_rate": 9.803986349776948e-06, + "loss": 1.3059, + "step": 676 + }, + { + "epoch": 0.905685618729097, + "grad_norm": 1.0555479423725032, + "learning_rate": 9.802690418481569e-06, + "loss": 1.2886, + "step": 677 + }, + { + "epoch": 0.9070234113712374, + "grad_norm": 1.2226126787171283, + "learning_rate": 9.80139030352907e-06, + "loss": 1.1859, + "step": 678 + }, + { + "epoch": 0.9083612040133779, + "grad_norm": 1.2773920690064966, + "learning_rate": 9.800086006051996e-06, + "loss": 1.1973, + "step": 679 + }, + { + "epoch": 0.9096989966555183, + "grad_norm": 1.1327921504350962, + "learning_rate": 9.798777527186527e-06, + "loss": 1.2421, + "step": 680 + }, + { + "epoch": 0.9110367892976589, + "grad_norm": 1.0493454890770435, + "learning_rate": 9.797464868072489e-06, + "loss": 1.1988, + "step": 681 + }, + { + "epoch": 0.9123745819397994, + "grad_norm": 0.9255996360200152, + "learning_rate": 9.796148029853345e-06, + "loss": 1.1988, + "step": 682 + }, + { + "epoch": 0.9137123745819398, + "grad_norm": 0.9635963716283334, + "learning_rate": 9.794827013676206e-06, + "loss": 1.2107, + "step": 683 + }, + { + "epoch": 0.9150501672240803, + "grad_norm": 0.9714677884037952, + "learning_rate": 9.793501820691818e-06, + "loss": 1.2714, + "step": 684 + }, + { + "epoch": 0.9163879598662207, + "grad_norm": 1.0284214973380343, + "learning_rate": 9.792172452054565e-06, + "loss": 1.3462, + "step": 685 + }, + { + "epoch": 0.9177257525083612, + "grad_norm": 1.4111670082549888, + "learning_rate": 9.790838908922468e-06, + "loss": 1.4893, + "step": 686 + }, + { + "epoch": 0.9190635451505017, + "grad_norm": 1.1986986260878063, + "learning_rate": 9.789501192457188e-06, + "loss": 1.4193, + "step": 687 + }, + { + "epoch": 0.9204013377926421, + "grad_norm": 1.0477056202272648, + "learning_rate": 9.788159303824018e-06, + "loss": 1.2951, + "step": 688 + }, + { + "epoch": 0.9217391304347826, + "grad_norm": 0.9780440581785511, + "learning_rate": 9.786813244191885e-06, + "loss": 1.5407, + 
"step": 689 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 1.031524293811604, + "learning_rate": 9.785463014733356e-06, + "loss": 1.3034, + "step": 690 + }, + { + "epoch": 0.9244147157190635, + "grad_norm": 0.8713182699597491, + "learning_rate": 9.78410861662462e-06, + "loss": 1.0373, + "step": 691 + }, + { + "epoch": 0.925752508361204, + "grad_norm": 1.0080491091319905, + "learning_rate": 9.782750051045504e-06, + "loss": 1.1293, + "step": 692 + }, + { + "epoch": 0.9270903010033444, + "grad_norm": 0.9915783892578384, + "learning_rate": 9.781387319179465e-06, + "loss": 1.1002, + "step": 693 + }, + { + "epoch": 0.928428093645485, + "grad_norm": 1.340163493005468, + "learning_rate": 9.78002042221359e-06, + "loss": 1.1394, + "step": 694 + }, + { + "epoch": 0.9297658862876255, + "grad_norm": 1.0172794178879336, + "learning_rate": 9.778649361338588e-06, + "loss": 1.2228, + "step": 695 + }, + { + "epoch": 0.9311036789297659, + "grad_norm": 1.105552845021092, + "learning_rate": 9.777274137748802e-06, + "loss": 1.3378, + "step": 696 + }, + { + "epoch": 0.9324414715719064, + "grad_norm": 1.1033706501877478, + "learning_rate": 9.775894752642199e-06, + "loss": 1.3059, + "step": 697 + }, + { + "epoch": 0.9337792642140468, + "grad_norm": 0.861979531437992, + "learning_rate": 9.774511207220369e-06, + "loss": 1.4587, + "step": 698 + }, + { + "epoch": 0.9351170568561873, + "grad_norm": 1.1320665746462633, + "learning_rate": 9.773123502688532e-06, + "loss": 1.3568, + "step": 699 + }, + { + "epoch": 0.9364548494983278, + "grad_norm": 1.193514639452271, + "learning_rate": 9.771731640255525e-06, + "loss": 1.2629, + "step": 700 + }, + { + "epoch": 0.9377926421404682, + "grad_norm": 1.1812141496781143, + "learning_rate": 9.770335621133808e-06, + "loss": 1.1578, + "step": 701 + }, + { + "epoch": 0.9391304347826087, + "grad_norm": 0.9154468395123028, + "learning_rate": 9.768935446539464e-06, + "loss": 1.1679, + "step": 702 + }, + { + "epoch": 0.9404682274247491, + "grad_norm": 1.015017535317398, + "learning_rate": 9.767531117692196e-06, + "loss": 1.3675, + "step": 703 + }, + { + "epoch": 0.9418060200668896, + "grad_norm": 0.9833241689107443, + "learning_rate": 9.766122635815322e-06, + "loss": 1.3472, + "step": 704 + }, + { + "epoch": 0.9431438127090301, + "grad_norm": 1.046865686627144, + "learning_rate": 9.764710002135784e-06, + "loss": 1.1526, + "step": 705 + }, + { + "epoch": 0.9444816053511705, + "grad_norm": 1.0226137909602409, + "learning_rate": 9.763293217884133e-06, + "loss": 1.3383, + "step": 706 + }, + { + "epoch": 0.945819397993311, + "grad_norm": 1.1119152763375337, + "learning_rate": 9.761872284294542e-06, + "loss": 1.0147, + "step": 707 + }, + { + "epoch": 0.9471571906354515, + "grad_norm": 1.0882284332487828, + "learning_rate": 9.760447202604796e-06, + "loss": 1.0925, + "step": 708 + }, + { + "epoch": 0.948494983277592, + "grad_norm": 1.1289728287898222, + "learning_rate": 9.759017974056292e-06, + "loss": 1.27, + "step": 709 + }, + { + "epoch": 0.9498327759197325, + "grad_norm": 1.0308937978422033, + "learning_rate": 9.757584599894045e-06, + "loss": 1.4069, + "step": 710 + }, + { + "epoch": 0.9511705685618729, + "grad_norm": 1.045373822636566, + "learning_rate": 9.756147081366673e-06, + "loss": 1.0762, + "step": 711 + }, + { + "epoch": 0.9525083612040134, + "grad_norm": 0.9942329283154021, + "learning_rate": 9.75470541972641e-06, + "loss": 1.3919, + "step": 712 + }, + { + "epoch": 0.9538461538461539, + "grad_norm": 1.2006270824206908, + "learning_rate": 9.753259616229096e-06, + "loss": 
1.4321, + "step": 713 + }, + { + "epoch": 0.9551839464882943, + "grad_norm": 0.9727546267475362, + "learning_rate": 9.751809672134184e-06, + "loss": 1.1129, + "step": 714 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 0.9168015764148519, + "learning_rate": 9.750355588704728e-06, + "loss": 1.3475, + "step": 715 + }, + { + "epoch": 0.9578595317725752, + "grad_norm": 1.084765809426878, + "learning_rate": 9.748897367207391e-06, + "loss": 1.1468, + "step": 716 + }, + { + "epoch": 0.9591973244147157, + "grad_norm": 0.873149048196804, + "learning_rate": 9.747435008912438e-06, + "loss": 1.1601, + "step": 717 + }, + { + "epoch": 0.9605351170568562, + "grad_norm": 0.9250802399820107, + "learning_rate": 9.745968515093741e-06, + "loss": 1.128, + "step": 718 + }, + { + "epoch": 0.9618729096989966, + "grad_norm": 0.9823942823105787, + "learning_rate": 9.744497887028774e-06, + "loss": 1.1336, + "step": 719 + }, + { + "epoch": 0.9632107023411371, + "grad_norm": 0.9321335321773481, + "learning_rate": 9.74302312599861e-06, + "loss": 1.1883, + "step": 720 + }, + { + "epoch": 0.9645484949832775, + "grad_norm": 0.9898345083909337, + "learning_rate": 9.741544233287924e-06, + "loss": 1.0442, + "step": 721 + }, + { + "epoch": 0.9658862876254181, + "grad_norm": 1.1202432342245665, + "learning_rate": 9.74006121018499e-06, + "loss": 1.4301, + "step": 722 + }, + { + "epoch": 0.9672240802675586, + "grad_norm": 1.0242270664176694, + "learning_rate": 9.73857405798168e-06, + "loss": 1.2914, + "step": 723 + }, + { + "epoch": 0.968561872909699, + "grad_norm": 1.27987489941618, + "learning_rate": 9.73708277797346e-06, + "loss": 1.2685, + "step": 724 + }, + { + "epoch": 0.9698996655518395, + "grad_norm": 0.9032405523257021, + "learning_rate": 9.735587371459399e-06, + "loss": 1.2646, + "step": 725 + }, + { + "epoch": 0.9712374581939799, + "grad_norm": 0.9543643075665318, + "learning_rate": 9.734087839742152e-06, + "loss": 1.3033, + "step": 726 + }, + { + "epoch": 0.9725752508361204, + "grad_norm": 1.0451217921656486, + "learning_rate": 9.732584184127973e-06, + "loss": 1.1134, + "step": 727 + }, + { + "epoch": 0.9739130434782609, + "grad_norm": 1.023839065383507, + "learning_rate": 9.731076405926706e-06, + "loss": 1.2917, + "step": 728 + }, + { + "epoch": 0.9752508361204013, + "grad_norm": 0.893259072094324, + "learning_rate": 9.729564506451791e-06, + "loss": 1.0639, + "step": 729 + }, + { + "epoch": 0.9765886287625418, + "grad_norm": 0.9035518035579341, + "learning_rate": 9.72804848702025e-06, + "loss": 1.3637, + "step": 730 + }, + { + "epoch": 0.9779264214046822, + "grad_norm": 1.2224758356561103, + "learning_rate": 9.7265283489527e-06, + "loss": 1.2891, + "step": 731 + }, + { + "epoch": 0.9792642140468227, + "grad_norm": 0.9656188962794257, + "learning_rate": 9.725004093573343e-06, + "loss": 1.2294, + "step": 732 + }, + { + "epoch": 0.9806020066889632, + "grad_norm": 1.1247448032880865, + "learning_rate": 9.72347572220997e-06, + "loss": 1.4286, + "step": 733 + }, + { + "epoch": 0.9819397993311036, + "grad_norm": 1.084447006808121, + "learning_rate": 9.721943236193952e-06, + "loss": 1.235, + "step": 734 + }, + { + "epoch": 0.9832775919732442, + "grad_norm": 1.1151359984004203, + "learning_rate": 9.720406636860252e-06, + "loss": 1.3767, + "step": 735 + }, + { + "epoch": 0.9846153846153847, + "grad_norm": 1.0459907644295254, + "learning_rate": 9.718865925547411e-06, + "loss": 1.3412, + "step": 736 + }, + { + "epoch": 0.9859531772575251, + "grad_norm": 0.9341414702111724, + "learning_rate": 9.717321103597556e-06, + 
"loss": 1.2998, + "step": 737 + }, + { + "epoch": 0.9872909698996656, + "grad_norm": 0.8285381105671298, + "learning_rate": 9.715772172356388e-06, + "loss": 1.1707, + "step": 738 + }, + { + "epoch": 0.988628762541806, + "grad_norm": 1.27074945740432, + "learning_rate": 9.714219133173194e-06, + "loss": 1.3549, + "step": 739 + }, + { + "epoch": 0.9899665551839465, + "grad_norm": 1.1112429500589538, + "learning_rate": 9.712661987400838e-06, + "loss": 1.0558, + "step": 740 + }, + { + "epoch": 0.991304347826087, + "grad_norm": 0.9611589459935225, + "learning_rate": 9.711100736395758e-06, + "loss": 1.0736, + "step": 741 + }, + { + "epoch": 0.9926421404682274, + "grad_norm": 0.963929606670458, + "learning_rate": 9.709535381517973e-06, + "loss": 1.1885, + "step": 742 + }, + { + "epoch": 0.9939799331103679, + "grad_norm": 1.1675081379263275, + "learning_rate": 9.707965924131074e-06, + "loss": 1.1289, + "step": 743 + }, + { + "epoch": 0.9953177257525083, + "grad_norm": 1.154412112193559, + "learning_rate": 9.706392365602224e-06, + "loss": 1.3345, + "step": 744 + }, + { + "epoch": 0.9966555183946488, + "grad_norm": 1.117518337613896, + "learning_rate": 9.704814707302166e-06, + "loss": 1.6519, + "step": 745 + }, + { + "epoch": 0.9979933110367893, + "grad_norm": 1.01678991853141, + "learning_rate": 9.703232950605203e-06, + "loss": 1.1562, + "step": 746 + }, + { + "epoch": 0.9993311036789297, + "grad_norm": 1.0319140590727147, + "learning_rate": 9.70164709688922e-06, + "loss": 1.1894, + "step": 747 + }, + { + "epoch": 1.0, + "grad_norm": 1.0319140590727147, + "learning_rate": 9.70005714753566e-06, + "loss": 1.1671, + "step": 748 + }, + { + "epoch": 1.0013377926421405, + "grad_norm": 1.9294370005907837, + "learning_rate": 9.698463103929542e-06, + "loss": 1.1519, + "step": 749 + }, + { + "epoch": 1.002675585284281, + "grad_norm": 1.0967499787357209, + "learning_rate": 9.69686496745945e-06, + "loss": 0.9536, + "step": 750 + }, + { + "epoch": 1.0040133779264213, + "grad_norm": 0.9109009609656542, + "learning_rate": 9.695262739517528e-06, + "loss": 1.0719, + "step": 751 + }, + { + "epoch": 1.0053511705685618, + "grad_norm": 1.0501673748900981, + "learning_rate": 9.69365642149949e-06, + "loss": 1.2003, + "step": 752 + }, + { + "epoch": 1.0066889632107023, + "grad_norm": 1.1585236705699764, + "learning_rate": 9.69204601480461e-06, + "loss": 1.1723, + "step": 753 + }, + { + "epoch": 1.0080267558528428, + "grad_norm": 1.3462072971391714, + "learning_rate": 9.690431520835725e-06, + "loss": 1.3035, + "step": 754 + }, + { + "epoch": 1.0093645484949834, + "grad_norm": 0.9832429503215145, + "learning_rate": 9.688812940999232e-06, + "loss": 1.2926, + "step": 755 + }, + { + "epoch": 1.0107023411371236, + "grad_norm": 0.9746474643715769, + "learning_rate": 9.687190276705088e-06, + "loss": 0.9998, + "step": 756 + }, + { + "epoch": 1.0120401337792642, + "grad_norm": 0.9406429321744871, + "learning_rate": 9.685563529366806e-06, + "loss": 0.9708, + "step": 757 + }, + { + "epoch": 1.0133779264214047, + "grad_norm": 1.1783370004784455, + "learning_rate": 9.683932700401457e-06, + "loss": 1.1713, + "step": 758 + }, + { + "epoch": 1.0147157190635452, + "grad_norm": 1.044499961397136, + "learning_rate": 9.682297791229668e-06, + "loss": 1.2128, + "step": 759 + }, + { + "epoch": 1.0160535117056857, + "grad_norm": 0.9530474783994695, + "learning_rate": 9.68065880327562e-06, + "loss": 1.3368, + "step": 760 + }, + { + "epoch": 1.017391304347826, + "grad_norm": 1.1434368264033739, + "learning_rate": 9.679015737967046e-06, + "loss": 
1.2546, + "step": 761 + }, + { + "epoch": 1.0187290969899665, + "grad_norm": 1.0875349681553446, + "learning_rate": 9.677368596735232e-06, + "loss": 1.1472, + "step": 762 + }, + { + "epoch": 1.020066889632107, + "grad_norm": 1.2597639474757023, + "learning_rate": 9.675717381015014e-06, + "loss": 1.1994, + "step": 763 + }, + { + "epoch": 1.0214046822742475, + "grad_norm": 1.009441323462975, + "learning_rate": 9.674062092244779e-06, + "loss": 1.1509, + "step": 764 + }, + { + "epoch": 1.022742474916388, + "grad_norm": 1.0396985061684139, + "learning_rate": 9.67240273186646e-06, + "loss": 1.1496, + "step": 765 + }, + { + "epoch": 1.0240802675585283, + "grad_norm": 1.1056890313312362, + "learning_rate": 9.670739301325534e-06, + "loss": 1.3384, + "step": 766 + }, + { + "epoch": 1.0254180602006688, + "grad_norm": 1.2344979981697453, + "learning_rate": 9.669071802071032e-06, + "loss": 1.0361, + "step": 767 + }, + { + "epoch": 1.0267558528428093, + "grad_norm": 1.012818194786392, + "learning_rate": 9.66740023555552e-06, + "loss": 1.2104, + "step": 768 + }, + { + "epoch": 1.0280936454849499, + "grad_norm": 1.0711359156759233, + "learning_rate": 9.665724603235115e-06, + "loss": 1.0245, + "step": 769 + }, + { + "epoch": 1.0294314381270904, + "grad_norm": 0.9756522151559834, + "learning_rate": 9.66404490656947e-06, + "loss": 1.1486, + "step": 770 + }, + { + "epoch": 1.0307692307692307, + "grad_norm": 0.9020923775989191, + "learning_rate": 9.66236114702178e-06, + "loss": 1.1751, + "step": 771 + }, + { + "epoch": 1.0321070234113712, + "grad_norm": 1.1952424976677005, + "learning_rate": 9.66067332605878e-06, + "loss": 1.3284, + "step": 772 + }, + { + "epoch": 1.0334448160535117, + "grad_norm": 0.9489280743522861, + "learning_rate": 9.658981445150744e-06, + "loss": 1.1131, + "step": 773 + }, + { + "epoch": 1.0347826086956522, + "grad_norm": 1.1158600114686683, + "learning_rate": 9.65728550577148e-06, + "loss": 1.2552, + "step": 774 + }, + { + "epoch": 1.0361204013377927, + "grad_norm": 0.9012692029884478, + "learning_rate": 9.655585509398334e-06, + "loss": 1.2359, + "step": 775 + }, + { + "epoch": 1.037458193979933, + "grad_norm": 0.9793335372908638, + "learning_rate": 9.65388145751218e-06, + "loss": 1.3175, + "step": 776 + }, + { + "epoch": 1.0387959866220735, + "grad_norm": 0.9674876963119349, + "learning_rate": 9.652173351597435e-06, + "loss": 1.1971, + "step": 777 + }, + { + "epoch": 1.040133779264214, + "grad_norm": 1.1209356408412128, + "learning_rate": 9.650461193142042e-06, + "loss": 1.306, + "step": 778 + }, + { + "epoch": 1.0414715719063545, + "grad_norm": 0.9495114840007771, + "learning_rate": 9.648744983637471e-06, + "loss": 1.3125, + "step": 779 + }, + { + "epoch": 1.042809364548495, + "grad_norm": 1.2886217478670097, + "learning_rate": 9.647024724578724e-06, + "loss": 1.2577, + "step": 780 + }, + { + "epoch": 1.0441471571906356, + "grad_norm": 1.1814883351392271, + "learning_rate": 9.645300417464332e-06, + "loss": 1.0881, + "step": 781 + }, + { + "epoch": 1.0454849498327758, + "grad_norm": 1.1695540076695474, + "learning_rate": 9.643572063796352e-06, + "loss": 1.1914, + "step": 782 + }, + { + "epoch": 1.0468227424749164, + "grad_norm": 1.1668399956671287, + "learning_rate": 9.641839665080363e-06, + "loss": 1.1652, + "step": 783 + }, + { + "epoch": 1.0481605351170569, + "grad_norm": 0.8580852105697561, + "learning_rate": 9.640103222825472e-06, + "loss": 1.033, + "step": 784 + }, + { + "epoch": 1.0494983277591974, + "grad_norm": 0.855244076612585, + "learning_rate": 9.638362738544302e-06, + 
"loss": 1.3418, + "step": 785 + }, + { + "epoch": 1.050836120401338, + "grad_norm": 1.5835837591321373, + "learning_rate": 9.636618213753006e-06, + "loss": 1.2698, + "step": 786 + }, + { + "epoch": 1.0521739130434782, + "grad_norm": 0.8780603260660833, + "learning_rate": 9.634869649971247e-06, + "loss": 1.3535, + "step": 787 + }, + { + "epoch": 1.0535117056856187, + "grad_norm": 1.3258763552050057, + "learning_rate": 9.633117048722213e-06, + "loss": 1.0777, + "step": 788 + }, + { + "epoch": 1.0548494983277592, + "grad_norm": 0.9753886925199756, + "learning_rate": 9.631360411532609e-06, + "loss": 1.2928, + "step": 789 + }, + { + "epoch": 1.0561872909698997, + "grad_norm": 1.2287094207539686, + "learning_rate": 9.629599739932652e-06, + "loss": 1.2498, + "step": 790 + }, + { + "epoch": 1.0575250836120402, + "grad_norm": 1.0564505227182859, + "learning_rate": 9.627835035456074e-06, + "loss": 1.2, + "step": 791 + }, + { + "epoch": 1.0588628762541805, + "grad_norm": 1.068459289711206, + "learning_rate": 9.626066299640124e-06, + "loss": 1.1905, + "step": 792 + }, + { + "epoch": 1.060200668896321, + "grad_norm": 1.0381994325893666, + "learning_rate": 9.62429353402556e-06, + "loss": 1.1631, + "step": 793 + }, + { + "epoch": 1.0615384615384615, + "grad_norm": 0.9411430879542393, + "learning_rate": 9.62251674015665e-06, + "loss": 1.1232, + "step": 794 + }, + { + "epoch": 1.062876254180602, + "grad_norm": 1.2913809238180147, + "learning_rate": 9.620735919581168e-06, + "loss": 1.2807, + "step": 795 + }, + { + "epoch": 1.0642140468227426, + "grad_norm": 1.0070840146197753, + "learning_rate": 9.618951073850404e-06, + "loss": 1.1441, + "step": 796 + }, + { + "epoch": 1.0655518394648829, + "grad_norm": 1.0210607661341322, + "learning_rate": 9.617162204519147e-06, + "loss": 1.1059, + "step": 797 + }, + { + "epoch": 1.0668896321070234, + "grad_norm": 1.1398420475232436, + "learning_rate": 9.615369313145695e-06, + "loss": 1.3001, + "step": 798 + }, + { + "epoch": 1.0682274247491639, + "grad_norm": 1.2224121230829978, + "learning_rate": 9.61357240129185e-06, + "loss": 1.1892, + "step": 799 + }, + { + "epoch": 1.0695652173913044, + "grad_norm": 1.2520232569801282, + "learning_rate": 9.611771470522908e-06, + "loss": 1.2522, + "step": 800 + }, + { + "epoch": 1.070903010033445, + "grad_norm": 1.0138175892538668, + "learning_rate": 9.609966522407678e-06, + "loss": 1.012, + "step": 801 + }, + { + "epoch": 1.0722408026755852, + "grad_norm": 0.9903902765207505, + "learning_rate": 9.60815755851846e-06, + "loss": 1.1901, + "step": 802 + }, + { + "epoch": 1.0735785953177257, + "grad_norm": 1.0500269334987333, + "learning_rate": 9.60634458043106e-06, + "loss": 1.3041, + "step": 803 + }, + { + "epoch": 1.0749163879598662, + "grad_norm": 0.9433027826235135, + "learning_rate": 9.60452758972477e-06, + "loss": 1.1448, + "step": 804 + }, + { + "epoch": 1.0762541806020067, + "grad_norm": 1.2235443269266524, + "learning_rate": 9.602706587982384e-06, + "loss": 1.1318, + "step": 805 + }, + { + "epoch": 1.0775919732441472, + "grad_norm": 1.1297829459517061, + "learning_rate": 9.600881576790194e-06, + "loss": 1.0914, + "step": 806 + }, + { + "epoch": 1.0789297658862875, + "grad_norm": 1.344438048281299, + "learning_rate": 9.599052557737973e-06, + "loss": 1.0844, + "step": 807 + }, + { + "epoch": 1.080267558528428, + "grad_norm": 1.0188409977836408, + "learning_rate": 9.597219532418997e-06, + "loss": 1.1234, + "step": 808 + }, + { + "epoch": 1.0816053511705686, + "grad_norm": 1.1641552783555136, + "learning_rate": 
9.59538250243003e-06, + "loss": 1.112, + "step": 809 + }, + { + "epoch": 1.082943143812709, + "grad_norm": 1.1642947554661864, + "learning_rate": 9.593541469371313e-06, + "loss": 1.1916, + "step": 810 + }, + { + "epoch": 1.0842809364548496, + "grad_norm": 0.9940746826425544, + "learning_rate": 9.591696434846589e-06, + "loss": 1.3097, + "step": 811 + }, + { + "epoch": 1.0856187290969899, + "grad_norm": 1.0935960901075776, + "learning_rate": 9.589847400463079e-06, + "loss": 1.3325, + "step": 812 + }, + { + "epoch": 1.0869565217391304, + "grad_norm": 0.9637582346184419, + "learning_rate": 9.58799436783149e-06, + "loss": 1.2292, + "step": 813 + }, + { + "epoch": 1.0882943143812709, + "grad_norm": 1.0353241414545342, + "learning_rate": 9.586137338566012e-06, + "loss": 1.2372, + "step": 814 + }, + { + "epoch": 1.0896321070234114, + "grad_norm": 0.7268306035046894, + "learning_rate": 9.584276314284316e-06, + "loss": 1.1301, + "step": 815 + }, + { + "epoch": 1.090969899665552, + "grad_norm": 1.1012067547438262, + "learning_rate": 9.58241129660755e-06, + "loss": 1.3335, + "step": 816 + }, + { + "epoch": 1.0923076923076924, + "grad_norm": 0.9599806443416289, + "learning_rate": 9.580542287160348e-06, + "loss": 1.1795, + "step": 817 + }, + { + "epoch": 1.0936454849498327, + "grad_norm": 1.0283192274956472, + "learning_rate": 9.578669287570817e-06, + "loss": 1.1717, + "step": 818 + }, + { + "epoch": 1.0949832775919732, + "grad_norm": 1.013982883727875, + "learning_rate": 9.576792299470537e-06, + "loss": 1.0785, + "step": 819 + }, + { + "epoch": 1.0963210702341137, + "grad_norm": 1.0401459535439064, + "learning_rate": 9.574911324494569e-06, + "loss": 1.1736, + "step": 820 + }, + { + "epoch": 1.0976588628762542, + "grad_norm": 0.8156208630872778, + "learning_rate": 9.573026364281441e-06, + "loss": 1.19, + "step": 821 + }, + { + "epoch": 1.0989966555183948, + "grad_norm": 0.9914700915617213, + "learning_rate": 9.571137420473154e-06, + "loss": 1.1738, + "step": 822 + }, + { + "epoch": 1.100334448160535, + "grad_norm": 1.131186223669788, + "learning_rate": 9.569244494715183e-06, + "loss": 1.1804, + "step": 823 + }, + { + "epoch": 1.1016722408026756, + "grad_norm": 1.1432293979817678, + "learning_rate": 9.567347588656468e-06, + "loss": 1.3695, + "step": 824 + }, + { + "epoch": 1.103010033444816, + "grad_norm": 1.1073724558970799, + "learning_rate": 9.565446703949417e-06, + "loss": 1.199, + "step": 825 + }, + { + "epoch": 1.1043478260869566, + "grad_norm": 1.2162384907089658, + "learning_rate": 9.563541842249903e-06, + "loss": 1.5254, + "step": 826 + }, + { + "epoch": 1.105685618729097, + "grad_norm": 1.012790472535047, + "learning_rate": 9.561633005217264e-06, + "loss": 1.4241, + "step": 827 + }, + { + "epoch": 1.1070234113712374, + "grad_norm": 0.8890373411201817, + "learning_rate": 9.559720194514303e-06, + "loss": 1.1701, + "step": 828 + }, + { + "epoch": 1.108361204013378, + "grad_norm": 1.2949432417008235, + "learning_rate": 9.557803411807283e-06, + "loss": 1.2848, + "step": 829 + }, + { + "epoch": 1.1096989966555184, + "grad_norm": 0.9168863031457178, + "learning_rate": 9.555882658765924e-06, + "loss": 1.3994, + "step": 830 + }, + { + "epoch": 1.111036789297659, + "grad_norm": 0.8707196915737979, + "learning_rate": 9.55395793706341e-06, + "loss": 1.4445, + "step": 831 + }, + { + "epoch": 1.1123745819397994, + "grad_norm": 1.1908433181898428, + "learning_rate": 9.552029248376378e-06, + "loss": 1.1527, + "step": 832 + }, + { + "epoch": 1.1137123745819397, + "grad_norm": 1.3186083655360914, + 
"learning_rate": 9.550096594384923e-06, + "loss": 1.2128, + "step": 833 + }, + { + "epoch": 1.1150501672240802, + "grad_norm": 1.2320203721725165, + "learning_rate": 9.548159976772593e-06, + "loss": 1.2993, + "step": 834 + }, + { + "epoch": 1.1163879598662207, + "grad_norm": 1.1515583261039577, + "learning_rate": 9.54621939722639e-06, + "loss": 1.2145, + "step": 835 + }, + { + "epoch": 1.1177257525083613, + "grad_norm": 0.9256331509246678, + "learning_rate": 9.544274857436763e-06, + "loss": 1.3065, + "step": 836 + }, + { + "epoch": 1.1190635451505018, + "grad_norm": 1.0000450648313177, + "learning_rate": 9.542326359097619e-06, + "loss": 1.1364, + "step": 837 + }, + { + "epoch": 1.120401337792642, + "grad_norm": 0.929348173941459, + "learning_rate": 9.540373903906306e-06, + "loss": 1.2252, + "step": 838 + }, + { + "epoch": 1.1217391304347826, + "grad_norm": 0.9228525906039842, + "learning_rate": 9.538417493563621e-06, + "loss": 1.36, + "step": 839 + }, + { + "epoch": 1.123076923076923, + "grad_norm": 1.140813013500768, + "learning_rate": 9.536457129773808e-06, + "loss": 1.2034, + "step": 840 + }, + { + "epoch": 1.1244147157190636, + "grad_norm": 1.0325997716702882, + "learning_rate": 9.534492814244552e-06, + "loss": 1.103, + "step": 841 + }, + { + "epoch": 1.125752508361204, + "grad_norm": 0.9067947290310915, + "learning_rate": 9.532524548686984e-06, + "loss": 1.26, + "step": 842 + }, + { + "epoch": 1.1270903010033444, + "grad_norm": 1.0982014731254655, + "learning_rate": 9.530552334815672e-06, + "loss": 1.5089, + "step": 843 + }, + { + "epoch": 1.128428093645485, + "grad_norm": 0.9462385880745092, + "learning_rate": 9.528576174348625e-06, + "loss": 1.3257, + "step": 844 + }, + { + "epoch": 1.1297658862876254, + "grad_norm": 0.9849549127408984, + "learning_rate": 9.526596069007292e-06, + "loss": 1.241, + "step": 845 + }, + { + "epoch": 1.131103678929766, + "grad_norm": 0.9239200031145137, + "learning_rate": 9.524612020516556e-06, + "loss": 1.3058, + "step": 846 + }, + { + "epoch": 1.1324414715719064, + "grad_norm": 0.9544051958988343, + "learning_rate": 9.522624030604735e-06, + "loss": 1.199, + "step": 847 + }, + { + "epoch": 1.1337792642140467, + "grad_norm": 1.1567149790985343, + "learning_rate": 9.520632101003579e-06, + "loss": 1.4175, + "step": 848 + }, + { + "epoch": 1.1351170568561872, + "grad_norm": 0.8084202320171914, + "learning_rate": 9.518636233448276e-06, + "loss": 1.2547, + "step": 849 + }, + { + "epoch": 1.1364548494983278, + "grad_norm": 0.9618462595281949, + "learning_rate": 9.516636429677437e-06, + "loss": 1.3194, + "step": 850 + }, + { + "epoch": 1.1377926421404683, + "grad_norm": 1.1439667701344962, + "learning_rate": 9.514632691433108e-06, + "loss": 1.2199, + "step": 851 + }, + { + "epoch": 1.1391304347826088, + "grad_norm": 0.8835340909942884, + "learning_rate": 9.512625020460754e-06, + "loss": 1.4355, + "step": 852 + }, + { + "epoch": 1.140468227424749, + "grad_norm": 0.9625211343412031, + "learning_rate": 9.510613418509276e-06, + "loss": 1.1385, + "step": 853 + }, + { + "epoch": 1.1418060200668896, + "grad_norm": 1.283699051896843, + "learning_rate": 9.508597887330993e-06, + "loss": 1.1224, + "step": 854 + }, + { + "epoch": 1.14314381270903, + "grad_norm": 1.0456173978689374, + "learning_rate": 9.506578428681648e-06, + "loss": 1.1379, + "step": 855 + }, + { + "epoch": 1.1444816053511706, + "grad_norm": 1.2764840101566073, + "learning_rate": 9.504555044320407e-06, + "loss": 1.3426, + "step": 856 + }, + { + "epoch": 1.1458193979933111, + "grad_norm": 0.9481965414269219, 
+ "learning_rate": 9.50252773600985e-06, + "loss": 1.1792, + "step": 857 + }, + { + "epoch": 1.1471571906354514, + "grad_norm": 1.1677973017207879, + "learning_rate": 9.500496505515986e-06, + "loss": 1.3581, + "step": 858 + }, + { + "epoch": 1.148494983277592, + "grad_norm": 0.8018718145464837, + "learning_rate": 9.498461354608228e-06, + "loss": 1.232, + "step": 859 + }, + { + "epoch": 1.1498327759197324, + "grad_norm": 1.1214551641840482, + "learning_rate": 9.496422285059412e-06, + "loss": 1.2131, + "step": 860 + }, + { + "epoch": 1.151170568561873, + "grad_norm": 1.0230278897185967, + "learning_rate": 9.494379298645788e-06, + "loss": 0.9189, + "step": 861 + }, + { + "epoch": 1.1525083612040135, + "grad_norm": 0.9066031603023443, + "learning_rate": 9.492332397147013e-06, + "loss": 1.1486, + "step": 862 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 1.0241994124013034, + "learning_rate": 9.490281582346159e-06, + "loss": 1.2818, + "step": 863 + }, + { + "epoch": 1.1551839464882943, + "grad_norm": 1.0069786772362612, + "learning_rate": 9.488226856029704e-06, + "loss": 1.2781, + "step": 864 + }, + { + "epoch": 1.1565217391304348, + "grad_norm": 1.077816809679936, + "learning_rate": 9.486168219987534e-06, + "loss": 0.9181, + "step": 865 + }, + { + "epoch": 1.1578595317725753, + "grad_norm": 0.9797801391526473, + "learning_rate": 9.484105676012943e-06, + "loss": 1.0999, + "step": 866 + }, + { + "epoch": 1.1591973244147158, + "grad_norm": 1.1439055229417807, + "learning_rate": 9.482039225902623e-06, + "loss": 1.4112, + "step": 867 + }, + { + "epoch": 1.160535117056856, + "grad_norm": 1.0934083818915548, + "learning_rate": 9.47996887145668e-06, + "loss": 1.3854, + "step": 868 + }, + { + "epoch": 1.1618729096989966, + "grad_norm": 0.8426265982544393, + "learning_rate": 9.47789461447861e-06, + "loss": 1.2024, + "step": 869 + }, + { + "epoch": 1.163210702341137, + "grad_norm": 1.4088409374323907, + "learning_rate": 9.475816456775313e-06, + "loss": 1.2699, + "step": 870 + }, + { + "epoch": 1.1645484949832776, + "grad_norm": 1.220433713242878, + "learning_rate": 9.473734400157086e-06, + "loss": 1.2771, + "step": 871 + }, + { + "epoch": 1.1658862876254181, + "grad_norm": 1.2594833212470515, + "learning_rate": 9.471648446437625e-06, + "loss": 1.0833, + "step": 872 + }, + { + "epoch": 1.1672240802675584, + "grad_norm": 0.8550820110030183, + "learning_rate": 9.469558597434018e-06, + "loss": 1.0621, + "step": 873 + }, + { + "epoch": 1.168561872909699, + "grad_norm": 0.9594411759896534, + "learning_rate": 9.467464854966746e-06, + "loss": 1.1124, + "step": 874 + }, + { + "epoch": 1.1698996655518394, + "grad_norm": 1.2880664400015684, + "learning_rate": 9.465367220859684e-06, + "loss": 1.2395, + "step": 875 + }, + { + "epoch": 1.17123745819398, + "grad_norm": 1.0706921302981818, + "learning_rate": 9.463265696940095e-06, + "loss": 1.2314, + "step": 876 + }, + { + "epoch": 1.1725752508361205, + "grad_norm": 0.9946579813562366, + "learning_rate": 9.461160285038632e-06, + "loss": 1.192, + "step": 877 + }, + { + "epoch": 1.1739130434782608, + "grad_norm": 1.151054677896507, + "learning_rate": 9.459050986989333e-06, + "loss": 1.2844, + "step": 878 + }, + { + "epoch": 1.1752508361204013, + "grad_norm": 0.9599350490628374, + "learning_rate": 9.456937804629623e-06, + "loss": 1.3538, + "step": 879 + }, + { + "epoch": 1.1765886287625418, + "grad_norm": 1.145882379339644, + "learning_rate": 9.45482073980031e-06, + "loss": 1.1543, + "step": 880 + }, + { + "epoch": 1.1779264214046823, + "grad_norm": 
0.9947278614968377, + "learning_rate": 9.452699794345583e-06, + "loss": 1.2875, + "step": 881 + }, + { + "epoch": 1.1792642140468228, + "grad_norm": 1.106017406791316, + "learning_rate": 9.45057497011301e-06, + "loss": 1.1418, + "step": 882 + }, + { + "epoch": 1.180602006688963, + "grad_norm": 0.9558745194488887, + "learning_rate": 9.448446268953549e-06, + "loss": 1.1827, + "step": 883 + }, + { + "epoch": 1.1819397993311036, + "grad_norm": 0.8519603029947985, + "learning_rate": 9.446313692721515e-06, + "loss": 1.2079, + "step": 884 + }, + { + "epoch": 1.1832775919732441, + "grad_norm": 1.1038318247789436, + "learning_rate": 9.444177243274619e-06, + "loss": 1.1247, + "step": 885 + }, + { + "epoch": 1.1846153846153846, + "grad_norm": 1.0242790363025613, + "learning_rate": 9.442036922473932e-06, + "loss": 1.2929, + "step": 886 + }, + { + "epoch": 1.1859531772575251, + "grad_norm": 0.9644303477631009, + "learning_rate": 9.439892732183903e-06, + "loss": 1.3474, + "step": 887 + }, + { + "epoch": 1.1872909698996654, + "grad_norm": 1.017577782855063, + "learning_rate": 9.437744674272353e-06, + "loss": 1.0564, + "step": 888 + }, + { + "epoch": 1.188628762541806, + "grad_norm": 0.9646273210043782, + "learning_rate": 9.435592750610469e-06, + "loss": 1.1416, + "step": 889 + }, + { + "epoch": 1.1899665551839465, + "grad_norm": 1.3251566405281718, + "learning_rate": 9.433436963072807e-06, + "loss": 0.9598, + "step": 890 + }, + { + "epoch": 1.191304347826087, + "grad_norm": 1.2144956226560566, + "learning_rate": 9.43127731353729e-06, + "loss": 0.986, + "step": 891 + }, + { + "epoch": 1.1926421404682275, + "grad_norm": 0.9938477545576753, + "learning_rate": 9.429113803885199e-06, + "loss": 1.1141, + "step": 892 + }, + { + "epoch": 1.193979933110368, + "grad_norm": 0.9647529731202138, + "learning_rate": 9.426946436001189e-06, + "loss": 1.2388, + "step": 893 + }, + { + "epoch": 1.1953177257525083, + "grad_norm": 1.2896989010383002, + "learning_rate": 9.424775211773263e-06, + "loss": 1.1422, + "step": 894 + }, + { + "epoch": 1.1966555183946488, + "grad_norm": 1.04779302443477, + "learning_rate": 9.422600133092795e-06, + "loss": 1.2458, + "step": 895 + }, + { + "epoch": 1.1979933110367893, + "grad_norm": 1.0891627196536031, + "learning_rate": 9.420421201854507e-06, + "loss": 1.0551, + "step": 896 + }, + { + "epoch": 1.1993311036789298, + "grad_norm": 1.0180040598320736, + "learning_rate": 9.418238419956484e-06, + "loss": 1.3399, + "step": 897 + }, + { + "epoch": 1.2006688963210703, + "grad_norm": 0.8604184548003588, + "learning_rate": 9.41605178930016e-06, + "loss": 1.3039, + "step": 898 + }, + { + "epoch": 1.2020066889632106, + "grad_norm": 1.1250151995268745, + "learning_rate": 9.413861311790327e-06, + "loss": 1.285, + "step": 899 + }, + { + "epoch": 1.2033444816053511, + "grad_norm": 0.9818594917904804, + "learning_rate": 9.411666989335123e-06, + "loss": 1.0514, + "step": 900 + }, + { + "epoch": 1.2046822742474916, + "grad_norm": 1.086728840671554, + "learning_rate": 9.409468823846038e-06, + "loss": 1.0451, + "step": 901 + }, + { + "epoch": 1.2060200668896321, + "grad_norm": 1.088213036269632, + "learning_rate": 9.40726681723791e-06, + "loss": 1.2614, + "step": 902 + }, + { + "epoch": 1.2073578595317727, + "grad_norm": 0.7969652598291863, + "learning_rate": 9.405060971428924e-06, + "loss": 1.1156, + "step": 903 + }, + { + "epoch": 1.208695652173913, + "grad_norm": 1.0390298552322335, + "learning_rate": 9.402851288340604e-06, + "loss": 1.2802, + "step": 904 + }, + { + "epoch": 1.2100334448160535, + 
"grad_norm": 1.130973189508361, + "learning_rate": 9.400637769897823e-06, + "loss": 1.4039, + "step": 905 + }, + { + "epoch": 1.211371237458194, + "grad_norm": 0.959851479012741, + "learning_rate": 9.398420418028789e-06, + "loss": 1.1041, + "step": 906 + }, + { + "epoch": 1.2127090301003345, + "grad_norm": 1.0933491855386934, + "learning_rate": 9.396199234665056e-06, + "loss": 1.3217, + "step": 907 + }, + { + "epoch": 1.214046822742475, + "grad_norm": 1.0985461757221067, + "learning_rate": 9.393974221741514e-06, + "loss": 1.0507, + "step": 908 + }, + { + "epoch": 1.2153846153846155, + "grad_norm": 1.0701767835295632, + "learning_rate": 9.391745381196382e-06, + "loss": 1.2262, + "step": 909 + }, + { + "epoch": 1.2167224080267558, + "grad_norm": 0.8692295419491648, + "learning_rate": 9.38951271497122e-06, + "loss": 1.1746, + "step": 910 + }, + { + "epoch": 1.2180602006688963, + "grad_norm": 0.975383747872018, + "learning_rate": 9.387276225010925e-06, + "loss": 1.4702, + "step": 911 + }, + { + "epoch": 1.2193979933110368, + "grad_norm": 0.9371315385445316, + "learning_rate": 9.38503591326371e-06, + "loss": 1.3478, + "step": 912 + }, + { + "epoch": 1.2207357859531773, + "grad_norm": 1.1146302081880026, + "learning_rate": 9.382791781681133e-06, + "loss": 1.0381, + "step": 913 + }, + { + "epoch": 1.2220735785953178, + "grad_norm": 0.8043404551029198, + "learning_rate": 9.380543832218069e-06, + "loss": 1.1208, + "step": 914 + }, + { + "epoch": 1.2234113712374581, + "grad_norm": 1.165302354659954, + "learning_rate": 9.378292066832723e-06, + "loss": 1.2833, + "step": 915 + }, + { + "epoch": 1.2247491638795986, + "grad_norm": 1.0331489785535668, + "learning_rate": 9.376036487486626e-06, + "loss": 1.3232, + "step": 916 + }, + { + "epoch": 1.2260869565217392, + "grad_norm": 0.9536522159281586, + "learning_rate": 9.373777096144625e-06, + "loss": 1.0864, + "step": 917 + }, + { + "epoch": 1.2274247491638797, + "grad_norm": 1.150666313065755, + "learning_rate": 9.371513894774894e-06, + "loss": 1.0211, + "step": 918 + }, + { + "epoch": 1.2287625418060202, + "grad_norm": 0.9619442260838535, + "learning_rate": 9.369246885348926e-06, + "loss": 1.2667, + "step": 919 + }, + { + "epoch": 1.2301003344481605, + "grad_norm": 0.9094934137977146, + "learning_rate": 9.366976069841524e-06, + "loss": 1.2811, + "step": 920 + }, + { + "epoch": 1.231438127090301, + "grad_norm": 1.0360067976072291, + "learning_rate": 9.364701450230813e-06, + "loss": 1.1809, + "step": 921 + }, + { + "epoch": 1.2327759197324415, + "grad_norm": 0.8940583487977339, + "learning_rate": 9.362423028498229e-06, + "loss": 1.0123, + "step": 922 + }, + { + "epoch": 1.234113712374582, + "grad_norm": 0.8449258287975997, + "learning_rate": 9.360140806628523e-06, + "loss": 1.1692, + "step": 923 + }, + { + "epoch": 1.2354515050167225, + "grad_norm": 1.3063162634430672, + "learning_rate": 9.357854786609754e-06, + "loss": 1.3765, + "step": 924 + }, + { + "epoch": 1.2367892976588628, + "grad_norm": 1.1687539658182942, + "learning_rate": 9.355564970433288e-06, + "loss": 1.2885, + "step": 925 + }, + { + "epoch": 1.2381270903010033, + "grad_norm": 1.095849881767696, + "learning_rate": 9.353271360093802e-06, + "loss": 1.0504, + "step": 926 + }, + { + "epoch": 1.2394648829431438, + "grad_norm": 0.9798886564925238, + "learning_rate": 9.350973957589278e-06, + "loss": 1.106, + "step": 927 + }, + { + "epoch": 1.2408026755852843, + "grad_norm": 1.0507093642986498, + "learning_rate": 9.348672764920995e-06, + "loss": 1.0807, + "step": 928 + }, + { + "epoch": 
1.2421404682274249, + "grad_norm": 0.9904670638801792, + "learning_rate": 9.346367784093538e-06, + "loss": 1.072, + "step": 929 + }, + { + "epoch": 1.2434782608695651, + "grad_norm": 1.0922096351164619, + "learning_rate": 9.344059017114796e-06, + "loss": 1.0927, + "step": 930 + }, + { + "epoch": 1.2448160535117057, + "grad_norm": 1.0829881554357508, + "learning_rate": 9.341746465995947e-06, + "loss": 1.2227, + "step": 931 + }, + { + "epoch": 1.2461538461538462, + "grad_norm": 1.0784225916652328, + "learning_rate": 9.339430132751474e-06, + "loss": 1.0465, + "step": 932 + }, + { + "epoch": 1.2474916387959867, + "grad_norm": 0.8421288209367113, + "learning_rate": 9.33711001939915e-06, + "loss": 1.3578, + "step": 933 + }, + { + "epoch": 1.2488294314381272, + "grad_norm": 1.1462917700740722, + "learning_rate": 9.33478612796004e-06, + "loss": 1.2839, + "step": 934 + }, + { + "epoch": 1.2501672240802675, + "grad_norm": 0.800063837190317, + "learning_rate": 9.332458460458507e-06, + "loss": 1.2949, + "step": 935 + }, + { + "epoch": 1.251505016722408, + "grad_norm": 1.4104489635039261, + "learning_rate": 9.330127018922195e-06, + "loss": 1.2082, + "step": 936 + }, + { + "epoch": 1.2528428093645485, + "grad_norm": 1.005490899905048, + "learning_rate": 9.327791805382038e-06, + "loss": 1.4391, + "step": 937 + }, + { + "epoch": 1.254180602006689, + "grad_norm": 1.042721301683898, + "learning_rate": 9.325452821872258e-06, + "loss": 1.1595, + "step": 938 + }, + { + "epoch": 1.2555183946488295, + "grad_norm": 0.8468235442897906, + "learning_rate": 9.32311007043036e-06, + "loss": 1.2369, + "step": 939 + }, + { + "epoch": 1.2568561872909698, + "grad_norm": 1.104667960970807, + "learning_rate": 9.320763553097132e-06, + "loss": 1.1234, + "step": 940 + }, + { + "epoch": 1.2581939799331103, + "grad_norm": 1.2795106126789342, + "learning_rate": 9.31841327191664e-06, + "loss": 1.1978, + "step": 941 + }, + { + "epoch": 1.2595317725752508, + "grad_norm": 0.9587063544644768, + "learning_rate": 9.316059228936231e-06, + "loss": 1.1646, + "step": 942 + }, + { + "epoch": 1.2608695652173914, + "grad_norm": 1.2387067053592629, + "learning_rate": 9.31370142620653e-06, + "loss": 1.2495, + "step": 943 + }, + { + "epoch": 1.2622073578595319, + "grad_norm": 0.9535277861011645, + "learning_rate": 9.311339865781432e-06, + "loss": 1.3312, + "step": 944 + }, + { + "epoch": 1.2635451505016722, + "grad_norm": 0.789184758955248, + "learning_rate": 9.30897454971811e-06, + "loss": 1.3881, + "step": 945 + }, + { + "epoch": 1.2648829431438127, + "grad_norm": 1.1252033539230655, + "learning_rate": 9.30660548007701e-06, + "loss": 1.2707, + "step": 946 + }, + { + "epoch": 1.2662207357859532, + "grad_norm": 0.7911375563817133, + "learning_rate": 9.30423265892184e-06, + "loss": 1.2955, + "step": 947 + }, + { + "epoch": 1.2675585284280937, + "grad_norm": 1.002256533160927, + "learning_rate": 9.301856088319584e-06, + "loss": 1.0777, + "step": 948 + }, + { + "epoch": 1.2688963210702342, + "grad_norm": 1.0412488306139212, + "learning_rate": 9.299475770340492e-06, + "loss": 1.1568, + "step": 949 + }, + { + "epoch": 1.2702341137123745, + "grad_norm": 1.2962656234656655, + "learning_rate": 9.297091707058071e-06, + "loss": 1.2568, + "step": 950 + }, + { + "epoch": 1.271571906354515, + "grad_norm": 1.0761088289234755, + "learning_rate": 9.294703900549096e-06, + "loss": 1.0458, + "step": 951 + }, + { + "epoch": 1.2729096989966555, + "grad_norm": 1.0986103104188802, + "learning_rate": 9.292312352893603e-06, + "loss": 1.172, + "step": 952 + }, + { + 
"epoch": 1.274247491638796, + "grad_norm": 0.979600772131951, + "learning_rate": 9.289917066174887e-06, + "loss": 1.3347, + "step": 953 + }, + { + "epoch": 1.2755852842809365, + "grad_norm": 0.9056857492521745, + "learning_rate": 9.287518042479495e-06, + "loss": 1.2663, + "step": 954 + }, + { + "epoch": 1.2769230769230768, + "grad_norm": 0.8720848249487606, + "learning_rate": 9.285115283897237e-06, + "loss": 1.3118, + "step": 955 + }, + { + "epoch": 1.2782608695652173, + "grad_norm": 1.0240165800457504, + "learning_rate": 9.282708792521173e-06, + "loss": 1.2471, + "step": 956 + }, + { + "epoch": 1.2795986622073579, + "grad_norm": 1.0767355670845187, + "learning_rate": 9.280298570447612e-06, + "loss": 1.1379, + "step": 957 + }, + { + "epoch": 1.2809364548494984, + "grad_norm": 1.2956222208325503, + "learning_rate": 9.277884619776116e-06, + "loss": 1.2265, + "step": 958 + }, + { + "epoch": 1.2822742474916389, + "grad_norm": 1.1383011316877591, + "learning_rate": 9.275466942609495e-06, + "loss": 1.2144, + "step": 959 + }, + { + "epoch": 1.2836120401337792, + "grad_norm": 1.0887853934968055, + "learning_rate": 9.273045541053805e-06, + "loss": 1.0526, + "step": 960 + }, + { + "epoch": 1.2849498327759197, + "grad_norm": 1.0797737836044554, + "learning_rate": 9.270620417218344e-06, + "loss": 1.2161, + "step": 961 + }, + { + "epoch": 1.2862876254180602, + "grad_norm": 1.094215628019905, + "learning_rate": 9.268191573215653e-06, + "loss": 1.2244, + "step": 962 + }, + { + "epoch": 1.2876254180602007, + "grad_norm": 0.9147955749794136, + "learning_rate": 9.265759011161519e-06, + "loss": 1.2471, + "step": 963 + }, + { + "epoch": 1.2889632107023412, + "grad_norm": 0.9642297833148933, + "learning_rate": 9.263322733174962e-06, + "loss": 1.4357, + "step": 964 + }, + { + "epoch": 1.2903010033444815, + "grad_norm": 1.0297185055075062, + "learning_rate": 9.26088274137824e-06, + "loss": 1.175, + "step": 965 + }, + { + "epoch": 1.291638795986622, + "grad_norm": 1.1901192500308888, + "learning_rate": 9.258439037896846e-06, + "loss": 1.2687, + "step": 966 + }, + { + "epoch": 1.2929765886287625, + "grad_norm": 0.9751493279247477, + "learning_rate": 9.25599162485951e-06, + "loss": 1.1803, + "step": 967 + }, + { + "epoch": 1.294314381270903, + "grad_norm": 0.9970720870414708, + "learning_rate": 9.25354050439819e-06, + "loss": 1.066, + "step": 968 + }, + { + "epoch": 1.2956521739130435, + "grad_norm": 1.1439321049503104, + "learning_rate": 9.251085678648072e-06, + "loss": 1.5515, + "step": 969 + }, + { + "epoch": 1.2969899665551838, + "grad_norm": 0.8016787191017097, + "learning_rate": 9.248627149747573e-06, + "loss": 1.3032, + "step": 970 + }, + { + "epoch": 1.2983277591973243, + "grad_norm": 0.8275866004329332, + "learning_rate": 9.246164919838334e-06, + "loss": 1.4839, + "step": 971 + }, + { + "epoch": 1.2996655518394649, + "grad_norm": 1.0355606322445652, + "learning_rate": 9.243698991065222e-06, + "loss": 1.2555, + "step": 972 + }, + { + "epoch": 1.3010033444816054, + "grad_norm": 0.962125590388706, + "learning_rate": 9.241229365576325e-06, + "loss": 1.2293, + "step": 973 + }, + { + "epoch": 1.3023411371237459, + "grad_norm": 0.991939238632372, + "learning_rate": 9.238756045522949e-06, + "loss": 1.3073, + "step": 974 + }, + { + "epoch": 1.3036789297658862, + "grad_norm": 0.9455331711868543, + "learning_rate": 9.236279033059622e-06, + "loss": 1.2721, + "step": 975 + }, + { + "epoch": 1.3050167224080267, + "grad_norm": 0.9222982289835991, + "learning_rate": 9.233798330344085e-06, + "loss": 1.1472, + "step": 976 + 
}, + { + "epoch": 1.3063545150501672, + "grad_norm": 1.023248912081682, + "learning_rate": 9.231313939537298e-06, + "loss": 1.195, + "step": 977 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 1.0939719824269913, + "learning_rate": 9.22882586280343e-06, + "loss": 1.2173, + "step": 978 + }, + { + "epoch": 1.3090301003344482, + "grad_norm": 0.9677078624696247, + "learning_rate": 9.226334102309862e-06, + "loss": 1.2898, + "step": 979 + }, + { + "epoch": 1.3103678929765885, + "grad_norm": 1.0732301738023646, + "learning_rate": 9.223838660227183e-06, + "loss": 1.1945, + "step": 980 + }, + { + "epoch": 1.311705685618729, + "grad_norm": 0.9144956811097196, + "learning_rate": 9.221339538729191e-06, + "loss": 0.9736, + "step": 981 + }, + { + "epoch": 1.3130434782608695, + "grad_norm": 0.872152640457717, + "learning_rate": 9.21883673999289e-06, + "loss": 1.2444, + "step": 982 + }, + { + "epoch": 1.31438127090301, + "grad_norm": 1.0976183527857455, + "learning_rate": 9.21633026619848e-06, + "loss": 1.2626, + "step": 983 + }, + { + "epoch": 1.3157190635451506, + "grad_norm": 1.1577925769947868, + "learning_rate": 9.213820119529372e-06, + "loss": 1.2124, + "step": 984 + }, + { + "epoch": 1.3170568561872908, + "grad_norm": 1.2312720589901598, + "learning_rate": 9.211306302172174e-06, + "loss": 1.3225, + "step": 985 + }, + { + "epoch": 1.3183946488294314, + "grad_norm": 0.9669579064548182, + "learning_rate": 9.208788816316684e-06, + "loss": 1.4, + "step": 986 + }, + { + "epoch": 1.3197324414715719, + "grad_norm": 1.094515456025991, + "learning_rate": 9.206267664155906e-06, + "loss": 1.3802, + "step": 987 + }, + { + "epoch": 1.3210702341137124, + "grad_norm": 0.9834748275238886, + "learning_rate": 9.203742847886033e-06, + "loss": 1.2874, + "step": 988 + }, + { + "epoch": 1.322408026755853, + "grad_norm": 0.8179652683140917, + "learning_rate": 9.201214369706448e-06, + "loss": 1.2921, + "step": 989 + }, + { + "epoch": 1.3237458193979932, + "grad_norm": 1.2471169795346058, + "learning_rate": 9.198682231819727e-06, + "loss": 1.3742, + "step": 990 + }, + { + "epoch": 1.325083612040134, + "grad_norm": 0.9541928385984232, + "learning_rate": 9.196146436431635e-06, + "loss": 1.1598, + "step": 991 + }, + { + "epoch": 1.3264214046822742, + "grad_norm": 1.2560972787740659, + "learning_rate": 9.193606985751117e-06, + "loss": 1.2246, + "step": 992 + }, + { + "epoch": 1.3277591973244147, + "grad_norm": 0.9500272391445814, + "learning_rate": 9.191063881990308e-06, + "loss": 1.2625, + "step": 993 + }, + { + "epoch": 1.3290969899665552, + "grad_norm": 0.9014039899436193, + "learning_rate": 9.188517127364524e-06, + "loss": 1.1856, + "step": 994 + }, + { + "epoch": 1.3304347826086955, + "grad_norm": 0.8991356511516617, + "learning_rate": 9.185966724092261e-06, + "loss": 1.3328, + "step": 995 + }, + { + "epoch": 1.3317725752508363, + "grad_norm": 1.2531472999215845, + "learning_rate": 9.183412674395193e-06, + "loss": 1.1599, + "step": 996 + }, + { + "epoch": 1.3331103678929765, + "grad_norm": 0.9906989543398484, + "learning_rate": 9.180854980498168e-06, + "loss": 1.3914, + "step": 997 + }, + { + "epoch": 1.334448160535117, + "grad_norm": 0.8904315182016247, + "learning_rate": 9.178293644629214e-06, + "loss": 1.2367, + "step": 998 + }, + { + "epoch": 1.3357859531772576, + "grad_norm": 0.9560377816167329, + "learning_rate": 9.17572866901953e-06, + "loss": 1.2679, + "step": 999 + }, + { + "epoch": 1.3371237458193979, + "grad_norm": 1.0141571390926412, + "learning_rate": 9.173160055903478e-06, + "loss": 1.1652, + "step": 
1000 + }, + { + "epoch": 1.3384615384615386, + "grad_norm": 0.9991255790122938, + "learning_rate": 9.1705878075186e-06, + "loss": 1.3119, + "step": 1001 + }, + { + "epoch": 1.3397993311036789, + "grad_norm": 0.9321133572919247, + "learning_rate": 9.168011926105598e-06, + "loss": 1.0393, + "step": 1002 + }, + { + "epoch": 1.3411371237458194, + "grad_norm": 1.0074845199646074, + "learning_rate": 9.165432413908341e-06, + "loss": 1.0391, + "step": 1003 + }, + { + "epoch": 1.34247491638796, + "grad_norm": 0.8401127567950541, + "learning_rate": 9.162849273173857e-06, + "loss": 1.4022, + "step": 1004 + }, + { + "epoch": 1.3438127090301004, + "grad_norm": 1.174887232794134, + "learning_rate": 9.160262506152343e-06, + "loss": 1.1956, + "step": 1005 + }, + { + "epoch": 1.345150501672241, + "grad_norm": 0.8263883294463455, + "learning_rate": 9.157672115097145e-06, + "loss": 1.2299, + "step": 1006 + }, + { + "epoch": 1.3464882943143812, + "grad_norm": 0.9499395626411784, + "learning_rate": 9.155078102264773e-06, + "loss": 1.2251, + "step": 1007 + }, + { + "epoch": 1.3478260869565217, + "grad_norm": 1.0345648213392276, + "learning_rate": 9.15248046991489e-06, + "loss": 1.2237, + "step": 1008 + }, + { + "epoch": 1.3491638795986622, + "grad_norm": 19.941636001730835, + "learning_rate": 9.14987922031031e-06, + "loss": 1.8186, + "step": 1009 + }, + { + "epoch": 1.3505016722408028, + "grad_norm": 0.9922634012232548, + "learning_rate": 9.147274355717002e-06, + "loss": 1.2344, + "step": 1010 + }, + { + "epoch": 1.3518394648829433, + "grad_norm": 1.0779149442654485, + "learning_rate": 9.14466587840408e-06, + "loss": 1.3902, + "step": 1011 + }, + { + "epoch": 1.3531772575250836, + "grad_norm": 1.1683211351446599, + "learning_rate": 9.142053790643806e-06, + "loss": 1.3201, + "step": 1012 + }, + { + "epoch": 1.354515050167224, + "grad_norm": 1.1462639856037646, + "learning_rate": 9.13943809471159e-06, + "loss": 1.1889, + "step": 1013 + }, + { + "epoch": 1.3558528428093646, + "grad_norm": 1.0003012967831026, + "learning_rate": 9.136818792885981e-06, + "loss": 1.0814, + "step": 1014 + }, + { + "epoch": 1.357190635451505, + "grad_norm": 0.9534122011809676, + "learning_rate": 9.134195887448673e-06, + "loss": 1.3905, + "step": 1015 + }, + { + "epoch": 1.3585284280936456, + "grad_norm": 1.1189134622024333, + "learning_rate": 9.131569380684497e-06, + "loss": 1.2571, + "step": 1016 + }, + { + "epoch": 1.359866220735786, + "grad_norm": 0.9115368065239414, + "learning_rate": 9.12893927488142e-06, + "loss": 1.186, + "step": 1017 + }, + { + "epoch": 1.3612040133779264, + "grad_norm": 0.9078492073088505, + "learning_rate": 9.126305572330547e-06, + "loss": 1.229, + "step": 1018 + }, + { + "epoch": 1.362541806020067, + "grad_norm": 1.1565685253322278, + "learning_rate": 9.123668275326113e-06, + "loss": 1.0849, + "step": 1019 + }, + { + "epoch": 1.3638795986622074, + "grad_norm": 0.7449007020726667, + "learning_rate": 9.121027386165487e-06, + "loss": 1.2702, + "step": 1020 + }, + { + "epoch": 1.365217391304348, + "grad_norm": 0.8297754308159428, + "learning_rate": 9.118382907149164e-06, + "loss": 1.3174, + "step": 1021 + }, + { + "epoch": 1.3665551839464882, + "grad_norm": 1.2819417767977328, + "learning_rate": 9.115734840580772e-06, + "loss": 1.2338, + "step": 1022 + }, + { + "epoch": 1.3678929765886287, + "grad_norm": 0.911566226004418, + "learning_rate": 9.113083188767057e-06, + "loss": 1.1984, + "step": 1023 + }, + { + "epoch": 1.3692307692307693, + "grad_norm": 0.9014697701393071, + "learning_rate": 9.110427954017891e-06, + 
"loss": 1.2202, + "step": 1024 + }, + { + "epoch": 1.3705685618729098, + "grad_norm": 0.9155974438110213, + "learning_rate": 9.107769138646273e-06, + "loss": 1.0215, + "step": 1025 + }, + { + "epoch": 1.3719063545150503, + "grad_norm": 0.8609819771294143, + "learning_rate": 9.105106744968308e-06, + "loss": 1.1313, + "step": 1026 + }, + { + "epoch": 1.3732441471571906, + "grad_norm": 0.8975427147646119, + "learning_rate": 9.10244077530323e-06, + "loss": 1.1819, + "step": 1027 + }, + { + "epoch": 1.374581939799331, + "grad_norm": 0.8174060019665034, + "learning_rate": 9.099771231973382e-06, + "loss": 1.2126, + "step": 1028 + }, + { + "epoch": 1.3759197324414716, + "grad_norm": 0.8276566414736056, + "learning_rate": 9.097098117304223e-06, + "loss": 1.0207, + "step": 1029 + }, + { + "epoch": 1.377257525083612, + "grad_norm": 1.2061351752836542, + "learning_rate": 9.094421433624322e-06, + "loss": 1.248, + "step": 1030 + }, + { + "epoch": 1.3785953177257526, + "grad_norm": 0.9931480769786961, + "learning_rate": 9.09174118326536e-06, + "loss": 1.247, + "step": 1031 + }, + { + "epoch": 1.379933110367893, + "grad_norm": 1.1046811447042795, + "learning_rate": 9.089057368562113e-06, + "loss": 1.1755, + "step": 1032 + }, + { + "epoch": 1.3812709030100334, + "grad_norm": 0.9453362868739454, + "learning_rate": 9.086369991852478e-06, + "loss": 1.2357, + "step": 1033 + }, + { + "epoch": 1.382608695652174, + "grad_norm": 1.0635051755076053, + "learning_rate": 9.083679055477446e-06, + "loss": 1.4167, + "step": 1034 + }, + { + "epoch": 1.3839464882943144, + "grad_norm": 1.0356771915779504, + "learning_rate": 9.08098456178111e-06, + "loss": 1.2836, + "step": 1035 + }, + { + "epoch": 1.385284280936455, + "grad_norm": 0.9296883427255624, + "learning_rate": 9.078286513110661e-06, + "loss": 1.1677, + "step": 1036 + }, + { + "epoch": 1.3866220735785952, + "grad_norm": 1.1227487042717241, + "learning_rate": 9.07558491181639e-06, + "loss": 1.0197, + "step": 1037 + }, + { + "epoch": 1.3879598662207357, + "grad_norm": 0.9979170104080862, + "learning_rate": 9.07287976025168e-06, + "loss": 1.2241, + "step": 1038 + }, + { + "epoch": 1.3892976588628763, + "grad_norm": 0.8278988863098805, + "learning_rate": 9.070171060773007e-06, + "loss": 1.2929, + "step": 1039 + }, + { + "epoch": 1.3906354515050168, + "grad_norm": 1.1773639164194982, + "learning_rate": 9.067458815739938e-06, + "loss": 1.3397, + "step": 1040 + }, + { + "epoch": 1.3919732441471573, + "grad_norm": 0.9957011409464378, + "learning_rate": 9.064743027515127e-06, + "loss": 1.3156, + "step": 1041 + }, + { + "epoch": 1.3933110367892976, + "grad_norm": 1.2222072101795807, + "learning_rate": 9.062023698464322e-06, + "loss": 1.1773, + "step": 1042 + }, + { + "epoch": 1.394648829431438, + "grad_norm": 0.9919896867191496, + "learning_rate": 9.059300830956343e-06, + "loss": 1.3358, + "step": 1043 + }, + { + "epoch": 1.3959866220735786, + "grad_norm": 1.1047854908205597, + "learning_rate": 9.056574427363102e-06, + "loss": 1.2323, + "step": 1044 + }, + { + "epoch": 1.397324414715719, + "grad_norm": 1.033801272109346, + "learning_rate": 9.053844490059589e-06, + "loss": 1.2398, + "step": 1045 + }, + { + "epoch": 1.3986622073578596, + "grad_norm": 0.9671308358876771, + "learning_rate": 9.051111021423868e-06, + "loss": 1.1755, + "step": 1046 + }, + { + "epoch": 1.4, + "grad_norm": 1.0886700966146703, + "learning_rate": 9.048374023837086e-06, + "loss": 1.0798, + "step": 1047 + }, + { + "epoch": 1.4013377926421404, + "grad_norm": 1.1981739854432376, + "learning_rate": 
9.045633499683457e-06, + "loss": 1.3144, + "step": 1048 + }, + { + "epoch": 1.402675585284281, + "grad_norm": 1.6004215713627743, + "learning_rate": 9.042889451350274e-06, + "loss": 1.1856, + "step": 1049 + }, + { + "epoch": 1.4040133779264214, + "grad_norm": 1.059014382156057, + "learning_rate": 9.040141881227897e-06, + "loss": 1.2269, + "step": 1050 + }, + { + "epoch": 1.405351170568562, + "grad_norm": 0.9882959508628972, + "learning_rate": 9.03739079170975e-06, + "loss": 1.1694, + "step": 1051 + }, + { + "epoch": 1.4066889632107022, + "grad_norm": 0.87062117824566, + "learning_rate": 9.034636185192329e-06, + "loss": 1.2771, + "step": 1052 + }, + { + "epoch": 1.4080267558528428, + "grad_norm": 0.9649674468658204, + "learning_rate": 9.03187806407519e-06, + "loss": 1.2399, + "step": 1053 + }, + { + "epoch": 1.4093645484949833, + "grad_norm": 1.1672696080032086, + "learning_rate": 9.029116430760952e-06, + "loss": 1.1631, + "step": 1054 + }, + { + "epoch": 1.4107023411371238, + "grad_norm": 0.921444623047592, + "learning_rate": 9.026351287655294e-06, + "loss": 1.3865, + "step": 1055 + }, + { + "epoch": 1.4120401337792643, + "grad_norm": 1.2352028207594157, + "learning_rate": 9.023582637166948e-06, + "loss": 1.148, + "step": 1056 + }, + { + "epoch": 1.4133779264214046, + "grad_norm": 1.1223383306053056, + "learning_rate": 9.020810481707709e-06, + "loss": 1.1363, + "step": 1057 + }, + { + "epoch": 1.414715719063545, + "grad_norm": 0.8710563854323395, + "learning_rate": 9.01803482369242e-06, + "loss": 1.1259, + "step": 1058 + }, + { + "epoch": 1.4160535117056856, + "grad_norm": 1.068506816451257, + "learning_rate": 9.015255665538972e-06, + "loss": 1.3515, + "step": 1059 + }, + { + "epoch": 1.4173913043478261, + "grad_norm": 0.9233499889981182, + "learning_rate": 9.012473009668314e-06, + "loss": 1.0901, + "step": 1060 + }, + { + "epoch": 1.4187290969899666, + "grad_norm": 1.0562872958430851, + "learning_rate": 9.009686858504434e-06, + "loss": 1.196, + "step": 1061 + }, + { + "epoch": 1.420066889632107, + "grad_norm": 1.098434007192559, + "learning_rate": 9.00689721447437e-06, + "loss": 1.2314, + "step": 1062 + }, + { + "epoch": 1.4214046822742474, + "grad_norm": 0.9272680612889436, + "learning_rate": 9.004104080008198e-06, + "loss": 1.1304, + "step": 1063 + }, + { + "epoch": 1.422742474916388, + "grad_norm": 0.8372808907532759, + "learning_rate": 9.001307457539038e-06, + "loss": 1.2362, + "step": 1064 + }, + { + "epoch": 1.4240802675585285, + "grad_norm": 1.0367462098857163, + "learning_rate": 8.998507349503048e-06, + "loss": 1.2736, + "step": 1065 + }, + { + "epoch": 1.425418060200669, + "grad_norm": 1.0620106114702004, + "learning_rate": 8.99570375833942e-06, + "loss": 1.0994, + "step": 1066 + }, + { + "epoch": 1.4267558528428093, + "grad_norm": 1.155522444151508, + "learning_rate": 8.992896686490384e-06, + "loss": 1.2647, + "step": 1067 + }, + { + "epoch": 1.4280936454849498, + "grad_norm": 0.7922099433425501, + "learning_rate": 8.990086136401199e-06, + "loss": 1.2036, + "step": 1068 + }, + { + "epoch": 1.4294314381270903, + "grad_norm": 1.0170507619144022, + "learning_rate": 8.987272110520154e-06, + "loss": 1.3614, + "step": 1069 + }, + { + "epoch": 1.4307692307692308, + "grad_norm": 1.0385950246613989, + "learning_rate": 8.984454611298565e-06, + "loss": 1.1858, + "step": 1070 + }, + { + "epoch": 1.4321070234113713, + "grad_norm": 1.1725651706891171, + "learning_rate": 8.981633641190779e-06, + "loss": 1.1558, + "step": 1071 + }, + { + "epoch": 1.4334448160535116, + "grad_norm": 
0.8207772954807604, + "learning_rate": 8.978809202654161e-06, + "loss": 1.1166, + "step": 1072 + }, + { + "epoch": 1.434782608695652, + "grad_norm": 1.0630993118740673, + "learning_rate": 8.975981298149099e-06, + "loss": 1.1384, + "step": 1073 + }, + { + "epoch": 1.4361204013377926, + "grad_norm": 1.0660642696151819, + "learning_rate": 8.973149930139e-06, + "loss": 1.1206, + "step": 1074 + }, + { + "epoch": 1.4374581939799331, + "grad_norm": 0.9020934896041308, + "learning_rate": 8.97031510109029e-06, + "loss": 1.4381, + "step": 1075 + }, + { + "epoch": 1.4387959866220736, + "grad_norm": 1.2122434769518264, + "learning_rate": 8.967476813472407e-06, + "loss": 1.1333, + "step": 1076 + }, + { + "epoch": 1.440133779264214, + "grad_norm": 1.050918335970559, + "learning_rate": 8.964635069757803e-06, + "loss": 1.2153, + "step": 1077 + }, + { + "epoch": 1.4414715719063544, + "grad_norm": 0.9717415728639043, + "learning_rate": 8.96178987242194e-06, + "loss": 1.2685, + "step": 1078 + }, + { + "epoch": 1.442809364548495, + "grad_norm": 1.0294375234798938, + "learning_rate": 8.958941223943292e-06, + "loss": 1.2226, + "step": 1079 + }, + { + "epoch": 1.4441471571906355, + "grad_norm": 1.3048612401705417, + "learning_rate": 8.956089126803333e-06, + "loss": 1.1894, + "step": 1080 + }, + { + "epoch": 1.445484949832776, + "grad_norm": 0.854036177570934, + "learning_rate": 8.953233583486548e-06, + "loss": 1.0063, + "step": 1081 + }, + { + "epoch": 1.4468227424749163, + "grad_norm": 0.9773108068008826, + "learning_rate": 8.950374596480419e-06, + "loss": 1.2044, + "step": 1082 + }, + { + "epoch": 1.4481605351170568, + "grad_norm": 1.1657713195364285, + "learning_rate": 8.94751216827543e-06, + "loss": 1.0969, + "step": 1083 + }, + { + "epoch": 1.4494983277591973, + "grad_norm": 0.9286922011614165, + "learning_rate": 8.944646301365061e-06, + "loss": 1.2832, + "step": 1084 + }, + { + "epoch": 1.4508361204013378, + "grad_norm": 1.0418957551372185, + "learning_rate": 8.94177699824579e-06, + "loss": 1.3196, + "step": 1085 + }, + { + "epoch": 1.4521739130434783, + "grad_norm": 1.0432209481025003, + "learning_rate": 8.938904261417088e-06, + "loss": 1.155, + "step": 1086 + }, + { + "epoch": 1.4535117056856186, + "grad_norm": 1.1026415394797335, + "learning_rate": 8.936028093381414e-06, + "loss": 1.2538, + "step": 1087 + }, + { + "epoch": 1.4548494983277591, + "grad_norm": 0.9095971324236375, + "learning_rate": 8.933148496644218e-06, + "loss": 1.0192, + "step": 1088 + }, + { + "epoch": 1.4561872909698996, + "grad_norm": 1.2041982105684632, + "learning_rate": 8.930265473713939e-06, + "loss": 1.4025, + "step": 1089 + }, + { + "epoch": 1.4575250836120401, + "grad_norm": 1.1157002882503038, + "learning_rate": 8.927379027101994e-06, + "loss": 1.2702, + "step": 1090 + }, + { + "epoch": 1.4588628762541807, + "grad_norm": 1.2365143755652492, + "learning_rate": 8.924489159322792e-06, + "loss": 1.4227, + "step": 1091 + }, + { + "epoch": 1.460200668896321, + "grad_norm": 0.9101889949809833, + "learning_rate": 8.921595872893714e-06, + "loss": 1.1483, + "step": 1092 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.9564238299006802, + "learning_rate": 8.918699170335123e-06, + "loss": 1.2236, + "step": 1093 + }, + { + "epoch": 1.462876254180602, + "grad_norm": 1.2130229382326556, + "learning_rate": 8.915799054170357e-06, + "loss": 1.2328, + "step": 1094 + }, + { + "epoch": 1.4642140468227425, + "grad_norm": 1.1970960887032074, + "learning_rate": 8.912895526925726e-06, + "loss": 1.2345, + "step": 1095 + }, + { + "epoch": 
1.465551839464883, + "grad_norm": 1.1585193805035001, + "learning_rate": 8.909988591130514e-06, + "loss": 1.1797, + "step": 1096 + }, + { + "epoch": 1.4668896321070233, + "grad_norm": 1.1710812812628821, + "learning_rate": 8.907078249316972e-06, + "loss": 1.1354, + "step": 1097 + }, + { + "epoch": 1.468227424749164, + "grad_norm": 1.1594362432726348, + "learning_rate": 8.904164504020321e-06, + "loss": 1.3169, + "step": 1098 + }, + { + "epoch": 1.4695652173913043, + "grad_norm": 0.8651691047258486, + "learning_rate": 8.901247357778742e-06, + "loss": 1.1774, + "step": 1099 + }, + { + "epoch": 1.4709030100334448, + "grad_norm": 0.887069050902162, + "learning_rate": 8.898326813133385e-06, + "loss": 1.2378, + "step": 1100 + }, + { + "epoch": 1.4722408026755853, + "grad_norm": 1.0137417027347357, + "learning_rate": 8.895402872628352e-06, + "loss": 1.0831, + "step": 1101 + }, + { + "epoch": 1.4735785953177256, + "grad_norm": 0.8313083492847975, + "learning_rate": 8.892475538810714e-06, + "loss": 1.079, + "step": 1102 + }, + { + "epoch": 1.4749163879598663, + "grad_norm": 1.2126040640149927, + "learning_rate": 8.889544814230487e-06, + "loss": 1.2404, + "step": 1103 + }, + { + "epoch": 1.4762541806020066, + "grad_norm": 0.764220299411412, + "learning_rate": 8.886610701440648e-06, + "loss": 1.1628, + "step": 1104 + }, + { + "epoch": 1.4775919732441471, + "grad_norm": 1.1088491207625026, + "learning_rate": 8.883673202997121e-06, + "loss": 1.5537, + "step": 1105 + }, + { + "epoch": 1.4789297658862877, + "grad_norm": 0.9388740481944372, + "learning_rate": 8.880732321458785e-06, + "loss": 1.1153, + "step": 1106 + }, + { + "epoch": 1.4802675585284282, + "grad_norm": 1.010844223485598, + "learning_rate": 8.87778805938746e-06, + "loss": 1.1967, + "step": 1107 + }, + { + "epoch": 1.4816053511705687, + "grad_norm": 0.9668305028403509, + "learning_rate": 8.874840419347912e-06, + "loss": 1.1373, + "step": 1108 + }, + { + "epoch": 1.482943143812709, + "grad_norm": 0.7751816105524765, + "learning_rate": 8.871889403907853e-06, + "loss": 1.2746, + "step": 1109 + }, + { + "epoch": 1.4842809364548495, + "grad_norm": 1.1370415589738425, + "learning_rate": 8.868935015637932e-06, + "loss": 1.1559, + "step": 1110 + }, + { + "epoch": 1.48561872909699, + "grad_norm": 1.0254606509302928, + "learning_rate": 8.865977257111738e-06, + "loss": 1.2465, + "step": 1111 + }, + { + "epoch": 1.4869565217391305, + "grad_norm": 0.9578190191811341, + "learning_rate": 8.863016130905795e-06, + "loss": 1.2228, + "step": 1112 + }, + { + "epoch": 1.488294314381271, + "grad_norm": 1.2009537901221174, + "learning_rate": 8.86005163959956e-06, + "loss": 1.3003, + "step": 1113 + }, + { + "epoch": 1.4896321070234113, + "grad_norm": 0.9398761584699801, + "learning_rate": 8.857083785775423e-06, + "loss": 1.3299, + "step": 1114 + }, + { + "epoch": 1.4909698996655518, + "grad_norm": 0.9614376060644311, + "learning_rate": 8.854112572018702e-06, + "loss": 1.159, + "step": 1115 + }, + { + "epoch": 1.4923076923076923, + "grad_norm": 0.974157764541156, + "learning_rate": 8.851138000917641e-06, + "loss": 1.224, + "step": 1116 + }, + { + "epoch": 1.4936454849498328, + "grad_norm": 1.1897975043001048, + "learning_rate": 8.84816007506341e-06, + "loss": 1.2363, + "step": 1117 + }, + { + "epoch": 1.4949832775919734, + "grad_norm": 1.0374693138351918, + "learning_rate": 8.845178797050102e-06, + "loss": 1.0076, + "step": 1118 + }, + { + "epoch": 1.4963210702341136, + "grad_norm": 1.0073318634174595, + "learning_rate": 8.842194169474727e-06, + "loss": 1.3009, + 
"step": 1119 + }, + { + "epoch": 1.4976588628762542, + "grad_norm": 0.9733488666879512, + "learning_rate": 8.839206194937218e-06, + "loss": 1.2222, + "step": 1120 + }, + { + "epoch": 1.4989966555183947, + "grad_norm": 0.9883117460287831, + "learning_rate": 8.836214876040416e-06, + "loss": 1.146, + "step": 1121 + }, + { + "epoch": 1.500334448160535, + "grad_norm": 0.9931580920108711, + "learning_rate": 8.833220215390085e-06, + "loss": 0.9951, + "step": 1122 + }, + { + "epoch": 1.5016722408026757, + "grad_norm": 0.942280264195419, + "learning_rate": 8.83022221559489e-06, + "loss": 1.25, + "step": 1123 + }, + { + "epoch": 1.503010033444816, + "grad_norm": 1.0446604626074532, + "learning_rate": 8.827220879266414e-06, + "loss": 1.1995, + "step": 1124 + }, + { + "epoch": 1.5043478260869565, + "grad_norm": 1.249145081277354, + "learning_rate": 8.824216209019139e-06, + "loss": 1.1643, + "step": 1125 + }, + { + "epoch": 1.505685618729097, + "grad_norm": 0.8841657831866403, + "learning_rate": 8.821208207470454e-06, + "loss": 1.131, + "step": 1126 + }, + { + "epoch": 1.5070234113712373, + "grad_norm": 0.9715343121121147, + "learning_rate": 8.818196877240652e-06, + "loss": 1.2352, + "step": 1127 + }, + { + "epoch": 1.508361204013378, + "grad_norm": 1.0088817867296578, + "learning_rate": 8.815182220952922e-06, + "loss": 1.1705, + "step": 1128 + }, + { + "epoch": 1.5096989966555183, + "grad_norm": 0.8361944785073098, + "learning_rate": 8.812164241233354e-06, + "loss": 1.2485, + "step": 1129 + }, + { + "epoch": 1.5110367892976588, + "grad_norm": 1.1190081275225168, + "learning_rate": 8.80914294071093e-06, + "loss": 1.2406, + "step": 1130 + }, + { + "epoch": 1.5123745819397993, + "grad_norm": 1.0469567499728767, + "learning_rate": 8.806118322017525e-06, + "loss": 1.3154, + "step": 1131 + }, + { + "epoch": 1.5137123745819396, + "grad_norm": 0.9920567761481239, + "learning_rate": 8.803090387787909e-06, + "loss": 1.2081, + "step": 1132 + }, + { + "epoch": 1.5150501672240804, + "grad_norm": 1.032133221403735, + "learning_rate": 8.800059140659731e-06, + "loss": 1.0879, + "step": 1133 + }, + { + "epoch": 1.5163879598662207, + "grad_norm": 0.8840890539138067, + "learning_rate": 8.797024583273536e-06, + "loss": 1.3182, + "step": 1134 + }, + { + "epoch": 1.5177257525083612, + "grad_norm": 1.0340933587721703, + "learning_rate": 8.793986718272747e-06, + "loss": 1.4647, + "step": 1135 + }, + { + "epoch": 1.5190635451505017, + "grad_norm": 0.9546575104828505, + "learning_rate": 8.790945548303669e-06, + "loss": 1.1425, + "step": 1136 + }, + { + "epoch": 1.5204013377926422, + "grad_norm": 1.310371287409543, + "learning_rate": 8.787901076015487e-06, + "loss": 1.2691, + "step": 1137 + }, + { + "epoch": 1.5217391304347827, + "grad_norm": 1.1135716801328095, + "learning_rate": 8.784853304060262e-06, + "loss": 1.2727, + "step": 1138 + }, + { + "epoch": 1.523076923076923, + "grad_norm": 0.8415656033337595, + "learning_rate": 8.781802235092927e-06, + "loss": 1.1996, + "step": 1139 + }, + { + "epoch": 1.5244147157190635, + "grad_norm": 0.979100960609878, + "learning_rate": 8.778747871771293e-06, + "loss": 1.2339, + "step": 1140 + }, + { + "epoch": 1.525752508361204, + "grad_norm": 0.9162126626062809, + "learning_rate": 8.775690216756035e-06, + "loss": 1.1938, + "step": 1141 + }, + { + "epoch": 1.5270903010033445, + "grad_norm": 1.0829168648872023, + "learning_rate": 8.772629272710698e-06, + "loss": 1.2271, + "step": 1142 + }, + { + "epoch": 1.528428093645485, + "grad_norm": 1.0902197776159066, + "learning_rate": 
8.769565042301692e-06, + "loss": 1.2375, + "step": 1143 + }, + { + "epoch": 1.5297658862876253, + "grad_norm": 1.0059131266583865, + "learning_rate": 8.766497528198289e-06, + "loss": 1.3393, + "step": 1144 + }, + { + "epoch": 1.5311036789297658, + "grad_norm": 1.1463294963350308, + "learning_rate": 8.763426733072624e-06, + "loss": 1.2976, + "step": 1145 + }, + { + "epoch": 1.5324414715719064, + "grad_norm": 1.0244359033827497, + "learning_rate": 8.760352659599684e-06, + "loss": 1.2302, + "step": 1146 + }, + { + "epoch": 1.5337792642140469, + "grad_norm": 0.985896930859517, + "learning_rate": 8.757275310457321e-06, + "loss": 1.3561, + "step": 1147 + }, + { + "epoch": 1.5351170568561874, + "grad_norm": 1.133915019353679, + "learning_rate": 8.754194688326229e-06, + "loss": 1.3548, + "step": 1148 + }, + { + "epoch": 1.5364548494983277, + "grad_norm": 0.9944788298504075, + "learning_rate": 8.751110795889966e-06, + "loss": 1.1602, + "step": 1149 + }, + { + "epoch": 1.5377926421404682, + "grad_norm": 1.0112639385646816, + "learning_rate": 8.748023635834927e-06, + "loss": 1.1086, + "step": 1150 + }, + { + "epoch": 1.5391304347826087, + "grad_norm": 1.0463545822166416, + "learning_rate": 8.744933210850363e-06, + "loss": 1.0752, + "step": 1151 + }, + { + "epoch": 1.5404682274247492, + "grad_norm": 0.8183986929646418, + "learning_rate": 8.741839523628361e-06, + "loss": 1.0426, + "step": 1152 + }, + { + "epoch": 1.5418060200668897, + "grad_norm": 1.0346633138168515, + "learning_rate": 8.738742576863855e-06, + "loss": 1.2678, + "step": 1153 + }, + { + "epoch": 1.54314381270903, + "grad_norm": 1.3052047714818849, + "learning_rate": 8.735642373254617e-06, + "loss": 1.2484, + "step": 1154 + }, + { + "epoch": 1.5444816053511705, + "grad_norm": 1.0296316809633221, + "learning_rate": 8.732538915501257e-06, + "loss": 1.315, + "step": 1155 + }, + { + "epoch": 1.545819397993311, + "grad_norm": 0.7905189233336698, + "learning_rate": 8.729432206307218e-06, + "loss": 1.2485, + "step": 1156 + }, + { + "epoch": 1.5471571906354515, + "grad_norm": 1.113022742395122, + "learning_rate": 8.726322248378775e-06, + "loss": 1.2607, + "step": 1157 + }, + { + "epoch": 1.548494983277592, + "grad_norm": 0.9523946815865699, + "learning_rate": 8.723209044425034e-06, + "loss": 1.3057, + "step": 1158 + }, + { + "epoch": 1.5498327759197323, + "grad_norm": 1.0675008470403904, + "learning_rate": 8.72009259715793e-06, + "loss": 1.164, + "step": 1159 + }, + { + "epoch": 1.551170568561873, + "grad_norm": 0.8247613799077199, + "learning_rate": 8.71697290929222e-06, + "loss": 1.2052, + "step": 1160 + }, + { + "epoch": 1.5525083612040134, + "grad_norm": 0.8837784684072908, + "learning_rate": 8.71384998354549e-06, + "loss": 1.137, + "step": 1161 + }, + { + "epoch": 1.5538461538461539, + "grad_norm": 0.8940851948327215, + "learning_rate": 8.710723822638138e-06, + "loss": 0.9655, + "step": 1162 + }, + { + "epoch": 1.5551839464882944, + "grad_norm": 0.8122177005315826, + "learning_rate": 8.707594429293387e-06, + "loss": 1.0862, + "step": 1163 + }, + { + "epoch": 1.5565217391304347, + "grad_norm": 0.9988270147182248, + "learning_rate": 8.704461806237272e-06, + "loss": 1.3162, + "step": 1164 + }, + { + "epoch": 1.5578595317725754, + "grad_norm": 1.0722219975702394, + "learning_rate": 8.701325956198643e-06, + "loss": 1.1103, + "step": 1165 + }, + { + "epoch": 1.5591973244147157, + "grad_norm": 0.7157481499028749, + "learning_rate": 8.69818688190916e-06, + "loss": 1.2412, + "step": 1166 + }, + { + "epoch": 1.5605351170568562, + "grad_norm": 
1.0010108435243625, + "learning_rate": 8.695044586103297e-06, + "loss": 1.2156, + "step": 1167 + }, + { + "epoch": 1.5618729096989967, + "grad_norm": 1.024706357390756, + "learning_rate": 8.691899071518323e-06, + "loss": 1.2945, + "step": 1168 + }, + { + "epoch": 1.563210702341137, + "grad_norm": 0.7969641684019669, + "learning_rate": 8.688750340894324e-06, + "loss": 1.03, + "step": 1169 + }, + { + "epoch": 1.5645484949832777, + "grad_norm": 0.8266401448018248, + "learning_rate": 8.685598396974178e-06, + "loss": 1.1439, + "step": 1170 + }, + { + "epoch": 1.565886287625418, + "grad_norm": 1.0262111807423946, + "learning_rate": 8.682443242503564e-06, + "loss": 1.0741, + "step": 1171 + }, + { + "epoch": 1.5672240802675586, + "grad_norm": 0.8402664367801227, + "learning_rate": 8.679284880230963e-06, + "loss": 1.2732, + "step": 1172 + }, + { + "epoch": 1.568561872909699, + "grad_norm": 0.768799166390661, + "learning_rate": 8.676123312907641e-06, + "loss": 0.9808, + "step": 1173 + }, + { + "epoch": 1.5698996655518394, + "grad_norm": 0.9914292574643382, + "learning_rate": 8.672958543287666e-06, + "loss": 1.2198, + "step": 1174 + }, + { + "epoch": 1.57123745819398, + "grad_norm": 1.1025681369240994, + "learning_rate": 8.66979057412789e-06, + "loss": 1.3054, + "step": 1175 + }, + { + "epoch": 1.5725752508361204, + "grad_norm": 1.0885067589809334, + "learning_rate": 8.666619408187953e-06, + "loss": 1.1674, + "step": 1176 + }, + { + "epoch": 1.5739130434782609, + "grad_norm": 0.8074444579471776, + "learning_rate": 8.663445048230278e-06, + "loss": 1.2951, + "step": 1177 + }, + { + "epoch": 1.5752508361204014, + "grad_norm": 1.0980035012133076, + "learning_rate": 8.660267497020074e-06, + "loss": 1.0834, + "step": 1178 + }, + { + "epoch": 1.5765886287625417, + "grad_norm": 0.7975720325819651, + "learning_rate": 8.657086757325328e-06, + "loss": 0.9001, + "step": 1179 + }, + { + "epoch": 1.5779264214046824, + "grad_norm": 0.8734209938287509, + "learning_rate": 8.653902831916803e-06, + "loss": 1.0204, + "step": 1180 + }, + { + "epoch": 1.5792642140468227, + "grad_norm": 0.8934824794228605, + "learning_rate": 8.650715723568039e-06, + "loss": 1.1632, + "step": 1181 + }, + { + "epoch": 1.5806020066889632, + "grad_norm": 0.9287707340597434, + "learning_rate": 8.64752543505535e-06, + "loss": 1.4041, + "step": 1182 + }, + { + "epoch": 1.5819397993311037, + "grad_norm": 1.2320860526892872, + "learning_rate": 8.644331969157815e-06, + "loss": 1.1988, + "step": 1183 + }, + { + "epoch": 1.583277591973244, + "grad_norm": 1.2112862985816066, + "learning_rate": 8.641135328657288e-06, + "loss": 1.064, + "step": 1184 + }, + { + "epoch": 1.5846153846153848, + "grad_norm": 1.0825521342748703, + "learning_rate": 8.637935516338384e-06, + "loss": 1.3545, + "step": 1185 + }, + { + "epoch": 1.585953177257525, + "grad_norm": 0.8243527696426569, + "learning_rate": 8.63473253498848e-06, + "loss": 1.1163, + "step": 1186 + }, + { + "epoch": 1.5872909698996656, + "grad_norm": 0.8126710905581507, + "learning_rate": 8.63152638739772e-06, + "loss": 1.0537, + "step": 1187 + }, + { + "epoch": 1.588628762541806, + "grad_norm": 1.1316070444194284, + "learning_rate": 8.628317076358997e-06, + "loss": 1.3708, + "step": 1188 + }, + { + "epoch": 1.5899665551839464, + "grad_norm": 0.6819928549050367, + "learning_rate": 8.625104604667965e-06, + "loss": 1.2402, + "step": 1189 + }, + { + "epoch": 1.591304347826087, + "grad_norm": 0.936305857773396, + "learning_rate": 8.62188897512303e-06, + "loss": 1.279, + "step": 1190 + }, + { + "epoch": 
1.5926421404682274, + "grad_norm": 1.0490888752309349, + "learning_rate": 8.61867019052535e-06, + "loss": 1.2292, + "step": 1191 + }, + { + "epoch": 1.593979933110368, + "grad_norm": 0.9311608436498332, + "learning_rate": 8.615448253678834e-06, + "loss": 1.4308, + "step": 1192 + }, + { + "epoch": 1.5953177257525084, + "grad_norm": 1.0081167828847273, + "learning_rate": 8.61222316739013e-06, + "loss": 1.1041, + "step": 1193 + }, + { + "epoch": 1.5966555183946487, + "grad_norm": 1.1975070118592226, + "learning_rate": 8.608994934468633e-06, + "loss": 1.24, + "step": 1194 + }, + { + "epoch": 1.5979933110367894, + "grad_norm": 1.1299324565643754, + "learning_rate": 8.60576355772648e-06, + "loss": 1.1813, + "step": 1195 + }, + { + "epoch": 1.5993311036789297, + "grad_norm": 1.0686264461712143, + "learning_rate": 8.602529039978546e-06, + "loss": 1.3454, + "step": 1196 + }, + { + "epoch": 1.6006688963210702, + "grad_norm": 1.506694252254871, + "learning_rate": 8.599291384042442e-06, + "loss": 1.135, + "step": 1197 + }, + { + "epoch": 1.6020066889632107, + "grad_norm": 0.8173261060725364, + "learning_rate": 8.596050592738514e-06, + "loss": 1.1018, + "step": 1198 + }, + { + "epoch": 1.603344481605351, + "grad_norm": 1.0796094956856843, + "learning_rate": 8.592806668889835e-06, + "loss": 1.1425, + "step": 1199 + }, + { + "epoch": 1.6046822742474918, + "grad_norm": 1.1120005974537381, + "learning_rate": 8.58955961532221e-06, + "loss": 1.3145, + "step": 1200 + }, + { + "epoch": 1.606020066889632, + "grad_norm": 0.9813382553264794, + "learning_rate": 8.586309434864173e-06, + "loss": 1.3068, + "step": 1201 + }, + { + "epoch": 1.6073578595317726, + "grad_norm": 1.0222794892839107, + "learning_rate": 8.583056130346977e-06, + "loss": 1.0933, + "step": 1202 + }, + { + "epoch": 1.608695652173913, + "grad_norm": 1.2297999941053692, + "learning_rate": 8.579799704604597e-06, + "loss": 1.4461, + "step": 1203 + }, + { + "epoch": 1.6100334448160534, + "grad_norm": 0.8509500529256493, + "learning_rate": 8.57654016047373e-06, + "loss": 1.058, + "step": 1204 + }, + { + "epoch": 1.611371237458194, + "grad_norm": 0.8566000746924506, + "learning_rate": 8.573277500793788e-06, + "loss": 1.1154, + "step": 1205 + }, + { + "epoch": 1.6127090301003344, + "grad_norm": 1.1680026316292529, + "learning_rate": 8.570011728406895e-06, + "loss": 1.2368, + "step": 1206 + }, + { + "epoch": 1.614046822742475, + "grad_norm": 0.8056178738310297, + "learning_rate": 8.56674284615789e-06, + "loss": 1.2766, + "step": 1207 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.9615171070760908, + "learning_rate": 8.563470856894316e-06, + "loss": 1.0969, + "step": 1208 + }, + { + "epoch": 1.6167224080267557, + "grad_norm": 0.9186925296614636, + "learning_rate": 8.560195763466428e-06, + "loss": 1.2195, + "step": 1209 + }, + { + "epoch": 1.6180602006688964, + "grad_norm": 1.0127548937398956, + "learning_rate": 8.556917568727182e-06, + "loss": 1.1957, + "step": 1210 + }, + { + "epoch": 1.6193979933110367, + "grad_norm": 0.9142301375233928, + "learning_rate": 8.553636275532236e-06, + "loss": 1.2496, + "step": 1211 + }, + { + "epoch": 1.6207357859531772, + "grad_norm": 0.9641069938720815, + "learning_rate": 8.550351886739949e-06, + "loss": 1.2846, + "step": 1212 + }, + { + "epoch": 1.6220735785953178, + "grad_norm": 1.125712144920233, + "learning_rate": 8.547064405211376e-06, + "loss": 1.3517, + "step": 1213 + }, + { + "epoch": 1.623411371237458, + "grad_norm": 1.1225619712520456, + "learning_rate": 8.54377383381026e-06, + "loss": 1.0787, + "step": 
1214 + }, + { + "epoch": 1.6247491638795988, + "grad_norm": 1.0880201025715235, + "learning_rate": 8.540480175403045e-06, + "loss": 1.0896, + "step": 1215 + }, + { + "epoch": 1.626086956521739, + "grad_norm": 0.8381046575363313, + "learning_rate": 8.53718343285886e-06, + "loss": 1.1302, + "step": 1216 + }, + { + "epoch": 1.6274247491638796, + "grad_norm": 0.8870361492463814, + "learning_rate": 8.533883609049517e-06, + "loss": 1.3735, + "step": 1217 + }, + { + "epoch": 1.62876254180602, + "grad_norm": 0.9775204567287861, + "learning_rate": 8.530580706849518e-06, + "loss": 1.2682, + "step": 1218 + }, + { + "epoch": 1.6301003344481604, + "grad_norm": 0.9992148071721407, + "learning_rate": 8.527274729136042e-06, + "loss": 1.335, + "step": 1219 + }, + { + "epoch": 1.6314381270903011, + "grad_norm": 0.9388519071471952, + "learning_rate": 8.523965678788952e-06, + "loss": 1.1498, + "step": 1220 + }, + { + "epoch": 1.6327759197324414, + "grad_norm": 1.0028333393337947, + "learning_rate": 8.520653558690785e-06, + "loss": 1.0837, + "step": 1221 + }, + { + "epoch": 1.634113712374582, + "grad_norm": 1.1192743130516538, + "learning_rate": 8.51733837172675e-06, + "loss": 1.4747, + "step": 1222 + }, + { + "epoch": 1.6354515050167224, + "grad_norm": 0.970387510873497, + "learning_rate": 8.51402012078473e-06, + "loss": 1.2783, + "step": 1223 + }, + { + "epoch": 1.6367892976588627, + "grad_norm": 0.7121037653454058, + "learning_rate": 8.510698808755275e-06, + "loss": 1.2041, + "step": 1224 + }, + { + "epoch": 1.6381270903010035, + "grad_norm": 1.0039223528532768, + "learning_rate": 8.507374438531606e-06, + "loss": 1.1833, + "step": 1225 + }, + { + "epoch": 1.6394648829431437, + "grad_norm": 0.9605517411948864, + "learning_rate": 8.504047013009605e-06, + "loss": 1.2484, + "step": 1226 + }, + { + "epoch": 1.6408026755852843, + "grad_norm": 0.8538811369008511, + "learning_rate": 8.500716535087815e-06, + "loss": 1.116, + "step": 1227 + }, + { + "epoch": 1.6421404682274248, + "grad_norm": 0.8702001820725137, + "learning_rate": 8.497383007667435e-06, + "loss": 1.3254, + "step": 1228 + }, + { + "epoch": 1.643478260869565, + "grad_norm": 1.1014485672922483, + "learning_rate": 8.494046433652327e-06, + "loss": 1.2576, + "step": 1229 + }, + { + "epoch": 1.6448160535117058, + "grad_norm": 0.8212462842181725, + "learning_rate": 8.490706815949006e-06, + "loss": 1.3841, + "step": 1230 + }, + { + "epoch": 1.646153846153846, + "grad_norm": 1.030909406036088, + "learning_rate": 8.487364157466633e-06, + "loss": 1.0935, + "step": 1231 + }, + { + "epoch": 1.6474916387959866, + "grad_norm": 0.7628997753544281, + "learning_rate": 8.484018461117023e-06, + "loss": 1.3772, + "step": 1232 + }, + { + "epoch": 1.648829431438127, + "grad_norm": 1.011734645994114, + "learning_rate": 8.480669729814635e-06, + "loss": 1.2418, + "step": 1233 + }, + { + "epoch": 1.6501672240802676, + "grad_norm": 1.1110665601485443, + "learning_rate": 8.477317966476569e-06, + "loss": 1.524, + "step": 1234 + }, + { + "epoch": 1.6515050167224081, + "grad_norm": 0.9825282559094409, + "learning_rate": 8.473963174022574e-06, + "loss": 1.1882, + "step": 1235 + }, + { + "epoch": 1.6528428093645484, + "grad_norm": 1.12274847201327, + "learning_rate": 8.470605355375033e-06, + "loss": 1.2298, + "step": 1236 + }, + { + "epoch": 1.654180602006689, + "grad_norm": 1.015481948377949, + "learning_rate": 8.467244513458961e-06, + "loss": 1.1921, + "step": 1237 + }, + { + "epoch": 1.6555183946488294, + "grad_norm": 0.9533707658652604, + "learning_rate": 8.463880651202014e-06, + 
"loss": 1.2544, + "step": 1238 + }, + { + "epoch": 1.65685618729097, + "grad_norm": 0.835799028769581, + "learning_rate": 8.460513771534475e-06, + "loss": 1.2215, + "step": 1239 + }, + { + "epoch": 1.6581939799331105, + "grad_norm": 1.7655713338614, + "learning_rate": 8.457143877389258e-06, + "loss": 1.3071, + "step": 1240 + }, + { + "epoch": 1.6595317725752508, + "grad_norm": 1.0413088355545845, + "learning_rate": 8.453770971701899e-06, + "loss": 1.2423, + "step": 1241 + }, + { + "epoch": 1.6608695652173913, + "grad_norm": 1.1522945502604902, + "learning_rate": 8.450395057410561e-06, + "loss": 1.2183, + "step": 1242 + }, + { + "epoch": 1.6622073578595318, + "grad_norm": 1.170398287009412, + "learning_rate": 8.447016137456025e-06, + "loss": 1.2542, + "step": 1243 + }, + { + "epoch": 1.6635451505016723, + "grad_norm": 0.7948613881302046, + "learning_rate": 8.443634214781693e-06, + "loss": 1.2815, + "step": 1244 + }, + { + "epoch": 1.6648829431438128, + "grad_norm": 1.1426769884832928, + "learning_rate": 8.440249292333583e-06, + "loss": 1.2045, + "step": 1245 + }, + { + "epoch": 1.666220735785953, + "grad_norm": 1.0587316533021889, + "learning_rate": 8.43686137306032e-06, + "loss": 1.1968, + "step": 1246 + }, + { + "epoch": 1.6675585284280936, + "grad_norm": 0.802035604743874, + "learning_rate": 8.43347045991315e-06, + "loss": 1.2353, + "step": 1247 + }, + { + "epoch": 1.6688963210702341, + "grad_norm": 1.0781867012853161, + "learning_rate": 8.430076555845917e-06, + "loss": 1.1869, + "step": 1248 + }, + { + "epoch": 1.6702341137123746, + "grad_norm": 1.0690302895173525, + "learning_rate": 8.426679663815073e-06, + "loss": 1.08, + "step": 1249 + }, + { + "epoch": 1.6715719063545151, + "grad_norm": 0.9996986726570624, + "learning_rate": 8.42327978677968e-06, + "loss": 1.3441, + "step": 1250 + }, + { + "epoch": 1.6729096989966554, + "grad_norm": 0.9340406536598379, + "learning_rate": 8.41987692770139e-06, + "loss": 1.5133, + "step": 1251 + }, + { + "epoch": 1.674247491638796, + "grad_norm": 1.1731273043247636, + "learning_rate": 8.41647108954446e-06, + "loss": 1.2017, + "step": 1252 + }, + { + "epoch": 1.6755852842809364, + "grad_norm": 1.1158763799866995, + "learning_rate": 8.413062275275737e-06, + "loss": 1.2581, + "step": 1253 + }, + { + "epoch": 1.676923076923077, + "grad_norm": 0.9912675648311491, + "learning_rate": 8.409650487864662e-06, + "loss": 1.4627, + "step": 1254 + }, + { + "epoch": 1.6782608695652175, + "grad_norm": 1.207775755791076, + "learning_rate": 8.40623573028327e-06, + "loss": 1.237, + "step": 1255 + }, + { + "epoch": 1.6795986622073578, + "grad_norm": 0.7993880230135922, + "learning_rate": 8.402818005506181e-06, + "loss": 1.3152, + "step": 1256 + }, + { + "epoch": 1.6809364548494983, + "grad_norm": 1.0570277038287723, + "learning_rate": 8.399397316510596e-06, + "loss": 1.2754, + "step": 1257 + }, + { + "epoch": 1.6822742474916388, + "grad_norm": 0.9899654473018303, + "learning_rate": 8.395973666276301e-06, + "loss": 1.2189, + "step": 1258 + }, + { + "epoch": 1.6836120401337793, + "grad_norm": 0.920044509655933, + "learning_rate": 8.392547057785662e-06, + "loss": 1.1201, + "step": 1259 + }, + { + "epoch": 1.6849498327759198, + "grad_norm": 0.9101468519796663, + "learning_rate": 8.389117494023622e-06, + "loss": 1.1247, + "step": 1260 + }, + { + "epoch": 1.68628762541806, + "grad_norm": 1.1937229360431443, + "learning_rate": 8.385684977977698e-06, + "loss": 1.3514, + "step": 1261 + }, + { + "epoch": 1.6876254180602008, + "grad_norm": 1.48347505320392, + "learning_rate": 
8.382249512637978e-06, + "loss": 1.18, + "step": 1262 + }, + { + "epoch": 1.6889632107023411, + "grad_norm": 1.2800412122156815, + "learning_rate": 8.378811100997122e-06, + "loss": 1.3585, + "step": 1263 + }, + { + "epoch": 1.6903010033444816, + "grad_norm": 1.3596632926097427, + "learning_rate": 8.375369746050353e-06, + "loss": 1.3286, + "step": 1264 + }, + { + "epoch": 1.6916387959866221, + "grad_norm": 0.9779767025806692, + "learning_rate": 8.371925450795458e-06, + "loss": 1.2249, + "step": 1265 + }, + { + "epoch": 1.6929765886287624, + "grad_norm": 1.1545429511045855, + "learning_rate": 8.368478218232787e-06, + "loss": 1.1757, + "step": 1266 + }, + { + "epoch": 1.6943143812709032, + "grad_norm": 0.9812924944748339, + "learning_rate": 8.365028051365249e-06, + "loss": 1.0506, + "step": 1267 + }, + { + "epoch": 1.6956521739130435, + "grad_norm": 0.9951219339975371, + "learning_rate": 8.361574953198309e-06, + "loss": 1.3159, + "step": 1268 + }, + { + "epoch": 1.696989966555184, + "grad_norm": 1.181587838891036, + "learning_rate": 8.358118926739984e-06, + "loss": 1.0702, + "step": 1269 + }, + { + "epoch": 1.6983277591973245, + "grad_norm": 0.7189077731455846, + "learning_rate": 8.354659975000843e-06, + "loss": 1.2224, + "step": 1270 + }, + { + "epoch": 1.6996655518394648, + "grad_norm": 0.8700908834227391, + "learning_rate": 8.351198100994003e-06, + "loss": 1.0792, + "step": 1271 + }, + { + "epoch": 1.7010033444816055, + "grad_norm": 1.2921526483454553, + "learning_rate": 8.347733307735127e-06, + "loss": 1.3416, + "step": 1272 + }, + { + "epoch": 1.7023411371237458, + "grad_norm": 1.0258191444275013, + "learning_rate": 8.34426559824242e-06, + "loss": 1.3179, + "step": 1273 + }, + { + "epoch": 1.7036789297658863, + "grad_norm": 0.8475153029002066, + "learning_rate": 8.340794975536627e-06, + "loss": 1.1505, + "step": 1274 + }, + { + "epoch": 1.7050167224080268, + "grad_norm": 0.8954541326902556, + "learning_rate": 8.337321442641036e-06, + "loss": 1.1924, + "step": 1275 + }, + { + "epoch": 1.706354515050167, + "grad_norm": 1.043274824834305, + "learning_rate": 8.33384500258146e-06, + "loss": 1.0706, + "step": 1276 + }, + { + "epoch": 1.7076923076923078, + "grad_norm": 0.8679477009057583, + "learning_rate": 8.330365658386252e-06, + "loss": 1.1672, + "step": 1277 + }, + { + "epoch": 1.7090301003344481, + "grad_norm": 1.1508745736299333, + "learning_rate": 8.326883413086295e-06, + "loss": 1.1211, + "step": 1278 + }, + { + "epoch": 1.7103678929765886, + "grad_norm": 0.8603800083841641, + "learning_rate": 8.323398269714994e-06, + "loss": 1.1226, + "step": 1279 + }, + { + "epoch": 1.7117056856187292, + "grad_norm": 1.1012470102755616, + "learning_rate": 8.319910231308285e-06, + "loss": 1.1355, + "step": 1280 + }, + { + "epoch": 1.7130434782608694, + "grad_norm": 0.8967573399765266, + "learning_rate": 8.316419300904622e-06, + "loss": 1.3166, + "step": 1281 + }, + { + "epoch": 1.7143812709030102, + "grad_norm": 1.1357750537455604, + "learning_rate": 8.312925481544976e-06, + "loss": 1.1434, + "step": 1282 + }, + { + "epoch": 1.7157190635451505, + "grad_norm": 0.9842478838202819, + "learning_rate": 8.309428776272838e-06, + "loss": 1.0848, + "step": 1283 + }, + { + "epoch": 1.717056856187291, + "grad_norm": 1.0801947015154376, + "learning_rate": 8.305929188134216e-06, + "loss": 1.4422, + "step": 1284 + }, + { + "epoch": 1.7183946488294315, + "grad_norm": 0.96343357654809, + "learning_rate": 8.302426720177624e-06, + "loss": 1.2375, + "step": 1285 + }, + { + "epoch": 1.7197324414715718, + "grad_norm": 
0.9594597025899366, + "learning_rate": 8.298921375454083e-06, + "loss": 1.101, + "step": 1286 + }, + { + "epoch": 1.7210702341137125, + "grad_norm": 1.0925706417632564, + "learning_rate": 8.295413157017127e-06, + "loss": 1.2317, + "step": 1287 + }, + { + "epoch": 1.7224080267558528, + "grad_norm": 1.0440885065577232, + "learning_rate": 8.291902067922791e-06, + "loss": 1.2013, + "step": 1288 + }, + { + "epoch": 1.7237458193979933, + "grad_norm": 0.8044993329251405, + "learning_rate": 8.288388111229601e-06, + "loss": 1.3685, + "step": 1289 + }, + { + "epoch": 1.7250836120401338, + "grad_norm": 0.9669791535938266, + "learning_rate": 8.284871289998599e-06, + "loss": 1.2555, + "step": 1290 + }, + { + "epoch": 1.7264214046822741, + "grad_norm": 0.9041275362895903, + "learning_rate": 8.281351607293307e-06, + "loss": 1.0484, + "step": 1291 + }, + { + "epoch": 1.7277591973244149, + "grad_norm": 1.0738079973960568, + "learning_rate": 8.277829066179746e-06, + "loss": 1.2904, + "step": 1292 + }, + { + "epoch": 1.7290969899665551, + "grad_norm": 1.3700228885392056, + "learning_rate": 8.274303669726427e-06, + "loss": 1.4043, + "step": 1293 + }, + { + "epoch": 1.7304347826086957, + "grad_norm": 1.045966146049695, + "learning_rate": 8.270775421004345e-06, + "loss": 1.1582, + "step": 1294 + }, + { + "epoch": 1.7317725752508362, + "grad_norm": 0.9606107039045566, + "learning_rate": 8.267244323086985e-06, + "loss": 1.1877, + "step": 1295 + }, + { + "epoch": 1.7331103678929765, + "grad_norm": 1.0569302671929748, + "learning_rate": 8.263710379050311e-06, + "loss": 1.1788, + "step": 1296 + }, + { + "epoch": 1.7344481605351172, + "grad_norm": 1.001458166043886, + "learning_rate": 8.260173591972765e-06, + "loss": 1.3164, + "step": 1297 + }, + { + "epoch": 1.7357859531772575, + "grad_norm": 1.1130436026316213, + "learning_rate": 8.256633964935268e-06, + "loss": 1.0146, + "step": 1298 + }, + { + "epoch": 1.737123745819398, + "grad_norm": 0.9999447938407903, + "learning_rate": 8.25309150102121e-06, + "loss": 1.1532, + "step": 1299 + }, + { + "epoch": 1.7384615384615385, + "grad_norm": 1.0600129937317113, + "learning_rate": 8.249546203316461e-06, + "loss": 1.3877, + "step": 1300 + }, + { + "epoch": 1.7397993311036788, + "grad_norm": 0.8127574490279109, + "learning_rate": 8.245998074909354e-06, + "loss": 1.1985, + "step": 1301 + }, + { + "epoch": 1.7411371237458195, + "grad_norm": 1.0398336960857024, + "learning_rate": 8.242447118890686e-06, + "loss": 1.1785, + "step": 1302 + }, + { + "epoch": 1.7424749163879598, + "grad_norm": 0.8789849963422429, + "learning_rate": 8.23889333835372e-06, + "loss": 1.4031, + "step": 1303 + }, + { + "epoch": 1.7438127090301003, + "grad_norm": 1.1368074554552954, + "learning_rate": 8.235336736394179e-06, + "loss": 1.2239, + "step": 1304 + }, + { + "epoch": 1.7451505016722408, + "grad_norm": 0.8790978940933545, + "learning_rate": 8.231777316110245e-06, + "loss": 1.2385, + "step": 1305 + }, + { + "epoch": 1.7464882943143811, + "grad_norm": 0.9198925880220414, + "learning_rate": 8.228215080602554e-06, + "loss": 0.9471, + "step": 1306 + }, + { + "epoch": 1.7478260869565219, + "grad_norm": 0.8140657522181091, + "learning_rate": 8.22465003297419e-06, + "loss": 1.1751, + "step": 1307 + }, + { + "epoch": 1.7491638795986622, + "grad_norm": 0.9044585927557095, + "learning_rate": 8.221082176330697e-06, + "loss": 1.1792, + "step": 1308 + }, + { + "epoch": 1.7505016722408027, + "grad_norm": 1.179292425755935, + "learning_rate": 8.217511513780056e-06, + "loss": 1.2552, + "step": 1309 + }, + { + 
"epoch": 1.7518394648829432, + "grad_norm": 0.7700809956659329, + "learning_rate": 8.213938048432697e-06, + "loss": 1.0655, + "step": 1310 + }, + { + "epoch": 1.7531772575250835, + "grad_norm": 1.0391382210771862, + "learning_rate": 8.210361783401491e-06, + "loss": 1.3133, + "step": 1311 + }, + { + "epoch": 1.7545150501672242, + "grad_norm": 1.0667604999923233, + "learning_rate": 8.206782721801747e-06, + "loss": 1.2366, + "step": 1312 + }, + { + "epoch": 1.7558528428093645, + "grad_norm": 1.04737668364115, + "learning_rate": 8.203200866751212e-06, + "loss": 1.1742, + "step": 1313 + }, + { + "epoch": 1.757190635451505, + "grad_norm": 1.0041834997136865, + "learning_rate": 8.19961622137006e-06, + "loss": 1.21, + "step": 1314 + }, + { + "epoch": 1.7585284280936455, + "grad_norm": 0.9782721283353152, + "learning_rate": 8.196028788780905e-06, + "loss": 1.2055, + "step": 1315 + }, + { + "epoch": 1.7598662207357858, + "grad_norm": 0.9424697734506533, + "learning_rate": 8.192438572108786e-06, + "loss": 1.1283, + "step": 1316 + }, + { + "epoch": 1.7612040133779265, + "grad_norm": 1.0683884225953866, + "learning_rate": 8.188845574481162e-06, + "loss": 1.1419, + "step": 1317 + }, + { + "epoch": 1.7625418060200668, + "grad_norm": 0.7736351972345937, + "learning_rate": 8.185249799027919e-06, + "loss": 1.081, + "step": 1318 + }, + { + "epoch": 1.7638795986622073, + "grad_norm": 0.8513923344109299, + "learning_rate": 8.181651248881364e-06, + "loss": 0.9709, + "step": 1319 + }, + { + "epoch": 1.7652173913043478, + "grad_norm": 0.9788568970284907, + "learning_rate": 8.178049927176217e-06, + "loss": 1.223, + "step": 1320 + }, + { + "epoch": 1.7665551839464881, + "grad_norm": 1.107429689369738, + "learning_rate": 8.174445837049614e-06, + "loss": 1.062, + "step": 1321 + }, + { + "epoch": 1.7678929765886289, + "grad_norm": 1.1093886542890203, + "learning_rate": 8.170838981641108e-06, + "loss": 1.1207, + "step": 1322 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 1.0192832691243576, + "learning_rate": 8.167229364092648e-06, + "loss": 1.1257, + "step": 1323 + }, + { + "epoch": 1.7705685618729097, + "grad_norm": 0.918588594503292, + "learning_rate": 8.163616987548605e-06, + "loss": 1.0161, + "step": 1324 + }, + { + "epoch": 1.7719063545150502, + "grad_norm": 1.0327607880423089, + "learning_rate": 8.16000185515574e-06, + "loss": 1.1532, + "step": 1325 + }, + { + "epoch": 1.7732441471571905, + "grad_norm": 1.0037737188813032, + "learning_rate": 8.15638397006322e-06, + "loss": 1.3248, + "step": 1326 + }, + { + "epoch": 1.7745819397993312, + "grad_norm": 1.1748082015370165, + "learning_rate": 8.152763335422612e-06, + "loss": 1.1839, + "step": 1327 + }, + { + "epoch": 1.7759197324414715, + "grad_norm": 1.006249543370292, + "learning_rate": 8.14913995438788e-06, + "loss": 1.0503, + "step": 1328 + }, + { + "epoch": 1.777257525083612, + "grad_norm": 0.8583897576269887, + "learning_rate": 8.145513830115367e-06, + "loss": 1.164, + "step": 1329 + }, + { + "epoch": 1.7785953177257525, + "grad_norm": 0.8989674809752083, + "learning_rate": 8.141884965763822e-06, + "loss": 1.2752, + "step": 1330 + }, + { + "epoch": 1.7799331103678928, + "grad_norm": 0.8993142429400532, + "learning_rate": 8.138253364494374e-06, + "loss": 1.2448, + "step": 1331 + }, + { + "epoch": 1.7812709030100335, + "grad_norm": 1.0348960977365809, + "learning_rate": 8.134619029470535e-06, + "loss": 1.2146, + "step": 1332 + }, + { + "epoch": 1.7826086956521738, + "grad_norm": 0.9594305037649374, + "learning_rate": 8.130981963858195e-06, + "loss": 
1.0649, + "step": 1333 + }, + { + "epoch": 1.7839464882943143, + "grad_norm": 1.0674256168169054, + "learning_rate": 8.127342170825635e-06, + "loss": 1.1559, + "step": 1334 + }, + { + "epoch": 1.7852842809364549, + "grad_norm": 0.9792779735857403, + "learning_rate": 8.1236996535435e-06, + "loss": 1.1621, + "step": 1335 + }, + { + "epoch": 1.7866220735785954, + "grad_norm": 1.0146696155579973, + "learning_rate": 8.120054415184811e-06, + "loss": 1.2176, + "step": 1336 + }, + { + "epoch": 1.7879598662207359, + "grad_norm": 0.7762970444802157, + "learning_rate": 8.116406458924964e-06, + "loss": 1.1345, + "step": 1337 + }, + { + "epoch": 1.7892976588628762, + "grad_norm": 0.9326220747384746, + "learning_rate": 8.112755787941718e-06, + "loss": 1.2115, + "step": 1338 + }, + { + "epoch": 1.7906354515050167, + "grad_norm": 0.9071119264360391, + "learning_rate": 8.109102405415195e-06, + "loss": 1.254, + "step": 1339 + }, + { + "epoch": 1.7919732441471572, + "grad_norm": 1.3081344999977043, + "learning_rate": 8.105446314527885e-06, + "loss": 1.1423, + "step": 1340 + }, + { + "epoch": 1.7933110367892977, + "grad_norm": 1.1589540752287972, + "learning_rate": 8.101787518464634e-06, + "loss": 1.2005, + "step": 1341 + }, + { + "epoch": 1.7946488294314382, + "grad_norm": 1.2922347687854185, + "learning_rate": 8.098126020412644e-06, + "loss": 1.2022, + "step": 1342 + }, + { + "epoch": 1.7959866220735785, + "grad_norm": 1.067706323835505, + "learning_rate": 8.094461823561473e-06, + "loss": 1.1445, + "step": 1343 + }, + { + "epoch": 1.797324414715719, + "grad_norm": 1.1028439616439896, + "learning_rate": 8.090794931103026e-06, + "loss": 0.9659, + "step": 1344 + }, + { + "epoch": 1.7986622073578595, + "grad_norm": 0.8764923873162692, + "learning_rate": 8.087125346231562e-06, + "loss": 1.2954, + "step": 1345 + }, + { + "epoch": 1.8, + "grad_norm": 1.3802063504302375, + "learning_rate": 8.083453072143678e-06, + "loss": 1.1588, + "step": 1346 + }, + { + "epoch": 1.8013377926421406, + "grad_norm": 1.0552767562161727, + "learning_rate": 8.079778112038318e-06, + "loss": 0.9367, + "step": 1347 + }, + { + "epoch": 1.8026755852842808, + "grad_norm": 0.853838730215928, + "learning_rate": 8.07610046911677e-06, + "loss": 1.2785, + "step": 1348 + }, + { + "epoch": 1.8040133779264214, + "grad_norm": 1.009859282235668, + "learning_rate": 8.072420146582649e-06, + "loss": 0.9451, + "step": 1349 + }, + { + "epoch": 1.8053511705685619, + "grad_norm": 1.3051181561995897, + "learning_rate": 8.068737147641913e-06, + "loss": 1.1432, + "step": 1350 + }, + { + "epoch": 1.8066889632107024, + "grad_norm": 0.7847713460710487, + "learning_rate": 8.065051475502847e-06, + "loss": 1.3642, + "step": 1351 + }, + { + "epoch": 1.808026755852843, + "grad_norm": 1.1262469795957473, + "learning_rate": 8.061363133376065e-06, + "loss": 1.4175, + "step": 1352 + }, + { + "epoch": 1.8093645484949832, + "grad_norm": 0.8929376823667708, + "learning_rate": 8.057672124474508e-06, + "loss": 1.384, + "step": 1353 + }, + { + "epoch": 1.8107023411371237, + "grad_norm": 0.8366319009625, + "learning_rate": 8.05397845201344e-06, + "loss": 1.198, + "step": 1354 + }, + { + "epoch": 1.8120401337792642, + "grad_norm": 0.9470861980355739, + "learning_rate": 8.050282119210443e-06, + "loss": 1.1148, + "step": 1355 + }, + { + "epoch": 1.8133779264214047, + "grad_norm": 0.910899805752771, + "learning_rate": 8.046583129285422e-06, + "loss": 1.1738, + "step": 1356 + }, + { + "epoch": 1.8147157190635452, + "grad_norm": 1.210438118666916, + "learning_rate": 
8.042881485460591e-06, + "loss": 1.2958, + "step": 1357 + }, + { + "epoch": 1.8160535117056855, + "grad_norm": 1.0955787236957624, + "learning_rate": 8.039177190960476e-06, + "loss": 1.0759, + "step": 1358 + }, + { + "epoch": 1.8173913043478263, + "grad_norm": 1.1209559693300193, + "learning_rate": 8.035470249011916e-06, + "loss": 1.0494, + "step": 1359 + }, + { + "epoch": 1.8187290969899665, + "grad_norm": 0.837495140474993, + "learning_rate": 8.031760662844053e-06, + "loss": 1.3571, + "step": 1360 + }, + { + "epoch": 1.820066889632107, + "grad_norm": 1.020954184231738, + "learning_rate": 8.028048435688333e-06, + "loss": 1.2216, + "step": 1361 + }, + { + "epoch": 1.8214046822742476, + "grad_norm": 1.0831618932957676, + "learning_rate": 8.024333570778507e-06, + "loss": 1.1316, + "step": 1362 + }, + { + "epoch": 1.8227424749163879, + "grad_norm": 1.2299260936796375, + "learning_rate": 8.020616071350613e-06, + "loss": 1.1367, + "step": 1363 + }, + { + "epoch": 1.8240802675585286, + "grad_norm": 0.8142015644561459, + "learning_rate": 8.016895940642994e-06, + "loss": 1.0637, + "step": 1364 + }, + { + "epoch": 1.8254180602006689, + "grad_norm": 1.1333975974965, + "learning_rate": 8.013173181896283e-06, + "loss": 1.0001, + "step": 1365 + }, + { + "epoch": 1.8267558528428094, + "grad_norm": 0.9496423869472496, + "learning_rate": 8.0094477983534e-06, + "loss": 1.1323, + "step": 1366 + }, + { + "epoch": 1.82809364548495, + "grad_norm": 0.9251219595016189, + "learning_rate": 8.005719793259552e-06, + "loss": 1.2864, + "step": 1367 + }, + { + "epoch": 1.8294314381270902, + "grad_norm": 1.1857763437879727, + "learning_rate": 8.00198916986223e-06, + "loss": 1.3213, + "step": 1368 + }, + { + "epoch": 1.830769230769231, + "grad_norm": 0.9414928454302343, + "learning_rate": 7.998255931411208e-06, + "loss": 1.2802, + "step": 1369 + }, + { + "epoch": 1.8321070234113712, + "grad_norm": 1.0282669627124126, + "learning_rate": 7.994520081158534e-06, + "loss": 1.0988, + "step": 1370 + }, + { + "epoch": 1.8334448160535117, + "grad_norm": 1.1242701751060253, + "learning_rate": 7.990781622358535e-06, + "loss": 1.0051, + "step": 1371 + }, + { + "epoch": 1.8347826086956522, + "grad_norm": 0.8124389392673825, + "learning_rate": 7.987040558267807e-06, + "loss": 1.135, + "step": 1372 + }, + { + "epoch": 1.8361204013377925, + "grad_norm": 1.2021292069438714, + "learning_rate": 7.983296892145218e-06, + "loss": 1.3221, + "step": 1373 + }, + { + "epoch": 1.8374581939799333, + "grad_norm": 0.9264286825213028, + "learning_rate": 7.979550627251901e-06, + "loss": 1.4134, + "step": 1374 + }, + { + "epoch": 1.8387959866220736, + "grad_norm": 0.9753292357486177, + "learning_rate": 7.975801766851255e-06, + "loss": 1.1557, + "step": 1375 + }, + { + "epoch": 1.840133779264214, + "grad_norm": 0.9485813603651105, + "learning_rate": 7.972050314208934e-06, + "loss": 1.2439, + "step": 1376 + }, + { + "epoch": 1.8414715719063546, + "grad_norm": 0.9117777393970588, + "learning_rate": 7.968296272592862e-06, + "loss": 1.2476, + "step": 1377 + }, + { + "epoch": 1.8428093645484949, + "grad_norm": 1.1022411782559918, + "learning_rate": 7.964539645273204e-06, + "loss": 1.1092, + "step": 1378 + }, + { + "epoch": 1.8441471571906356, + "grad_norm": 0.8177192539916954, + "learning_rate": 7.960780435522387e-06, + "loss": 1.5274, + "step": 1379 + }, + { + "epoch": 1.845484949832776, + "grad_norm": 1.1398601670521618, + "learning_rate": 7.957018646615085e-06, + "loss": 1.3506, + "step": 1380 + }, + { + "epoch": 1.8468227424749164, + "grad_norm": 
1.1263686454404582, + "learning_rate": 7.953254281828217e-06, + "loss": 1.0622, + "step": 1381 + }, + { + "epoch": 1.848160535117057, + "grad_norm": 0.9682169053566897, + "learning_rate": 7.94948734444095e-06, + "loss": 1.2852, + "step": 1382 + }, + { + "epoch": 1.8494983277591972, + "grad_norm": 1.124140666743383, + "learning_rate": 7.945717837734688e-06, + "loss": 1.1268, + "step": 1383 + }, + { + "epoch": 1.850836120401338, + "grad_norm": 1.0002735765105575, + "learning_rate": 7.941945764993074e-06, + "loss": 1.2091, + "step": 1384 + }, + { + "epoch": 1.8521739130434782, + "grad_norm": 1.047868847432623, + "learning_rate": 7.938171129501988e-06, + "loss": 1.3425, + "step": 1385 + }, + { + "epoch": 1.8535117056856187, + "grad_norm": 1.156748654466471, + "learning_rate": 7.934393934549542e-06, + "loss": 1.4397, + "step": 1386 + }, + { + "epoch": 1.8548494983277592, + "grad_norm": 1.0522880516474884, + "learning_rate": 7.930614183426074e-06, + "loss": 1.2495, + "step": 1387 + }, + { + "epoch": 1.8561872909698995, + "grad_norm": 0.7627543036872003, + "learning_rate": 7.926831879424154e-06, + "loss": 1.2439, + "step": 1388 + }, + { + "epoch": 1.8575250836120403, + "grad_norm": 1.0927216851386292, + "learning_rate": 7.923047025838573e-06, + "loss": 1.2319, + "step": 1389 + }, + { + "epoch": 1.8588628762541806, + "grad_norm": 1.139833514744169, + "learning_rate": 7.919259625966342e-06, + "loss": 1.1488, + "step": 1390 + }, + { + "epoch": 1.860200668896321, + "grad_norm": 0.9388578425885211, + "learning_rate": 7.915469683106694e-06, + "loss": 1.3927, + "step": 1391 + }, + { + "epoch": 1.8615384615384616, + "grad_norm": 1.004374576648706, + "learning_rate": 7.91167720056107e-06, + "loss": 1.2371, + "step": 1392 + }, + { + "epoch": 1.8628762541806019, + "grad_norm": 1.0801879827941414, + "learning_rate": 7.907882181633134e-06, + "loss": 1.2352, + "step": 1393 + }, + { + "epoch": 1.8642140468227426, + "grad_norm": 0.828500668209237, + "learning_rate": 7.90408462962875e-06, + "loss": 1.2634, + "step": 1394 + }, + { + "epoch": 1.865551839464883, + "grad_norm": 1.3067975437028363, + "learning_rate": 7.900284547855992e-06, + "loss": 1.2264, + "step": 1395 + }, + { + "epoch": 1.8668896321070234, + "grad_norm": 1.0329208779614765, + "learning_rate": 7.896481939625139e-06, + "loss": 1.2776, + "step": 1396 + }, + { + "epoch": 1.868227424749164, + "grad_norm": 0.7791239738889686, + "learning_rate": 7.892676808248666e-06, + "loss": 1.2219, + "step": 1397 + }, + { + "epoch": 1.8695652173913042, + "grad_norm": 0.8739627790888935, + "learning_rate": 7.888869157041257e-06, + "loss": 1.1305, + "step": 1398 + }, + { + "epoch": 1.870903010033445, + "grad_norm": 0.9863836148258056, + "learning_rate": 7.885058989319776e-06, + "loss": 1.1487, + "step": 1399 + }, + { + "epoch": 1.8722408026755852, + "grad_norm": 1.0057857178982157, + "learning_rate": 7.88124630840329e-06, + "loss": 1.2399, + "step": 1400 + }, + { + "epoch": 1.8735785953177257, + "grad_norm": 0.9782731858337229, + "learning_rate": 7.87743111761305e-06, + "loss": 0.9925, + "step": 1401 + }, + { + "epoch": 1.8749163879598663, + "grad_norm": 0.8744345121247641, + "learning_rate": 7.8736134202725e-06, + "loss": 1.3726, + "step": 1402 + }, + { + "epoch": 1.8762541806020065, + "grad_norm": 1.036305213037371, + "learning_rate": 7.869793219707258e-06, + "loss": 1.2316, + "step": 1403 + }, + { + "epoch": 1.8775919732441473, + "grad_norm": 0.8078638305611642, + "learning_rate": 7.865970519245129e-06, + "loss": 1.1264, + "step": 1404 + }, + { + "epoch": 
1.8789297658862876, + "grad_norm": 0.7233547505550761, + "learning_rate": 7.862145322216092e-06, + "loss": 1.1931, + "step": 1405 + }, + { + "epoch": 1.880267558528428, + "grad_norm": 0.9250933923248871, + "learning_rate": 7.858317631952307e-06, + "loss": 1.3118, + "step": 1406 + }, + { + "epoch": 1.8816053511705686, + "grad_norm": 1.1696533006964804, + "learning_rate": 7.8544874517881e-06, + "loss": 1.2518, + "step": 1407 + }, + { + "epoch": 1.8829431438127089, + "grad_norm": 1.0001536716193131, + "learning_rate": 7.850654785059966e-06, + "loss": 0.978, + "step": 1408 + }, + { + "epoch": 1.8842809364548496, + "grad_norm": 1.0533258426480085, + "learning_rate": 7.846819635106569e-06, + "loss": 1.0989, + "step": 1409 + }, + { + "epoch": 1.88561872909699, + "grad_norm": 0.9007589992185036, + "learning_rate": 7.842982005268733e-06, + "loss": 1.1604, + "step": 1410 + }, + { + "epoch": 1.8869565217391304, + "grad_norm": 1.3292649541439998, + "learning_rate": 7.83914189888945e-06, + "loss": 1.3052, + "step": 1411 + }, + { + "epoch": 1.888294314381271, + "grad_norm": 0.8219518420288159, + "learning_rate": 7.835299319313854e-06, + "loss": 1.1859, + "step": 1412 + }, + { + "epoch": 1.8896321070234112, + "grad_norm": 0.8903150231775708, + "learning_rate": 7.831454269889251e-06, + "loss": 1.1524, + "step": 1413 + }, + { + "epoch": 1.890969899665552, + "grad_norm": 0.8040158429066, + "learning_rate": 7.827606753965086e-06, + "loss": 1.124, + "step": 1414 + }, + { + "epoch": 1.8923076923076922, + "grad_norm": 1.0341613010230757, + "learning_rate": 7.823756774892961e-06, + "loss": 1.1192, + "step": 1415 + }, + { + "epoch": 1.8936454849498328, + "grad_norm": 1.0411916987039624, + "learning_rate": 7.819904336026615e-06, + "loss": 1.3488, + "step": 1416 + }, + { + "epoch": 1.8949832775919733, + "grad_norm": 1.2740312629959305, + "learning_rate": 7.816049440721937e-06, + "loss": 1.1882, + "step": 1417 + }, + { + "epoch": 1.8963210702341136, + "grad_norm": 1.1398907605003359, + "learning_rate": 7.812192092336951e-06, + "loss": 1.162, + "step": 1418 + }, + { + "epoch": 1.8976588628762543, + "grad_norm": 1.0844930004808009, + "learning_rate": 7.808332294231824e-06, + "loss": 1.1569, + "step": 1419 + }, + { + "epoch": 1.8989966555183946, + "grad_norm": 1.0440491458018775, + "learning_rate": 7.80447004976885e-06, + "loss": 1.3399, + "step": 1420 + }, + { + "epoch": 1.900334448160535, + "grad_norm": 1.0299314006969, + "learning_rate": 7.800605362312456e-06, + "loss": 0.9904, + "step": 1421 + }, + { + "epoch": 1.9016722408026756, + "grad_norm": 0.7934816030317837, + "learning_rate": 7.796738235229203e-06, + "loss": 1.0254, + "step": 1422 + }, + { + "epoch": 1.903010033444816, + "grad_norm": 0.9938989709874128, + "learning_rate": 7.792868671887768e-06, + "loss": 1.2945, + "step": 1423 + }, + { + "epoch": 1.9043478260869566, + "grad_norm": 1.4990194426848038, + "learning_rate": 7.788996675658955e-06, + "loss": 1.415, + "step": 1424 + }, + { + "epoch": 1.905685618729097, + "grad_norm": 1.1430581145383907, + "learning_rate": 7.785122249915688e-06, + "loss": 1.2386, + "step": 1425 + }, + { + "epoch": 1.9070234113712374, + "grad_norm": 1.1149230404766488, + "learning_rate": 7.781245398033009e-06, + "loss": 1.411, + "step": 1426 + }, + { + "epoch": 1.908361204013378, + "grad_norm": 0.8854628726594097, + "learning_rate": 7.777366123388065e-06, + "loss": 1.1972, + "step": 1427 + }, + { + "epoch": 1.9096989966555182, + "grad_norm": 0.8819752476003322, + "learning_rate": 7.773484429360122e-06, + "loss": 1.3086, + "step": 
1428 + }, + { + "epoch": 1.911036789297659, + "grad_norm": 1.1229961393775236, + "learning_rate": 7.769600319330553e-06, + "loss": 1.2113, + "step": 1429 + }, + { + "epoch": 1.9123745819397993, + "grad_norm": 0.9076716681123522, + "learning_rate": 7.765713796682829e-06, + "loss": 1.2971, + "step": 1430 + }, + { + "epoch": 1.9137123745819398, + "grad_norm": 1.0524996583925132, + "learning_rate": 7.76182486480253e-06, + "loss": 1.1244, + "step": 1431 + }, + { + "epoch": 1.9150501672240803, + "grad_norm": 1.1236868430356397, + "learning_rate": 7.75793352707733e-06, + "loss": 1.4211, + "step": 1432 + }, + { + "epoch": 1.9163879598662206, + "grad_norm": 0.7598821271373021, + "learning_rate": 7.754039786897004e-06, + "loss": 1.1487, + "step": 1433 + }, + { + "epoch": 1.9177257525083613, + "grad_norm": 1.138998506819778, + "learning_rate": 7.750143647653409e-06, + "loss": 1.1366, + "step": 1434 + }, + { + "epoch": 1.9190635451505016, + "grad_norm": 1.174354311564228, + "learning_rate": 7.746245112740507e-06, + "loss": 1.2642, + "step": 1435 + }, + { + "epoch": 1.920401337792642, + "grad_norm": 0.8296245021987857, + "learning_rate": 7.742344185554335e-06, + "loss": 1.1924, + "step": 1436 + }, + { + "epoch": 1.9217391304347826, + "grad_norm": 1.0877829977352251, + "learning_rate": 7.738440869493018e-06, + "loss": 1.1882, + "step": 1437 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 1.1892286002226726, + "learning_rate": 7.734535167956761e-06, + "loss": 1.2718, + "step": 1438 + }, + { + "epoch": 1.9244147157190636, + "grad_norm": 1.0695628696254362, + "learning_rate": 7.73062708434785e-06, + "loss": 1.2098, + "step": 1439 + }, + { + "epoch": 1.925752508361204, + "grad_norm": 0.8974226110642151, + "learning_rate": 7.726716622070643e-06, + "loss": 1.0406, + "step": 1440 + }, + { + "epoch": 1.9270903010033444, + "grad_norm": 0.9540017049924671, + "learning_rate": 7.722803784531572e-06, + "loss": 1.0918, + "step": 1441 + }, + { + "epoch": 1.928428093645485, + "grad_norm": 0.9234144492062104, + "learning_rate": 7.718888575139134e-06, + "loss": 0.9986, + "step": 1442 + }, + { + "epoch": 1.9297658862876255, + "grad_norm": 0.7914160870681707, + "learning_rate": 7.714970997303898e-06, + "loss": 1.0764, + "step": 1443 + }, + { + "epoch": 1.931103678929766, + "grad_norm": 0.8594742852930191, + "learning_rate": 7.711051054438491e-06, + "loss": 1.2475, + "step": 1444 + }, + { + "epoch": 1.9324414715719063, + "grad_norm": 0.8900384904233337, + "learning_rate": 7.707128749957606e-06, + "loss": 1.2651, + "step": 1445 + }, + { + "epoch": 1.9337792642140468, + "grad_norm": 1.02486623514189, + "learning_rate": 7.703204087277989e-06, + "loss": 1.1683, + "step": 1446 + }, + { + "epoch": 1.9351170568561873, + "grad_norm": 0.7562724411803665, + "learning_rate": 7.699277069818439e-06, + "loss": 1.2949, + "step": 1447 + }, + { + "epoch": 1.9364548494983278, + "grad_norm": 1.1670977279488088, + "learning_rate": 7.69534770099981e-06, + "loss": 1.1852, + "step": 1448 + }, + { + "epoch": 1.9377926421404683, + "grad_norm": 0.9275683935800696, + "learning_rate": 7.691415984244998e-06, + "loss": 1.3003, + "step": 1449 + }, + { + "epoch": 1.9391304347826086, + "grad_norm": 0.7356993079615883, + "learning_rate": 7.687481922978955e-06, + "loss": 1.2097, + "step": 1450 + }, + { + "epoch": 1.9404682274247491, + "grad_norm": 1.1375982720217301, + "learning_rate": 7.683545520628667e-06, + "loss": 1.1081, + "step": 1451 + }, + { + "epoch": 1.9418060200668896, + "grad_norm": 0.8816137688346307, + "learning_rate": 
7.679606780623162e-06, + "loss": 1.1583, + "step": 1452 + }, + { + "epoch": 1.9431438127090301, + "grad_norm": 0.9300731054733782, + "learning_rate": 7.675665706393502e-06, + "loss": 1.165, + "step": 1453 + }, + { + "epoch": 1.9444816053511706, + "grad_norm": 0.9620983551175202, + "learning_rate": 7.671722301372788e-06, + "loss": 1.0931, + "step": 1454 + }, + { + "epoch": 1.945819397993311, + "grad_norm": 1.0257193603958121, + "learning_rate": 7.667776568996143e-06, + "loss": 1.2343, + "step": 1455 + }, + { + "epoch": 1.9471571906354515, + "grad_norm": 0.8942116338928946, + "learning_rate": 7.663828512700724e-06, + "loss": 1.172, + "step": 1456 + }, + { + "epoch": 1.948494983277592, + "grad_norm": 1.0388618898220512, + "learning_rate": 7.65987813592571e-06, + "loss": 1.1029, + "step": 1457 + }, + { + "epoch": 1.9498327759197325, + "grad_norm": 0.9524617766030047, + "learning_rate": 7.655925442112303e-06, + "loss": 1.1643, + "step": 1458 + }, + { + "epoch": 1.951170568561873, + "grad_norm": 1.5291507940544362, + "learning_rate": 7.651970434703724e-06, + "loss": 1.1298, + "step": 1459 + }, + { + "epoch": 1.9525083612040133, + "grad_norm": 0.9697741422415199, + "learning_rate": 7.648013117145203e-06, + "loss": 1.4326, + "step": 1460 + }, + { + "epoch": 1.953846153846154, + "grad_norm": 1.080162923832604, + "learning_rate": 7.64405349288399e-06, + "loss": 1.1792, + "step": 1461 + }, + { + "epoch": 1.9551839464882943, + "grad_norm": 0.7603223662904232, + "learning_rate": 7.640091565369339e-06, + "loss": 1.0763, + "step": 1462 + }, + { + "epoch": 1.9565217391304348, + "grad_norm": 0.8258375103792883, + "learning_rate": 7.636127338052513e-06, + "loss": 1.2086, + "step": 1463 + }, + { + "epoch": 1.9578595317725753, + "grad_norm": 0.8213511133795591, + "learning_rate": 7.63216081438678e-06, + "loss": 1.0085, + "step": 1464 + }, + { + "epoch": 1.9591973244147156, + "grad_norm": 0.7489608373342995, + "learning_rate": 7.628191997827405e-06, + "loss": 0.9389, + "step": 1465 + }, + { + "epoch": 1.9605351170568563, + "grad_norm": 0.9846290835735136, + "learning_rate": 7.624220891831653e-06, + "loss": 1.3314, + "step": 1466 + }, + { + "epoch": 1.9618729096989966, + "grad_norm": 0.8682529162796038, + "learning_rate": 7.62024749985878e-06, + "loss": 1.11, + "step": 1467 + }, + { + "epoch": 1.9632107023411371, + "grad_norm": 1.0910672375286787, + "learning_rate": 7.616271825370037e-06, + "loss": 1.2869, + "step": 1468 + }, + { + "epoch": 1.9645484949832777, + "grad_norm": 0.8971338792404514, + "learning_rate": 7.612293871828662e-06, + "loss": 1.074, + "step": 1469 + }, + { + "epoch": 1.965886287625418, + "grad_norm": 1.1075207143511616, + "learning_rate": 7.6083136426998786e-06, + "loss": 1.1871, + "step": 1470 + }, + { + "epoch": 1.9672240802675587, + "grad_norm": 0.7936303535955338, + "learning_rate": 7.604331141450889e-06, + "loss": 1.3251, + "step": 1471 + }, + { + "epoch": 1.968561872909699, + "grad_norm": 0.955350610858148, + "learning_rate": 7.600346371550882e-06, + "loss": 1.2645, + "step": 1472 + }, + { + "epoch": 1.9698996655518395, + "grad_norm": 0.8247956172603081, + "learning_rate": 7.596359336471015e-06, + "loss": 1.3152, + "step": 1473 + }, + { + "epoch": 1.97123745819398, + "grad_norm": 0.9583832448735001, + "learning_rate": 7.592370039684424e-06, + "loss": 1.3613, + "step": 1474 + }, + { + "epoch": 1.9725752508361203, + "grad_norm": 0.942180000358921, + "learning_rate": 7.588378484666214e-06, + "loss": 1.2654, + "step": 1475 + }, + { + "epoch": 1.973913043478261, + "grad_norm": 
1.1193829256551848, + "learning_rate": 7.584384674893454e-06, + "loss": 1.2384, + "step": 1476 + }, + { + "epoch": 1.9752508361204013, + "grad_norm": 0.7708206147452822, + "learning_rate": 7.58038861384518e-06, + "loss": 1.484, + "step": 1477 + }, + { + "epoch": 1.9765886287625418, + "grad_norm": 1.0855207997906173, + "learning_rate": 7.576390305002389e-06, + "loss": 1.2906, + "step": 1478 + }, + { + "epoch": 1.9779264214046823, + "grad_norm": 0.8889023502550213, + "learning_rate": 7.572389751848037e-06, + "loss": 1.0842, + "step": 1479 + }, + { + "epoch": 1.9792642140468226, + "grad_norm": 1.032100788501089, + "learning_rate": 7.568386957867033e-06, + "loss": 1.2532, + "step": 1480 + }, + { + "epoch": 1.9806020066889634, + "grad_norm": 1.0387407400481121, + "learning_rate": 7.564381926546238e-06, + "loss": 1.2226, + "step": 1481 + }, + { + "epoch": 1.9819397993311036, + "grad_norm": 1.1448809451228703, + "learning_rate": 7.560374661374463e-06, + "loss": 1.2648, + "step": 1482 + }, + { + "epoch": 1.9832775919732442, + "grad_norm": 1.090112510630723, + "learning_rate": 7.556365165842466e-06, + "loss": 1.3167, + "step": 1483 + }, + { + "epoch": 1.9846153846153847, + "grad_norm": 1.2267275585650168, + "learning_rate": 7.552353443442944e-06, + "loss": 1.3557, + "step": 1484 + }, + { + "epoch": 1.985953177257525, + "grad_norm": 0.9299418096477532, + "learning_rate": 7.548339497670538e-06, + "loss": 1.2211, + "step": 1485 + }, + { + "epoch": 1.9872909698996657, + "grad_norm": 0.8963937233926849, + "learning_rate": 7.544323332021826e-06, + "loss": 1.2458, + "step": 1486 + }, + { + "epoch": 1.988628762541806, + "grad_norm": 1.1648166929316726, + "learning_rate": 7.540304949995314e-06, + "loss": 1.2061, + "step": 1487 + }, + { + "epoch": 1.9899665551839465, + "grad_norm": 1.4011482347799347, + "learning_rate": 7.536284355091443e-06, + "loss": 1.1798, + "step": 1488 + }, + { + "epoch": 1.991304347826087, + "grad_norm": 0.8839255606839322, + "learning_rate": 7.532261550812585e-06, + "loss": 1.0959, + "step": 1489 + }, + { + "epoch": 1.9926421404682273, + "grad_norm": 0.9664072103215202, + "learning_rate": 7.528236540663031e-06, + "loss": 1.1485, + "step": 1490 + }, + { + "epoch": 1.993979933110368, + "grad_norm": 1.1132269836849948, + "learning_rate": 7.524209328148995e-06, + "loss": 1.2741, + "step": 1491 + }, + { + "epoch": 1.9953177257525083, + "grad_norm": 1.0457705179365628, + "learning_rate": 7.520179916778608e-06, + "loss": 1.2182, + "step": 1492 + }, + { + "epoch": 1.9966555183946488, + "grad_norm": 0.8537877522376467, + "learning_rate": 7.516148310061921e-06, + "loss": 1.2171, + "step": 1493 + }, + { + "epoch": 1.9979933110367893, + "grad_norm": 1.1647019454748906, + "learning_rate": 7.512114511510893e-06, + "loss": 1.0596, + "step": 1494 + }, + { + "epoch": 1.9993311036789296, + "grad_norm": 0.8963248795437824, + "learning_rate": 7.508078524639397e-06, + "loss": 1.2316, + "step": 1495 + }, + { + "epoch": 2.0, + "grad_norm": 1.8079768630278104, + "learning_rate": 7.504040352963206e-06, + "loss": 1.3462, + "step": 1496 + }, + { + "epoch": 2.0013377926421403, + "grad_norm": 1.3000657705186407, + "learning_rate": 7.500000000000001e-06, + "loss": 0.9985, + "step": 1497 + }, + { + "epoch": 2.002675585284281, + "grad_norm": 1.0723868535757646, + "learning_rate": 7.495957469269361e-06, + "loss": 1.0896, + "step": 1498 + }, + { + "epoch": 2.0040133779264213, + "grad_norm": 0.8942899628708126, + "learning_rate": 7.491912764292764e-06, + "loss": 1.0953, + "step": 1499 + }, + { + "epoch": 
2.005351170568562, + "grad_norm": 0.948191581197506, + "learning_rate": 7.487865888593579e-06, + "loss": 1.0535, + "step": 1500 + }, + { + "epoch": 2.0066889632107023, + "grad_norm": 0.9968920987212805, + "learning_rate": 7.483816845697069e-06, + "loss": 1.099, + "step": 1501 + }, + { + "epoch": 2.0080267558528426, + "grad_norm": 0.9709639502912054, + "learning_rate": 7.479765639130384e-06, + "loss": 1.0791, + "step": 1502 + }, + { + "epoch": 2.0093645484949834, + "grad_norm": 0.9485832773420066, + "learning_rate": 7.4757122724225575e-06, + "loss": 1.2447, + "step": 1503 + }, + { + "epoch": 2.0107023411371236, + "grad_norm": 0.8845428761375932, + "learning_rate": 7.471656749104503e-06, + "loss": 1.1301, + "step": 1504 + }, + { + "epoch": 2.0120401337792644, + "grad_norm": 1.2294132788466339, + "learning_rate": 7.467599072709019e-06, + "loss": 1.0542, + "step": 1505 + }, + { + "epoch": 2.0133779264214047, + "grad_norm": 0.9000845987561226, + "learning_rate": 7.463539246770775e-06, + "loss": 1.0265, + "step": 1506 + }, + { + "epoch": 2.014715719063545, + "grad_norm": 0.9975831992486021, + "learning_rate": 7.459477274826312e-06, + "loss": 1.1543, + "step": 1507 + }, + { + "epoch": 2.0160535117056857, + "grad_norm": 0.9711919432650428, + "learning_rate": 7.4554131604140425e-06, + "loss": 1.2972, + "step": 1508 + }, + { + "epoch": 2.017391304347826, + "grad_norm": 1.1002515088004952, + "learning_rate": 7.451346907074245e-06, + "loss": 1.1506, + "step": 1509 + }, + { + "epoch": 2.0187290969899667, + "grad_norm": 0.9428025137434339, + "learning_rate": 7.447278518349062e-06, + "loss": 1.2307, + "step": 1510 + }, + { + "epoch": 2.020066889632107, + "grad_norm": 0.9669667299631342, + "learning_rate": 7.443207997782495e-06, + "loss": 0.9033, + "step": 1511 + }, + { + "epoch": 2.0214046822742473, + "grad_norm": 1.0799854465373282, + "learning_rate": 7.439135348920403e-06, + "loss": 1.3068, + "step": 1512 + }, + { + "epoch": 2.022742474916388, + "grad_norm": 0.9957605497549845, + "learning_rate": 7.435060575310498e-06, + "loss": 1.1296, + "step": 1513 + }, + { + "epoch": 2.0240802675585283, + "grad_norm": 1.0025783531778265, + "learning_rate": 7.430983680502344e-06, + "loss": 1.1331, + "step": 1514 + }, + { + "epoch": 2.025418060200669, + "grad_norm": 1.003790503695329, + "learning_rate": 7.426904668047352e-06, + "loss": 1.1634, + "step": 1515 + }, + { + "epoch": 2.0267558528428093, + "grad_norm": 0.9362268270633876, + "learning_rate": 7.4228235414987805e-06, + "loss": 0.9878, + "step": 1516 + }, + { + "epoch": 2.0280936454849496, + "grad_norm": 1.036452987763032, + "learning_rate": 7.418740304411725e-06, + "loss": 1.1692, + "step": 1517 + }, + { + "epoch": 2.0294314381270904, + "grad_norm": 0.9485519215430027, + "learning_rate": 7.4146549603431225e-06, + "loss": 1.2775, + "step": 1518 + }, + { + "epoch": 2.0307692307692307, + "grad_norm": 0.9572278973228892, + "learning_rate": 7.4105675128517456e-06, + "loss": 1.1165, + "step": 1519 + }, + { + "epoch": 2.0321070234113714, + "grad_norm": 0.9417630821650733, + "learning_rate": 7.4064779654981966e-06, + "loss": 1.0109, + "step": 1520 + }, + { + "epoch": 2.0334448160535117, + "grad_norm": 0.9392786659498851, + "learning_rate": 7.40238632184491e-06, + "loss": 1.0913, + "step": 1521 + }, + { + "epoch": 2.034782608695652, + "grad_norm": 0.9924211887836294, + "learning_rate": 7.398292585456144e-06, + "loss": 1.2277, + "step": 1522 + }, + { + "epoch": 2.0361204013377927, + "grad_norm": 0.8840864706583701, + "learning_rate": 7.39419675989798e-06, + "loss": 
1.0702, + "step": 1523 + }, + { + "epoch": 2.037458193979933, + "grad_norm": 0.9455488262177231, + "learning_rate": 7.390098848738324e-06, + "loss": 1.1108, + "step": 1524 + }, + { + "epoch": 2.0387959866220737, + "grad_norm": 0.9899221167505954, + "learning_rate": 7.385998855546892e-06, + "loss": 1.2402, + "step": 1525 + }, + { + "epoch": 2.040133779264214, + "grad_norm": 0.8429933950118261, + "learning_rate": 7.381896783895217e-06, + "loss": 1.1216, + "step": 1526 + }, + { + "epoch": 2.0414715719063543, + "grad_norm": 0.9229458044746145, + "learning_rate": 7.377792637356644e-06, + "loss": 1.0027, + "step": 1527 + }, + { + "epoch": 2.042809364548495, + "grad_norm": 1.1406724088039066, + "learning_rate": 7.373686419506321e-06, + "loss": 1.1356, + "step": 1528 + }, + { + "epoch": 2.0441471571906353, + "grad_norm": 1.139111500193687, + "learning_rate": 7.369578133921205e-06, + "loss": 1.1048, + "step": 1529 + }, + { + "epoch": 2.045484949832776, + "grad_norm": 0.9818756758376687, + "learning_rate": 7.365467784180051e-06, + "loss": 1.0138, + "step": 1530 + }, + { + "epoch": 2.0468227424749164, + "grad_norm": 0.9314502005618929, + "learning_rate": 7.361355373863415e-06, + "loss": 1.1457, + "step": 1531 + }, + { + "epoch": 2.0481605351170566, + "grad_norm": 1.3240026027278113, + "learning_rate": 7.357240906553644e-06, + "loss": 1.0649, + "step": 1532 + }, + { + "epoch": 2.0494983277591974, + "grad_norm": 0.8629665691551883, + "learning_rate": 7.35312438583488e-06, + "loss": 1.3987, + "step": 1533 + }, + { + "epoch": 2.0508361204013377, + "grad_norm": 1.3248677522310914, + "learning_rate": 7.349005815293055e-06, + "loss": 0.9667, + "step": 1534 + }, + { + "epoch": 2.0521739130434784, + "grad_norm": 1.0036179788530728, + "learning_rate": 7.344885198515881e-06, + "loss": 1.3065, + "step": 1535 + }, + { + "epoch": 2.0535117056856187, + "grad_norm": 1.0765788410555346, + "learning_rate": 7.340762539092858e-06, + "loss": 1.1697, + "step": 1536 + }, + { + "epoch": 2.054849498327759, + "grad_norm": 1.016336011504119, + "learning_rate": 7.336637840615265e-06, + "loss": 1.1305, + "step": 1537 + }, + { + "epoch": 2.0561872909698997, + "grad_norm": 1.059509555023407, + "learning_rate": 7.332511106676151e-06, + "loss": 1.1558, + "step": 1538 + }, + { + "epoch": 2.05752508361204, + "grad_norm": 1.2227828908001825, + "learning_rate": 7.3283823408703466e-06, + "loss": 1.2415, + "step": 1539 + }, + { + "epoch": 2.0588628762541807, + "grad_norm": 0.8935419375380195, + "learning_rate": 7.324251546794449e-06, + "loss": 0.9675, + "step": 1540 + }, + { + "epoch": 2.060200668896321, + "grad_norm": 1.0806181730175881, + "learning_rate": 7.320118728046818e-06, + "loss": 1.0478, + "step": 1541 + }, + { + "epoch": 2.0615384615384613, + "grad_norm": 0.886236822568923, + "learning_rate": 7.315983888227583e-06, + "loss": 1.1848, + "step": 1542 + }, + { + "epoch": 2.062876254180602, + "grad_norm": 1.0418235033089565, + "learning_rate": 7.3118470309386325e-06, + "loss": 1.0281, + "step": 1543 + }, + { + "epoch": 2.0642140468227423, + "grad_norm": 1.0902198586503806, + "learning_rate": 7.3077081597836105e-06, + "loss": 1.1739, + "step": 1544 + }, + { + "epoch": 2.065551839464883, + "grad_norm": 0.9320603774923458, + "learning_rate": 7.303567278367918e-06, + "loss": 1.099, + "step": 1545 + }, + { + "epoch": 2.0668896321070234, + "grad_norm": 1.1744552767164427, + "learning_rate": 7.299424390298704e-06, + "loss": 1.1482, + "step": 1546 + }, + { + "epoch": 2.068227424749164, + "grad_norm": 0.9030400827962202, + "learning_rate": 
7.295279499184867e-06, + "loss": 1.0185, + "step": 1547 + }, + { + "epoch": 2.0695652173913044, + "grad_norm": 0.9227916720032788, + "learning_rate": 7.291132608637053e-06, + "loss": 1.1117, + "step": 1548 + }, + { + "epoch": 2.0709030100334447, + "grad_norm": 0.8356273235013194, + "learning_rate": 7.2869837222676445e-06, + "loss": 1.2813, + "step": 1549 + }, + { + "epoch": 2.0722408026755854, + "grad_norm": 0.8975657379658257, + "learning_rate": 7.282832843690768e-06, + "loss": 0.9678, + "step": 1550 + }, + { + "epoch": 2.0735785953177257, + "grad_norm": 1.003765718370978, + "learning_rate": 7.278679976522279e-06, + "loss": 1.0802, + "step": 1551 + }, + { + "epoch": 2.074916387959866, + "grad_norm": 1.0221195424112666, + "learning_rate": 7.274525124379773e-06, + "loss": 1.0121, + "step": 1552 + }, + { + "epoch": 2.0762541806020067, + "grad_norm": 1.0663752772695259, + "learning_rate": 7.2703682908825675e-06, + "loss": 1.057, + "step": 1553 + }, + { + "epoch": 2.077591973244147, + "grad_norm": 0.9041805723185209, + "learning_rate": 7.266209479651712e-06, + "loss": 1.1538, + "step": 1554 + }, + { + "epoch": 2.0789297658862878, + "grad_norm": 1.0027603617416274, + "learning_rate": 7.262048694309976e-06, + "loss": 1.2252, + "step": 1555 + }, + { + "epoch": 2.080267558528428, + "grad_norm": 1.1088550183807826, + "learning_rate": 7.257885938481845e-06, + "loss": 1.0155, + "step": 1556 + }, + { + "epoch": 2.0816053511705688, + "grad_norm": 0.948030058571784, + "learning_rate": 7.253721215793528e-06, + "loss": 0.9926, + "step": 1557 + }, + { + "epoch": 2.082943143812709, + "grad_norm": 0.904787546355584, + "learning_rate": 7.249554529872941e-06, + "loss": 1.1241, + "step": 1558 + }, + { + "epoch": 2.0842809364548494, + "grad_norm": 1.1153527356849093, + "learning_rate": 7.245385884349716e-06, + "loss": 1.178, + "step": 1559 + }, + { + "epoch": 2.08561872909699, + "grad_norm": 0.8488938902783197, + "learning_rate": 7.241215282855189e-06, + "loss": 1.0386, + "step": 1560 + }, + { + "epoch": 2.0869565217391304, + "grad_norm": 0.7884980777020186, + "learning_rate": 7.2370427290224e-06, + "loss": 1.0952, + "step": 1561 + }, + { + "epoch": 2.088294314381271, + "grad_norm": 1.0163464694904691, + "learning_rate": 7.232868226486087e-06, + "loss": 1.1918, + "step": 1562 + }, + { + "epoch": 2.0896321070234114, + "grad_norm": 0.9657798252677458, + "learning_rate": 7.2286917788826926e-06, + "loss": 1.0393, + "step": 1563 + }, + { + "epoch": 2.0909698996655517, + "grad_norm": 0.958807685491109, + "learning_rate": 7.224513389850345e-06, + "loss": 1.3305, + "step": 1564 + }, + { + "epoch": 2.0923076923076924, + "grad_norm": 1.0134781011519345, + "learning_rate": 7.2203330630288714e-06, + "loss": 1.3477, + "step": 1565 + }, + { + "epoch": 2.0936454849498327, + "grad_norm": 1.043137201617113, + "learning_rate": 7.216150802059782e-06, + "loss": 1.1508, + "step": 1566 + }, + { + "epoch": 2.0949832775919734, + "grad_norm": 0.8613768753111459, + "learning_rate": 7.211966610586274e-06, + "loss": 1.2361, + "step": 1567 + }, + { + "epoch": 2.0963210702341137, + "grad_norm": 1.0022256550757074, + "learning_rate": 7.2077804922532245e-06, + "loss": 1.0384, + "step": 1568 + }, + { + "epoch": 2.097658862876254, + "grad_norm": 1.131548793052805, + "learning_rate": 7.203592450707193e-06, + "loss": 1.0707, + "step": 1569 + }, + { + "epoch": 2.0989966555183948, + "grad_norm": 0.868573378093571, + "learning_rate": 7.1994024895964095e-06, + "loss": 1.0231, + "step": 1570 + }, + { + "epoch": 2.100334448160535, + "grad_norm": 
0.944195606665096, + "learning_rate": 7.195210612570781e-06, + "loss": 1.2215, + "step": 1571 + }, + { + "epoch": 2.101672240802676, + "grad_norm": 1.0926409677823923, + "learning_rate": 7.1910168232818765e-06, + "loss": 1.1191, + "step": 1572 + }, + { + "epoch": 2.103010033444816, + "grad_norm": 0.9632064803681059, + "learning_rate": 7.1868211253829375e-06, + "loss": 1.0999, + "step": 1573 + }, + { + "epoch": 2.1043478260869564, + "grad_norm": 1.060702724554105, + "learning_rate": 7.182623522528866e-06, + "loss": 1.3021, + "step": 1574 + }, + { + "epoch": 2.105685618729097, + "grad_norm": 1.1775385611258566, + "learning_rate": 7.178424018376224e-06, + "loss": 1.0607, + "step": 1575 + }, + { + "epoch": 2.1070234113712374, + "grad_norm": 0.8972926223981945, + "learning_rate": 7.174222616583228e-06, + "loss": 1.0799, + "step": 1576 + }, + { + "epoch": 2.108361204013378, + "grad_norm": 0.9880647041087162, + "learning_rate": 7.170019320809747e-06, + "loss": 1.1659, + "step": 1577 + }, + { + "epoch": 2.1096989966555184, + "grad_norm": 0.9398390451102198, + "learning_rate": 7.165814134717303e-06, + "loss": 1.0735, + "step": 1578 + }, + { + "epoch": 2.1110367892976587, + "grad_norm": 0.9430187483907324, + "learning_rate": 7.161607061969061e-06, + "loss": 0.9749, + "step": 1579 + }, + { + "epoch": 2.1123745819397994, + "grad_norm": 1.0738881464526315, + "learning_rate": 7.157398106229834e-06, + "loss": 1.1895, + "step": 1580 + }, + { + "epoch": 2.1137123745819397, + "grad_norm": 1.1440565865384151, + "learning_rate": 7.153187271166071e-06, + "loss": 1.1405, + "step": 1581 + }, + { + "epoch": 2.1150501672240805, + "grad_norm": 1.1964761353044173, + "learning_rate": 7.148974560445859e-06, + "loss": 1.1972, + "step": 1582 + }, + { + "epoch": 2.1163879598662207, + "grad_norm": 0.9096643416638823, + "learning_rate": 7.144759977738921e-06, + "loss": 0.8978, + "step": 1583 + }, + { + "epoch": 2.117725752508361, + "grad_norm": 1.233030824850911, + "learning_rate": 7.14054352671661e-06, + "loss": 1.1267, + "step": 1584 + }, + { + "epoch": 2.1190635451505018, + "grad_norm": 1.0288930301379535, + "learning_rate": 7.136325211051905e-06, + "loss": 1.2183, + "step": 1585 + }, + { + "epoch": 2.120401337792642, + "grad_norm": 0.8492491155615624, + "learning_rate": 7.132105034419411e-06, + "loss": 1.0316, + "step": 1586 + }, + { + "epoch": 2.121739130434783, + "grad_norm": 0.9344087361693395, + "learning_rate": 7.127883000495353e-06, + "loss": 1.248, + "step": 1587 + }, + { + "epoch": 2.123076923076923, + "grad_norm": 0.879584464850301, + "learning_rate": 7.123659112957571e-06, + "loss": 1.2625, + "step": 1588 + }, + { + "epoch": 2.1244147157190634, + "grad_norm": 1.0918195401454724, + "learning_rate": 7.119433375485527e-06, + "loss": 1.071, + "step": 1589 + }, + { + "epoch": 2.125752508361204, + "grad_norm": 1.1457985479888764, + "learning_rate": 7.1152057917602904e-06, + "loss": 1.1297, + "step": 1590 + }, + { + "epoch": 2.1270903010033444, + "grad_norm": 1.0761235728510004, + "learning_rate": 7.110976365464537e-06, + "loss": 1.2754, + "step": 1591 + }, + { + "epoch": 2.128428093645485, + "grad_norm": 0.9802624311883074, + "learning_rate": 7.10674510028255e-06, + "loss": 1.1017, + "step": 1592 + }, + { + "epoch": 2.1297658862876254, + "grad_norm": 1.2534530261182082, + "learning_rate": 7.102511999900213e-06, + "loss": 1.011, + "step": 1593 + }, + { + "epoch": 2.1311036789297657, + "grad_norm": 0.9660028863045479, + "learning_rate": 7.098277068005012e-06, + "loss": 1.24, + "step": 1594 + }, + { + "epoch": 
2.1324414715719064, + "grad_norm": 1.0575934799737816, + "learning_rate": 7.094040308286023e-06, + "loss": 1.0947, + "step": 1595 + }, + { + "epoch": 2.1337792642140467, + "grad_norm": 0.8479990590891526, + "learning_rate": 7.089801724433918e-06, + "loss": 1.2686, + "step": 1596 + }, + { + "epoch": 2.1351170568561875, + "grad_norm": 0.9843750673929655, + "learning_rate": 7.085561320140958e-06, + "loss": 1.2058, + "step": 1597 + }, + { + "epoch": 2.1364548494983278, + "grad_norm": 1.2083559637257557, + "learning_rate": 7.081319099100986e-06, + "loss": 1.0551, + "step": 1598 + }, + { + "epoch": 2.137792642140468, + "grad_norm": 1.4013007050262198, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.8716, + "step": 1599 + }, + { + "epoch": 2.139130434782609, + "grad_norm": 0.9233492956157842, + "learning_rate": 7.072829221563305e-06, + "loss": 1.2897, + "step": 1600 + }, + { + "epoch": 2.140468227424749, + "grad_norm": 0.8700064326218211, + "learning_rate": 7.068581572461188e-06, + "loss": 0.9947, + "step": 1601 + }, + { + "epoch": 2.14180602006689, + "grad_norm": 0.9633751662587583, + "learning_rate": 7.064332121403237e-06, + "loss": 1.1691, + "step": 1602 + }, + { + "epoch": 2.14314381270903, + "grad_norm": 1.325139739661102, + "learning_rate": 7.060080872091178e-06, + "loss": 0.9158, + "step": 1603 + }, + { + "epoch": 2.1444816053511704, + "grad_norm": 0.8769621095834191, + "learning_rate": 7.055827828228304e-06, + "loss": 1.044, + "step": 1604 + }, + { + "epoch": 2.145819397993311, + "grad_norm": 0.9931120770674013, + "learning_rate": 7.051572993519474e-06, + "loss": 1.257, + "step": 1605 + }, + { + "epoch": 2.1471571906354514, + "grad_norm": 0.9462937561099234, + "learning_rate": 7.0473163716711004e-06, + "loss": 1.2578, + "step": 1606 + }, + { + "epoch": 2.148494983277592, + "grad_norm": 0.8893145969522276, + "learning_rate": 7.043057966391158e-06, + "loss": 1.1401, + "step": 1607 + }, + { + "epoch": 2.1498327759197324, + "grad_norm": 0.9863901736964997, + "learning_rate": 7.038797781389174e-06, + "loss": 0.9947, + "step": 1608 + }, + { + "epoch": 2.1511705685618727, + "grad_norm": 1.0145770376438008, + "learning_rate": 7.034535820376225e-06, + "loss": 0.9844, + "step": 1609 + }, + { + "epoch": 2.1525083612040135, + "grad_norm": 1.0945494663020452, + "learning_rate": 7.030272087064933e-06, + "loss": 1.1661, + "step": 1610 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.9522919980610259, + "learning_rate": 7.026006585169467e-06, + "loss": 0.9509, + "step": 1611 + }, + { + "epoch": 2.1551839464882945, + "grad_norm": 1.2280829869016947, + "learning_rate": 7.021739318405537e-06, + "loss": 0.9685, + "step": 1612 + }, + { + "epoch": 2.1565217391304348, + "grad_norm": 1.084952780829979, + "learning_rate": 7.017470290490386e-06, + "loss": 1.0491, + "step": 1613 + }, + { + "epoch": 2.157859531772575, + "grad_norm": 1.3749216253321586, + "learning_rate": 7.013199505142796e-06, + "loss": 1.2695, + "step": 1614 + }, + { + "epoch": 2.159197324414716, + "grad_norm": 0.9680487852991507, + "learning_rate": 7.008926966083078e-06, + "loss": 1.1927, + "step": 1615 + }, + { + "epoch": 2.160535117056856, + "grad_norm": 1.1372179791633616, + "learning_rate": 7.004652677033069e-06, + "loss": 1.0339, + "step": 1616 + }, + { + "epoch": 2.161872909698997, + "grad_norm": 1.1377612812746605, + "learning_rate": 7.0003766417161335e-06, + "loss": 1.1113, + "step": 1617 + }, + { + "epoch": 2.163210702341137, + "grad_norm": 1.0841634454861877, + "learning_rate": 6.996098863857155e-06, + "loss": 1.1777, + 
"step": 1618 + }, + { + "epoch": 2.1645484949832774, + "grad_norm": 1.0034824468742094, + "learning_rate": 6.991819347182536e-06, + "loss": 1.0994, + "step": 1619 + }, + { + "epoch": 2.165886287625418, + "grad_norm": 1.1149938449783576, + "learning_rate": 6.987538095420193e-06, + "loss": 1.0601, + "step": 1620 + }, + { + "epoch": 2.1672240802675584, + "grad_norm": 1.0796552672809028, + "learning_rate": 6.983255112299554e-06, + "loss": 1.1745, + "step": 1621 + }, + { + "epoch": 2.168561872909699, + "grad_norm": 1.1051086365517555, + "learning_rate": 6.978970401551557e-06, + "loss": 1.3053, + "step": 1622 + }, + { + "epoch": 2.1698996655518394, + "grad_norm": 0.9277725695108889, + "learning_rate": 6.974683966908642e-06, + "loss": 1.1828, + "step": 1623 + }, + { + "epoch": 2.1712374581939797, + "grad_norm": 1.0971436936221923, + "learning_rate": 6.970395812104751e-06, + "loss": 1.1523, + "step": 1624 + }, + { + "epoch": 2.1725752508361205, + "grad_norm": 1.160858307979088, + "learning_rate": 6.966105940875328e-06, + "loss": 1.0688, + "step": 1625 + }, + { + "epoch": 2.1739130434782608, + "grad_norm": 0.8849394785803479, + "learning_rate": 6.961814356957308e-06, + "loss": 1.0163, + "step": 1626 + }, + { + "epoch": 2.1752508361204015, + "grad_norm": 0.9466563830352795, + "learning_rate": 6.9575210640891215e-06, + "loss": 1.083, + "step": 1627 + }, + { + "epoch": 2.1765886287625418, + "grad_norm": 0.9055708200004874, + "learning_rate": 6.953226066010683e-06, + "loss": 1.2302, + "step": 1628 + }, + { + "epoch": 2.177926421404682, + "grad_norm": 0.8903535496267045, + "learning_rate": 6.948929366463397e-06, + "loss": 0.9958, + "step": 1629 + }, + { + "epoch": 2.179264214046823, + "grad_norm": 0.8453158317414154, + "learning_rate": 6.944630969190149e-06, + "loss": 1.3854, + "step": 1630 + }, + { + "epoch": 2.180602006688963, + "grad_norm": 1.1944282404178397, + "learning_rate": 6.940330877935304e-06, + "loss": 1.0834, + "step": 1631 + }, + { + "epoch": 2.181939799331104, + "grad_norm": 1.022466866950006, + "learning_rate": 6.936029096444697e-06, + "loss": 1.2554, + "step": 1632 + }, + { + "epoch": 2.183277591973244, + "grad_norm": 1.0999351993573974, + "learning_rate": 6.931725628465643e-06, + "loss": 1.1646, + "step": 1633 + }, + { + "epoch": 2.184615384615385, + "grad_norm": 0.8559663609071889, + "learning_rate": 6.927420477746923e-06, + "loss": 1.0761, + "step": 1634 + }, + { + "epoch": 2.185953177257525, + "grad_norm": 0.853990784438674, + "learning_rate": 6.923113648038784e-06, + "loss": 1.1349, + "step": 1635 + }, + { + "epoch": 2.1872909698996654, + "grad_norm": 1.0421026569742085, + "learning_rate": 6.918805143092935e-06, + "loss": 1.1623, + "step": 1636 + }, + { + "epoch": 2.188628762541806, + "grad_norm": 1.0171853010193088, + "learning_rate": 6.9144949666625434e-06, + "loss": 1.3551, + "step": 1637 + }, + { + "epoch": 2.1899665551839465, + "grad_norm": 1.1607928567935786, + "learning_rate": 6.910183122502236e-06, + "loss": 1.1332, + "step": 1638 + }, + { + "epoch": 2.1913043478260867, + "grad_norm": 0.8340435818000312, + "learning_rate": 6.9058696143680895e-06, + "loss": 1.1831, + "step": 1639 + }, + { + "epoch": 2.1926421404682275, + "grad_norm": 1.2678945137801831, + "learning_rate": 6.9015544460176296e-06, + "loss": 0.8496, + "step": 1640 + }, + { + "epoch": 2.1939799331103678, + "grad_norm": 1.0635255312727432, + "learning_rate": 6.897237621209831e-06, + "loss": 1.1091, + "step": 1641 + }, + { + "epoch": 2.1953177257525085, + "grad_norm": 0.8852727110754681, + "learning_rate": 
6.89291914370511e-06, + "loss": 1.0854, + "step": 1642 + }, + { + "epoch": 2.196655518394649, + "grad_norm": 1.0755305710800984, + "learning_rate": 6.888599017265321e-06, + "loss": 1.2821, + "step": 1643 + }, + { + "epoch": 2.1979933110367895, + "grad_norm": 0.9511833978709969, + "learning_rate": 6.884277245653758e-06, + "loss": 1.1117, + "step": 1644 + }, + { + "epoch": 2.19933110367893, + "grad_norm": 0.8465038517067713, + "learning_rate": 6.8799538326351455e-06, + "loss": 1.3823, + "step": 1645 + }, + { + "epoch": 2.20066889632107, + "grad_norm": 0.9781744660951697, + "learning_rate": 6.87562878197564e-06, + "loss": 1.0765, + "step": 1646 + }, + { + "epoch": 2.202006688963211, + "grad_norm": 1.1892080729171717, + "learning_rate": 6.87130209744282e-06, + "loss": 1.0566, + "step": 1647 + }, + { + "epoch": 2.203344481605351, + "grad_norm": 1.0207658451366288, + "learning_rate": 6.866973782805694e-06, + "loss": 1.1031, + "step": 1648 + }, + { + "epoch": 2.2046822742474914, + "grad_norm": 0.9258002251852493, + "learning_rate": 6.862643841834686e-06, + "loss": 1.097, + "step": 1649 + }, + { + "epoch": 2.206020066889632, + "grad_norm": 1.041026530306738, + "learning_rate": 6.858312278301638e-06, + "loss": 1.009, + "step": 1650 + }, + { + "epoch": 2.2073578595317724, + "grad_norm": 0.9084629289604107, + "learning_rate": 6.8539790959798045e-06, + "loss": 1.3018, + "step": 1651 + }, + { + "epoch": 2.208695652173913, + "grad_norm": 1.0468099200913341, + "learning_rate": 6.849644298643852e-06, + "loss": 1.1334, + "step": 1652 + }, + { + "epoch": 2.2100334448160535, + "grad_norm": 0.974110146745012, + "learning_rate": 6.845307890069851e-06, + "loss": 1.046, + "step": 1653 + }, + { + "epoch": 2.211371237458194, + "grad_norm": 0.9383786180883593, + "learning_rate": 6.840969874035278e-06, + "loss": 1.0831, + "step": 1654 + }, + { + "epoch": 2.2127090301003345, + "grad_norm": 0.9472385290082588, + "learning_rate": 6.83663025431901e-06, + "loss": 1.1762, + "step": 1655 + }, + { + "epoch": 2.2140468227424748, + "grad_norm": 1.0943913284296718, + "learning_rate": 6.832289034701318e-06, + "loss": 1.1365, + "step": 1656 + }, + { + "epoch": 2.2153846153846155, + "grad_norm": 1.4066842731714584, + "learning_rate": 6.82794621896387e-06, + "loss": 1.1264, + "step": 1657 + }, + { + "epoch": 2.216722408026756, + "grad_norm": 1.0616958864817325, + "learning_rate": 6.823601810889723e-06, + "loss": 1.0728, + "step": 1658 + }, + { + "epoch": 2.218060200668896, + "grad_norm": 0.8493091276462869, + "learning_rate": 6.8192558142633215e-06, + "loss": 1.0676, + "step": 1659 + }, + { + "epoch": 2.219397993311037, + "grad_norm": 0.9507576775056439, + "learning_rate": 6.814908232870493e-06, + "loss": 1.155, + "step": 1660 + }, + { + "epoch": 2.220735785953177, + "grad_norm": 0.922429121111823, + "learning_rate": 6.810559070498446e-06, + "loss": 1.1357, + "step": 1661 + }, + { + "epoch": 2.222073578595318, + "grad_norm": 1.0811529284321169, + "learning_rate": 6.806208330935766e-06, + "loss": 1.1844, + "step": 1662 + }, + { + "epoch": 2.223411371237458, + "grad_norm": 1.001171394894904, + "learning_rate": 6.801856017972412e-06, + "loss": 1.4365, + "step": 1663 + }, + { + "epoch": 2.224749163879599, + "grad_norm": 0.9770259463149462, + "learning_rate": 6.797502135399716e-06, + "loss": 1.056, + "step": 1664 + }, + { + "epoch": 2.226086956521739, + "grad_norm": 1.076030863437271, + "learning_rate": 6.7931466870103735e-06, + "loss": 1.1051, + "step": 1665 + }, + { + "epoch": 2.2274247491638794, + "grad_norm": 0.8791809852608744, + 
"learning_rate": 6.788789676598449e-06, + "loss": 1.1572, + "step": 1666 + }, + { + "epoch": 2.22876254180602, + "grad_norm": 0.9805098600685795, + "learning_rate": 6.78443110795936e-06, + "loss": 1.1925, + "step": 1667 + }, + { + "epoch": 2.2301003344481605, + "grad_norm": 0.8769224163298004, + "learning_rate": 6.78007098488989e-06, + "loss": 1.0718, + "step": 1668 + }, + { + "epoch": 2.231438127090301, + "grad_norm": 0.9492168638064307, + "learning_rate": 6.77570931118817e-06, + "loss": 1.1717, + "step": 1669 + }, + { + "epoch": 2.2327759197324415, + "grad_norm": 1.1098267869601046, + "learning_rate": 6.771346090653687e-06, + "loss": 1.1684, + "step": 1670 + }, + { + "epoch": 2.234113712374582, + "grad_norm": 1.0036463270489202, + "learning_rate": 6.766981327087271e-06, + "loss": 1.0762, + "step": 1671 + }, + { + "epoch": 2.2354515050167225, + "grad_norm": 0.9517714559749477, + "learning_rate": 6.762615024291098e-06, + "loss": 1.1356, + "step": 1672 + }, + { + "epoch": 2.236789297658863, + "grad_norm": 0.9755366914190448, + "learning_rate": 6.758247186068684e-06, + "loss": 0.9482, + "step": 1673 + }, + { + "epoch": 2.2381270903010035, + "grad_norm": 1.0517297985015124, + "learning_rate": 6.753877816224886e-06, + "loss": 1.2238, + "step": 1674 + }, + { + "epoch": 2.239464882943144, + "grad_norm": 0.9188442861446394, + "learning_rate": 6.749506918565891e-06, + "loss": 1.1857, + "step": 1675 + }, + { + "epoch": 2.240802675585284, + "grad_norm": 0.8927597463962551, + "learning_rate": 6.7451344968992184e-06, + "loss": 1.3467, + "step": 1676 + }, + { + "epoch": 2.242140468227425, + "grad_norm": 0.9212549362315088, + "learning_rate": 6.740760555033715e-06, + "loss": 1.1424, + "step": 1677 + }, + { + "epoch": 2.243478260869565, + "grad_norm": 0.7947157669304763, + "learning_rate": 6.736385096779552e-06, + "loss": 1.1543, + "step": 1678 + }, + { + "epoch": 2.244816053511706, + "grad_norm": 1.0187803004928657, + "learning_rate": 6.732008125948223e-06, + "loss": 1.2314, + "step": 1679 + }, + { + "epoch": 2.246153846153846, + "grad_norm": 0.9550453552610344, + "learning_rate": 6.727629646352536e-06, + "loss": 1.2779, + "step": 1680 + }, + { + "epoch": 2.2474916387959865, + "grad_norm": 1.0398512783712954, + "learning_rate": 6.723249661806617e-06, + "loss": 1.116, + "step": 1681 + }, + { + "epoch": 2.248829431438127, + "grad_norm": 0.9891134379619148, + "learning_rate": 6.718868176125899e-06, + "loss": 1.1674, + "step": 1682 + }, + { + "epoch": 2.2501672240802675, + "grad_norm": 0.9740966626066947, + "learning_rate": 6.714485193127126e-06, + "loss": 1.0484, + "step": 1683 + }, + { + "epoch": 2.251505016722408, + "grad_norm": 0.9735595469390748, + "learning_rate": 6.710100716628345e-06, + "loss": 1.0114, + "step": 1684 + }, + { + "epoch": 2.2528428093645485, + "grad_norm": 1.0013692233692009, + "learning_rate": 6.705714750448904e-06, + "loss": 1.18, + "step": 1685 + }, + { + "epoch": 2.254180602006689, + "grad_norm": 0.8592801757752747, + "learning_rate": 6.701327298409448e-06, + "loss": 1.3051, + "step": 1686 + }, + { + "epoch": 2.2555183946488295, + "grad_norm": 1.0738219828345197, + "learning_rate": 6.6969383643319175e-06, + "loss": 1.0394, + "step": 1687 + }, + { + "epoch": 2.25685618729097, + "grad_norm": 0.9979419322553167, + "learning_rate": 6.692547952039543e-06, + "loss": 1.173, + "step": 1688 + }, + { + "epoch": 2.2581939799331106, + "grad_norm": 0.9407586900190892, + "learning_rate": 6.688156065356845e-06, + "loss": 1.2459, + "step": 1689 + }, + { + "epoch": 2.259531772575251, + 
"grad_norm": 1.465544553223302, + "learning_rate": 6.683762708109625e-06, + "loss": 1.1329, + "step": 1690 + }, + { + "epoch": 2.260869565217391, + "grad_norm": 0.9256194371301226, + "learning_rate": 6.679367884124968e-06, + "loss": 1.0499, + "step": 1691 + }, + { + "epoch": 2.262207357859532, + "grad_norm": 1.0128458842604793, + "learning_rate": 6.674971597231236e-06, + "loss": 1.247, + "step": 1692 + }, + { + "epoch": 2.263545150501672, + "grad_norm": 0.8220546907448102, + "learning_rate": 6.670573851258063e-06, + "loss": 0.7988, + "step": 1693 + }, + { + "epoch": 2.264882943143813, + "grad_norm": 0.8785911630231444, + "learning_rate": 6.66617465003636e-06, + "loss": 0.8893, + "step": 1694 + }, + { + "epoch": 2.266220735785953, + "grad_norm": 0.8331048169998821, + "learning_rate": 6.6617739973982985e-06, + "loss": 1.1344, + "step": 1695 + }, + { + "epoch": 2.2675585284280935, + "grad_norm": 1.0050157921512939, + "learning_rate": 6.6573718971773204e-06, + "loss": 1.2447, + "step": 1696 + }, + { + "epoch": 2.268896321070234, + "grad_norm": 1.2874914406184148, + "learning_rate": 6.652968353208122e-06, + "loss": 1.1269, + "step": 1697 + }, + { + "epoch": 2.2702341137123745, + "grad_norm": 1.0647429425096577, + "learning_rate": 6.648563369326666e-06, + "loss": 1.2367, + "step": 1698 + }, + { + "epoch": 2.2715719063545152, + "grad_norm": 1.1142162197817975, + "learning_rate": 6.644156949370162e-06, + "loss": 1.1742, + "step": 1699 + }, + { + "epoch": 2.2729096989966555, + "grad_norm": 0.9305340720859075, + "learning_rate": 6.639749097177073e-06, + "loss": 1.0355, + "step": 1700 + }, + { + "epoch": 2.274247491638796, + "grad_norm": 1.0402036850122114, + "learning_rate": 6.635339816587109e-06, + "loss": 1.4055, + "step": 1701 + }, + { + "epoch": 2.2755852842809365, + "grad_norm": 0.934722324971509, + "learning_rate": 6.630929111441227e-06, + "loss": 1.1972, + "step": 1702 + }, + { + "epoch": 2.276923076923077, + "grad_norm": 0.9015421826902562, + "learning_rate": 6.626516985581621e-06, + "loss": 1.0159, + "step": 1703 + }, + { + "epoch": 2.2782608695652176, + "grad_norm": 0.9699525128175824, + "learning_rate": 6.622103442851728e-06, + "loss": 1.182, + "step": 1704 + }, + { + "epoch": 2.279598662207358, + "grad_norm": 0.9999373894591899, + "learning_rate": 6.617688487096213e-06, + "loss": 1.1576, + "step": 1705 + }, + { + "epoch": 2.280936454849498, + "grad_norm": 0.9862282493424738, + "learning_rate": 6.613272122160975e-06, + "loss": 1.0874, + "step": 1706 + }, + { + "epoch": 2.282274247491639, + "grad_norm": 1.1959420581974651, + "learning_rate": 6.60885435189314e-06, + "loss": 1.1214, + "step": 1707 + }, + { + "epoch": 2.283612040133779, + "grad_norm": 0.8069852540877671, + "learning_rate": 6.60443518014106e-06, + "loss": 1.045, + "step": 1708 + }, + { + "epoch": 2.28494983277592, + "grad_norm": 0.8175119881953598, + "learning_rate": 6.600014610754306e-06, + "loss": 1.1618, + "step": 1709 + }, + { + "epoch": 2.28628762541806, + "grad_norm": 0.83407078562439, + "learning_rate": 6.595592647583666e-06, + "loss": 1.0533, + "step": 1710 + }, + { + "epoch": 2.2876254180602005, + "grad_norm": 0.8815190000972616, + "learning_rate": 6.591169294481143e-06, + "loss": 1.0295, + "step": 1711 + }, + { + "epoch": 2.288963210702341, + "grad_norm": 1.3109493953709834, + "learning_rate": 6.586744555299953e-06, + "loss": 0.8988, + "step": 1712 + }, + { + "epoch": 2.2903010033444815, + "grad_norm": 0.9723680118055313, + "learning_rate": 6.582318433894513e-06, + "loss": 1.0396, + "step": 1713 + }, + { + "epoch": 
2.2916387959866222, + "grad_norm": 0.8779445188322714, + "learning_rate": 6.577890934120451e-06, + "loss": 1.2158, + "step": 1714 + }, + { + "epoch": 2.2929765886287625, + "grad_norm": 0.9725191741651179, + "learning_rate": 6.573462059834593e-06, + "loss": 1.4385, + "step": 1715 + }, + { + "epoch": 2.294314381270903, + "grad_norm": 0.8558064676543375, + "learning_rate": 6.569031814894962e-06, + "loss": 1.1231, + "step": 1716 + }, + { + "epoch": 2.2956521739130435, + "grad_norm": 0.7824709163220143, + "learning_rate": 6.5646002031607726e-06, + "loss": 1.0943, + "step": 1717 + }, + { + "epoch": 2.296989966555184, + "grad_norm": 0.8738680657763636, + "learning_rate": 6.560167228492436e-06, + "loss": 0.9985, + "step": 1718 + }, + { + "epoch": 2.2983277591973246, + "grad_norm": 0.9843362610795234, + "learning_rate": 6.555732894751548e-06, + "loss": 1.0738, + "step": 1719 + }, + { + "epoch": 2.299665551839465, + "grad_norm": 0.9322138334333236, + "learning_rate": 6.551297205800884e-06, + "loss": 1.2059, + "step": 1720 + }, + { + "epoch": 2.3010033444816056, + "grad_norm": 0.8007927073662111, + "learning_rate": 6.546860165504406e-06, + "loss": 1.2719, + "step": 1721 + }, + { + "epoch": 2.302341137123746, + "grad_norm": 0.9838675921813376, + "learning_rate": 6.5424217777272506e-06, + "loss": 1.007, + "step": 1722 + }, + { + "epoch": 2.303678929765886, + "grad_norm": 0.9674835090719434, + "learning_rate": 6.537982046335727e-06, + "loss": 1.1606, + "step": 1723 + }, + { + "epoch": 2.305016722408027, + "grad_norm": 1.0154248288638754, + "learning_rate": 6.533540975197319e-06, + "loss": 1.0204, + "step": 1724 + }, + { + "epoch": 2.306354515050167, + "grad_norm": 0.9663864051023281, + "learning_rate": 6.529098568180672e-06, + "loss": 0.9887, + "step": 1725 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.8218741715491135, + "learning_rate": 6.524654829155599e-06, + "loss": 1.0099, + "step": 1726 + }, + { + "epoch": 2.309030100334448, + "grad_norm": 1.0696152399001564, + "learning_rate": 6.520209761993072e-06, + "loss": 1.1587, + "step": 1727 + }, + { + "epoch": 2.3103678929765885, + "grad_norm": 0.917140862560155, + "learning_rate": 6.515763370565218e-06, + "loss": 1.1613, + "step": 1728 + }, + { + "epoch": 2.3117056856187292, + "grad_norm": 1.0157709358970262, + "learning_rate": 6.511315658745323e-06, + "loss": 1.2146, + "step": 1729 + }, + { + "epoch": 2.3130434782608695, + "grad_norm": 1.058586258039853, + "learning_rate": 6.506866630407817e-06, + "loss": 1.2821, + "step": 1730 + }, + { + "epoch": 2.3143812709030103, + "grad_norm": 1.2434660662934258, + "learning_rate": 6.502416289428282e-06, + "loss": 1.225, + "step": 1731 + }, + { + "epoch": 2.3157190635451506, + "grad_norm": 0.9230874741474712, + "learning_rate": 6.4979646396834375e-06, + "loss": 1.2951, + "step": 1732 + }, + { + "epoch": 2.317056856187291, + "grad_norm": 1.0082383778915063, + "learning_rate": 6.4935116850511495e-06, + "loss": 1.1337, + "step": 1733 + }, + { + "epoch": 2.3183946488294316, + "grad_norm": 0.9619686680748357, + "learning_rate": 6.489057429410418e-06, + "loss": 1.0371, + "step": 1734 + }, + { + "epoch": 2.319732441471572, + "grad_norm": 0.8593386036498593, + "learning_rate": 6.484601876641375e-06, + "loss": 1.012, + "step": 1735 + }, + { + "epoch": 2.321070234113712, + "grad_norm": 1.1058458838675638, + "learning_rate": 6.480145030625284e-06, + "loss": 1.1724, + "step": 1736 + }, + { + "epoch": 2.322408026755853, + "grad_norm": 0.7769283227953023, + "learning_rate": 6.475686895244534e-06, + "loss": 1.0869, + 
"step": 1737 + }, + { + "epoch": 2.323745819397993, + "grad_norm": 0.7683263627385223, + "learning_rate": 6.471227474382639e-06, + "loss": 1.0492, + "step": 1738 + }, + { + "epoch": 2.325083612040134, + "grad_norm": 0.7867375708981926, + "learning_rate": 6.466766771924231e-06, + "loss": 1.2663, + "step": 1739 + }, + { + "epoch": 2.326421404682274, + "grad_norm": 0.8883958458120329, + "learning_rate": 6.462304791755059e-06, + "loss": 1.3093, + "step": 1740 + }, + { + "epoch": 2.327759197324415, + "grad_norm": 0.8617573566167085, + "learning_rate": 6.457841537761985e-06, + "loss": 1.028, + "step": 1741 + }, + { + "epoch": 2.3290969899665552, + "grad_norm": 1.0451437984433805, + "learning_rate": 6.453377013832981e-06, + "loss": 1.1172, + "step": 1742 + }, + { + "epoch": 2.3304347826086955, + "grad_norm": 0.9107365524644601, + "learning_rate": 6.448911223857124e-06, + "loss": 1.1142, + "step": 1743 + }, + { + "epoch": 2.3317725752508363, + "grad_norm": 0.8495812119859617, + "learning_rate": 6.444444171724595e-06, + "loss": 1.1016, + "step": 1744 + }, + { + "epoch": 2.3331103678929765, + "grad_norm": 0.9821771156429698, + "learning_rate": 6.4399758613266775e-06, + "loss": 1.1368, + "step": 1745 + }, + { + "epoch": 2.334448160535117, + "grad_norm": 0.9453454286749045, + "learning_rate": 6.435506296555742e-06, + "loss": 1.2275, + "step": 1746 + }, + { + "epoch": 2.3357859531772576, + "grad_norm": 0.9649128903967565, + "learning_rate": 6.431035481305261e-06, + "loss": 1.0374, + "step": 1747 + }, + { + "epoch": 2.337123745819398, + "grad_norm": 1.2124158217017966, + "learning_rate": 6.426563419469793e-06, + "loss": 1.3525, + "step": 1748 + }, + { + "epoch": 2.3384615384615386, + "grad_norm": 1.1470896530849861, + "learning_rate": 6.422090114944982e-06, + "loss": 0.9582, + "step": 1749 + }, + { + "epoch": 2.339799331103679, + "grad_norm": 0.7997737513276298, + "learning_rate": 6.417615571627555e-06, + "loss": 1.0339, + "step": 1750 + }, + { + "epoch": 2.3411371237458196, + "grad_norm": 0.9781390440147466, + "learning_rate": 6.4131397934153175e-06, + "loss": 1.0596, + "step": 1751 + }, + { + "epoch": 2.34247491638796, + "grad_norm": 1.0391341059668313, + "learning_rate": 6.408662784207149e-06, + "loss": 1.27, + "step": 1752 + }, + { + "epoch": 2.3438127090301, + "grad_norm": 0.9084272702843955, + "learning_rate": 6.404184547903006e-06, + "loss": 1.2123, + "step": 1753 + }, + { + "epoch": 2.345150501672241, + "grad_norm": 0.9628284555121637, + "learning_rate": 6.399705088403912e-06, + "loss": 0.9158, + "step": 1754 + }, + { + "epoch": 2.346488294314381, + "grad_norm": 1.0909011940815385, + "learning_rate": 6.3952244096119535e-06, + "loss": 1.153, + "step": 1755 + }, + { + "epoch": 2.3478260869565215, + "grad_norm": 1.0278694939116004, + "learning_rate": 6.3907425154302815e-06, + "loss": 0.9861, + "step": 1756 + }, + { + "epoch": 2.3491638795986622, + "grad_norm": 0.9022041156662959, + "learning_rate": 6.386259409763107e-06, + "loss": 1.1719, + "step": 1757 + }, + { + "epoch": 2.3505016722408025, + "grad_norm": 0.9081979403193579, + "learning_rate": 6.381775096515692e-06, + "loss": 1.0253, + "step": 1758 + }, + { + "epoch": 2.3518394648829433, + "grad_norm": 1.0792047874928035, + "learning_rate": 6.377289579594355e-06, + "loss": 1.2127, + "step": 1759 + }, + { + "epoch": 2.3531772575250836, + "grad_norm": 0.9261511621893147, + "learning_rate": 6.372802862906459e-06, + "loss": 0.9477, + "step": 1760 + }, + { + "epoch": 2.3545150501672243, + "grad_norm": 0.8468954292973784, + "learning_rate": 
6.368314950360416e-06, + "loss": 1.0656, + "step": 1761 + }, + { + "epoch": 2.3558528428093646, + "grad_norm": 0.8917914532034018, + "learning_rate": 6.3638258458656766e-06, + "loss": 0.9504, + "step": 1762 + }, + { + "epoch": 2.357190635451505, + "grad_norm": 1.0845586623013064, + "learning_rate": 6.3593355533327314e-06, + "loss": 1.2787, + "step": 1763 + }, + { + "epoch": 2.3585284280936456, + "grad_norm": 0.9628720313996068, + "learning_rate": 6.354844076673108e-06, + "loss": 0.9846, + "step": 1764 + }, + { + "epoch": 2.359866220735786, + "grad_norm": 0.9188332376439134, + "learning_rate": 6.35035141979936e-06, + "loss": 1.1844, + "step": 1765 + }, + { + "epoch": 2.361204013377926, + "grad_norm": 1.2609984572462705, + "learning_rate": 6.345857586625073e-06, + "loss": 1.1833, + "step": 1766 + }, + { + "epoch": 2.362541806020067, + "grad_norm": 0.8842372205746254, + "learning_rate": 6.341362581064856e-06, + "loss": 1.2367, + "step": 1767 + }, + { + "epoch": 2.363879598662207, + "grad_norm": 0.8638609071968747, + "learning_rate": 6.336866407034341e-06, + "loss": 0.9177, + "step": 1768 + }, + { + "epoch": 2.365217391304348, + "grad_norm": 0.9315879303466001, + "learning_rate": 6.332369068450175e-06, + "loss": 1.1416, + "step": 1769 + }, + { + "epoch": 2.3665551839464882, + "grad_norm": 1.0027403255390854, + "learning_rate": 6.327870569230022e-06, + "loss": 1.4347, + "step": 1770 + }, + { + "epoch": 2.367892976588629, + "grad_norm": 0.9321600282601583, + "learning_rate": 6.323370913292557e-06, + "loss": 1.2767, + "step": 1771 + }, + { + "epoch": 2.3692307692307693, + "grad_norm": 1.2677816176396355, + "learning_rate": 6.318870104557459e-06, + "loss": 1.3159, + "step": 1772 + }, + { + "epoch": 2.3705685618729095, + "grad_norm": 0.8903284179744942, + "learning_rate": 6.314368146945418e-06, + "loss": 1.1431, + "step": 1773 + }, + { + "epoch": 2.3719063545150503, + "grad_norm": 1.0746856815914099, + "learning_rate": 6.309865044378115e-06, + "loss": 1.1084, + "step": 1774 + }, + { + "epoch": 2.3732441471571906, + "grad_norm": 0.8608112218963366, + "learning_rate": 6.3053608007782385e-06, + "loss": 0.9571, + "step": 1775 + }, + { + "epoch": 2.374581939799331, + "grad_norm": 1.063518724044132, + "learning_rate": 6.300855420069465e-06, + "loss": 1.1629, + "step": 1776 + }, + { + "epoch": 2.3759197324414716, + "grad_norm": 0.9189899236932314, + "learning_rate": 6.296348906176462e-06, + "loss": 0.9772, + "step": 1777 + }, + { + "epoch": 2.377257525083612, + "grad_norm": 1.21831146373883, + "learning_rate": 6.2918412630248874e-06, + "loss": 1.319, + "step": 1778 + }, + { + "epoch": 2.3785953177257526, + "grad_norm": 0.8970421545871088, + "learning_rate": 6.28733249454138e-06, + "loss": 1.0503, + "step": 1779 + }, + { + "epoch": 2.379933110367893, + "grad_norm": 0.917764942856931, + "learning_rate": 6.2828226046535575e-06, + "loss": 1.0067, + "step": 1780 + }, + { + "epoch": 2.3812709030100336, + "grad_norm": 0.9221789421107651, + "learning_rate": 6.278311597290019e-06, + "loss": 1.2449, + "step": 1781 + }, + { + "epoch": 2.382608695652174, + "grad_norm": 0.8961060097250235, + "learning_rate": 6.273799476380332e-06, + "loss": 1.1645, + "step": 1782 + }, + { + "epoch": 2.383946488294314, + "grad_norm": 1.0346465243991965, + "learning_rate": 6.269286245855039e-06, + "loss": 1.1109, + "step": 1783 + }, + { + "epoch": 2.385284280936455, + "grad_norm": 0.9845265733297017, + "learning_rate": 6.264771909645646e-06, + "loss": 1.3169, + "step": 1784 + }, + { + "epoch": 2.3866220735785952, + "grad_norm": 
0.8510893312981895, + "learning_rate": 6.260256471684622e-06, + "loss": 1.0787, + "step": 1785 + }, + { + "epoch": 2.387959866220736, + "grad_norm": 0.8329060717941762, + "learning_rate": 6.255739935905396e-06, + "loss": 0.8638, + "step": 1786 + }, + { + "epoch": 2.3892976588628763, + "grad_norm": 0.8701911694272303, + "learning_rate": 6.2512223062423545e-06, + "loss": 1.1349, + "step": 1787 + }, + { + "epoch": 2.3906354515050166, + "grad_norm": 0.8126515219718907, + "learning_rate": 6.246703586630838e-06, + "loss": 1.082, + "step": 1788 + }, + { + "epoch": 2.3919732441471573, + "grad_norm": 0.8438625950394013, + "learning_rate": 6.242183781007132e-06, + "loss": 0.9553, + "step": 1789 + }, + { + "epoch": 2.3933110367892976, + "grad_norm": 0.9028029876586623, + "learning_rate": 6.237662893308471e-06, + "loss": 1.2583, + "step": 1790 + }, + { + "epoch": 2.3946488294314383, + "grad_norm": 1.6118741845118085, + "learning_rate": 6.233140927473033e-06, + "loss": 0.9991, + "step": 1791 + }, + { + "epoch": 2.3959866220735786, + "grad_norm": 0.9184170215386867, + "learning_rate": 6.228617887439931e-06, + "loss": 1.1345, + "step": 1792 + }, + { + "epoch": 2.397324414715719, + "grad_norm": 1.1019168985372425, + "learning_rate": 6.224093777149222e-06, + "loss": 1.0058, + "step": 1793 + }, + { + "epoch": 2.3986622073578596, + "grad_norm": 0.8628822101900847, + "learning_rate": 6.219568600541886e-06, + "loss": 1.2366, + "step": 1794 + }, + { + "epoch": 2.4, + "grad_norm": 0.9686673674886604, + "learning_rate": 6.2150423615598376e-06, + "loss": 1.0737, + "step": 1795 + }, + { + "epoch": 2.4013377926421406, + "grad_norm": 0.954989563689006, + "learning_rate": 6.210515064145915e-06, + "loss": 1.1412, + "step": 1796 + }, + { + "epoch": 2.402675585284281, + "grad_norm": 1.1470805346482216, + "learning_rate": 6.205986712243876e-06, + "loss": 1.0555, + "step": 1797 + }, + { + "epoch": 2.4040133779264212, + "grad_norm": 1.0420653289988044, + "learning_rate": 6.201457309798403e-06, + "loss": 1.0879, + "step": 1798 + }, + { + "epoch": 2.405351170568562, + "grad_norm": 1.030614740642502, + "learning_rate": 6.196926860755088e-06, + "loss": 1.241, + "step": 1799 + }, + { + "epoch": 2.4066889632107022, + "grad_norm": 0.7981386225865341, + "learning_rate": 6.192395369060439e-06, + "loss": 1.1802, + "step": 1800 + }, + { + "epoch": 2.408026755852843, + "grad_norm": 0.9977438001597115, + "learning_rate": 6.187862838661869e-06, + "loss": 0.877, + "step": 1801 + }, + { + "epoch": 2.4093645484949833, + "grad_norm": 0.843690197974476, + "learning_rate": 6.183329273507693e-06, + "loss": 1.2143, + "step": 1802 + }, + { + "epoch": 2.4107023411371236, + "grad_norm": 6.425466637240412, + "learning_rate": 6.178794677547138e-06, + "loss": 1.4319, + "step": 1803 + }, + { + "epoch": 2.4120401337792643, + "grad_norm": 0.9565964809166556, + "learning_rate": 6.174259054730316e-06, + "loss": 1.0502, + "step": 1804 + }, + { + "epoch": 2.4133779264214046, + "grad_norm": 1.2864624701389011, + "learning_rate": 6.169722409008244e-06, + "loss": 0.9944, + "step": 1805 + }, + { + "epoch": 2.4147157190635453, + "grad_norm": 0.9350947638740911, + "learning_rate": 6.165184744332824e-06, + "loss": 1.2815, + "step": 1806 + }, + { + "epoch": 2.4160535117056856, + "grad_norm": 0.9766270974902242, + "learning_rate": 6.160646064656845e-06, + "loss": 1.1775, + "step": 1807 + }, + { + "epoch": 2.417391304347826, + "grad_norm": 0.8867530776712851, + "learning_rate": 6.156106373933988e-06, + "loss": 1.0667, + "step": 1808 + }, + { + "epoch": 
2.4187290969899666, + "grad_norm": 0.8330594951697958, + "learning_rate": 6.151565676118805e-06, + "loss": 1.084, + "step": 1809 + }, + { + "epoch": 2.420066889632107, + "grad_norm": 1.2106760001275398, + "learning_rate": 6.147023975166731e-06, + "loss": 1.1593, + "step": 1810 + }, + { + "epoch": 2.4214046822742477, + "grad_norm": 1.2803878643879827, + "learning_rate": 6.142481275034072e-06, + "loss": 1.019, + "step": 1811 + }, + { + "epoch": 2.422742474916388, + "grad_norm": 1.241068900771874, + "learning_rate": 6.137937579678007e-06, + "loss": 1.1789, + "step": 1812 + }, + { + "epoch": 2.4240802675585282, + "grad_norm": 1.063365992613077, + "learning_rate": 6.133392893056583e-06, + "loss": 1.1131, + "step": 1813 + }, + { + "epoch": 2.425418060200669, + "grad_norm": 1.0893985387079177, + "learning_rate": 6.128847219128703e-06, + "loss": 1.0534, + "step": 1814 + }, + { + "epoch": 2.4267558528428093, + "grad_norm": 0.8247476669111715, + "learning_rate": 6.124300561854139e-06, + "loss": 1.0376, + "step": 1815 + }, + { + "epoch": 2.42809364548495, + "grad_norm": 0.8322853900348329, + "learning_rate": 6.119752925193516e-06, + "loss": 1.1635, + "step": 1816 + }, + { + "epoch": 2.4294314381270903, + "grad_norm": 0.910616397795228, + "learning_rate": 6.1152043131083095e-06, + "loss": 1.0807, + "step": 1817 + }, + { + "epoch": 2.430769230769231, + "grad_norm": 0.9808402065740167, + "learning_rate": 6.1106547295608495e-06, + "loss": 1.1097, + "step": 1818 + }, + { + "epoch": 2.4321070234113713, + "grad_norm": 1.0984165528461385, + "learning_rate": 6.106104178514309e-06, + "loss": 1.1039, + "step": 1819 + }, + { + "epoch": 2.4334448160535116, + "grad_norm": 0.9796188257790117, + "learning_rate": 6.101552663932704e-06, + "loss": 1.089, + "step": 1820 + }, + { + "epoch": 2.4347826086956523, + "grad_norm": 0.9932700543069642, + "learning_rate": 6.097000189780893e-06, + "loss": 1.2177, + "step": 1821 + }, + { + "epoch": 2.4361204013377926, + "grad_norm": 0.9400327392769743, + "learning_rate": 6.092446760024564e-06, + "loss": 1.2726, + "step": 1822 + }, + { + "epoch": 2.437458193979933, + "grad_norm": 0.9711957318491249, + "learning_rate": 6.087892378630245e-06, + "loss": 1.1873, + "step": 1823 + }, + { + "epoch": 2.4387959866220736, + "grad_norm": 0.9407388431806829, + "learning_rate": 6.0833370495652885e-06, + "loss": 1.1813, + "step": 1824 + }, + { + "epoch": 2.440133779264214, + "grad_norm": 1.198249840499156, + "learning_rate": 6.0787807767978736e-06, + "loss": 1.0394, + "step": 1825 + }, + { + "epoch": 2.4414715719063547, + "grad_norm": 1.0639413908391946, + "learning_rate": 6.074223564296999e-06, + "loss": 1.2277, + "step": 1826 + }, + { + "epoch": 2.442809364548495, + "grad_norm": 0.8890173983125259, + "learning_rate": 6.0696654160324875e-06, + "loss": 1.1024, + "step": 1827 + }, + { + "epoch": 2.4441471571906357, + "grad_norm": 0.8612674239552284, + "learning_rate": 6.065106335974972e-06, + "loss": 1.037, + "step": 1828 + }, + { + "epoch": 2.445484949832776, + "grad_norm": 0.803916358346694, + "learning_rate": 6.0605463280958995e-06, + "loss": 1.1816, + "step": 1829 + }, + { + "epoch": 2.4468227424749163, + "grad_norm": 0.8365110154187334, + "learning_rate": 6.055985396367526e-06, + "loss": 1.032, + "step": 1830 + }, + { + "epoch": 2.448160535117057, + "grad_norm": 0.9589698404395355, + "learning_rate": 6.051423544762909e-06, + "loss": 1.1372, + "step": 1831 + }, + { + "epoch": 2.4494983277591973, + "grad_norm": 1.1368211087241282, + "learning_rate": 6.046860777255907e-06, + "loss": 1.0733, + 
"step": 1832 + }, + { + "epoch": 2.4508361204013376, + "grad_norm": 0.858345819994607, + "learning_rate": 6.042297097821184e-06, + "loss": 1.0313, + "step": 1833 + }, + { + "epoch": 2.4521739130434783, + "grad_norm": 0.9469796059974475, + "learning_rate": 6.0377325104341885e-06, + "loss": 1.1539, + "step": 1834 + }, + { + "epoch": 2.4535117056856186, + "grad_norm": 1.032260458447012, + "learning_rate": 6.033167019071168e-06, + "loss": 1.2432, + "step": 1835 + }, + { + "epoch": 2.4548494983277593, + "grad_norm": 1.1261754133587396, + "learning_rate": 6.028600627709151e-06, + "loss": 1.0183, + "step": 1836 + }, + { + "epoch": 2.4561872909698996, + "grad_norm": 0.9168203208355085, + "learning_rate": 6.024033340325954e-06, + "loss": 1.3389, + "step": 1837 + }, + { + "epoch": 2.4575250836120404, + "grad_norm": 1.042616229931407, + "learning_rate": 6.019465160900173e-06, + "loss": 1.0227, + "step": 1838 + }, + { + "epoch": 2.4588628762541807, + "grad_norm": 0.8836564358933575, + "learning_rate": 6.014896093411181e-06, + "loss": 1.0776, + "step": 1839 + }, + { + "epoch": 2.460200668896321, + "grad_norm": 1.3898556640828748, + "learning_rate": 6.010326141839125e-06, + "loss": 1.191, + "step": 1840 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.9901002607287887, + "learning_rate": 6.005755310164919e-06, + "loss": 1.2601, + "step": 1841 + }, + { + "epoch": 2.462876254180602, + "grad_norm": 0.8478080399327899, + "learning_rate": 6.001183602370249e-06, + "loss": 1.0972, + "step": 1842 + }, + { + "epoch": 2.4642140468227423, + "grad_norm": 0.8693995430156604, + "learning_rate": 5.996611022437562e-06, + "loss": 1.2139, + "step": 1843 + }, + { + "epoch": 2.465551839464883, + "grad_norm": 1.1909472727636246, + "learning_rate": 5.992037574350062e-06, + "loss": 1.0379, + "step": 1844 + }, + { + "epoch": 2.4668896321070233, + "grad_norm": 0.9692343636835201, + "learning_rate": 5.987463262091715e-06, + "loss": 1.0229, + "step": 1845 + }, + { + "epoch": 2.468227424749164, + "grad_norm": 1.00582135625811, + "learning_rate": 5.982888089647232e-06, + "loss": 0.9747, + "step": 1846 + }, + { + "epoch": 2.4695652173913043, + "grad_norm": 1.1717317098116065, + "learning_rate": 5.978312061002078e-06, + "loss": 1.2527, + "step": 1847 + }, + { + "epoch": 2.470903010033445, + "grad_norm": 0.8691866310339285, + "learning_rate": 5.973735180142468e-06, + "loss": 1.1667, + "step": 1848 + }, + { + "epoch": 2.4722408026755853, + "grad_norm": 0.9183020249613336, + "learning_rate": 5.9691574510553505e-06, + "loss": 1.1019, + "step": 1849 + }, + { + "epoch": 2.4735785953177256, + "grad_norm": 0.822897959630721, + "learning_rate": 5.9645788777284195e-06, + "loss": 1.4472, + "step": 1850 + }, + { + "epoch": 2.4749163879598663, + "grad_norm": 0.8725661688216033, + "learning_rate": 5.959999464150101e-06, + "loss": 1.2132, + "step": 1851 + }, + { + "epoch": 2.4762541806020066, + "grad_norm": 0.9535105554879281, + "learning_rate": 5.9554192143095535e-06, + "loss": 1.0856, + "step": 1852 + }, + { + "epoch": 2.477591973244147, + "grad_norm": 0.9591063409853011, + "learning_rate": 5.950838132196667e-06, + "loss": 1.1816, + "step": 1853 + }, + { + "epoch": 2.4789297658862877, + "grad_norm": 1.0338399810843932, + "learning_rate": 5.946256221802052e-06, + "loss": 1.0963, + "step": 1854 + }, + { + "epoch": 2.480267558528428, + "grad_norm": 1.0887989150740291, + "learning_rate": 5.941673487117043e-06, + "loss": 1.0151, + "step": 1855 + }, + { + "epoch": 2.4816053511705687, + "grad_norm": 1.2138702513312563, + "learning_rate": 
5.937089932133693e-06, + "loss": 1.2028, + "step": 1856 + }, + { + "epoch": 2.482943143812709, + "grad_norm": 0.9932528645856918, + "learning_rate": 5.932505560844766e-06, + "loss": 1.036, + "step": 1857 + }, + { + "epoch": 2.4842809364548497, + "grad_norm": 0.7689323289806553, + "learning_rate": 5.927920377243743e-06, + "loss": 0.9598, + "step": 1858 + }, + { + "epoch": 2.48561872909699, + "grad_norm": 0.8369661729334102, + "learning_rate": 5.923334385324809e-06, + "loss": 1.0297, + "step": 1859 + }, + { + "epoch": 2.4869565217391303, + "grad_norm": 0.9026803148458492, + "learning_rate": 5.918747589082853e-06, + "loss": 1.188, + "step": 1860 + }, + { + "epoch": 2.488294314381271, + "grad_norm": 0.816270713899806, + "learning_rate": 5.914159992513464e-06, + "loss": 1.1629, + "step": 1861 + }, + { + "epoch": 2.4896321070234113, + "grad_norm": 0.7929616093723266, + "learning_rate": 5.90957159961293e-06, + "loss": 1.0589, + "step": 1862 + }, + { + "epoch": 2.4909698996655516, + "grad_norm": 1.2654181707968821, + "learning_rate": 5.904982414378233e-06, + "loss": 1.1636, + "step": 1863 + }, + { + "epoch": 2.4923076923076923, + "grad_norm": 1.0369216056079038, + "learning_rate": 5.900392440807044e-06, + "loss": 1.259, + "step": 1864 + }, + { + "epoch": 2.4936454849498326, + "grad_norm": 0.8976295064129585, + "learning_rate": 5.895801682897721e-06, + "loss": 0.8459, + "step": 1865 + }, + { + "epoch": 2.4949832775919734, + "grad_norm": 0.8095116169917583, + "learning_rate": 5.891210144649303e-06, + "loss": 1.0379, + "step": 1866 + }, + { + "epoch": 2.4963210702341136, + "grad_norm": 0.9491349731739996, + "learning_rate": 5.886617830061514e-06, + "loss": 1.2, + "step": 1867 + }, + { + "epoch": 2.4976588628762544, + "grad_norm": 1.0148555980394844, + "learning_rate": 5.88202474313475e-06, + "loss": 1.0672, + "step": 1868 + }, + { + "epoch": 2.4989966555183947, + "grad_norm": 1.1405469308840053, + "learning_rate": 5.877430887870081e-06, + "loss": 1.0798, + "step": 1869 + }, + { + "epoch": 2.500334448160535, + "grad_norm": 1.0327226258091362, + "learning_rate": 5.872836268269246e-06, + "loss": 1.3194, + "step": 1870 + }, + { + "epoch": 2.5016722408026757, + "grad_norm": 1.1093269685755693, + "learning_rate": 5.8682408883346535e-06, + "loss": 1.2024, + "step": 1871 + }, + { + "epoch": 2.503010033444816, + "grad_norm": 0.945321644079698, + "learning_rate": 5.863644752069364e-06, + "loss": 1.1155, + "step": 1872 + }, + { + "epoch": 2.5043478260869563, + "grad_norm": 1.1905675349875542, + "learning_rate": 5.859047863477112e-06, + "loss": 1.1059, + "step": 1873 + }, + { + "epoch": 2.505685618729097, + "grad_norm": 0.8461508724345891, + "learning_rate": 5.854450226562274e-06, + "loss": 1.3982, + "step": 1874 + }, + { + "epoch": 2.5070234113712373, + "grad_norm": 0.99266877426683, + "learning_rate": 5.849851845329884e-06, + "loss": 1.116, + "step": 1875 + }, + { + "epoch": 2.508361204013378, + "grad_norm": 0.9610312337802948, + "learning_rate": 5.845252723785626e-06, + "loss": 1.2904, + "step": 1876 + }, + { + "epoch": 2.5096989966555183, + "grad_norm": 1.5901551650332417, + "learning_rate": 5.8406528659358234e-06, + "loss": 1.1619, + "step": 1877 + }, + { + "epoch": 2.511036789297659, + "grad_norm": 0.9931081239063003, + "learning_rate": 5.836052275787448e-06, + "loss": 0.9471, + "step": 1878 + }, + { + "epoch": 2.5123745819397993, + "grad_norm": 0.9857370161694914, + "learning_rate": 5.831450957348106e-06, + "loss": 1.1994, + "step": 1879 + }, + { + "epoch": 2.5137123745819396, + "grad_norm": 
0.8407797533840053, + "learning_rate": 5.826848914626035e-06, + "loss": 1.1854, + "step": 1880 + }, + { + "epoch": 2.5150501672240804, + "grad_norm": 1.1898842610293494, + "learning_rate": 5.822246151630109e-06, + "loss": 1.1602, + "step": 1881 + }, + { + "epoch": 2.5163879598662207, + "grad_norm": 0.9742483254099331, + "learning_rate": 5.817642672369825e-06, + "loss": 0.9744, + "step": 1882 + }, + { + "epoch": 2.517725752508361, + "grad_norm": 0.8047254215629368, + "learning_rate": 5.813038480855308e-06, + "loss": 1.0021, + "step": 1883 + }, + { + "epoch": 2.5190635451505017, + "grad_norm": 1.0735673847457252, + "learning_rate": 5.808433581097301e-06, + "loss": 1.3715, + "step": 1884 + }, + { + "epoch": 2.5204013377926424, + "grad_norm": 0.8391931560633863, + "learning_rate": 5.803827977107163e-06, + "loss": 1.1056, + "step": 1885 + }, + { + "epoch": 2.5217391304347827, + "grad_norm": 0.9131843668010159, + "learning_rate": 5.799221672896868e-06, + "loss": 1.0473, + "step": 1886 + }, + { + "epoch": 2.523076923076923, + "grad_norm": 1.0926044518919622, + "learning_rate": 5.794614672479e-06, + "loss": 1.1303, + "step": 1887 + }, + { + "epoch": 2.5244147157190637, + "grad_norm": 0.9535456523325415, + "learning_rate": 5.79000697986675e-06, + "loss": 1.0008, + "step": 1888 + }, + { + "epoch": 2.525752508361204, + "grad_norm": 0.9874488705983898, + "learning_rate": 5.7853985990739115e-06, + "loss": 1.1257, + "step": 1889 + }, + { + "epoch": 2.5270903010033443, + "grad_norm": 0.8874341196056981, + "learning_rate": 5.780789534114875e-06, + "loss": 1.2559, + "step": 1890 + }, + { + "epoch": 2.528428093645485, + "grad_norm": 0.8578796899762384, + "learning_rate": 5.77617978900463e-06, + "loss": 1.3546, + "step": 1891 + }, + { + "epoch": 2.5297658862876253, + "grad_norm": 0.948355920353001, + "learning_rate": 5.771569367758757e-06, + "loss": 1.205, + "step": 1892 + }, + { + "epoch": 2.5311036789297656, + "grad_norm": 0.8170034177623388, + "learning_rate": 5.766958274393428e-06, + "loss": 0.9986, + "step": 1893 + }, + { + "epoch": 2.5324414715719064, + "grad_norm": 0.9496991324522351, + "learning_rate": 5.762346512925397e-06, + "loss": 1.1141, + "step": 1894 + }, + { + "epoch": 2.533779264214047, + "grad_norm": 1.0320672184736253, + "learning_rate": 5.757734087372003e-06, + "loss": 1.1506, + "step": 1895 + }, + { + "epoch": 2.5351170568561874, + "grad_norm": 0.8652236121384564, + "learning_rate": 5.753121001751161e-06, + "loss": 1.3197, + "step": 1896 + }, + { + "epoch": 2.5364548494983277, + "grad_norm": 1.0056155737734929, + "learning_rate": 5.748507260081361e-06, + "loss": 1.2587, + "step": 1897 + }, + { + "epoch": 2.5377926421404684, + "grad_norm": 0.9035840441661858, + "learning_rate": 5.743892866381668e-06, + "loss": 1.26, + "step": 1898 + }, + { + "epoch": 2.5391304347826087, + "grad_norm": 0.9124431399485352, + "learning_rate": 5.739277824671711e-06, + "loss": 1.196, + "step": 1899 + }, + { + "epoch": 2.540468227424749, + "grad_norm": 0.9381494899312327, + "learning_rate": 5.734662138971686e-06, + "loss": 1.2459, + "step": 1900 + }, + { + "epoch": 2.5418060200668897, + "grad_norm": 0.899132918626439, + "learning_rate": 5.730045813302347e-06, + "loss": 1.2568, + "step": 1901 + }, + { + "epoch": 2.54314381270903, + "grad_norm": 0.9690876793067585, + "learning_rate": 5.725428851685011e-06, + "loss": 1.1005, + "step": 1902 + }, + { + "epoch": 2.5444816053511703, + "grad_norm": 1.1791825916332783, + "learning_rate": 5.720811258141541e-06, + "loss": 1.2539, + "step": 1903 + }, + { + "epoch": 
2.545819397993311, + "grad_norm": 1.0681722943818328, + "learning_rate": 5.716193036694359e-06, + "loss": 1.1477, + "step": 1904 + }, + { + "epoch": 2.5471571906354518, + "grad_norm": 0.8802646717430055, + "learning_rate": 5.711574191366427e-06, + "loss": 1.0063, + "step": 1905 + }, + { + "epoch": 2.548494983277592, + "grad_norm": 0.7671182840461953, + "learning_rate": 5.706954726181255e-06, + "loss": 1.0528, + "step": 1906 + }, + { + "epoch": 2.5498327759197323, + "grad_norm": 0.883009536997986, + "learning_rate": 5.70233464516289e-06, + "loss": 1.2866, + "step": 1907 + }, + { + "epoch": 2.551170568561873, + "grad_norm": 0.895557863341288, + "learning_rate": 5.697713952335918e-06, + "loss": 1.1795, + "step": 1908 + }, + { + "epoch": 2.5525083612040134, + "grad_norm": 1.0811764828759998, + "learning_rate": 5.693092651725457e-06, + "loss": 1.3279, + "step": 1909 + }, + { + "epoch": 2.5538461538461537, + "grad_norm": 1.0116642763116326, + "learning_rate": 5.688470747357153e-06, + "loss": 1.0084, + "step": 1910 + }, + { + "epoch": 2.5551839464882944, + "grad_norm": 0.972433375713824, + "learning_rate": 5.683848243257181e-06, + "loss": 1.2375, + "step": 1911 + }, + { + "epoch": 2.5565217391304347, + "grad_norm": 0.8090993025169614, + "learning_rate": 5.679225143452233e-06, + "loss": 1.0676, + "step": 1912 + }, + { + "epoch": 2.5578595317725754, + "grad_norm": 1.0099329158706112, + "learning_rate": 5.674601451969527e-06, + "loss": 1.1595, + "step": 1913 + }, + { + "epoch": 2.5591973244147157, + "grad_norm": 0.9458678711406567, + "learning_rate": 5.669977172836791e-06, + "loss": 1.2343, + "step": 1914 + }, + { + "epoch": 2.5605351170568564, + "grad_norm": 0.9260676774332742, + "learning_rate": 5.66535231008227e-06, + "loss": 1.3107, + "step": 1915 + }, + { + "epoch": 2.5618729096989967, + "grad_norm": 0.9777797642654215, + "learning_rate": 5.66072686773471e-06, + "loss": 1.0502, + "step": 1916 + }, + { + "epoch": 2.563210702341137, + "grad_norm": 0.9321041929573854, + "learning_rate": 5.656100849823366e-06, + "loss": 1.0251, + "step": 1917 + }, + { + "epoch": 2.5645484949832777, + "grad_norm": 1.2168395133589351, + "learning_rate": 5.651474260377998e-06, + "loss": 1.1842, + "step": 1918 + }, + { + "epoch": 2.565886287625418, + "grad_norm": 0.9273013540998543, + "learning_rate": 5.646847103428859e-06, + "loss": 1.1388, + "step": 1919 + }, + { + "epoch": 2.5672240802675583, + "grad_norm": 1.0974174756716877, + "learning_rate": 5.642219383006696e-06, + "loss": 1.079, + "step": 1920 + }, + { + "epoch": 2.568561872909699, + "grad_norm": 1.3677799576196028, + "learning_rate": 5.63759110314275e-06, + "loss": 1.0908, + "step": 1921 + }, + { + "epoch": 2.5698996655518394, + "grad_norm": 0.9044238849318234, + "learning_rate": 5.632962267868747e-06, + "loss": 1.1118, + "step": 1922 + }, + { + "epoch": 2.57123745819398, + "grad_norm": 1.1413800740450317, + "learning_rate": 5.628332881216899e-06, + "loss": 1.0282, + "step": 1923 + }, + { + "epoch": 2.5725752508361204, + "grad_norm": 0.9660639090886108, + "learning_rate": 5.623702947219896e-06, + "loss": 1.0066, + "step": 1924 + }, + { + "epoch": 2.573913043478261, + "grad_norm": 1.064630500023338, + "learning_rate": 5.619072469910907e-06, + "loss": 1.2051, + "step": 1925 + }, + { + "epoch": 2.5752508361204014, + "grad_norm": 0.9148087099779061, + "learning_rate": 5.614441453323571e-06, + "loss": 1.0752, + "step": 1926 + }, + { + "epoch": 2.5765886287625417, + "grad_norm": 0.9218330716229904, + "learning_rate": 5.609809901492e-06, + "loss": 1.1741, + "step": 
1927 + }, + { + "epoch": 2.5779264214046824, + "grad_norm": 1.1186899287859116, + "learning_rate": 5.605177818450772e-06, + "loss": 1.2191, + "step": 1928 + }, + { + "epoch": 2.5792642140468227, + "grad_norm": 1.0854435790042558, + "learning_rate": 5.600545208234927e-06, + "loss": 1.2356, + "step": 1929 + }, + { + "epoch": 2.580602006688963, + "grad_norm": 1.0836948867726905, + "learning_rate": 5.595912074879961e-06, + "loss": 1.1816, + "step": 1930 + }, + { + "epoch": 2.5819397993311037, + "grad_norm": 1.0310795529587704, + "learning_rate": 5.591278422421831e-06, + "loss": 1.0732, + "step": 1931 + }, + { + "epoch": 2.583277591973244, + "grad_norm": 0.9679981092319316, + "learning_rate": 5.586644254896945e-06, + "loss": 1.1535, + "step": 1932 + }, + { + "epoch": 2.5846153846153848, + "grad_norm": 0.9355398064041334, + "learning_rate": 5.5820095763421565e-06, + "loss": 1.0831, + "step": 1933 + }, + { + "epoch": 2.585953177257525, + "grad_norm": 1.0656246063535602, + "learning_rate": 5.5773743907947674e-06, + "loss": 1.1903, + "step": 1934 + }, + { + "epoch": 2.587290969899666, + "grad_norm": 1.0457223059729588, + "learning_rate": 5.57273870229252e-06, + "loss": 1.0123, + "step": 1935 + }, + { + "epoch": 2.588628762541806, + "grad_norm": 0.9782424826026477, + "learning_rate": 5.568102514873595e-06, + "loss": 0.9818, + "step": 1936 + }, + { + "epoch": 2.5899665551839464, + "grad_norm": 1.095496049380514, + "learning_rate": 5.5634658325766066e-06, + "loss": 1.1864, + "step": 1937 + }, + { + "epoch": 2.591304347826087, + "grad_norm": 1.323365272207047, + "learning_rate": 5.558828659440603e-06, + "loss": 1.2179, + "step": 1938 + }, + { + "epoch": 2.5926421404682274, + "grad_norm": 0.9674474928922701, + "learning_rate": 5.5541909995050554e-06, + "loss": 1.0576, + "step": 1939 + }, + { + "epoch": 2.5939799331103677, + "grad_norm": 1.3395682666351174, + "learning_rate": 5.549552856809865e-06, + "loss": 1.1471, + "step": 1940 + }, + { + "epoch": 2.5953177257525084, + "grad_norm": 1.228121524116488, + "learning_rate": 5.544914235395347e-06, + "loss": 1.0527, + "step": 1941 + }, + { + "epoch": 2.5966555183946487, + "grad_norm": 0.8736322723636835, + "learning_rate": 5.540275139302241e-06, + "loss": 1.1519, + "step": 1942 + }, + { + "epoch": 2.5979933110367894, + "grad_norm": 1.0774963595964024, + "learning_rate": 5.53563557257169e-06, + "loss": 0.983, + "step": 1943 + }, + { + "epoch": 2.5993311036789297, + "grad_norm": 0.9695978308210649, + "learning_rate": 5.5309955392452585e-06, + "loss": 1.2491, + "step": 1944 + }, + { + "epoch": 2.6006688963210705, + "grad_norm": 0.8606251045756166, + "learning_rate": 5.526355043364909e-06, + "loss": 1.1753, + "step": 1945 + }, + { + "epoch": 2.6020066889632107, + "grad_norm": 1.2507391490326398, + "learning_rate": 5.521714088973012e-06, + "loss": 1.1875, + "step": 1946 + }, + { + "epoch": 2.603344481605351, + "grad_norm": 1.0687145582925945, + "learning_rate": 5.517072680112332e-06, + "loss": 1.1984, + "step": 1947 + }, + { + "epoch": 2.6046822742474918, + "grad_norm": 1.0242439599067827, + "learning_rate": 5.512430820826035e-06, + "loss": 1.3009, + "step": 1948 + }, + { + "epoch": 2.606020066889632, + "grad_norm": 0.92667911870277, + "learning_rate": 5.507788515157677e-06, + "loss": 1.1193, + "step": 1949 + }, + { + "epoch": 2.6073578595317723, + "grad_norm": 0.7978175897701381, + "learning_rate": 5.503145767151201e-06, + "loss": 1.052, + "step": 1950 + }, + { + "epoch": 2.608695652173913, + "grad_norm": 0.9277620005008367, + "learning_rate": 
5.498502580850938e-06, + "loss": 0.9066, + "step": 1951 + }, + { + "epoch": 2.6100334448160534, + "grad_norm": 1.0295571682524858, + "learning_rate": 5.493858960301602e-06, + "loss": 1.1499, + "step": 1952 + }, + { + "epoch": 2.611371237458194, + "grad_norm": 1.2789708540659368, + "learning_rate": 5.4892149095482815e-06, + "loss": 0.939, + "step": 1953 + }, + { + "epoch": 2.6127090301003344, + "grad_norm": 0.8962546737433192, + "learning_rate": 5.484570432636441e-06, + "loss": 1.3155, + "step": 1954 + }, + { + "epoch": 2.614046822742475, + "grad_norm": 0.9562468863548437, + "learning_rate": 5.479925533611917e-06, + "loss": 1.1112, + "step": 1955 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 1.008278391520744, + "learning_rate": 5.475280216520913e-06, + "loss": 1.0061, + "step": 1956 + }, + { + "epoch": 2.6167224080267557, + "grad_norm": 0.7976600252636826, + "learning_rate": 5.470634485409999e-06, + "loss": 1.2069, + "step": 1957 + }, + { + "epoch": 2.6180602006688964, + "grad_norm": 0.860184547751308, + "learning_rate": 5.465988344326103e-06, + "loss": 1.3305, + "step": 1958 + }, + { + "epoch": 2.6193979933110367, + "grad_norm": 1.0321738142081163, + "learning_rate": 5.46134179731651e-06, + "loss": 1.0688, + "step": 1959 + }, + { + "epoch": 2.620735785953177, + "grad_norm": 1.0440336411835986, + "learning_rate": 5.456694848428861e-06, + "loss": 0.9964, + "step": 1960 + }, + { + "epoch": 2.6220735785953178, + "grad_norm": 0.9612129352542391, + "learning_rate": 5.452047501711144e-06, + "loss": 1.1552, + "step": 1961 + }, + { + "epoch": 2.623411371237458, + "grad_norm": 0.830969345494151, + "learning_rate": 5.4473997612116956e-06, + "loss": 1.1333, + "step": 1962 + }, + { + "epoch": 2.624749163879599, + "grad_norm": 1.1447482660155537, + "learning_rate": 5.442751630979195e-06, + "loss": 0.9653, + "step": 1963 + }, + { + "epoch": 2.626086956521739, + "grad_norm": 0.8976735270896373, + "learning_rate": 5.438103115062662e-06, + "loss": 1.2911, + "step": 1964 + }, + { + "epoch": 2.62742474916388, + "grad_norm": 1.0959905623165183, + "learning_rate": 5.4334542175114495e-06, + "loss": 1.017, + "step": 1965 + }, + { + "epoch": 2.62876254180602, + "grad_norm": 0.8626876253322874, + "learning_rate": 5.428804942375243e-06, + "loss": 1.0659, + "step": 1966 + }, + { + "epoch": 2.6301003344481604, + "grad_norm": 0.8945552285492642, + "learning_rate": 5.424155293704063e-06, + "loss": 1.1095, + "step": 1967 + }, + { + "epoch": 2.631438127090301, + "grad_norm": 0.8916749985055723, + "learning_rate": 5.419505275548249e-06, + "loss": 1.1144, + "step": 1968 + }, + { + "epoch": 2.6327759197324414, + "grad_norm": 0.9594562483367076, + "learning_rate": 5.414854891958464e-06, + "loss": 0.8684, + "step": 1969 + }, + { + "epoch": 2.6341137123745817, + "grad_norm": 1.0943758301843511, + "learning_rate": 5.41020414698569e-06, + "loss": 1.0869, + "step": 1970 + }, + { + "epoch": 2.6354515050167224, + "grad_norm": 0.8007906816062694, + "learning_rate": 5.40555304468122e-06, + "loss": 1.1729, + "step": 1971 + }, + { + "epoch": 2.6367892976588627, + "grad_norm": 1.0706663330329604, + "learning_rate": 5.400901589096667e-06, + "loss": 1.1628, + "step": 1972 + }, + { + "epoch": 2.6381270903010035, + "grad_norm": 1.040242604527286, + "learning_rate": 5.396249784283943e-06, + "loss": 1.0824, + "step": 1973 + }, + { + "epoch": 2.6394648829431437, + "grad_norm": 0.8906364854035667, + "learning_rate": 5.391597634295269e-06, + "loss": 1.0333, + "step": 1974 + }, + { + "epoch": 2.6408026755852845, + "grad_norm": 
0.9830577861414344, + "learning_rate": 5.386945143183164e-06, + "loss": 1.0934, + "step": 1975 + }, + { + "epoch": 2.6421404682274248, + "grad_norm": 1.0913657223067021, + "learning_rate": 5.382292315000448e-06, + "loss": 1.0562, + "step": 1976 + }, + { + "epoch": 2.643478260869565, + "grad_norm": 1.2459914279564375, + "learning_rate": 5.377639153800229e-06, + "loss": 1.2326, + "step": 1977 + }, + { + "epoch": 2.644816053511706, + "grad_norm": 1.4391045554499817, + "learning_rate": 5.37298566363591e-06, + "loss": 0.9526, + "step": 1978 + }, + { + "epoch": 2.646153846153846, + "grad_norm": 1.224970131650155, + "learning_rate": 5.368331848561178e-06, + "loss": 1.1792, + "step": 1979 + }, + { + "epoch": 2.6474916387959864, + "grad_norm": 1.0161587345667826, + "learning_rate": 5.363677712630004e-06, + "loss": 1.3102, + "step": 1980 + }, + { + "epoch": 2.648829431438127, + "grad_norm": 0.9750038580853488, + "learning_rate": 5.359023259896638e-06, + "loss": 1.1016, + "step": 1981 + }, + { + "epoch": 2.650167224080268, + "grad_norm": 0.9591750863976471, + "learning_rate": 5.354368494415607e-06, + "loss": 1.0885, + "step": 1982 + }, + { + "epoch": 2.651505016722408, + "grad_norm": 0.867658242594789, + "learning_rate": 5.34971342024171e-06, + "loss": 1.1222, + "step": 1983 + }, + { + "epoch": 2.6528428093645484, + "grad_norm": 0.9303827547010471, + "learning_rate": 5.345058041430013e-06, + "loss": 0.9732, + "step": 1984 + }, + { + "epoch": 2.654180602006689, + "grad_norm": 1.1390606980687696, + "learning_rate": 5.3404023620358494e-06, + "loss": 1.048, + "step": 1985 + }, + { + "epoch": 2.6555183946488294, + "grad_norm": 1.0382079824567223, + "learning_rate": 5.335746386114814e-06, + "loss": 1.1825, + "step": 1986 + }, + { + "epoch": 2.6568561872909697, + "grad_norm": 1.004464447129808, + "learning_rate": 5.3310901177227615e-06, + "loss": 1.0594, + "step": 1987 + }, + { + "epoch": 2.6581939799331105, + "grad_norm": 0.821265508206081, + "learning_rate": 5.326433560915798e-06, + "loss": 1.1023, + "step": 1988 + }, + { + "epoch": 2.6595317725752508, + "grad_norm": 0.9465588431502765, + "learning_rate": 5.321776719750283e-06, + "loss": 1.2752, + "step": 1989 + }, + { + "epoch": 2.660869565217391, + "grad_norm": 0.8158516473906591, + "learning_rate": 5.317119598282823e-06, + "loss": 0.9834, + "step": 1990 + }, + { + "epoch": 2.6622073578595318, + "grad_norm": 1.0952067547274482, + "learning_rate": 5.31246220057027e-06, + "loss": 1.0471, + "step": 1991 + }, + { + "epoch": 2.6635451505016725, + "grad_norm": 0.872859402466523, + "learning_rate": 5.3078045306697154e-06, + "loss": 1.4479, + "step": 1992 + }, + { + "epoch": 2.664882943143813, + "grad_norm": 0.9023156370529694, + "learning_rate": 5.303146592638487e-06, + "loss": 1.2376, + "step": 1993 + }, + { + "epoch": 2.666220735785953, + "grad_norm": 0.9218021162394812, + "learning_rate": 5.298488390534148e-06, + "loss": 1.0526, + "step": 1994 + }, + { + "epoch": 2.667558528428094, + "grad_norm": 0.9601485762426231, + "learning_rate": 5.29382992841449e-06, + "loss": 1.3406, + "step": 1995 + }, + { + "epoch": 2.668896321070234, + "grad_norm": 0.8379346535406986, + "learning_rate": 5.289171210337531e-06, + "loss": 1.1392, + "step": 1996 + }, + { + "epoch": 2.6702341137123744, + "grad_norm": 0.837095149847414, + "learning_rate": 5.284512240361516e-06, + "loss": 1.0953, + "step": 1997 + }, + { + "epoch": 2.671571906354515, + "grad_norm": 0.8593528304407095, + "learning_rate": 5.279853022544904e-06, + "loss": 1.309, + "step": 1998 + }, + { + "epoch": 
2.6729096989966554, + "grad_norm": 1.2371139578202317, + "learning_rate": 5.275193560946372e-06, + "loss": 0.9566, + "step": 1999 + }, + { + "epoch": 2.6742474916387957, + "grad_norm": 0.9619954915677199, + "learning_rate": 5.27053385962481e-06, + "loss": 1.1405, + "step": 2000 + }, + { + "epoch": 2.6755852842809364, + "grad_norm": 0.9095493959027289, + "learning_rate": 5.265873922639315e-06, + "loss": 1.3071, + "step": 2001 + }, + { + "epoch": 2.676923076923077, + "grad_norm": 1.0588055549114577, + "learning_rate": 5.261213754049193e-06, + "loss": 1.0506, + "step": 2002 + }, + { + "epoch": 2.6782608695652175, + "grad_norm": 1.0490253799137084, + "learning_rate": 5.2565533579139484e-06, + "loss": 1.2147, + "step": 2003 + }, + { + "epoch": 2.6795986622073578, + "grad_norm": 0.9383766252491154, + "learning_rate": 5.251892738293285e-06, + "loss": 1.1365, + "step": 2004 + }, + { + "epoch": 2.6809364548494985, + "grad_norm": 0.8617028029877167, + "learning_rate": 5.247231899247099e-06, + "loss": 1.419, + "step": 2005 + }, + { + "epoch": 2.682274247491639, + "grad_norm": 1.1265834352281703, + "learning_rate": 5.242570844835484e-06, + "loss": 1.162, + "step": 2006 + }, + { + "epoch": 2.683612040133779, + "grad_norm": 0.9120059831200634, + "learning_rate": 5.237909579118713e-06, + "loss": 1.0149, + "step": 2007 + }, + { + "epoch": 2.68494983277592, + "grad_norm": 0.8743977770730044, + "learning_rate": 5.233248106157248e-06, + "loss": 1.0499, + "step": 2008 + }, + { + "epoch": 2.68628762541806, + "grad_norm": 1.2535171777236058, + "learning_rate": 5.228586430011732e-06, + "loss": 1.2318, + "step": 2009 + }, + { + "epoch": 2.687625418060201, + "grad_norm": 0.9167823250673414, + "learning_rate": 5.223924554742982e-06, + "loss": 1.2313, + "step": 2010 + }, + { + "epoch": 2.688963210702341, + "grad_norm": 1.0502949261931416, + "learning_rate": 5.21926248441199e-06, + "loss": 1.1892, + "step": 2011 + }, + { + "epoch": 2.690301003344482, + "grad_norm": 1.025931700115416, + "learning_rate": 5.21460022307992e-06, + "loss": 0.9437, + "step": 2012 + }, + { + "epoch": 2.691638795986622, + "grad_norm": 0.9880287036778672, + "learning_rate": 5.209937774808098e-06, + "loss": 1.1826, + "step": 2013 + }, + { + "epoch": 2.6929765886287624, + "grad_norm": 0.8659945318783809, + "learning_rate": 5.205275143658018e-06, + "loss": 1.0548, + "step": 2014 + }, + { + "epoch": 2.694314381270903, + "grad_norm": 0.863254962101334, + "learning_rate": 5.2006123336913275e-06, + "loss": 1.1522, + "step": 2015 + }, + { + "epoch": 2.6956521739130435, + "grad_norm": 1.2220136526473997, + "learning_rate": 5.195949348969833e-06, + "loss": 1.1376, + "step": 2016 + }, + { + "epoch": 2.6969899665551837, + "grad_norm": 1.113414311215439, + "learning_rate": 5.191286193555496e-06, + "loss": 0.8737, + "step": 2017 + }, + { + "epoch": 2.6983277591973245, + "grad_norm": 0.9221634768108381, + "learning_rate": 5.186622871510421e-06, + "loss": 1.2923, + "step": 2018 + }, + { + "epoch": 2.6996655518394648, + "grad_norm": 0.855362191298973, + "learning_rate": 5.181959386896862e-06, + "loss": 1.1086, + "step": 2019 + }, + { + "epoch": 2.7010033444816055, + "grad_norm": 0.8457146980462446, + "learning_rate": 5.177295743777212e-06, + "loss": 1.204, + "step": 2020 + }, + { + "epoch": 2.702341137123746, + "grad_norm": 1.0462387741861787, + "learning_rate": 5.172631946214003e-06, + "loss": 1.0712, + "step": 2021 + }, + { + "epoch": 2.7036789297658865, + "grad_norm": 0.9913042395628907, + "learning_rate": 5.167967998269902e-06, + "loss": 1.0644, + "step": 
2022 + }, + { + "epoch": 2.705016722408027, + "grad_norm": 1.1482227613893319, + "learning_rate": 5.1633039040077046e-06, + "loss": 1.1876, + "step": 2023 + }, + { + "epoch": 2.706354515050167, + "grad_norm": 0.9331966638043082, + "learning_rate": 5.15863966749034e-06, + "loss": 1.2536, + "step": 2024 + }, + { + "epoch": 2.707692307692308, + "grad_norm": 1.0185974339358557, + "learning_rate": 5.153975292780852e-06, + "loss": 1.2537, + "step": 2025 + }, + { + "epoch": 2.709030100334448, + "grad_norm": 0.9067761715508699, + "learning_rate": 5.149310783942414e-06, + "loss": 1.1009, + "step": 2026 + }, + { + "epoch": 2.7103678929765884, + "grad_norm": 1.1497316793187025, + "learning_rate": 5.144646145038311e-06, + "loss": 1.0583, + "step": 2027 + }, + { + "epoch": 2.711705685618729, + "grad_norm": 1.0110652802886844, + "learning_rate": 5.139981380131943e-06, + "loss": 1.1903, + "step": 2028 + }, + { + "epoch": 2.7130434782608694, + "grad_norm": 0.9544579992609292, + "learning_rate": 5.135316493286818e-06, + "loss": 1.0677, + "step": 2029 + }, + { + "epoch": 2.71438127090301, + "grad_norm": 0.856359730820334, + "learning_rate": 5.1306514885665524e-06, + "loss": 1.0809, + "step": 2030 + }, + { + "epoch": 2.7157190635451505, + "grad_norm": 0.9075057693592437, + "learning_rate": 5.125986370034862e-06, + "loss": 0.9971, + "step": 2031 + }, + { + "epoch": 2.717056856187291, + "grad_norm": 0.8836371115613573, + "learning_rate": 5.121321141755568e-06, + "loss": 1.2398, + "step": 2032 + }, + { + "epoch": 2.7183946488294315, + "grad_norm": 1.019797317319734, + "learning_rate": 5.116655807792581e-06, + "loss": 1.2121, + "step": 2033 + }, + { + "epoch": 2.719732441471572, + "grad_norm": 0.9139960267175732, + "learning_rate": 5.111990372209906e-06, + "loss": 1.1188, + "step": 2034 + }, + { + "epoch": 2.7210702341137125, + "grad_norm": 0.7964487966442207, + "learning_rate": 5.107324839071638e-06, + "loss": 1.3085, + "step": 2035 + }, + { + "epoch": 2.722408026755853, + "grad_norm": 0.8633304165039568, + "learning_rate": 5.102659212441953e-06, + "loss": 0.9364, + "step": 2036 + }, + { + "epoch": 2.723745819397993, + "grad_norm": 0.946994893294297, + "learning_rate": 5.097993496385112e-06, + "loss": 1.0205, + "step": 2037 + }, + { + "epoch": 2.725083612040134, + "grad_norm": 1.0338061886539838, + "learning_rate": 5.093327694965453e-06, + "loss": 1.1603, + "step": 2038 + }, + { + "epoch": 2.726421404682274, + "grad_norm": 0.9822115979659388, + "learning_rate": 5.088661812247389e-06, + "loss": 1.1228, + "step": 2039 + }, + { + "epoch": 2.727759197324415, + "grad_norm": 0.9703288428351918, + "learning_rate": 5.083995852295402e-06, + "loss": 1.1161, + "step": 2040 + }, + { + "epoch": 2.729096989966555, + "grad_norm": 0.8975494368940978, + "learning_rate": 5.07932981917404e-06, + "loss": 1.1722, + "step": 2041 + }, + { + "epoch": 2.730434782608696, + "grad_norm": 0.9133275239710323, + "learning_rate": 5.0746637169479205e-06, + "loss": 1.0641, + "step": 2042 + }, + { + "epoch": 2.731772575250836, + "grad_norm": 0.8809528422389266, + "learning_rate": 5.069997549681718e-06, + "loss": 0.9794, + "step": 2043 + }, + { + "epoch": 2.7331103678929765, + "grad_norm": 0.8762053992267973, + "learning_rate": 5.06533132144016e-06, + "loss": 1.2611, + "step": 2044 + }, + { + "epoch": 2.734448160535117, + "grad_norm": 0.9537113264485307, + "learning_rate": 5.060665036288034e-06, + "loss": 1.0531, + "step": 2045 + }, + { + "epoch": 2.7357859531772575, + "grad_norm": 0.8991103370540876, + "learning_rate": 5.0559986982901695e-06, + 
"loss": 1.111, + "step": 2046 + }, + { + "epoch": 2.7371237458193978, + "grad_norm": 0.9137219575976356, + "learning_rate": 5.05133231151145e-06, + "loss": 1.303, + "step": 2047 + }, + { + "epoch": 2.7384615384615385, + "grad_norm": 0.9039198878004081, + "learning_rate": 5.046665880016795e-06, + "loss": 1.1801, + "step": 2048 + }, + { + "epoch": 2.739799331103679, + "grad_norm": 0.9074944144498762, + "learning_rate": 5.041999407871168e-06, + "loss": 1.0613, + "step": 2049 + }, + { + "epoch": 2.7411371237458195, + "grad_norm": 0.8324425472272402, + "learning_rate": 5.037332899139563e-06, + "loss": 1.2071, + "step": 2050 + }, + { + "epoch": 2.74247491638796, + "grad_norm": 0.9122471438077746, + "learning_rate": 5.0326663578870095e-06, + "loss": 1.0215, + "step": 2051 + }, + { + "epoch": 2.7438127090301005, + "grad_norm": 1.1668725917020413, + "learning_rate": 5.0279997881785635e-06, + "loss": 0.9371, + "step": 2052 + }, + { + "epoch": 2.745150501672241, + "grad_norm": 1.2498837033617924, + "learning_rate": 5.0233331940793074e-06, + "loss": 1.172, + "step": 2053 + }, + { + "epoch": 2.746488294314381, + "grad_norm": 0.9541660587367822, + "learning_rate": 5.018666579654342e-06, + "loss": 1.1056, + "step": 2054 + }, + { + "epoch": 2.747826086956522, + "grad_norm": 1.0954381179154014, + "learning_rate": 5.01399994896879e-06, + "loss": 1.3612, + "step": 2055 + }, + { + "epoch": 2.749163879598662, + "grad_norm": 0.7853829048988756, + "learning_rate": 5.009333306087784e-06, + "loss": 1.1384, + "step": 2056 + }, + { + "epoch": 2.7505016722408024, + "grad_norm": 1.086661440209556, + "learning_rate": 5.00466665507647e-06, + "loss": 1.0613, + "step": 2057 + }, + { + "epoch": 2.751839464882943, + "grad_norm": 0.877897746370246, + "learning_rate": 5e-06, + "loss": 1.2903, + "step": 2058 + }, + { + "epoch": 2.7531772575250835, + "grad_norm": 1.1762460204130751, + "learning_rate": 4.995333344923531e-06, + "loss": 1.1254, + "step": 2059 + }, + { + "epoch": 2.754515050167224, + "grad_norm": 1.288470582314544, + "learning_rate": 4.990666693912218e-06, + "loss": 0.9912, + "step": 2060 + }, + { + "epoch": 2.7558528428093645, + "grad_norm": 0.9435309691368428, + "learning_rate": 4.986000051031212e-06, + "loss": 0.9712, + "step": 2061 + }, + { + "epoch": 2.7571906354515052, + "grad_norm": 1.1990188848603738, + "learning_rate": 4.9813334203456595e-06, + "loss": 1.043, + "step": 2062 + }, + { + "epoch": 2.7585284280936455, + "grad_norm": 1.0574577729520038, + "learning_rate": 4.976666805920694e-06, + "loss": 1.051, + "step": 2063 + }, + { + "epoch": 2.759866220735786, + "grad_norm": 0.9520696578598331, + "learning_rate": 4.972000211821438e-06, + "loss": 0.9989, + "step": 2064 + }, + { + "epoch": 2.7612040133779265, + "grad_norm": 1.1788436219015193, + "learning_rate": 4.967333642112992e-06, + "loss": 1.0832, + "step": 2065 + }, + { + "epoch": 2.762541806020067, + "grad_norm": 0.9580394668794241, + "learning_rate": 4.9626671008604385e-06, + "loss": 1.1525, + "step": 2066 + }, + { + "epoch": 2.763879598662207, + "grad_norm": 0.8809808588570616, + "learning_rate": 4.958000592128834e-06, + "loss": 1.174, + "step": 2067 + }, + { + "epoch": 2.765217391304348, + "grad_norm": 1.0437180316613681, + "learning_rate": 4.953334119983206e-06, + "loss": 1.0767, + "step": 2068 + }, + { + "epoch": 2.766555183946488, + "grad_norm": 0.9546997713753513, + "learning_rate": 4.948667688488552e-06, + "loss": 1.1331, + "step": 2069 + }, + { + "epoch": 2.767892976588629, + "grad_norm": 1.086336763671734, + "learning_rate": 
4.944001301709832e-06, + "loss": 1.1234, + "step": 2070 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 1.0136294130671486, + "learning_rate": 4.9393349637119695e-06, + "loss": 0.9886, + "step": 2071 + }, + { + "epoch": 2.77056856187291, + "grad_norm": 1.0401476422534288, + "learning_rate": 4.934668678559842e-06, + "loss": 1.1064, + "step": 2072 + }, + { + "epoch": 2.77190635451505, + "grad_norm": 0.8712830964713173, + "learning_rate": 4.930002450318282e-06, + "loss": 1.2612, + "step": 2073 + }, + { + "epoch": 2.7732441471571905, + "grad_norm": 0.8952454666724489, + "learning_rate": 4.925336283052079e-06, + "loss": 1.2125, + "step": 2074 + }, + { + "epoch": 2.774581939799331, + "grad_norm": 0.9258688277506749, + "learning_rate": 4.9206701808259605e-06, + "loss": 1.0871, + "step": 2075 + }, + { + "epoch": 2.7759197324414715, + "grad_norm": 0.856188624831838, + "learning_rate": 4.9160041477046e-06, + "loss": 1.0888, + "step": 2076 + }, + { + "epoch": 2.777257525083612, + "grad_norm": 0.9653080144700406, + "learning_rate": 4.911338187752612e-06, + "loss": 0.9336, + "step": 2077 + }, + { + "epoch": 2.7785953177257525, + "grad_norm": 0.931171936551532, + "learning_rate": 4.906672305034548e-06, + "loss": 1.0902, + "step": 2078 + }, + { + "epoch": 2.779933110367893, + "grad_norm": 1.1127698327806712, + "learning_rate": 4.9020065036148885e-06, + "loss": 1.1811, + "step": 2079 + }, + { + "epoch": 2.7812709030100335, + "grad_norm": 0.8321776233520158, + "learning_rate": 4.8973407875580485e-06, + "loss": 1.1869, + "step": 2080 + }, + { + "epoch": 2.782608695652174, + "grad_norm": 0.8554175373609216, + "learning_rate": 4.892675160928364e-06, + "loss": 1.0758, + "step": 2081 + }, + { + "epoch": 2.7839464882943146, + "grad_norm": 1.0462964704590723, + "learning_rate": 4.888009627790095e-06, + "loss": 1.0285, + "step": 2082 + }, + { + "epoch": 2.785284280936455, + "grad_norm": 0.9693579779997654, + "learning_rate": 4.8833441922074194e-06, + "loss": 1.0455, + "step": 2083 + }, + { + "epoch": 2.786622073578595, + "grad_norm": 1.0083460108450935, + "learning_rate": 4.878678858244432e-06, + "loss": 1.1705, + "step": 2084 + }, + { + "epoch": 2.787959866220736, + "grad_norm": 0.9374847488916958, + "learning_rate": 4.874013629965138e-06, + "loss": 0.9737, + "step": 2085 + }, + { + "epoch": 2.789297658862876, + "grad_norm": 0.9938930997616239, + "learning_rate": 4.869348511433449e-06, + "loss": 1.2439, + "step": 2086 + }, + { + "epoch": 2.7906354515050165, + "grad_norm": 1.2255629659717586, + "learning_rate": 4.864683506713183e-06, + "loss": 1.3354, + "step": 2087 + }, + { + "epoch": 2.791973244147157, + "grad_norm": 1.0782486728366465, + "learning_rate": 4.860018619868058e-06, + "loss": 1.1318, + "step": 2088 + }, + { + "epoch": 2.793311036789298, + "grad_norm": 0.9334408197356296, + "learning_rate": 4.85535385496169e-06, + "loss": 1.0872, + "step": 2089 + }, + { + "epoch": 2.794648829431438, + "grad_norm": 1.0548067751151755, + "learning_rate": 4.850689216057587e-06, + "loss": 1.1042, + "step": 2090 + }, + { + "epoch": 2.7959866220735785, + "grad_norm": 1.0087103828715216, + "learning_rate": 4.846024707219149e-06, + "loss": 1.2649, + "step": 2091 + }, + { + "epoch": 2.7973244147157192, + "grad_norm": 0.9565649889055464, + "learning_rate": 4.841360332509663e-06, + "loss": 1.2787, + "step": 2092 + }, + { + "epoch": 2.7986622073578595, + "grad_norm": 0.9279326915190004, + "learning_rate": 4.836696095992296e-06, + "loss": 1.1878, + "step": 2093 + }, + { + "epoch": 2.8, + "grad_norm": 0.9828441827626779, + 
"learning_rate": 4.8320320017301e-06, + "loss": 0.9602, + "step": 2094 + }, + { + "epoch": 2.8013377926421406, + "grad_norm": 1.0807140689599695, + "learning_rate": 4.827368053785999e-06, + "loss": 1.2265, + "step": 2095 + }, + { + "epoch": 2.802675585284281, + "grad_norm": 1.094782158657073, + "learning_rate": 4.82270425622279e-06, + "loss": 1.1895, + "step": 2096 + }, + { + "epoch": 2.804013377926421, + "grad_norm": 0.8151798225279424, + "learning_rate": 4.818040613103139e-06, + "loss": 1.1623, + "step": 2097 + }, + { + "epoch": 2.805351170568562, + "grad_norm": 0.8605696489195682, + "learning_rate": 4.81337712848958e-06, + "loss": 0.9561, + "step": 2098 + }, + { + "epoch": 2.8066889632107026, + "grad_norm": 1.051486526080082, + "learning_rate": 4.808713806444506e-06, + "loss": 1.2692, + "step": 2099 + }, + { + "epoch": 2.808026755852843, + "grad_norm": 0.8513892705533331, + "learning_rate": 4.804050651030168e-06, + "loss": 1.1449, + "step": 2100 + }, + { + "epoch": 2.809364548494983, + "grad_norm": 0.8322116282302046, + "learning_rate": 4.799387666308675e-06, + "loss": 1.0787, + "step": 2101 + }, + { + "epoch": 2.810702341137124, + "grad_norm": 1.3946331384103634, + "learning_rate": 4.794724856341985e-06, + "loss": 0.9985, + "step": 2102 + }, + { + "epoch": 2.812040133779264, + "grad_norm": 1.0666692273521357, + "learning_rate": 4.790062225191902e-06, + "loss": 1.3062, + "step": 2103 + }, + { + "epoch": 2.8133779264214045, + "grad_norm": 1.0874972970035837, + "learning_rate": 4.785399776920081e-06, + "loss": 1.1878, + "step": 2104 + }, + { + "epoch": 2.8147157190635452, + "grad_norm": 0.9705412097533702, + "learning_rate": 4.780737515588011e-06, + "loss": 1.0041, + "step": 2105 + }, + { + "epoch": 2.8160535117056855, + "grad_norm": 0.8370462018418929, + "learning_rate": 4.77607544525702e-06, + "loss": 0.9675, + "step": 2106 + }, + { + "epoch": 2.8173913043478263, + "grad_norm": 1.069804515116575, + "learning_rate": 4.77141356998827e-06, + "loss": 1.2317, + "step": 2107 + }, + { + "epoch": 2.8187290969899665, + "grad_norm": 0.9006190638303447, + "learning_rate": 4.7667518938427534e-06, + "loss": 1.2088, + "step": 2108 + }, + { + "epoch": 2.8200668896321073, + "grad_norm": 0.9112713596825163, + "learning_rate": 4.762090420881289e-06, + "loss": 1.124, + "step": 2109 + }, + { + "epoch": 2.8214046822742476, + "grad_norm": 1.0420582868732093, + "learning_rate": 4.757429155164518e-06, + "loss": 1.0515, + "step": 2110 + }, + { + "epoch": 2.822742474916388, + "grad_norm": 1.0448694515731616, + "learning_rate": 4.752768100752902e-06, + "loss": 1.0361, + "step": 2111 + }, + { + "epoch": 2.8240802675585286, + "grad_norm": 1.445743913183871, + "learning_rate": 4.748107261706716e-06, + "loss": 1.0315, + "step": 2112 + }, + { + "epoch": 2.825418060200669, + "grad_norm": 0.7854518220813301, + "learning_rate": 4.7434466420860515e-06, + "loss": 1.03, + "step": 2113 + }, + { + "epoch": 2.826755852842809, + "grad_norm": 0.7483648534180719, + "learning_rate": 4.7387862459508074e-06, + "loss": 1.102, + "step": 2114 + }, + { + "epoch": 2.82809364548495, + "grad_norm": 1.0458592018117638, + "learning_rate": 4.734126077360685e-06, + "loss": 1.0977, + "step": 2115 + }, + { + "epoch": 2.82943143812709, + "grad_norm": 0.9413591480867122, + "learning_rate": 4.729466140375192e-06, + "loss": 1.0526, + "step": 2116 + }, + { + "epoch": 2.830769230769231, + "grad_norm": 0.8986900203820658, + "learning_rate": 4.724806439053629e-06, + "loss": 1.2321, + "step": 2117 + }, + { + "epoch": 2.832107023411371, + "grad_norm": 
0.8887313261858344, + "learning_rate": 4.720146977455098e-06, + "loss": 1.122, + "step": 2118 + }, + { + "epoch": 2.833444816053512, + "grad_norm": 0.9520435187302793, + "learning_rate": 4.715487759638486e-06, + "loss": 1.0654, + "step": 2119 + }, + { + "epoch": 2.8347826086956522, + "grad_norm": 1.158053106813478, + "learning_rate": 4.7108287896624695e-06, + "loss": 1.2001, + "step": 2120 + }, + { + "epoch": 2.8361204013377925, + "grad_norm": 0.8096225008016036, + "learning_rate": 4.706170071585513e-06, + "loss": 1.1602, + "step": 2121 + }, + { + "epoch": 2.8374581939799333, + "grad_norm": 1.0037253564116744, + "learning_rate": 4.7015116094658544e-06, + "loss": 0.9834, + "step": 2122 + }, + { + "epoch": 2.8387959866220736, + "grad_norm": 0.9632825241221348, + "learning_rate": 4.6968534073615145e-06, + "loss": 0.989, + "step": 2123 + }, + { + "epoch": 2.840133779264214, + "grad_norm": 0.8564040436380109, + "learning_rate": 4.692195469330286e-06, + "loss": 1.0627, + "step": 2124 + }, + { + "epoch": 2.8414715719063546, + "grad_norm": 1.0202040390983715, + "learning_rate": 4.687537799429731e-06, + "loss": 1.3484, + "step": 2125 + }, + { + "epoch": 2.842809364548495, + "grad_norm": 0.9410581403527539, + "learning_rate": 4.682880401717178e-06, + "loss": 1.1732, + "step": 2126 + }, + { + "epoch": 2.8441471571906356, + "grad_norm": 1.7849825678729567, + "learning_rate": 4.678223280249718e-06, + "loss": 1.1324, + "step": 2127 + }, + { + "epoch": 2.845484949832776, + "grad_norm": 0.8828706020335534, + "learning_rate": 4.673566439084204e-06, + "loss": 1.182, + "step": 2128 + }, + { + "epoch": 2.8468227424749166, + "grad_norm": 0.8792409937240653, + "learning_rate": 4.66890988227724e-06, + "loss": 0.9066, + "step": 2129 + }, + { + "epoch": 2.848160535117057, + "grad_norm": 0.8383169551060666, + "learning_rate": 4.664253613885187e-06, + "loss": 1.1841, + "step": 2130 + }, + { + "epoch": 2.849498327759197, + "grad_norm": 1.1194342923167968, + "learning_rate": 4.659597637964153e-06, + "loss": 1.2173, + "step": 2131 + }, + { + "epoch": 2.850836120401338, + "grad_norm": 0.9524753756820652, + "learning_rate": 4.65494195856999e-06, + "loss": 1.4295, + "step": 2132 + }, + { + "epoch": 2.8521739130434782, + "grad_norm": 0.8652111878730795, + "learning_rate": 4.650286579758291e-06, + "loss": 1.1826, + "step": 2133 + }, + { + "epoch": 2.8535117056856185, + "grad_norm": 0.7667290855922078, + "learning_rate": 4.645631505584393e-06, + "loss": 0.9678, + "step": 2134 + }, + { + "epoch": 2.8548494983277592, + "grad_norm": 1.0841658256229072, + "learning_rate": 4.640976740103363e-06, + "loss": 1.2886, + "step": 2135 + }, + { + "epoch": 2.8561872909698995, + "grad_norm": 0.8602227083296664, + "learning_rate": 4.636322287369997e-06, + "loss": 1.0664, + "step": 2136 + }, + { + "epoch": 2.8575250836120403, + "grad_norm": 0.8999679503160884, + "learning_rate": 4.6316681514388235e-06, + "loss": 1.1531, + "step": 2137 + }, + { + "epoch": 2.8588628762541806, + "grad_norm": 1.0119095691300843, + "learning_rate": 4.6270143363640914e-06, + "loss": 1.1461, + "step": 2138 + }, + { + "epoch": 2.8602006688963213, + "grad_norm": 0.9974463496048587, + "learning_rate": 4.622360846199772e-06, + "loss": 1.3389, + "step": 2139 + }, + { + "epoch": 2.8615384615384616, + "grad_norm": 0.941531576792948, + "learning_rate": 4.617707684999554e-06, + "loss": 1.139, + "step": 2140 + }, + { + "epoch": 2.862876254180602, + "grad_norm": 0.9518470431402013, + "learning_rate": 4.613054856816837e-06, + "loss": 0.9754, + "step": 2141 + }, + { + "epoch": 
2.8642140468227426, + "grad_norm": 0.9170766139110639, + "learning_rate": 4.608402365704734e-06, + "loss": 0.9711, + "step": 2142 + }, + { + "epoch": 2.865551839464883, + "grad_norm": 1.0011023044050802, + "learning_rate": 4.603750215716057e-06, + "loss": 1.1128, + "step": 2143 + }, + { + "epoch": 2.866889632107023, + "grad_norm": 1.1009536559552826, + "learning_rate": 4.599098410903334e-06, + "loss": 1.2473, + "step": 2144 + }, + { + "epoch": 2.868227424749164, + "grad_norm": 1.036792799630238, + "learning_rate": 4.594446955318781e-06, + "loss": 1.0353, + "step": 2145 + }, + { + "epoch": 2.869565217391304, + "grad_norm": 0.8729876989196698, + "learning_rate": 4.589795853014313e-06, + "loss": 1.0781, + "step": 2146 + }, + { + "epoch": 2.870903010033445, + "grad_norm": 0.8542311889002614, + "learning_rate": 4.585145108041538e-06, + "loss": 1.2337, + "step": 2147 + }, + { + "epoch": 2.8722408026755852, + "grad_norm": 0.8571663180969143, + "learning_rate": 4.580494724451752e-06, + "loss": 1.0548, + "step": 2148 + }, + { + "epoch": 2.873578595317726, + "grad_norm": 1.4335815324315766, + "learning_rate": 4.575844706295938e-06, + "loss": 1.2073, + "step": 2149 + }, + { + "epoch": 2.8749163879598663, + "grad_norm": 1.1962507541783691, + "learning_rate": 4.5711950576247585e-06, + "loss": 1.1324, + "step": 2150 + }, + { + "epoch": 2.8762541806020065, + "grad_norm": 0.9941310324419463, + "learning_rate": 4.566545782488554e-06, + "loss": 1.288, + "step": 2151 + }, + { + "epoch": 2.8775919732441473, + "grad_norm": 1.042837408284672, + "learning_rate": 4.5618968849373415e-06, + "loss": 1.1436, + "step": 2152 + }, + { + "epoch": 2.8789297658862876, + "grad_norm": 1.1641028298508964, + "learning_rate": 4.557248369020806e-06, + "loss": 1.1475, + "step": 2153 + }, + { + "epoch": 2.880267558528428, + "grad_norm": 0.9646587410937738, + "learning_rate": 4.552600238788306e-06, + "loss": 1.0992, + "step": 2154 + }, + { + "epoch": 2.8816053511705686, + "grad_norm": 0.8841533873070887, + "learning_rate": 4.5479524982888575e-06, + "loss": 1.1361, + "step": 2155 + }, + { + "epoch": 2.882943143812709, + "grad_norm": 0.7650024136169764, + "learning_rate": 4.543305151571141e-06, + "loss": 1.0967, + "step": 2156 + }, + { + "epoch": 2.8842809364548496, + "grad_norm": 0.8782045814555667, + "learning_rate": 4.53865820268349e-06, + "loss": 0.9834, + "step": 2157 + }, + { + "epoch": 2.88561872909699, + "grad_norm": 0.9327264143836923, + "learning_rate": 4.534011655673898e-06, + "loss": 1.0891, + "step": 2158 + }, + { + "epoch": 2.8869565217391306, + "grad_norm": 0.9487289847599751, + "learning_rate": 4.529365514590002e-06, + "loss": 1.1293, + "step": 2159 + }, + { + "epoch": 2.888294314381271, + "grad_norm": 0.8562546062772746, + "learning_rate": 4.524719783479088e-06, + "loss": 1.0811, + "step": 2160 + }, + { + "epoch": 2.8896321070234112, + "grad_norm": 0.8827019479767941, + "learning_rate": 4.5200744663880856e-06, + "loss": 1.0643, + "step": 2161 + }, + { + "epoch": 2.890969899665552, + "grad_norm": 0.981425469308819, + "learning_rate": 4.515429567363562e-06, + "loss": 1.0737, + "step": 2162 + }, + { + "epoch": 2.8923076923076922, + "grad_norm": 1.0123527445468579, + "learning_rate": 4.510785090451719e-06, + "loss": 0.9851, + "step": 2163 + }, + { + "epoch": 2.8936454849498325, + "grad_norm": 1.0022280371339727, + "learning_rate": 4.506141039698398e-06, + "loss": 1.0724, + "step": 2164 + }, + { + "epoch": 2.8949832775919733, + "grad_norm": 0.8876304155081821, + "learning_rate": 4.501497419149062e-06, + "loss": 1.1286, + 
"step": 2165 + }, + { + "epoch": 2.8963210702341136, + "grad_norm": 0.8691360987001285, + "learning_rate": 4.4968542328488e-06, + "loss": 1.1994, + "step": 2166 + }, + { + "epoch": 2.8976588628762543, + "grad_norm": 0.9183580119529527, + "learning_rate": 4.492211484842324e-06, + "loss": 1.217, + "step": 2167 + }, + { + "epoch": 2.8989966555183946, + "grad_norm": 0.935541071363732, + "learning_rate": 4.4875691791739655e-06, + "loss": 1.0468, + "step": 2168 + }, + { + "epoch": 2.9003344481605353, + "grad_norm": 0.8833094023088488, + "learning_rate": 4.482927319887669e-06, + "loss": 1.1869, + "step": 2169 + }, + { + "epoch": 2.9016722408026756, + "grad_norm": 0.824553412399379, + "learning_rate": 4.478285911026989e-06, + "loss": 1.0341, + "step": 2170 + }, + { + "epoch": 2.903010033444816, + "grad_norm": 0.9823927505031895, + "learning_rate": 4.4736449566350924e-06, + "loss": 1.214, + "step": 2171 + }, + { + "epoch": 2.9043478260869566, + "grad_norm": 1.0260755629745997, + "learning_rate": 4.469004460754743e-06, + "loss": 1.1153, + "step": 2172 + }, + { + "epoch": 2.905685618729097, + "grad_norm": 1.0067868967112479, + "learning_rate": 4.46436442742831e-06, + "loss": 1.0989, + "step": 2173 + }, + { + "epoch": 2.907023411371237, + "grad_norm": 0.8559716061291618, + "learning_rate": 4.45972486069776e-06, + "loss": 1.1049, + "step": 2174 + }, + { + "epoch": 2.908361204013378, + "grad_norm": 0.8705342800534841, + "learning_rate": 4.455085764604653e-06, + "loss": 1.0901, + "step": 2175 + }, + { + "epoch": 2.9096989966555182, + "grad_norm": 0.9035713341873931, + "learning_rate": 4.450447143190136e-06, + "loss": 1.186, + "step": 2176 + }, + { + "epoch": 2.911036789297659, + "grad_norm": 1.0518791413552853, + "learning_rate": 4.445809000494945e-06, + "loss": 1.1297, + "step": 2177 + }, + { + "epoch": 2.9123745819397993, + "grad_norm": 0.8616126078037083, + "learning_rate": 4.441171340559399e-06, + "loss": 1.0708, + "step": 2178 + }, + { + "epoch": 2.91371237458194, + "grad_norm": 0.8452590689449315, + "learning_rate": 4.436534167423395e-06, + "loss": 1.1389, + "step": 2179 + }, + { + "epoch": 2.9150501672240803, + "grad_norm": 0.895902261388014, + "learning_rate": 4.431897485126408e-06, + "loss": 1.1529, + "step": 2180 + }, + { + "epoch": 2.9163879598662206, + "grad_norm": 0.8644893322491582, + "learning_rate": 4.427261297707482e-06, + "loss": 0.9506, + "step": 2181 + }, + { + "epoch": 2.9177257525083613, + "grad_norm": 0.8095020171094479, + "learning_rate": 4.422625609205235e-06, + "loss": 1.209, + "step": 2182 + }, + { + "epoch": 2.9190635451505016, + "grad_norm": 0.8402305685317917, + "learning_rate": 4.417990423657845e-06, + "loss": 1.2986, + "step": 2183 + }, + { + "epoch": 2.920401337792642, + "grad_norm": 1.184317505022654, + "learning_rate": 4.413355745103057e-06, + "loss": 1.0941, + "step": 2184 + }, + { + "epoch": 2.9217391304347826, + "grad_norm": 1.2997594005502948, + "learning_rate": 4.40872157757817e-06, + "loss": 1.1451, + "step": 2185 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.8179823643618449, + "learning_rate": 4.404087925120041e-06, + "loss": 1.3781, + "step": 2186 + }, + { + "epoch": 2.9244147157190636, + "grad_norm": 1.2583816755537756, + "learning_rate": 4.399454791765076e-06, + "loss": 1.2044, + "step": 2187 + }, + { + "epoch": 2.925752508361204, + "grad_norm": 0.8303684693588772, + "learning_rate": 4.3948221815492294e-06, + "loss": 1.1057, + "step": 2188 + }, + { + "epoch": 2.9270903010033447, + "grad_norm": 0.8340215118979083, + "learning_rate": 
4.390190098508001e-06, + "loss": 1.1526, + "step": 2189 + }, + { + "epoch": 2.928428093645485, + "grad_norm": 0.9105174656600764, + "learning_rate": 4.3855585466764305e-06, + "loss": 0.8236, + "step": 2190 + }, + { + "epoch": 2.9297658862876252, + "grad_norm": 0.8762150263828394, + "learning_rate": 4.3809275300890956e-06, + "loss": 1.128, + "step": 2191 + }, + { + "epoch": 2.931103678929766, + "grad_norm": 0.8246095575765093, + "learning_rate": 4.376297052780106e-06, + "loss": 1.0531, + "step": 2192 + }, + { + "epoch": 2.9324414715719063, + "grad_norm": 1.0207981369367165, + "learning_rate": 4.371667118783101e-06, + "loss": 1.1355, + "step": 2193 + }, + { + "epoch": 2.9337792642140466, + "grad_norm": 0.7598006676238993, + "learning_rate": 4.367037732131254e-06, + "loss": 1.0477, + "step": 2194 + }, + { + "epoch": 2.9351170568561873, + "grad_norm": 0.7858608814184506, + "learning_rate": 4.362408896857251e-06, + "loss": 0.9812, + "step": 2195 + }, + { + "epoch": 2.936454849498328, + "grad_norm": 0.793959062901973, + "learning_rate": 4.357780616993305e-06, + "loss": 1.2113, + "step": 2196 + }, + { + "epoch": 2.9377926421404683, + "grad_norm": 1.0416726945070915, + "learning_rate": 4.353152896571143e-06, + "loss": 1.3138, + "step": 2197 + }, + { + "epoch": 2.9391304347826086, + "grad_norm": 0.9360352082638791, + "learning_rate": 4.348525739622003e-06, + "loss": 1.1984, + "step": 2198 + }, + { + "epoch": 2.9404682274247493, + "grad_norm": 1.0641711242708116, + "learning_rate": 4.343899150176635e-06, + "loss": 1.1275, + "step": 2199 + }, + { + "epoch": 2.9418060200668896, + "grad_norm": 1.123497984554877, + "learning_rate": 4.339273132265294e-06, + "loss": 1.2035, + "step": 2200 + }, + { + "epoch": 2.94314381270903, + "grad_norm": 1.4701432950290958, + "learning_rate": 4.334647689917734e-06, + "loss": 0.9786, + "step": 2201 + }, + { + "epoch": 2.9444816053511706, + "grad_norm": 0.9503440156399078, + "learning_rate": 4.3300228271632105e-06, + "loss": 1.1389, + "step": 2202 + }, + { + "epoch": 2.945819397993311, + "grad_norm": 0.7966976314849163, + "learning_rate": 4.325398548030473e-06, + "loss": 1.1919, + "step": 2203 + }, + { + "epoch": 2.9471571906354512, + "grad_norm": 0.9266777261668699, + "learning_rate": 4.320774856547767e-06, + "loss": 1.1746, + "step": 2204 + }, + { + "epoch": 2.948494983277592, + "grad_norm": 1.0336560394026026, + "learning_rate": 4.316151756742821e-06, + "loss": 1.1252, + "step": 2205 + }, + { + "epoch": 2.9498327759197327, + "grad_norm": 1.0582056458613631, + "learning_rate": 4.311529252642848e-06, + "loss": 1.4062, + "step": 2206 + }, + { + "epoch": 2.951170568561873, + "grad_norm": 0.8274283518578006, + "learning_rate": 4.306907348274545e-06, + "loss": 1.1824, + "step": 2207 + }, + { + "epoch": 2.9525083612040133, + "grad_norm": 1.0377969678977625, + "learning_rate": 4.302286047664083e-06, + "loss": 1.121, + "step": 2208 + }, + { + "epoch": 2.953846153846154, + "grad_norm": 0.9666764272572406, + "learning_rate": 4.2976653548371115e-06, + "loss": 1.1787, + "step": 2209 + }, + { + "epoch": 2.9551839464882943, + "grad_norm": 0.9594875149568958, + "learning_rate": 4.293045273818748e-06, + "loss": 1.1469, + "step": 2210 + }, + { + "epoch": 2.9565217391304346, + "grad_norm": 0.7999529230037545, + "learning_rate": 4.2884258086335755e-06, + "loss": 1.1454, + "step": 2211 + }, + { + "epoch": 2.9578595317725753, + "grad_norm": 0.9115097202833098, + "learning_rate": 4.283806963305644e-06, + "loss": 0.958, + "step": 2212 + }, + { + "epoch": 2.9591973244147156, + "grad_norm": 
1.4693397282484983, + "learning_rate": 4.27918874185846e-06, + "loss": 1.0422, + "step": 2213 + }, + { + "epoch": 2.9605351170568563, + "grad_norm": 1.0514666693113148, + "learning_rate": 4.274571148314991e-06, + "loss": 1.3227, + "step": 2214 + }, + { + "epoch": 2.9618729096989966, + "grad_norm": 1.074862694448596, + "learning_rate": 4.269954186697654e-06, + "loss": 1.2093, + "step": 2215 + }, + { + "epoch": 2.9632107023411374, + "grad_norm": 1.020809909652327, + "learning_rate": 4.265337861028316e-06, + "loss": 0.9387, + "step": 2216 + }, + { + "epoch": 2.9645484949832777, + "grad_norm": 0.8717516727612477, + "learning_rate": 4.26072217532829e-06, + "loss": 1.1718, + "step": 2217 + }, + { + "epoch": 2.965886287625418, + "grad_norm": 0.8892121355667363, + "learning_rate": 4.256107133618333e-06, + "loss": 0.9895, + "step": 2218 + }, + { + "epoch": 2.9672240802675587, + "grad_norm": 1.1721571293521087, + "learning_rate": 4.251492739918641e-06, + "loss": 0.9993, + "step": 2219 + }, + { + "epoch": 2.968561872909699, + "grad_norm": 1.1472146146884303, + "learning_rate": 4.2468789982488415e-06, + "loss": 1.1442, + "step": 2220 + }, + { + "epoch": 2.9698996655518393, + "grad_norm": 1.2336811893631376, + "learning_rate": 4.242265912628e-06, + "loss": 0.9692, + "step": 2221 + }, + { + "epoch": 2.97123745819398, + "grad_norm": 1.0096178161652614, + "learning_rate": 4.2376534870746054e-06, + "loss": 1.2029, + "step": 2222 + }, + { + "epoch": 2.9725752508361203, + "grad_norm": 1.29767554307873, + "learning_rate": 4.233041725606573e-06, + "loss": 1.1197, + "step": 2223 + }, + { + "epoch": 2.973913043478261, + "grad_norm": 1.3671305454568599, + "learning_rate": 4.228430632241244e-06, + "loss": 0.995, + "step": 2224 + }, + { + "epoch": 2.9752508361204013, + "grad_norm": 0.8951366252840458, + "learning_rate": 4.223820210995372e-06, + "loss": 0.9386, + "step": 2225 + }, + { + "epoch": 2.976588628762542, + "grad_norm": 0.9639097967250406, + "learning_rate": 4.219210465885127e-06, + "loss": 1.0005, + "step": 2226 + }, + { + "epoch": 2.9779264214046823, + "grad_norm": 0.8663094118321276, + "learning_rate": 4.21460140092609e-06, + "loss": 1.114, + "step": 2227 + }, + { + "epoch": 2.9792642140468226, + "grad_norm": 0.9712388993521819, + "learning_rate": 4.209993020133251e-06, + "loss": 1.1663, + "step": 2228 + }, + { + "epoch": 2.9806020066889634, + "grad_norm": 0.8963611246811861, + "learning_rate": 4.205385327521002e-06, + "loss": 1.2106, + "step": 2229 + }, + { + "epoch": 2.9819397993311036, + "grad_norm": 1.0479967412066196, + "learning_rate": 4.200778327103134e-06, + "loss": 1.1066, + "step": 2230 + }, + { + "epoch": 2.983277591973244, + "grad_norm": 0.9323225182624362, + "learning_rate": 4.19617202289284e-06, + "loss": 1.1429, + "step": 2231 + }, + { + "epoch": 2.9846153846153847, + "grad_norm": 1.0649564677179955, + "learning_rate": 4.191566418902701e-06, + "loss": 1.2245, + "step": 2232 + }, + { + "epoch": 2.985953177257525, + "grad_norm": 1.2232308656973945, + "learning_rate": 4.1869615191446925e-06, + "loss": 1.0618, + "step": 2233 + }, + { + "epoch": 2.9872909698996657, + "grad_norm": 1.0014492731930904, + "learning_rate": 4.182357327630175e-06, + "loss": 1.0222, + "step": 2234 + }, + { + "epoch": 2.988628762541806, + "grad_norm": 1.0267524374711503, + "learning_rate": 4.177753848369892e-06, + "loss": 1.3143, + "step": 2235 + }, + { + "epoch": 2.9899665551839467, + "grad_norm": 0.9897136472480529, + "learning_rate": 4.173151085373966e-06, + "loss": 1.1997, + "step": 2236 + }, + { + "epoch": 
2.991304347826087, + "grad_norm": 1.0154155181220557, + "learning_rate": 4.168549042651896e-06, + "loss": 1.3118, + "step": 2237 + }, + { + "epoch": 2.9926421404682273, + "grad_norm": 0.8673483405651217, + "learning_rate": 4.163947724212553e-06, + "loss": 1.0568, + "step": 2238 + }, + { + "epoch": 2.993979933110368, + "grad_norm": 0.8696364843925039, + "learning_rate": 4.159347134064177e-06, + "loss": 1.2282, + "step": 2239 + }, + { + "epoch": 2.9953177257525083, + "grad_norm": 1.1253944348280127, + "learning_rate": 4.154747276214377e-06, + "loss": 1.1441, + "step": 2240 + }, + { + "epoch": 2.9966555183946486, + "grad_norm": 0.9749294672159105, + "learning_rate": 4.1501481546701185e-06, + "loss": 1.4039, + "step": 2241 + }, + { + "epoch": 2.9979933110367893, + "grad_norm": 0.871627905068237, + "learning_rate": 4.145549773437728e-06, + "loss": 1.3321, + "step": 2242 + }, + { + "epoch": 2.9993311036789296, + "grad_norm": 0.8440755865109467, + "learning_rate": 4.140952136522889e-06, + "loss": 1.077, + "step": 2243 + }, + { + "epoch": 3.0, + "grad_norm": 0.8440755865109467, + "learning_rate": 4.136355247930636e-06, + "loss": 1.1354, + "step": 2244 + }, + { + "epoch": 3.0013377926421403, + "grad_norm": 1.4017229827398778, + "learning_rate": 4.131759111665349e-06, + "loss": 1.1812, + "step": 2245 + }, + { + "epoch": 3.002675585284281, + "grad_norm": 1.1065537502246596, + "learning_rate": 4.127163731730755e-06, + "loss": 1.2328, + "step": 2246 + }, + { + "epoch": 3.0040133779264213, + "grad_norm": 0.819824704412835, + "learning_rate": 4.12256911212992e-06, + "loss": 1.0368, + "step": 2247 + }, + { + "epoch": 3.005351170568562, + "grad_norm": 0.9462537803307012, + "learning_rate": 4.117975256865252e-06, + "loss": 0.9871, + "step": 2248 + }, + { + "epoch": 3.0066889632107023, + "grad_norm": 1.0426036880218121, + "learning_rate": 4.113382169938488e-06, + "loss": 1.118, + "step": 2249 + }, + { + "epoch": 3.0080267558528426, + "grad_norm": 0.6800627483903804, + "learning_rate": 4.108789855350699e-06, + "loss": 1.1468, + "step": 2250 + }, + { + "epoch": 3.0093645484949834, + "grad_norm": 1.0557855017996707, + "learning_rate": 4.104198317102283e-06, + "loss": 1.036, + "step": 2251 + }, + { + "epoch": 3.0107023411371236, + "grad_norm": 1.0447952452491283, + "learning_rate": 4.099607559192959e-06, + "loss": 1.075, + "step": 2252 + }, + { + "epoch": 3.0120401337792644, + "grad_norm": 1.0115770255613137, + "learning_rate": 4.095017585621767e-06, + "loss": 0.9468, + "step": 2253 + }, + { + "epoch": 3.0133779264214047, + "grad_norm": 1.1532741217245241, + "learning_rate": 4.090428400387071e-06, + "loss": 1.1201, + "step": 2254 + }, + { + "epoch": 3.014715719063545, + "grad_norm": 1.0662486461564682, + "learning_rate": 4.0858400074865364e-06, + "loss": 1.083, + "step": 2255 + }, + { + "epoch": 3.0160535117056857, + "grad_norm": 0.8563036453985883, + "learning_rate": 4.081252410917148e-06, + "loss": 1.188, + "step": 2256 + }, + { + "epoch": 3.017391304347826, + "grad_norm": 0.9831030473721777, + "learning_rate": 4.076665614675191e-06, + "loss": 0.9494, + "step": 2257 + }, + { + "epoch": 3.0187290969899667, + "grad_norm": 1.0217892949775056, + "learning_rate": 4.0720796227562585e-06, + "loss": 1.0808, + "step": 2258 + }, + { + "epoch": 3.020066889632107, + "grad_norm": 0.9011287596352604, + "learning_rate": 4.067494439155236e-06, + "loss": 0.9405, + "step": 2259 + }, + { + "epoch": 3.0214046822742473, + "grad_norm": 0.8899789764386151, + "learning_rate": 4.0629100678663104e-06, + "loss": 1.0879, + "step": 2260 + 
}, + { + "epoch": 3.022742474916388, + "grad_norm": 1.060744425215772, + "learning_rate": 4.05832651288296e-06, + "loss": 0.9137, + "step": 2261 + }, + { + "epoch": 3.0240802675585283, + "grad_norm": 1.117867410282552, + "learning_rate": 4.053743778197951e-06, + "loss": 1.1226, + "step": 2262 + }, + { + "epoch": 3.025418060200669, + "grad_norm": 0.8476982212358644, + "learning_rate": 4.049161867803334e-06, + "loss": 1.067, + "step": 2263 + }, + { + "epoch": 3.0267558528428093, + "grad_norm": 0.833964678707926, + "learning_rate": 4.0445807856904465e-06, + "loss": 0.9518, + "step": 2264 + }, + { + "epoch": 3.0280936454849496, + "grad_norm": 0.8324739396014376, + "learning_rate": 4.0400005358499e-06, + "loss": 1.1053, + "step": 2265 + }, + { + "epoch": 3.0294314381270904, + "grad_norm": 0.9560564180766438, + "learning_rate": 4.035421122271581e-06, + "loss": 1.2055, + "step": 2266 + }, + { + "epoch": 3.0307692307692307, + "grad_norm": 1.0601856519851747, + "learning_rate": 4.03084254894465e-06, + "loss": 1.2792, + "step": 2267 + }, + { + "epoch": 3.0321070234113714, + "grad_norm": 1.05416259426822, + "learning_rate": 4.026264819857533e-06, + "loss": 1.1644, + "step": 2268 + }, + { + "epoch": 3.0334448160535117, + "grad_norm": 0.9119808254947183, + "learning_rate": 4.021687938997923e-06, + "loss": 0.9586, + "step": 2269 + }, + { + "epoch": 3.034782608695652, + "grad_norm": 0.8104200370135146, + "learning_rate": 4.017111910352771e-06, + "loss": 1.2303, + "step": 2270 + }, + { + "epoch": 3.0361204013377927, + "grad_norm": 1.0646147072922671, + "learning_rate": 4.012536737908288e-06, + "loss": 1.0035, + "step": 2271 + }, + { + "epoch": 3.037458193979933, + "grad_norm": 1.1473191195606545, + "learning_rate": 4.007962425649939e-06, + "loss": 0.9707, + "step": 2272 + }, + { + "epoch": 3.0387959866220737, + "grad_norm": 1.2203324698237419, + "learning_rate": 4.003388977562439e-06, + "loss": 1.0479, + "step": 2273 + }, + { + "epoch": 3.040133779264214, + "grad_norm": 1.0093482967404146, + "learning_rate": 3.998816397629752e-06, + "loss": 1.1343, + "step": 2274 + }, + { + "epoch": 3.0414715719063543, + "grad_norm": 1.2126555211415382, + "learning_rate": 3.994244689835083e-06, + "loss": 1.0192, + "step": 2275 + }, + { + "epoch": 3.042809364548495, + "grad_norm": 0.8882216902310184, + "learning_rate": 3.989673858160878e-06, + "loss": 0.9458, + "step": 2276 + }, + { + "epoch": 3.0441471571906353, + "grad_norm": 0.8728853659426121, + "learning_rate": 3.985103906588821e-06, + "loss": 0.9777, + "step": 2277 + }, + { + "epoch": 3.045484949832776, + "grad_norm": 1.1241767269432428, + "learning_rate": 3.980534839099829e-06, + "loss": 1.1017, + "step": 2278 + }, + { + "epoch": 3.0468227424749164, + "grad_norm": 1.1836473483519447, + "learning_rate": 3.975966659674048e-06, + "loss": 0.845, + "step": 2279 + }, + { + "epoch": 3.0481605351170566, + "grad_norm": 1.4932681216252068, + "learning_rate": 3.971399372290851e-06, + "loss": 0.9373, + "step": 2280 + }, + { + "epoch": 3.0494983277591974, + "grad_norm": 1.026571728209308, + "learning_rate": 3.966832980928834e-06, + "loss": 0.9622, + "step": 2281 + }, + { + "epoch": 3.0508361204013377, + "grad_norm": 0.8748247941909988, + "learning_rate": 3.962267489565813e-06, + "loss": 1.05, + "step": 2282 + }, + { + "epoch": 3.0521739130434784, + "grad_norm": 0.8408430033089229, + "learning_rate": 3.957702902178816e-06, + "loss": 0.9495, + "step": 2283 + }, + { + "epoch": 3.0535117056856187, + "grad_norm": 0.8897078289909702, + "learning_rate": 3.953139222744093e-06, + "loss": 
0.9325, + "step": 2284 + }, + { + "epoch": 3.054849498327759, + "grad_norm": 0.9220893357155554, + "learning_rate": 3.9485764552370934e-06, + "loss": 1.1458, + "step": 2285 + }, + { + "epoch": 3.0561872909698997, + "grad_norm": 0.9056637481214719, + "learning_rate": 3.944014603632476e-06, + "loss": 1.1127, + "step": 2286 + }, + { + "epoch": 3.05752508361204, + "grad_norm": 0.94054973071831, + "learning_rate": 3.939453671904101e-06, + "loss": 1.1001, + "step": 2287 + }, + { + "epoch": 3.0588628762541807, + "grad_norm": 0.8373084249105727, + "learning_rate": 3.93489366402503e-06, + "loss": 0.9797, + "step": 2288 + }, + { + "epoch": 3.060200668896321, + "grad_norm": 1.3983528780568955, + "learning_rate": 3.930334583967514e-06, + "loss": 1.1729, + "step": 2289 + }, + { + "epoch": 3.0615384615384613, + "grad_norm": 0.8628692156299802, + "learning_rate": 3.9257764357030025e-06, + "loss": 0.9811, + "step": 2290 + }, + { + "epoch": 3.062876254180602, + "grad_norm": 0.8449268392444776, + "learning_rate": 3.92121922320213e-06, + "loss": 1.1102, + "step": 2291 + }, + { + "epoch": 3.0642140468227423, + "grad_norm": 0.9562375906228393, + "learning_rate": 3.916662950434714e-06, + "loss": 0.9683, + "step": 2292 + }, + { + "epoch": 3.065551839464883, + "grad_norm": 1.0146154257926352, + "learning_rate": 3.912107621369755e-06, + "loss": 1.0848, + "step": 2293 + }, + { + "epoch": 3.0668896321070234, + "grad_norm": 0.6429801546047759, + "learning_rate": 3.907553239975437e-06, + "loss": 1.1111, + "step": 2294 + }, + { + "epoch": 3.068227424749164, + "grad_norm": 0.895182133526376, + "learning_rate": 3.902999810219109e-06, + "loss": 0.9794, + "step": 2295 + }, + { + "epoch": 3.0695652173913044, + "grad_norm": 0.9492931411610043, + "learning_rate": 3.898447336067297e-06, + "loss": 1.0902, + "step": 2296 + }, + { + "epoch": 3.0709030100334447, + "grad_norm": 1.0821546529703157, + "learning_rate": 3.893895821485692e-06, + "loss": 1.402, + "step": 2297 + }, + { + "epoch": 3.0722408026755854, + "grad_norm": 0.9648492628034739, + "learning_rate": 3.889345270439152e-06, + "loss": 1.0933, + "step": 2298 + }, + { + "epoch": 3.0735785953177257, + "grad_norm": 1.1032922432229433, + "learning_rate": 3.884795686891692e-06, + "loss": 1.0671, + "step": 2299 + }, + { + "epoch": 3.074916387959866, + "grad_norm": 0.8484966208309571, + "learning_rate": 3.8802470748064855e-06, + "loss": 1.1069, + "step": 2300 + }, + { + "epoch": 3.0762541806020067, + "grad_norm": 0.9820630040465695, + "learning_rate": 3.875699438145862e-06, + "loss": 0.8698, + "step": 2301 + }, + { + "epoch": 3.077591973244147, + "grad_norm": 0.9469733263683136, + "learning_rate": 3.871152780871298e-06, + "loss": 0.9496, + "step": 2302 + }, + { + "epoch": 3.0789297658862878, + "grad_norm": 0.8803404737153668, + "learning_rate": 3.866607106943418e-06, + "loss": 1.1284, + "step": 2303 + }, + { + "epoch": 3.080267558528428, + "grad_norm": 0.8750513045298085, + "learning_rate": 3.862062420321993e-06, + "loss": 1.3284, + "step": 2304 + }, + { + "epoch": 3.0816053511705688, + "grad_norm": 0.9859969900481739, + "learning_rate": 3.857518724965929e-06, + "loss": 1.1232, + "step": 2305 + }, + { + "epoch": 3.082943143812709, + "grad_norm": 0.7540198826336898, + "learning_rate": 3.852976024833271e-06, + "loss": 1.246, + "step": 2306 + }, + { + "epoch": 3.0842809364548494, + "grad_norm": 0.8354396963154893, + "learning_rate": 3.8484343238811976e-06, + "loss": 1.0495, + "step": 2307 + }, + { + "epoch": 3.08561872909699, + "grad_norm": 1.0151260050231805, + "learning_rate": 
3.8438936260660145e-06, + "loss": 1.0611, + "step": 2308 + }, + { + "epoch": 3.0869565217391304, + "grad_norm": 0.9242641188811366, + "learning_rate": 3.839353935343156e-06, + "loss": 1.0435, + "step": 2309 + }, + { + "epoch": 3.088294314381271, + "grad_norm": 1.0533239600738828, + "learning_rate": 3.834815255667179e-06, + "loss": 0.8692, + "step": 2310 + }, + { + "epoch": 3.0896321070234114, + "grad_norm": 1.0364565211808436, + "learning_rate": 3.8302775909917585e-06, + "loss": 1.089, + "step": 2311 + }, + { + "epoch": 3.0909698996655517, + "grad_norm": 1.1825499263657415, + "learning_rate": 3.8257409452696845e-06, + "loss": 1.3622, + "step": 2312 + }, + { + "epoch": 3.0923076923076924, + "grad_norm": 0.8804730586018997, + "learning_rate": 3.821205322452863e-06, + "loss": 1.0033, + "step": 2313 + }, + { + "epoch": 3.0936454849498327, + "grad_norm": 0.9127030394758234, + "learning_rate": 3.816670726492307e-06, + "loss": 1.0866, + "step": 2314 + }, + { + "epoch": 3.0949832775919734, + "grad_norm": 0.7721816202167285, + "learning_rate": 3.812137161338133e-06, + "loss": 1.1104, + "step": 2315 + }, + { + "epoch": 3.0963210702341137, + "grad_norm": 0.7658644495896374, + "learning_rate": 3.8076046309395627e-06, + "loss": 1.0256, + "step": 2316 + }, + { + "epoch": 3.097658862876254, + "grad_norm": 0.7863627909646128, + "learning_rate": 3.803073139244913e-06, + "loss": 0.8642, + "step": 2317 + }, + { + "epoch": 3.0989966555183948, + "grad_norm": 1.0754061345679595, + "learning_rate": 3.7985426902015987e-06, + "loss": 1.0025, + "step": 2318 + }, + { + "epoch": 3.100334448160535, + "grad_norm": 1.3006144422589603, + "learning_rate": 3.794013287756125e-06, + "loss": 1.0523, + "step": 2319 + }, + { + "epoch": 3.101672240802676, + "grad_norm": 0.9175377904159328, + "learning_rate": 3.789484935854088e-06, + "loss": 1.0711, + "step": 2320 + }, + { + "epoch": 3.103010033444816, + "grad_norm": 1.3046766627010165, + "learning_rate": 3.784957638440165e-06, + "loss": 1.0243, + "step": 2321 + }, + { + "epoch": 3.1043478260869564, + "grad_norm": 0.9697555572419571, + "learning_rate": 3.7804313994581143e-06, + "loss": 0.9583, + "step": 2322 + }, + { + "epoch": 3.105685618729097, + "grad_norm": 0.9034722443796237, + "learning_rate": 3.775906222850778e-06, + "loss": 1.1131, + "step": 2323 + }, + { + "epoch": 3.1070234113712374, + "grad_norm": 0.8271421765693503, + "learning_rate": 3.7713821125600687e-06, + "loss": 0.9717, + "step": 2324 + }, + { + "epoch": 3.108361204013378, + "grad_norm": 0.89153016415846, + "learning_rate": 3.766859072526969e-06, + "loss": 1.3056, + "step": 2325 + }, + { + "epoch": 3.1096989966555184, + "grad_norm": 1.1169165071244875, + "learning_rate": 3.7623371066915305e-06, + "loss": 1.048, + "step": 2326 + }, + { + "epoch": 3.1110367892976587, + "grad_norm": 1.0988529099473532, + "learning_rate": 3.7578162189928696e-06, + "loss": 1.0366, + "step": 2327 + }, + { + "epoch": 3.1123745819397994, + "grad_norm": 1.0449233866952823, + "learning_rate": 3.7532964133691634e-06, + "loss": 0.9353, + "step": 2328 + }, + { + "epoch": 3.1137123745819397, + "grad_norm": 1.3471736007238158, + "learning_rate": 3.748777693757646e-06, + "loss": 0.9813, + "step": 2329 + }, + { + "epoch": 3.1150501672240805, + "grad_norm": 0.8477510609770019, + "learning_rate": 3.7442600640946045e-06, + "loss": 1.1355, + "step": 2330 + }, + { + "epoch": 3.1163879598662207, + "grad_norm": 1.0986344691996002, + "learning_rate": 3.7397435283153795e-06, + "loss": 1.0787, + "step": 2331 + }, + { + "epoch": 3.117725752508361, + 
"grad_norm": 0.8634123321760142, + "learning_rate": 3.735228090354354e-06, + "loss": 1.0116, + "step": 2332 + }, + { + "epoch": 3.1190635451505018, + "grad_norm": 0.7974224285488684, + "learning_rate": 3.730713754144961e-06, + "loss": 1.3433, + "step": 2333 + }, + { + "epoch": 3.120401337792642, + "grad_norm": 1.0039397442642357, + "learning_rate": 3.726200523619668e-06, + "loss": 0.9685, + "step": 2334 + }, + { + "epoch": 3.121739130434783, + "grad_norm": 1.076352241371438, + "learning_rate": 3.721688402709982e-06, + "loss": 1.1415, + "step": 2335 + }, + { + "epoch": 3.123076923076923, + "grad_norm": 0.9273115918474416, + "learning_rate": 3.7171773953464437e-06, + "loss": 0.9349, + "step": 2336 + }, + { + "epoch": 3.1244147157190634, + "grad_norm": 0.8234313410689247, + "learning_rate": 3.712667505458622e-06, + "loss": 0.9885, + "step": 2337 + }, + { + "epoch": 3.125752508361204, + "grad_norm": 1.102525847825686, + "learning_rate": 3.708158736975114e-06, + "loss": 0.9861, + "step": 2338 + }, + { + "epoch": 3.1270903010033444, + "grad_norm": 0.9356478296247032, + "learning_rate": 3.7036510938235394e-06, + "loss": 1.2177, + "step": 2339 + }, + { + "epoch": 3.128428093645485, + "grad_norm": 1.3019634846302297, + "learning_rate": 3.6991445799305376e-06, + "loss": 0.8762, + "step": 2340 + }, + { + "epoch": 3.1297658862876254, + "grad_norm": 1.1459835772685874, + "learning_rate": 3.694639199221764e-06, + "loss": 0.9738, + "step": 2341 + }, + { + "epoch": 3.1311036789297657, + "grad_norm": 1.106294471553763, + "learning_rate": 3.690134955621885e-06, + "loss": 1.0408, + "step": 2342 + }, + { + "epoch": 3.1324414715719064, + "grad_norm": 1.5611338501596745, + "learning_rate": 3.685631853054583e-06, + "loss": 1.1266, + "step": 2343 + }, + { + "epoch": 3.1337792642140467, + "grad_norm": 1.0674922903617092, + "learning_rate": 3.68112989544254e-06, + "loss": 1.2924, + "step": 2344 + }, + { + "epoch": 3.1351170568561875, + "grad_norm": 0.8211273241244206, + "learning_rate": 3.6766290867074444e-06, + "loss": 1.3609, + "step": 2345 + }, + { + "epoch": 3.1364548494983278, + "grad_norm": 0.9033152811299228, + "learning_rate": 3.6721294307699786e-06, + "loss": 1.0498, + "step": 2346 + }, + { + "epoch": 3.137792642140468, + "grad_norm": 0.7924712215363562, + "learning_rate": 3.667630931549826e-06, + "loss": 0.9767, + "step": 2347 + }, + { + "epoch": 3.139130434782609, + "grad_norm": 0.8465460394053941, + "learning_rate": 3.6631335929656608e-06, + "loss": 1.1497, + "step": 2348 + }, + { + "epoch": 3.140468227424749, + "grad_norm": 0.8949976514698427, + "learning_rate": 3.658637418935146e-06, + "loss": 1.041, + "step": 2349 + }, + { + "epoch": 3.14180602006689, + "grad_norm": 0.9228789613496183, + "learning_rate": 3.6541424133749293e-06, + "loss": 1.0102, + "step": 2350 + }, + { + "epoch": 3.14314381270903, + "grad_norm": 0.9754727694378477, + "learning_rate": 3.6496485802006433e-06, + "loss": 1.0138, + "step": 2351 + }, + { + "epoch": 3.1444816053511704, + "grad_norm": 1.1957518070790594, + "learning_rate": 3.645155923326893e-06, + "loss": 1.1358, + "step": 2352 + }, + { + "epoch": 3.145819397993311, + "grad_norm": 0.927415954233144, + "learning_rate": 3.640664446667268e-06, + "loss": 1.0574, + "step": 2353 + }, + { + "epoch": 3.1471571906354514, + "grad_norm": 0.9508092284241567, + "learning_rate": 3.6361741541343242e-06, + "loss": 1.0289, + "step": 2354 + }, + { + "epoch": 3.148494983277592, + "grad_norm": 1.017804412854776, + "learning_rate": 3.6316850496395863e-06, + "loss": 1.109, + "step": 2355 + }, + { 
+ "epoch": 3.1498327759197324, + "grad_norm": 0.9519301988548542, + "learning_rate": 3.6271971370935432e-06, + "loss": 0.9328, + "step": 2356 + }, + { + "epoch": 3.1511705685618727, + "grad_norm": 1.0373099638465966, + "learning_rate": 3.622710420405647e-06, + "loss": 0.9291, + "step": 2357 + }, + { + "epoch": 3.1525083612040135, + "grad_norm": 0.8985055461655305, + "learning_rate": 3.61822490348431e-06, + "loss": 1.1342, + "step": 2358 + }, + { + "epoch": 3.1538461538461537, + "grad_norm": 0.9355770232944733, + "learning_rate": 3.613740590236895e-06, + "loss": 0.9968, + "step": 2359 + }, + { + "epoch": 3.1551839464882945, + "grad_norm": 1.0123928409931493, + "learning_rate": 3.6092574845697193e-06, + "loss": 1.1011, + "step": 2360 + }, + { + "epoch": 3.1565217391304348, + "grad_norm": 0.9608465574743059, + "learning_rate": 3.6047755903880478e-06, + "loss": 1.0738, + "step": 2361 + }, + { + "epoch": 3.157859531772575, + "grad_norm": 1.018156699672261, + "learning_rate": 3.6002949115960884e-06, + "loss": 0.9718, + "step": 2362 + }, + { + "epoch": 3.159197324414716, + "grad_norm": 0.9812725359854316, + "learning_rate": 3.595815452096994e-06, + "loss": 0.9811, + "step": 2363 + }, + { + "epoch": 3.160535117056856, + "grad_norm": 0.8754838294372695, + "learning_rate": 3.5913372157928515e-06, + "loss": 1.2462, + "step": 2364 + }, + { + "epoch": 3.161872909698997, + "grad_norm": 0.8579787351739092, + "learning_rate": 3.5868602065846846e-06, + "loss": 0.9852, + "step": 2365 + }, + { + "epoch": 3.163210702341137, + "grad_norm": 0.9433726572267782, + "learning_rate": 3.5823844283724464e-06, + "loss": 1.1961, + "step": 2366 + }, + { + "epoch": 3.1645484949832774, + "grad_norm": 1.1760398940648145, + "learning_rate": 3.577909885055019e-06, + "loss": 1.1556, + "step": 2367 + }, + { + "epoch": 3.165886287625418, + "grad_norm": 1.1209590079015292, + "learning_rate": 3.573436580530208e-06, + "loss": 1.0026, + "step": 2368 + }, + { + "epoch": 3.1672240802675584, + "grad_norm": 0.848988356645774, + "learning_rate": 3.56896451869474e-06, + "loss": 0.8485, + "step": 2369 + }, + { + "epoch": 3.168561872909699, + "grad_norm": 1.0937729628339976, + "learning_rate": 3.56449370344426e-06, + "loss": 1.0478, + "step": 2370 + }, + { + "epoch": 3.1698996655518394, + "grad_norm": 1.082694771672354, + "learning_rate": 3.560024138673326e-06, + "loss": 0.9648, + "step": 2371 + }, + { + "epoch": 3.1712374581939797, + "grad_norm": 1.0741293678098172, + "learning_rate": 3.5555558282754045e-06, + "loss": 1.1046, + "step": 2372 + }, + { + "epoch": 3.1725752508361205, + "grad_norm": 1.4946672328695578, + "learning_rate": 3.5510887761428764e-06, + "loss": 0.961, + "step": 2373 + }, + { + "epoch": 3.1739130434782608, + "grad_norm": 1.1989881793939268, + "learning_rate": 3.546622986167021e-06, + "loss": 0.9345, + "step": 2374 + }, + { + "epoch": 3.1752508361204015, + "grad_norm": 0.9290152896982753, + "learning_rate": 3.5421584622380167e-06, + "loss": 0.8384, + "step": 2375 + }, + { + "epoch": 3.1765886287625418, + "grad_norm": 0.736549403248328, + "learning_rate": 3.5376952082449425e-06, + "loss": 0.8628, + "step": 2376 + }, + { + "epoch": 3.177926421404682, + "grad_norm": 0.9923572817876587, + "learning_rate": 3.5332332280757706e-06, + "loss": 1.2556, + "step": 2377 + }, + { + "epoch": 3.179264214046823, + "grad_norm": 1.1936283963530312, + "learning_rate": 3.5287725256173627e-06, + "loss": 0.9854, + "step": 2378 + }, + { + "epoch": 3.180602006688963, + "grad_norm": 1.0943229529564364, + "learning_rate": 3.524313104755468e-06, + 
"loss": 1.2157, + "step": 2379 + }, + { + "epoch": 3.181939799331104, + "grad_norm": 0.9229320400099376, + "learning_rate": 3.5198549693747185e-06, + "loss": 1.0955, + "step": 2380 + }, + { + "epoch": 3.183277591973244, + "grad_norm": 0.9953626988792275, + "learning_rate": 3.5153981233586277e-06, + "loss": 0.9666, + "step": 2381 + }, + { + "epoch": 3.184615384615385, + "grad_norm": 0.8742366851807039, + "learning_rate": 3.510942570589583e-06, + "loss": 1.1599, + "step": 2382 + }, + { + "epoch": 3.185953177257525, + "grad_norm": 0.9398320928825028, + "learning_rate": 3.5064883149488505e-06, + "loss": 1.0016, + "step": 2383 + }, + { + "epoch": 3.1872909698996654, + "grad_norm": 0.8708679782767097, + "learning_rate": 3.5020353603165634e-06, + "loss": 1.2835, + "step": 2384 + }, + { + "epoch": 3.188628762541806, + "grad_norm": 0.8595828071938058, + "learning_rate": 3.4975837105717203e-06, + "loss": 1.0614, + "step": 2385 + }, + { + "epoch": 3.1899665551839465, + "grad_norm": 0.8801409528607754, + "learning_rate": 3.4931333695921843e-06, + "loss": 0.9698, + "step": 2386 + }, + { + "epoch": 3.1913043478260867, + "grad_norm": 0.832495564456757, + "learning_rate": 3.488684341254679e-06, + "loss": 1.0592, + "step": 2387 + }, + { + "epoch": 3.1926421404682275, + "grad_norm": 0.9156260732652037, + "learning_rate": 3.484236629434783e-06, + "loss": 1.1473, + "step": 2388 + }, + { + "epoch": 3.1939799331103678, + "grad_norm": 0.9953504657744031, + "learning_rate": 3.4797902380069305e-06, + "loss": 1.074, + "step": 2389 + }, + { + "epoch": 3.1953177257525085, + "grad_norm": 0.8226042172641296, + "learning_rate": 3.475345170844403e-06, + "loss": 0.9262, + "step": 2390 + }, + { + "epoch": 3.196655518394649, + "grad_norm": 0.9511908362493038, + "learning_rate": 3.4709014318193298e-06, + "loss": 1.0677, + "step": 2391 + }, + { + "epoch": 3.1979933110367895, + "grad_norm": 1.0727303566778088, + "learning_rate": 3.466459024802682e-06, + "loss": 1.1139, + "step": 2392 + }, + { + "epoch": 3.19933110367893, + "grad_norm": 0.932835113719062, + "learning_rate": 3.4620179536642727e-06, + "loss": 0.9979, + "step": 2393 + }, + { + "epoch": 3.20066889632107, + "grad_norm": 0.8459651190661563, + "learning_rate": 3.4575782222727507e-06, + "loss": 1.2832, + "step": 2394 + }, + { + "epoch": 3.202006688963211, + "grad_norm": 1.062917947118065, + "learning_rate": 3.453139834495596e-06, + "loss": 1.0126, + "step": 2395 + }, + { + "epoch": 3.203344481605351, + "grad_norm": 0.9613675891089286, + "learning_rate": 3.448702794199118e-06, + "loss": 0.9726, + "step": 2396 + }, + { + "epoch": 3.2046822742474914, + "grad_norm": 1.1127830727328003, + "learning_rate": 3.4442671052484545e-06, + "loss": 1.2137, + "step": 2397 + }, + { + "epoch": 3.206020066889632, + "grad_norm": 0.7773488625949485, + "learning_rate": 3.439832771507565e-06, + "loss": 1.0777, + "step": 2398 + }, + { + "epoch": 3.2073578595317724, + "grad_norm": 1.088362867173143, + "learning_rate": 3.4353997968392295e-06, + "loss": 1.0343, + "step": 2399 + }, + { + "epoch": 3.208695652173913, + "grad_norm": 1.278114229075439, + "learning_rate": 3.4309681851050414e-06, + "loss": 1.0875, + "step": 2400 + }, + { + "epoch": 3.2100334448160535, + "grad_norm": 1.0985925617005599, + "learning_rate": 3.4265379401654096e-06, + "loss": 0.8847, + "step": 2401 + }, + { + "epoch": 3.211371237458194, + "grad_norm": 1.5517839799746864, + "learning_rate": 3.4221090658795484e-06, + "loss": 1.0152, + "step": 2402 + }, + { + "epoch": 3.2127090301003345, + "grad_norm": 1.1886679212719065, + 
"learning_rate": 3.4176815661054884e-06, + "loss": 0.9917, + "step": 2403 + }, + { + "epoch": 3.2140468227424748, + "grad_norm": 1.4378309986840323, + "learning_rate": 3.4132554447000487e-06, + "loss": 1.1002, + "step": 2404 + }, + { + "epoch": 3.2153846153846155, + "grad_norm": 1.021388561341464, + "learning_rate": 3.4088307055188574e-06, + "loss": 1.166, + "step": 2405 + }, + { + "epoch": 3.216722408026756, + "grad_norm": 0.8783410493452397, + "learning_rate": 3.4044073524163344e-06, + "loss": 1.0855, + "step": 2406 + }, + { + "epoch": 3.218060200668896, + "grad_norm": 1.095222196629155, + "learning_rate": 3.3999853892456945e-06, + "loss": 1.1316, + "step": 2407 + }, + { + "epoch": 3.219397993311037, + "grad_norm": 1.2825955509941036, + "learning_rate": 3.3955648198589407e-06, + "loss": 1.2575, + "step": 2408 + }, + { + "epoch": 3.220735785953177, + "grad_norm": 0.7922118508567823, + "learning_rate": 3.3911456481068613e-06, + "loss": 1.0194, + "step": 2409 + }, + { + "epoch": 3.222073578595318, + "grad_norm": 0.8424070069328271, + "learning_rate": 3.386727877839027e-06, + "loss": 1.1413, + "step": 2410 + }, + { + "epoch": 3.223411371237458, + "grad_norm": 1.1408988202260504, + "learning_rate": 3.3823115129037897e-06, + "loss": 1.0062, + "step": 2411 + }, + { + "epoch": 3.224749163879599, + "grad_norm": 1.1099183238045633, + "learning_rate": 3.3778965571482723e-06, + "loss": 0.8887, + "step": 2412 + }, + { + "epoch": 3.226086956521739, + "grad_norm": 1.3792957778636215, + "learning_rate": 3.3734830144183783e-06, + "loss": 1.1256, + "step": 2413 + }, + { + "epoch": 3.2274247491638794, + "grad_norm": 0.7715257617669895, + "learning_rate": 3.369070888558774e-06, + "loss": 1.1869, + "step": 2414 + }, + { + "epoch": 3.22876254180602, + "grad_norm": 0.9272682178998404, + "learning_rate": 3.3646601834128924e-06, + "loss": 1.0429, + "step": 2415 + }, + { + "epoch": 3.2301003344481605, + "grad_norm": 1.1219247315456495, + "learning_rate": 3.360250902822929e-06, + "loss": 0.99, + "step": 2416 + }, + { + "epoch": 3.231438127090301, + "grad_norm": 0.9812172759845663, + "learning_rate": 3.35584305062984e-06, + "loss": 0.913, + "step": 2417 + }, + { + "epoch": 3.2327759197324415, + "grad_norm": 0.8060174002781856, + "learning_rate": 3.3514366306733348e-06, + "loss": 1.1282, + "step": 2418 + }, + { + "epoch": 3.234113712374582, + "grad_norm": 0.8331669758698264, + "learning_rate": 3.3470316467918785e-06, + "loss": 1.0733, + "step": 2419 + }, + { + "epoch": 3.2354515050167225, + "grad_norm": 0.9824992335119704, + "learning_rate": 3.3426281028226817e-06, + "loss": 1.1536, + "step": 2420 + }, + { + "epoch": 3.236789297658863, + "grad_norm": 1.2119122031622658, + "learning_rate": 3.3382260026017027e-06, + "loss": 0.9944, + "step": 2421 + }, + { + "epoch": 3.2381270903010035, + "grad_norm": 1.314866917346474, + "learning_rate": 3.3338253499636407e-06, + "loss": 1.1743, + "step": 2422 + }, + { + "epoch": 3.239464882943144, + "grad_norm": 0.9189263126911225, + "learning_rate": 3.329426148741937e-06, + "loss": 1.1187, + "step": 2423 + }, + { + "epoch": 3.240802675585284, + "grad_norm": 0.9274327033366042, + "learning_rate": 3.3250284027687652e-06, + "loss": 1.0135, + "step": 2424 + }, + { + "epoch": 3.242140468227425, + "grad_norm": 1.0024847068875662, + "learning_rate": 3.320632115875033e-06, + "loss": 0.9834, + "step": 2425 + }, + { + "epoch": 3.243478260869565, + "grad_norm": 1.1890114562895262, + "learning_rate": 3.3162372918903764e-06, + "loss": 1.0694, + "step": 2426 + }, + { + "epoch": 3.244816053511706, 
+ "grad_norm": 1.0413551070073483, + "learning_rate": 3.311843934643157e-06, + "loss": 1.1563, + "step": 2427 + }, + { + "epoch": 3.246153846153846, + "grad_norm": 1.0644340220514492, + "learning_rate": 3.307452047960459e-06, + "loss": 1.1527, + "step": 2428 + }, + { + "epoch": 3.2474916387959865, + "grad_norm": 1.1498191521172159, + "learning_rate": 3.3030616356680854e-06, + "loss": 0.878, + "step": 2429 + }, + { + "epoch": 3.248829431438127, + "grad_norm": 1.0319344468848501, + "learning_rate": 3.298672701590555e-06, + "loss": 1.0131, + "step": 2430 + }, + { + "epoch": 3.2501672240802675, + "grad_norm": 0.802872734281659, + "learning_rate": 3.2942852495510992e-06, + "loss": 1.0669, + "step": 2431 + }, + { + "epoch": 3.251505016722408, + "grad_norm": 0.65197527081759, + "learning_rate": 3.289899283371657e-06, + "loss": 0.9698, + "step": 2432 + }, + { + "epoch": 3.2528428093645485, + "grad_norm": 1.065029769273126, + "learning_rate": 3.2855148068728753e-06, + "loss": 1.027, + "step": 2433 + }, + { + "epoch": 3.254180602006689, + "grad_norm": 0.994414345517791, + "learning_rate": 3.2811318238741026e-06, + "loss": 0.9733, + "step": 2434 + }, + { + "epoch": 3.2555183946488295, + "grad_norm": 1.0592645801758216, + "learning_rate": 3.276750338193385e-06, + "loss": 1.0935, + "step": 2435 + }, + { + "epoch": 3.25685618729097, + "grad_norm": 1.1007253085467803, + "learning_rate": 3.272370353647465e-06, + "loss": 1.1891, + "step": 2436 + }, + { + "epoch": 3.2581939799331106, + "grad_norm": 0.8628649728395789, + "learning_rate": 3.2679918740517785e-06, + "loss": 1.113, + "step": 2437 + }, + { + "epoch": 3.259531772575251, + "grad_norm": 0.8718617850099607, + "learning_rate": 3.263614903220449e-06, + "loss": 1.016, + "step": 2438 + }, + { + "epoch": 3.260869565217391, + "grad_norm": 1.0447142717061282, + "learning_rate": 3.2592394449662867e-06, + "loss": 1.0085, + "step": 2439 + }, + { + "epoch": 3.262207357859532, + "grad_norm": 1.0322493390511798, + "learning_rate": 3.2548655031007837e-06, + "loss": 1.0662, + "step": 2440 + }, + { + "epoch": 3.263545150501672, + "grad_norm": 1.3613179129349389, + "learning_rate": 3.250493081434112e-06, + "loss": 1.0638, + "step": 2441 + }, + { + "epoch": 3.264882943143813, + "grad_norm": 0.9564103887489093, + "learning_rate": 3.2461221837751146e-06, + "loss": 1.0962, + "step": 2442 + }, + { + "epoch": 3.266220735785953, + "grad_norm": 0.9143710710685061, + "learning_rate": 3.241752813931316e-06, + "loss": 1.061, + "step": 2443 + }, + { + "epoch": 3.2675585284280935, + "grad_norm": 1.2122004971052982, + "learning_rate": 3.237384975708904e-06, + "loss": 1.0692, + "step": 2444 + }, + { + "epoch": 3.268896321070234, + "grad_norm": 1.006055441230953, + "learning_rate": 3.233018672912731e-06, + "loss": 1.163, + "step": 2445 + }, + { + "epoch": 3.2702341137123745, + "grad_norm": 1.0195039638513308, + "learning_rate": 3.228653909346314e-06, + "loss": 0.994, + "step": 2446 + }, + { + "epoch": 3.2715719063545152, + "grad_norm": 0.8795208868189303, + "learning_rate": 3.224290688811831e-06, + "loss": 1.0441, + "step": 2447 + }, + { + "epoch": 3.2729096989966555, + "grad_norm": 0.8195303450855287, + "learning_rate": 3.2199290151101115e-06, + "loss": 1.2179, + "step": 2448 + }, + { + "epoch": 3.274247491638796, + "grad_norm": 0.8347309868800749, + "learning_rate": 3.2155688920406415e-06, + "loss": 1.1283, + "step": 2449 + }, + { + "epoch": 3.2755852842809365, + "grad_norm": 0.920684589305794, + "learning_rate": 3.2112103234015535e-06, + "loss": 0.9078, + "step": 2450 + }, + { + 
"epoch": 3.276923076923077, + "grad_norm": 1.0398942131738484, + "learning_rate": 3.2068533129896273e-06, + "loss": 1.1018, + "step": 2451 + }, + { + "epoch": 3.2782608695652176, + "grad_norm": 1.1709462023167203, + "learning_rate": 3.2024978646002848e-06, + "loss": 1.1331, + "step": 2452 + }, + { + "epoch": 3.279598662207358, + "grad_norm": 0.8097812085902492, + "learning_rate": 3.1981439820275883e-06, + "loss": 0.9368, + "step": 2453 + }, + { + "epoch": 3.280936454849498, + "grad_norm": 0.9116187687554597, + "learning_rate": 3.1937916690642356e-06, + "loss": 0.9767, + "step": 2454 + }, + { + "epoch": 3.282274247491639, + "grad_norm": 0.9262705990965142, + "learning_rate": 3.189440929501556e-06, + "loss": 1.1936, + "step": 2455 + }, + { + "epoch": 3.283612040133779, + "grad_norm": 0.8499067109104688, + "learning_rate": 3.185091767129509e-06, + "loss": 1.2235, + "step": 2456 + }, + { + "epoch": 3.28494983277592, + "grad_norm": 1.0512099309633622, + "learning_rate": 3.1807441857366798e-06, + "loss": 1.005, + "step": 2457 + }, + { + "epoch": 3.28628762541806, + "grad_norm": 0.9122374005935072, + "learning_rate": 3.1763981891102785e-06, + "loss": 1.022, + "step": 2458 + }, + { + "epoch": 3.2876254180602005, + "grad_norm": 0.7414610168226019, + "learning_rate": 3.172053781036132e-06, + "loss": 0.9047, + "step": 2459 + }, + { + "epoch": 3.288963210702341, + "grad_norm": 1.2066805381623102, + "learning_rate": 3.167710965298684e-06, + "loss": 0.9708, + "step": 2460 + }, + { + "epoch": 3.2903010033444815, + "grad_norm": 0.8855161590416807, + "learning_rate": 3.1633697456809932e-06, + "loss": 1.2735, + "step": 2461 + }, + { + "epoch": 3.2916387959866222, + "grad_norm": 1.0467989393154984, + "learning_rate": 3.159030125964723e-06, + "loss": 1.0936, + "step": 2462 + }, + { + "epoch": 3.2929765886287625, + "grad_norm": 0.8656586689937896, + "learning_rate": 3.1546921099301507e-06, + "loss": 1.117, + "step": 2463 + }, + { + "epoch": 3.294314381270903, + "grad_norm": 0.9660139350320865, + "learning_rate": 3.15035570135615e-06, + "loss": 0.8625, + "step": 2464 + }, + { + "epoch": 3.2956521739130435, + "grad_norm": 1.2156513877977078, + "learning_rate": 3.1460209040201967e-06, + "loss": 1.106, + "step": 2465 + }, + { + "epoch": 3.296989966555184, + "grad_norm": 1.0093426216321797, + "learning_rate": 3.141687721698363e-06, + "loss": 1.1151, + "step": 2466 + }, + { + "epoch": 3.2983277591973246, + "grad_norm": 0.9463505081531508, + "learning_rate": 3.1373561581653152e-06, + "loss": 1.0892, + "step": 2467 + }, + { + "epoch": 3.299665551839465, + "grad_norm": 1.1814692964612714, + "learning_rate": 3.1330262171943073e-06, + "loss": 1.0944, + "step": 2468 + }, + { + "epoch": 3.3010033444816056, + "grad_norm": 0.8408622400062618, + "learning_rate": 3.1286979025571817e-06, + "loss": 0.9806, + "step": 2469 + }, + { + "epoch": 3.302341137123746, + "grad_norm": 0.7639367204759744, + "learning_rate": 3.1243712180243633e-06, + "loss": 1.3055, + "step": 2470 + }, + { + "epoch": 3.303678929765886, + "grad_norm": 1.1341671347164017, + "learning_rate": 3.120046167364857e-06, + "loss": 1.0661, + "step": 2471 + }, + { + "epoch": 3.305016722408027, + "grad_norm": 1.0788297618135425, + "learning_rate": 3.1157227543462428e-06, + "loss": 1.0197, + "step": 2472 + }, + { + "epoch": 3.306354515050167, + "grad_norm": 1.1227572265099222, + "learning_rate": 3.11140098273468e-06, + "loss": 1.113, + "step": 2473 + }, + { + "epoch": 3.3076923076923075, + "grad_norm": 0.8614325876764148, + "learning_rate": 3.107080856294892e-06, + 
"loss": 0.954, + "step": 2474 + }, + { + "epoch": 3.309030100334448, + "grad_norm": 0.8020982540216864, + "learning_rate": 3.1027623787901706e-06, + "loss": 1.0044, + "step": 2475 + }, + { + "epoch": 3.3103678929765885, + "grad_norm": 0.9128853334174754, + "learning_rate": 3.098445553982372e-06, + "loss": 1.1079, + "step": 2476 + }, + { + "epoch": 3.3117056856187292, + "grad_norm": 0.8408464841933856, + "learning_rate": 3.0941303856319126e-06, + "loss": 1.1261, + "step": 2477 + }, + { + "epoch": 3.3130434782608695, + "grad_norm": 1.0607819163017005, + "learning_rate": 3.0898168774977654e-06, + "loss": 1.0055, + "step": 2478 + }, + { + "epoch": 3.3143812709030103, + "grad_norm": 0.9741984260305796, + "learning_rate": 3.0855050333374574e-06, + "loss": 1.2049, + "step": 2479 + }, + { + "epoch": 3.3157190635451506, + "grad_norm": 1.0318576444224703, + "learning_rate": 3.0811948569070666e-06, + "loss": 1.059, + "step": 2480 + }, + { + "epoch": 3.317056856187291, + "grad_norm": 0.8401231704250338, + "learning_rate": 3.076886351961217e-06, + "loss": 0.8795, + "step": 2481 + }, + { + "epoch": 3.3183946488294316, + "grad_norm": 0.8791755513555712, + "learning_rate": 3.072579522253076e-06, + "loss": 1.2999, + "step": 2482 + }, + { + "epoch": 3.319732441471572, + "grad_norm": 0.9215440920144221, + "learning_rate": 3.0682743715343565e-06, + "loss": 0.9964, + "step": 2483 + }, + { + "epoch": 3.321070234113712, + "grad_norm": 1.3708839596034286, + "learning_rate": 3.063970903555304e-06, + "loss": 1.1004, + "step": 2484 + }, + { + "epoch": 3.322408026755853, + "grad_norm": 1.1211389534760279, + "learning_rate": 3.0596691220646978e-06, + "loss": 1.2124, + "step": 2485 + }, + { + "epoch": 3.323745819397993, + "grad_norm": 0.9102686387156176, + "learning_rate": 3.0553690308098517e-06, + "loss": 1.1176, + "step": 2486 + }, + { + "epoch": 3.325083612040134, + "grad_norm": 0.8925464341101573, + "learning_rate": 3.0510706335366034e-06, + "loss": 1.0643, + "step": 2487 + }, + { + "epoch": 3.326421404682274, + "grad_norm": 0.8490730523380271, + "learning_rate": 3.046773933989319e-06, + "loss": 1.0717, + "step": 2488 + }, + { + "epoch": 3.327759197324415, + "grad_norm": 0.7729633523450368, + "learning_rate": 3.042478935910881e-06, + "loss": 0.9896, + "step": 2489 + }, + { + "epoch": 3.3290969899665552, + "grad_norm": 0.9107881332720101, + "learning_rate": 3.0381856430426935e-06, + "loss": 1.27, + "step": 2490 + }, + { + "epoch": 3.3304347826086955, + "grad_norm": 0.990706520151806, + "learning_rate": 3.033894059124675e-06, + "loss": 1.1396, + "step": 2491 + }, + { + "epoch": 3.3317725752508363, + "grad_norm": 0.9132968733020069, + "learning_rate": 3.0296041878952497e-06, + "loss": 1.0049, + "step": 2492 + }, + { + "epoch": 3.3331103678929765, + "grad_norm": 0.9580679640705901, + "learning_rate": 3.02531603309136e-06, + "loss": 1.0231, + "step": 2493 + }, + { + "epoch": 3.334448160535117, + "grad_norm": 0.9750281600236888, + "learning_rate": 3.0210295984484446e-06, + "loss": 0.946, + "step": 2494 + }, + { + "epoch": 3.3357859531772576, + "grad_norm": 0.7614153474018439, + "learning_rate": 3.016744887700447e-06, + "loss": 1.1526, + "step": 2495 + }, + { + "epoch": 3.337123745819398, + "grad_norm": 1.0752023418384342, + "learning_rate": 3.0124619045798087e-06, + "loss": 0.9383, + "step": 2496 + }, + { + "epoch": 3.3384615384615386, + "grad_norm": 0.9813492695795467, + "learning_rate": 3.0081806528174655e-06, + "loss": 1.1129, + "step": 2497 + }, + { + "epoch": 3.339799331103679, + "grad_norm": 0.9359341576916643, + 
"learning_rate": 3.0039011361428466e-06, + "loss": 1.0461, + "step": 2498 + }, + { + "epoch": 3.3411371237458196, + "grad_norm": 1.0255043011516078, + "learning_rate": 2.9996233582838686e-06, + "loss": 1.016, + "step": 2499 + }, + { + "epoch": 3.34247491638796, + "grad_norm": 1.000662564623912, + "learning_rate": 2.995347322966933e-06, + "loss": 1.1342, + "step": 2500 + }, + { + "epoch": 3.3438127090301, + "grad_norm": 1.0133585429254637, + "learning_rate": 2.9910730339169245e-06, + "loss": 1.2129, + "step": 2501 + }, + { + "epoch": 3.345150501672241, + "grad_norm": 0.97084940377941, + "learning_rate": 2.9868004948572044e-06, + "loss": 1.0085, + "step": 2502 + }, + { + "epoch": 3.346488294314381, + "grad_norm": 0.8546476825886723, + "learning_rate": 2.982529709509615e-06, + "loss": 0.9734, + "step": 2503 + }, + { + "epoch": 3.3478260869565215, + "grad_norm": 0.8820023940034417, + "learning_rate": 2.978260681594465e-06, + "loss": 1.0769, + "step": 2504 + }, + { + "epoch": 3.3491638795986622, + "grad_norm": 0.9534358121842227, + "learning_rate": 2.973993414830534e-06, + "loss": 0.8757, + "step": 2505 + }, + { + "epoch": 3.3505016722408025, + "grad_norm": 1.085518493759407, + "learning_rate": 2.9697279129350686e-06, + "loss": 1.1626, + "step": 2506 + }, + { + "epoch": 3.3518394648829433, + "grad_norm": 0.7989286185558426, + "learning_rate": 2.965464179623777e-06, + "loss": 1.1514, + "step": 2507 + }, + { + "epoch": 3.3531772575250836, + "grad_norm": 1.1915554767456562, + "learning_rate": 2.9612022186108267e-06, + "loss": 1.1102, + "step": 2508 + }, + { + "epoch": 3.3545150501672243, + "grad_norm": 1.0157270315486404, + "learning_rate": 2.956942033608843e-06, + "loss": 1.1005, + "step": 2509 + }, + { + "epoch": 3.3558528428093646, + "grad_norm": 0.9670215527084373, + "learning_rate": 2.952683628328901e-06, + "loss": 1.0809, + "step": 2510 + }, + { + "epoch": 3.357190635451505, + "grad_norm": 1.0248194014683274, + "learning_rate": 2.948427006480528e-06, + "loss": 1.0922, + "step": 2511 + }, + { + "epoch": 3.3585284280936456, + "grad_norm": 0.9222574002012809, + "learning_rate": 2.9441721717716966e-06, + "loss": 1.0084, + "step": 2512 + }, + { + "epoch": 3.359866220735786, + "grad_norm": 0.7809327875681917, + "learning_rate": 2.9399191279088236e-06, + "loss": 1.1328, + "step": 2513 + }, + { + "epoch": 3.361204013377926, + "grad_norm": 0.7363262272368276, + "learning_rate": 2.9356678785967646e-06, + "loss": 1.0853, + "step": 2514 + }, + { + "epoch": 3.362541806020067, + "grad_norm": 1.1258070820016302, + "learning_rate": 2.9314184275388134e-06, + "loss": 1.3072, + "step": 2515 + }, + { + "epoch": 3.363879598662207, + "grad_norm": 1.0677601602683608, + "learning_rate": 2.9271707784366952e-06, + "loss": 1.1612, + "step": 2516 + }, + { + "epoch": 3.365217391304348, + "grad_norm": 0.8962593275695383, + "learning_rate": 2.9229249349905686e-06, + "loss": 1.0164, + "step": 2517 + }, + { + "epoch": 3.3665551839464882, + "grad_norm": 0.8254841467717116, + "learning_rate": 2.918680900899017e-06, + "loss": 1.1912, + "step": 2518 + }, + { + "epoch": 3.367892976588629, + "grad_norm": 1.073534623831914, + "learning_rate": 2.914438679859046e-06, + "loss": 0.9438, + "step": 2519 + }, + { + "epoch": 3.3692307692307693, + "grad_norm": 0.8082938613176297, + "learning_rate": 2.910198275566085e-06, + "loss": 0.9634, + "step": 2520 + }, + { + "epoch": 3.3705685618729095, + "grad_norm": 0.85133480191929, + "learning_rate": 2.9059596917139804e-06, + "loss": 1.0549, + "step": 2521 + }, + { + "epoch": 3.3719063545150503, 
+ "grad_norm": 1.174857612356516, + "learning_rate": 2.9017229319949897e-06, + "loss": 1.1671, + "step": 2522 + }, + { + "epoch": 3.3732441471571906, + "grad_norm": 0.8427087481180927, + "learning_rate": 2.897488000099788e-06, + "loss": 1.1747, + "step": 2523 + }, + { + "epoch": 3.374581939799331, + "grad_norm": 0.9819834183996887, + "learning_rate": 2.893254899717452e-06, + "loss": 1.185, + "step": 2524 + }, + { + "epoch": 3.3759197324414716, + "grad_norm": 0.852240839399697, + "learning_rate": 2.8890236345354648e-06, + "loss": 1.1055, + "step": 2525 + }, + { + "epoch": 3.377257525083612, + "grad_norm": 0.9789103660460219, + "learning_rate": 2.8847942082397112e-06, + "loss": 1.1069, + "step": 2526 + }, + { + "epoch": 3.3785953177257526, + "grad_norm": 0.9345216143463001, + "learning_rate": 2.8805666245144735e-06, + "loss": 1.2579, + "step": 2527 + }, + { + "epoch": 3.379933110367893, + "grad_norm": 0.7568253778987035, + "learning_rate": 2.8763408870424305e-06, + "loss": 1.086, + "step": 2528 + }, + { + "epoch": 3.3812709030100336, + "grad_norm": 1.2673106308751951, + "learning_rate": 2.8721169995046503e-06, + "loss": 0.9559, + "step": 2529 + }, + { + "epoch": 3.382608695652174, + "grad_norm": 0.9986678079255003, + "learning_rate": 2.8678949655805915e-06, + "loss": 1.0216, + "step": 2530 + }, + { + "epoch": 3.383946488294314, + "grad_norm": 0.9066778102244564, + "learning_rate": 2.863674788948097e-06, + "loss": 1.181, + "step": 2531 + }, + { + "epoch": 3.385284280936455, + "grad_norm": 0.9671430860483772, + "learning_rate": 2.85945647328339e-06, + "loss": 1.1478, + "step": 2532 + }, + { + "epoch": 3.3866220735785952, + "grad_norm": 0.809931654592769, + "learning_rate": 2.8552400222610788e-06, + "loss": 0.9227, + "step": 2533 + }, + { + "epoch": 3.387959866220736, + "grad_norm": 0.9577490376382101, + "learning_rate": 2.851025439554142e-06, + "loss": 1.0602, + "step": 2534 + }, + { + "epoch": 3.3892976588628763, + "grad_norm": 0.9530846292228046, + "learning_rate": 2.846812728833931e-06, + "loss": 1.0548, + "step": 2535 + }, + { + "epoch": 3.3906354515050166, + "grad_norm": 1.1663099615539463, + "learning_rate": 2.8426018937701678e-06, + "loss": 1.0759, + "step": 2536 + }, + { + "epoch": 3.3919732441471573, + "grad_norm": 1.0185735113960184, + "learning_rate": 2.8383929380309406e-06, + "loss": 1.0526, + "step": 2537 + }, + { + "epoch": 3.3933110367892976, + "grad_norm": 1.1239746412310514, + "learning_rate": 2.834185865282699e-06, + "loss": 1.1676, + "step": 2538 + }, + { + "epoch": 3.3946488294314383, + "grad_norm": 0.7651261574595578, + "learning_rate": 2.829980679190254e-06, + "loss": 1.1671, + "step": 2539 + }, + { + "epoch": 3.3959866220735786, + "grad_norm": 0.8412854805399609, + "learning_rate": 2.8257773834167736e-06, + "loss": 1.1746, + "step": 2540 + }, + { + "epoch": 3.397324414715719, + "grad_norm": 0.7844812083093925, + "learning_rate": 2.8215759816237748e-06, + "loss": 0.9297, + "step": 2541 + }, + { + "epoch": 3.3986622073578596, + "grad_norm": 0.9543397007623635, + "learning_rate": 2.817376477471132e-06, + "loss": 1.0596, + "step": 2542 + }, + { + "epoch": 3.4, + "grad_norm": 0.7788123236677295, + "learning_rate": 2.8131788746170612e-06, + "loss": 1.1683, + "step": 2543 + }, + { + "epoch": 3.4013377926421406, + "grad_norm": 0.8658866095748554, + "learning_rate": 2.808983176718125e-06, + "loss": 1.1377, + "step": 2544 + }, + { + "epoch": 3.402675585284281, + "grad_norm": 0.877811935942895, + "learning_rate": 2.804789387429222e-06, + "loss": 1.0485, + "step": 2545 + }, + { + 
"epoch": 3.4040133779264212, + "grad_norm": 0.9457116551642105, + "learning_rate": 2.800597510403592e-06, + "loss": 1.2277, + "step": 2546 + }, + { + "epoch": 3.405351170568562, + "grad_norm": 0.8697084740697768, + "learning_rate": 2.796407549292809e-06, + "loss": 1.066, + "step": 2547 + }, + { + "epoch": 3.4066889632107022, + "grad_norm": 0.9198608557494562, + "learning_rate": 2.792219507746777e-06, + "loss": 1.055, + "step": 2548 + }, + { + "epoch": 3.408026755852843, + "grad_norm": 0.9395242047203777, + "learning_rate": 2.788033389413729e-06, + "loss": 1.0288, + "step": 2549 + }, + { + "epoch": 3.4093645484949833, + "grad_norm": 7.041459725933871, + "learning_rate": 2.7838491979402205e-06, + "loss": 1.4429, + "step": 2550 + }, + { + "epoch": 3.4107023411371236, + "grad_norm": 1.0865479024253812, + "learning_rate": 2.7796669369711294e-06, + "loss": 0.8941, + "step": 2551 + }, + { + "epoch": 3.4120401337792643, + "grad_norm": 0.9149357827262925, + "learning_rate": 2.7754866101496558e-06, + "loss": 0.9331, + "step": 2552 + }, + { + "epoch": 3.4133779264214046, + "grad_norm": 0.8617304204735194, + "learning_rate": 2.771308221117309e-06, + "loss": 0.9941, + "step": 2553 + }, + { + "epoch": 3.4147157190635453, + "grad_norm": 1.0047382169795993, + "learning_rate": 2.7671317735139136e-06, + "loss": 1.1711, + "step": 2554 + }, + { + "epoch": 3.4160535117056856, + "grad_norm": 1.041671149087585, + "learning_rate": 2.762957270977602e-06, + "loss": 1.2521, + "step": 2555 + }, + { + "epoch": 3.417391304347826, + "grad_norm": 0.8017549778079676, + "learning_rate": 2.758784717144812e-06, + "loss": 1.3429, + "step": 2556 + }, + { + "epoch": 3.4187290969899666, + "grad_norm": 0.9079481189582558, + "learning_rate": 2.754614115650285e-06, + "loss": 1.0566, + "step": 2557 + }, + { + "epoch": 3.420066889632107, + "grad_norm": 1.1152597239111524, + "learning_rate": 2.7504454701270604e-06, + "loss": 1.0586, + "step": 2558 + }, + { + "epoch": 3.4214046822742477, + "grad_norm": 0.8680250862828278, + "learning_rate": 2.7462787842064753e-06, + "loss": 1.1526, + "step": 2559 + }, + { + "epoch": 3.422742474916388, + "grad_norm": 0.7756860157272666, + "learning_rate": 2.742114061518157e-06, + "loss": 1.2861, + "step": 2560 + }, + { + "epoch": 3.4240802675585282, + "grad_norm": 0.7921670611843281, + "learning_rate": 2.7379513056900254e-06, + "loss": 1.3477, + "step": 2561 + }, + { + "epoch": 3.425418060200669, + "grad_norm": 0.8527741035100135, + "learning_rate": 2.7337905203482884e-06, + "loss": 1.2222, + "step": 2562 + }, + { + "epoch": 3.4267558528428093, + "grad_norm": 0.8880573434648437, + "learning_rate": 2.7296317091174325e-06, + "loss": 1.0377, + "step": 2563 + }, + { + "epoch": 3.42809364548495, + "grad_norm": 0.8557295192044388, + "learning_rate": 2.725474875620228e-06, + "loss": 1.1484, + "step": 2564 + }, + { + "epoch": 3.4294314381270903, + "grad_norm": 0.7767534426714005, + "learning_rate": 2.7213200234777215e-06, + "loss": 0.9973, + "step": 2565 + }, + { + "epoch": 3.430769230769231, + "grad_norm": 0.9052257784903444, + "learning_rate": 2.717167156309234e-06, + "loss": 1.0224, + "step": 2566 + }, + { + "epoch": 3.4321070234113713, + "grad_norm": 0.7202920875352024, + "learning_rate": 2.7130162777323567e-06, + "loss": 0.8858, + "step": 2567 + }, + { + "epoch": 3.4334448160535116, + "grad_norm": 1.0581486421909119, + "learning_rate": 2.708867391362948e-06, + "loss": 1.039, + "step": 2568 + }, + { + "epoch": 3.4347826086956523, + "grad_norm": 0.9485317963040799, + "learning_rate": 2.7047205008151332e-06, 
+ "loss": 1.1119, + "step": 2569 + }, + { + "epoch": 3.4361204013377926, + "grad_norm": 0.8602658339897042, + "learning_rate": 2.700575609701298e-06, + "loss": 0.9972, + "step": 2570 + }, + { + "epoch": 3.437458193979933, + "grad_norm": 1.1181348956844714, + "learning_rate": 2.696432721632082e-06, + "loss": 0.8811, + "step": 2571 + }, + { + "epoch": 3.4387959866220736, + "grad_norm": 0.8679305450822343, + "learning_rate": 2.692291840216389e-06, + "loss": 1.0882, + "step": 2572 + }, + { + "epoch": 3.440133779264214, + "grad_norm": 0.8638474985994636, + "learning_rate": 2.6881529690613687e-06, + "loss": 1.0527, + "step": 2573 + }, + { + "epoch": 3.4414715719063547, + "grad_norm": 0.8630670303994121, + "learning_rate": 2.6840161117724184e-06, + "loss": 0.88, + "step": 2574 + }, + { + "epoch": 3.442809364548495, + "grad_norm": 0.8366083974433173, + "learning_rate": 2.6798812719531843e-06, + "loss": 1.0093, + "step": 2575 + }, + { + "epoch": 3.4441471571906357, + "grad_norm": 0.9422759455794807, + "learning_rate": 2.6757484532055537e-06, + "loss": 0.9675, + "step": 2576 + }, + { + "epoch": 3.445484949832776, + "grad_norm": 0.7945393029099427, + "learning_rate": 2.671617659129655e-06, + "loss": 1.1229, + "step": 2577 + }, + { + "epoch": 3.4468227424749163, + "grad_norm": 0.75315307762368, + "learning_rate": 2.667488893323851e-06, + "loss": 1.1946, + "step": 2578 + }, + { + "epoch": 3.448160535117057, + "grad_norm": 0.9395739381162015, + "learning_rate": 2.6633621593847387e-06, + "loss": 0.8886, + "step": 2579 + }, + { + "epoch": 3.4494983277591973, + "grad_norm": 1.7695312570774948, + "learning_rate": 2.6592374609071446e-06, + "loss": 1.125, + "step": 2580 + }, + { + "epoch": 3.4508361204013376, + "grad_norm": 0.9425154652374119, + "learning_rate": 2.65511480148412e-06, + "loss": 1.0826, + "step": 2581 + }, + { + "epoch": 3.4521739130434783, + "grad_norm": 0.8835500994965572, + "learning_rate": 2.6509941847069466e-06, + "loss": 0.9754, + "step": 2582 + }, + { + "epoch": 3.4535117056856186, + "grad_norm": 0.8181540467883843, + "learning_rate": 2.646875614165121e-06, + "loss": 1.09, + "step": 2583 + }, + { + "epoch": 3.4548494983277593, + "grad_norm": 0.9704702747138957, + "learning_rate": 2.6427590934463576e-06, + "loss": 1.0153, + "step": 2584 + }, + { + "epoch": 3.4561872909698996, + "grad_norm": 0.8035647046678176, + "learning_rate": 2.6386446261365874e-06, + "loss": 0.9314, + "step": 2585 + }, + { + "epoch": 3.4575250836120404, + "grad_norm": 0.9748583413060181, + "learning_rate": 2.6345322158199503e-06, + "loss": 1.0431, + "step": 2586 + }, + { + "epoch": 3.4588628762541807, + "grad_norm": 1.228967563054055, + "learning_rate": 2.630421866078797e-06, + "loss": 0.888, + "step": 2587 + }, + { + "epoch": 3.460200668896321, + "grad_norm": 1.031773029507726, + "learning_rate": 2.626313580493681e-06, + "loss": 1.3087, + "step": 2588 + }, + { + "epoch": 3.4615384615384617, + "grad_norm": 0.9463602367915367, + "learning_rate": 2.6222073626433587e-06, + "loss": 1.1282, + "step": 2589 + }, + { + "epoch": 3.462876254180602, + "grad_norm": 0.8555395018825176, + "learning_rate": 2.618103216104785e-06, + "loss": 1.0923, + "step": 2590 + }, + { + "epoch": 3.4642140468227423, + "grad_norm": 0.7991534129810952, + "learning_rate": 2.6140011444531086e-06, + "loss": 1.1954, + "step": 2591 + }, + { + "epoch": 3.465551839464883, + "grad_norm": 0.8434375910947169, + "learning_rate": 2.6099011512616767e-06, + "loss": 1.1258, + "step": 2592 + }, + { + "epoch": 3.4668896321070233, + "grad_norm": 1.085730556543459, + 
"learning_rate": 2.60580324010202e-06, + "loss": 1.111, + "step": 2593 + }, + { + "epoch": 3.468227424749164, + "grad_norm": 0.9883105980231855, + "learning_rate": 2.6017074145438583e-06, + "loss": 0.9206, + "step": 2594 + }, + { + "epoch": 3.4695652173913043, + "grad_norm": 1.2117013755474564, + "learning_rate": 2.597613678155092e-06, + "loss": 0.9713, + "step": 2595 + }, + { + "epoch": 3.470903010033445, + "grad_norm": 0.7559203330129127, + "learning_rate": 2.593522034501805e-06, + "loss": 0.9605, + "step": 2596 + }, + { + "epoch": 3.4722408026755853, + "grad_norm": 1.0293713470753503, + "learning_rate": 2.5894324871482557e-06, + "loss": 1.1229, + "step": 2597 + }, + { + "epoch": 3.4735785953177256, + "grad_norm": 1.1092912372818529, + "learning_rate": 2.585345039656878e-06, + "loss": 1.0763, + "step": 2598 + }, + { + "epoch": 3.4749163879598663, + "grad_norm": 1.0559187508821508, + "learning_rate": 2.5812596955882756e-06, + "loss": 1.3552, + "step": 2599 + }, + { + "epoch": 3.4762541806020066, + "grad_norm": 0.6739878135474947, + "learning_rate": 2.5771764585012203e-06, + "loss": 0.8435, + "step": 2600 + }, + { + "epoch": 3.477591973244147, + "grad_norm": 1.0351381113470008, + "learning_rate": 2.573095331952646e-06, + "loss": 0.8835, + "step": 2601 + }, + { + "epoch": 3.4789297658862877, + "grad_norm": 1.171747872023386, + "learning_rate": 2.5690163194976576e-06, + "loss": 0.9317, + "step": 2602 + }, + { + "epoch": 3.480267558528428, + "grad_norm": 0.8972436886020762, + "learning_rate": 2.5649394246895044e-06, + "loss": 1.0982, + "step": 2603 + }, + { + "epoch": 3.4816053511705687, + "grad_norm": 0.8217461816525587, + "learning_rate": 2.560864651079599e-06, + "loss": 1.0396, + "step": 2604 + }, + { + "epoch": 3.482943143812709, + "grad_norm": 0.9340735010820088, + "learning_rate": 2.556792002217507e-06, + "loss": 1.1569, + "step": 2605 + }, + { + "epoch": 3.4842809364548497, + "grad_norm": 1.0210170349912948, + "learning_rate": 2.5527214816509398e-06, + "loss": 0.9482, + "step": 2606 + }, + { + "epoch": 3.48561872909699, + "grad_norm": 0.976674873813516, + "learning_rate": 2.5486530929257574e-06, + "loss": 1.0501, + "step": 2607 + }, + { + "epoch": 3.4869565217391303, + "grad_norm": 1.0410366157912685, + "learning_rate": 2.544586839585961e-06, + "loss": 1.0941, + "step": 2608 + }, + { + "epoch": 3.488294314381271, + "grad_norm": 0.9045128351500166, + "learning_rate": 2.540522725173692e-06, + "loss": 1.0981, + "step": 2609 + }, + { + "epoch": 3.4896321070234113, + "grad_norm": 1.0260155115575325, + "learning_rate": 2.5364607532292283e-06, + "loss": 1.0244, + "step": 2610 + }, + { + "epoch": 3.4909698996655516, + "grad_norm": 1.0306321334368451, + "learning_rate": 2.532400927290982e-06, + "loss": 1.038, + "step": 2611 + }, + { + "epoch": 3.4923076923076923, + "grad_norm": 0.9309223751608114, + "learning_rate": 2.5283432508954976e-06, + "loss": 1.1737, + "step": 2612 + }, + { + "epoch": 3.4936454849498326, + "grad_norm": 0.8409592699250541, + "learning_rate": 2.5242877275774446e-06, + "loss": 1.0183, + "step": 2613 + }, + { + "epoch": 3.4949832775919734, + "grad_norm": 1.0364581679346605, + "learning_rate": 2.520234360869617e-06, + "loss": 1.1366, + "step": 2614 + }, + { + "epoch": 3.4963210702341136, + "grad_norm": 1.0988671389491218, + "learning_rate": 2.5161831543029314e-06, + "loss": 0.9884, + "step": 2615 + }, + { + "epoch": 3.4976588628762544, + "grad_norm": 0.9527429231798308, + "learning_rate": 2.512134111406422e-06, + "loss": 1.1025, + "step": 2616 + }, + { + "epoch": 
3.4989966555183947, + "grad_norm": 1.0138145289377336, + "learning_rate": 2.508087235707237e-06, + "loss": 0.8556, + "step": 2617 + }, + { + "epoch": 3.500334448160535, + "grad_norm": 0.9463958862038007, + "learning_rate": 2.5040425307306404e-06, + "loss": 0.9476, + "step": 2618 + }, + { + "epoch": 3.5016722408026757, + "grad_norm": 0.9372345767832728, + "learning_rate": 2.5000000000000015e-06, + "loss": 1.0631, + "step": 2619 + }, + { + "epoch": 3.503010033444816, + "grad_norm": 0.7218320518218642, + "learning_rate": 2.4959596470367965e-06, + "loss": 1.1005, + "step": 2620 + }, + { + "epoch": 3.5043478260869563, + "grad_norm": 1.051088743725283, + "learning_rate": 2.4919214753606043e-06, + "loss": 1.1414, + "step": 2621 + }, + { + "epoch": 3.505685618729097, + "grad_norm": 1.1793057043249766, + "learning_rate": 2.4878854884891067e-06, + "loss": 0.8634, + "step": 2622 + }, + { + "epoch": 3.5070234113712373, + "grad_norm": 1.1662558405738659, + "learning_rate": 2.4838516899380806e-06, + "loss": 1.0999, + "step": 2623 + }, + { + "epoch": 3.508361204013378, + "grad_norm": 1.0837713490524352, + "learning_rate": 2.4798200832213933e-06, + "loss": 1.2553, + "step": 2624 + }, + { + "epoch": 3.5096989966555183, + "grad_norm": 0.6752392574093172, + "learning_rate": 2.475790671851007e-06, + "loss": 0.9809, + "step": 2625 + }, + { + "epoch": 3.511036789297659, + "grad_norm": 0.922710507164882, + "learning_rate": 2.4717634593369704e-06, + "loss": 0.9092, + "step": 2626 + }, + { + "epoch": 3.5123745819397993, + "grad_norm": 0.7738538887502094, + "learning_rate": 2.4677384491874155e-06, + "loss": 0.9811, + "step": 2627 + }, + { + "epoch": 3.5137123745819396, + "grad_norm": 1.0212006913829679, + "learning_rate": 2.463715644908557e-06, + "loss": 1.1528, + "step": 2628 + }, + { + "epoch": 3.5150501672240804, + "grad_norm": 0.9518240893364514, + "learning_rate": 2.459695050004688e-06, + "loss": 0.9729, + "step": 2629 + }, + { + "epoch": 3.5163879598662207, + "grad_norm": 0.8934367913996837, + "learning_rate": 2.4556766679781763e-06, + "loss": 1.0596, + "step": 2630 + }, + { + "epoch": 3.517725752508361, + "grad_norm": 0.868690674010569, + "learning_rate": 2.4516605023294626e-06, + "loss": 1.0726, + "step": 2631 + }, + { + "epoch": 3.5190635451505017, + "grad_norm": 1.0344128613247463, + "learning_rate": 2.447646556557057e-06, + "loss": 1.0813, + "step": 2632 + }, + { + "epoch": 3.5204013377926424, + "grad_norm": 0.7549507451237314, + "learning_rate": 2.443634834157536e-06, + "loss": 1.1215, + "step": 2633 + }, + { + "epoch": 3.5217391304347827, + "grad_norm": 0.8424891483468037, + "learning_rate": 2.4396253386255386e-06, + "loss": 1.1546, + "step": 2634 + }, + { + "epoch": 3.523076923076923, + "grad_norm": 0.7957986962428718, + "learning_rate": 2.4356180734537643e-06, + "loss": 1.2162, + "step": 2635 + }, + { + "epoch": 3.5244147157190637, + "grad_norm": 0.8557074977631162, + "learning_rate": 2.4316130421329696e-06, + "loss": 1.0593, + "step": 2636 + }, + { + "epoch": 3.525752508361204, + "grad_norm": 0.9104626642330509, + "learning_rate": 2.4276102481519655e-06, + "loss": 0.9459, + "step": 2637 + }, + { + "epoch": 3.5270903010033443, + "grad_norm": 1.037211496891758, + "learning_rate": 2.4236096949976136e-06, + "loss": 1.0823, + "step": 2638 + }, + { + "epoch": 3.528428093645485, + "grad_norm": 0.8986415035135684, + "learning_rate": 2.4196113861548233e-06, + "loss": 0.9252, + "step": 2639 + }, + { + "epoch": 3.5297658862876253, + "grad_norm": 1.3495622312676687, + "learning_rate": 2.41561532510655e-06, + 
"loss": 0.9678, + "step": 2640 + }, + { + "epoch": 3.5311036789297656, + "grad_norm": 1.0269283616452476, + "learning_rate": 2.411621515333788e-06, + "loss": 0.9592, + "step": 2641 + }, + { + "epoch": 3.5324414715719064, + "grad_norm": 1.027179963591522, + "learning_rate": 2.407629960315577e-06, + "loss": 1.0919, + "step": 2642 + }, + { + "epoch": 3.533779264214047, + "grad_norm": 0.9052106485596393, + "learning_rate": 2.403640663528986e-06, + "loss": 1.0513, + "step": 2643 + }, + { + "epoch": 3.5351170568561874, + "grad_norm": 1.0278284377419844, + "learning_rate": 2.3996536284491197e-06, + "loss": 1.0786, + "step": 2644 + }, + { + "epoch": 3.5364548494983277, + "grad_norm": 0.922389783864616, + "learning_rate": 2.3956688585491117e-06, + "loss": 1.0574, + "step": 2645 + }, + { + "epoch": 3.5377926421404684, + "grad_norm": 0.9324483284690743, + "learning_rate": 2.391686357300123e-06, + "loss": 1.2835, + "step": 2646 + }, + { + "epoch": 3.5391304347826087, + "grad_norm": 0.9970659011198872, + "learning_rate": 2.3877061281713393e-06, + "loss": 0.9281, + "step": 2647 + }, + { + "epoch": 3.540468227424749, + "grad_norm": 1.1584901827451426, + "learning_rate": 2.383728174629964e-06, + "loss": 1.0552, + "step": 2648 + }, + { + "epoch": 3.5418060200668897, + "grad_norm": 0.7674399285503986, + "learning_rate": 2.379752500141222e-06, + "loss": 1.0726, + "step": 2649 + }, + { + "epoch": 3.54314381270903, + "grad_norm": 0.8381763943861675, + "learning_rate": 2.3757791081683497e-06, + "loss": 1.0694, + "step": 2650 + }, + { + "epoch": 3.5444816053511703, + "grad_norm": 1.0826155215849955, + "learning_rate": 2.371808002172595e-06, + "loss": 1.0515, + "step": 2651 + }, + { + "epoch": 3.545819397993311, + "grad_norm": 0.7028857705793344, + "learning_rate": 2.3678391856132203e-06, + "loss": 1.0625, + "step": 2652 + }, + { + "epoch": 3.5471571906354518, + "grad_norm": 1.0552604493468993, + "learning_rate": 2.363872661947488e-06, + "loss": 0.9562, + "step": 2653 + }, + { + "epoch": 3.548494983277592, + "grad_norm": 0.9912828560608378, + "learning_rate": 2.3599084346306626e-06, + "loss": 1.2453, + "step": 2654 + }, + { + "epoch": 3.5498327759197323, + "grad_norm": 0.8429714233845631, + "learning_rate": 2.355946507116012e-06, + "loss": 1.0412, + "step": 2655 + }, + { + "epoch": 3.551170568561873, + "grad_norm": 1.3340489324342981, + "learning_rate": 2.3519868828547974e-06, + "loss": 0.9466, + "step": 2656 + }, + { + "epoch": 3.5525083612040134, + "grad_norm": 0.9925184249372818, + "learning_rate": 2.348029565296277e-06, + "loss": 1.0674, + "step": 2657 + }, + { + "epoch": 3.5538461538461537, + "grad_norm": 0.9390345666102589, + "learning_rate": 2.344074557887696e-06, + "loss": 1.0289, + "step": 2658 + }, + { + "epoch": 3.5551839464882944, + "grad_norm": 0.8595578382906944, + "learning_rate": 2.3401218640742894e-06, + "loss": 1.0194, + "step": 2659 + }, + { + "epoch": 3.5565217391304347, + "grad_norm": 0.7502141800012422, + "learning_rate": 2.336171487299277e-06, + "loss": 1.1025, + "step": 2660 + }, + { + "epoch": 3.5578595317725754, + "grad_norm": 1.0412862490432826, + "learning_rate": 2.332223431003859e-06, + "loss": 1.0653, + "step": 2661 + }, + { + "epoch": 3.5591973244147157, + "grad_norm": 0.9877083446424171, + "learning_rate": 2.3282776986272143e-06, + "loss": 1.0396, + "step": 2662 + }, + { + "epoch": 3.5605351170568564, + "grad_norm": 0.9352841479945525, + "learning_rate": 2.324334293606499e-06, + "loss": 0.9595, + "step": 2663 + }, + { + "epoch": 3.5618729096989967, + "grad_norm": 
1.3031474363647848, + "learning_rate": 2.3203932193768398e-06, + "loss": 1.1289, + "step": 2664 + }, + { + "epoch": 3.563210702341137, + "grad_norm": 0.9825258489726795, + "learning_rate": 2.3164544793713345e-06, + "loss": 1.3079, + "step": 2665 + }, + { + "epoch": 3.5645484949832777, + "grad_norm": 0.7727803793751752, + "learning_rate": 2.3125180770210464e-06, + "loss": 1.1101, + "step": 2666 + }, + { + "epoch": 3.565886287625418, + "grad_norm": 0.9661328639556744, + "learning_rate": 2.3085840157550036e-06, + "loss": 1.0392, + "step": 2667 + }, + { + "epoch": 3.5672240802675583, + "grad_norm": 0.6588504082944027, + "learning_rate": 2.3046522990001944e-06, + "loss": 0.9939, + "step": 2668 + }, + { + "epoch": 3.568561872909699, + "grad_norm": 0.8013055827536424, + "learning_rate": 2.3007229301815643e-06, + "loss": 1.1888, + "step": 2669 + }, + { + "epoch": 3.5698996655518394, + "grad_norm": 0.9594260535924309, + "learning_rate": 2.296795912722014e-06, + "loss": 0.9911, + "step": 2670 + }, + { + "epoch": 3.57123745819398, + "grad_norm": 0.7154364312316852, + "learning_rate": 2.2928712500423938e-06, + "loss": 1.1914, + "step": 2671 + }, + { + "epoch": 3.5725752508361204, + "grad_norm": 0.9283996985090233, + "learning_rate": 2.288948945561509e-06, + "loss": 0.9604, + "step": 2672 + }, + { + "epoch": 3.573913043478261, + "grad_norm": 0.9012246021000133, + "learning_rate": 2.2850290026961032e-06, + "loss": 1.1544, + "step": 2673 + }, + { + "epoch": 3.5752508361204014, + "grad_norm": 0.9522213023855441, + "learning_rate": 2.2811114248608675e-06, + "loss": 1.1367, + "step": 2674 + }, + { + "epoch": 3.5765886287625417, + "grad_norm": 1.0385151075067505, + "learning_rate": 2.2771962154684303e-06, + "loss": 1.0711, + "step": 2675 + }, + { + "epoch": 3.5779264214046824, + "grad_norm": 0.9575866279606987, + "learning_rate": 2.2732833779293583e-06, + "loss": 1.033, + "step": 2676 + }, + { + "epoch": 3.5792642140468227, + "grad_norm": 0.9558894657248921, + "learning_rate": 2.2693729156521518e-06, + "loss": 1.1828, + "step": 2677 + }, + { + "epoch": 3.580602006688963, + "grad_norm": 0.9773673977623225, + "learning_rate": 2.2654648320432403e-06, + "loss": 1.0308, + "step": 2678 + }, + { + "epoch": 3.5819397993311037, + "grad_norm": 1.0079248879066478, + "learning_rate": 2.2615591305069846e-06, + "loss": 0.9986, + "step": 2679 + }, + { + "epoch": 3.583277591973244, + "grad_norm": 0.817094953247699, + "learning_rate": 2.2576558144456677e-06, + "loss": 1.0876, + "step": 2680 + }, + { + "epoch": 3.5846153846153848, + "grad_norm": 1.0407218841606265, + "learning_rate": 2.2537548872594935e-06, + "loss": 1.0309, + "step": 2681 + }, + { + "epoch": 3.585953177257525, + "grad_norm": 0.8408805085358311, + "learning_rate": 2.2498563523465905e-06, + "loss": 1.0107, + "step": 2682 + }, + { + "epoch": 3.587290969899666, + "grad_norm": 0.8737509753514003, + "learning_rate": 2.2459602131029977e-06, + "loss": 1.2233, + "step": 2683 + }, + { + "epoch": 3.588628762541806, + "grad_norm": 0.7980275656314636, + "learning_rate": 2.24206647292267e-06, + "loss": 1.073, + "step": 2684 + }, + { + "epoch": 3.5899665551839464, + "grad_norm": 1.0564404633036677, + "learning_rate": 2.238175135197471e-06, + "loss": 1.0274, + "step": 2685 + }, + { + "epoch": 3.591304347826087, + "grad_norm": 0.7953158680427642, + "learning_rate": 2.234286203317172e-06, + "loss": 1.0039, + "step": 2686 + }, + { + "epoch": 3.5926421404682274, + "grad_norm": 0.825196700814659, + "learning_rate": 2.230399680669449e-06, + "loss": 1.0127, + "step": 2687 + }, + { 
+ "epoch": 3.5939799331103677, + "grad_norm": 0.9375210771200359, + "learning_rate": 2.226515570639879e-06, + "loss": 1.1309, + "step": 2688 + }, + { + "epoch": 3.5953177257525084, + "grad_norm": 1.1892991414162801, + "learning_rate": 2.2226338766119366e-06, + "loss": 1.1612, + "step": 2689 + }, + { + "epoch": 3.5966555183946487, + "grad_norm": 1.115626663710309, + "learning_rate": 2.2187546019669938e-06, + "loss": 1.0667, + "step": 2690 + }, + { + "epoch": 3.5979933110367894, + "grad_norm": 1.0178217937516383, + "learning_rate": 2.2148777500843125e-06, + "loss": 0.8256, + "step": 2691 + }, + { + "epoch": 3.5993311036789297, + "grad_norm": 0.9037937062739937, + "learning_rate": 2.2110033243410462e-06, + "loss": 1.1242, + "step": 2692 + }, + { + "epoch": 3.6006688963210705, + "grad_norm": 0.860581552153641, + "learning_rate": 2.207131328112234e-06, + "loss": 1.1516, + "step": 2693 + }, + { + "epoch": 3.6020066889632107, + "grad_norm": 0.9639311781628258, + "learning_rate": 2.2032617647707995e-06, + "loss": 0.9698, + "step": 2694 + }, + { + "epoch": 3.603344481605351, + "grad_norm": 0.9721329256541208, + "learning_rate": 2.1993946376875447e-06, + "loss": 0.9842, + "step": 2695 + }, + { + "epoch": 3.6046822742474918, + "grad_norm": 0.7603206624599905, + "learning_rate": 2.1955299502311523e-06, + "loss": 1.2075, + "step": 2696 + }, + { + "epoch": 3.606020066889632, + "grad_norm": 1.0106891010547097, + "learning_rate": 2.1916677057681786e-06, + "loss": 1.067, + "step": 2697 + }, + { + "epoch": 3.6073578595317723, + "grad_norm": 1.1228695526830925, + "learning_rate": 2.1878079076630502e-06, + "loss": 1.1504, + "step": 2698 + }, + { + "epoch": 3.608695652173913, + "grad_norm": 0.8849161354819635, + "learning_rate": 2.1839505592780658e-06, + "loss": 0.8921, + "step": 2699 + }, + { + "epoch": 3.6100334448160534, + "grad_norm": 1.0895117576253792, + "learning_rate": 2.180095663973388e-06, + "loss": 0.9142, + "step": 2700 + }, + { + "epoch": 3.611371237458194, + "grad_norm": 1.0353515441308003, + "learning_rate": 2.1762432251070404e-06, + "loss": 1.0711, + "step": 2701 + }, + { + "epoch": 3.6127090301003344, + "grad_norm": 0.8047986764219439, + "learning_rate": 2.172393246034914e-06, + "loss": 1.0336, + "step": 2702 + }, + { + "epoch": 3.614046822742475, + "grad_norm": 1.0031245897849785, + "learning_rate": 2.1685457301107506e-06, + "loss": 0.8507, + "step": 2703 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 1.0914965986056089, + "learning_rate": 2.1647006806861472e-06, + "loss": 1.1571, + "step": 2704 + }, + { + "epoch": 3.6167224080267557, + "grad_norm": 0.9890715570624745, + "learning_rate": 2.1608581011105533e-06, + "loss": 1.0412, + "step": 2705 + }, + { + "epoch": 3.6180602006688964, + "grad_norm": 1.1440888907755438, + "learning_rate": 2.1570179947312674e-06, + "loss": 0.9815, + "step": 2706 + }, + { + "epoch": 3.6193979933110367, + "grad_norm": 0.9507713045730398, + "learning_rate": 2.1531803648934333e-06, + "loss": 1.0062, + "step": 2707 + }, + { + "epoch": 3.620735785953177, + "grad_norm": 1.139527696789641, + "learning_rate": 2.149345214940036e-06, + "loss": 0.9963, + "step": 2708 + }, + { + "epoch": 3.6220735785953178, + "grad_norm": 0.8853406254986462, + "learning_rate": 2.145512548211902e-06, + "loss": 1.0188, + "step": 2709 + }, + { + "epoch": 3.623411371237458, + "grad_norm": 0.8855209062544999, + "learning_rate": 2.1416823680476945e-06, + "loss": 0.9845, + "step": 2710 + }, + { + "epoch": 3.624749163879599, + "grad_norm": 1.0133603775949525, + "learning_rate": 
2.137854677783907e-06, + "loss": 1.108, + "step": 2711 + }, + { + "epoch": 3.626086956521739, + "grad_norm": 0.9145716357429914, + "learning_rate": 2.1340294807548716e-06, + "loss": 1.0484, + "step": 2712 + }, + { + "epoch": 3.62742474916388, + "grad_norm": 1.0578004907931025, + "learning_rate": 2.130206780292743e-06, + "loss": 1.0883, + "step": 2713 + }, + { + "epoch": 3.62876254180602, + "grad_norm": 0.8283115018034659, + "learning_rate": 2.1263865797275007e-06, + "loss": 1.0526, + "step": 2714 + }, + { + "epoch": 3.6301003344481604, + "grad_norm": 0.9659358257218931, + "learning_rate": 2.1225688823869494e-06, + "loss": 1.2078, + "step": 2715 + }, + { + "epoch": 3.631438127090301, + "grad_norm": 1.0790713738662845, + "learning_rate": 2.118753691596711e-06, + "loss": 0.992, + "step": 2716 + }, + { + "epoch": 3.6327759197324414, + "grad_norm": 1.027929812034752, + "learning_rate": 2.1149410106802252e-06, + "loss": 1.2187, + "step": 2717 + }, + { + "epoch": 3.6341137123745817, + "grad_norm": 0.8631913013147053, + "learning_rate": 2.1111308429587446e-06, + "loss": 1.1011, + "step": 2718 + }, + { + "epoch": 3.6354515050167224, + "grad_norm": 1.088793544395742, + "learning_rate": 2.1073231917513336e-06, + "loss": 0.9289, + "step": 2719 + }, + { + "epoch": 3.6367892976588627, + "grad_norm": 0.8890333983034818, + "learning_rate": 2.1035180603748635e-06, + "loss": 1.2273, + "step": 2720 + }, + { + "epoch": 3.6381270903010035, + "grad_norm": 0.9952298948368233, + "learning_rate": 2.09971545214401e-06, + "loss": 1.0466, + "step": 2721 + }, + { + "epoch": 3.6394648829431437, + "grad_norm": 0.8917520753345333, + "learning_rate": 2.095915370371252e-06, + "loss": 1.1506, + "step": 2722 + }, + { + "epoch": 3.6408026755852845, + "grad_norm": 1.0948217877042263, + "learning_rate": 2.0921178183668676e-06, + "loss": 1.0926, + "step": 2723 + }, + { + "epoch": 3.6421404682274248, + "grad_norm": 0.8048287780565885, + "learning_rate": 2.088322799438931e-06, + "loss": 1.1672, + "step": 2724 + }, + { + "epoch": 3.643478260869565, + "grad_norm": 0.9991800269336778, + "learning_rate": 2.084530316893309e-06, + "loss": 1.0989, + "step": 2725 + }, + { + "epoch": 3.644816053511706, + "grad_norm": 0.7930045690710666, + "learning_rate": 2.08074037403366e-06, + "loss": 1.1449, + "step": 2726 + }, + { + "epoch": 3.646153846153846, + "grad_norm": 0.7046171515087327, + "learning_rate": 2.0769529741614297e-06, + "loss": 0.9722, + "step": 2727 + }, + { + "epoch": 3.6474916387959864, + "grad_norm": 0.9629032668616068, + "learning_rate": 2.0731681205758485e-06, + "loss": 1.1633, + "step": 2728 + }, + { + "epoch": 3.648829431438127, + "grad_norm": 0.9042350204214943, + "learning_rate": 2.069385816573928e-06, + "loss": 1.2805, + "step": 2729 + }, + { + "epoch": 3.650167224080268, + "grad_norm": 0.8934970860754987, + "learning_rate": 2.065606065450461e-06, + "loss": 1.0294, + "step": 2730 + }, + { + "epoch": 3.651505016722408, + "grad_norm": 1.1414327229430712, + "learning_rate": 2.061828870498012e-06, + "loss": 1.1101, + "step": 2731 + }, + { + "epoch": 3.6528428093645484, + "grad_norm": 1.2066407178522551, + "learning_rate": 2.0580542350069266e-06, + "loss": 1.0318, + "step": 2732 + }, + { + "epoch": 3.654180602006689, + "grad_norm": 1.16579901397988, + "learning_rate": 2.054282162265313e-06, + "loss": 1.0612, + "step": 2733 + }, + { + "epoch": 3.6555183946488294, + "grad_norm": 1.152839679670362, + "learning_rate": 2.050512655559051e-06, + "loss": 1.2118, + "step": 2734 + }, + { + "epoch": 3.6568561872909697, + "grad_norm": 
1.3177628025595163, + "learning_rate": 2.046745718171784e-06, + "loss": 1.0561, + "step": 2735 + }, + { + "epoch": 3.6581939799331105, + "grad_norm": 1.0899924976631323, + "learning_rate": 2.0429813533849174e-06, + "loss": 0.9929, + "step": 2736 + }, + { + "epoch": 3.6595317725752508, + "grad_norm": 0.9518840971296211, + "learning_rate": 2.0392195644776153e-06, + "loss": 1.1827, + "step": 2737 + }, + { + "epoch": 3.660869565217391, + "grad_norm": 0.9698163402899949, + "learning_rate": 2.0354603547267985e-06, + "loss": 0.9806, + "step": 2738 + }, + { + "epoch": 3.6622073578595318, + "grad_norm": 1.2013066444872826, + "learning_rate": 2.0317037274071412e-06, + "loss": 1.0069, + "step": 2739 + }, + { + "epoch": 3.6635451505016725, + "grad_norm": 0.9628741484858042, + "learning_rate": 2.0279496857910667e-06, + "loss": 1.1578, + "step": 2740 + }, + { + "epoch": 3.664882943143813, + "grad_norm": 0.9450048390150443, + "learning_rate": 2.0241982331487465e-06, + "loss": 0.9884, + "step": 2741 + }, + { + "epoch": 3.666220735785953, + "grad_norm": 1.0079963565635395, + "learning_rate": 2.0204493727480996e-06, + "loss": 0.9124, + "step": 2742 + }, + { + "epoch": 3.667558528428094, + "grad_norm": 0.887972744390216, + "learning_rate": 2.016703107854783e-06, + "loss": 1.0791, + "step": 2743 + }, + { + "epoch": 3.668896321070234, + "grad_norm": 0.9532374974715614, + "learning_rate": 2.0129594417321937e-06, + "loss": 1.023, + "step": 2744 + }, + { + "epoch": 3.6702341137123744, + "grad_norm": 0.8557274389753433, + "learning_rate": 2.009218377641466e-06, + "loss": 1.1221, + "step": 2745 + }, + { + "epoch": 3.671571906354515, + "grad_norm": 0.8585796813721607, + "learning_rate": 2.0054799188414666e-06, + "loss": 1.0504, + "step": 2746 + }, + { + "epoch": 3.6729096989966554, + "grad_norm": 0.7769342830196289, + "learning_rate": 2.0017440685887934e-06, + "loss": 1.047, + "step": 2747 + }, + { + "epoch": 3.6742474916387957, + "grad_norm": 0.9537684746959323, + "learning_rate": 1.998010830137771e-06, + "loss": 1.059, + "step": 2748 + }, + { + "epoch": 3.6755852842809364, + "grad_norm": 0.7162057343642183, + "learning_rate": 1.99428020674045e-06, + "loss": 0.8909, + "step": 2749 + }, + { + "epoch": 3.676923076923077, + "grad_norm": 0.8653043690608888, + "learning_rate": 1.9905522016466023e-06, + "loss": 1.172, + "step": 2750 + }, + { + "epoch": 3.6782608695652175, + "grad_norm": 0.8655686760818823, + "learning_rate": 1.9868268181037186e-06, + "loss": 1.1228, + "step": 2751 + }, + { + "epoch": 3.6795986622073578, + "grad_norm": 0.8538538351627439, + "learning_rate": 1.9831040593570076e-06, + "loss": 1.152, + "step": 2752 + }, + { + "epoch": 3.6809364548494985, + "grad_norm": 0.880726928010622, + "learning_rate": 1.9793839286493894e-06, + "loss": 1.1482, + "step": 2753 + }, + { + "epoch": 3.682274247491639, + "grad_norm": 0.9813827603794215, + "learning_rate": 1.9756664292214962e-06, + "loss": 1.0601, + "step": 2754 + }, + { + "epoch": 3.683612040133779, + "grad_norm": 1.128492176977974, + "learning_rate": 1.971951564311668e-06, + "loss": 0.9718, + "step": 2755 + }, + { + "epoch": 3.68494983277592, + "grad_norm": 0.7754980911041515, + "learning_rate": 1.968239337155949e-06, + "loss": 1.1544, + "step": 2756 + }, + { + "epoch": 3.68628762541806, + "grad_norm": 1.1602904748632614, + "learning_rate": 1.964529750988086e-06, + "loss": 1.1421, + "step": 2757 + }, + { + "epoch": 3.687625418060201, + "grad_norm": 0.8780066556413934, + "learning_rate": 1.960822809039526e-06, + "loss": 0.8943, + "step": 2758 + }, + { + 
"epoch": 3.688963210702341, + "grad_norm": 0.968353054404242, + "learning_rate": 1.9571185145394117e-06, + "loss": 1.3942, + "step": 2759 + }, + { + "epoch": 3.690301003344482, + "grad_norm": 0.9194676753683174, + "learning_rate": 1.95341687071458e-06, + "loss": 1.141, + "step": 2760 + }, + { + "epoch": 3.691638795986622, + "grad_norm": 0.9068376395168342, + "learning_rate": 1.949717880789557e-06, + "loss": 0.9115, + "step": 2761 + }, + { + "epoch": 3.6929765886287624, + "grad_norm": 0.9217031319346836, + "learning_rate": 1.9460215479865613e-06, + "loss": 1.0038, + "step": 2762 + }, + { + "epoch": 3.694314381270903, + "grad_norm": 1.0043209100828228, + "learning_rate": 1.9423278755254933e-06, + "loss": 1.0897, + "step": 2763 + }, + { + "epoch": 3.6956521739130435, + "grad_norm": 0.9524930188314544, + "learning_rate": 1.9386368666239364e-06, + "loss": 1.2429, + "step": 2764 + }, + { + "epoch": 3.6969899665551837, + "grad_norm": 0.8990749631590592, + "learning_rate": 1.9349485244971543e-06, + "loss": 0.9947, + "step": 2765 + }, + { + "epoch": 3.6983277591973245, + "grad_norm": 0.8428526563932764, + "learning_rate": 1.9312628523580882e-06, + "loss": 1.1755, + "step": 2766 + }, + { + "epoch": 3.6996655518394648, + "grad_norm": 0.9337993885468372, + "learning_rate": 1.927579853417352e-06, + "loss": 1.1635, + "step": 2767 + }, + { + "epoch": 3.7010033444816055, + "grad_norm": 0.9148710184204182, + "learning_rate": 1.923899530883232e-06, + "loss": 1.1917, + "step": 2768 + }, + { + "epoch": 3.702341137123746, + "grad_norm": 0.809035417548916, + "learning_rate": 1.9202218879616824e-06, + "loss": 1.1886, + "step": 2769 + }, + { + "epoch": 3.7036789297658865, + "grad_norm": 0.8836788351114218, + "learning_rate": 1.9165469278563243e-06, + "loss": 1.2759, + "step": 2770 + }, + { + "epoch": 3.705016722408027, + "grad_norm": 1.2251383204944355, + "learning_rate": 1.912874653768439e-06, + "loss": 0.9843, + "step": 2771 + }, + { + "epoch": 3.706354515050167, + "grad_norm": 1.1425996429078147, + "learning_rate": 1.9092050688969736e-06, + "loss": 0.9941, + "step": 2772 + }, + { + "epoch": 3.707692307692308, + "grad_norm": 1.031186013990491, + "learning_rate": 1.9055381764385272e-06, + "loss": 1.0096, + "step": 2773 + }, + { + "epoch": 3.709030100334448, + "grad_norm": 1.0681377278574502, + "learning_rate": 1.9018739795873558e-06, + "loss": 1.0295, + "step": 2774 + }, + { + "epoch": 3.7103678929765884, + "grad_norm": 1.0787117488759448, + "learning_rate": 1.8982124815353665e-06, + "loss": 0.8808, + "step": 2775 + }, + { + "epoch": 3.711705685618729, + "grad_norm": 0.9959641501344341, + "learning_rate": 1.8945536854721153e-06, + "loss": 1.0787, + "step": 2776 + }, + { + "epoch": 3.7130434782608694, + "grad_norm": 0.7941490704462614, + "learning_rate": 1.8908975945848063e-06, + "loss": 0.9825, + "step": 2777 + }, + { + "epoch": 3.71438127090301, + "grad_norm": 1.031294627181548, + "learning_rate": 1.8872442120582845e-06, + "loss": 0.919, + "step": 2778 + }, + { + "epoch": 3.7157190635451505, + "grad_norm": 0.9161895637326706, + "learning_rate": 1.8835935410750372e-06, + "loss": 0.9806, + "step": 2779 + }, + { + "epoch": 3.717056856187291, + "grad_norm": 0.7186293149112671, + "learning_rate": 1.8799455848151898e-06, + "loss": 1.0508, + "step": 2780 + }, + { + "epoch": 3.7183946488294315, + "grad_norm": 0.8279049451411468, + "learning_rate": 1.8763003464565022e-06, + "loss": 1.0708, + "step": 2781 + }, + { + "epoch": 3.719732441471572, + "grad_norm": 0.7623448248284717, + "learning_rate": 1.872657829174367e-06, + 
"loss": 0.9938, + "step": 2782 + }, + { + "epoch": 3.7210702341137125, + "grad_norm": 0.9311215236034235, + "learning_rate": 1.8690180361418058e-06, + "loss": 0.9227, + "step": 2783 + }, + { + "epoch": 3.722408026755853, + "grad_norm": 0.9612431289597403, + "learning_rate": 1.865380970529469e-06, + "loss": 1.1226, + "step": 2784 + }, + { + "epoch": 3.723745819397993, + "grad_norm": 1.3926395679266796, + "learning_rate": 1.8617466355056285e-06, + "loss": 1.0465, + "step": 2785 + }, + { + "epoch": 3.725083612040134, + "grad_norm": 0.8830561706783839, + "learning_rate": 1.8581150342361792e-06, + "loss": 0.9521, + "step": 2786 + }, + { + "epoch": 3.726421404682274, + "grad_norm": 0.8670259140448274, + "learning_rate": 1.854486169884635e-06, + "loss": 0.988, + "step": 2787 + }, + { + "epoch": 3.727759197324415, + "grad_norm": 0.7558643063595495, + "learning_rate": 1.850860045612124e-06, + "loss": 0.8816, + "step": 2788 + }, + { + "epoch": 3.729096989966555, + "grad_norm": 0.9931243911049729, + "learning_rate": 1.8472366645773892e-06, + "loss": 0.973, + "step": 2789 + }, + { + "epoch": 3.730434782608696, + "grad_norm": 1.0254230540551248, + "learning_rate": 1.8436160299367806e-06, + "loss": 1.0116, + "step": 2790 + }, + { + "epoch": 3.731772575250836, + "grad_norm": 0.9094449498611974, + "learning_rate": 1.8399981448442623e-06, + "loss": 1.2397, + "step": 2791 + }, + { + "epoch": 3.7331103678929765, + "grad_norm": 1.213368494707879, + "learning_rate": 1.8363830124513975e-06, + "loss": 1.0779, + "step": 2792 + }, + { + "epoch": 3.734448160535117, + "grad_norm": 0.8936546353302961, + "learning_rate": 1.8327706359073526e-06, + "loss": 1.0141, + "step": 2793 + }, + { + "epoch": 3.7357859531772575, + "grad_norm": 1.084103655017377, + "learning_rate": 1.8291610183588949e-06, + "loss": 0.7879, + "step": 2794 + }, + { + "epoch": 3.7371237458193978, + "grad_norm": 1.180362139602117, + "learning_rate": 1.8255541629503865e-06, + "loss": 1.09, + "step": 2795 + }, + { + "epoch": 3.7384615384615385, + "grad_norm": 0.8012821971445513, + "learning_rate": 1.8219500728237849e-06, + "loss": 0.987, + "step": 2796 + }, + { + "epoch": 3.739799331103679, + "grad_norm": 0.8354248589919371, + "learning_rate": 1.8183487511186381e-06, + "loss": 1.1399, + "step": 2797 + }, + { + "epoch": 3.7411371237458195, + "grad_norm": 0.8845705452262279, + "learning_rate": 1.8147502009720825e-06, + "loss": 0.8105, + "step": 2798 + }, + { + "epoch": 3.74247491638796, + "grad_norm": 0.9252298567214378, + "learning_rate": 1.8111544255188402e-06, + "loss": 1.0665, + "step": 2799 + }, + { + "epoch": 3.7438127090301005, + "grad_norm": 1.1098262960449106, + "learning_rate": 1.807561427891214e-06, + "loss": 1.1096, + "step": 2800 + }, + { + "epoch": 3.745150501672241, + "grad_norm": 0.8628411680818938, + "learning_rate": 1.8039712112190938e-06, + "loss": 1.1588, + "step": 2801 + }, + { + "epoch": 3.746488294314381, + "grad_norm": 1.0119211370938128, + "learning_rate": 1.8003837786299399e-06, + "loss": 1.1697, + "step": 2802 + }, + { + "epoch": 3.747826086956522, + "grad_norm": 0.9473621344825409, + "learning_rate": 1.79679913324879e-06, + "loss": 1.123, + "step": 2803 + }, + { + "epoch": 3.749163879598662, + "grad_norm": 0.9978528010653906, + "learning_rate": 1.7932172781982532e-06, + "loss": 0.9485, + "step": 2804 + }, + { + "epoch": 3.7505016722408024, + "grad_norm": 1.022487164990879, + "learning_rate": 1.7896382165985094e-06, + "loss": 0.9576, + "step": 2805 + }, + { + "epoch": 3.751839464882943, + "grad_norm": 0.9325091115371175, + 
"learning_rate": 1.7860619515673034e-06, + "loss": 1.1665, + "step": 2806 + }, + { + "epoch": 3.7531772575250835, + "grad_norm": 0.9979760882644158, + "learning_rate": 1.7824884862199448e-06, + "loss": 1.1326, + "step": 2807 + }, + { + "epoch": 3.754515050167224, + "grad_norm": 0.848143209328818, + "learning_rate": 1.7789178236693045e-06, + "loss": 1.0706, + "step": 2808 + }, + { + "epoch": 3.7558528428093645, + "grad_norm": 1.0455695127797928, + "learning_rate": 1.7753499670258106e-06, + "loss": 0.8939, + "step": 2809 + }, + { + "epoch": 3.7571906354515052, + "grad_norm": 0.7426873709380583, + "learning_rate": 1.771784919397449e-06, + "loss": 1.3008, + "step": 2810 + }, + { + "epoch": 3.7585284280936455, + "grad_norm": 1.164801737644014, + "learning_rate": 1.768222683889757e-06, + "loss": 1.1714, + "step": 2811 + }, + { + "epoch": 3.759866220735786, + "grad_norm": 1.0798320149844223, + "learning_rate": 1.764663263605823e-06, + "loss": 1.0454, + "step": 2812 + }, + { + "epoch": 3.7612040133779265, + "grad_norm": 0.7643034039627744, + "learning_rate": 1.7611066616462824e-06, + "loss": 1.0088, + "step": 2813 + }, + { + "epoch": 3.762541806020067, + "grad_norm": 0.8072809948841719, + "learning_rate": 1.7575528811093168e-06, + "loss": 1.037, + "step": 2814 + }, + { + "epoch": 3.763879598662207, + "grad_norm": 0.8347043729092293, + "learning_rate": 1.7540019250906481e-06, + "loss": 1.0216, + "step": 2815 + }, + { + "epoch": 3.765217391304348, + "grad_norm": 0.9583417118484997, + "learning_rate": 1.75045379668354e-06, + "loss": 0.936, + "step": 2816 + }, + { + "epoch": 3.766555183946488, + "grad_norm": 1.0265881765641254, + "learning_rate": 1.746908498978791e-06, + "loss": 1.1239, + "step": 2817 + }, + { + "epoch": 3.767892976588629, + "grad_norm": 1.1494937061928667, + "learning_rate": 1.7433660350647347e-06, + "loss": 0.9135, + "step": 2818 + }, + { + "epoch": 3.769230769230769, + "grad_norm": 1.2042031363152312, + "learning_rate": 1.7398264080272371e-06, + "loss": 0.9698, + "step": 2819 + }, + { + "epoch": 3.77056856187291, + "grad_norm": 1.0483081349211367, + "learning_rate": 1.7362896209496894e-06, + "loss": 1.1464, + "step": 2820 + }, + { + "epoch": 3.77190635451505, + "grad_norm": 1.0251600047412552, + "learning_rate": 1.732755676913015e-06, + "loss": 1.0378, + "step": 2821 + }, + { + "epoch": 3.7732441471571905, + "grad_norm": 1.136486496774369, + "learning_rate": 1.7292245789956552e-06, + "loss": 1.0855, + "step": 2822 + }, + { + "epoch": 3.774581939799331, + "grad_norm": 1.1782187246732656, + "learning_rate": 1.7256963302735752e-06, + "loss": 1.0642, + "step": 2823 + }, + { + "epoch": 3.7759197324414715, + "grad_norm": 1.2857021624300164, + "learning_rate": 1.7221709338202558e-06, + "loss": 0.9606, + "step": 2824 + }, + { + "epoch": 3.777257525083612, + "grad_norm": 1.3906441741579114, + "learning_rate": 1.718648392706695e-06, + "loss": 0.9389, + "step": 2825 + }, + { + "epoch": 3.7785953177257525, + "grad_norm": 0.9650541711866645, + "learning_rate": 1.715128710001403e-06, + "loss": 1.1558, + "step": 2826 + }, + { + "epoch": 3.779933110367893, + "grad_norm": 1.009400716622834, + "learning_rate": 1.7116118887703997e-06, + "loss": 1.1427, + "step": 2827 + }, + { + "epoch": 3.7812709030100335, + "grad_norm": 1.1802142519985026, + "learning_rate": 1.708097932077213e-06, + "loss": 0.9314, + "step": 2828 + }, + { + "epoch": 3.782608695652174, + "grad_norm": 0.7928905445377001, + "learning_rate": 1.7045868429828745e-06, + "loss": 1.1754, + "step": 2829 + }, + { + "epoch": 3.7839464882943146, 
+ "grad_norm": 1.0619173302304947, + "learning_rate": 1.7010786245459166e-06, + "loss": 1.0011, + "step": 2830 + }, + { + "epoch": 3.785284280936455, + "grad_norm": 1.0888320224979053, + "learning_rate": 1.697573279822377e-06, + "loss": 0.9708, + "step": 2831 + }, + { + "epoch": 3.786622073578595, + "grad_norm": 0.9243475040610646, + "learning_rate": 1.6940708118657838e-06, + "loss": 1.184, + "step": 2832 + }, + { + "epoch": 3.787959866220736, + "grad_norm": 0.8192833792354464, + "learning_rate": 1.6905712237271616e-06, + "loss": 1.3003, + "step": 2833 + }, + { + "epoch": 3.789297658862876, + "grad_norm": 1.1917299474816203, + "learning_rate": 1.6870745184550257e-06, + "loss": 0.9449, + "step": 2834 + }, + { + "epoch": 3.7906354515050165, + "grad_norm": 0.9490884873099569, + "learning_rate": 1.6835806990953802e-06, + "loss": 0.955, + "step": 2835 + }, + { + "epoch": 3.791973244147157, + "grad_norm": 0.9259453807575193, + "learning_rate": 1.680089768691716e-06, + "loss": 0.9786, + "step": 2836 + }, + { + "epoch": 3.793311036789298, + "grad_norm": 0.9654738973154228, + "learning_rate": 1.6766017302850068e-06, + "loss": 1.0135, + "step": 2837 + }, + { + "epoch": 3.794648829431438, + "grad_norm": 0.8471952038768596, + "learning_rate": 1.6731165869137073e-06, + "loss": 1.1473, + "step": 2838 + }, + { + "epoch": 3.7959866220735785, + "grad_norm": 0.9835076655099491, + "learning_rate": 1.6696343416137495e-06, + "loss": 0.9816, + "step": 2839 + }, + { + "epoch": 3.7973244147157192, + "grad_norm": 0.8384328298161975, + "learning_rate": 1.6661549974185426e-06, + "loss": 1.0332, + "step": 2840 + }, + { + "epoch": 3.7986622073578595, + "grad_norm": 0.8450977386021993, + "learning_rate": 1.6626785573589667e-06, + "loss": 1.0971, + "step": 2841 + }, + { + "epoch": 3.8, + "grad_norm": 1.1007067469191312, + "learning_rate": 1.6592050244633733e-06, + "loss": 1.1874, + "step": 2842 + }, + { + "epoch": 3.8013377926421406, + "grad_norm": 1.125906185854442, + "learning_rate": 1.6557344017575817e-06, + "loss": 1.112, + "step": 2843 + }, + { + "epoch": 3.802675585284281, + "grad_norm": 0.8016804691801273, + "learning_rate": 1.6522666922648745e-06, + "loss": 1.205, + "step": 2844 + }, + { + "epoch": 3.804013377926421, + "grad_norm": 0.9434006866973588, + "learning_rate": 1.6488018990059985e-06, + "loss": 1.0498, + "step": 2845 + }, + { + "epoch": 3.805351170568562, + "grad_norm": 1.0416878944276322, + "learning_rate": 1.6453400249991587e-06, + "loss": 1.119, + "step": 2846 + }, + { + "epoch": 3.8066889632107026, + "grad_norm": 0.8969926508745716, + "learning_rate": 1.6418810732600177e-06, + "loss": 1.139, + "step": 2847 + }, + { + "epoch": 3.808026755852843, + "grad_norm": 0.9453582934615907, + "learning_rate": 1.6384250468016932e-06, + "loss": 1.1208, + "step": 2848 + }, + { + "epoch": 3.809364548494983, + "grad_norm": 0.8919105175142065, + "learning_rate": 1.6349719486347533e-06, + "loss": 1.1904, + "step": 2849 + }, + { + "epoch": 3.810702341137124, + "grad_norm": 0.7439478950192581, + "learning_rate": 1.6315217817672142e-06, + "loss": 0.9226, + "step": 2850 + }, + { + "epoch": 3.812040133779264, + "grad_norm": 0.9287891369116521, + "learning_rate": 1.6280745492045435e-06, + "loss": 1.0713, + "step": 2851 + }, + { + "epoch": 3.8133779264214045, + "grad_norm": 0.8813965086446669, + "learning_rate": 1.6246302539496483e-06, + "loss": 1.0867, + "step": 2852 + }, + { + "epoch": 3.8147157190635452, + "grad_norm": 0.8709129280571959, + "learning_rate": 1.6211888990028785e-06, + "loss": 1.1811, + "step": 2853 + }, + { 
+ "epoch": 3.8160535117056855, + "grad_norm": 0.8989475057678912, + "learning_rate": 1.617750487362022e-06, + "loss": 1.182, + "step": 2854 + }, + { + "epoch": 3.8173913043478263, + "grad_norm": 0.739254031120975, + "learning_rate": 1.614315022022303e-06, + "loss": 1.0328, + "step": 2855 + }, + { + "epoch": 3.8187290969899665, + "grad_norm": 1.044732876159129, + "learning_rate": 1.6108825059763794e-06, + "loss": 0.8423, + "step": 2856 + }, + { + "epoch": 3.8200668896321073, + "grad_norm": 0.8896956728238575, + "learning_rate": 1.6074529422143398e-06, + "loss": 1.0776, + "step": 2857 + }, + { + "epoch": 3.8214046822742476, + "grad_norm": 0.7950320774154257, + "learning_rate": 1.6040263337237017e-06, + "loss": 1.0109, + "step": 2858 + }, + { + "epoch": 3.822742474916388, + "grad_norm": 1.0389256373637143, + "learning_rate": 1.6006026834894068e-06, + "loss": 1.0409, + "step": 2859 + }, + { + "epoch": 3.8240802675585286, + "grad_norm": 0.7834137123805304, + "learning_rate": 1.5971819944938194e-06, + "loss": 1.0629, + "step": 2860 + }, + { + "epoch": 3.825418060200669, + "grad_norm": 0.8969361180378026, + "learning_rate": 1.5937642697167288e-06, + "loss": 1.177, + "step": 2861 + }, + { + "epoch": 3.826755852842809, + "grad_norm": 0.9745806521381526, + "learning_rate": 1.5903495121353373e-06, + "loss": 1.0575, + "step": 2862 + }, + { + "epoch": 3.82809364548495, + "grad_norm": 0.912196239526788, + "learning_rate": 1.5869377247242645e-06, + "loss": 0.9508, + "step": 2863 + }, + { + "epoch": 3.82943143812709, + "grad_norm": 0.997768074775593, + "learning_rate": 1.5835289104555417e-06, + "loss": 1.2125, + "step": 2864 + }, + { + "epoch": 3.830769230769231, + "grad_norm": 0.8285197419451076, + "learning_rate": 1.5801230722986104e-06, + "loss": 1.1643, + "step": 2865 + }, + { + "epoch": 3.832107023411371, + "grad_norm": 1.0054238735048069, + "learning_rate": 1.5767202132203207e-06, + "loss": 1.2888, + "step": 2866 + }, + { + "epoch": 3.833444816053512, + "grad_norm": 0.8389845258311182, + "learning_rate": 1.5733203361849265e-06, + "loss": 0.9394, + "step": 2867 + }, + { + "epoch": 3.8347826086956522, + "grad_norm": 0.8679751699213366, + "learning_rate": 1.5699234441540845e-06, + "loss": 1.3396, + "step": 2868 + }, + { + "epoch": 3.8361204013377925, + "grad_norm": 0.8830319086039121, + "learning_rate": 1.5665295400868513e-06, + "loss": 1.116, + "step": 2869 + }, + { + "epoch": 3.8374581939799333, + "grad_norm": 0.8108978074588221, + "learning_rate": 1.5631386269396798e-06, + "loss": 1.0862, + "step": 2870 + }, + { + "epoch": 3.8387959866220736, + "grad_norm": 0.8127759344937346, + "learning_rate": 1.5597507076664187e-06, + "loss": 1.2515, + "step": 2871 + }, + { + "epoch": 3.840133779264214, + "grad_norm": 0.7810270283069889, + "learning_rate": 1.5563657852183072e-06, + "loss": 1.1139, + "step": 2872 + }, + { + "epoch": 3.8414715719063546, + "grad_norm": 0.7246364804164164, + "learning_rate": 1.5529838625439763e-06, + "loss": 1.2376, + "step": 2873 + }, + { + "epoch": 3.842809364548495, + "grad_norm": 0.9387183638364117, + "learning_rate": 1.549604942589441e-06, + "loss": 0.9708, + "step": 2874 + }, + { + "epoch": 3.8441471571906356, + "grad_norm": 0.986196500899491, + "learning_rate": 1.546229028298103e-06, + "loss": 1.1789, + "step": 2875 + }, + { + "epoch": 3.845484949832776, + "grad_norm": 0.8203350547867899, + "learning_rate": 1.5428561226107442e-06, + "loss": 0.9853, + "step": 2876 + }, + { + "epoch": 3.8468227424749166, + "grad_norm": 0.9860318634941652, + "learning_rate": 
1.5394862284655266e-06, + "loss": 0.9592, + "step": 2877 + }, + { + "epoch": 3.848160535117057, + "grad_norm": 1.1476293190167852, + "learning_rate": 1.5361193487979881e-06, + "loss": 1.1171, + "step": 2878 + }, + { + "epoch": 3.849498327759197, + "grad_norm": 0.9798683282858243, + "learning_rate": 1.5327554865410415e-06, + "loss": 1.2147, + "step": 2879 + }, + { + "epoch": 3.850836120401338, + "grad_norm": 1.0982009011794278, + "learning_rate": 1.5293946446249686e-06, + "loss": 1.0341, + "step": 2880 + }, + { + "epoch": 3.8521739130434782, + "grad_norm": 0.9432057578834805, + "learning_rate": 1.526036825977426e-06, + "loss": 1.0292, + "step": 2881 + }, + { + "epoch": 3.8535117056856185, + "grad_norm": 1.2440083775942758, + "learning_rate": 1.5226820335234316e-06, + "loss": 1.0097, + "step": 2882 + }, + { + "epoch": 3.8548494983277592, + "grad_norm": 1.263116619403076, + "learning_rate": 1.5193302701853674e-06, + "loss": 1.1814, + "step": 2883 + }, + { + "epoch": 3.8561872909698995, + "grad_norm": 0.9574191354129671, + "learning_rate": 1.5159815388829784e-06, + "loss": 1.1751, + "step": 2884 + }, + { + "epoch": 3.8575250836120403, + "grad_norm": 1.086619560979527, + "learning_rate": 1.5126358425333677e-06, + "loss": 1.1612, + "step": 2885 + }, + { + "epoch": 3.8588628762541806, + "grad_norm": 1.0585123612080123, + "learning_rate": 1.509293184050995e-06, + "loss": 1.2251, + "step": 2886 + }, + { + "epoch": 3.8602006688963213, + "grad_norm": 1.0535628919881639, + "learning_rate": 1.5059535663476731e-06, + "loss": 1.0493, + "step": 2887 + }, + { + "epoch": 3.8615384615384616, + "grad_norm": 0.9801317320868471, + "learning_rate": 1.5026169923325668e-06, + "loss": 1.142, + "step": 2888 + }, + { + "epoch": 3.862876254180602, + "grad_norm": 1.1566052205564246, + "learning_rate": 1.499283464912188e-06, + "loss": 0.9624, + "step": 2889 + }, + { + "epoch": 3.8642140468227426, + "grad_norm": 0.9124074696927102, + "learning_rate": 1.4959529869903948e-06, + "loss": 1.03, + "step": 2890 + }, + { + "epoch": 3.865551839464883, + "grad_norm": 0.9807658588687818, + "learning_rate": 1.4926255614683931e-06, + "loss": 1.3967, + "step": 2891 + }, + { + "epoch": 3.866889632107023, + "grad_norm": 0.9376322560147929, + "learning_rate": 1.4893011912447248e-06, + "loss": 1.1086, + "step": 2892 + }, + { + "epoch": 3.868227424749164, + "grad_norm": 0.8404174396047467, + "learning_rate": 1.4859798792152713e-06, + "loss": 1.1302, + "step": 2893 + }, + { + "epoch": 3.869565217391304, + "grad_norm": 0.8904987621913899, + "learning_rate": 1.4826616282732509e-06, + "loss": 1.1516, + "step": 2894 + }, + { + "epoch": 3.870903010033445, + "grad_norm": 0.9181391547817395, + "learning_rate": 1.4793464413092161e-06, + "loss": 0.7472, + "step": 2895 + }, + { + "epoch": 3.8722408026755852, + "grad_norm": 1.2267500202147437, + "learning_rate": 1.4760343212110484e-06, + "loss": 1.0238, + "step": 2896 + }, + { + "epoch": 3.873578595317726, + "grad_norm": 1.237684985695305, + "learning_rate": 1.4727252708639589e-06, + "loss": 1.0977, + "step": 2897 + }, + { + "epoch": 3.8749163879598663, + "grad_norm": 1.0359775335942873, + "learning_rate": 1.4694192931504842e-06, + "loss": 1.0594, + "step": 2898 + }, + { + "epoch": 3.8762541806020065, + "grad_norm": 1.0278650636949753, + "learning_rate": 1.4661163909504855e-06, + "loss": 0.9512, + "step": 2899 + }, + { + "epoch": 3.8775919732441473, + "grad_norm": 0.8973357456163312, + "learning_rate": 1.4628165671411426e-06, + "loss": 1.1501, + "step": 2900 + }, + { + "epoch": 3.8789297658862876, + 
"grad_norm": 1.0618057106701666, + "learning_rate": 1.459519824596956e-06, + "loss": 1.0779, + "step": 2901 + }, + { + "epoch": 3.880267558528428, + "grad_norm": 1.0482073556998235, + "learning_rate": 1.4562261661897415e-06, + "loss": 1.0885, + "step": 2902 + }, + { + "epoch": 3.8816053511705686, + "grad_norm": 0.7908455769546419, + "learning_rate": 1.4529355947886265e-06, + "loss": 1.071, + "step": 2903 + }, + { + "epoch": 3.882943143812709, + "grad_norm": 1.1917066279605102, + "learning_rate": 1.4496481132600516e-06, + "loss": 0.9719, + "step": 2904 + }, + { + "epoch": 3.8842809364548496, + "grad_norm": 0.926043541257457, + "learning_rate": 1.4463637244677648e-06, + "loss": 1.0747, + "step": 2905 + }, + { + "epoch": 3.88561872909699, + "grad_norm": 0.90020458131842, + "learning_rate": 1.4430824312728197e-06, + "loss": 0.9377, + "step": 2906 + }, + { + "epoch": 3.8869565217391306, + "grad_norm": 0.9659386789991806, + "learning_rate": 1.4398042365335745e-06, + "loss": 1.0677, + "step": 2907 + }, + { + "epoch": 3.888294314381271, + "grad_norm": 0.832397841856589, + "learning_rate": 1.4365291431056871e-06, + "loss": 1.2764, + "step": 2908 + }, + { + "epoch": 3.8896321070234112, + "grad_norm": 0.8759285295429187, + "learning_rate": 1.4332571538421136e-06, + "loss": 1.0852, + "step": 2909 + }, + { + "epoch": 3.890969899665552, + "grad_norm": 1.1420838162341116, + "learning_rate": 1.4299882715931062e-06, + "loss": 0.9385, + "step": 2910 + }, + { + "epoch": 3.8923076923076922, + "grad_norm": 1.3040271958803875, + "learning_rate": 1.4267224992062134e-06, + "loss": 0.8935, + "step": 2911 + }, + { + "epoch": 3.8936454849498325, + "grad_norm": 1.1504856304966116, + "learning_rate": 1.4234598395262706e-06, + "loss": 1.1313, + "step": 2912 + }, + { + "epoch": 3.8949832775919733, + "grad_norm": 0.8395953895470128, + "learning_rate": 1.4202002953954042e-06, + "loss": 1.0996, + "step": 2913 + }, + { + "epoch": 3.8963210702341136, + "grad_norm": 0.9402456209347255, + "learning_rate": 1.4169438696530246e-06, + "loss": 1.2065, + "step": 2914 + }, + { + "epoch": 3.8976588628762543, + "grad_norm": 0.8961001564195932, + "learning_rate": 1.4136905651358284e-06, + "loss": 1.162, + "step": 2915 + }, + { + "epoch": 3.8989966555183946, + "grad_norm": 0.8551890882547396, + "learning_rate": 1.410440384677791e-06, + "loss": 1.0646, + "step": 2916 + }, + { + "epoch": 3.9003344481605353, + "grad_norm": 0.739006176712536, + "learning_rate": 1.4071933311101675e-06, + "loss": 0.9988, + "step": 2917 + }, + { + "epoch": 3.9016722408026756, + "grad_norm": 0.8124637645432263, + "learning_rate": 1.4039494072614884e-06, + "loss": 0.9335, + "step": 2918 + }, + { + "epoch": 3.903010033444816, + "grad_norm": 0.7606340234907115, + "learning_rate": 1.4007086159575595e-06, + "loss": 0.9152, + "step": 2919 + }, + { + "epoch": 3.9043478260869566, + "grad_norm": 0.9256019659678446, + "learning_rate": 1.3974709600214541e-06, + "loss": 1.0577, + "step": 2920 + }, + { + "epoch": 3.905685618729097, + "grad_norm": 0.7144033116600553, + "learning_rate": 1.3942364422735205e-06, + "loss": 1.0, + "step": 2921 + }, + { + "epoch": 3.907023411371237, + "grad_norm": 0.8814286467712614, + "learning_rate": 1.3910050655313679e-06, + "loss": 0.9857, + "step": 2922 + }, + { + "epoch": 3.908361204013378, + "grad_norm": 0.7586669067060352, + "learning_rate": 1.3877768326098712e-06, + "loss": 1.0519, + "step": 2923 + }, + { + "epoch": 3.9096989966555182, + "grad_norm": 0.7729159851731855, + "learning_rate": 1.3845517463211667e-06, + "loss": 1.0185, + "step": 
2924 + }, + { + "epoch": 3.911036789297659, + "grad_norm": 0.8532846106231734, + "learning_rate": 1.3813298094746491e-06, + "loss": 1.1643, + "step": 2925 + }, + { + "epoch": 3.9123745819397993, + "grad_norm": 0.9119485955624689, + "learning_rate": 1.3781110248769709e-06, + "loss": 0.8927, + "step": 2926 + }, + { + "epoch": 3.91371237458194, + "grad_norm": 0.9665780888011636, + "learning_rate": 1.374895395332037e-06, + "loss": 0.9672, + "step": 2927 + }, + { + "epoch": 3.9150501672240803, + "grad_norm": 0.9648249513673769, + "learning_rate": 1.371682923641005e-06, + "loss": 1.0399, + "step": 2928 + }, + { + "epoch": 3.9163879598662206, + "grad_norm": 0.9056512625236489, + "learning_rate": 1.3684736126022812e-06, + "loss": 1.1677, + "step": 2929 + }, + { + "epoch": 3.9177257525083613, + "grad_norm": 1.061331498933367, + "learning_rate": 1.3652674650115193e-06, + "loss": 1.3042, + "step": 2930 + }, + { + "epoch": 3.9190635451505016, + "grad_norm": 1.0619272220477896, + "learning_rate": 1.362064483661617e-06, + "loss": 1.0321, + "step": 2931 + }, + { + "epoch": 3.920401337792642, + "grad_norm": 1.149052734884213, + "learning_rate": 1.3588646713427128e-06, + "loss": 1.0026, + "step": 2932 + }, + { + "epoch": 3.9217391304347826, + "grad_norm": 0.9480283569446559, + "learning_rate": 1.3556680308421865e-06, + "loss": 1.143, + "step": 2933 + }, + { + "epoch": 3.9230769230769234, + "grad_norm": 0.8673152405938298, + "learning_rate": 1.352474564944653e-06, + "loss": 1.1869, + "step": 2934 + }, + { + "epoch": 3.9244147157190636, + "grad_norm": 0.9858701426438153, + "learning_rate": 1.349284276431963e-06, + "loss": 1.0889, + "step": 2935 + }, + { + "epoch": 3.925752508361204, + "grad_norm": 0.8139874811502765, + "learning_rate": 1.3460971680831996e-06, + "loss": 1.2517, + "step": 2936 + }, + { + "epoch": 3.9270903010033447, + "grad_norm": 0.8519194379494244, + "learning_rate": 1.3429132426746743e-06, + "loss": 1.1004, + "step": 2937 + }, + { + "epoch": 3.928428093645485, + "grad_norm": 1.0396059023238209, + "learning_rate": 1.339732502979928e-06, + "loss": 1.0183, + "step": 2938 + }, + { + "epoch": 3.9297658862876252, + "grad_norm": 0.8895402383668675, + "learning_rate": 1.3365549517697234e-06, + "loss": 1.0993, + "step": 2939 + }, + { + "epoch": 3.931103678929766, + "grad_norm": 0.9544775883105664, + "learning_rate": 1.3333805918120473e-06, + "loss": 1.1633, + "step": 2940 + }, + { + "epoch": 3.9324414715719063, + "grad_norm": 0.8857275079511229, + "learning_rate": 1.33020942587211e-06, + "loss": 1.0068, + "step": 2941 + }, + { + "epoch": 3.9337792642140466, + "grad_norm": 1.103794952599576, + "learning_rate": 1.3270414567123342e-06, + "loss": 1.0254, + "step": 2942 + }, + { + "epoch": 3.9351170568561873, + "grad_norm": 1.0901273515785135, + "learning_rate": 1.3238766870923592e-06, + "loss": 1.0954, + "step": 2943 + }, + { + "epoch": 3.936454849498328, + "grad_norm": 1.033403198652052, + "learning_rate": 1.3207151197690392e-06, + "loss": 0.8708, + "step": 2944 + }, + { + "epoch": 3.9377926421404683, + "grad_norm": 1.0108016268304647, + "learning_rate": 1.3175567574964372e-06, + "loss": 1.0035, + "step": 2945 + }, + { + "epoch": 3.9391304347826086, + "grad_norm": 0.7437893860800271, + "learning_rate": 1.3144016030258244e-06, + "loss": 1.0037, + "step": 2946 + }, + { + "epoch": 3.9404682274247493, + "grad_norm": 1.123737048276975, + "learning_rate": 1.3112496591056778e-06, + "loss": 1.0091, + "step": 2947 + }, + { + "epoch": 3.9418060200668896, + "grad_norm": 0.9009601269785793, + "learning_rate": 
1.3081009284816776e-06, + "loss": 1.0476, + "step": 2948 + }, + { + "epoch": 3.94314381270903, + "grad_norm": 0.9458104225026971, + "learning_rate": 1.3049554138967052e-06, + "loss": 0.8625, + "step": 2949 + }, + { + "epoch": 3.9444816053511706, + "grad_norm": 1.0207893197030595, + "learning_rate": 1.301813118090839e-06, + "loss": 0.9917, + "step": 2950 + }, + { + "epoch": 3.945819397993311, + "grad_norm": 0.8810484497059263, + "learning_rate": 1.2986740438013579e-06, + "loss": 0.7601, + "step": 2951 + }, + { + "epoch": 3.9471571906354512, + "grad_norm": 0.7618522256182519, + "learning_rate": 1.2955381937627293e-06, + "loss": 1.1673, + "step": 2952 + }, + { + "epoch": 3.948494983277592, + "grad_norm": 1.0152797414001336, + "learning_rate": 1.2924055707066141e-06, + "loss": 1.021, + "step": 2953 + }, + { + "epoch": 3.9498327759197327, + "grad_norm": 0.9382419182934473, + "learning_rate": 1.2892761773618628e-06, + "loss": 1.1815, + "step": 2954 + }, + { + "epoch": 3.951170568561873, + "grad_norm": 0.9830246460984771, + "learning_rate": 1.286150016454511e-06, + "loss": 1.1614, + "step": 2955 + }, + { + "epoch": 3.9525083612040133, + "grad_norm": 1.0255389505985097, + "learning_rate": 1.2830270907077797e-06, + "loss": 1.1688, + "step": 2956 + }, + { + "epoch": 3.953846153846154, + "grad_norm": 0.8365409388322737, + "learning_rate": 1.279907402842071e-06, + "loss": 1.0129, + "step": 2957 + }, + { + "epoch": 3.9551839464882943, + "grad_norm": 0.8405119943826836, + "learning_rate": 1.2767909555749676e-06, + "loss": 0.924, + "step": 2958 + }, + { + "epoch": 3.9565217391304346, + "grad_norm": 1.1815720339372906, + "learning_rate": 1.2736777516212267e-06, + "loss": 1.0908, + "step": 2959 + }, + { + "epoch": 3.9578595317725753, + "grad_norm": 0.8781548545426815, + "learning_rate": 1.2705677936927841e-06, + "loss": 1.1362, + "step": 2960 + }, + { + "epoch": 3.9591973244147156, + "grad_norm": 1.2247341698923335, + "learning_rate": 1.267461084498744e-06, + "loss": 1.0291, + "step": 2961 + }, + { + "epoch": 3.9605351170568563, + "grad_norm": 0.8102838890199839, + "learning_rate": 1.2643576267453832e-06, + "loss": 0.9935, + "step": 2962 + }, + { + "epoch": 3.9618729096989966, + "grad_norm": 1.0212776042753715, + "learning_rate": 1.2612574231361463e-06, + "loss": 1.0039, + "step": 2963 + }, + { + "epoch": 3.9632107023411374, + "grad_norm": 0.9600635629777312, + "learning_rate": 1.2581604763716404e-06, + "loss": 1.0407, + "step": 2964 + }, + { + "epoch": 3.9645484949832777, + "grad_norm": 0.877870563990983, + "learning_rate": 1.2550667891496394e-06, + "loss": 1.1433, + "step": 2965 + }, + { + "epoch": 3.965886287625418, + "grad_norm": 1.2020277643430024, + "learning_rate": 1.2519763641650739e-06, + "loss": 0.9639, + "step": 2966 + }, + { + "epoch": 3.9672240802675587, + "grad_norm": 0.8294016198772269, + "learning_rate": 1.2488892041100364e-06, + "loss": 1.0181, + "step": 2967 + }, + { + "epoch": 3.968561872909699, + "grad_norm": 0.860347121483889, + "learning_rate": 1.2458053116737722e-06, + "loss": 1.1288, + "step": 2968 + }, + { + "epoch": 3.9698996655518393, + "grad_norm": 0.9212334648002705, + "learning_rate": 1.2427246895426826e-06, + "loss": 1.237, + "step": 2969 + }, + { + "epoch": 3.97123745819398, + "grad_norm": 0.7623522620256389, + "learning_rate": 1.2396473404003162e-06, + "loss": 1.1031, + "step": 2970 + }, + { + "epoch": 3.9725752508361203, + "grad_norm": 0.9711661770303537, + "learning_rate": 1.2365732669273778e-06, + "loss": 1.0491, + "step": 2971 + }, + { + "epoch": 3.973913043478261, + 
"grad_norm": 1.0324457469385886, + "learning_rate": 1.233502471801712e-06, + "loss": 0.989, + "step": 2972 + }, + { + "epoch": 3.9752508361204013, + "grad_norm": 0.9276100879674521, + "learning_rate": 1.2304349576983094e-06, + "loss": 1.0325, + "step": 2973 + }, + { + "epoch": 3.976588628762542, + "grad_norm": 0.893542401419334, + "learning_rate": 1.2273707272893038e-06, + "loss": 1.0882, + "step": 2974 + }, + { + "epoch": 3.9779264214046823, + "grad_norm": 1.069014821252538, + "learning_rate": 1.2243097832439672e-06, + "loss": 1.0647, + "step": 2975 + }, + { + "epoch": 3.9792642140468226, + "grad_norm": 1.268270529700524, + "learning_rate": 1.2212521282287093e-06, + "loss": 1.1053, + "step": 2976 + }, + { + "epoch": 3.9806020066889634, + "grad_norm": 1.0267323836433249, + "learning_rate": 1.2181977649070749e-06, + "loss": 1.0401, + "step": 2977 + }, + { + "epoch": 3.9819397993311036, + "grad_norm": 0.9310362599778057, + "learning_rate": 1.2151466959397406e-06, + "loss": 1.2597, + "step": 2978 + }, + { + "epoch": 3.983277591973244, + "grad_norm": 0.7261098368132611, + "learning_rate": 1.2120989239845149e-06, + "loss": 1.1791, + "step": 2979 + }, + { + "epoch": 3.9846153846153847, + "grad_norm": 1.0648814561634923, + "learning_rate": 1.209054451696331e-06, + "loss": 1.0335, + "step": 2980 + }, + { + "epoch": 3.985953177257525, + "grad_norm": 0.7797375502953175, + "learning_rate": 1.206013281727253e-06, + "loss": 1.0486, + "step": 2981 + }, + { + "epoch": 3.9872909698996657, + "grad_norm": 0.6796371218587165, + "learning_rate": 1.202975416726464e-06, + "loss": 0.8865, + "step": 2982 + }, + { + "epoch": 3.988628762541806, + "grad_norm": 0.883927882092417, + "learning_rate": 1.1999408593402688e-06, + "loss": 1.1026, + "step": 2983 + }, + { + "epoch": 3.9899665551839467, + "grad_norm": 0.9578231928825651, + "learning_rate": 1.1969096122120927e-06, + "loss": 0.9769, + "step": 2984 + }, + { + "epoch": 3.991304347826087, + "grad_norm": 0.9967946975319333, + "learning_rate": 1.1938816779824753e-06, + "loss": 0.9216, + "step": 2985 + }, + { + "epoch": 3.9926421404682273, + "grad_norm": 0.7319772670932233, + "learning_rate": 1.190857059289071e-06, + "loss": 1.0731, + "step": 2986 + }, + { + "epoch": 3.993979933110368, + "grad_norm": 0.9781539291997341, + "learning_rate": 1.1878357587666468e-06, + "loss": 0.7655, + "step": 2987 + }, + { + "epoch": 3.9953177257525083, + "grad_norm": 1.0050093920706686, + "learning_rate": 1.1848177790470784e-06, + "loss": 1.1054, + "step": 2988 + }, + { + "epoch": 3.9966555183946486, + "grad_norm": 1.0733681599800282, + "learning_rate": 1.1818031227593491e-06, + "loss": 1.1372, + "step": 2989 + }, + { + "epoch": 3.9979933110367893, + "grad_norm": 0.9048277853427193, + "learning_rate": 1.1787917925295467e-06, + "loss": 1.1745, + "step": 2990 + }, + { + "epoch": 3.9993311036789296, + "grad_norm": 0.9309755236789513, + "learning_rate": 1.1757837909808628e-06, + "loss": 1.0139, + "step": 2991 + }, + { + "epoch": 4.0, + "grad_norm": 2.0214905619213175, + "learning_rate": 1.1727791207335876e-06, + "loss": 1.0353, + "step": 2992 + } + ], + "logging_steps": 1, + "max_steps": 3740, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 358518137094144.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": 
null +}