diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,4933 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.03894586524730624, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.5636950353294636e-05, + "grad_norm": 13.800827980041504, + "learning_rate": 2.02e-06, + "loss": 12.1771, + "step": 1 + }, + { + "epoch": 0.00011127390070658927, + "grad_norm": 14.943708419799805, + "learning_rate": 4.04e-06, + "loss": 12.3704, + "step": 2 + }, + { + "epoch": 0.0001669108510598839, + "grad_norm": 12.551050186157227, + "learning_rate": 6.06e-06, + "loss": 11.6178, + "step": 3 + }, + { + "epoch": 0.00022254780141317854, + "grad_norm": 14.947781562805176, + "learning_rate": 8.08e-06, + "loss": 11.8778, + "step": 4 + }, + { + "epoch": 0.0002781847517664732, + "grad_norm": 13.710307121276855, + "learning_rate": 1.0100000000000002e-05, + "loss": 12.3538, + "step": 5 + }, + { + "epoch": 0.0003338217021197678, + "grad_norm": 14.015996932983398, + "learning_rate": 1.212e-05, + "loss": 12.1668, + "step": 6 + }, + { + "epoch": 0.0003894586524730624, + "grad_norm": 13.778985977172852, + "learning_rate": 1.4140000000000002e-05, + "loss": 12.1282, + "step": 7 + }, + { + "epoch": 0.0004450956028263571, + "grad_norm": 17.870742797851562, + "learning_rate": 1.616e-05, + "loss": 12.1028, + "step": 8 + }, + { + "epoch": 0.0005007325531796518, + "grad_norm": 12.128801345825195, + "learning_rate": 1.818e-05, + "loss": 11.0843, + "step": 9 + }, + { + "epoch": 0.0005563695035329464, + "grad_norm": 9.627545356750488, + "learning_rate": 2.0200000000000003e-05, + "loss": 10.888, + "step": 10 + }, + { + "epoch": 0.000612006453886241, + "grad_norm": 9.773977279663086, + "learning_rate": 2.222e-05, + "loss": 10.7024, + "step": 11 + }, + { + "epoch": 0.0006676434042395356, + "grad_norm": 9.958878517150879, + "learning_rate": 2.424e-05, + "loss": 10.4974, + "step": 12 + }, + { + "epoch": 0.0007232803545928302, + "grad_norm": 9.917668342590332, + "learning_rate": 2.6260000000000003e-05, + "loss": 9.9214, + "step": 13 + }, + { + "epoch": 0.0007789173049461248, + "grad_norm": 11.693564414978027, + "learning_rate": 2.8280000000000004e-05, + "loss": 10.4188, + "step": 14 + }, + { + "epoch": 0.0008345542552994196, + "grad_norm": 14.34203052520752, + "learning_rate": 3.0299999999999998e-05, + "loss": 10.479, + "step": 15 + }, + { + "epoch": 0.0008901912056527142, + "grad_norm": 13.203216552734375, + "learning_rate": 3.232e-05, + "loss": 9.7199, + "step": 16 + }, + { + "epoch": 0.0009458281560060088, + "grad_norm": 12.069876670837402, + "learning_rate": 3.434e-05, + "loss": 9.9687, + "step": 17 + }, + { + "epoch": 0.0010014651063593035, + "grad_norm": 9.353058815002441, + "learning_rate": 3.636e-05, + "loss": 9.1979, + "step": 18 + }, + { + "epoch": 0.001057102056712598, + "grad_norm": 9.61816692352295, + "learning_rate": 3.838e-05, + "loss": 9.7821, + "step": 19 + }, + { + "epoch": 0.0011127390070658928, + "grad_norm": 9.409801483154297, + "learning_rate": 4.0400000000000006e-05, + "loss": 8.8036, + "step": 20 + }, + { + "epoch": 0.0011683759574191873, + "grad_norm": 9.369738578796387, + "learning_rate": 4.242e-05, + "loss": 8.6842, + "step": 21 + }, + { + "epoch": 0.001224012907772482, + "grad_norm": 8.6599702835083, + "learning_rate": 4.444e-05, + "loss": 8.5738, + "step": 22 + }, + { + "epoch": 0.0012796498581257765, + "grad_norm": 8.33215045928955, + "learning_rate": 4.6460000000000006e-05, + "loss": 8.8557, + "step": 23 + }, + { + "epoch": 0.0013352868084790712, + "grad_norm": 8.61293888092041, + "learning_rate": 4.848e-05, + "loss": 8.5132, + "step": 24 + }, + { + "epoch": 0.001390923758832366, + "grad_norm": 9.249735832214355, + "learning_rate": 5.05e-05, + "loss": 8.8402, + "step": 25 + }, + { + "epoch": 0.0014465607091856604, + "grad_norm": 7.949573516845703, + "learning_rate": 5.2520000000000005e-05, + "loss": 8.6889, + "step": 26 + }, + { + "epoch": 0.0015021976595389552, + "grad_norm": 9.205979347229004, + "learning_rate": 5.454e-05, + "loss": 7.8971, + "step": 27 + }, + { + "epoch": 0.0015578346098922497, + "grad_norm": 9.982477188110352, + "learning_rate": 5.656000000000001e-05, + "loss": 8.2363, + "step": 28 + }, + { + "epoch": 0.0016134715602455444, + "grad_norm": 9.562348365783691, + "learning_rate": 5.858e-05, + "loss": 8.4463, + "step": 29 + }, + { + "epoch": 0.0016691085105988391, + "grad_norm": 8.459097862243652, + "learning_rate": 6.0599999999999996e-05, + "loss": 7.5909, + "step": 30 + }, + { + "epoch": 0.0017247454609521336, + "grad_norm": 7.977583885192871, + "learning_rate": 6.262000000000001e-05, + "loss": 7.8482, + "step": 31 + }, + { + "epoch": 0.0017803824113054284, + "grad_norm": 10.415498733520508, + "learning_rate": 6.464e-05, + "loss": 7.5464, + "step": 32 + }, + { + "epoch": 0.0018360193616587229, + "grad_norm": 9.659133911132812, + "learning_rate": 6.666e-05, + "loss": 7.8361, + "step": 33 + }, + { + "epoch": 0.0018916563120120176, + "grad_norm": 9.549842834472656, + "learning_rate": 6.868e-05, + "loss": 7.8558, + "step": 34 + }, + { + "epoch": 0.0019472932623653123, + "grad_norm": 8.901015281677246, + "learning_rate": 7.07e-05, + "loss": 7.3988, + "step": 35 + }, + { + "epoch": 0.002002930212718607, + "grad_norm": 7.464367389678955, + "learning_rate": 7.272e-05, + "loss": 7.0652, + "step": 36 + }, + { + "epoch": 0.0020585671630719013, + "grad_norm": 7.1383442878723145, + "learning_rate": 7.474e-05, + "loss": 7.4159, + "step": 37 + }, + { + "epoch": 0.002114204113425196, + "grad_norm": 7.8097124099731445, + "learning_rate": 7.676e-05, + "loss": 7.3087, + "step": 38 + }, + { + "epoch": 0.0021698410637784908, + "grad_norm": 10.265083312988281, + "learning_rate": 7.878e-05, + "loss": 7.3348, + "step": 39 + }, + { + "epoch": 0.0022254780141317855, + "grad_norm": 8.188544273376465, + "learning_rate": 8.080000000000001e-05, + "loss": 6.8637, + "step": 40 + }, + { + "epoch": 0.0022811149644850802, + "grad_norm": 8.422771453857422, + "learning_rate": 8.282e-05, + "loss": 6.7185, + "step": 41 + }, + { + "epoch": 0.0023367519148383745, + "grad_norm": 8.6810941696167, + "learning_rate": 8.484e-05, + "loss": 7.4694, + "step": 42 + }, + { + "epoch": 0.0023923888651916692, + "grad_norm": 8.281686782836914, + "learning_rate": 8.686e-05, + "loss": 6.8384, + "step": 43 + }, + { + "epoch": 0.002448025815544964, + "grad_norm": 8.296032905578613, + "learning_rate": 8.888e-05, + "loss": 7.44, + "step": 44 + }, + { + "epoch": 0.0025036627658982587, + "grad_norm": 7.057008266448975, + "learning_rate": 9.09e-05, + "loss": 7.4695, + "step": 45 + }, + { + "epoch": 0.002559299716251553, + "grad_norm": 7.112790584564209, + "learning_rate": 9.292000000000001e-05, + "loss": 7.2718, + "step": 46 + }, + { + "epoch": 0.0026149366666048477, + "grad_norm": 6.733706951141357, + "learning_rate": 9.494e-05, + "loss": 7.4331, + "step": 47 + }, + { + "epoch": 0.0026705736169581424, + "grad_norm": 6.797787189483643, + "learning_rate": 9.696e-05, + "loss": 6.8889, + "step": 48 + }, + { + "epoch": 0.002726210567311437, + "grad_norm": 8.037755966186523, + "learning_rate": 9.898e-05, + "loss": 7.1156, + "step": 49 + }, + { + "epoch": 0.002781847517664732, + "grad_norm": 7.376313209533691, + "learning_rate": 0.000101, + "loss": 6.3873, + "step": 50 + }, + { + "epoch": 0.002837484468018026, + "grad_norm": 7.13367223739624, + "learning_rate": 0.00010302, + "loss": 6.8425, + "step": 51 + }, + { + "epoch": 0.002893121418371321, + "grad_norm": 7.325440883636475, + "learning_rate": 0.00010504000000000001, + "loss": 6.7685, + "step": 52 + }, + { + "epoch": 0.0029487583687246156, + "grad_norm": 6.7059221267700195, + "learning_rate": 0.00010706000000000001, + "loss": 6.843, + "step": 53 + }, + { + "epoch": 0.0030043953190779103, + "grad_norm": 6.644082546234131, + "learning_rate": 0.00010908, + "loss": 7.0423, + "step": 54 + }, + { + "epoch": 0.003060032269431205, + "grad_norm": 8.201497077941895, + "learning_rate": 0.00011110000000000002, + "loss": 6.2695, + "step": 55 + }, + { + "epoch": 0.0031156692197844994, + "grad_norm": 6.687970161437988, + "learning_rate": 0.00011312000000000001, + "loss": 6.5756, + "step": 56 + }, + { + "epoch": 0.003171306170137794, + "grad_norm": 6.312666416168213, + "learning_rate": 0.00011514, + "loss": 7.0507, + "step": 57 + }, + { + "epoch": 0.003226943120491089, + "grad_norm": 6.983911991119385, + "learning_rate": 0.00011716, + "loss": 7.044, + "step": 58 + }, + { + "epoch": 0.0032825800708443835, + "grad_norm": 6.039947986602783, + "learning_rate": 0.00011918, + "loss": 7.2692, + "step": 59 + }, + { + "epoch": 0.0033382170211976783, + "grad_norm": 6.602978229522705, + "learning_rate": 0.00012119999999999999, + "loss": 7.299, + "step": 60 + }, + { + "epoch": 0.0033938539715509725, + "grad_norm": 5.712470531463623, + "learning_rate": 0.00012322, + "loss": 6.8726, + "step": 61 + }, + { + "epoch": 0.0034494909219042673, + "grad_norm": 7.145991325378418, + "learning_rate": 0.00012524000000000001, + "loss": 7.1443, + "step": 62 + }, + { + "epoch": 0.003505127872257562, + "grad_norm": 6.179739475250244, + "learning_rate": 0.00012726, + "loss": 6.2718, + "step": 63 + }, + { + "epoch": 0.0035607648226108567, + "grad_norm": 6.11101770401001, + "learning_rate": 0.00012928, + "loss": 6.8681, + "step": 64 + }, + { + "epoch": 0.0036164017729641514, + "grad_norm": 5.940270900726318, + "learning_rate": 0.00013130000000000002, + "loss": 6.5271, + "step": 65 + }, + { + "epoch": 0.0036720387233174457, + "grad_norm": 6.0141448974609375, + "learning_rate": 0.00013332, + "loss": 6.8941, + "step": 66 + }, + { + "epoch": 0.0037276756736707405, + "grad_norm": 8.359067916870117, + "learning_rate": 0.00013534000000000002, + "loss": 7.4407, + "step": 67 + }, + { + "epoch": 0.003783312624024035, + "grad_norm": 6.049771308898926, + "learning_rate": 0.00013736, + "loss": 6.9547, + "step": 68 + }, + { + "epoch": 0.00383894957437733, + "grad_norm": 5.933608055114746, + "learning_rate": 0.00013937999999999998, + "loss": 6.2827, + "step": 69 + }, + { + "epoch": 0.0038945865247306246, + "grad_norm": 5.4579033851623535, + "learning_rate": 0.0001414, + "loss": 7.208, + "step": 70 + }, + { + "epoch": 0.003950223475083919, + "grad_norm": 5.986669540405273, + "learning_rate": 0.00014342, + "loss": 6.1897, + "step": 71 + }, + { + "epoch": 0.004005860425437214, + "grad_norm": 5.855253219604492, + "learning_rate": 0.00014544, + "loss": 5.9931, + "step": 72 + }, + { + "epoch": 0.004061497375790508, + "grad_norm": 7.575289249420166, + "learning_rate": 0.00014746, + "loss": 6.3136, + "step": 73 + }, + { + "epoch": 0.004117134326143803, + "grad_norm": 7.413035869598389, + "learning_rate": 0.00014948, + "loss": 6.0242, + "step": 74 + }, + { + "epoch": 0.004172771276497097, + "grad_norm": 7.043323993682861, + "learning_rate": 0.0001515, + "loss": 6.7775, + "step": 75 + }, + { + "epoch": 0.004228408226850392, + "grad_norm": 5.949311256408691, + "learning_rate": 0.00015352, + "loss": 6.2016, + "step": 76 + }, + { + "epoch": 0.004284045177203687, + "grad_norm": 5.563730716705322, + "learning_rate": 0.00015554000000000002, + "loss": 6.5674, + "step": 77 + }, + { + "epoch": 0.0043396821275569816, + "grad_norm": 5.330533504486084, + "learning_rate": 0.00015756, + "loss": 6.237, + "step": 78 + }, + { + "epoch": 0.004395319077910276, + "grad_norm": 5.901546955108643, + "learning_rate": 0.00015958000000000001, + "loss": 6.7449, + "step": 79 + }, + { + "epoch": 0.004450956028263571, + "grad_norm": 5.803636074066162, + "learning_rate": 0.00016160000000000002, + "loss": 6.2914, + "step": 80 + }, + { + "epoch": 0.004506592978616866, + "grad_norm": 5.663641452789307, + "learning_rate": 0.00016362, + "loss": 6.533, + "step": 81 + }, + { + "epoch": 0.0045622299289701605, + "grad_norm": 5.951206684112549, + "learning_rate": 0.00016564, + "loss": 6.723, + "step": 82 + }, + { + "epoch": 0.004617866879323454, + "grad_norm": 7.9221720695495605, + "learning_rate": 0.00016766, + "loss": 6.8039, + "step": 83 + }, + { + "epoch": 0.004673503829676749, + "grad_norm": 6.168700218200684, + "learning_rate": 0.00016968, + "loss": 6.7571, + "step": 84 + }, + { + "epoch": 0.004729140780030044, + "grad_norm": 5.759303569793701, + "learning_rate": 0.0001717, + "loss": 6.7284, + "step": 85 + }, + { + "epoch": 0.0047847777303833385, + "grad_norm": 5.594539642333984, + "learning_rate": 0.00017372, + "loss": 6.7883, + "step": 86 + }, + { + "epoch": 0.004840414680736633, + "grad_norm": 5.415343761444092, + "learning_rate": 0.00017574, + "loss": 6.8812, + "step": 87 + }, + { + "epoch": 0.004896051631089928, + "grad_norm": 8.225547790527344, + "learning_rate": 0.00017776, + "loss": 5.7674, + "step": 88 + }, + { + "epoch": 0.004951688581443223, + "grad_norm": 5.967362880706787, + "learning_rate": 0.00017978000000000002, + "loss": 6.8245, + "step": 89 + }, + { + "epoch": 0.005007325531796517, + "grad_norm": 5.206155300140381, + "learning_rate": 0.0001818, + "loss": 6.4984, + "step": 90 + }, + { + "epoch": 0.005062962482149812, + "grad_norm": 5.472154140472412, + "learning_rate": 0.00018382, + "loss": 7.2622, + "step": 91 + }, + { + "epoch": 0.005118599432503106, + "grad_norm": 5.967668056488037, + "learning_rate": 0.00018584000000000002, + "loss": 6.7622, + "step": 92 + }, + { + "epoch": 0.005174236382856401, + "grad_norm": 5.653740406036377, + "learning_rate": 0.00018786, + "loss": 6.8219, + "step": 93 + }, + { + "epoch": 0.005229873333209695, + "grad_norm": 5.179713249206543, + "learning_rate": 0.00018988, + "loss": 6.1042, + "step": 94 + }, + { + "epoch": 0.00528551028356299, + "grad_norm": 6.005762577056885, + "learning_rate": 0.0001919, + "loss": 6.9674, + "step": 95 + }, + { + "epoch": 0.005341147233916285, + "grad_norm": 4.429709434509277, + "learning_rate": 0.00019392, + "loss": 6.1024, + "step": 96 + }, + { + "epoch": 0.00539678418426958, + "grad_norm": 5.628688335418701, + "learning_rate": 0.00019594, + "loss": 5.6687, + "step": 97 + }, + { + "epoch": 0.005452421134622874, + "grad_norm": 5.644309043884277, + "learning_rate": 0.00019796, + "loss": 6.8907, + "step": 98 + }, + { + "epoch": 0.005508058084976169, + "grad_norm": 5.049044132232666, + "learning_rate": 0.00019998, + "loss": 6.1869, + "step": 99 + }, + { + "epoch": 0.005563695035329464, + "grad_norm": 5.269039630889893, + "learning_rate": 0.000202, + "loss": 6.6536, + "step": 100 + }, + { + "epoch": 0.0056193319856827585, + "grad_norm": 5.037814140319824, + "learning_rate": 0.0002019986155169901, + "loss": 6.3661, + "step": 101 + }, + { + "epoch": 0.005674968936036052, + "grad_norm": 5.785567283630371, + "learning_rate": 0.00020199446210591673, + "loss": 6.4969, + "step": 102 + }, + { + "epoch": 0.005730605886389347, + "grad_norm": 4.767268657684326, + "learning_rate": 0.00020198753988064772, + "loss": 6.5622, + "step": 103 + }, + { + "epoch": 0.005786242836742642, + "grad_norm": 6.075338363647461, + "learning_rate": 0.0002019778490309594, + "loss": 6.0689, + "step": 104 + }, + { + "epoch": 0.0058418797870959365, + "grad_norm": 5.83820915222168, + "learning_rate": 0.00020196538982253126, + "loss": 6.5399, + "step": 105 + }, + { + "epoch": 0.005897516737449231, + "grad_norm": 5.307482719421387, + "learning_rate": 0.0002019501625969389, + "loss": 6.4257, + "step": 106 + }, + { + "epoch": 0.005953153687802526, + "grad_norm": 5.115218639373779, + "learning_rate": 0.00020193216777164428, + "loss": 6.1499, + "step": 107 + }, + { + "epoch": 0.006008790638155821, + "grad_norm": 5.3422160148620605, + "learning_rate": 0.0002019114058399847, + "loss": 6.2177, + "step": 108 + }, + { + "epoch": 0.006064427588509115, + "grad_norm": 5.2168474197387695, + "learning_rate": 0.00020188787737115897, + "loss": 6.3908, + "step": 109 + }, + { + "epoch": 0.00612006453886241, + "grad_norm": 6.172547817230225, + "learning_rate": 0.00020186158301021195, + "loss": 6.3804, + "step": 110 + }, + { + "epoch": 0.006175701489215705, + "grad_norm": 5.522261619567871, + "learning_rate": 0.00020183252347801686, + "loss": 6.617, + "step": 111 + }, + { + "epoch": 0.006231338439568999, + "grad_norm": 6.066495895385742, + "learning_rate": 0.00020180069957125544, + "loss": 6.7899, + "step": 112 + }, + { + "epoch": 0.0062869753899222934, + "grad_norm": 5.390618801116943, + "learning_rate": 0.00020176611216239613, + "loss": 6.0983, + "step": 113 + }, + { + "epoch": 0.006342612340275588, + "grad_norm": 4.880346298217773, + "learning_rate": 0.00020172876219967027, + "loss": 6.1, + "step": 114 + }, + { + "epoch": 0.006398249290628883, + "grad_norm": 12.22669506072998, + "learning_rate": 0.00020168865070704594, + "loss": 5.9939, + "step": 115 + }, + { + "epoch": 0.006453886240982178, + "grad_norm": 5.522224426269531, + "learning_rate": 0.00020164577878419994, + "loss": 6.2229, + "step": 116 + }, + { + "epoch": 0.006509523191335472, + "grad_norm": 6.669813632965088, + "learning_rate": 0.00020160014760648774, + "loss": 5.4593, + "step": 117 + }, + { + "epoch": 0.006565160141688767, + "grad_norm": 5.494222640991211, + "learning_rate": 0.00020155175842491107, + "loss": 6.3158, + "step": 118 + }, + { + "epoch": 0.006620797092042062, + "grad_norm": 9.116362571716309, + "learning_rate": 0.00020150061256608387, + "loss": 6.3658, + "step": 119 + }, + { + "epoch": 0.0066764340423953565, + "grad_norm": 5.606349945068359, + "learning_rate": 0.0002014467114321956, + "loss": 6.087, + "step": 120 + }, + { + "epoch": 0.006732070992748651, + "grad_norm": 5.5257887840271, + "learning_rate": 0.00020139005650097317, + "loss": 6.6457, + "step": 121 + }, + { + "epoch": 0.006787707943101945, + "grad_norm": 7.258926868438721, + "learning_rate": 0.00020133064932564002, + "loss": 6.6227, + "step": 122 + }, + { + "epoch": 0.00684334489345524, + "grad_norm": 4.674434185028076, + "learning_rate": 0.000201268491534874, + "loss": 6.4474, + "step": 123 + }, + { + "epoch": 0.0068989818438085345, + "grad_norm": 5.548820972442627, + "learning_rate": 0.00020120358483276227, + "loss": 6.9501, + "step": 124 + }, + { + "epoch": 0.006954618794161829, + "grad_norm": 5.273190021514893, + "learning_rate": 0.00020113593099875486, + "loss": 6.5339, + "step": 125 + }, + { + "epoch": 0.007010255744515124, + "grad_norm": 4.936262607574463, + "learning_rate": 0.00020106553188761582, + "loss": 6.4375, + "step": 126 + }, + { + "epoch": 0.007065892694868419, + "grad_norm": 6.2824273109436035, + "learning_rate": 0.0002009923894293723, + "loss": 6.4268, + "step": 127 + }, + { + "epoch": 0.0071215296452217134, + "grad_norm": 5.193487644195557, + "learning_rate": 0.00020091650562926183, + "loss": 6.4718, + "step": 128 + }, + { + "epoch": 0.007177166595575008, + "grad_norm": 4.972619533538818, + "learning_rate": 0.00020083788256767702, + "loss": 6.1604, + "step": 129 + }, + { + "epoch": 0.007232803545928303, + "grad_norm": 5.467920303344727, + "learning_rate": 0.00020075652240010892, + "loss": 6.0447, + "step": 130 + }, + { + "epoch": 0.007288440496281597, + "grad_norm": 4.896269798278809, + "learning_rate": 0.00020067242735708754, + "loss": 6.8815, + "step": 131 + }, + { + "epoch": 0.0073440774466348915, + "grad_norm": 6.456630229949951, + "learning_rate": 0.00020058559974412102, + "loss": 5.9796, + "step": 132 + }, + { + "epoch": 0.007399714396988186, + "grad_norm": 4.889880180358887, + "learning_rate": 0.00020049604194163217, + "loss": 5.7119, + "step": 133 + }, + { + "epoch": 0.007455351347341481, + "grad_norm": 4.703945159912109, + "learning_rate": 0.00020040375640489343, + "loss": 6.2785, + "step": 134 + }, + { + "epoch": 0.007510988297694776, + "grad_norm": 5.321676731109619, + "learning_rate": 0.00020030874566395943, + "loss": 6.9574, + "step": 135 + }, + { + "epoch": 0.00756662524804807, + "grad_norm": 4.994111061096191, + "learning_rate": 0.00020021101232359757, + "loss": 5.9102, + "step": 136 + }, + { + "epoch": 0.007622262198401365, + "grad_norm": 6.15833854675293, + "learning_rate": 0.00020011055906321676, + "loss": 6.6138, + "step": 137 + }, + { + "epoch": 0.00767789914875466, + "grad_norm": 5.724658489227295, + "learning_rate": 0.0002000073886367939, + "loss": 5.4355, + "step": 138 + }, + { + "epoch": 0.0077335360991079545, + "grad_norm": 9.080390930175781, + "learning_rate": 0.00019990150387279835, + "loss": 6.073, + "step": 139 + }, + { + "epoch": 0.007789173049461249, + "grad_norm": 4.942266941070557, + "learning_rate": 0.00019979290767411438, + "loss": 6.0685, + "step": 140 + }, + { + "epoch": 0.007844809999814543, + "grad_norm": 4.976442337036133, + "learning_rate": 0.00019968160301796163, + "loss": 6.2268, + "step": 141 + }, + { + "epoch": 0.007900446950167839, + "grad_norm": 4.503157615661621, + "learning_rate": 0.0001995675929558135, + "loss": 6.0819, + "step": 142 + }, + { + "epoch": 0.007956083900521133, + "grad_norm": 7.313660144805908, + "learning_rate": 0.0001994508806133134, + "loss": 6.6115, + "step": 143 + }, + { + "epoch": 0.008011720850874428, + "grad_norm": 5.139042377471924, + "learning_rate": 0.0001993314691901892, + "loss": 6.0428, + "step": 144 + }, + { + "epoch": 0.008067357801227722, + "grad_norm": 5.146208763122559, + "learning_rate": 0.00019920936196016534, + "loss": 6.0148, + "step": 145 + }, + { + "epoch": 0.008122994751581016, + "grad_norm": 5.296838760375977, + "learning_rate": 0.00019908456227087326, + "loss": 6.6792, + "step": 146 + }, + { + "epoch": 0.008178631701934311, + "grad_norm": 5.775576114654541, + "learning_rate": 0.00019895707354375945, + "loss": 7.1007, + "step": 147 + }, + { + "epoch": 0.008234268652287605, + "grad_norm": 6.212763786315918, + "learning_rate": 0.00019882689927399174, + "loss": 6.0656, + "step": 148 + }, + { + "epoch": 0.008289905602640901, + "grad_norm": 4.9248046875, + "learning_rate": 0.00019869404303036355, + "loss": 6.9572, + "step": 149 + }, + { + "epoch": 0.008345542552994195, + "grad_norm": 5.322810649871826, + "learning_rate": 0.00019855850845519588, + "loss": 6.0424, + "step": 150 + }, + { + "epoch": 0.00840117950334749, + "grad_norm": 5.905417442321777, + "learning_rate": 0.00019842029926423762, + "loss": 6.2819, + "step": 151 + }, + { + "epoch": 0.008456816453700784, + "grad_norm": 4.72113037109375, + "learning_rate": 0.00019827941924656348, + "loss": 6.1201, + "step": 152 + }, + { + "epoch": 0.00851245340405408, + "grad_norm": 4.623650550842285, + "learning_rate": 0.00019813587226447034, + "loss": 5.9929, + "step": 153 + }, + { + "epoch": 0.008568090354407374, + "grad_norm": 4.896861553192139, + "learning_rate": 0.00019798966225337126, + "loss": 7.0054, + "step": 154 + }, + { + "epoch": 0.008623727304760668, + "grad_norm": 4.321646213531494, + "learning_rate": 0.00019784079322168752, + "loss": 5.7235, + "step": 155 + }, + { + "epoch": 0.008679364255113963, + "grad_norm": 4.686023235321045, + "learning_rate": 0.00019768926925073878, + "loss": 6.5881, + "step": 156 + }, + { + "epoch": 0.008735001205467257, + "grad_norm": 5.237584114074707, + "learning_rate": 0.00019753509449463134, + "loss": 6.0167, + "step": 157 + }, + { + "epoch": 0.008790638155820553, + "grad_norm": 5.139473915100098, + "learning_rate": 0.00019737827318014396, + "loss": 6.9701, + "step": 158 + }, + { + "epoch": 0.008846275106173846, + "grad_norm": 4.788873672485352, + "learning_rate": 0.00019721880960661223, + "loss": 5.5182, + "step": 159 + }, + { + "epoch": 0.008901912056527142, + "grad_norm": 4.913562774658203, + "learning_rate": 0.00019705670814581052, + "loss": 6.2666, + "step": 160 + }, + { + "epoch": 0.008957549006880436, + "grad_norm": 4.9042158126831055, + "learning_rate": 0.0001968919732418323, + "loss": 6.884, + "step": 161 + }, + { + "epoch": 0.009013185957233731, + "grad_norm": 7.520642280578613, + "learning_rate": 0.00019672460941096818, + "loss": 6.7009, + "step": 162 + }, + { + "epoch": 0.009068822907587025, + "grad_norm": 4.975858688354492, + "learning_rate": 0.0001965546212415821, + "loss": 5.5829, + "step": 163 + }, + { + "epoch": 0.009124459857940321, + "grad_norm": 5.106305122375488, + "learning_rate": 0.0001963820133939856, + "loss": 6.2909, + "step": 164 + }, + { + "epoch": 0.009180096808293615, + "grad_norm": 8.0565824508667, + "learning_rate": 0.00019620679060031003, + "loss": 6.3891, + "step": 165 + }, + { + "epoch": 0.009235733758646909, + "grad_norm": 4.544425010681152, + "learning_rate": 0.00019602895766437678, + "loss": 6.1293, + "step": 166 + }, + { + "epoch": 0.009291370709000204, + "grad_norm": 4.567877292633057, + "learning_rate": 0.0001958485194615656, + "loss": 6.0897, + "step": 167 + }, + { + "epoch": 0.009347007659353498, + "grad_norm": 4.860219955444336, + "learning_rate": 0.00019566548093868106, + "loss": 6.6652, + "step": 168 + }, + { + "epoch": 0.009402644609706794, + "grad_norm": 4.439017295837402, + "learning_rate": 0.00019547984711381662, + "loss": 5.6879, + "step": 169 + }, + { + "epoch": 0.009458281560060088, + "grad_norm": 4.715156078338623, + "learning_rate": 0.00019529162307621738, + "loss": 6.5205, + "step": 170 + }, + { + "epoch": 0.009513918510413383, + "grad_norm": 5.1127610206604, + "learning_rate": 0.00019510081398614045, + "loss": 4.9883, + "step": 171 + }, + { + "epoch": 0.009569555460766677, + "grad_norm": 4.963103771209717, + "learning_rate": 0.00019490742507471338, + "loss": 6.3996, + "step": 172 + }, + { + "epoch": 0.009625192411119973, + "grad_norm": 4.485114097595215, + "learning_rate": 0.00019471146164379093, + "loss": 6.3602, + "step": 173 + }, + { + "epoch": 0.009680829361473266, + "grad_norm": 5.5230536460876465, + "learning_rate": 0.00019451292906580948, + "loss": 6.3082, + "step": 174 + }, + { + "epoch": 0.00973646631182656, + "grad_norm": 5.0220561027526855, + "learning_rate": 0.00019431183278363997, + "loss": 6.8835, + "step": 175 + }, + { + "epoch": 0.009792103262179856, + "grad_norm": 4.252419471740723, + "learning_rate": 0.00019410817831043856, + "loss": 6.3393, + "step": 176 + }, + { + "epoch": 0.00984774021253315, + "grad_norm": 5.507422924041748, + "learning_rate": 0.00019390197122949552, + "loss": 7.2248, + "step": 177 + }, + { + "epoch": 0.009903377162886445, + "grad_norm": 4.4019775390625, + "learning_rate": 0.0001936932171940821, + "loss": 5.9893, + "step": 178 + }, + { + "epoch": 0.00995901411323974, + "grad_norm": 4.5496602058410645, + "learning_rate": 0.0001934819219272957, + "loss": 6.062, + "step": 179 + }, + { + "epoch": 0.010014651063593035, + "grad_norm": 3.9321088790893555, + "learning_rate": 0.0001932680912219027, + "loss": 5.5829, + "step": 180 + }, + { + "epoch": 0.010070288013946329, + "grad_norm": 4.952230930328369, + "learning_rate": 0.00019305173094017996, + "loss": 6.421, + "step": 181 + }, + { + "epoch": 0.010125924964299624, + "grad_norm": 4.319357872009277, + "learning_rate": 0.00019283284701375393, + "loss": 5.8002, + "step": 182 + }, + { + "epoch": 0.010181561914652918, + "grad_norm": 4.632691860198975, + "learning_rate": 0.00019261144544343794, + "loss": 5.9414, + "step": 183 + }, + { + "epoch": 0.010237198865006212, + "grad_norm": 4.216100692749023, + "learning_rate": 0.00019238753229906797, + "loss": 6.2408, + "step": 184 + }, + { + "epoch": 0.010292835815359508, + "grad_norm": 4.778911113739014, + "learning_rate": 0.00019216111371933594, + "loss": 5.9821, + "step": 185 + }, + { + "epoch": 0.010348472765712801, + "grad_norm": 4.519885063171387, + "learning_rate": 0.00019193219591162155, + "loss": 6.1568, + "step": 186 + }, + { + "epoch": 0.010404109716066097, + "grad_norm": 5.346129417419434, + "learning_rate": 0.00019170078515182216, + "loss": 6.1002, + "step": 187 + }, + { + "epoch": 0.01045974666641939, + "grad_norm": 4.33414888381958, + "learning_rate": 0.0001914668877841807, + "loss": 5.9836, + "step": 188 + }, + { + "epoch": 0.010515383616772686, + "grad_norm": 4.282817840576172, + "learning_rate": 0.0001912305102211116, + "loss": 5.451, + "step": 189 + }, + { + "epoch": 0.01057102056712598, + "grad_norm": 5.004985332489014, + "learning_rate": 0.00019099165894302515, + "loss": 5.939, + "step": 190 + }, + { + "epoch": 0.010626657517479276, + "grad_norm": 4.402198791503906, + "learning_rate": 0.00019075034049814983, + "loss": 6.1151, + "step": 191 + }, + { + "epoch": 0.01068229446783257, + "grad_norm": 4.642638683319092, + "learning_rate": 0.00019050656150235268, + "loss": 6.2485, + "step": 192 + }, + { + "epoch": 0.010737931418185865, + "grad_norm": 4.481376647949219, + "learning_rate": 0.00019026032863895805, + "loss": 5.6708, + "step": 193 + }, + { + "epoch": 0.01079356836853916, + "grad_norm": 3.6944057941436768, + "learning_rate": 0.0001900116486585642, + "loss": 5.5048, + "step": 194 + }, + { + "epoch": 0.010849205318892453, + "grad_norm": 4.244951248168945, + "learning_rate": 0.0001897605283788585, + "loss": 5.6744, + "step": 195 + }, + { + "epoch": 0.010904842269245749, + "grad_norm": 4.832974910736084, + "learning_rate": 0.0001895069746844302, + "loss": 6.7287, + "step": 196 + }, + { + "epoch": 0.010960479219599042, + "grad_norm": 3.7996647357940674, + "learning_rate": 0.000189250994526582, + "loss": 5.503, + "step": 197 + }, + { + "epoch": 0.011016116169952338, + "grad_norm": 4.708200931549072, + "learning_rate": 0.00018899259492313915, + "loss": 5.5765, + "step": 198 + }, + { + "epoch": 0.011071753120305632, + "grad_norm": 4.379589557647705, + "learning_rate": 0.00018873178295825732, + "loss": 6.1057, + "step": 199 + }, + { + "epoch": 0.011127390070658928, + "grad_norm": 4.480067729949951, + "learning_rate": 0.00018846856578222832, + "loss": 6.5128, + "step": 200 + }, + { + "epoch": 0.011183027021012221, + "grad_norm": 3.9331722259521484, + "learning_rate": 0.00018820295061128394, + "loss": 5.6085, + "step": 201 + }, + { + "epoch": 0.011238663971365517, + "grad_norm": 3.989475727081299, + "learning_rate": 0.00018793494472739831, + "loss": 6.1886, + "step": 202 + }, + { + "epoch": 0.01129430092171881, + "grad_norm": 4.001644134521484, + "learning_rate": 0.00018766455547808813, + "loss": 6.0936, + "step": 203 + }, + { + "epoch": 0.011349937872072105, + "grad_norm": 4.242369174957275, + "learning_rate": 0.0001873917902762112, + "loss": 5.8661, + "step": 204 + }, + { + "epoch": 0.0114055748224254, + "grad_norm": 4.301037788391113, + "learning_rate": 0.0001871166565997633, + "loss": 6.0092, + "step": 205 + }, + { + "epoch": 0.011461211772778694, + "grad_norm": 4.59658145904541, + "learning_rate": 0.00018683916199167325, + "loss": 6.3616, + "step": 206 + }, + { + "epoch": 0.01151684872313199, + "grad_norm": 4.7506513595581055, + "learning_rate": 0.00018655931405959586, + "loss": 5.7288, + "step": 207 + }, + { + "epoch": 0.011572485673485284, + "grad_norm": 5.412206172943115, + "learning_rate": 0.00018627712047570352, + "loss": 6.0094, + "step": 208 + }, + { + "epoch": 0.01162812262383858, + "grad_norm": 4.606365203857422, + "learning_rate": 0.00018599258897647594, + "loss": 5.7258, + "step": 209 + }, + { + "epoch": 0.011683759574191873, + "grad_norm": 4.121314525604248, + "learning_rate": 0.00018570572736248782, + "loss": 5.1908, + "step": 210 + }, + { + "epoch": 0.011739396524545169, + "grad_norm": 4.5832929611206055, + "learning_rate": 0.0001854165434981953, + "loss": 5.7616, + "step": 211 + }, + { + "epoch": 0.011795033474898462, + "grad_norm": 4.121862888336182, + "learning_rate": 0.00018512504531172005, + "loss": 5.8666, + "step": 212 + }, + { + "epoch": 0.011850670425251758, + "grad_norm": 4.314868927001953, + "learning_rate": 0.0001848312407946321, + "loss": 5.5418, + "step": 213 + }, + { + "epoch": 0.011906307375605052, + "grad_norm": 4.383836269378662, + "learning_rate": 0.00018453513800173072, + "loss": 5.9541, + "step": 214 + }, + { + "epoch": 0.011961944325958346, + "grad_norm": 5.118366718292236, + "learning_rate": 0.00018423674505082356, + "loss": 6.6696, + "step": 215 + }, + { + "epoch": 0.012017581276311641, + "grad_norm": 4.667532444000244, + "learning_rate": 0.0001839360701225041, + "loss": 6.0478, + "step": 216 + }, + { + "epoch": 0.012073218226664935, + "grad_norm": 4.4542083740234375, + "learning_rate": 0.00018363312145992737, + "loss": 6.055, + "step": 217 + }, + { + "epoch": 0.01212885517701823, + "grad_norm": 4.726438522338867, + "learning_rate": 0.00018332790736858397, + "loss": 5.6499, + "step": 218 + }, + { + "epoch": 0.012184492127371525, + "grad_norm": 4.818346977233887, + "learning_rate": 0.00018302043621607245, + "loss": 5.9574, + "step": 219 + }, + { + "epoch": 0.01224012907772482, + "grad_norm": 4.342275619506836, + "learning_rate": 0.00018271071643186968, + "loss": 5.978, + "step": 220 + }, + { + "epoch": 0.012295766028078114, + "grad_norm": 4.275669097900391, + "learning_rate": 0.0001823987565071001, + "loss": 6.03, + "step": 221 + }, + { + "epoch": 0.01235140297843141, + "grad_norm": 4.157369136810303, + "learning_rate": 0.00018208456499430256, + "loss": 6.0674, + "step": 222 + }, + { + "epoch": 0.012407039928784704, + "grad_norm": 4.342342853546143, + "learning_rate": 0.00018176815050719615, + "loss": 5.7965, + "step": 223 + }, + { + "epoch": 0.012462676879137997, + "grad_norm": 4.968392848968506, + "learning_rate": 0.00018144952172044381, + "loss": 6.7337, + "step": 224 + }, + { + "epoch": 0.012518313829491293, + "grad_norm": 4.00648307800293, + "learning_rate": 0.00018112868736941477, + "loss": 5.4009, + "step": 225 + }, + { + "epoch": 0.012573950779844587, + "grad_norm": 4.082150459289551, + "learning_rate": 0.00018080565624994474, + "loss": 5.9283, + "step": 226 + }, + { + "epoch": 0.012629587730197882, + "grad_norm": 3.9879908561706543, + "learning_rate": 0.00018048043721809507, + "loss": 6.0883, + "step": 227 + }, + { + "epoch": 0.012685224680551176, + "grad_norm": 4.111201763153076, + "learning_rate": 0.00018015303918990982, + "loss": 5.5457, + "step": 228 + }, + { + "epoch": 0.012740861630904472, + "grad_norm": 4.810683727264404, + "learning_rate": 0.0001798234711411713, + "loss": 5.9197, + "step": 229 + }, + { + "epoch": 0.012796498581257766, + "grad_norm": 4.465317726135254, + "learning_rate": 0.00017949174210715407, + "loss": 6.5193, + "step": 230 + }, + { + "epoch": 0.012852135531611061, + "grad_norm": 4.771295070648193, + "learning_rate": 0.00017915786118237714, + "loss": 6.3658, + "step": 231 + }, + { + "epoch": 0.012907772481964355, + "grad_norm": 4.386283874511719, + "learning_rate": 0.0001788218375203547, + "loss": 6.226, + "step": 232 + }, + { + "epoch": 0.012963409432317649, + "grad_norm": 4.33884334564209, + "learning_rate": 0.00017848368033334528, + "loss": 5.8676, + "step": 233 + }, + { + "epoch": 0.013019046382670945, + "grad_norm": 4.183704853057861, + "learning_rate": 0.00017814339889209887, + "loss": 6.256, + "step": 234 + }, + { + "epoch": 0.013074683333024239, + "grad_norm": 4.691279411315918, + "learning_rate": 0.00017780100252560313, + "loss": 6.5217, + "step": 235 + }, + { + "epoch": 0.013130320283377534, + "grad_norm": 4.341029167175293, + "learning_rate": 0.0001774565006208274, + "loss": 6.5025, + "step": 236 + }, + { + "epoch": 0.013185957233730828, + "grad_norm": 4.256303310394287, + "learning_rate": 0.00017710990262246543, + "loss": 5.7146, + "step": 237 + }, + { + "epoch": 0.013241594184084124, + "grad_norm": 4.13632345199585, + "learning_rate": 0.0001767612180326764, + "loss": 6.1192, + "step": 238 + }, + { + "epoch": 0.013297231134437417, + "grad_norm": 4.3793511390686035, + "learning_rate": 0.00017641045641082453, + "loss": 5.427, + "step": 239 + }, + { + "epoch": 0.013352868084790713, + "grad_norm": 6.428075313568115, + "learning_rate": 0.00017605762737321683, + "loss": 6.4587, + "step": 240 + }, + { + "epoch": 0.013408505035144007, + "grad_norm": 7.261257648468018, + "learning_rate": 0.0001757027405928396, + "loss": 6.3551, + "step": 241 + }, + { + "epoch": 0.013464141985497302, + "grad_norm": 4.533746719360352, + "learning_rate": 0.0001753458057990932, + "loss": 5.9538, + "step": 242 + }, + { + "epoch": 0.013519778935850596, + "grad_norm": 4.003462791442871, + "learning_rate": 0.00017498683277752527, + "loss": 6.6298, + "step": 243 + }, + { + "epoch": 0.01357541588620389, + "grad_norm": 3.955387830734253, + "learning_rate": 0.00017462583136956258, + "loss": 6.1641, + "step": 244 + }, + { + "epoch": 0.013631052836557186, + "grad_norm": 4.206735134124756, + "learning_rate": 0.00017426281147224105, + "loss": 5.905, + "step": 245 + }, + { + "epoch": 0.01368668978691048, + "grad_norm": 3.501095771789551, + "learning_rate": 0.00017389778303793457, + "loss": 5.0529, + "step": 246 + }, + { + "epoch": 0.013742326737263775, + "grad_norm": 4.198129653930664, + "learning_rate": 0.00017353075607408209, + "loss": 5.6073, + "step": 247 + }, + { + "epoch": 0.013797963687617069, + "grad_norm": 4.990898132324219, + "learning_rate": 0.00017316174064291315, + "loss": 5.6831, + "step": 248 + }, + { + "epoch": 0.013853600637970365, + "grad_norm": 4.297070503234863, + "learning_rate": 0.00017279074686117225, + "loss": 6.0396, + "step": 249 + }, + { + "epoch": 0.013909237588323659, + "grad_norm": 3.8654425144195557, + "learning_rate": 0.0001724177848998413, + "loss": 5.5425, + "step": 250 + }, + { + "epoch": 0.013964874538676954, + "grad_norm": 4.994975566864014, + "learning_rate": 0.0001720428649838609, + "loss": 6.4005, + "step": 251 + }, + { + "epoch": 0.014020511489030248, + "grad_norm": 4.118058681488037, + "learning_rate": 0.0001716659973918499, + "loss": 5.7095, + "step": 252 + }, + { + "epoch": 0.014076148439383542, + "grad_norm": 4.50797176361084, + "learning_rate": 0.00017128719245582374, + "loss": 6.2111, + "step": 253 + }, + { + "epoch": 0.014131785389736837, + "grad_norm": 4.260339260101318, + "learning_rate": 0.0001709064605609111, + "loss": 5.6713, + "step": 254 + }, + { + "epoch": 0.014187422340090131, + "grad_norm": 3.9633259773254395, + "learning_rate": 0.00017052381214506914, + "loss": 5.9797, + "step": 255 + }, + { + "epoch": 0.014243059290443427, + "grad_norm": 4.197867393493652, + "learning_rate": 0.00017013925769879755, + "loss": 5.673, + "step": 256 + }, + { + "epoch": 0.01429869624079672, + "grad_norm": 4.298388957977295, + "learning_rate": 0.0001697528077648507, + "loss": 6.4818, + "step": 257 + }, + { + "epoch": 0.014354333191150016, + "grad_norm": 3.9560089111328125, + "learning_rate": 0.00016936447293794873, + "loss": 6.1854, + "step": 258 + }, + { + "epoch": 0.01440997014150331, + "grad_norm": 3.917959213256836, + "learning_rate": 0.0001689742638644871, + "loss": 6.2532, + "step": 259 + }, + { + "epoch": 0.014465607091856606, + "grad_norm": 4.023734092712402, + "learning_rate": 0.0001685821912422447, + "loss": 5.7174, + "step": 260 + }, + { + "epoch": 0.0145212440422099, + "grad_norm": 4.500511169433594, + "learning_rate": 0.00016818826582009044, + "loss": 5.6947, + "step": 261 + }, + { + "epoch": 0.014576880992563193, + "grad_norm": 4.956663131713867, + "learning_rate": 0.00016779249839768884, + "loss": 5.4685, + "step": 262 + }, + { + "epoch": 0.014632517942916489, + "grad_norm": 5.168318271636963, + "learning_rate": 0.00016739489982520368, + "loss": 6.6102, + "step": 263 + }, + { + "epoch": 0.014688154893269783, + "grad_norm": 3.7343878746032715, + "learning_rate": 0.00016699548100300066, + "loss": 5.1714, + "step": 264 + }, + { + "epoch": 0.014743791843623079, + "grad_norm": 4.7374348640441895, + "learning_rate": 0.00016659425288134854, + "loss": 5.7039, + "step": 265 + }, + { + "epoch": 0.014799428793976372, + "grad_norm": 5.316050052642822, + "learning_rate": 0.00016619122646011902, + "loss": 6.0498, + "step": 266 + }, + { + "epoch": 0.014855065744329668, + "grad_norm": 4.233699321746826, + "learning_rate": 0.00016578641278848497, + "loss": 5.843, + "step": 267 + }, + { + "epoch": 0.014910702694682962, + "grad_norm": 3.862027406692505, + "learning_rate": 0.00016537982296461768, + "loss": 5.7984, + "step": 268 + }, + { + "epoch": 0.014966339645036257, + "grad_norm": 4.486534118652344, + "learning_rate": 0.00016497146813538257, + "loss": 6.1495, + "step": 269 + }, + { + "epoch": 0.015021976595389551, + "grad_norm": 3.686363458633423, + "learning_rate": 0.00016456135949603358, + "loss": 5.3701, + "step": 270 + }, + { + "epoch": 0.015077613545742847, + "grad_norm": 3.8409647941589355, + "learning_rate": 0.00016414950828990625, + "loss": 5.6247, + "step": 271 + }, + { + "epoch": 0.01513325049609614, + "grad_norm": 3.75473952293396, + "learning_rate": 0.00016373592580810935, + "loss": 6.2893, + "step": 272 + }, + { + "epoch": 0.015188887446449435, + "grad_norm": 4.1087422370910645, + "learning_rate": 0.00016332062338921563, + "loss": 5.8363, + "step": 273 + }, + { + "epoch": 0.01524452439680273, + "grad_norm": 4.495436668395996, + "learning_rate": 0.00016290361241895064, + "loss": 6.088, + "step": 274 + }, + { + "epoch": 0.015300161347156024, + "grad_norm": 3.96954083442688, + "learning_rate": 0.0001624849043298808, + "loss": 6.0236, + "step": 275 + }, + { + "epoch": 0.01535579829750932, + "grad_norm": 3.5179734230041504, + "learning_rate": 0.00016206451060109988, + "loss": 5.01, + "step": 276 + }, + { + "epoch": 0.015411435247862613, + "grad_norm": 3.924241065979004, + "learning_rate": 0.0001616424427579143, + "loss": 5.5519, + "step": 277 + }, + { + "epoch": 0.015467072198215909, + "grad_norm": 4.176694393157959, + "learning_rate": 0.0001612187123715272, + "loss": 5.0326, + "step": 278 + }, + { + "epoch": 0.015522709148569203, + "grad_norm": 4.59972620010376, + "learning_rate": 0.00016079333105872118, + "loss": 5.4528, + "step": 279 + }, + { + "epoch": 0.015578346098922499, + "grad_norm": 4.718801975250244, + "learning_rate": 0.00016036631048153979, + "loss": 6.0674, + "step": 280 + }, + { + "epoch": 0.015633983049275792, + "grad_norm": 3.861063003540039, + "learning_rate": 0.00015993766234696785, + "loss": 5.6641, + "step": 281 + }, + { + "epoch": 0.015689619999629086, + "grad_norm": 4.231996536254883, + "learning_rate": 0.00015950739840661055, + "loss": 6.2912, + "step": 282 + }, + { + "epoch": 0.01574525694998238, + "grad_norm": 5.1836628913879395, + "learning_rate": 0.00015907553045637116, + "loss": 5.6069, + "step": 283 + }, + { + "epoch": 0.015800893900335677, + "grad_norm": 3.6092748641967773, + "learning_rate": 0.00015864207033612762, + "loss": 5.5475, + "step": 284 + }, + { + "epoch": 0.01585653085068897, + "grad_norm": 3.619304895401001, + "learning_rate": 0.00015820702992940813, + "loss": 5.5397, + "step": 285 + }, + { + "epoch": 0.015912167801042265, + "grad_norm": 3.9067981243133545, + "learning_rate": 0.0001577704211630652, + "loss": 5.9907, + "step": 286 + }, + { + "epoch": 0.01596780475139556, + "grad_norm": 4.317156791687012, + "learning_rate": 0.00015733225600694873, + "loss": 6.3837, + "step": 287 + }, + { + "epoch": 0.016023441701748856, + "grad_norm": 3.834639549255371, + "learning_rate": 0.00015689254647357776, + "loss": 5.6241, + "step": 288 + }, + { + "epoch": 0.01607907865210215, + "grad_norm": 3.919989585876465, + "learning_rate": 0.0001564513046178113, + "loss": 5.0147, + "step": 289 + }, + { + "epoch": 0.016134715602455444, + "grad_norm": 3.6235647201538086, + "learning_rate": 0.00015600854253651776, + "loss": 5.7651, + "step": 290 + }, + { + "epoch": 0.016190352552808738, + "grad_norm": 3.4974348545074463, + "learning_rate": 0.00015556427236824318, + "loss": 5.3932, + "step": 291 + }, + { + "epoch": 0.016245989503162032, + "grad_norm": 3.745358943939209, + "learning_rate": 0.00015511850629287865, + "loss": 5.5836, + "step": 292 + }, + { + "epoch": 0.01630162645351533, + "grad_norm": 4.6745991706848145, + "learning_rate": 0.00015467125653132637, + "loss": 5.9228, + "step": 293 + }, + { + "epoch": 0.016357263403868623, + "grad_norm": 6.11571741104126, + "learning_rate": 0.00015422253534516444, + "loss": 6.3837, + "step": 294 + }, + { + "epoch": 0.016412900354221917, + "grad_norm": 4.6769022941589355, + "learning_rate": 0.00015377235503631083, + "loss": 6.0375, + "step": 295 + }, + { + "epoch": 0.01646853730457521, + "grad_norm": 3.4016823768615723, + "learning_rate": 0.00015332072794668617, + "loss": 5.8922, + "step": 296 + }, + { + "epoch": 0.016524174254928508, + "grad_norm": 4.977784156799316, + "learning_rate": 0.0001528676664578752, + "loss": 5.6385, + "step": 297 + }, + { + "epoch": 0.016579811205281802, + "grad_norm": 5.7106428146362305, + "learning_rate": 0.00015241318299078751, + "loss": 6.454, + "step": 298 + }, + { + "epoch": 0.016635448155635096, + "grad_norm": 5.207014083862305, + "learning_rate": 0.00015195729000531694, + "loss": 5.2934, + "step": 299 + }, + { + "epoch": 0.01669108510598839, + "grad_norm": 5.4067301750183105, + "learning_rate": 0.0001515, + "loss": 5.766, + "step": 300 + }, + { + "epoch": 0.016746722056341683, + "grad_norm": 4.444033145904541, + "learning_rate": 0.00015104132551167318, + "loss": 6.147, + "step": 301 + }, + { + "epoch": 0.01680235900669498, + "grad_norm": 4.146214485168457, + "learning_rate": 0.00015058127911512923, + "loss": 6.182, + "step": 302 + }, + { + "epoch": 0.016857995957048275, + "grad_norm": 4.133290767669678, + "learning_rate": 0.00015011987342277255, + "loss": 6.3317, + "step": 303 + }, + { + "epoch": 0.01691363290740157, + "grad_norm": 4.0636820793151855, + "learning_rate": 0.00014965712108427323, + "loss": 4.8367, + "step": 304 + }, + { + "epoch": 0.016969269857754862, + "grad_norm": 4.677021026611328, + "learning_rate": 0.00014919303478622045, + "loss": 5.3606, + "step": 305 + }, + { + "epoch": 0.01702490680810816, + "grad_norm": 4.121607780456543, + "learning_rate": 0.00014872762725177447, + "loss": 5.4315, + "step": 306 + }, + { + "epoch": 0.017080543758461453, + "grad_norm": 5.159542083740234, + "learning_rate": 0.00014826091124031792, + "loss": 5.7607, + "step": 307 + }, + { + "epoch": 0.017136180708814747, + "grad_norm": 4.207242488861084, + "learning_rate": 0.00014779289954710604, + "loss": 6.4252, + "step": 308 + }, + { + "epoch": 0.01719181765916804, + "grad_norm": 4.018631458282471, + "learning_rate": 0.00014732360500291583, + "loss": 5.6436, + "step": 309 + }, + { + "epoch": 0.017247454609521335, + "grad_norm": 4.236734390258789, + "learning_rate": 0.00014685304047369423, + "loss": 5.9777, + "step": 310 + }, + { + "epoch": 0.017303091559874632, + "grad_norm": 4.267455577850342, + "learning_rate": 0.00014638121886020555, + "loss": 5.4746, + "step": 311 + }, + { + "epoch": 0.017358728510227926, + "grad_norm": 3.874371290206909, + "learning_rate": 0.00014590815309767767, + "loss": 6.022, + "step": 312 + }, + { + "epoch": 0.01741436546058122, + "grad_norm": 3.8613169193267822, + "learning_rate": 0.00014543385615544744, + "loss": 6.175, + "step": 313 + }, + { + "epoch": 0.017470002410934514, + "grad_norm": 3.8040213584899902, + "learning_rate": 0.0001449583410366051, + "loss": 5.9392, + "step": 314 + }, + { + "epoch": 0.01752563936128781, + "grad_norm": 4.015208721160889, + "learning_rate": 0.00014448162077763783, + "loss": 5.1542, + "step": 315 + }, + { + "epoch": 0.017581276311641105, + "grad_norm": 3.7842915058135986, + "learning_rate": 0.00014400370844807234, + "loss": 5.6346, + "step": 316 + }, + { + "epoch": 0.0176369132619944, + "grad_norm": 3.9127633571624756, + "learning_rate": 0.0001435246171501166, + "loss": 5.4481, + "step": 317 + }, + { + "epoch": 0.017692550212347693, + "grad_norm": 3.985870122909546, + "learning_rate": 0.00014304436001830054, + "loss": 6.14, + "step": 318 + }, + { + "epoch": 0.01774818716270099, + "grad_norm": 4.676354885101318, + "learning_rate": 0.000142562950219116, + "loss": 5.2206, + "step": 319 + }, + { + "epoch": 0.017803824113054284, + "grad_norm": 3.7993323802948, + "learning_rate": 0.00014208040095065584, + "loss": 5.8393, + "step": 320 + }, + { + "epoch": 0.017859461063407578, + "grad_norm": 4.0176100730896, + "learning_rate": 0.000141596725442252, + "loss": 6.6027, + "step": 321 + }, + { + "epoch": 0.017915098013760872, + "grad_norm": 3.794816732406616, + "learning_rate": 0.00014111193695411285, + "loss": 5.3384, + "step": 322 + }, + { + "epoch": 0.017970734964114166, + "grad_norm": 3.6705198287963867, + "learning_rate": 0.00014062604877695972, + "loss": 5.2017, + "step": 323 + }, + { + "epoch": 0.018026371914467463, + "grad_norm": 3.9035749435424805, + "learning_rate": 0.0001401390742316624, + "loss": 5.8686, + "step": 324 + }, + { + "epoch": 0.018082008864820757, + "grad_norm": 3.7254738807678223, + "learning_rate": 0.00013965102666887408, + "loss": 5.8681, + "step": 325 + }, + { + "epoch": 0.01813764581517405, + "grad_norm": 4.296890735626221, + "learning_rate": 0.0001391619194686652, + "loss": 6.1765, + "step": 326 + }, + { + "epoch": 0.018193282765527344, + "grad_norm": 3.9504783153533936, + "learning_rate": 0.00013867176604015672, + "loss": 5.4285, + "step": 327 + }, + { + "epoch": 0.018248919715880642, + "grad_norm": 4.739797592163086, + "learning_rate": 0.0001381805798211525, + "loss": 5.808, + "step": 328 + }, + { + "epoch": 0.018304556666233936, + "grad_norm": 3.777249813079834, + "learning_rate": 0.00013768837427777082, + "loss": 6.1816, + "step": 329 + }, + { + "epoch": 0.01836019361658723, + "grad_norm": 3.811932325363159, + "learning_rate": 0.0001371951629040753, + "loss": 5.5372, + "step": 330 + }, + { + "epoch": 0.018415830566940523, + "grad_norm": 3.6895103454589844, + "learning_rate": 0.00013670095922170498, + "loss": 5.8523, + "step": 331 + }, + { + "epoch": 0.018471467517293817, + "grad_norm": 3.6029834747314453, + "learning_rate": 0.00013620577677950335, + "loss": 5.9181, + "step": 332 + }, + { + "epoch": 0.018527104467647115, + "grad_norm": 3.388713836669922, + "learning_rate": 0.00013570962915314725, + "loss": 5.1125, + "step": 333 + }, + { + "epoch": 0.01858274141800041, + "grad_norm": 4.782525062561035, + "learning_rate": 0.00013521252994477446, + "loss": 6.2268, + "step": 334 + }, + { + "epoch": 0.018638378368353702, + "grad_norm": 4.812259197235107, + "learning_rate": 0.00013471449278261086, + "loss": 5.3066, + "step": 335 + }, + { + "epoch": 0.018694015318706996, + "grad_norm": 4.30558967590332, + "learning_rate": 0.0001342155313205969, + "loss": 6.1037, + "step": 336 + }, + { + "epoch": 0.018749652269060293, + "grad_norm": 4.0816545486450195, + "learning_rate": 0.0001337156592380131, + "loss": 5.2987, + "step": 337 + }, + { + "epoch": 0.018805289219413587, + "grad_norm": 4.110915660858154, + "learning_rate": 0.00013321489023910508, + "loss": 5.9777, + "step": 338 + }, + { + "epoch": 0.01886092616976688, + "grad_norm": 4.508513927459717, + "learning_rate": 0.0001327132380527079, + "loss": 6.1224, + "step": 339 + }, + { + "epoch": 0.018916563120120175, + "grad_norm": 3.5167860984802246, + "learning_rate": 0.0001322107164318697, + "loss": 5.6014, + "step": 340 + }, + { + "epoch": 0.01897220007047347, + "grad_norm": 3.9897732734680176, + "learning_rate": 0.00013170733915347451, + "loss": 5.8923, + "step": 341 + }, + { + "epoch": 0.019027837020826766, + "grad_norm": 4.49286413192749, + "learning_rate": 0.00013120312001786477, + "loss": 5.8677, + "step": 342 + }, + { + "epoch": 0.01908347397118006, + "grad_norm": 3.687140703201294, + "learning_rate": 0.0001306980728484627, + "loss": 5.7511, + "step": 343 + }, + { + "epoch": 0.019139110921533354, + "grad_norm": 3.9855973720550537, + "learning_rate": 0.00013019221149139162, + "loss": 6.2623, + "step": 344 + }, + { + "epoch": 0.019194747871886648, + "grad_norm": 3.667323350906372, + "learning_rate": 0.00012968554981509622, + "loss": 6.1062, + "step": 345 + }, + { + "epoch": 0.019250384822239945, + "grad_norm": 3.8460049629211426, + "learning_rate": 0.00012917810170996218, + "loss": 6.0577, + "step": 346 + }, + { + "epoch": 0.01930602177259324, + "grad_norm": 3.6674225330352783, + "learning_rate": 0.0001286698810879357, + "loss": 5.5785, + "step": 347 + }, + { + "epoch": 0.019361658722946533, + "grad_norm": 3.5017611980438232, + "learning_rate": 0.00012816090188214182, + "loss": 5.7312, + "step": 348 + }, + { + "epoch": 0.019417295673299827, + "grad_norm": 3.2471115589141846, + "learning_rate": 0.00012765117804650267, + "loss": 4.8269, + "step": 349 + }, + { + "epoch": 0.01947293262365312, + "grad_norm": 3.5942840576171875, + "learning_rate": 0.0001271407235553546, + "loss": 5.6728, + "step": 350 + }, + { + "epoch": 0.019528569574006418, + "grad_norm": 4.019217014312744, + "learning_rate": 0.00012662955240306538, + "loss": 5.1322, + "step": 351 + }, + { + "epoch": 0.019584206524359712, + "grad_norm": 4.0236287117004395, + "learning_rate": 0.00012611767860365038, + "loss": 5.7071, + "step": 352 + }, + { + "epoch": 0.019639843474713006, + "grad_norm": 4.3137969970703125, + "learning_rate": 0.00012560511619038827, + "loss": 6.1461, + "step": 353 + }, + { + "epoch": 0.0196954804250663, + "grad_norm": 3.376030206680298, + "learning_rate": 0.00012509187921543667, + "loss": 4.504, + "step": 354 + }, + { + "epoch": 0.019751117375419597, + "grad_norm": 3.505828619003296, + "learning_rate": 0.00012457798174944645, + "loss": 5.2368, + "step": 355 + }, + { + "epoch": 0.01980675432577289, + "grad_norm": 3.7752232551574707, + "learning_rate": 0.00012406343788117625, + "loss": 5.0682, + "step": 356 + }, + { + "epoch": 0.019862391276126184, + "grad_norm": 3.6609232425689697, + "learning_rate": 0.0001235482617171061, + "loss": 5.8648, + "step": 357 + }, + { + "epoch": 0.01991802822647948, + "grad_norm": 3.760667562484741, + "learning_rate": 0.00012303246738105082, + "loss": 5.2033, + "step": 358 + }, + { + "epoch": 0.019973665176832772, + "grad_norm": 4.119334697723389, + "learning_rate": 0.00012251606901377265, + "loss": 4.4503, + "step": 359 + }, + { + "epoch": 0.02002930212718607, + "grad_norm": 3.700547456741333, + "learning_rate": 0.00012199908077259367, + "loss": 6.0612, + "step": 360 + }, + { + "epoch": 0.020084939077539363, + "grad_norm": 3.967587471008301, + "learning_rate": 0.00012148151683100776, + "loss": 5.3589, + "step": 361 + }, + { + "epoch": 0.020140576027892657, + "grad_norm": 4.06566047668457, + "learning_rate": 0.00012096339137829174, + "loss": 6.4126, + "step": 362 + }, + { + "epoch": 0.02019621297824595, + "grad_norm": 3.8236615657806396, + "learning_rate": 0.00012044471861911666, + "loss": 5.4793, + "step": 363 + }, + { + "epoch": 0.02025184992859925, + "grad_norm": 3.578010320663452, + "learning_rate": 0.0001199255127731582, + "loss": 6.1177, + "step": 364 + }, + { + "epoch": 0.020307486878952542, + "grad_norm": 3.4872570037841797, + "learning_rate": 0.00011940578807470692, + "loss": 5.523, + "step": 365 + }, + { + "epoch": 0.020363123829305836, + "grad_norm": 3.543159008026123, + "learning_rate": 0.00011888555877227793, + "loss": 6.0783, + "step": 366 + }, + { + "epoch": 0.02041876077965913, + "grad_norm": 3.256772041320801, + "learning_rate": 0.00011836483912822035, + "loss": 5.6494, + "step": 367 + }, + { + "epoch": 0.020474397730012424, + "grad_norm": 3.838078260421753, + "learning_rate": 0.00011784364341832634, + "loss": 5.5132, + "step": 368 + }, + { + "epoch": 0.02053003468036572, + "grad_norm": 3.5172770023345947, + "learning_rate": 0.00011732198593143949, + "loss": 5.6237, + "step": 369 + }, + { + "epoch": 0.020585671630719015, + "grad_norm": 3.60217547416687, + "learning_rate": 0.00011679988096906333, + "loss": 6.5804, + "step": 370 + }, + { + "epoch": 0.02064130858107231, + "grad_norm": 3.513338327407837, + "learning_rate": 0.00011627734284496917, + "loss": 5.7445, + "step": 371 + }, + { + "epoch": 0.020696945531425603, + "grad_norm": 3.867730140686035, + "learning_rate": 0.00011575438588480359, + "loss": 6.2278, + "step": 372 + }, + { + "epoch": 0.0207525824817789, + "grad_norm": 3.464897871017456, + "learning_rate": 0.00011523102442569585, + "loss": 5.3684, + "step": 373 + }, + { + "epoch": 0.020808219432132194, + "grad_norm": 3.784332752227783, + "learning_rate": 0.00011470727281586475, + "loss": 5.2912, + "step": 374 + }, + { + "epoch": 0.020863856382485488, + "grad_norm": 3.7165110111236572, + "learning_rate": 0.00011418314541422523, + "loss": 5.4899, + "step": 375 + }, + { + "epoch": 0.02091949333283878, + "grad_norm": 3.8226206302642822, + "learning_rate": 0.00011365865658999474, + "loss": 5.514, + "step": 376 + }, + { + "epoch": 0.02097513028319208, + "grad_norm": 3.945645809173584, + "learning_rate": 0.00011313382072229936, + "loss": 5.3529, + "step": 377 + }, + { + "epoch": 0.021030767233545373, + "grad_norm": 3.2092161178588867, + "learning_rate": 0.00011260865219977954, + "loss": 4.813, + "step": 378 + }, + { + "epoch": 0.021086404183898667, + "grad_norm": 3.592958927154541, + "learning_rate": 0.00011208316542019556, + "loss": 5.6217, + "step": 379 + }, + { + "epoch": 0.02114204113425196, + "grad_norm": 3.767707109451294, + "learning_rate": 0.00011155737479003301, + "loss": 6.0992, + "step": 380 + }, + { + "epoch": 0.021197678084605254, + "grad_norm": 4.047881126403809, + "learning_rate": 0.00011103129472410755, + "loss": 5.3497, + "step": 381 + }, + { + "epoch": 0.02125331503495855, + "grad_norm": 3.887549638748169, + "learning_rate": 0.00011050493964516997, + "loss": 5.6986, + "step": 382 + }, + { + "epoch": 0.021308951985311846, + "grad_norm": 3.8913826942443848, + "learning_rate": 0.00010997832398351062, + "loss": 5.0789, + "step": 383 + }, + { + "epoch": 0.02136458893566514, + "grad_norm": 3.389906406402588, + "learning_rate": 0.0001094514621765639, + "loss": 5.3022, + "step": 384 + }, + { + "epoch": 0.021420225886018433, + "grad_norm": 4.141808032989502, + "learning_rate": 0.00010892436866851235, + "loss": 6.3253, + "step": 385 + }, + { + "epoch": 0.02147586283637173, + "grad_norm": 3.226128101348877, + "learning_rate": 0.0001083970579098908, + "loss": 4.8829, + "step": 386 + }, + { + "epoch": 0.021531499786725024, + "grad_norm": 3.2388365268707275, + "learning_rate": 0.00010786954435719008, + "loss": 4.8998, + "step": 387 + }, + { + "epoch": 0.02158713673707832, + "grad_norm": 3.4774935245513916, + "learning_rate": 0.00010734184247246066, + "loss": 5.0029, + "step": 388 + }, + { + "epoch": 0.021642773687431612, + "grad_norm": 4.356663703918457, + "learning_rate": 0.00010681396672291631, + "loss": 6.0405, + "step": 389 + }, + { + "epoch": 0.021698410637784906, + "grad_norm": 3.263737201690674, + "learning_rate": 0.00010628593158053734, + "loss": 5.4247, + "step": 390 + }, + { + "epoch": 0.021754047588138203, + "grad_norm": 3.7901406288146973, + "learning_rate": 0.00010575775152167391, + "loss": 5.8834, + "step": 391 + }, + { + "epoch": 0.021809684538491497, + "grad_norm": 3.621027946472168, + "learning_rate": 0.00010522944102664915, + "loss": 4.9949, + "step": 392 + }, + { + "epoch": 0.02186532148884479, + "grad_norm": 4.161514759063721, + "learning_rate": 0.00010470101457936219, + "loss": 5.2739, + "step": 393 + }, + { + "epoch": 0.021920958439198085, + "grad_norm": 3.880347967147827, + "learning_rate": 0.00010417248666689095, + "loss": 5.5535, + "step": 394 + }, + { + "epoch": 0.021976595389551382, + "grad_norm": 3.449695587158203, + "learning_rate": 0.00010364387177909521, + "loss": 5.9954, + "step": 395 + }, + { + "epoch": 0.022032232339904676, + "grad_norm": 3.717266321182251, + "learning_rate": 0.00010311518440821906, + "loss": 5.6882, + "step": 396 + }, + { + "epoch": 0.02208786929025797, + "grad_norm": 4.129327774047852, + "learning_rate": 0.0001025864390484939, + "loss": 6.2451, + "step": 397 + }, + { + "epoch": 0.022143506240611264, + "grad_norm": 3.6546480655670166, + "learning_rate": 0.00010205765019574084, + "loss": 5.506, + "step": 398 + }, + { + "epoch": 0.022199143190964558, + "grad_norm": 3.8026344776153564, + "learning_rate": 0.00010152883234697336, + "loss": 5.4686, + "step": 399 + }, + { + "epoch": 0.022254780141317855, + "grad_norm": 4.067442417144775, + "learning_rate": 0.000101, + "loss": 5.9194, + "step": 400 + }, + { + "epoch": 0.02231041709167115, + "grad_norm": 3.478989839553833, + "learning_rate": 0.00010047116765302661, + "loss": 5.1729, + "step": 401 + }, + { + "epoch": 0.022366054042024443, + "grad_norm": 3.46468448638916, + "learning_rate": 9.994234980425921e-05, + "loss": 5.3032, + "step": 402 + }, + { + "epoch": 0.022421690992377737, + "grad_norm": 4.244353771209717, + "learning_rate": 9.941356095150613e-05, + "loss": 6.1177, + "step": 403 + }, + { + "epoch": 0.022477327942731034, + "grad_norm": 3.2709834575653076, + "learning_rate": 9.888481559178096e-05, + "loss": 5.8566, + "step": 404 + }, + { + "epoch": 0.022532964893084328, + "grad_norm": 3.706587791442871, + "learning_rate": 9.835612822090483e-05, + "loss": 5.9073, + "step": 405 + }, + { + "epoch": 0.02258860184343762, + "grad_norm": 3.508145809173584, + "learning_rate": 9.782751333310905e-05, + "loss": 4.9236, + "step": 406 + }, + { + "epoch": 0.022644238793790916, + "grad_norm": 3.788998603820801, + "learning_rate": 9.72989854206378e-05, + "loss": 6.6588, + "step": 407 + }, + { + "epoch": 0.02269987574414421, + "grad_norm": 3.3095412254333496, + "learning_rate": 9.677055897335087e-05, + "loss": 4.7509, + "step": 408 + }, + { + "epoch": 0.022755512694497507, + "grad_norm": 3.542297840118408, + "learning_rate": 9.62422484783261e-05, + "loss": 5.6812, + "step": 409 + }, + { + "epoch": 0.0228111496448508, + "grad_norm": 3.743990659713745, + "learning_rate": 9.571406841946267e-05, + "loss": 5.8889, + "step": 410 + }, + { + "epoch": 0.022866786595204094, + "grad_norm": 3.510870933532715, + "learning_rate": 9.518603327708372e-05, + "loss": 4.938, + "step": 411 + }, + { + "epoch": 0.022922423545557388, + "grad_norm": 3.850944995880127, + "learning_rate": 9.465815752753935e-05, + "loss": 5.8124, + "step": 412 + }, + { + "epoch": 0.022978060495910686, + "grad_norm": 3.304145097732544, + "learning_rate": 9.413045564280998e-05, + "loss": 5.5285, + "step": 413 + }, + { + "epoch": 0.02303369744626398, + "grad_norm": 3.62439227104187, + "learning_rate": 9.360294209010923e-05, + "loss": 5.4526, + "step": 414 + }, + { + "epoch": 0.023089334396617273, + "grad_norm": 3.7704150676727295, + "learning_rate": 9.307563133148767e-05, + "loss": 5.929, + "step": 415 + }, + { + "epoch": 0.023144971346970567, + "grad_norm": 3.6585686206817627, + "learning_rate": 9.254853782343616e-05, + "loss": 5.5951, + "step": 416 + }, + { + "epoch": 0.02320060829732386, + "grad_norm": 3.673398017883301, + "learning_rate": 9.202167601648942e-05, + "loss": 5.7902, + "step": 417 + }, + { + "epoch": 0.02325624524767716, + "grad_norm": 3.187032461166382, + "learning_rate": 9.149506035483005e-05, + "loss": 5.5882, + "step": 418 + }, + { + "epoch": 0.023311882198030452, + "grad_norm": 3.46944260597229, + "learning_rate": 9.096870527589248e-05, + "loss": 6.3346, + "step": 419 + }, + { + "epoch": 0.023367519148383746, + "grad_norm": 3.2629919052124023, + "learning_rate": 9.044262520996702e-05, + "loss": 5.4838, + "step": 420 + }, + { + "epoch": 0.02342315609873704, + "grad_norm": 3.3348388671875, + "learning_rate": 8.991683457980443e-05, + "loss": 5.7126, + "step": 421 + }, + { + "epoch": 0.023478793049090337, + "grad_norm": 3.5031561851501465, + "learning_rate": 8.93913478002205e-05, + "loss": 5.31, + "step": 422 + }, + { + "epoch": 0.02353442999944363, + "grad_norm": 3.5377800464630127, + "learning_rate": 8.886617927770065e-05, + "loss": 6.2062, + "step": 423 + }, + { + "epoch": 0.023590066949796925, + "grad_norm": 4.750455856323242, + "learning_rate": 8.834134341000527e-05, + "loss": 5.3093, + "step": 424 + }, + { + "epoch": 0.02364570390015022, + "grad_norm": 3.5273725986480713, + "learning_rate": 8.781685458577481e-05, + "loss": 5.421, + "step": 425 + }, + { + "epoch": 0.023701340850503516, + "grad_norm": 3.2430624961853027, + "learning_rate": 8.729272718413527e-05, + "loss": 5.4385, + "step": 426 + }, + { + "epoch": 0.02375697780085681, + "grad_norm": 3.2267744541168213, + "learning_rate": 8.676897557430415e-05, + "loss": 5.7221, + "step": 427 + }, + { + "epoch": 0.023812614751210104, + "grad_norm": 3.4016199111938477, + "learning_rate": 8.624561411519644e-05, + "loss": 5.0269, + "step": 428 + }, + { + "epoch": 0.023868251701563398, + "grad_norm": 3.182039499282837, + "learning_rate": 8.572265715503086e-05, + "loss": 5.7324, + "step": 429 + }, + { + "epoch": 0.02392388865191669, + "grad_norm": 3.3035106658935547, + "learning_rate": 8.520011903093666e-05, + "loss": 5.2542, + "step": 430 + }, + { + "epoch": 0.02397952560226999, + "grad_norm": 3.667174816131592, + "learning_rate": 8.467801406856054e-05, + "loss": 5.7719, + "step": 431 + }, + { + "epoch": 0.024035162552623283, + "grad_norm": 3.2686805725097656, + "learning_rate": 8.415635658167368e-05, + "loss": 5.1549, + "step": 432 + }, + { + "epoch": 0.024090799502976577, + "grad_norm": 3.5747358798980713, + "learning_rate": 8.363516087177962e-05, + "loss": 5.3843, + "step": 433 + }, + { + "epoch": 0.02414643645332987, + "grad_norm": 3.591118335723877, + "learning_rate": 8.31144412277221e-05, + "loss": 5.1639, + "step": 434 + }, + { + "epoch": 0.024202073403683168, + "grad_norm": 3.398651361465454, + "learning_rate": 8.25942119252931e-05, + "loss": 5.867, + "step": 435 + }, + { + "epoch": 0.02425771035403646, + "grad_norm": 3.472480535507202, + "learning_rate": 8.20744872268418e-05, + "loss": 6.0988, + "step": 436 + }, + { + "epoch": 0.024313347304389755, + "grad_norm": 3.5368642807006836, + "learning_rate": 8.155528138088337e-05, + "loss": 5.8288, + "step": 437 + }, + { + "epoch": 0.02436898425474305, + "grad_norm": 3.430379629135132, + "learning_rate": 8.103660862170826e-05, + "loss": 5.8626, + "step": 438 + }, + { + "epoch": 0.024424621205096343, + "grad_norm": 3.348661184310913, + "learning_rate": 8.051848316899227e-05, + "loss": 5.0892, + "step": 439 + }, + { + "epoch": 0.02448025815544964, + "grad_norm": 3.908968925476074, + "learning_rate": 8.000091922740633e-05, + "loss": 5.4387, + "step": 440 + }, + { + "epoch": 0.024535895105802934, + "grad_norm": 3.2938337326049805, + "learning_rate": 7.948393098622737e-05, + "loss": 5.7218, + "step": 441 + }, + { + "epoch": 0.024591532056156228, + "grad_norm": 3.456089496612549, + "learning_rate": 7.896753261894923e-05, + "loss": 5.3252, + "step": 442 + }, + { + "epoch": 0.024647169006509522, + "grad_norm": 3.2834115028381348, + "learning_rate": 7.845173828289392e-05, + "loss": 5.4011, + "step": 443 + }, + { + "epoch": 0.02470280595686282, + "grad_norm": 4.02750301361084, + "learning_rate": 7.793656211882377e-05, + "loss": 5.927, + "step": 444 + }, + { + "epoch": 0.024758442907216113, + "grad_norm": 4.30374813079834, + "learning_rate": 7.74220182505536e-05, + "loss": 5.8014, + "step": 445 + }, + { + "epoch": 0.024814079857569407, + "grad_norm": 3.0613205432891846, + "learning_rate": 7.690812078456336e-05, + "loss": 4.9314, + "step": 446 + }, + { + "epoch": 0.0248697168079227, + "grad_norm": 3.470621347427368, + "learning_rate": 7.639488380961173e-05, + "loss": 4.9305, + "step": 447 + }, + { + "epoch": 0.024925353758275995, + "grad_norm": 4.040193557739258, + "learning_rate": 7.588232139634968e-05, + "loss": 6.5299, + "step": 448 + }, + { + "epoch": 0.024980990708629292, + "grad_norm": 3.410454273223877, + "learning_rate": 7.537044759693463e-05, + "loss": 5.1331, + "step": 449 + }, + { + "epoch": 0.025036627658982586, + "grad_norm": 3.2952678203582764, + "learning_rate": 7.48592764446454e-05, + "loss": 5.8608, + "step": 450 + }, + { + "epoch": 0.02509226460933588, + "grad_norm": 3.343153238296509, + "learning_rate": 7.434882195349736e-05, + "loss": 5.3242, + "step": 451 + }, + { + "epoch": 0.025147901559689174, + "grad_norm": 3.4447689056396484, + "learning_rate": 7.383909811785817e-05, + "loss": 5.7523, + "step": 452 + }, + { + "epoch": 0.02520353851004247, + "grad_norm": 3.862658977508545, + "learning_rate": 7.333011891206432e-05, + "loss": 5.3692, + "step": 453 + }, + { + "epoch": 0.025259175460395765, + "grad_norm": 3.3530426025390625, + "learning_rate": 7.282189829003785e-05, + "loss": 5.1298, + "step": 454 + }, + { + "epoch": 0.02531481241074906, + "grad_norm": 3.363036870956421, + "learning_rate": 7.231445018490381e-05, + "loss": 5.8424, + "step": 455 + }, + { + "epoch": 0.025370449361102353, + "grad_norm": 3.6629528999328613, + "learning_rate": 7.180778850860835e-05, + "loss": 5.9942, + "step": 456 + }, + { + "epoch": 0.025426086311455647, + "grad_norm": 3.159034252166748, + "learning_rate": 7.130192715153731e-05, + "loss": 4.9182, + "step": 457 + }, + { + "epoch": 0.025481723261808944, + "grad_norm": 3.4242496490478516, + "learning_rate": 7.079687998213526e-05, + "loss": 5.4535, + "step": 458 + }, + { + "epoch": 0.025537360212162238, + "grad_norm": 3.7312228679656982, + "learning_rate": 7.029266084652548e-05, + "loss": 5.4774, + "step": 459 + }, + { + "epoch": 0.02559299716251553, + "grad_norm": 3.387279748916626, + "learning_rate": 6.978928356813031e-05, + "loss": 5.1657, + "step": 460 + }, + { + "epoch": 0.025648634112868825, + "grad_norm": 3.4769787788391113, + "learning_rate": 6.92867619472921e-05, + "loss": 5.4317, + "step": 461 + }, + { + "epoch": 0.025704271063222123, + "grad_norm": 3.285330057144165, + "learning_rate": 6.878510976089493e-05, + "loss": 5.6362, + "step": 462 + }, + { + "epoch": 0.025759908013575417, + "grad_norm": 3.3947970867156982, + "learning_rate": 6.828434076198693e-05, + "loss": 5.5002, + "step": 463 + }, + { + "epoch": 0.02581554496392871, + "grad_norm": 3.2423789501190186, + "learning_rate": 6.77844686794031e-05, + "loss": 5.472, + "step": 464 + }, + { + "epoch": 0.025871181914282004, + "grad_norm": 3.418966770172119, + "learning_rate": 6.728550721738915e-05, + "loss": 5.5353, + "step": 465 + }, + { + "epoch": 0.025926818864635298, + "grad_norm": 3.3809266090393066, + "learning_rate": 6.678747005522557e-05, + "loss": 5.2725, + "step": 466 + }, + { + "epoch": 0.025982455814988595, + "grad_norm": 3.5110361576080322, + "learning_rate": 6.629037084685278e-05, + "loss": 5.4797, + "step": 467 + }, + { + "epoch": 0.02603809276534189, + "grad_norm": 3.28928804397583, + "learning_rate": 6.579422322049668e-05, + "loss": 5.6707, + "step": 468 + }, + { + "epoch": 0.026093729715695183, + "grad_norm": 3.581725835800171, + "learning_rate": 6.529904077829505e-05, + "loss": 5.4784, + "step": 469 + }, + { + "epoch": 0.026149366666048477, + "grad_norm": 3.3287882804870605, + "learning_rate": 6.480483709592468e-05, + "loss": 5.2092, + "step": 470 + }, + { + "epoch": 0.026205003616401774, + "grad_norm": 3.178893566131592, + "learning_rate": 6.43116257222292e-05, + "loss": 4.9041, + "step": 471 + }, + { + "epoch": 0.026260640566755068, + "grad_norm": 3.2872695922851562, + "learning_rate": 6.381942017884753e-05, + "loss": 5.1723, + "step": 472 + }, + { + "epoch": 0.026316277517108362, + "grad_norm": 3.268374443054199, + "learning_rate": 6.33282339598433e-05, + "loss": 5.5008, + "step": 473 + }, + { + "epoch": 0.026371914467461656, + "grad_norm": 3.879403591156006, + "learning_rate": 6.283808053133484e-05, + "loss": 5.7245, + "step": 474 + }, + { + "epoch": 0.02642755141781495, + "grad_norm": 2.9905014038085938, + "learning_rate": 6.234897333112594e-05, + "loss": 4.658, + "step": 475 + }, + { + "epoch": 0.026483188368168247, + "grad_norm": 3.395414113998413, + "learning_rate": 6.186092576833761e-05, + "loss": 5.2822, + "step": 476 + }, + { + "epoch": 0.02653882531852154, + "grad_norm": 3.3310208320617676, + "learning_rate": 6.137395122304033e-05, + "loss": 5.828, + "step": 477 + }, + { + "epoch": 0.026594462268874835, + "grad_norm": 3.529879570007324, + "learning_rate": 6.088806304588717e-05, + "loss": 5.5382, + "step": 478 + }, + { + "epoch": 0.02665009921922813, + "grad_norm": 3.6457550525665283, + "learning_rate": 6.0403274557748035e-05, + "loss": 5.8661, + "step": 479 + }, + { + "epoch": 0.026705736169581426, + "grad_norm": 3.241577625274658, + "learning_rate": 5.9919599049344194e-05, + "loss": 5.5501, + "step": 480 + }, + { + "epoch": 0.02676137311993472, + "grad_norm": 3.176980495452881, + "learning_rate": 5.943704978088402e-05, + "loss": 5.0739, + "step": 481 + }, + { + "epoch": 0.026817010070288014, + "grad_norm": 3.3501482009887695, + "learning_rate": 5.89556399816995e-05, + "loss": 4.9533, + "step": 482 + }, + { + "epoch": 0.026872647020641308, + "grad_norm": 2.957366704940796, + "learning_rate": 5.847538284988341e-05, + "loss": 5.316, + "step": 483 + }, + { + "epoch": 0.026928283970994605, + "grad_norm": 3.2180655002593994, + "learning_rate": 5.7996291551927666e-05, + "loss": 5.2691, + "step": 484 + }, + { + "epoch": 0.0269839209213479, + "grad_norm": 3.095756769180298, + "learning_rate": 5.751837922236217e-05, + "loss": 5.3814, + "step": 485 + }, + { + "epoch": 0.027039557871701193, + "grad_norm": 2.905590295791626, + "learning_rate": 5.704165896339494e-05, + "loss": 5.1131, + "step": 486 + }, + { + "epoch": 0.027095194822054487, + "grad_norm": 3.297048568725586, + "learning_rate": 5.656614384455257e-05, + "loss": 5.4045, + "step": 487 + }, + { + "epoch": 0.02715083177240778, + "grad_norm": 3.0116183757781982, + "learning_rate": 5.609184690232235e-05, + "loss": 5.21, + "step": 488 + }, + { + "epoch": 0.027206468722761078, + "grad_norm": 3.350761890411377, + "learning_rate": 5.5618781139794465e-05, + "loss": 5.4372, + "step": 489 + }, + { + "epoch": 0.02726210567311437, + "grad_norm": 3.0806150436401367, + "learning_rate": 5.514695952630578e-05, + "loss": 5.2992, + "step": 490 + }, + { + "epoch": 0.027317742623467665, + "grad_norm": 3.291692018508911, + "learning_rate": 5.467639499708423e-05, + "loss": 5.2915, + "step": 491 + }, + { + "epoch": 0.02737337957382096, + "grad_norm": 3.5205254554748535, + "learning_rate": 5.420710045289399e-05, + "loss": 5.6932, + "step": 492 + }, + { + "epoch": 0.027429016524174257, + "grad_norm": 3.4537689685821533, + "learning_rate": 5.373908875968211e-05, + "loss": 5.5969, + "step": 493 + }, + { + "epoch": 0.02748465347452755, + "grad_norm": 3.472107172012329, + "learning_rate": 5.3272372748225556e-05, + "loss": 4.8338, + "step": 494 + }, + { + "epoch": 0.027540290424880844, + "grad_norm": 3.843360185623169, + "learning_rate": 5.2806965213779544e-05, + "loss": 5.875, + "step": 495 + }, + { + "epoch": 0.027595927375234138, + "grad_norm": 3.1354005336761475, + "learning_rate": 5.234287891572674e-05, + "loss": 4.8525, + "step": 496 + }, + { + "epoch": 0.027651564325587432, + "grad_norm": 3.2259721755981445, + "learning_rate": 5.1880126577227464e-05, + "loss": 5.4074, + "step": 497 + }, + { + "epoch": 0.02770720127594073, + "grad_norm": 3.5572381019592285, + "learning_rate": 5.141872088487078e-05, + "loss": 5.5728, + "step": 498 + }, + { + "epoch": 0.027762838226294023, + "grad_norm": 3.4521615505218506, + "learning_rate": 5.095867448832683e-05, + "loss": 5.0019, + "step": 499 + }, + { + "epoch": 0.027818475176647317, + "grad_norm": 3.239311456680298, + "learning_rate": 5.050000000000002e-05, + "loss": 4.9722, + "step": 500 + }, + { + "epoch": 0.02787411212700061, + "grad_norm": 3.4787046909332275, + "learning_rate": 5.004270999468307e-05, + "loss": 5.7415, + "step": 501 + }, + { + "epoch": 0.027929749077353908, + "grad_norm": 3.137946605682373, + "learning_rate": 4.95868170092125e-05, + "loss": 5.0207, + "step": 502 + }, + { + "epoch": 0.027985386027707202, + "grad_norm": 3.7396066188812256, + "learning_rate": 4.913233354212485e-05, + "loss": 5.2124, + "step": 503 + }, + { + "epoch": 0.028041022978060496, + "grad_norm": 3.3970324993133545, + "learning_rate": 4.867927205331386e-05, + "loss": 5.2931, + "step": 504 + }, + { + "epoch": 0.02809665992841379, + "grad_norm": 3.504610300064087, + "learning_rate": 4.822764496368917e-05, + "loss": 5.4184, + "step": 505 + }, + { + "epoch": 0.028152296878767084, + "grad_norm": 3.151719093322754, + "learning_rate": 4.7777464654835564e-05, + "loss": 5.3099, + "step": 506 + }, + { + "epoch": 0.02820793382912038, + "grad_norm": 3.407949924468994, + "learning_rate": 4.732874346867362e-05, + "loss": 5.5688, + "step": 507 + }, + { + "epoch": 0.028263570779473675, + "grad_norm": 3.104405641555786, + "learning_rate": 4.6881493707121315e-05, + "loss": 4.832, + "step": 508 + }, + { + "epoch": 0.02831920772982697, + "grad_norm": 3.1438100337982178, + "learning_rate": 4.643572763175684e-05, + "loss": 5.6234, + "step": 509 + }, + { + "epoch": 0.028374844680180263, + "grad_norm": 3.434096336364746, + "learning_rate": 4.5991457463482264e-05, + "loss": 5.5369, + "step": 510 + }, + { + "epoch": 0.02843048163053356, + "grad_norm": 3.259537935256958, + "learning_rate": 4.554869538218868e-05, + "loss": 5.1967, + "step": 511 + }, + { + "epoch": 0.028486118580886854, + "grad_norm": 3.3598663806915283, + "learning_rate": 4.5107453526422255e-05, + "loss": 5.7332, + "step": 512 + }, + { + "epoch": 0.028541755531240148, + "grad_norm": 3.219007730484009, + "learning_rate": 4.46677439930513e-05, + "loss": 5.2544, + "step": 513 + }, + { + "epoch": 0.02859739248159344, + "grad_norm": 3.120272397994995, + "learning_rate": 4.422957883693483e-05, + "loss": 4.6936, + "step": 514 + }, + { + "epoch": 0.028653029431946735, + "grad_norm": 3.8940160274505615, + "learning_rate": 4.3792970070591906e-05, + "loss": 6.2992, + "step": 515 + }, + { + "epoch": 0.028708666382300033, + "grad_norm": 3.3370492458343506, + "learning_rate": 4.3357929663872406e-05, + "loss": 5.5732, + "step": 516 + }, + { + "epoch": 0.028764303332653327, + "grad_norm": 3.3376708030700684, + "learning_rate": 4.29244695436289e-05, + "loss": 5.8762, + "step": 517 + }, + { + "epoch": 0.02881994028300662, + "grad_norm": 3.5328142642974854, + "learning_rate": 4.249260159338946e-05, + "loss": 6.0416, + "step": 518 + }, + { + "epoch": 0.028875577233359914, + "grad_norm": 3.951347827911377, + "learning_rate": 4.2062337653032146e-05, + "loss": 6.1428, + "step": 519 + }, + { + "epoch": 0.02893121418371321, + "grad_norm": 3.1477136611938477, + "learning_rate": 4.1633689518460225e-05, + "loss": 5.1361, + "step": 520 + }, + { + "epoch": 0.028986851134066505, + "grad_norm": 3.2897682189941406, + "learning_rate": 4.1206668941278826e-05, + "loss": 5.6343, + "step": 521 + }, + { + "epoch": 0.0290424880844198, + "grad_norm": 3.64078950881958, + "learning_rate": 4.078128762847279e-05, + "loss": 5.7439, + "step": 522 + }, + { + "epoch": 0.029098125034773093, + "grad_norm": 3.577857494354248, + "learning_rate": 4.035755724208573e-05, + "loss": 5.921, + "step": 523 + }, + { + "epoch": 0.029153761985126387, + "grad_norm": 3.3612093925476074, + "learning_rate": 3.9935489398900145e-05, + "loss": 5.7789, + "step": 524 + }, + { + "epoch": 0.029209398935479684, + "grad_norm": 3.505073308944702, + "learning_rate": 3.951509567011922e-05, + "loss": 5.5106, + "step": 525 + }, + { + "epoch": 0.029265035885832978, + "grad_norm": 3.3955559730529785, + "learning_rate": 3.90963875810494e-05, + "loss": 5.3709, + "step": 526 + }, + { + "epoch": 0.029320672836186272, + "grad_norm": 3.3325414657592773, + "learning_rate": 3.86793766107844e-05, + "loss": 5.3491, + "step": 527 + }, + { + "epoch": 0.029376309786539566, + "grad_norm": 3.730947971343994, + "learning_rate": 3.826407419189066e-05, + "loss": 5.4297, + "step": 528 + }, + { + "epoch": 0.029431946736892863, + "grad_norm": 3.1895978450775146, + "learning_rate": 3.785049171009381e-05, + "loss": 5.5247, + "step": 529 + }, + { + "epoch": 0.029487583687246157, + "grad_norm": 3.0305094718933105, + "learning_rate": 3.743864050396644e-05, + "loss": 5.0703, + "step": 530 + }, + { + "epoch": 0.02954322063759945, + "grad_norm": 3.2483253479003906, + "learning_rate": 3.7028531864617444e-05, + "loss": 5.4687, + "step": 531 + }, + { + "epoch": 0.029598857587952745, + "grad_norm": 3.203097343444824, + "learning_rate": 3.662017703538234e-05, + "loss": 5.1184, + "step": 532 + }, + { + "epoch": 0.02965449453830604, + "grad_norm": 3.517947196960449, + "learning_rate": 3.621358721151505e-05, + "loss": 5.0587, + "step": 533 + }, + { + "epoch": 0.029710131488659336, + "grad_norm": 3.3544349670410156, + "learning_rate": 3.5808773539880973e-05, + "loss": 5.8922, + "step": 534 + }, + { + "epoch": 0.02976576843901263, + "grad_norm": 3.3595056533813477, + "learning_rate": 3.540574711865146e-05, + "loss": 5.259, + "step": 535 + }, + { + "epoch": 0.029821405389365924, + "grad_norm": 2.9764606952667236, + "learning_rate": 3.500451899699935e-05, + "loss": 5.2703, + "step": 536 + }, + { + "epoch": 0.029877042339719218, + "grad_norm": 3.1871285438537598, + "learning_rate": 3.460510017479631e-05, + "loss": 5.0553, + "step": 537 + }, + { + "epoch": 0.029932679290072515, + "grad_norm": 4.060029029846191, + "learning_rate": 3.420750160231118e-05, + "loss": 5.816, + "step": 538 + }, + { + "epoch": 0.02998831624042581, + "grad_norm": 3.1042470932006836, + "learning_rate": 3.381173417990957e-05, + "loss": 5.4576, + "step": 539 + }, + { + "epoch": 0.030043953190779103, + "grad_norm": 2.91147780418396, + "learning_rate": 3.3417808757755355e-05, + "loss": 4.9136, + "step": 540 + }, + { + "epoch": 0.030099590141132396, + "grad_norm": 4.417067050933838, + "learning_rate": 3.302573613551292e-05, + "loss": 5.6685, + "step": 541 + }, + { + "epoch": 0.030155227091485694, + "grad_norm": 3.0060160160064697, + "learning_rate": 3.263552706205128e-05, + "loss": 5.0501, + "step": 542 + }, + { + "epoch": 0.030210864041838988, + "grad_norm": 3.2223386764526367, + "learning_rate": 3.22471922351493e-05, + "loss": 5.2892, + "step": 543 + }, + { + "epoch": 0.03026650099219228, + "grad_norm": 3.4699134826660156, + "learning_rate": 3.186074230120244e-05, + "loss": 6.03, + "step": 544 + }, + { + "epoch": 0.030322137942545575, + "grad_norm": 3.263355016708374, + "learning_rate": 3.147618785493083e-05, + "loss": 5.1747, + "step": 545 + }, + { + "epoch": 0.03037777489289887, + "grad_norm": 4.594723224639893, + "learning_rate": 3.109353943908893e-05, + "loss": 5.658, + "step": 546 + }, + { + "epoch": 0.030433411843252167, + "grad_norm": 3.1902480125427246, + "learning_rate": 3.071280754417626e-05, + "loss": 5.0878, + "step": 547 + }, + { + "epoch": 0.03048904879360546, + "grad_norm": 3.541567325592041, + "learning_rate": 3.033400260815008e-05, + "loss": 5.7298, + "step": 548 + }, + { + "epoch": 0.030544685743958754, + "grad_norm": 3.197453022003174, + "learning_rate": 2.9957135016139122e-05, + "loss": 5.198, + "step": 549 + }, + { + "epoch": 0.030600322694312048, + "grad_norm": 3.3864033222198486, + "learning_rate": 2.9582215100158706e-05, + "loss": 4.4524, + "step": 550 + }, + { + "epoch": 0.030655959644665345, + "grad_norm": 3.3881423473358154, + "learning_rate": 2.920925313882776e-05, + "loss": 6.1387, + "step": 551 + }, + { + "epoch": 0.03071159659501864, + "grad_norm": 2.8004982471466064, + "learning_rate": 2.8838259357086884e-05, + "loss": 4.9554, + "step": 552 + }, + { + "epoch": 0.030767233545371933, + "grad_norm": 3.141254425048828, + "learning_rate": 2.846924392591794e-05, + "loss": 5.6829, + "step": 553 + }, + { + "epoch": 0.030822870495725227, + "grad_norm": 3.157815456390381, + "learning_rate": 2.8102216962065423e-05, + "loss": 5.5436, + "step": 554 + }, + { + "epoch": 0.03087850744607852, + "grad_norm": 3.5025582313537598, + "learning_rate": 2.7737188527758972e-05, + "loss": 5.6507, + "step": 555 + }, + { + "epoch": 0.030934144396431818, + "grad_norm": 4.286128044128418, + "learning_rate": 2.7374168630437456e-05, + "loss": 5.5292, + "step": 556 + }, + { + "epoch": 0.030989781346785112, + "grad_norm": 3.13250470161438, + "learning_rate": 2.7013167222474756e-05, + "loss": 4.9745, + "step": 557 + }, + { + "epoch": 0.031045418297138406, + "grad_norm": 3.3368782997131348, + "learning_rate": 2.6654194200906833e-05, + "loss": 5.528, + "step": 558 + }, + { + "epoch": 0.0311010552474917, + "grad_norm": 2.921546220779419, + "learning_rate": 2.629725940716041e-05, + "loss": 5.139, + "step": 559 + }, + { + "epoch": 0.031156692197844997, + "grad_norm": 3.0273654460906982, + "learning_rate": 2.5942372626783172e-05, + "loss": 4.8338, + "step": 560 + }, + { + "epoch": 0.03121232914819829, + "grad_norm": 3.3616998195648193, + "learning_rate": 2.5589543589175485e-05, + "loss": 4.6992, + "step": 561 + }, + { + "epoch": 0.031267966098551585, + "grad_norm": 3.146557092666626, + "learning_rate": 2.523878196732358e-05, + "loss": 5.675, + "step": 562 + }, + { + "epoch": 0.03132360304890488, + "grad_norm": 3.1204278469085693, + "learning_rate": 2.489009737753459e-05, + "loss": 4.7224, + "step": 563 + }, + { + "epoch": 0.03137923999925817, + "grad_norm": 3.319640874862671, + "learning_rate": 2.4543499379172615e-05, + "loss": 5.7704, + "step": 564 + }, + { + "epoch": 0.031434876949611466, + "grad_norm": 3.0480804443359375, + "learning_rate": 2.4198997474396877e-05, + "loss": 5.3685, + "step": 565 + }, + { + "epoch": 0.03149051389996476, + "grad_norm": 3.0615460872650146, + "learning_rate": 2.3856601107901166e-05, + "loss": 5.1338, + "step": 566 + }, + { + "epoch": 0.03154615085031806, + "grad_norm": 3.0596776008605957, + "learning_rate": 2.351631966665476e-05, + "loss": 6.0379, + "step": 567 + }, + { + "epoch": 0.031601787800671355, + "grad_norm": 3.2118067741394043, + "learning_rate": 2.31781624796453e-05, + "loss": 5.2096, + "step": 568 + }, + { + "epoch": 0.03165742475102465, + "grad_norm": 3.5685503482818604, + "learning_rate": 2.2842138817622883e-05, + "loss": 6.0739, + "step": 569 + }, + { + "epoch": 0.03171306170137794, + "grad_norm": 3.4660439491271973, + "learning_rate": 2.250825789284594e-05, + "loss": 4.9725, + "step": 570 + }, + { + "epoch": 0.031768698651731236, + "grad_norm": 3.092043399810791, + "learning_rate": 2.217652885882869e-05, + "loss": 5.2655, + "step": 571 + }, + { + "epoch": 0.03182433560208453, + "grad_norm": 3.073158025741577, + "learning_rate": 2.1846960810090188e-05, + "loss": 5.2164, + "step": 572 + }, + { + "epoch": 0.031879972552437824, + "grad_norm": 3.2738797664642334, + "learning_rate": 2.151956278190494e-05, + "loss": 5.1302, + "step": 573 + }, + { + "epoch": 0.03193560950279112, + "grad_norm": 3.1305830478668213, + "learning_rate": 2.119434375005527e-05, + "loss": 5.0297, + "step": 574 + }, + { + "epoch": 0.03199124645314441, + "grad_norm": 3.142305850982666, + "learning_rate": 2.087131263058526e-05, + "loss": 5.1092, + "step": 575 + }, + { + "epoch": 0.03204688340349771, + "grad_norm": 3.49165940284729, + "learning_rate": 2.055047827955618e-05, + "loss": 5.914, + "step": 576 + }, + { + "epoch": 0.032102520353851006, + "grad_norm": 3.1859958171844482, + "learning_rate": 2.0231849492803852e-05, + "loss": 5.6582, + "step": 577 + }, + { + "epoch": 0.0321581573042043, + "grad_norm": 2.9012060165405273, + "learning_rate": 1.991543500569745e-05, + "loss": 5.3881, + "step": 578 + }, + { + "epoch": 0.032213794254557594, + "grad_norm": 3.5505943298339844, + "learning_rate": 1.960124349289992e-05, + "loss": 5.3315, + "step": 579 + }, + { + "epoch": 0.03226943120491089, + "grad_norm": 3.426257610321045, + "learning_rate": 1.928928356813032e-05, + "loss": 5.6951, + "step": 580 + }, + { + "epoch": 0.03232506815526418, + "grad_norm": 3.150081157684326, + "learning_rate": 1.8979563783927565e-05, + "loss": 5.2932, + "step": 581 + }, + { + "epoch": 0.032380705105617476, + "grad_norm": 3.2909204959869385, + "learning_rate": 1.8672092631416013e-05, + "loss": 5.3027, + "step": 582 + }, + { + "epoch": 0.03243634205597077, + "grad_norm": 3.8638181686401367, + "learning_rate": 1.8366878540072614e-05, + "loss": 6.4383, + "step": 583 + }, + { + "epoch": 0.032491979006324063, + "grad_norm": 3.067730665206909, + "learning_rate": 1.8063929877495892e-05, + "loss": 5.553, + "step": 584 + }, + { + "epoch": 0.032547615956677364, + "grad_norm": 4.0556559562683105, + "learning_rate": 1.7763254949176414e-05, + "loss": 5.6984, + "step": 585 + }, + { + "epoch": 0.03260325290703066, + "grad_norm": 3.1798980236053467, + "learning_rate": 1.7464861998269243e-05, + "loss": 5.6912, + "step": 586 + }, + { + "epoch": 0.03265888985738395, + "grad_norm": 3.126953363418579, + "learning_rate": 1.7168759205367893e-05, + "loss": 5.1372, + "step": 587 + }, + { + "epoch": 0.032714526807737246, + "grad_norm": 3.512899160385132, + "learning_rate": 1.6874954688279956e-05, + "loss": 5.5996, + "step": 588 + }, + { + "epoch": 0.03277016375809054, + "grad_norm": 3.397019147872925, + "learning_rate": 1.6583456501804725e-05, + "loss": 5.4761, + "step": 589 + }, + { + "epoch": 0.032825800708443834, + "grad_norm": 3.2552711963653564, + "learning_rate": 1.6294272637512183e-05, + "loss": 5.0166, + "step": 590 + }, + { + "epoch": 0.03288143765879713, + "grad_norm": 3.104959011077881, + "learning_rate": 1.600741102352409e-05, + "loss": 5.0748, + "step": 591 + }, + { + "epoch": 0.03293707460915042, + "grad_norm": 2.9046316146850586, + "learning_rate": 1.57228795242965e-05, + "loss": 4.7039, + "step": 592 + }, + { + "epoch": 0.032992711559503715, + "grad_norm": 2.9473962783813477, + "learning_rate": 1.544068594040417e-05, + "loss": 5.0494, + "step": 593 + }, + { + "epoch": 0.033048348509857016, + "grad_norm": 3.3171298503875732, + "learning_rate": 1.516083800832676e-05, + "loss": 5.0401, + "step": 594 + }, + { + "epoch": 0.03310398546021031, + "grad_norm": 2.6179308891296387, + "learning_rate": 1.488334340023669e-05, + "loss": 4.5191, + "step": 595 + }, + { + "epoch": 0.033159622410563604, + "grad_norm": 3.19975209236145, + "learning_rate": 1.4608209723788835e-05, + "loss": 5.0509, + "step": 596 + }, + { + "epoch": 0.0332152593609169, + "grad_norm": 3.4313442707061768, + "learning_rate": 1.4335444521911899e-05, + "loss": 5.5635, + "step": 597 + }, + { + "epoch": 0.03327089631127019, + "grad_norm": 3.330453634262085, + "learning_rate": 1.4065055272601703e-05, + "loss": 4.9744, + "step": 598 + }, + { + "epoch": 0.033326533261623485, + "grad_norm": 3.262907028198242, + "learning_rate": 1.3797049388716065e-05, + "loss": 5.5586, + "step": 599 + }, + { + "epoch": 0.03338217021197678, + "grad_norm": 2.928389549255371, + "learning_rate": 1.3531434217771692e-05, + "loss": 5.2089, + "step": 600 + }, + { + "epoch": 0.03343780716233007, + "grad_norm": 3.3383753299713135, + "learning_rate": 1.3268217041742701e-05, + "loss": 5.0282, + "step": 601 + }, + { + "epoch": 0.03349344411268337, + "grad_norm": 3.2334837913513184, + "learning_rate": 1.3007405076860875e-05, + "loss": 5.5836, + "step": 602 + }, + { + "epoch": 0.03354908106303667, + "grad_norm": 3.505136013031006, + "learning_rate": 1.2749005473418015e-05, + "loss": 4.8968, + "step": 603 + }, + { + "epoch": 0.03360471801338996, + "grad_norm": 3.3375158309936523, + "learning_rate": 1.2493025315569801e-05, + "loss": 5.6284, + "step": 604 + }, + { + "epoch": 0.033660354963743255, + "grad_norm": 3.081942081451416, + "learning_rate": 1.2239471621141508e-05, + "loss": 5.2374, + "step": 605 + }, + { + "epoch": 0.03371599191409655, + "grad_norm": 3.623671054840088, + "learning_rate": 1.1988351341435792e-05, + "loss": 5.4548, + "step": 606 + }, + { + "epoch": 0.03377162886444984, + "grad_norm": 3.3528661727905273, + "learning_rate": 1.173967136104196e-05, + "loss": 5.0832, + "step": 607 + }, + { + "epoch": 0.03382726581480314, + "grad_norm": 2.910083055496216, + "learning_rate": 1.1493438497647313e-05, + "loss": 5.1922, + "step": 608 + }, + { + "epoch": 0.03388290276515643, + "grad_norm": 3.314018964767456, + "learning_rate": 1.1249659501850155e-05, + "loss": 5.2669, + "step": 609 + }, + { + "epoch": 0.033938539715509725, + "grad_norm": 3.2967429161071777, + "learning_rate": 1.1008341056974854e-05, + "loss": 5.1903, + "step": 610 + }, + { + "epoch": 0.03399417666586302, + "grad_norm": 3.3789079189300537, + "learning_rate": 1.0769489778888405e-05, + "loss": 5.3849, + "step": 611 + }, + { + "epoch": 0.03404981361621632, + "grad_norm": 2.777427911758423, + "learning_rate": 1.0533112215819298e-05, + "loss": 4.8711, + "step": 612 + }, + { + "epoch": 0.03410545056656961, + "grad_norm": 2.8376872539520264, + "learning_rate": 1.029921484817783e-05, + "loss": 4.4653, + "step": 613 + }, + { + "epoch": 0.03416108751692291, + "grad_norm": 3.2733492851257324, + "learning_rate": 1.0067804088378455e-05, + "loss": 5.3222, + "step": 614 + }, + { + "epoch": 0.0342167244672762, + "grad_norm": 3.173684597015381, + "learning_rate": 9.8388862806641e-06, + "loss": 5.4409, + "step": 615 + }, + { + "epoch": 0.034272361417629495, + "grad_norm": 3.3361847400665283, + "learning_rate": 9.612467700932045e-06, + "loss": 5.6518, + "step": 616 + }, + { + "epoch": 0.03432799836798279, + "grad_norm": 3.1028096675872803, + "learning_rate": 9.388554556562049e-06, + "loss": 5.5069, + "step": 617 + }, + { + "epoch": 0.03438363531833608, + "grad_norm": 3.2164828777313232, + "learning_rate": 9.167152986246078e-06, + "loss": 5.5031, + "step": 618 + }, + { + "epoch": 0.034439272268689376, + "grad_norm": 3.419510841369629, + "learning_rate": 8.948269059820025e-06, + "loss": 5.5147, + "step": 619 + }, + { + "epoch": 0.03449490921904267, + "grad_norm": 3.42582631111145, + "learning_rate": 8.731908778097302e-06, + "loss": 5.5011, + "step": 620 + }, + { + "epoch": 0.03455054616939597, + "grad_norm": 3.4066319465637207, + "learning_rate": 8.518078072704338e-06, + "loss": 5.7813, + "step": 621 + }, + { + "epoch": 0.034606183119749265, + "grad_norm": 3.2096457481384277, + "learning_rate": 8.306782805917904e-06, + "loss": 4.8056, + "step": 622 + }, + { + "epoch": 0.03466182007010256, + "grad_norm": 3.057443618774414, + "learning_rate": 8.098028770504494e-06, + "loss": 4.6218, + "step": 623 + }, + { + "epoch": 0.03471745702045585, + "grad_norm": 3.057856559753418, + "learning_rate": 7.891821689561459e-06, + "loss": 5.0136, + "step": 624 + }, + { + "epoch": 0.034773093970809146, + "grad_norm": 3.4276516437530518, + "learning_rate": 7.68816721636004e-06, + "loss": 4.6166, + "step": 625 + }, + { + "epoch": 0.03482873092116244, + "grad_norm": 3.630868673324585, + "learning_rate": 7.487070934190532e-06, + "loss": 5.4276, + "step": 626 + }, + { + "epoch": 0.034884367871515734, + "grad_norm": 3.2166192531585693, + "learning_rate": 7.288538356209092e-06, + "loss": 5.2448, + "step": 627 + }, + { + "epoch": 0.03494000482186903, + "grad_norm": 3.6713593006134033, + "learning_rate": 7.092574925286614e-06, + "loss": 5.6868, + "step": 628 + }, + { + "epoch": 0.03499564177222233, + "grad_norm": 3.1197285652160645, + "learning_rate": 6.899186013859561e-06, + "loss": 5.2911, + "step": 629 + }, + { + "epoch": 0.03505127872257562, + "grad_norm": 3.6722445487976074, + "learning_rate": 6.708376923782635e-06, + "loss": 4.8365, + "step": 630 + }, + { + "epoch": 0.035106915672928916, + "grad_norm": 2.946157693862915, + "learning_rate": 6.520152886183406e-06, + "loss": 5.03, + "step": 631 + }, + { + "epoch": 0.03516255262328221, + "grad_norm": 3.5462489128112793, + "learning_rate": 6.3345190613189635e-06, + "loss": 5.4141, + "step": 632 + }, + { + "epoch": 0.035218189573635504, + "grad_norm": 3.6179442405700684, + "learning_rate": 6.151480538434382e-06, + "loss": 5.4522, + "step": 633 + }, + { + "epoch": 0.0352738265239888, + "grad_norm": 3.187727451324463, + "learning_rate": 5.971042335623229e-06, + "loss": 5.1797, + "step": 634 + }, + { + "epoch": 0.03532946347434209, + "grad_norm": 3.1899514198303223, + "learning_rate": 5.793209399689978e-06, + "loss": 5.4256, + "step": 635 + }, + { + "epoch": 0.035385100424695386, + "grad_norm": 3.337507963180542, + "learning_rate": 5.617986606014419e-06, + "loss": 5.6824, + "step": 636 + }, + { + "epoch": 0.03544073737504868, + "grad_norm": 3.0960090160369873, + "learning_rate": 5.445378758417925e-06, + "loss": 4.8739, + "step": 637 + }, + { + "epoch": 0.03549637432540198, + "grad_norm": 3.0640668869018555, + "learning_rate": 5.275390589031859e-06, + "loss": 5.2784, + "step": 638 + }, + { + "epoch": 0.035552011275755274, + "grad_norm": 3.3573837280273438, + "learning_rate": 5.108026758167719e-06, + "loss": 5.6433, + "step": 639 + }, + { + "epoch": 0.03560764822610857, + "grad_norm": 2.7123923301696777, + "learning_rate": 4.943291854189493e-06, + "loss": 5.1223, + "step": 640 + }, + { + "epoch": 0.03566328517646186, + "grad_norm": 3.1964516639709473, + "learning_rate": 4.781190393387796e-06, + "loss": 4.6121, + "step": 641 + }, + { + "epoch": 0.035718922126815156, + "grad_norm": 3.294018030166626, + "learning_rate": 4.6217268198560404e-06, + "loss": 5.5015, + "step": 642 + }, + { + "epoch": 0.03577455907716845, + "grad_norm": 3.1996569633483887, + "learning_rate": 4.464905505368658e-06, + "loss": 5.6098, + "step": 643 + }, + { + "epoch": 0.035830196027521743, + "grad_norm": 3.5497488975524902, + "learning_rate": 4.3107307492612086e-06, + "loss": 5.5032, + "step": 644 + }, + { + "epoch": 0.03588583297787504, + "grad_norm": 2.9554443359375, + "learning_rate": 4.1592067783125015e-06, + "loss": 5.2009, + "step": 645 + }, + { + "epoch": 0.03594146992822833, + "grad_norm": 4.038937091827393, + "learning_rate": 4.010337746628751e-06, + "loss": 5.7413, + "step": 646 + }, + { + "epoch": 0.03599710687858163, + "grad_norm": 3.1167092323303223, + "learning_rate": 3.864127735529656e-06, + "loss": 5.3229, + "step": 647 + }, + { + "epoch": 0.036052743828934926, + "grad_norm": 3.3470003604888916, + "learning_rate": 3.7205807534365315e-06, + "loss": 5.3129, + "step": 648 + }, + { + "epoch": 0.03610838077928822, + "grad_norm": 3.6280088424682617, + "learning_rate": 3.5797007357623945e-06, + "loss": 5.528, + "step": 649 + }, + { + "epoch": 0.036164017729641514, + "grad_norm": 2.901085376739502, + "learning_rate": 3.441491544804112e-06, + "loss": 5.1328, + "step": 650 + }, + { + "epoch": 0.03621965467999481, + "grad_norm": 2.996978998184204, + "learning_rate": 3.3059569696364502e-06, + "loss": 4.824, + "step": 651 + }, + { + "epoch": 0.0362752916303481, + "grad_norm": 4.007577419281006, + "learning_rate": 3.1731007260082616e-06, + "loss": 5.6728, + "step": 652 + }, + { + "epoch": 0.036330928580701395, + "grad_norm": 3.5199480056762695, + "learning_rate": 3.0429264562405776e-06, + "loss": 5.2839, + "step": 653 + }, + { + "epoch": 0.03638656553105469, + "grad_norm": 3.0006823539733887, + "learning_rate": 2.9154377291267674e-06, + "loss": 5.1906, + "step": 654 + }, + { + "epoch": 0.03644220248140798, + "grad_norm": 3.2792537212371826, + "learning_rate": 2.790638039834668e-06, + "loss": 5.2099, + "step": 655 + }, + { + "epoch": 0.036497839431761284, + "grad_norm": 3.3460628986358643, + "learning_rate": 2.6685308098108106e-06, + "loss": 5.3725, + "step": 656 + }, + { + "epoch": 0.03655347638211458, + "grad_norm": 3.1523663997650146, + "learning_rate": 2.5491193866866025e-06, + "loss": 5.2477, + "step": 657 + }, + { + "epoch": 0.03660911333246787, + "grad_norm": 3.209110975265503, + "learning_rate": 2.432407044186509e-06, + "loss": 5.3084, + "step": 658 + }, + { + "epoch": 0.036664750282821165, + "grad_norm": 3.2112083435058594, + "learning_rate": 2.3183969820383735e-06, + "loss": 5.7235, + "step": 659 + }, + { + "epoch": 0.03672038723317446, + "grad_norm": 2.9874796867370605, + "learning_rate": 2.2070923258856255e-06, + "loss": 4.8377, + "step": 660 + }, + { + "epoch": 0.03677602418352775, + "grad_norm": 3.120227336883545, + "learning_rate": 2.098496127201648e-06, + "loss": 4.9049, + "step": 661 + }, + { + "epoch": 0.03683166113388105, + "grad_norm": 3.2788374423980713, + "learning_rate": 1.992611363206103e-06, + "loss": 5.5721, + "step": 662 + }, + { + "epoch": 0.03688729808423434, + "grad_norm": 3.341951608657837, + "learning_rate": 1.889440936783242e-06, + "loss": 5.8991, + "step": 663 + }, + { + "epoch": 0.036942935034587635, + "grad_norm": 3.2425339221954346, + "learning_rate": 1.7889876764024505e-06, + "loss": 5.5709, + "step": 664 + }, + { + "epoch": 0.036998571984940935, + "grad_norm": 3.178581476211548, + "learning_rate": 1.691254336040595e-06, + "loss": 5.3055, + "step": 665 + }, + { + "epoch": 0.03705420893529423, + "grad_norm": 3.122201681137085, + "learning_rate": 1.59624359510657e-06, + "loss": 5.0241, + "step": 666 + }, + { + "epoch": 0.03710984588564752, + "grad_norm": 3.15179181098938, + "learning_rate": 1.5039580583678393e-06, + "loss": 5.6298, + "step": 667 + }, + { + "epoch": 0.03716548283600082, + "grad_norm": 2.999861717224121, + "learning_rate": 1.414400255879008e-06, + "loss": 5.2461, + "step": 668 + }, + { + "epoch": 0.03722111978635411, + "grad_norm": 3.14927339553833, + "learning_rate": 1.327572642912468e-06, + "loss": 5.2516, + "step": 669 + }, + { + "epoch": 0.037276756736707405, + "grad_norm": 3.683779716491699, + "learning_rate": 1.2434775998910964e-06, + "loss": 5.308, + "step": 670 + }, + { + "epoch": 0.0373323936870607, + "grad_norm": 3.206599712371826, + "learning_rate": 1.1621174323229612e-06, + "loss": 5.072, + "step": 671 + }, + { + "epoch": 0.03738803063741399, + "grad_norm": 3.208451747894287, + "learning_rate": 1.0834943707381784e-06, + "loss": 5.3667, + "step": 672 + }, + { + "epoch": 0.037443667587767286, + "grad_norm": 2.992967367172241, + "learning_rate": 1.0076105706276888e-06, + "loss": 5.2797, + "step": 673 + }, + { + "epoch": 0.03749930453812059, + "grad_norm": 3.831514835357666, + "learning_rate": 9.344681123841967e-07, + "loss": 5.329, + "step": 674 + }, + { + "epoch": 0.03755494148847388, + "grad_norm": 3.573158025741577, + "learning_rate": 8.640690012451515e-07, + "loss": 5.4266, + "step": 675 + }, + { + "epoch": 0.037610578438827175, + "grad_norm": 3.2301135063171387, + "learning_rate": 7.964151672377458e-07, + "loss": 5.2795, + "step": 676 + }, + { + "epoch": 0.03766621538918047, + "grad_norm": 3.9718902111053467, + "learning_rate": 7.315084651260009e-07, + "loss": 5.9585, + "step": 677 + }, + { + "epoch": 0.03772185233953376, + "grad_norm": 3.3933451175689697, + "learning_rate": 6.69350674359959e-07, + "loss": 5.5774, + "step": 678 + }, + { + "epoch": 0.037777489289887056, + "grad_norm": 3.264383554458618, + "learning_rate": 6.099434990268609e-07, + "loss": 5.6062, + "step": 679 + }, + { + "epoch": 0.03783312624024035, + "grad_norm": 3.261469602584839, + "learning_rate": 5.532885678043977e-07, + "loss": 5.4672, + "step": 680 + }, + { + "epoch": 0.037888763190593644, + "grad_norm": 3.185216188430786, + "learning_rate": 4.9938743391615e-07, + "loss": 5.3658, + "step": 681 + }, + { + "epoch": 0.03794440014094694, + "grad_norm": 3.737952947616577, + "learning_rate": 4.482415750889204e-07, + "loss": 5.3403, + "step": 682 + }, + { + "epoch": 0.03800003709130024, + "grad_norm": 3.1326498985290527, + "learning_rate": 3.998523935122772e-07, + "loss": 5.6484, + "step": 683 + }, + { + "epoch": 0.03805567404165353, + "grad_norm": 3.3844358921051025, + "learning_rate": 3.5422121580005864e-07, + "loss": 5.8857, + "step": 684 + }, + { + "epoch": 0.038111310992006826, + "grad_norm": 3.121704578399658, + "learning_rate": 3.1134929295407564e-07, + "loss": 5.2231, + "step": 685 + }, + { + "epoch": 0.03816694794236012, + "grad_norm": 3.0350818634033203, + "learning_rate": 2.7123780032973235e-07, + "loss": 5.3108, + "step": 686 + }, + { + "epoch": 0.038222584892713414, + "grad_norm": 4.1188764572143555, + "learning_rate": 2.3388783760386601e-07, + "loss": 6.2249, + "step": 687 + }, + { + "epoch": 0.03827822184306671, + "grad_norm": 3.0596892833709717, + "learning_rate": 1.9930042874457254e-07, + "loss": 5.082, + "step": 688 + }, + { + "epoch": 0.03833385879342, + "grad_norm": 3.485663890838623, + "learning_rate": 1.6747652198313957e-07, + "loss": 5.8324, + "step": 689 + }, + { + "epoch": 0.038389495743773296, + "grad_norm": 3.1050713062286377, + "learning_rate": 1.3841698978804285e-07, + "loss": 5.4737, + "step": 690 + }, + { + "epoch": 0.03844513269412659, + "grad_norm": 3.163748264312744, + "learning_rate": 1.1212262884103974e-07, + "loss": 5.5179, + "step": 691 + }, + { + "epoch": 0.03850076964447989, + "grad_norm": 2.8173980712890625, + "learning_rate": 8.85941600153033e-08, + "loss": 4.7785, + "step": 692 + }, + { + "epoch": 0.038556406594833184, + "grad_norm": 3.2245240211486816, + "learning_rate": 6.783222835572055e-08, + "loss": 5.2927, + "step": 693 + }, + { + "epoch": 0.03861204354518648, + "grad_norm": 2.993494987487793, + "learning_rate": 4.98374030611084e-08, + "loss": 4.9075, + "step": 694 + }, + { + "epoch": 0.03866768049553977, + "grad_norm": 3.4781525135040283, + "learning_rate": 3.461017746871675e-08, + "loss": 5.2247, + "step": 695 + }, + { + "epoch": 0.038723317445893066, + "grad_norm": 3.1908767223358154, + "learning_rate": 2.215096904060454e-08, + "loss": 6.0285, + "step": 696 + }, + { + "epoch": 0.03877895439624636, + "grad_norm": 3.04532790184021, + "learning_rate": 1.246011935228064e-08, + "loss": 4.8014, + "step": 697 + }, + { + "epoch": 0.03883459134659965, + "grad_norm": 3.597644567489624, + "learning_rate": 5.537894083273543e-09, + "loss": 5.4907, + "step": 698 + }, + { + "epoch": 0.03889022829695295, + "grad_norm": 2.9295830726623535, + "learning_rate": 1.384483009898796e-09, + "loss": 5.1188, + "step": 699 + }, + { + "epoch": 0.03894586524730624, + "grad_norm": 3.0771842002868652, + "learning_rate": 0.0, + "loss": 5.178, + "step": 700 + } + ], + "logging_steps": 1, + "max_steps": 700, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.16061635690496e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}