diff --git "a/checkpoint-4968/trainer_state.json" "b/checkpoint-4968/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4968/trainer_state.json" @@ -0,0 +1,34881 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9996981587684877, + "eval_steps": 500, + "global_step": 4968, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006036824630244491, + "grad_norm": 0.876740574836731, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.3437, + "step": 1 + }, + { + "epoch": 0.0012073649260488982, + "grad_norm": 0.8783963322639465, + "learning_rate": 2.666666666666667e-06, + "loss": 1.4745, + "step": 2 + }, + { + "epoch": 0.0018110473890733474, + "grad_norm": 0.8955765962600708, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6589, + "step": 3 + }, + { + "epoch": 0.0024147298520977964, + "grad_norm": 1.0829285383224487, + "learning_rate": 5.333333333333334e-06, + "loss": 1.5662, + "step": 4 + }, + { + "epoch": 0.003018412315122246, + "grad_norm": 1.1975041627883911, + "learning_rate": 6.666666666666667e-06, + "loss": 1.4658, + "step": 5 + }, + { + "epoch": 0.003622094778146695, + "grad_norm": 1.0904583930969238, + "learning_rate": 8.000000000000001e-06, + "loss": 1.6691, + "step": 6 + }, + { + "epoch": 0.004225777241171144, + "grad_norm": 0.6331161260604858, + "learning_rate": 9.333333333333334e-06, + "loss": 1.5705, + "step": 7 + }, + { + "epoch": 0.004829459704195593, + "grad_norm": 0.3753370940685272, + "learning_rate": 1.0666666666666667e-05, + "loss": 1.5175, + "step": 8 + }, + { + "epoch": 0.005433142167220043, + "grad_norm": 2.936147451400757, + "learning_rate": 1.2e-05, + "loss": 1.5404, + "step": 9 + }, + { + "epoch": 0.006036824630244492, + "grad_norm": 2.635810613632202, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.5605, + "step": 10 + }, + { + "epoch": 0.006640507093268941, + "grad_norm": 1.9180281162261963, + "learning_rate": 1.4666666666666668e-05, + "loss": 1.5239, + "step": 11 + }, + { + "epoch": 0.00724418955629339, + "grad_norm": 3.021333932876587, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.5081, + "step": 12 + }, + { + "epoch": 0.00784787201931784, + "grad_norm": 1.9040541648864746, + "learning_rate": 1.7333333333333336e-05, + "loss": 1.4366, + "step": 13 + }, + { + "epoch": 0.008451554482342288, + "grad_norm": 1.0555269718170166, + "learning_rate": 1.866666666666667e-05, + "loss": 1.6606, + "step": 14 + }, + { + "epoch": 0.009055236945366738, + "grad_norm": 0.5196813344955444, + "learning_rate": 2e-05, + "loss": 1.7105, + "step": 15 + }, + { + "epoch": 0.009658919408391186, + "grad_norm": 1.1674487590789795, + "learning_rate": 2.1333333333333335e-05, + "loss": 1.3767, + "step": 16 + }, + { + "epoch": 0.010262601871415635, + "grad_norm": 1.3149781227111816, + "learning_rate": 2.2666666666666668e-05, + "loss": 1.2422, + "step": 17 + }, + { + "epoch": 0.010866284334440085, + "grad_norm": 0.5175302028656006, + "learning_rate": 2.4e-05, + "loss": 1.3791, + "step": 18 + }, + { + "epoch": 0.011469966797464533, + "grad_norm": 0.3626837432384491, + "learning_rate": 2.5333333333333337e-05, + "loss": 1.4945, + "step": 19 + }, + { + "epoch": 0.012073649260488983, + "grad_norm": 0.26292356848716736, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.3598, + "step": 20 + }, + { + "epoch": 0.012677331723513431, + "grad_norm": 0.2678084671497345, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.3896, + "step": 21 + }, + { + "epoch": 0.013281014186537881, + "grad_norm": 0.34738680720329285, + "learning_rate": 2.9333333333333336e-05, + "loss": 1.3734, + "step": 22 + }, + { + "epoch": 0.01388469664956233, + "grad_norm": 0.3626849949359894, + "learning_rate": 3.066666666666667e-05, + "loss": 1.306, + "step": 23 + }, + { + "epoch": 0.01448837911258678, + "grad_norm": 0.2588627338409424, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.3373, + "step": 24 + }, + { + "epoch": 0.01509206157561123, + "grad_norm": 0.6032963395118713, + "learning_rate": 3.3333333333333335e-05, + "loss": 1.1394, + "step": 25 + }, + { + "epoch": 0.01569574403863568, + "grad_norm": 0.194705531001091, + "learning_rate": 3.466666666666667e-05, + "loss": 1.0727, + "step": 26 + }, + { + "epoch": 0.016299426501660125, + "grad_norm": 0.3403484523296356, + "learning_rate": 3.6e-05, + "loss": 1.045, + "step": 27 + }, + { + "epoch": 0.016903108964684575, + "grad_norm": 0.3210083544254303, + "learning_rate": 3.733333333333334e-05, + "loss": 1.285, + "step": 28 + }, + { + "epoch": 0.017506791427709025, + "grad_norm": 0.6296122074127197, + "learning_rate": 3.866666666666667e-05, + "loss": 1.15, + "step": 29 + }, + { + "epoch": 0.018110473890733475, + "grad_norm": 0.4753726124763489, + "learning_rate": 4e-05, + "loss": 1.0053, + "step": 30 + }, + { + "epoch": 0.018714156353757925, + "grad_norm": 0.19821159541606903, + "learning_rate": 4.133333333333333e-05, + "loss": 1.1965, + "step": 31 + }, + { + "epoch": 0.01931783881678237, + "grad_norm": 0.2836245000362396, + "learning_rate": 4.266666666666667e-05, + "loss": 1.0633, + "step": 32 + }, + { + "epoch": 0.01992152127980682, + "grad_norm": 0.19199256598949432, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.9728, + "step": 33 + }, + { + "epoch": 0.02052520374283127, + "grad_norm": 0.17820274829864502, + "learning_rate": 4.5333333333333335e-05, + "loss": 1.0132, + "step": 34 + }, + { + "epoch": 0.02112888620585572, + "grad_norm": 0.20217391848564148, + "learning_rate": 4.666666666666667e-05, + "loss": 1.0202, + "step": 35 + }, + { + "epoch": 0.02173256866888017, + "grad_norm": 0.18383841216564178, + "learning_rate": 4.8e-05, + "loss": 0.9666, + "step": 36 + }, + { + "epoch": 0.022336251131904617, + "grad_norm": 0.17872729897499084, + "learning_rate": 4.933333333333334e-05, + "loss": 0.9385, + "step": 37 + }, + { + "epoch": 0.022939933594929067, + "grad_norm": 0.14148443937301636, + "learning_rate": 5.0666666666666674e-05, + "loss": 0.9338, + "step": 38 + }, + { + "epoch": 0.023543616057953517, + "grad_norm": 0.14587634801864624, + "learning_rate": 5.2000000000000004e-05, + "loss": 0.964, + "step": 39 + }, + { + "epoch": 0.024147298520977967, + "grad_norm": 0.17298458516597748, + "learning_rate": 5.333333333333333e-05, + "loss": 0.9934, + "step": 40 + }, + { + "epoch": 0.024750980984002413, + "grad_norm": 0.1523771435022354, + "learning_rate": 5.466666666666666e-05, + "loss": 0.9995, + "step": 41 + }, + { + "epoch": 0.025354663447026863, + "grad_norm": 0.16784684360027313, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.9621, + "step": 42 + }, + { + "epoch": 0.025958345910051313, + "grad_norm": 0.18494223058223724, + "learning_rate": 5.7333333333333336e-05, + "loss": 0.9181, + "step": 43 + }, + { + "epoch": 0.026562028373075763, + "grad_norm": 0.15912078320980072, + "learning_rate": 5.866666666666667e-05, + "loss": 0.9311, + "step": 44 + }, + { + "epoch": 0.027165710836100213, + "grad_norm": 0.17092670500278473, + "learning_rate": 6e-05, + "loss": 0.8838, + "step": 45 + }, + { + "epoch": 0.02776939329912466, + "grad_norm": 0.2552148401737213, + "learning_rate": 6.133333333333334e-05, + "loss": 0.9121, + "step": 46 + }, + { + "epoch": 0.02837307576214911, + "grad_norm": 0.20719660818576813, + "learning_rate": 6.266666666666667e-05, + "loss": 0.8881, + "step": 47 + }, + { + "epoch": 0.02897675822517356, + "grad_norm": 0.25345510244369507, + "learning_rate": 6.400000000000001e-05, + "loss": 0.8592, + "step": 48 + }, + { + "epoch": 0.02958044068819801, + "grad_norm": 0.23677493631839752, + "learning_rate": 6.533333333333334e-05, + "loss": 0.8428, + "step": 49 + }, + { + "epoch": 0.03018412315122246, + "grad_norm": 0.3329518437385559, + "learning_rate": 6.666666666666667e-05, + "loss": 0.7946, + "step": 50 + }, + { + "epoch": 0.030787805614246905, + "grad_norm": 0.1412961632013321, + "learning_rate": 6.800000000000001e-05, + "loss": 0.9836, + "step": 51 + }, + { + "epoch": 0.03139148807727136, + "grad_norm": 0.1430910974740982, + "learning_rate": 6.933333333333334e-05, + "loss": 1.0782, + "step": 52 + }, + { + "epoch": 0.031995170540295804, + "grad_norm": 0.19594986736774445, + "learning_rate": 7.066666666666667e-05, + "loss": 0.9402, + "step": 53 + }, + { + "epoch": 0.03259885300332025, + "grad_norm": 0.15413224697113037, + "learning_rate": 7.2e-05, + "loss": 1.0387, + "step": 54 + }, + { + "epoch": 0.033202535466344704, + "grad_norm": 0.15673322975635529, + "learning_rate": 7.333333333333333e-05, + "loss": 0.9795, + "step": 55 + }, + { + "epoch": 0.03380621792936915, + "grad_norm": 0.14926938712596893, + "learning_rate": 7.466666666666667e-05, + "loss": 1.088, + "step": 56 + }, + { + "epoch": 0.034409900392393604, + "grad_norm": 0.20912237465381622, + "learning_rate": 7.6e-05, + "loss": 0.9247, + "step": 57 + }, + { + "epoch": 0.03501358285541805, + "grad_norm": 0.1995040327310562, + "learning_rate": 7.733333333333333e-05, + "loss": 0.9202, + "step": 58 + }, + { + "epoch": 0.0356172653184425, + "grad_norm": 0.20646536350250244, + "learning_rate": 7.866666666666666e-05, + "loss": 0.8604, + "step": 59 + }, + { + "epoch": 0.03622094778146695, + "grad_norm": 0.2546747624874115, + "learning_rate": 8e-05, + "loss": 0.955, + "step": 60 + }, + { + "epoch": 0.036824630244491396, + "grad_norm": 0.25772592425346375, + "learning_rate": 8.133333333333334e-05, + "loss": 0.9363, + "step": 61 + }, + { + "epoch": 0.03742831270751585, + "grad_norm": 0.4138891398906708, + "learning_rate": 8.266666666666667e-05, + "loss": 0.83, + "step": 62 + }, + { + "epoch": 0.038031995170540296, + "grad_norm": 0.2324582189321518, + "learning_rate": 8.4e-05, + "loss": 0.95, + "step": 63 + }, + { + "epoch": 0.03863567763356474, + "grad_norm": 0.22521646320819855, + "learning_rate": 8.533333333333334e-05, + "loss": 0.8359, + "step": 64 + }, + { + "epoch": 0.039239360096589196, + "grad_norm": 0.19713546335697174, + "learning_rate": 8.666666666666667e-05, + "loss": 0.9552, + "step": 65 + }, + { + "epoch": 0.03984304255961364, + "grad_norm": 0.1569695621728897, + "learning_rate": 8.800000000000001e-05, + "loss": 0.9283, + "step": 66 + }, + { + "epoch": 0.040446725022638096, + "grad_norm": 0.13675987720489502, + "learning_rate": 8.933333333333334e-05, + "loss": 0.9144, + "step": 67 + }, + { + "epoch": 0.04105040748566254, + "grad_norm": 0.08005592226982117, + "learning_rate": 9.066666666666667e-05, + "loss": 0.9661, + "step": 68 + }, + { + "epoch": 0.04165408994868699, + "grad_norm": 0.17173191905021667, + "learning_rate": 9.200000000000001e-05, + "loss": 1.1486, + "step": 69 + }, + { + "epoch": 0.04225777241171144, + "grad_norm": 0.2080589085817337, + "learning_rate": 9.333333333333334e-05, + "loss": 0.8091, + "step": 70 + }, + { + "epoch": 0.04286145487473589, + "grad_norm": 0.21545596420764923, + "learning_rate": 9.466666666666667e-05, + "loss": 0.8606, + "step": 71 + }, + { + "epoch": 0.04346513733776034, + "grad_norm": 0.21326108276844025, + "learning_rate": 9.6e-05, + "loss": 0.9271, + "step": 72 + }, + { + "epoch": 0.04406881980078479, + "grad_norm": 0.21748293936252594, + "learning_rate": 9.733333333333335e-05, + "loss": 1.1042, + "step": 73 + }, + { + "epoch": 0.044672502263809234, + "grad_norm": 0.20910663902759552, + "learning_rate": 9.866666666666668e-05, + "loss": 0.8998, + "step": 74 + }, + { + "epoch": 0.04527618472683369, + "grad_norm": 0.19514353573322296, + "learning_rate": 0.0001, + "loss": 0.8976, + "step": 75 + }, + { + "epoch": 0.045879867189858134, + "grad_norm": 0.17271868884563446, + "learning_rate": 0.00010133333333333335, + "loss": 0.8919, + "step": 76 + }, + { + "epoch": 0.04648354965288258, + "grad_norm": 0.17146329581737518, + "learning_rate": 0.00010266666666666666, + "loss": 0.8703, + "step": 77 + }, + { + "epoch": 0.047087232115907034, + "grad_norm": 0.12326353043317795, + "learning_rate": 0.00010400000000000001, + "loss": 1.0872, + "step": 78 + }, + { + "epoch": 0.04769091457893148, + "grad_norm": 0.08894680440425873, + "learning_rate": 0.00010533333333333332, + "loss": 0.9586, + "step": 79 + }, + { + "epoch": 0.04829459704195593, + "grad_norm": 0.07784762233495712, + "learning_rate": 0.00010666666666666667, + "loss": 0.9304, + "step": 80 + }, + { + "epoch": 0.04889827950498038, + "grad_norm": 0.08525023609399796, + "learning_rate": 0.00010800000000000001, + "loss": 1.1813, + "step": 81 + }, + { + "epoch": 0.049501961968004826, + "grad_norm": 0.11610118299722672, + "learning_rate": 0.00010933333333333333, + "loss": 0.8561, + "step": 82 + }, + { + "epoch": 0.05010564443102928, + "grad_norm": 0.12873214483261108, + "learning_rate": 0.00011066666666666667, + "loss": 0.8394, + "step": 83 + }, + { + "epoch": 0.050709326894053726, + "grad_norm": 0.11888518184423447, + "learning_rate": 0.00011200000000000001, + "loss": 0.873, + "step": 84 + }, + { + "epoch": 0.05131300935707818, + "grad_norm": 0.12383485585451126, + "learning_rate": 0.00011333333333333334, + "loss": 0.803, + "step": 85 + }, + { + "epoch": 0.051916691820102626, + "grad_norm": 0.13411828875541687, + "learning_rate": 0.00011466666666666667, + "loss": 0.8332, + "step": 86 + }, + { + "epoch": 0.05252037428312707, + "grad_norm": 0.12794946134090424, + "learning_rate": 0.000116, + "loss": 0.8405, + "step": 87 + }, + { + "epoch": 0.053124056746151525, + "grad_norm": 0.12056224048137665, + "learning_rate": 0.00011733333333333334, + "loss": 0.9108, + "step": 88 + }, + { + "epoch": 0.05372773920917597, + "grad_norm": 0.11550690233707428, + "learning_rate": 0.00011866666666666669, + "loss": 0.9052, + "step": 89 + }, + { + "epoch": 0.054331421672200425, + "grad_norm": 0.09730254113674164, + "learning_rate": 0.00012, + "loss": 0.9104, + "step": 90 + }, + { + "epoch": 0.05493510413522487, + "grad_norm": 0.10430070012807846, + "learning_rate": 0.00012133333333333335, + "loss": 0.8333, + "step": 91 + }, + { + "epoch": 0.05553878659824932, + "grad_norm": 0.10595888644456863, + "learning_rate": 0.00012266666666666668, + "loss": 0.7792, + "step": 92 + }, + { + "epoch": 0.05614246906127377, + "grad_norm": 0.13614441454410553, + "learning_rate": 0.000124, + "loss": 0.8317, + "step": 93 + }, + { + "epoch": 0.05674615152429822, + "grad_norm": 0.1545422077178955, + "learning_rate": 0.00012533333333333334, + "loss": 0.8139, + "step": 94 + }, + { + "epoch": 0.05734983398732267, + "grad_norm": 0.17710909247398376, + "learning_rate": 0.00012666666666666666, + "loss": 0.8969, + "step": 95 + }, + { + "epoch": 0.05795351645034712, + "grad_norm": 0.17680825293064117, + "learning_rate": 0.00012800000000000002, + "loss": 0.7812, + "step": 96 + }, + { + "epoch": 0.058557198913371564, + "grad_norm": 0.17637892067432404, + "learning_rate": 0.00012933333333333332, + "loss": 0.7518, + "step": 97 + }, + { + "epoch": 0.05916088137639602, + "grad_norm": 0.14965130388736725, + "learning_rate": 0.00013066666666666668, + "loss": 0.6467, + "step": 98 + }, + { + "epoch": 0.05976456383942046, + "grad_norm": 0.15980888903141022, + "learning_rate": 0.000132, + "loss": 0.724, + "step": 99 + }, + { + "epoch": 0.06036824630244492, + "grad_norm": 0.1575448364019394, + "learning_rate": 0.00013333333333333334, + "loss": 0.523, + "step": 100 + }, + { + "epoch": 0.06097192876546936, + "grad_norm": 0.0918162539601326, + "learning_rate": 0.00013466666666666667, + "loss": 0.8158, + "step": 101 + }, + { + "epoch": 0.06157561122849381, + "grad_norm": 0.09651898592710495, + "learning_rate": 0.00013600000000000003, + "loss": 1.1298, + "step": 102 + }, + { + "epoch": 0.06217929369151826, + "grad_norm": 0.10406182706356049, + "learning_rate": 0.00013733333333333333, + "loss": 1.2655, + "step": 103 + }, + { + "epoch": 0.06278297615454272, + "grad_norm": 0.09533622860908508, + "learning_rate": 0.00013866666666666669, + "loss": 0.9408, + "step": 104 + }, + { + "epoch": 0.06338665861756716, + "grad_norm": 0.1048218235373497, + "learning_rate": 0.00014, + "loss": 0.9564, + "step": 105 + }, + { + "epoch": 0.06399034108059161, + "grad_norm": 0.09610721468925476, + "learning_rate": 0.00014133333333333334, + "loss": 0.8791, + "step": 106 + }, + { + "epoch": 0.06459402354361606, + "grad_norm": 0.08564590662717819, + "learning_rate": 0.00014266666666666667, + "loss": 1.0284, + "step": 107 + }, + { + "epoch": 0.0651977060066405, + "grad_norm": 0.07987428456544876, + "learning_rate": 0.000144, + "loss": 1.1595, + "step": 108 + }, + { + "epoch": 0.06580138846966496, + "grad_norm": 0.09171764552593231, + "learning_rate": 0.00014533333333333333, + "loss": 0.9098, + "step": 109 + }, + { + "epoch": 0.06640507093268941, + "grad_norm": 0.08003644645214081, + "learning_rate": 0.00014666666666666666, + "loss": 1.0118, + "step": 110 + }, + { + "epoch": 0.06700875339571385, + "grad_norm": 0.08099836111068726, + "learning_rate": 0.000148, + "loss": 1.0011, + "step": 111 + }, + { + "epoch": 0.0676124358587383, + "grad_norm": 0.08993902802467346, + "learning_rate": 0.00014933333333333335, + "loss": 1.2031, + "step": 112 + }, + { + "epoch": 0.06821611832176275, + "grad_norm": 0.0858883187174797, + "learning_rate": 0.00015066666666666668, + "loss": 0.7726, + "step": 113 + }, + { + "epoch": 0.06881980078478721, + "grad_norm": 0.07739171385765076, + "learning_rate": 0.000152, + "loss": 0.8213, + "step": 114 + }, + { + "epoch": 0.06942348324781165, + "grad_norm": 0.09992550313472748, + "learning_rate": 0.00015333333333333334, + "loss": 1.2648, + "step": 115 + }, + { + "epoch": 0.0700271657108361, + "grad_norm": 0.08734551072120667, + "learning_rate": 0.00015466666666666667, + "loss": 0.9113, + "step": 116 + }, + { + "epoch": 0.07063084817386055, + "grad_norm": 0.13743777573108673, + "learning_rate": 0.00015600000000000002, + "loss": 1.2323, + "step": 117 + }, + { + "epoch": 0.071234530636885, + "grad_norm": 0.08608172088861465, + "learning_rate": 0.00015733333333333333, + "loss": 1.0779, + "step": 118 + }, + { + "epoch": 0.07183821309990945, + "grad_norm": 0.07633006572723389, + "learning_rate": 0.00015866666666666668, + "loss": 0.9064, + "step": 119 + }, + { + "epoch": 0.0724418955629339, + "grad_norm": 0.07484789192676544, + "learning_rate": 0.00016, + "loss": 0.8816, + "step": 120 + }, + { + "epoch": 0.07304557802595835, + "grad_norm": 0.06947381794452667, + "learning_rate": 0.00016133333333333334, + "loss": 0.9609, + "step": 121 + }, + { + "epoch": 0.07364926048898279, + "grad_norm": 0.09331609308719635, + "learning_rate": 0.00016266666666666667, + "loss": 0.899, + "step": 122 + }, + { + "epoch": 0.07425294295200724, + "grad_norm": 0.0736856609582901, + "learning_rate": 0.000164, + "loss": 0.8749, + "step": 123 + }, + { + "epoch": 0.0748566254150317, + "grad_norm": 0.08171521127223969, + "learning_rate": 0.00016533333333333333, + "loss": 0.8952, + "step": 124 + }, + { + "epoch": 0.07546030787805615, + "grad_norm": 0.0707223191857338, + "learning_rate": 0.0001666666666666667, + "loss": 0.9492, + "step": 125 + }, + { + "epoch": 0.07606399034108059, + "grad_norm": 0.07196628302335739, + "learning_rate": 0.000168, + "loss": 1.0499, + "step": 126 + }, + { + "epoch": 0.07666767280410504, + "grad_norm": 0.07485999912023544, + "learning_rate": 0.00016933333333333335, + "loss": 0.8929, + "step": 127 + }, + { + "epoch": 0.07727135526712949, + "grad_norm": 0.06874241679906845, + "learning_rate": 0.00017066666666666668, + "loss": 0.8589, + "step": 128 + }, + { + "epoch": 0.07787503773015395, + "grad_norm": 0.07588055729866028, + "learning_rate": 0.000172, + "loss": 0.9973, + "step": 129 + }, + { + "epoch": 0.07847872019317839, + "grad_norm": 0.07442892342805862, + "learning_rate": 0.00017333333333333334, + "loss": 1.3857, + "step": 130 + }, + { + "epoch": 0.07908240265620284, + "grad_norm": 0.07991475611925125, + "learning_rate": 0.00017466666666666667, + "loss": 0.8986, + "step": 131 + }, + { + "epoch": 0.07968608511922728, + "grad_norm": 0.07599324733018875, + "learning_rate": 0.00017600000000000002, + "loss": 0.874, + "step": 132 + }, + { + "epoch": 0.08028976758225173, + "grad_norm": 0.08738681674003601, + "learning_rate": 0.00017733333333333335, + "loss": 0.8635, + "step": 133 + }, + { + "epoch": 0.08089345004527619, + "grad_norm": 0.08158082515001297, + "learning_rate": 0.00017866666666666668, + "loss": 0.8143, + "step": 134 + }, + { + "epoch": 0.08149713250830064, + "grad_norm": 0.08535363525152206, + "learning_rate": 0.00018, + "loss": 0.7629, + "step": 135 + }, + { + "epoch": 0.08210081497132508, + "grad_norm": 0.08319278806447983, + "learning_rate": 0.00018133333333333334, + "loss": 0.7807, + "step": 136 + }, + { + "epoch": 0.08270449743434953, + "grad_norm": 0.10717540234327316, + "learning_rate": 0.00018266666666666667, + "loss": 0.7731, + "step": 137 + }, + { + "epoch": 0.08330817989737398, + "grad_norm": 0.08758540451526642, + "learning_rate": 0.00018400000000000003, + "loss": 0.7561, + "step": 138 + }, + { + "epoch": 0.08391186236039844, + "grad_norm": 0.09366493672132492, + "learning_rate": 0.00018533333333333333, + "loss": 0.7703, + "step": 139 + }, + { + "epoch": 0.08451554482342288, + "grad_norm": 0.10316765308380127, + "learning_rate": 0.0001866666666666667, + "loss": 0.7798, + "step": 140 + }, + { + "epoch": 0.08511922728644733, + "grad_norm": 0.09655182808637619, + "learning_rate": 0.000188, + "loss": 0.8053, + "step": 141 + }, + { + "epoch": 0.08572290974947178, + "grad_norm": 0.09309862554073334, + "learning_rate": 0.00018933333333333335, + "loss": 0.7777, + "step": 142 + }, + { + "epoch": 0.08632659221249622, + "grad_norm": 0.11444847285747528, + "learning_rate": 0.00019066666666666668, + "loss": 0.8681, + "step": 143 + }, + { + "epoch": 0.08693027467552068, + "grad_norm": 0.09801818430423737, + "learning_rate": 0.000192, + "loss": 0.6858, + "step": 144 + }, + { + "epoch": 0.08753395713854513, + "grad_norm": 0.10028098523616791, + "learning_rate": 0.00019333333333333333, + "loss": 0.8063, + "step": 145 + }, + { + "epoch": 0.08813763960156958, + "grad_norm": 0.11404330283403397, + "learning_rate": 0.0001946666666666667, + "loss": 0.8138, + "step": 146 + }, + { + "epoch": 0.08874132206459402, + "grad_norm": 0.12089970707893372, + "learning_rate": 0.000196, + "loss": 0.653, + "step": 147 + }, + { + "epoch": 0.08934500452761847, + "grad_norm": 0.13996511697769165, + "learning_rate": 0.00019733333333333335, + "loss": 0.6567, + "step": 148 + }, + { + "epoch": 0.08994868699064293, + "grad_norm": 0.1570245325565338, + "learning_rate": 0.00019866666666666668, + "loss": 0.6688, + "step": 149 + }, + { + "epoch": 0.09055236945366738, + "grad_norm": 0.14232978224754333, + "learning_rate": 0.0002, + "loss": 0.5686, + "step": 150 + }, + { + "epoch": 0.09115605191669182, + "grad_norm": 0.09911559522151947, + "learning_rate": 0.00019995848899958488, + "loss": 0.9231, + "step": 151 + }, + { + "epoch": 0.09175973437971627, + "grad_norm": 0.0942869484424591, + "learning_rate": 0.0001999169779991698, + "loss": 0.9987, + "step": 152 + }, + { + "epoch": 0.09236341684274071, + "grad_norm": 0.08815930783748627, + "learning_rate": 0.00019987546699875468, + "loss": 1.0607, + "step": 153 + }, + { + "epoch": 0.09296709930576516, + "grad_norm": 0.0868469625711441, + "learning_rate": 0.00019983395599833956, + "loss": 0.9998, + "step": 154 + }, + { + "epoch": 0.09357078176878962, + "grad_norm": 0.07780348509550095, + "learning_rate": 0.00019979244499792446, + "loss": 0.911, + "step": 155 + }, + { + "epoch": 0.09417446423181407, + "grad_norm": 0.09089189767837524, + "learning_rate": 0.00019975093399750936, + "loss": 0.9344, + "step": 156 + }, + { + "epoch": 0.09477814669483851, + "grad_norm": 0.07558704167604446, + "learning_rate": 0.00019970942299709423, + "loss": 0.8435, + "step": 157 + }, + { + "epoch": 0.09538182915786296, + "grad_norm": 0.1009376272559166, + "learning_rate": 0.00019966791199667913, + "loss": 0.8571, + "step": 158 + }, + { + "epoch": 0.0959855116208874, + "grad_norm": 0.07474809885025024, + "learning_rate": 0.00019962640099626403, + "loss": 0.9764, + "step": 159 + }, + { + "epoch": 0.09658919408391187, + "grad_norm": 0.0775795429944992, + "learning_rate": 0.0001995848899958489, + "loss": 1.0164, + "step": 160 + }, + { + "epoch": 0.09719287654693631, + "grad_norm": 0.07155506312847137, + "learning_rate": 0.0001995433789954338, + "loss": 0.9886, + "step": 161 + }, + { + "epoch": 0.09779655900996076, + "grad_norm": 0.0788518413901329, + "learning_rate": 0.00019950186799501867, + "loss": 0.8248, + "step": 162 + }, + { + "epoch": 0.0984002414729852, + "grad_norm": 0.06630228459835052, + "learning_rate": 0.00019946035699460357, + "loss": 0.9792, + "step": 163 + }, + { + "epoch": 0.09900392393600965, + "grad_norm": 0.20900733768939972, + "learning_rate": 0.00019941884599418847, + "loss": 0.9877, + "step": 164 + }, + { + "epoch": 0.09960760639903411, + "grad_norm": 0.06860389560461044, + "learning_rate": 0.00019937733499377335, + "loss": 0.9082, + "step": 165 + }, + { + "epoch": 0.10021128886205856, + "grad_norm": 0.06557682156562805, + "learning_rate": 0.00019933582399335825, + "loss": 0.8677, + "step": 166 + }, + { + "epoch": 0.100814971325083, + "grad_norm": 0.07390300929546356, + "learning_rate": 0.00019929431299294315, + "loss": 1.1377, + "step": 167 + }, + { + "epoch": 0.10141865378810745, + "grad_norm": 0.07005932927131653, + "learning_rate": 0.00019925280199252802, + "loss": 1.0111, + "step": 168 + }, + { + "epoch": 0.1020223362511319, + "grad_norm": 0.06733494251966476, + "learning_rate": 0.00019921129099211292, + "loss": 0.9273, + "step": 169 + }, + { + "epoch": 0.10262601871415636, + "grad_norm": 0.4364582300186157, + "learning_rate": 0.00019916977999169782, + "loss": 0.863, + "step": 170 + }, + { + "epoch": 0.1032297011771808, + "grad_norm": 0.06318307667970657, + "learning_rate": 0.0001991282689912827, + "loss": 1.0073, + "step": 171 + }, + { + "epoch": 0.10383338364020525, + "grad_norm": 492.8619689941406, + "learning_rate": 0.00019908675799086757, + "loss": 0.9392, + "step": 172 + }, + { + "epoch": 0.1044370661032297, + "grad_norm": 14.513906478881836, + "learning_rate": 0.0001990452469904525, + "loss": 0.8524, + "step": 173 + }, + { + "epoch": 0.10504074856625414, + "grad_norm": 0.08823119848966599, + "learning_rate": 0.00019900373599003737, + "loss": 0.8712, + "step": 174 + }, + { + "epoch": 0.1056444310292786, + "grad_norm": 0.0739288404583931, + "learning_rate": 0.00019896222498962227, + "loss": 1.0265, + "step": 175 + }, + { + "epoch": 0.10624811349230305, + "grad_norm": 6.7402520179748535, + "learning_rate": 0.00019892071398920714, + "loss": 0.832, + "step": 176 + }, + { + "epoch": 0.1068517959553275, + "grad_norm": 0.20480509102344513, + "learning_rate": 0.00019887920298879204, + "loss": 0.962, + "step": 177 + }, + { + "epoch": 0.10745547841835194, + "grad_norm": 2.3439409732818604, + "learning_rate": 0.00019883769198837694, + "loss": 0.8044, + "step": 178 + }, + { + "epoch": 0.10805916088137639, + "grad_norm": 10.576101303100586, + "learning_rate": 0.0001987961809879618, + "loss": 1.1324, + "step": 179 + }, + { + "epoch": 0.10866284334440085, + "grad_norm": 5.063470840454102, + "learning_rate": 0.0001987546699875467, + "loss": 1.0371, + "step": 180 + }, + { + "epoch": 0.1092665258074253, + "grad_norm": 0.47612178325653076, + "learning_rate": 0.0001987131589871316, + "loss": 1.0325, + "step": 181 + }, + { + "epoch": 0.10987020827044974, + "grad_norm": 0.5040671229362488, + "learning_rate": 0.00019867164798671649, + "loss": 0.8587, + "step": 182 + }, + { + "epoch": 0.11047389073347419, + "grad_norm": 0.49563658237457275, + "learning_rate": 0.00019863013698630139, + "loss": 0.9701, + "step": 183 + }, + { + "epoch": 0.11107757319649864, + "grad_norm": 0.5639724731445312, + "learning_rate": 0.00019858862598588629, + "loss": 0.8009, + "step": 184 + }, + { + "epoch": 0.1116812556595231, + "grad_norm": 0.10233303904533386, + "learning_rate": 0.00019854711498547116, + "loss": 0.8951, + "step": 185 + }, + { + "epoch": 0.11228493812254754, + "grad_norm": 0.1553623080253601, + "learning_rate": 0.00019850560398505603, + "loss": 0.7539, + "step": 186 + }, + { + "epoch": 0.11288862058557199, + "grad_norm": 0.15513239800930023, + "learning_rate": 0.00019846409298464096, + "loss": 0.8451, + "step": 187 + }, + { + "epoch": 0.11349230304859644, + "grad_norm": 0.15111730992794037, + "learning_rate": 0.00019842258198422583, + "loss": 0.8565, + "step": 188 + }, + { + "epoch": 0.11409598551162088, + "grad_norm": 0.17460127174854279, + "learning_rate": 0.0001983810709838107, + "loss": 0.8017, + "step": 189 + }, + { + "epoch": 0.11469966797464534, + "grad_norm": 0.19736827909946442, + "learning_rate": 0.0001983395599833956, + "loss": 0.7927, + "step": 190 + }, + { + "epoch": 0.11530335043766979, + "grad_norm": 0.15492677688598633, + "learning_rate": 0.0001982980489829805, + "loss": 0.8179, + "step": 191 + }, + { + "epoch": 0.11590703290069423, + "grad_norm": 0.33444538712501526, + "learning_rate": 0.00019825653798256538, + "loss": 0.8264, + "step": 192 + }, + { + "epoch": 0.11651071536371868, + "grad_norm": 0.15822017192840576, + "learning_rate": 0.00019821502698215028, + "loss": 0.7953, + "step": 193 + }, + { + "epoch": 0.11711439782674313, + "grad_norm": 0.19879087805747986, + "learning_rate": 0.00019817351598173518, + "loss": 0.8023, + "step": 194 + }, + { + "epoch": 0.11771808028976759, + "grad_norm": 0.155591681599617, + "learning_rate": 0.00019813200498132005, + "loss": 0.6876, + "step": 195 + }, + { + "epoch": 0.11832176275279203, + "grad_norm": 0.19856096804141998, + "learning_rate": 0.00019809049398090495, + "loss": 0.7227, + "step": 196 + }, + { + "epoch": 0.11892544521581648, + "grad_norm": 0.14677810668945312, + "learning_rate": 0.00019804898298048985, + "loss": 0.729, + "step": 197 + }, + { + "epoch": 0.11952912767884093, + "grad_norm": 3.824247121810913, + "learning_rate": 0.00019800747198007472, + "loss": 0.7045, + "step": 198 + }, + { + "epoch": 0.12013281014186537, + "grad_norm": 1.2843711376190186, + "learning_rate": 0.00019796596097965962, + "loss": 0.681, + "step": 199 + }, + { + "epoch": 0.12073649260488983, + "grad_norm": 0.15802475810050964, + "learning_rate": 0.0001979244499792445, + "loss": 0.54, + "step": 200 + }, + { + "epoch": 0.12134017506791428, + "grad_norm": 0.08058023452758789, + "learning_rate": 0.0001978829389788294, + "loss": 1.2773, + "step": 201 + }, + { + "epoch": 0.12194385753093873, + "grad_norm": 0.09442666172981262, + "learning_rate": 0.0001978414279784143, + "loss": 1.0519, + "step": 202 + }, + { + "epoch": 0.12254753999396317, + "grad_norm": 0.11098414659500122, + "learning_rate": 0.00019779991697799917, + "loss": 0.9809, + "step": 203 + }, + { + "epoch": 0.12315122245698762, + "grad_norm": 0.08391015231609344, + "learning_rate": 0.00019775840597758407, + "loss": 0.7483, + "step": 204 + }, + { + "epoch": 0.12375490492001208, + "grad_norm": 0.09060636162757874, + "learning_rate": 0.00019771689497716897, + "loss": 0.9001, + "step": 205 + }, + { + "epoch": 0.12435858738303653, + "grad_norm": 0.08621667325496674, + "learning_rate": 0.00019767538397675384, + "loss": 1.0983, + "step": 206 + }, + { + "epoch": 0.12496226984606097, + "grad_norm": 0.07971488684415817, + "learning_rate": 0.00019763387297633874, + "loss": 0.8414, + "step": 207 + }, + { + "epoch": 0.12556595230908543, + "grad_norm": 0.07815425097942352, + "learning_rate": 0.00019759236197592364, + "loss": 0.8872, + "step": 208 + }, + { + "epoch": 0.12616963477210988, + "grad_norm": 0.0847320556640625, + "learning_rate": 0.00019755085097550852, + "loss": 0.8398, + "step": 209 + }, + { + "epoch": 0.12677331723513433, + "grad_norm": 0.08897458761930466, + "learning_rate": 0.0001975093399750934, + "loss": 0.7865, + "step": 210 + }, + { + "epoch": 0.12737699969815877, + "grad_norm": 0.0703200101852417, + "learning_rate": 0.00019746782897467832, + "loss": 0.8256, + "step": 211 + }, + { + "epoch": 0.12798068216118322, + "grad_norm": 0.07879285514354706, + "learning_rate": 0.0001974263179742632, + "loss": 0.9032, + "step": 212 + }, + { + "epoch": 0.12858436462420766, + "grad_norm": 0.07276301831007004, + "learning_rate": 0.00019738480697384806, + "loss": 0.9857, + "step": 213 + }, + { + "epoch": 0.1291880470872321, + "grad_norm": 0.09440230578184128, + "learning_rate": 0.00019734329597343296, + "loss": 0.9156, + "step": 214 + }, + { + "epoch": 0.12979172955025656, + "grad_norm": 0.1108328178524971, + "learning_rate": 0.00019730178497301786, + "loss": 0.9671, + "step": 215 + }, + { + "epoch": 0.130395412013281, + "grad_norm": 0.07690040022134781, + "learning_rate": 0.00019726027397260273, + "loss": 0.9945, + "step": 216 + }, + { + "epoch": 0.13099909447630545, + "grad_norm": 0.06875025480985641, + "learning_rate": 0.00019721876297218763, + "loss": 0.911, + "step": 217 + }, + { + "epoch": 0.13160277693932992, + "grad_norm": 0.09118712693452835, + "learning_rate": 0.00019717725197177253, + "loss": 1.1363, + "step": 218 + }, + { + "epoch": 0.13220645940235437, + "grad_norm": 0.07356920093297958, + "learning_rate": 0.00019713574097135743, + "loss": 1.14, + "step": 219 + }, + { + "epoch": 0.13281014186537882, + "grad_norm": 0.06750470399856567, + "learning_rate": 0.0001970942299709423, + "loss": 0.9591, + "step": 220 + }, + { + "epoch": 0.13341382432840326, + "grad_norm": 0.11778965592384338, + "learning_rate": 0.0001970527189705272, + "loss": 0.9048, + "step": 221 + }, + { + "epoch": 0.1340175067914277, + "grad_norm": 0.07331351190805435, + "learning_rate": 0.0001970112079701121, + "loss": 0.8792, + "step": 222 + }, + { + "epoch": 0.13462118925445216, + "grad_norm": 0.08289069682359695, + "learning_rate": 0.00019696969696969698, + "loss": 1.1292, + "step": 223 + }, + { + "epoch": 0.1352248717174766, + "grad_norm": 0.07485181093215942, + "learning_rate": 0.00019692818596928185, + "loss": 1.013, + "step": 224 + }, + { + "epoch": 0.13582855418050105, + "grad_norm": 0.08499836176633835, + "learning_rate": 0.00019688667496886678, + "loss": 0.9177, + "step": 225 + }, + { + "epoch": 0.1364322366435255, + "grad_norm": 0.06567627936601639, + "learning_rate": 0.00019684516396845165, + "loss": 0.9016, + "step": 226 + }, + { + "epoch": 0.13703591910654994, + "grad_norm": 0.07324650138616562, + "learning_rate": 0.00019680365296803653, + "loss": 0.9649, + "step": 227 + }, + { + "epoch": 0.13763960156957442, + "grad_norm": 0.06484885513782501, + "learning_rate": 0.00019676214196762143, + "loss": 0.824, + "step": 228 + }, + { + "epoch": 0.13824328403259886, + "grad_norm": 0.06534811854362488, + "learning_rate": 0.00019672063096720633, + "loss": 0.7969, + "step": 229 + }, + { + "epoch": 0.1388469664956233, + "grad_norm": 0.06635700166225433, + "learning_rate": 0.0001966791199667912, + "loss": 0.9041, + "step": 230 + }, + { + "epoch": 0.13945064895864775, + "grad_norm": 0.06364124268293381, + "learning_rate": 0.0001966376089663761, + "loss": 0.8935, + "step": 231 + }, + { + "epoch": 0.1400543314216722, + "grad_norm": 0.0640692487359047, + "learning_rate": 0.000196596097965961, + "loss": 0.9025, + "step": 232 + }, + { + "epoch": 0.14065801388469665, + "grad_norm": 0.07015056163072586, + "learning_rate": 0.00019655458696554587, + "loss": 0.8065, + "step": 233 + }, + { + "epoch": 0.1412616963477211, + "grad_norm": 0.07654477655887604, + "learning_rate": 0.00019651307596513077, + "loss": 0.8408, + "step": 234 + }, + { + "epoch": 0.14186537881074554, + "grad_norm": 0.06782008707523346, + "learning_rate": 0.00019647156496471567, + "loss": 0.7275, + "step": 235 + }, + { + "epoch": 0.14246906127377, + "grad_norm": 0.07276429980993271, + "learning_rate": 0.00019643005396430055, + "loss": 0.7955, + "step": 236 + }, + { + "epoch": 0.14307274373679443, + "grad_norm": 0.08035031706094742, + "learning_rate": 0.00019638854296388545, + "loss": 0.7074, + "step": 237 + }, + { + "epoch": 0.1436764261998189, + "grad_norm": 0.10624828189611435, + "learning_rate": 0.00019634703196347032, + "loss": 0.8799, + "step": 238 + }, + { + "epoch": 0.14428010866284335, + "grad_norm": 0.0777585506439209, + "learning_rate": 0.00019630552096305522, + "loss": 0.7528, + "step": 239 + }, + { + "epoch": 0.1448837911258678, + "grad_norm": 0.0803024098277092, + "learning_rate": 0.00019626400996264012, + "loss": 0.7702, + "step": 240 + }, + { + "epoch": 0.14548747358889225, + "grad_norm": 0.0814545750617981, + "learning_rate": 0.000196222498962225, + "loss": 0.78, + "step": 241 + }, + { + "epoch": 0.1460911560519167, + "grad_norm": 0.0940927267074585, + "learning_rate": 0.0001961809879618099, + "loss": 0.7294, + "step": 242 + }, + { + "epoch": 0.14669483851494114, + "grad_norm": 0.08877824991941452, + "learning_rate": 0.0001961394769613948, + "loss": 0.7663, + "step": 243 + }, + { + "epoch": 0.14729852097796559, + "grad_norm": 0.09321099519729614, + "learning_rate": 0.00019609796596097966, + "loss": 0.7524, + "step": 244 + }, + { + "epoch": 0.14790220344099003, + "grad_norm": 0.09710489213466644, + "learning_rate": 0.00019605645496056454, + "loss": 0.8147, + "step": 245 + }, + { + "epoch": 0.14850588590401448, + "grad_norm": 0.10399672389030457, + "learning_rate": 0.00019601494396014946, + "loss": 0.7327, + "step": 246 + }, + { + "epoch": 0.14910956836703892, + "grad_norm": 0.11608180403709412, + "learning_rate": 0.00019597343295973434, + "loss": 0.7464, + "step": 247 + }, + { + "epoch": 0.1497132508300634, + "grad_norm": 0.10949523001909256, + "learning_rate": 0.0001959319219593192, + "loss": 0.6295, + "step": 248 + }, + { + "epoch": 0.15031693329308785, + "grad_norm": 0.10902351886034012, + "learning_rate": 0.00019589041095890414, + "loss": 0.6048, + "step": 249 + }, + { + "epoch": 0.1509206157561123, + "grad_norm": 0.11830253154039383, + "learning_rate": 0.000195848899958489, + "loss": 0.4881, + "step": 250 + }, + { + "epoch": 0.15152429821913674, + "grad_norm": 0.06550572067499161, + "learning_rate": 0.00019580738895807388, + "loss": 0.8051, + "step": 251 + }, + { + "epoch": 0.15212798068216118, + "grad_norm": 0.08449156582355499, + "learning_rate": 0.00019576587795765878, + "loss": 1.0096, + "step": 252 + }, + { + "epoch": 0.15273166314518563, + "grad_norm": 0.07215554267168045, + "learning_rate": 0.00019572436695724368, + "loss": 0.7212, + "step": 253 + }, + { + "epoch": 0.15333534560821008, + "grad_norm": 0.0707031637430191, + "learning_rate": 0.00019568285595682856, + "loss": 0.8708, + "step": 254 + }, + { + "epoch": 0.15393902807123452, + "grad_norm": 0.22556833922863007, + "learning_rate": 0.00019564134495641346, + "loss": 0.9448, + "step": 255 + }, + { + "epoch": 0.15454271053425897, + "grad_norm": 0.06614769995212555, + "learning_rate": 0.00019559983395599836, + "loss": 1.2733, + "step": 256 + }, + { + "epoch": 0.15514639299728342, + "grad_norm": 0.14129576086997986, + "learning_rate": 0.00019555832295558323, + "loss": 0.9814, + "step": 257 + }, + { + "epoch": 0.1557500754603079, + "grad_norm": 0.061934180557727814, + "learning_rate": 0.00019551681195516813, + "loss": 0.8052, + "step": 258 + }, + { + "epoch": 0.15635375792333234, + "grad_norm": 0.09368140995502472, + "learning_rate": 0.000195475300954753, + "loss": 0.9107, + "step": 259 + }, + { + "epoch": 0.15695744038635678, + "grad_norm": 0.066087506711483, + "learning_rate": 0.0001954337899543379, + "loss": 0.7128, + "step": 260 + }, + { + "epoch": 0.15756112284938123, + "grad_norm": 0.06857267022132874, + "learning_rate": 0.0001953922789539228, + "loss": 0.8081, + "step": 261 + }, + { + "epoch": 0.15816480531240568, + "grad_norm": 0.06954636424779892, + "learning_rate": 0.00019535076795350768, + "loss": 0.8834, + "step": 262 + }, + { + "epoch": 0.15876848777543012, + "grad_norm": 0.07501845061779022, + "learning_rate": 0.0001953092569530926, + "loss": 0.7618, + "step": 263 + }, + { + "epoch": 0.15937217023845457, + "grad_norm": 0.06725212186574936, + "learning_rate": 0.00019526774595267748, + "loss": 0.852, + "step": 264 + }, + { + "epoch": 0.15997585270147902, + "grad_norm": 0.06601674854755402, + "learning_rate": 0.00019522623495226235, + "loss": 0.8241, + "step": 265 + }, + { + "epoch": 0.16057953516450346, + "grad_norm": 0.0759337842464447, + "learning_rate": 0.00019518472395184725, + "loss": 1.1918, + "step": 266 + }, + { + "epoch": 0.1611832176275279, + "grad_norm": 0.0703561007976532, + "learning_rate": 0.00019514321295143215, + "loss": 0.8613, + "step": 267 + }, + { + "epoch": 0.16178690009055238, + "grad_norm": 0.07018351554870605, + "learning_rate": 0.00019510170195101702, + "loss": 1.0508, + "step": 268 + }, + { + "epoch": 0.16239058255357683, + "grad_norm": 0.06850980967283249, + "learning_rate": 0.00019506019095060192, + "loss": 0.9527, + "step": 269 + }, + { + "epoch": 0.16299426501660128, + "grad_norm": 0.06927502900362015, + "learning_rate": 0.00019501867995018682, + "loss": 0.9985, + "step": 270 + }, + { + "epoch": 0.16359794747962572, + "grad_norm": 0.07486286759376526, + "learning_rate": 0.0001949771689497717, + "loss": 0.9889, + "step": 271 + }, + { + "epoch": 0.16420162994265017, + "grad_norm": 0.06840648502111435, + "learning_rate": 0.0001949356579493566, + "loss": 0.902, + "step": 272 + }, + { + "epoch": 0.16480531240567461, + "grad_norm": 0.06660640239715576, + "learning_rate": 0.00019489414694894147, + "loss": 0.9774, + "step": 273 + }, + { + "epoch": 0.16540899486869906, + "grad_norm": 0.06089378520846367, + "learning_rate": 0.00019485263594852637, + "loss": 1.0334, + "step": 274 + }, + { + "epoch": 0.1660126773317235, + "grad_norm": 0.06978226453065872, + "learning_rate": 0.00019481112494811127, + "loss": 0.8932, + "step": 275 + }, + { + "epoch": 0.16661635979474795, + "grad_norm": 0.06877858191728592, + "learning_rate": 0.00019476961394769614, + "loss": 0.91, + "step": 276 + }, + { + "epoch": 0.1672200422577724, + "grad_norm": 0.06825589388608932, + "learning_rate": 0.00019472810294728104, + "loss": 0.9188, + "step": 277 + }, + { + "epoch": 0.16782372472079687, + "grad_norm": 0.06272375583648682, + "learning_rate": 0.00019468659194686594, + "loss": 0.8665, + "step": 278 + }, + { + "epoch": 0.16842740718382132, + "grad_norm": 0.07434337586164474, + "learning_rate": 0.0001946450809464508, + "loss": 0.9386, + "step": 279 + }, + { + "epoch": 0.16903108964684577, + "grad_norm": 0.062427766621112823, + "learning_rate": 0.0001946035699460357, + "loss": 0.8038, + "step": 280 + }, + { + "epoch": 0.1696347721098702, + "grad_norm": 0.07670903950929642, + "learning_rate": 0.0001945620589456206, + "loss": 0.8494, + "step": 281 + }, + { + "epoch": 0.17023845457289466, + "grad_norm": 0.06715535372495651, + "learning_rate": 0.00019452054794520549, + "loss": 1.0171, + "step": 282 + }, + { + "epoch": 0.1708421370359191, + "grad_norm": 0.0629987046122551, + "learning_rate": 0.00019447903694479036, + "loss": 0.8715, + "step": 283 + }, + { + "epoch": 0.17144581949894355, + "grad_norm": 0.0751439705491066, + "learning_rate": 0.00019443752594437529, + "loss": 0.8137, + "step": 284 + }, + { + "epoch": 0.172049501961968, + "grad_norm": 0.07195259630680084, + "learning_rate": 0.00019439601494396016, + "loss": 0.8276, + "step": 285 + }, + { + "epoch": 0.17265318442499245, + "grad_norm": 0.06936553120613098, + "learning_rate": 0.00019435450394354503, + "loss": 0.7662, + "step": 286 + }, + { + "epoch": 0.1732568668880169, + "grad_norm": 0.07795591652393341, + "learning_rate": 0.00019431299294312993, + "loss": 0.7729, + "step": 287 + }, + { + "epoch": 0.17386054935104137, + "grad_norm": 0.07397985458374023, + "learning_rate": 0.00019427148194271483, + "loss": 0.7924, + "step": 288 + }, + { + "epoch": 0.1744642318140658, + "grad_norm": 0.08324669301509857, + "learning_rate": 0.0001942299709422997, + "loss": 0.8163, + "step": 289 + }, + { + "epoch": 0.17506791427709026, + "grad_norm": 0.08435431867837906, + "learning_rate": 0.0001941884599418846, + "loss": 0.7395, + "step": 290 + }, + { + "epoch": 0.1756715967401147, + "grad_norm": 0.08156009763479233, + "learning_rate": 0.0001941469489414695, + "loss": 0.7299, + "step": 291 + }, + { + "epoch": 0.17627527920313915, + "grad_norm": 0.08714080601930618, + "learning_rate": 0.00019410543794105438, + "loss": 0.7918, + "step": 292 + }, + { + "epoch": 0.1768789616661636, + "grad_norm": 0.09345049411058426, + "learning_rate": 0.00019406392694063928, + "loss": 0.7841, + "step": 293 + }, + { + "epoch": 0.17748264412918804, + "grad_norm": 0.09327434748411179, + "learning_rate": 0.00019402241594022418, + "loss": 0.7073, + "step": 294 + }, + { + "epoch": 0.1780863265922125, + "grad_norm": 0.09284470230340958, + "learning_rate": 0.00019398090493980905, + "loss": 0.6427, + "step": 295 + }, + { + "epoch": 0.17869000905523694, + "grad_norm": 0.09828530997037888, + "learning_rate": 0.00019393939393939395, + "loss": 0.6953, + "step": 296 + }, + { + "epoch": 0.17929369151826138, + "grad_norm": 0.10501807928085327, + "learning_rate": 0.00019389788293897882, + "loss": 0.7286, + "step": 297 + }, + { + "epoch": 0.17989737398128586, + "grad_norm": 0.14845532178878784, + "learning_rate": 0.00019385637193856372, + "loss": 0.6602, + "step": 298 + }, + { + "epoch": 0.1805010564443103, + "grad_norm": 0.1208406463265419, + "learning_rate": 0.00019381486093814862, + "loss": 0.5862, + "step": 299 + }, + { + "epoch": 0.18110473890733475, + "grad_norm": 0.127839133143425, + "learning_rate": 0.0001937733499377335, + "loss": 0.4804, + "step": 300 + }, + { + "epoch": 0.1817084213703592, + "grad_norm": 0.07252588123083115, + "learning_rate": 0.0001937318389373184, + "loss": 0.8918, + "step": 301 + }, + { + "epoch": 0.18231210383338364, + "grad_norm": 0.07854969054460526, + "learning_rate": 0.0001936903279369033, + "loss": 0.9295, + "step": 302 + }, + { + "epoch": 0.1829157862964081, + "grad_norm": 0.07926355302333832, + "learning_rate": 0.00019364881693648817, + "loss": 1.0768, + "step": 303 + }, + { + "epoch": 0.18351946875943254, + "grad_norm": 0.1117272675037384, + "learning_rate": 0.00019360730593607307, + "loss": 0.89, + "step": 304 + }, + { + "epoch": 0.18412315122245698, + "grad_norm": 0.09249380975961685, + "learning_rate": 0.00019356579493565797, + "loss": 0.9915, + "step": 305 + }, + { + "epoch": 0.18472683368548143, + "grad_norm": 0.0759844034910202, + "learning_rate": 0.00019352428393524284, + "loss": 0.9705, + "step": 306 + }, + { + "epoch": 0.18533051614850587, + "grad_norm": 0.06819566339254379, + "learning_rate": 0.00019348277293482774, + "loss": 0.9801, + "step": 307 + }, + { + "epoch": 0.18593419861153032, + "grad_norm": 0.07146774232387543, + "learning_rate": 0.00019344126193441264, + "loss": 0.8923, + "step": 308 + }, + { + "epoch": 0.1865378810745548, + "grad_norm": 0.06504259258508682, + "learning_rate": 0.00019339975093399752, + "loss": 0.8941, + "step": 309 + }, + { + "epoch": 0.18714156353757924, + "grad_norm": 0.06857950240373611, + "learning_rate": 0.00019335823993358242, + "loss": 0.9558, + "step": 310 + }, + { + "epoch": 0.1877452460006037, + "grad_norm": 0.06270836293697357, + "learning_rate": 0.0001933167289331673, + "loss": 0.9532, + "step": 311 + }, + { + "epoch": 0.18834892846362813, + "grad_norm": 0.13885007798671722, + "learning_rate": 0.0001932752179327522, + "loss": 0.8591, + "step": 312 + }, + { + "epoch": 0.18895261092665258, + "grad_norm": 0.07498139888048172, + "learning_rate": 0.0001932337069323371, + "loss": 0.9812, + "step": 313 + }, + { + "epoch": 0.18955629338967703, + "grad_norm": 0.0773036777973175, + "learning_rate": 0.00019319219593192196, + "loss": 0.8807, + "step": 314 + }, + { + "epoch": 0.19015997585270147, + "grad_norm": 0.07549306750297546, + "learning_rate": 0.00019315068493150686, + "loss": 1.0929, + "step": 315 + }, + { + "epoch": 0.19076365831572592, + "grad_norm": 0.06931062042713165, + "learning_rate": 0.00019310917393109176, + "loss": 0.9293, + "step": 316 + }, + { + "epoch": 0.19136734077875037, + "grad_norm": 0.06612774729728699, + "learning_rate": 0.00019306766293067664, + "loss": 1.1528, + "step": 317 + }, + { + "epoch": 0.1919710232417748, + "grad_norm": 0.07355131208896637, + "learning_rate": 0.00019302615193026154, + "loss": 1.4277, + "step": 318 + }, + { + "epoch": 0.1925747057047993, + "grad_norm": 0.07465571165084839, + "learning_rate": 0.00019298464092984644, + "loss": 0.8799, + "step": 319 + }, + { + "epoch": 0.19317838816782373, + "grad_norm": 0.07097172737121582, + "learning_rate": 0.0001929431299294313, + "loss": 0.7634, + "step": 320 + }, + { + "epoch": 0.19378207063084818, + "grad_norm": 0.07077208161354065, + "learning_rate": 0.00019290161892901618, + "loss": 0.9721, + "step": 321 + }, + { + "epoch": 0.19438575309387263, + "grad_norm": 0.06071571260690689, + "learning_rate": 0.0001928601079286011, + "loss": 0.8238, + "step": 322 + }, + { + "epoch": 0.19498943555689707, + "grad_norm": 0.07852339744567871, + "learning_rate": 0.00019281859692818598, + "loss": 0.7897, + "step": 323 + }, + { + "epoch": 0.19559311801992152, + "grad_norm": 0.06762294471263885, + "learning_rate": 0.00019277708592777085, + "loss": 0.9692, + "step": 324 + }, + { + "epoch": 0.19619680048294597, + "grad_norm": 0.06589538604021072, + "learning_rate": 0.00019273557492735575, + "loss": 0.8701, + "step": 325 + }, + { + "epoch": 0.1968004829459704, + "grad_norm": 0.07512197643518448, + "learning_rate": 0.00019269406392694065, + "loss": 0.8921, + "step": 326 + }, + { + "epoch": 0.19740416540899486, + "grad_norm": 0.06501419097185135, + "learning_rate": 0.00019265255292652553, + "loss": 0.8559, + "step": 327 + }, + { + "epoch": 0.1980078478720193, + "grad_norm": 0.06205839663743973, + "learning_rate": 0.00019261104192611043, + "loss": 0.9534, + "step": 328 + }, + { + "epoch": 0.19861153033504378, + "grad_norm": 0.06113965064287186, + "learning_rate": 0.00019256953092569533, + "loss": 0.7965, + "step": 329 + }, + { + "epoch": 0.19921521279806823, + "grad_norm": 0.06127781420946121, + "learning_rate": 0.0001925280199252802, + "loss": 0.8429, + "step": 330 + }, + { + "epoch": 0.19981889526109267, + "grad_norm": 0.07560814172029495, + "learning_rate": 0.0001924865089248651, + "loss": 0.8128, + "step": 331 + }, + { + "epoch": 0.20042257772411712, + "grad_norm": 0.06620073318481445, + "learning_rate": 0.00019244499792445, + "loss": 0.958, + "step": 332 + }, + { + "epoch": 0.20102626018714156, + "grad_norm": 0.0671965479850769, + "learning_rate": 0.00019240348692403487, + "loss": 0.8134, + "step": 333 + }, + { + "epoch": 0.201629942650166, + "grad_norm": 0.0731448382139206, + "learning_rate": 0.00019236197592361977, + "loss": 0.7264, + "step": 334 + }, + { + "epoch": 0.20223362511319046, + "grad_norm": 0.06810946017503738, + "learning_rate": 0.00019232046492320465, + "loss": 0.7509, + "step": 335 + }, + { + "epoch": 0.2028373075762149, + "grad_norm": 0.08012348413467407, + "learning_rate": 0.00019227895392278955, + "loss": 0.8237, + "step": 336 + }, + { + "epoch": 0.20344099003923935, + "grad_norm": 0.07328704744577408, + "learning_rate": 0.00019223744292237445, + "loss": 0.7719, + "step": 337 + }, + { + "epoch": 0.2040446725022638, + "grad_norm": 0.07838919013738632, + "learning_rate": 0.00019219593192195932, + "loss": 0.8404, + "step": 338 + }, + { + "epoch": 0.20464835496528827, + "grad_norm": 0.07977066934108734, + "learning_rate": 0.00019215442092154422, + "loss": 0.7614, + "step": 339 + }, + { + "epoch": 0.20525203742831272, + "grad_norm": 0.07425908744335175, + "learning_rate": 0.00019211290992112912, + "loss": 0.7439, + "step": 340 + }, + { + "epoch": 0.20585571989133716, + "grad_norm": 0.08555571734905243, + "learning_rate": 0.000192071398920714, + "loss": 0.7711, + "step": 341 + }, + { + "epoch": 0.2064594023543616, + "grad_norm": 0.0798811987042427, + "learning_rate": 0.0001920298879202989, + "loss": 0.7246, + "step": 342 + }, + { + "epoch": 0.20706308481738606, + "grad_norm": 0.09403866529464722, + "learning_rate": 0.0001919883769198838, + "loss": 0.7171, + "step": 343 + }, + { + "epoch": 0.2076667672804105, + "grad_norm": 0.09114021807909012, + "learning_rate": 0.00019194686591946866, + "loss": 0.7605, + "step": 344 + }, + { + "epoch": 0.20827044974343495, + "grad_norm": 0.1013076901435852, + "learning_rate": 0.00019190535491905354, + "loss": 0.7583, + "step": 345 + }, + { + "epoch": 0.2088741322064594, + "grad_norm": 0.10122162848711014, + "learning_rate": 0.00019186384391863847, + "loss": 0.7477, + "step": 346 + }, + { + "epoch": 0.20947781466948384, + "grad_norm": 0.09927545487880707, + "learning_rate": 0.00019182233291822334, + "loss": 0.6728, + "step": 347 + }, + { + "epoch": 0.2100814971325083, + "grad_norm": 0.12509341537952423, + "learning_rate": 0.0001917808219178082, + "loss": 0.6543, + "step": 348 + }, + { + "epoch": 0.21068517959553276, + "grad_norm": 0.11812290549278259, + "learning_rate": 0.0001917393109173931, + "loss": 0.6701, + "step": 349 + }, + { + "epoch": 0.2112888620585572, + "grad_norm": 0.12166016548871994, + "learning_rate": 0.000191697799916978, + "loss": 0.5261, + "step": 350 + }, + { + "epoch": 0.21189254452158165, + "grad_norm": 0.0700993686914444, + "learning_rate": 0.0001916562889165629, + "loss": 1.2022, + "step": 351 + }, + { + "epoch": 0.2124962269846061, + "grad_norm": 0.08300930261611938, + "learning_rate": 0.00019161477791614778, + "loss": 0.9249, + "step": 352 + }, + { + "epoch": 0.21309990944763055, + "grad_norm": 0.06463124603033066, + "learning_rate": 0.00019157326691573268, + "loss": 0.8147, + "step": 353 + }, + { + "epoch": 0.213703591910655, + "grad_norm": 0.06934931129217148, + "learning_rate": 0.00019153175591531758, + "loss": 0.9904, + "step": 354 + }, + { + "epoch": 0.21430727437367944, + "grad_norm": 0.06555134057998657, + "learning_rate": 0.00019149024491490246, + "loss": 0.8825, + "step": 355 + }, + { + "epoch": 0.2149109568367039, + "grad_norm": 0.07276731729507446, + "learning_rate": 0.00019144873391448736, + "loss": 1.0806, + "step": 356 + }, + { + "epoch": 0.21551463929972833, + "grad_norm": 0.06886199861764908, + "learning_rate": 0.00019140722291407226, + "loss": 0.9756, + "step": 357 + }, + { + "epoch": 0.21611832176275278, + "grad_norm": 0.07671529054641724, + "learning_rate": 0.00019136571191365713, + "loss": 0.9871, + "step": 358 + }, + { + "epoch": 0.21672200422577725, + "grad_norm": 0.08558958023786545, + "learning_rate": 0.000191324200913242, + "loss": 0.9553, + "step": 359 + }, + { + "epoch": 0.2173256866888017, + "grad_norm": 0.07093027234077454, + "learning_rate": 0.00019128268991282693, + "loss": 0.8247, + "step": 360 + }, + { + "epoch": 0.21792936915182615, + "grad_norm": 0.07704441994428635, + "learning_rate": 0.0001912411789124118, + "loss": 1.0793, + "step": 361 + }, + { + "epoch": 0.2185330516148506, + "grad_norm": 0.06594787538051605, + "learning_rate": 0.00019119966791199668, + "loss": 0.866, + "step": 362 + }, + { + "epoch": 0.21913673407787504, + "grad_norm": 0.07074205577373505, + "learning_rate": 0.00019115815691158158, + "loss": 0.9088, + "step": 363 + }, + { + "epoch": 0.21974041654089949, + "grad_norm": 0.06734868884086609, + "learning_rate": 0.00019111664591116648, + "loss": 0.8861, + "step": 364 + }, + { + "epoch": 0.22034409900392393, + "grad_norm": 0.07831276208162308, + "learning_rate": 0.00019107513491075135, + "loss": 0.939, + "step": 365 + }, + { + "epoch": 0.22094778146694838, + "grad_norm": 0.06146768108010292, + "learning_rate": 0.00019103362391033625, + "loss": 0.9476, + "step": 366 + }, + { + "epoch": 0.22155146392997283, + "grad_norm": 0.062476493418216705, + "learning_rate": 0.00019099211290992115, + "loss": 0.821, + "step": 367 + }, + { + "epoch": 0.22215514639299727, + "grad_norm": 0.07490051537752151, + "learning_rate": 0.00019095060190950602, + "loss": 0.9016, + "step": 368 + }, + { + "epoch": 0.22275882885602175, + "grad_norm": 0.07467661798000336, + "learning_rate": 0.00019090909090909092, + "loss": 0.786, + "step": 369 + }, + { + "epoch": 0.2233625113190462, + "grad_norm": 0.0667489618062973, + "learning_rate": 0.00019086757990867582, + "loss": 0.7985, + "step": 370 + }, + { + "epoch": 0.22396619378207064, + "grad_norm": 0.06847196817398071, + "learning_rate": 0.0001908260689082607, + "loss": 0.7912, + "step": 371 + }, + { + "epoch": 0.22456987624509508, + "grad_norm": 0.06785493344068527, + "learning_rate": 0.0001907845579078456, + "loss": 0.9787, + "step": 372 + }, + { + "epoch": 0.22517355870811953, + "grad_norm": 0.06378093361854553, + "learning_rate": 0.00019074304690743047, + "loss": 1.0737, + "step": 373 + }, + { + "epoch": 0.22577724117114398, + "grad_norm": 0.07684706896543503, + "learning_rate": 0.00019070153590701537, + "loss": 0.8541, + "step": 374 + }, + { + "epoch": 0.22638092363416842, + "grad_norm": 0.06430470943450928, + "learning_rate": 0.00019066002490660027, + "loss": 0.8404, + "step": 375 + }, + { + "epoch": 0.22698460609719287, + "grad_norm": 0.062282588332891464, + "learning_rate": 0.00019061851390618514, + "loss": 0.8361, + "step": 376 + }, + { + "epoch": 0.22758828856021732, + "grad_norm": 0.06882327795028687, + "learning_rate": 0.00019057700290577004, + "loss": 0.7329, + "step": 377 + }, + { + "epoch": 0.22819197102324176, + "grad_norm": 0.06568577885627747, + "learning_rate": 0.00019053549190535494, + "loss": 0.8607, + "step": 378 + }, + { + "epoch": 0.22879565348626624, + "grad_norm": 0.07336780428886414, + "learning_rate": 0.00019049398090493981, + "loss": 0.8122, + "step": 379 + }, + { + "epoch": 0.22939933594929068, + "grad_norm": 0.06682398915290833, + "learning_rate": 0.0001904524699045247, + "loss": 0.8005, + "step": 380 + }, + { + "epoch": 0.23000301841231513, + "grad_norm": 0.07804804295301437, + "learning_rate": 0.00019041095890410961, + "loss": 0.8991, + "step": 381 + }, + { + "epoch": 0.23060670087533958, + "grad_norm": 0.07914794981479645, + "learning_rate": 0.0001903694479036945, + "loss": 0.7798, + "step": 382 + }, + { + "epoch": 0.23121038333836402, + "grad_norm": 0.10398049652576447, + "learning_rate": 0.00019032793690327936, + "loss": 0.7993, + "step": 383 + }, + { + "epoch": 0.23181406580138847, + "grad_norm": 0.0717068761587143, + "learning_rate": 0.0001902864259028643, + "loss": 0.7845, + "step": 384 + }, + { + "epoch": 0.23241774826441292, + "grad_norm": 0.07129625976085663, + "learning_rate": 0.00019024491490244916, + "loss": 0.7409, + "step": 385 + }, + { + "epoch": 0.23302143072743736, + "grad_norm": 0.07239419221878052, + "learning_rate": 0.00019020340390203403, + "loss": 0.7352, + "step": 386 + }, + { + "epoch": 0.2336251131904618, + "grad_norm": 0.07583223283290863, + "learning_rate": 0.00019016189290161893, + "loss": 0.8076, + "step": 387 + }, + { + "epoch": 0.23422879565348625, + "grad_norm": 0.07693106681108475, + "learning_rate": 0.00019012038190120383, + "loss": 0.7828, + "step": 388 + }, + { + "epoch": 0.23483247811651073, + "grad_norm": 0.08271142840385437, + "learning_rate": 0.0001900788709007887, + "loss": 0.7281, + "step": 389 + }, + { + "epoch": 0.23543616057953518, + "grad_norm": 0.08440615981817245, + "learning_rate": 0.0001900373599003736, + "loss": 0.7776, + "step": 390 + }, + { + "epoch": 0.23603984304255962, + "grad_norm": 0.08483248949050903, + "learning_rate": 0.0001899958488999585, + "loss": 0.7142, + "step": 391 + }, + { + "epoch": 0.23664352550558407, + "grad_norm": 0.10398207604885101, + "learning_rate": 0.00018995433789954338, + "loss": 0.7971, + "step": 392 + }, + { + "epoch": 0.23724720796860851, + "grad_norm": 0.09310369938611984, + "learning_rate": 0.00018991282689912828, + "loss": 0.7611, + "step": 393 + }, + { + "epoch": 0.23785089043163296, + "grad_norm": 0.09080198407173157, + "learning_rate": 0.00018987131589871315, + "loss": 0.7963, + "step": 394 + }, + { + "epoch": 0.2384545728946574, + "grad_norm": 0.09882273524999619, + "learning_rate": 0.00018982980489829805, + "loss": 0.7826, + "step": 395 + }, + { + "epoch": 0.23905825535768185, + "grad_norm": 0.11651689559221268, + "learning_rate": 0.00018978829389788295, + "loss": 0.7177, + "step": 396 + }, + { + "epoch": 0.2396619378207063, + "grad_norm": 0.10827039927244186, + "learning_rate": 0.00018974678289746782, + "loss": 0.6913, + "step": 397 + }, + { + "epoch": 0.24026562028373075, + "grad_norm": 0.10637848824262619, + "learning_rate": 0.00018970527189705275, + "loss": 0.5721, + "step": 398 + }, + { + "epoch": 0.24086930274675522, + "grad_norm": 0.11688965559005737, + "learning_rate": 0.00018966376089663762, + "loss": 0.5603, + "step": 399 + }, + { + "epoch": 0.24147298520977967, + "grad_norm": 0.12776370346546173, + "learning_rate": 0.0001896222498962225, + "loss": 0.4937, + "step": 400 + }, + { + "epoch": 0.2420766676728041, + "grad_norm": 0.06769760698080063, + "learning_rate": 0.0001895807388958074, + "loss": 0.8702, + "step": 401 + }, + { + "epoch": 0.24268035013582856, + "grad_norm": 0.07302338629961014, + "learning_rate": 0.0001895392278953923, + "loss": 0.8243, + "step": 402 + }, + { + "epoch": 0.243284032598853, + "grad_norm": 0.07331572473049164, + "learning_rate": 0.00018949771689497717, + "loss": 1.1457, + "step": 403 + }, + { + "epoch": 0.24388771506187745, + "grad_norm": 0.07224598526954651, + "learning_rate": 0.00018945620589456207, + "loss": 1.0838, + "step": 404 + }, + { + "epoch": 0.2444913975249019, + "grad_norm": 0.06487125158309937, + "learning_rate": 0.00018941469489414697, + "loss": 0.8927, + "step": 405 + }, + { + "epoch": 0.24509507998792635, + "grad_norm": 0.06399485468864441, + "learning_rate": 0.00018937318389373184, + "loss": 0.7973, + "step": 406 + }, + { + "epoch": 0.2456987624509508, + "grad_norm": 0.08675524592399597, + "learning_rate": 0.00018933167289331674, + "loss": 0.9525, + "step": 407 + }, + { + "epoch": 0.24630244491397524, + "grad_norm": 0.0663706362247467, + "learning_rate": 0.00018929016189290162, + "loss": 0.7668, + "step": 408 + }, + { + "epoch": 0.24690612737699968, + "grad_norm": 0.09714970737695694, + "learning_rate": 0.00018924865089248652, + "loss": 0.783, + "step": 409 + }, + { + "epoch": 0.24750980984002416, + "grad_norm": 0.06401702761650085, + "learning_rate": 0.00018920713989207142, + "loss": 1.0896, + "step": 410 + }, + { + "epoch": 0.2481134923030486, + "grad_norm": 0.07394849509000778, + "learning_rate": 0.0001891656288916563, + "loss": 0.79, + "step": 411 + }, + { + "epoch": 0.24871717476607305, + "grad_norm": 0.1234453096985817, + "learning_rate": 0.0001891241178912412, + "loss": 0.841, + "step": 412 + }, + { + "epoch": 0.2493208572290975, + "grad_norm": 0.06933286041021347, + "learning_rate": 0.0001890826068908261, + "loss": 1.1143, + "step": 413 + }, + { + "epoch": 0.24992453969212194, + "grad_norm": 0.07103940844535828, + "learning_rate": 0.00018904109589041096, + "loss": 0.8918, + "step": 414 + }, + { + "epoch": 0.2505282221551464, + "grad_norm": 0.11104138940572739, + "learning_rate": 0.00018899958488999586, + "loss": 0.922, + "step": 415 + }, + { + "epoch": 0.25113190461817086, + "grad_norm": 0.06877300888299942, + "learning_rate": 0.00018895807388958076, + "loss": 0.9959, + "step": 416 + }, + { + "epoch": 0.2517355870811953, + "grad_norm": 0.07087550312280655, + "learning_rate": 0.00018891656288916564, + "loss": 0.8376, + "step": 417 + }, + { + "epoch": 0.25233926954421976, + "grad_norm": 0.08183194696903229, + "learning_rate": 0.0001888750518887505, + "loss": 0.8378, + "step": 418 + }, + { + "epoch": 0.2529429520072442, + "grad_norm": 0.06973780691623688, + "learning_rate": 0.00018883354088833544, + "loss": 0.8195, + "step": 419 + }, + { + "epoch": 0.25354663447026865, + "grad_norm": 0.07654432952404022, + "learning_rate": 0.0001887920298879203, + "loss": 0.8851, + "step": 420 + }, + { + "epoch": 0.2541503169332931, + "grad_norm": 0.0699959322810173, + "learning_rate": 0.00018875051888750518, + "loss": 1.0005, + "step": 421 + }, + { + "epoch": 0.25475399939631754, + "grad_norm": 0.06692960858345032, + "learning_rate": 0.00018870900788709008, + "loss": 0.8751, + "step": 422 + }, + { + "epoch": 0.255357681859342, + "grad_norm": 0.06952139735221863, + "learning_rate": 0.00018866749688667498, + "loss": 0.968, + "step": 423 + }, + { + "epoch": 0.25596136432236644, + "grad_norm": 0.0676087811589241, + "learning_rate": 0.00018862598588625985, + "loss": 0.9608, + "step": 424 + }, + { + "epoch": 0.2565650467853909, + "grad_norm": 0.07644575089216232, + "learning_rate": 0.00018858447488584475, + "loss": 0.8708, + "step": 425 + }, + { + "epoch": 0.25716872924841533, + "grad_norm": 0.0646698921918869, + "learning_rate": 0.00018854296388542965, + "loss": 0.7413, + "step": 426 + }, + { + "epoch": 0.2577724117114398, + "grad_norm": 0.06822269409894943, + "learning_rate": 0.00018850145288501453, + "loss": 0.8585, + "step": 427 + }, + { + "epoch": 0.2583760941744642, + "grad_norm": 0.23585358262062073, + "learning_rate": 0.00018845994188459943, + "loss": 0.8879, + "step": 428 + }, + { + "epoch": 0.25897977663748867, + "grad_norm": 0.07148776948451996, + "learning_rate": 0.00018841843088418433, + "loss": 0.9396, + "step": 429 + }, + { + "epoch": 0.2595834591005131, + "grad_norm": 0.07239923626184464, + "learning_rate": 0.0001883769198837692, + "loss": 0.9303, + "step": 430 + }, + { + "epoch": 0.26018714156353756, + "grad_norm": 0.06573071330785751, + "learning_rate": 0.0001883354088833541, + "loss": 0.866, + "step": 431 + }, + { + "epoch": 0.260790824026562, + "grad_norm": 0.06852304190397263, + "learning_rate": 0.00018829389788293897, + "loss": 1.0774, + "step": 432 + }, + { + "epoch": 0.26139450648958645, + "grad_norm": 0.06725315749645233, + "learning_rate": 0.00018825238688252387, + "loss": 0.9174, + "step": 433 + }, + { + "epoch": 0.2619981889526109, + "grad_norm": 0.07200594246387482, + "learning_rate": 0.00018821087588210877, + "loss": 0.7667, + "step": 434 + }, + { + "epoch": 0.2626018714156354, + "grad_norm": 0.06961148232221603, + "learning_rate": 0.00018816936488169365, + "loss": 0.7284, + "step": 435 + }, + { + "epoch": 0.26320555387865985, + "grad_norm": 0.07602632790803909, + "learning_rate": 0.00018812785388127855, + "loss": 0.8159, + "step": 436 + }, + { + "epoch": 0.2638092363416843, + "grad_norm": 0.0729806199669838, + "learning_rate": 0.00018808634288086345, + "loss": 0.7094, + "step": 437 + }, + { + "epoch": 0.26441291880470874, + "grad_norm": 0.07881097495555878, + "learning_rate": 0.00018804483188044832, + "loss": 0.7488, + "step": 438 + }, + { + "epoch": 0.2650166012677332, + "grad_norm": 0.08327119052410126, + "learning_rate": 0.00018800332088003322, + "loss": 0.7672, + "step": 439 + }, + { + "epoch": 0.26562028373075763, + "grad_norm": 0.08234728127717972, + "learning_rate": 0.00018796180987961812, + "loss": 0.7644, + "step": 440 + }, + { + "epoch": 0.2662239661937821, + "grad_norm": 0.09000684320926666, + "learning_rate": 0.000187920298879203, + "loss": 0.793, + "step": 441 + }, + { + "epoch": 0.2668276486568065, + "grad_norm": 0.07901246100664139, + "learning_rate": 0.0001878787878787879, + "loss": 0.7346, + "step": 442 + }, + { + "epoch": 0.267431331119831, + "grad_norm": 0.09163561463356018, + "learning_rate": 0.0001878372768783728, + "loss": 0.762, + "step": 443 + }, + { + "epoch": 0.2680350135828554, + "grad_norm": 0.09499283879995346, + "learning_rate": 0.00018779576587795767, + "loss": 0.7669, + "step": 444 + }, + { + "epoch": 0.26863869604587987, + "grad_norm": 0.1023356169462204, + "learning_rate": 0.00018775425487754257, + "loss": 0.7431, + "step": 445 + }, + { + "epoch": 0.2692423785089043, + "grad_norm": 0.10197634249925613, + "learning_rate": 0.00018771274387712744, + "loss": 0.7059, + "step": 446 + }, + { + "epoch": 0.26984606097192876, + "grad_norm": 0.09975861012935638, + "learning_rate": 0.00018767123287671234, + "loss": 0.658, + "step": 447 + }, + { + "epoch": 0.2704497434349532, + "grad_norm": 0.11030542105436325, + "learning_rate": 0.00018762972187629724, + "loss": 0.6281, + "step": 448 + }, + { + "epoch": 0.27105342589797765, + "grad_norm": 0.11011619865894318, + "learning_rate": 0.0001875882108758821, + "loss": 0.5196, + "step": 449 + }, + { + "epoch": 0.2716571083610021, + "grad_norm": 0.12410090863704681, + "learning_rate": 0.000187546699875467, + "loss": 0.4871, + "step": 450 + }, + { + "epoch": 0.27226079082402654, + "grad_norm": 0.06774574518203735, + "learning_rate": 0.0001875051888750519, + "loss": 0.7875, + "step": 451 + }, + { + "epoch": 0.272864473287051, + "grad_norm": 0.06862092763185501, + "learning_rate": 0.00018746367787463678, + "loss": 0.8497, + "step": 452 + }, + { + "epoch": 0.27346815575007544, + "grad_norm": 0.07752003520727158, + "learning_rate": 0.00018742216687422168, + "loss": 0.7635, + "step": 453 + }, + { + "epoch": 0.2740718382130999, + "grad_norm": 0.138156920671463, + "learning_rate": 0.00018738065587380658, + "loss": 0.8682, + "step": 454 + }, + { + "epoch": 0.2746755206761244, + "grad_norm": 0.07247649133205414, + "learning_rate": 0.00018733914487339146, + "loss": 0.9772, + "step": 455 + }, + { + "epoch": 0.27527920313914883, + "grad_norm": 0.06809130311012268, + "learning_rate": 0.00018729763387297633, + "loss": 0.9594, + "step": 456 + }, + { + "epoch": 0.2758828856021733, + "grad_norm": 0.08479005098342896, + "learning_rate": 0.00018725612287256126, + "loss": 1.0778, + "step": 457 + }, + { + "epoch": 0.2764865680651977, + "grad_norm": 0.06665640324354172, + "learning_rate": 0.00018721461187214613, + "loss": 0.9377, + "step": 458 + }, + { + "epoch": 0.27709025052822217, + "grad_norm": 0.06898784637451172, + "learning_rate": 0.000187173100871731, + "loss": 0.8999, + "step": 459 + }, + { + "epoch": 0.2776939329912466, + "grad_norm": 0.06468936055898666, + "learning_rate": 0.0001871315898713159, + "loss": 0.9559, + "step": 460 + }, + { + "epoch": 0.27829761545427106, + "grad_norm": 0.0695393905043602, + "learning_rate": 0.0001870900788709008, + "loss": 0.8692, + "step": 461 + }, + { + "epoch": 0.2789012979172955, + "grad_norm": 0.10467389971017838, + "learning_rate": 0.00018704856787048568, + "loss": 0.8858, + "step": 462 + }, + { + "epoch": 0.27950498038031996, + "grad_norm": 0.06943607330322266, + "learning_rate": 0.00018700705687007058, + "loss": 1.0079, + "step": 463 + }, + { + "epoch": 0.2801086628433444, + "grad_norm": 0.07984504848718643, + "learning_rate": 0.00018696554586965548, + "loss": 0.8257, + "step": 464 + }, + { + "epoch": 0.28071234530636885, + "grad_norm": 0.06335430592298508, + "learning_rate": 0.00018692403486924035, + "loss": 1.1628, + "step": 465 + }, + { + "epoch": 0.2813160277693933, + "grad_norm": 0.06936845183372498, + "learning_rate": 0.00018688252386882525, + "loss": 0.8205, + "step": 466 + }, + { + "epoch": 0.28191971023241774, + "grad_norm": 0.0703270360827446, + "learning_rate": 0.00018684101286841015, + "loss": 1.053, + "step": 467 + }, + { + "epoch": 0.2825233926954422, + "grad_norm": 0.06945156306028366, + "learning_rate": 0.00018679950186799502, + "loss": 0.8785, + "step": 468 + }, + { + "epoch": 0.28312707515846663, + "grad_norm": 0.08337710797786713, + "learning_rate": 0.00018675799086757992, + "loss": 1.0398, + "step": 469 + }, + { + "epoch": 0.2837307576214911, + "grad_norm": 0.07238437235355377, + "learning_rate": 0.0001867164798671648, + "loss": 0.9231, + "step": 470 + }, + { + "epoch": 0.2843344400845155, + "grad_norm": 0.0756809413433075, + "learning_rate": 0.0001866749688667497, + "loss": 1.1661, + "step": 471 + }, + { + "epoch": 0.28493812254754, + "grad_norm": 0.06537111848592758, + "learning_rate": 0.0001866334578663346, + "loss": 0.7645, + "step": 472 + }, + { + "epoch": 0.2855418050105644, + "grad_norm": 0.07265693694353104, + "learning_rate": 0.00018659194686591947, + "loss": 1.1106, + "step": 473 + }, + { + "epoch": 0.28614548747358887, + "grad_norm": 0.06632302701473236, + "learning_rate": 0.00018655043586550437, + "loss": 0.8943, + "step": 474 + }, + { + "epoch": 0.28674916993661337, + "grad_norm": 0.06948743760585785, + "learning_rate": 0.00018650892486508927, + "loss": 1.0051, + "step": 475 + }, + { + "epoch": 0.2873528523996378, + "grad_norm": 0.07059241831302643, + "learning_rate": 0.00018646741386467414, + "loss": 1.0785, + "step": 476 + }, + { + "epoch": 0.28795653486266226, + "grad_norm": 0.06578975915908813, + "learning_rate": 0.00018642590286425901, + "loss": 0.8156, + "step": 477 + }, + { + "epoch": 0.2885602173256867, + "grad_norm": 0.06178348883986473, + "learning_rate": 0.00018638439186384394, + "loss": 0.8512, + "step": 478 + }, + { + "epoch": 0.28916389978871115, + "grad_norm": 0.06867639720439911, + "learning_rate": 0.00018634288086342881, + "loss": 0.8549, + "step": 479 + }, + { + "epoch": 0.2897675822517356, + "grad_norm": 0.07352021336555481, + "learning_rate": 0.0001863013698630137, + "loss": 0.7585, + "step": 480 + }, + { + "epoch": 0.29037126471476005, + "grad_norm": 0.07471666485071182, + "learning_rate": 0.00018625985886259861, + "loss": 0.9468, + "step": 481 + }, + { + "epoch": 0.2909749471777845, + "grad_norm": 0.06977469474077225, + "learning_rate": 0.0001862183478621835, + "loss": 0.7439, + "step": 482 + }, + { + "epoch": 0.29157862964080894, + "grad_norm": 0.07040760666131973, + "learning_rate": 0.00018617683686176836, + "loss": 0.7331, + "step": 483 + }, + { + "epoch": 0.2921823121038334, + "grad_norm": 0.07527907937765121, + "learning_rate": 0.00018613532586135326, + "loss": 0.8511, + "step": 484 + }, + { + "epoch": 0.29278599456685783, + "grad_norm": 0.07468552142381668, + "learning_rate": 0.00018609381486093816, + "loss": 0.8146, + "step": 485 + }, + { + "epoch": 0.2933896770298823, + "grad_norm": 0.0728183388710022, + "learning_rate": 0.00018605230386052306, + "loss": 0.8189, + "step": 486 + }, + { + "epoch": 0.2939933594929067, + "grad_norm": 0.07946665585041046, + "learning_rate": 0.00018601079286010793, + "loss": 0.8002, + "step": 487 + }, + { + "epoch": 0.29459704195593117, + "grad_norm": 0.08301442861557007, + "learning_rate": 0.00018596928185969283, + "loss": 0.76, + "step": 488 + }, + { + "epoch": 0.2952007244189556, + "grad_norm": 0.07893984019756317, + "learning_rate": 0.00018592777085927773, + "loss": 0.7711, + "step": 489 + }, + { + "epoch": 0.29580440688198006, + "grad_norm": 0.08230159431695938, + "learning_rate": 0.0001858862598588626, + "loss": 0.7041, + "step": 490 + }, + { + "epoch": 0.2964080893450045, + "grad_norm": 0.09270324558019638, + "learning_rate": 0.00018584474885844748, + "loss": 0.7666, + "step": 491 + }, + { + "epoch": 0.29701177180802896, + "grad_norm": 0.09311135858297348, + "learning_rate": 0.0001858032378580324, + "loss": 0.7878, + "step": 492 + }, + { + "epoch": 0.2976154542710534, + "grad_norm": 0.09099919348955154, + "learning_rate": 0.00018576172685761728, + "loss": 0.7074, + "step": 493 + }, + { + "epoch": 0.29821913673407785, + "grad_norm": 0.09507730603218079, + "learning_rate": 0.00018572021585720215, + "loss": 0.6735, + "step": 494 + }, + { + "epoch": 0.29882281919710235, + "grad_norm": 0.09427333623170853, + "learning_rate": 0.00018567870485678708, + "loss": 0.7592, + "step": 495 + }, + { + "epoch": 0.2994265016601268, + "grad_norm": 0.1038491502404213, + "learning_rate": 0.00018563719385637195, + "loss": 0.7356, + "step": 496 + }, + { + "epoch": 0.30003018412315124, + "grad_norm": 0.1097760945558548, + "learning_rate": 0.00018559568285595683, + "loss": 0.6825, + "step": 497 + }, + { + "epoch": 0.3006338665861757, + "grad_norm": 0.10912280529737473, + "learning_rate": 0.00018555417185554173, + "loss": 0.6431, + "step": 498 + }, + { + "epoch": 0.30123754904920014, + "grad_norm": 0.1107010692358017, + "learning_rate": 0.00018551266085512663, + "loss": 0.5303, + "step": 499 + }, + { + "epoch": 0.3018412315122246, + "grad_norm": 0.12602561712265015, + "learning_rate": 0.0001854711498547115, + "loss": 0.4541, + "step": 500 + }, + { + "epoch": 0.3018412315122246, + "eval_loss": 0.8565130233764648, + "eval_runtime": 1219.0924, + "eval_samples_per_second": 2.289, + "eval_steps_per_second": 0.286, + "step": 500 + }, + { + "epoch": 0.30244491397524903, + "grad_norm": 0.11287853866815567, + "learning_rate": 0.0001854296388542964, + "loss": 0.8697, + "step": 501 + }, + { + "epoch": 0.3030485964382735, + "grad_norm": 0.08270236849784851, + "learning_rate": 0.0001853881278538813, + "loss": 0.8881, + "step": 502 + }, + { + "epoch": 0.3036522789012979, + "grad_norm": 0.07861533761024475, + "learning_rate": 0.00018534661685346617, + "loss": 0.8053, + "step": 503 + }, + { + "epoch": 0.30425596136432237, + "grad_norm": 0.07589036971330643, + "learning_rate": 0.00018530510585305107, + "loss": 1.0165, + "step": 504 + }, + { + "epoch": 0.3048596438273468, + "grad_norm": 0.07618770003318787, + "learning_rate": 0.00018526359485263594, + "loss": 1.0332, + "step": 505 + }, + { + "epoch": 0.30546332629037126, + "grad_norm": 0.07502375543117523, + "learning_rate": 0.00018522208385222084, + "loss": 0.976, + "step": 506 + }, + { + "epoch": 0.3060670087533957, + "grad_norm": 0.07355045527219772, + "learning_rate": 0.00018518057285180574, + "loss": 0.7887, + "step": 507 + }, + { + "epoch": 0.30667069121642015, + "grad_norm": 0.08850781619548798, + "learning_rate": 0.00018513906185139062, + "loss": 1.0632, + "step": 508 + }, + { + "epoch": 0.3072743736794446, + "grad_norm": 0.06971225887537003, + "learning_rate": 0.00018509755085097552, + "loss": 0.9073, + "step": 509 + }, + { + "epoch": 0.30787805614246905, + "grad_norm": 0.16970930993556976, + "learning_rate": 0.00018505603985056042, + "loss": 0.9196, + "step": 510 + }, + { + "epoch": 0.3084817386054935, + "grad_norm": 0.07109450548887253, + "learning_rate": 0.0001850145288501453, + "loss": 0.8324, + "step": 511 + }, + { + "epoch": 0.30908542106851794, + "grad_norm": 0.06761594116687775, + "learning_rate": 0.0001849730178497302, + "loss": 0.8924, + "step": 512 + }, + { + "epoch": 0.3096891035315424, + "grad_norm": 0.06660860776901245, + "learning_rate": 0.0001849315068493151, + "loss": 0.8244, + "step": 513 + }, + { + "epoch": 0.31029278599456683, + "grad_norm": 0.08837427198886871, + "learning_rate": 0.00018488999584889996, + "loss": 0.8228, + "step": 514 + }, + { + "epoch": 0.3108964684575913, + "grad_norm": 0.09773479402065277, + "learning_rate": 0.00018484848484848484, + "loss": 0.8686, + "step": 515 + }, + { + "epoch": 0.3115001509206158, + "grad_norm": 0.07643985003232956, + "learning_rate": 0.00018480697384806976, + "loss": 0.8456, + "step": 516 + }, + { + "epoch": 0.31210383338364023, + "grad_norm": 0.06785845011472702, + "learning_rate": 0.00018476546284765464, + "loss": 0.8381, + "step": 517 + }, + { + "epoch": 0.3127075158466647, + "grad_norm": 0.0758255124092102, + "learning_rate": 0.0001847239518472395, + "loss": 0.8728, + "step": 518 + }, + { + "epoch": 0.3133111983096891, + "grad_norm": 0.0750250369310379, + "learning_rate": 0.0001846824408468244, + "loss": 0.7163, + "step": 519 + }, + { + "epoch": 0.31391488077271357, + "grad_norm": 0.06817881017923355, + "learning_rate": 0.0001846409298464093, + "loss": 0.7658, + "step": 520 + }, + { + "epoch": 0.314518563235738, + "grad_norm": 0.06935089081525803, + "learning_rate": 0.00018459941884599418, + "loss": 0.8448, + "step": 521 + }, + { + "epoch": 0.31512224569876246, + "grad_norm": 0.0738990381360054, + "learning_rate": 0.00018455790784557908, + "loss": 0.8026, + "step": 522 + }, + { + "epoch": 0.3157259281617869, + "grad_norm": 0.07354303449392319, + "learning_rate": 0.00018451639684516398, + "loss": 0.8194, + "step": 523 + }, + { + "epoch": 0.31632961062481135, + "grad_norm": 0.07702893018722534, + "learning_rate": 0.00018447488584474886, + "loss": 0.8923, + "step": 524 + }, + { + "epoch": 0.3169332930878358, + "grad_norm": 0.09026607125997543, + "learning_rate": 0.00018443337484433376, + "loss": 0.9738, + "step": 525 + }, + { + "epoch": 0.31753697555086025, + "grad_norm": 0.08617381006479263, + "learning_rate": 0.00018439186384391866, + "loss": 0.9765, + "step": 526 + }, + { + "epoch": 0.3181406580138847, + "grad_norm": 0.07624173164367676, + "learning_rate": 0.00018435035284350353, + "loss": 1.0301, + "step": 527 + }, + { + "epoch": 0.31874434047690914, + "grad_norm": 0.06759200245141983, + "learning_rate": 0.00018430884184308843, + "loss": 0.8237, + "step": 528 + }, + { + "epoch": 0.3193480229399336, + "grad_norm": 0.07457785308361053, + "learning_rate": 0.0001842673308426733, + "loss": 0.7363, + "step": 529 + }, + { + "epoch": 0.31995170540295803, + "grad_norm": 0.08332669734954834, + "learning_rate": 0.00018422581984225823, + "loss": 0.7471, + "step": 530 + }, + { + "epoch": 0.3205553878659825, + "grad_norm": 0.0689413771033287, + "learning_rate": 0.0001841843088418431, + "loss": 0.8585, + "step": 531 + }, + { + "epoch": 0.3211590703290069, + "grad_norm": 0.06793367117643356, + "learning_rate": 0.00018414279784142797, + "loss": 1.033, + "step": 532 + }, + { + "epoch": 0.32176275279203137, + "grad_norm": 0.07640314847230911, + "learning_rate": 0.00018410128684101287, + "loss": 1.0033, + "step": 533 + }, + { + "epoch": 0.3223664352550558, + "grad_norm": 0.07172773033380508, + "learning_rate": 0.00018405977584059777, + "loss": 0.7355, + "step": 534 + }, + { + "epoch": 0.32297011771808026, + "grad_norm": 0.07412228733301163, + "learning_rate": 0.00018401826484018265, + "loss": 0.7548, + "step": 535 + }, + { + "epoch": 0.32357380018110476, + "grad_norm": 0.07675584405660629, + "learning_rate": 0.00018397675383976755, + "loss": 0.7105, + "step": 536 + }, + { + "epoch": 0.3241774826441292, + "grad_norm": 0.07256907969713211, + "learning_rate": 0.00018393524283935245, + "loss": 0.7383, + "step": 537 + }, + { + "epoch": 0.32478116510715366, + "grad_norm": 0.08796676248311996, + "learning_rate": 0.00018389373183893732, + "loss": 0.7606, + "step": 538 + }, + { + "epoch": 0.3253848475701781, + "grad_norm": 0.07517845928668976, + "learning_rate": 0.00018385222083852222, + "loss": 0.8343, + "step": 539 + }, + { + "epoch": 0.32598853003320255, + "grad_norm": 0.08098675310611725, + "learning_rate": 0.00018381070983810712, + "loss": 0.7752, + "step": 540 + }, + { + "epoch": 0.326592212496227, + "grad_norm": 0.08791480958461761, + "learning_rate": 0.000183769198837692, + "loss": 0.6954, + "step": 541 + }, + { + "epoch": 0.32719589495925144, + "grad_norm": 0.09234142303466797, + "learning_rate": 0.0001837276878372769, + "loss": 0.7773, + "step": 542 + }, + { + "epoch": 0.3277995774222759, + "grad_norm": 0.09258239716291428, + "learning_rate": 0.00018368617683686177, + "loss": 0.7787, + "step": 543 + }, + { + "epoch": 0.32840325988530034, + "grad_norm": 0.09329955279827118, + "learning_rate": 0.00018364466583644667, + "loss": 0.6927, + "step": 544 + }, + { + "epoch": 0.3290069423483248, + "grad_norm": 0.1046968549489975, + "learning_rate": 0.00018360315483603157, + "loss": 0.742, + "step": 545 + }, + { + "epoch": 0.32961062481134923, + "grad_norm": 0.11193925142288208, + "learning_rate": 0.00018356164383561644, + "loss": 0.7062, + "step": 546 + }, + { + "epoch": 0.3302143072743737, + "grad_norm": 0.10192117840051651, + "learning_rate": 0.00018352013283520134, + "loss": 0.6246, + "step": 547 + }, + { + "epoch": 0.3308179897373981, + "grad_norm": 0.11968918889760971, + "learning_rate": 0.00018347862183478624, + "loss": 0.6263, + "step": 548 + }, + { + "epoch": 0.33142167220042257, + "grad_norm": 0.11495087295770645, + "learning_rate": 0.0001834371108343711, + "loss": 0.5845, + "step": 549 + }, + { + "epoch": 0.332025354663447, + "grad_norm": 0.1315176784992218, + "learning_rate": 0.000183395599833956, + "loss": 0.5137, + "step": 550 + }, + { + "epoch": 0.33262903712647146, + "grad_norm": 0.07526993751525879, + "learning_rate": 0.0001833540888335409, + "loss": 0.8475, + "step": 551 + }, + { + "epoch": 0.3332327195894959, + "grad_norm": 0.08781188726425171, + "learning_rate": 0.00018331257783312579, + "loss": 1.0003, + "step": 552 + }, + { + "epoch": 0.33383640205252035, + "grad_norm": 0.07417803257703781, + "learning_rate": 0.00018327106683271066, + "loss": 0.9112, + "step": 553 + }, + { + "epoch": 0.3344400845155448, + "grad_norm": 0.08806382119655609, + "learning_rate": 0.00018322955583229559, + "loss": 0.9675, + "step": 554 + }, + { + "epoch": 0.33504376697856925, + "grad_norm": 0.08356613665819168, + "learning_rate": 0.00018318804483188046, + "loss": 0.808, + "step": 555 + }, + { + "epoch": 0.33564744944159375, + "grad_norm": 0.08373204618692398, + "learning_rate": 0.00018314653383146533, + "loss": 1.1798, + "step": 556 + }, + { + "epoch": 0.3362511319046182, + "grad_norm": 0.07093155384063721, + "learning_rate": 0.00018310502283105023, + "loss": 1.0393, + "step": 557 + }, + { + "epoch": 0.33685481436764264, + "grad_norm": 0.11354014277458191, + "learning_rate": 0.00018306351183063513, + "loss": 0.8586, + "step": 558 + }, + { + "epoch": 0.3374584968306671, + "grad_norm": 0.07332495599985123, + "learning_rate": 0.00018302200083022, + "loss": 0.6829, + "step": 559 + }, + { + "epoch": 0.33806217929369153, + "grad_norm": 0.07165679335594177, + "learning_rate": 0.0001829804898298049, + "loss": 0.8619, + "step": 560 + }, + { + "epoch": 0.338665861756716, + "grad_norm": 0.07097924500703812, + "learning_rate": 0.0001829389788293898, + "loss": 0.7529, + "step": 561 + }, + { + "epoch": 0.3392695442197404, + "grad_norm": 0.06817946583032608, + "learning_rate": 0.00018289746782897468, + "loss": 0.9555, + "step": 562 + }, + { + "epoch": 0.3398732266827649, + "grad_norm": 0.07513990998268127, + "learning_rate": 0.00018285595682855958, + "loss": 0.874, + "step": 563 + }, + { + "epoch": 0.3404769091457893, + "grad_norm": 0.07172597199678421, + "learning_rate": 0.00018281444582814448, + "loss": 0.8608, + "step": 564 + }, + { + "epoch": 0.34108059160881377, + "grad_norm": 0.08141127973794937, + "learning_rate": 0.00018277293482772935, + "loss": 1.1752, + "step": 565 + }, + { + "epoch": 0.3416842740718382, + "grad_norm": 0.06964030116796494, + "learning_rate": 0.00018273142382731425, + "loss": 0.8235, + "step": 566 + }, + { + "epoch": 0.34228795653486266, + "grad_norm": 0.07054075598716736, + "learning_rate": 0.00018268991282689912, + "loss": 0.9064, + "step": 567 + }, + { + "epoch": 0.3428916389978871, + "grad_norm": 0.08551137149333954, + "learning_rate": 0.00018264840182648402, + "loss": 1.1779, + "step": 568 + }, + { + "epoch": 0.34349532146091155, + "grad_norm": 0.06879527121782303, + "learning_rate": 0.00018260689082606892, + "loss": 0.8866, + "step": 569 + }, + { + "epoch": 0.344099003923936, + "grad_norm": 0.07618753612041473, + "learning_rate": 0.0001825653798256538, + "loss": 0.9287, + "step": 570 + }, + { + "epoch": 0.34470268638696044, + "grad_norm": 0.0731632187962532, + "learning_rate": 0.0001825238688252387, + "loss": 1.2391, + "step": 571 + }, + { + "epoch": 0.3453063688499849, + "grad_norm": 0.06929552555084229, + "learning_rate": 0.0001824823578248236, + "loss": 0.8374, + "step": 572 + }, + { + "epoch": 0.34591005131300934, + "grad_norm": 0.08287245035171509, + "learning_rate": 0.00018244084682440847, + "loss": 0.9951, + "step": 573 + }, + { + "epoch": 0.3465137337760338, + "grad_norm": 0.08636925369501114, + "learning_rate": 0.00018239933582399337, + "loss": 0.7663, + "step": 574 + }, + { + "epoch": 0.34711741623905823, + "grad_norm": 0.07229752093553543, + "learning_rate": 0.00018235782482357827, + "loss": 0.9063, + "step": 575 + }, + { + "epoch": 0.34772109870208273, + "grad_norm": 0.08006583899259567, + "learning_rate": 0.00018231631382316314, + "loss": 0.8608, + "step": 576 + }, + { + "epoch": 0.3483247811651072, + "grad_norm": 0.07500839233398438, + "learning_rate": 0.00018227480282274804, + "loss": 1.0615, + "step": 577 + }, + { + "epoch": 0.3489284636281316, + "grad_norm": 0.07585518062114716, + "learning_rate": 0.00018223329182233294, + "loss": 1.3967, + "step": 578 + }, + { + "epoch": 0.34953214609115607, + "grad_norm": 0.06830618530511856, + "learning_rate": 0.00018219178082191782, + "loss": 1.0998, + "step": 579 + }, + { + "epoch": 0.3501358285541805, + "grad_norm": 0.0734933540225029, + "learning_rate": 0.00018215026982150272, + "loss": 0.8375, + "step": 580 + }, + { + "epoch": 0.35073951101720496, + "grad_norm": 0.07250437140464783, + "learning_rate": 0.0001821087588210876, + "loss": 1.0154, + "step": 581 + }, + { + "epoch": 0.3513431934802294, + "grad_norm": 0.07496879994869232, + "learning_rate": 0.0001820672478206725, + "loss": 0.8673, + "step": 582 + }, + { + "epoch": 0.35194687594325386, + "grad_norm": 0.07836568355560303, + "learning_rate": 0.0001820257368202574, + "loss": 0.8566, + "step": 583 + }, + { + "epoch": 0.3525505584062783, + "grad_norm": 0.07284071296453476, + "learning_rate": 0.00018198422581984226, + "loss": 0.7801, + "step": 584 + }, + { + "epoch": 0.35315424086930275, + "grad_norm": 0.07430974394083023, + "learning_rate": 0.00018194271481942716, + "loss": 0.7227, + "step": 585 + }, + { + "epoch": 0.3537579233323272, + "grad_norm": 0.0871773213148117, + "learning_rate": 0.00018190120381901206, + "loss": 0.6884, + "step": 586 + }, + { + "epoch": 0.35436160579535164, + "grad_norm": 0.08565318584442139, + "learning_rate": 0.00018185969281859693, + "loss": 0.7762, + "step": 587 + }, + { + "epoch": 0.3549652882583761, + "grad_norm": 0.07946131378412247, + "learning_rate": 0.00018181818181818183, + "loss": 1.008, + "step": 588 + }, + { + "epoch": 0.35556897072140053, + "grad_norm": 0.07984606176614761, + "learning_rate": 0.00018177667081776673, + "loss": 0.8231, + "step": 589 + }, + { + "epoch": 0.356172653184425, + "grad_norm": 0.08642979711294174, + "learning_rate": 0.0001817351598173516, + "loss": 0.6995, + "step": 590 + }, + { + "epoch": 0.3567763356474494, + "grad_norm": 0.0884912982583046, + "learning_rate": 0.00018169364881693648, + "loss": 0.7622, + "step": 591 + }, + { + "epoch": 0.3573800181104739, + "grad_norm": 0.08437898755073547, + "learning_rate": 0.0001816521378165214, + "loss": 0.6997, + "step": 592 + }, + { + "epoch": 0.3579837005734983, + "grad_norm": 0.1158236563205719, + "learning_rate": 0.00018161062681610628, + "loss": 0.7354, + "step": 593 + }, + { + "epoch": 0.35858738303652277, + "grad_norm": 0.09563788026571274, + "learning_rate": 0.00018156911581569115, + "loss": 0.8278, + "step": 594 + }, + { + "epoch": 0.3591910654995472, + "grad_norm": 0.10473807901144028, + "learning_rate": 0.00018152760481527605, + "loss": 0.7105, + "step": 595 + }, + { + "epoch": 0.3597947479625717, + "grad_norm": 0.09579318761825562, + "learning_rate": 0.00018148609381486095, + "loss": 0.6555, + "step": 596 + }, + { + "epoch": 0.36039843042559616, + "grad_norm": 0.10552844405174255, + "learning_rate": 0.00018144458281444583, + "loss": 0.6662, + "step": 597 + }, + { + "epoch": 0.3610021128886206, + "grad_norm": 0.10778263211250305, + "learning_rate": 0.00018140307181403073, + "loss": 0.6501, + "step": 598 + }, + { + "epoch": 0.36160579535164505, + "grad_norm": 0.11160840839147568, + "learning_rate": 0.00018136156081361563, + "loss": 0.5741, + "step": 599 + }, + { + "epoch": 0.3622094778146695, + "grad_norm": 0.11790695786476135, + "learning_rate": 0.0001813200498132005, + "loss": 0.507, + "step": 600 + }, + { + "epoch": 0.36281316027769395, + "grad_norm": 0.09489453583955765, + "learning_rate": 0.0001812785388127854, + "loss": 0.905, + "step": 601 + }, + { + "epoch": 0.3634168427407184, + "grad_norm": 0.07241594046354294, + "learning_rate": 0.0001812370278123703, + "loss": 0.9352, + "step": 602 + }, + { + "epoch": 0.36402052520374284, + "grad_norm": 0.07990288734436035, + "learning_rate": 0.00018119551681195517, + "loss": 0.9155, + "step": 603 + }, + { + "epoch": 0.3646242076667673, + "grad_norm": 0.07297532260417938, + "learning_rate": 0.00018115400581154007, + "loss": 0.9952, + "step": 604 + }, + { + "epoch": 0.36522789012979173, + "grad_norm": 0.10625027120113373, + "learning_rate": 0.00018111249481112495, + "loss": 1.0103, + "step": 605 + }, + { + "epoch": 0.3658315725928162, + "grad_norm": 0.0755513533949852, + "learning_rate": 0.00018107098381070985, + "loss": 0.9527, + "step": 606 + }, + { + "epoch": 0.3664352550558406, + "grad_norm": 0.0824398621916771, + "learning_rate": 0.00018102947281029475, + "loss": 1.032, + "step": 607 + }, + { + "epoch": 0.36703893751886507, + "grad_norm": 0.07464707642793655, + "learning_rate": 0.00018098796180987962, + "loss": 0.9073, + "step": 608 + }, + { + "epoch": 0.3676426199818895, + "grad_norm": 0.07036054134368896, + "learning_rate": 0.00018094645080946452, + "loss": 0.8517, + "step": 609 + }, + { + "epoch": 0.36824630244491396, + "grad_norm": 0.10687808692455292, + "learning_rate": 0.00018090493980904942, + "loss": 0.7676, + "step": 610 + }, + { + "epoch": 0.3688499849079384, + "grad_norm": 0.07665056735277176, + "learning_rate": 0.0001808634288086343, + "loss": 0.9801, + "step": 611 + }, + { + "epoch": 0.36945366737096286, + "grad_norm": 0.06765671819448471, + "learning_rate": 0.00018082191780821916, + "loss": 0.7543, + "step": 612 + }, + { + "epoch": 0.3700573498339873, + "grad_norm": 0.07217054814100266, + "learning_rate": 0.0001807804068078041, + "loss": 1.0022, + "step": 613 + }, + { + "epoch": 0.37066103229701175, + "grad_norm": 0.14387869834899902, + "learning_rate": 0.00018073889580738896, + "loss": 0.7746, + "step": 614 + }, + { + "epoch": 0.3712647147600362, + "grad_norm": 0.07988094538450241, + "learning_rate": 0.00018069738480697384, + "loss": 1.327, + "step": 615 + }, + { + "epoch": 0.37186839722306064, + "grad_norm": 0.06843268126249313, + "learning_rate": 0.00018065587380655876, + "loss": 0.7978, + "step": 616 + }, + { + "epoch": 0.37247207968608514, + "grad_norm": 0.0766974464058876, + "learning_rate": 0.00018061436280614364, + "loss": 0.7766, + "step": 617 + }, + { + "epoch": 0.3730757621491096, + "grad_norm": 0.08933945000171661, + "learning_rate": 0.0001805728518057285, + "loss": 0.7874, + "step": 618 + }, + { + "epoch": 0.37367944461213404, + "grad_norm": 0.07440406084060669, + "learning_rate": 0.0001805313408053134, + "loss": 0.722, + "step": 619 + }, + { + "epoch": 0.3742831270751585, + "grad_norm": 0.0785699188709259, + "learning_rate": 0.0001804898298048983, + "loss": 1.0223, + "step": 620 + }, + { + "epoch": 0.37488680953818293, + "grad_norm": 0.07815410941839218, + "learning_rate": 0.0001804483188044832, + "loss": 0.8886, + "step": 621 + }, + { + "epoch": 0.3754904920012074, + "grad_norm": 0.06365236639976501, + "learning_rate": 0.00018040680780406808, + "loss": 0.7909, + "step": 622 + }, + { + "epoch": 0.3760941744642318, + "grad_norm": 0.07768423855304718, + "learning_rate": 0.00018036529680365298, + "loss": 0.8831, + "step": 623 + }, + { + "epoch": 0.37669785692725627, + "grad_norm": 0.07100118696689606, + "learning_rate": 0.00018032378580323788, + "loss": 0.8649, + "step": 624 + }, + { + "epoch": 0.3773015393902807, + "grad_norm": 0.08485390990972519, + "learning_rate": 0.00018028227480282276, + "loss": 0.731, + "step": 625 + }, + { + "epoch": 0.37790522185330516, + "grad_norm": 0.07133664935827255, + "learning_rate": 0.00018024076380240763, + "loss": 0.7825, + "step": 626 + }, + { + "epoch": 0.3785089043163296, + "grad_norm": 0.06745673716068268, + "learning_rate": 0.00018019925280199256, + "loss": 0.7547, + "step": 627 + }, + { + "epoch": 0.37911258677935405, + "grad_norm": 0.08499856293201447, + "learning_rate": 0.00018015774180157743, + "loss": 0.7289, + "step": 628 + }, + { + "epoch": 0.3797162692423785, + "grad_norm": 0.07550381869077682, + "learning_rate": 0.0001801162308011623, + "loss": 1.0874, + "step": 629 + }, + { + "epoch": 0.38031995170540295, + "grad_norm": 0.07187377661466599, + "learning_rate": 0.00018007471980074723, + "loss": 0.7885, + "step": 630 + }, + { + "epoch": 0.3809236341684274, + "grad_norm": 0.07693067193031311, + "learning_rate": 0.0001800332088003321, + "loss": 0.9349, + "step": 631 + }, + { + "epoch": 0.38152731663145184, + "grad_norm": 0.0718834400177002, + "learning_rate": 0.00017999169779991698, + "loss": 0.8609, + "step": 632 + }, + { + "epoch": 0.3821309990944763, + "grad_norm": 0.07122069597244263, + "learning_rate": 0.00017995018679950188, + "loss": 0.8418, + "step": 633 + }, + { + "epoch": 0.38273468155750073, + "grad_norm": 0.0743737518787384, + "learning_rate": 0.00017990867579908678, + "loss": 0.7195, + "step": 634 + }, + { + "epoch": 0.3833383640205252, + "grad_norm": 0.07467867434024811, + "learning_rate": 0.00017986716479867165, + "loss": 0.7802, + "step": 635 + }, + { + "epoch": 0.3839420464835496, + "grad_norm": 0.07535380125045776, + "learning_rate": 0.00017982565379825655, + "loss": 0.7726, + "step": 636 + }, + { + "epoch": 0.38454572894657413, + "grad_norm": 0.07515112310647964, + "learning_rate": 0.00017978414279784145, + "loss": 0.7936, + "step": 637 + }, + { + "epoch": 0.3851494114095986, + "grad_norm": 0.08222845941781998, + "learning_rate": 0.00017974263179742632, + "loss": 0.7473, + "step": 638 + }, + { + "epoch": 0.385753093872623, + "grad_norm": 0.09565534442663193, + "learning_rate": 0.00017970112079701122, + "loss": 0.7899, + "step": 639 + }, + { + "epoch": 0.38635677633564747, + "grad_norm": 0.0862174928188324, + "learning_rate": 0.0001796596097965961, + "loss": 0.7633, + "step": 640 + }, + { + "epoch": 0.3869604587986719, + "grad_norm": 0.08321114629507065, + "learning_rate": 0.000179618098796181, + "loss": 0.6477, + "step": 641 + }, + { + "epoch": 0.38756414126169636, + "grad_norm": 0.09201504290103912, + "learning_rate": 0.0001795765877957659, + "loss": 0.7508, + "step": 642 + }, + { + "epoch": 0.3881678237247208, + "grad_norm": 0.09954454004764557, + "learning_rate": 0.00017953507679535077, + "loss": 0.7636, + "step": 643 + }, + { + "epoch": 0.38877150618774525, + "grad_norm": 0.10669595003128052, + "learning_rate": 0.00017949356579493567, + "loss": 0.7761, + "step": 644 + }, + { + "epoch": 0.3893751886507697, + "grad_norm": 0.09882800281047821, + "learning_rate": 0.00017945205479452057, + "loss": 0.7333, + "step": 645 + }, + { + "epoch": 0.38997887111379415, + "grad_norm": 0.10011550784111023, + "learning_rate": 0.00017941054379410544, + "loss": 0.6698, + "step": 646 + }, + { + "epoch": 0.3905825535768186, + "grad_norm": 0.1063833087682724, + "learning_rate": 0.00017936903279369034, + "loss": 0.6371, + "step": 647 + }, + { + "epoch": 0.39118623603984304, + "grad_norm": 0.11087442189455032, + "learning_rate": 0.00017932752179327524, + "loss": 0.6137, + "step": 648 + }, + { + "epoch": 0.3917899185028675, + "grad_norm": 0.1278572678565979, + "learning_rate": 0.0001792860107928601, + "loss": 0.6264, + "step": 649 + }, + { + "epoch": 0.39239360096589193, + "grad_norm": 0.13009613752365112, + "learning_rate": 0.00017924449979244499, + "loss": 0.5163, + "step": 650 + }, + { + "epoch": 0.3929972834289164, + "grad_norm": 0.08168315887451172, + "learning_rate": 0.0001792029887920299, + "loss": 0.8955, + "step": 651 + }, + { + "epoch": 0.3936009658919408, + "grad_norm": 0.08920527994632721, + "learning_rate": 0.00017916147779161479, + "loss": 1.0973, + "step": 652 + }, + { + "epoch": 0.39420464835496527, + "grad_norm": 0.0853213369846344, + "learning_rate": 0.00017911996679119966, + "loss": 0.8951, + "step": 653 + }, + { + "epoch": 0.3948083308179897, + "grad_norm": 0.07048202306032181, + "learning_rate": 0.00017907845579078456, + "loss": 0.6935, + "step": 654 + }, + { + "epoch": 0.39541201328101416, + "grad_norm": 0.08051397651433945, + "learning_rate": 0.00017903694479036946, + "loss": 1.1476, + "step": 655 + }, + { + "epoch": 0.3960156957440386, + "grad_norm": 0.07060935348272324, + "learning_rate": 0.00017899543378995433, + "loss": 0.7518, + "step": 656 + }, + { + "epoch": 0.3966193782070631, + "grad_norm": 0.0792861208319664, + "learning_rate": 0.00017895392278953923, + "loss": 0.843, + "step": 657 + }, + { + "epoch": 0.39722306067008756, + "grad_norm": 0.07856278121471405, + "learning_rate": 0.00017891241178912413, + "loss": 0.949, + "step": 658 + }, + { + "epoch": 0.397826743133112, + "grad_norm": 0.07348600775003433, + "learning_rate": 0.000178870900788709, + "loss": 0.902, + "step": 659 + }, + { + "epoch": 0.39843042559613645, + "grad_norm": 0.06807007640600204, + "learning_rate": 0.0001788293897882939, + "loss": 0.8443, + "step": 660 + }, + { + "epoch": 0.3990341080591609, + "grad_norm": 0.07370761781930923, + "learning_rate": 0.0001787878787878788, + "loss": 0.8161, + "step": 661 + }, + { + "epoch": 0.39963779052218534, + "grad_norm": 0.08829770982265472, + "learning_rate": 0.00017874636778746368, + "loss": 0.7679, + "step": 662 + }, + { + "epoch": 0.4002414729852098, + "grad_norm": 0.07680663466453552, + "learning_rate": 0.00017870485678704858, + "loss": 0.8106, + "step": 663 + }, + { + "epoch": 0.40084515544823424, + "grad_norm": 0.07080180197954178, + "learning_rate": 0.00017866334578663345, + "loss": 0.8348, + "step": 664 + }, + { + "epoch": 0.4014488379112587, + "grad_norm": 0.0829470232129097, + "learning_rate": 0.00017862183478621838, + "loss": 0.8589, + "step": 665 + }, + { + "epoch": 0.40205252037428313, + "grad_norm": 0.065013587474823, + "learning_rate": 0.00017858032378580325, + "loss": 0.736, + "step": 666 + }, + { + "epoch": 0.4026562028373076, + "grad_norm": 0.07887616008520126, + "learning_rate": 0.00017853881278538812, + "loss": 0.8547, + "step": 667 + }, + { + "epoch": 0.403259885300332, + "grad_norm": 0.07337549328804016, + "learning_rate": 0.00017849730178497302, + "loss": 0.8766, + "step": 668 + }, + { + "epoch": 0.40386356776335647, + "grad_norm": 0.06863244622945786, + "learning_rate": 0.00017845579078455792, + "loss": 0.7484, + "step": 669 + }, + { + "epoch": 0.4044672502263809, + "grad_norm": 0.07943541556596756, + "learning_rate": 0.0001784142797841428, + "loss": 0.8951, + "step": 670 + }, + { + "epoch": 0.40507093268940536, + "grad_norm": 0.07468891143798828, + "learning_rate": 0.0001783727687837277, + "loss": 0.8805, + "step": 671 + }, + { + "epoch": 0.4056746151524298, + "grad_norm": 0.06805651634931564, + "learning_rate": 0.0001783312577833126, + "loss": 0.9149, + "step": 672 + }, + { + "epoch": 0.40627829761545425, + "grad_norm": 0.08728659898042679, + "learning_rate": 0.00017828974678289747, + "loss": 1.1018, + "step": 673 + }, + { + "epoch": 0.4068819800784787, + "grad_norm": 0.07230555266141891, + "learning_rate": 0.00017824823578248237, + "loss": 0.8377, + "step": 674 + }, + { + "epoch": 0.40748566254150315, + "grad_norm": 0.07561583071947098, + "learning_rate": 0.00017820672478206727, + "loss": 0.8613, + "step": 675 + }, + { + "epoch": 0.4080893450045276, + "grad_norm": 0.07437833398580551, + "learning_rate": 0.00017816521378165214, + "loss": 0.9668, + "step": 676 + }, + { + "epoch": 0.4086930274675521, + "grad_norm": 0.09480807930231094, + "learning_rate": 0.00017812370278123704, + "loss": 0.9128, + "step": 677 + }, + { + "epoch": 0.40929670993057654, + "grad_norm": 0.07243344187736511, + "learning_rate": 0.00017808219178082192, + "loss": 1.0159, + "step": 678 + }, + { + "epoch": 0.409900392393601, + "grad_norm": 0.07010567933320999, + "learning_rate": 0.00017804068078040682, + "loss": 0.7627, + "step": 679 + }, + { + "epoch": 0.41050407485662543, + "grad_norm": 0.07201089709997177, + "learning_rate": 0.00017799916977999172, + "loss": 1.0637, + "step": 680 + }, + { + "epoch": 0.4111077573196499, + "grad_norm": 0.0743207260966301, + "learning_rate": 0.0001779576587795766, + "loss": 0.8637, + "step": 681 + }, + { + "epoch": 0.4117114397826743, + "grad_norm": 0.06971647590398788, + "learning_rate": 0.0001779161477791615, + "loss": 0.8119, + "step": 682 + }, + { + "epoch": 0.4123151222456988, + "grad_norm": 0.07709638774394989, + "learning_rate": 0.0001778746367787464, + "loss": 0.8523, + "step": 683 + }, + { + "epoch": 0.4129188047087232, + "grad_norm": 0.07961713522672653, + "learning_rate": 0.00017783312577833126, + "loss": 0.8088, + "step": 684 + }, + { + "epoch": 0.41352248717174767, + "grad_norm": 0.0781940221786499, + "learning_rate": 0.00017779161477791616, + "loss": 0.7029, + "step": 685 + }, + { + "epoch": 0.4141261696347721, + "grad_norm": 0.07617820799350739, + "learning_rate": 0.00017775010377750106, + "loss": 0.7593, + "step": 686 + }, + { + "epoch": 0.41472985209779656, + "grad_norm": 0.08119190484285355, + "learning_rate": 0.00017770859277708593, + "loss": 0.8067, + "step": 687 + }, + { + "epoch": 0.415333534560821, + "grad_norm": 0.07956133782863617, + "learning_rate": 0.0001776670817766708, + "loss": 0.7693, + "step": 688 + }, + { + "epoch": 0.41593721702384545, + "grad_norm": 0.08153844624757767, + "learning_rate": 0.00017762557077625574, + "loss": 0.7713, + "step": 689 + }, + { + "epoch": 0.4165408994868699, + "grad_norm": 0.08829324692487717, + "learning_rate": 0.0001775840597758406, + "loss": 0.7574, + "step": 690 + }, + { + "epoch": 0.41714458194989434, + "grad_norm": 0.09734578430652618, + "learning_rate": 0.00017754254877542548, + "loss": 0.7962, + "step": 691 + }, + { + "epoch": 0.4177482644129188, + "grad_norm": 0.08630529791116714, + "learning_rate": 0.00017750103777501038, + "loss": 0.7482, + "step": 692 + }, + { + "epoch": 0.41835194687594324, + "grad_norm": 0.09447003155946732, + "learning_rate": 0.00017745952677459528, + "loss": 0.716, + "step": 693 + }, + { + "epoch": 0.4189556293389677, + "grad_norm": 0.09416269510984421, + "learning_rate": 0.00017741801577418015, + "loss": 0.7481, + "step": 694 + }, + { + "epoch": 0.41955931180199213, + "grad_norm": 0.10847954452037811, + "learning_rate": 0.00017737650477376505, + "loss": 0.668, + "step": 695 + }, + { + "epoch": 0.4201629942650166, + "grad_norm": 0.10643991082906723, + "learning_rate": 0.00017733499377334995, + "loss": 0.6973, + "step": 696 + }, + { + "epoch": 0.4207666767280411, + "grad_norm": 0.10840442776679993, + "learning_rate": 0.00017729348277293483, + "loss": 0.6755, + "step": 697 + }, + { + "epoch": 0.4213703591910655, + "grad_norm": 0.10803359001874924, + "learning_rate": 0.00017725197177251973, + "loss": 0.6234, + "step": 698 + }, + { + "epoch": 0.42197404165408997, + "grad_norm": 0.12027863413095474, + "learning_rate": 0.00017721046077210463, + "loss": 0.6337, + "step": 699 + }, + { + "epoch": 0.4225777241171144, + "grad_norm": 0.1258537471294403, + "learning_rate": 0.0001771689497716895, + "loss": 0.447, + "step": 700 + }, + { + "epoch": 0.42318140658013886, + "grad_norm": 0.07856849581003189, + "learning_rate": 0.0001771274387712744, + "loss": 1.0295, + "step": 701 + }, + { + "epoch": 0.4237850890431633, + "grad_norm": 0.08135174959897995, + "learning_rate": 0.00017708592777085927, + "loss": 0.8548, + "step": 702 + }, + { + "epoch": 0.42438877150618776, + "grad_norm": 0.08222603797912598, + "learning_rate": 0.00017704441677044417, + "loss": 0.8463, + "step": 703 + }, + { + "epoch": 0.4249924539692122, + "grad_norm": 0.07562818378210068, + "learning_rate": 0.00017700290577002907, + "loss": 0.7474, + "step": 704 + }, + { + "epoch": 0.42559613643223665, + "grad_norm": 0.11416960507631302, + "learning_rate": 0.00017696139476961395, + "loss": 0.9287, + "step": 705 + }, + { + "epoch": 0.4261998188952611, + "grad_norm": 0.07070968300104141, + "learning_rate": 0.00017691988376919885, + "loss": 0.7686, + "step": 706 + }, + { + "epoch": 0.42680350135828554, + "grad_norm": 0.10440545529127121, + "learning_rate": 0.00017687837276878375, + "loss": 0.8466, + "step": 707 + }, + { + "epoch": 0.42740718382131, + "grad_norm": 0.10694804787635803, + "learning_rate": 0.00017683686176836862, + "loss": 0.7819, + "step": 708 + }, + { + "epoch": 0.42801086628433443, + "grad_norm": 0.07630432397127151, + "learning_rate": 0.00017679535076795352, + "loss": 0.8324, + "step": 709 + }, + { + "epoch": 0.4286145487473589, + "grad_norm": 0.07171212136745453, + "learning_rate": 0.00017675383976753842, + "loss": 0.819, + "step": 710 + }, + { + "epoch": 0.4292182312103833, + "grad_norm": 0.06875535845756531, + "learning_rate": 0.0001767123287671233, + "loss": 0.7489, + "step": 711 + }, + { + "epoch": 0.4298219136734078, + "grad_norm": 0.07789517194032669, + "learning_rate": 0.0001766708177667082, + "loss": 0.8193, + "step": 712 + }, + { + "epoch": 0.4304255961364322, + "grad_norm": 0.07542435824871063, + "learning_rate": 0.0001766293067662931, + "loss": 1.0486, + "step": 713 + }, + { + "epoch": 0.43102927859945667, + "grad_norm": 0.07148928195238113, + "learning_rate": 0.00017658779576587796, + "loss": 1.1104, + "step": 714 + }, + { + "epoch": 0.4316329610624811, + "grad_norm": 0.0701369047164917, + "learning_rate": 0.00017654628476546286, + "loss": 0.8409, + "step": 715 + }, + { + "epoch": 0.43223664352550556, + "grad_norm": 0.07325640320777893, + "learning_rate": 0.00017650477376504774, + "loss": 0.8249, + "step": 716 + }, + { + "epoch": 0.43284032598853, + "grad_norm": 0.07507720589637756, + "learning_rate": 0.00017646326276463264, + "loss": 0.8903, + "step": 717 + }, + { + "epoch": 0.4334440084515545, + "grad_norm": 0.07389452308416367, + "learning_rate": 0.00017642175176421754, + "loss": 0.9039, + "step": 718 + }, + { + "epoch": 0.43404769091457895, + "grad_norm": 0.07989213615655899, + "learning_rate": 0.0001763802407638024, + "loss": 0.8314, + "step": 719 + }, + { + "epoch": 0.4346513733776034, + "grad_norm": 0.0715138241648674, + "learning_rate": 0.0001763387297633873, + "loss": 0.8579, + "step": 720 + }, + { + "epoch": 0.43525505584062785, + "grad_norm": 0.09497915208339691, + "learning_rate": 0.0001762972187629722, + "loss": 0.8342, + "step": 721 + }, + { + "epoch": 0.4358587383036523, + "grad_norm": 0.07308909296989441, + "learning_rate": 0.00017625570776255708, + "loss": 0.87, + "step": 722 + }, + { + "epoch": 0.43646242076667674, + "grad_norm": 0.08746104687452316, + "learning_rate": 0.00017621419676214196, + "loss": 1.1309, + "step": 723 + }, + { + "epoch": 0.4370661032297012, + "grad_norm": 0.09257882833480835, + "learning_rate": 0.00017617268576172688, + "loss": 0.7745, + "step": 724 + }, + { + "epoch": 0.43766978569272563, + "grad_norm": 0.08195153623819351, + "learning_rate": 0.00017613117476131176, + "loss": 0.7545, + "step": 725 + }, + { + "epoch": 0.4382734681557501, + "grad_norm": 0.07948730885982513, + "learning_rate": 0.00017608966376089663, + "loss": 0.8637, + "step": 726 + }, + { + "epoch": 0.4388771506187745, + "grad_norm": 0.08171603083610535, + "learning_rate": 0.00017604815276048156, + "loss": 0.8005, + "step": 727 + }, + { + "epoch": 0.43948083308179897, + "grad_norm": 0.0735010951757431, + "learning_rate": 0.00017600664176006643, + "loss": 0.8667, + "step": 728 + }, + { + "epoch": 0.4400845155448234, + "grad_norm": 0.07493755221366882, + "learning_rate": 0.0001759651307596513, + "loss": 0.8406, + "step": 729 + }, + { + "epoch": 0.44068819800784786, + "grad_norm": 0.07382465153932571, + "learning_rate": 0.0001759236197592362, + "loss": 0.9226, + "step": 730 + }, + { + "epoch": 0.4412918804708723, + "grad_norm": 0.10297898948192596, + "learning_rate": 0.0001758821087588211, + "loss": 0.8471, + "step": 731 + }, + { + "epoch": 0.44189556293389676, + "grad_norm": 0.08343175053596497, + "learning_rate": 0.00017584059775840598, + "loss": 0.8474, + "step": 732 + }, + { + "epoch": 0.4424992453969212, + "grad_norm": 0.07784884423017502, + "learning_rate": 0.00017579908675799088, + "loss": 0.7984, + "step": 733 + }, + { + "epoch": 0.44310292785994565, + "grad_norm": 0.07451200485229492, + "learning_rate": 0.00017575757575757578, + "loss": 0.8766, + "step": 734 + }, + { + "epoch": 0.4437066103229701, + "grad_norm": 0.08384682983160019, + "learning_rate": 0.00017571606475716065, + "loss": 0.806, + "step": 735 + }, + { + "epoch": 0.44431029278599454, + "grad_norm": 0.0747980996966362, + "learning_rate": 0.00017567455375674555, + "loss": 0.6971, + "step": 736 + }, + { + "epoch": 0.444913975249019, + "grad_norm": 0.07966098934412003, + "learning_rate": 0.00017563304275633042, + "loss": 0.8149, + "step": 737 + }, + { + "epoch": 0.4455176577120435, + "grad_norm": 0.08828859031200409, + "learning_rate": 0.00017559153175591532, + "loss": 0.7579, + "step": 738 + }, + { + "epoch": 0.44612134017506794, + "grad_norm": 0.08065015077590942, + "learning_rate": 0.00017555002075550022, + "loss": 0.7766, + "step": 739 + }, + { + "epoch": 0.4467250226380924, + "grad_norm": 0.09102361649274826, + "learning_rate": 0.0001755085097550851, + "loss": 0.8112, + "step": 740 + }, + { + "epoch": 0.44732870510111683, + "grad_norm": 0.09253661334514618, + "learning_rate": 0.00017546699875467, + "loss": 0.7208, + "step": 741 + }, + { + "epoch": 0.4479323875641413, + "grad_norm": 0.11388830095529556, + "learning_rate": 0.0001754254877542549, + "loss": 0.759, + "step": 742 + }, + { + "epoch": 0.4485360700271657, + "grad_norm": 0.09276237338781357, + "learning_rate": 0.00017538397675383977, + "loss": 0.7928, + "step": 743 + }, + { + "epoch": 0.44913975249019017, + "grad_norm": 0.09758086502552032, + "learning_rate": 0.00017534246575342467, + "loss": 0.7086, + "step": 744 + }, + { + "epoch": 0.4497434349532146, + "grad_norm": 0.1056419312953949, + "learning_rate": 0.00017530095475300957, + "loss": 0.7533, + "step": 745 + }, + { + "epoch": 0.45034711741623906, + "grad_norm": 0.10619323700666428, + "learning_rate": 0.00017525944375259444, + "loss": 0.7371, + "step": 746 + }, + { + "epoch": 0.4509507998792635, + "grad_norm": 0.13212867081165314, + "learning_rate": 0.00017521793275217931, + "loss": 0.6729, + "step": 747 + }, + { + "epoch": 0.45155448234228796, + "grad_norm": 0.12233047932386398, + "learning_rate": 0.00017517642175176424, + "loss": 0.6855, + "step": 748 + }, + { + "epoch": 0.4521581648053124, + "grad_norm": 0.11795574426651001, + "learning_rate": 0.00017513491075134911, + "loss": 0.522, + "step": 749 + }, + { + "epoch": 0.45276184726833685, + "grad_norm": 0.12232359498739243, + "learning_rate": 0.000175093399750934, + "loss": 0.5003, + "step": 750 + }, + { + "epoch": 0.4533655297313613, + "grad_norm": 0.11266329884529114, + "learning_rate": 0.0001750518887505189, + "loss": 0.7975, + "step": 751 + }, + { + "epoch": 0.45396921219438574, + "grad_norm": 0.12558221817016602, + "learning_rate": 0.0001750103777501038, + "loss": 1.0336, + "step": 752 + }, + { + "epoch": 0.4545728946574102, + "grad_norm": 0.08540918678045273, + "learning_rate": 0.0001749688667496887, + "loss": 0.8133, + "step": 753 + }, + { + "epoch": 0.45517657712043463, + "grad_norm": 0.07365544140338898, + "learning_rate": 0.00017492735574927356, + "loss": 0.9823, + "step": 754 + }, + { + "epoch": 0.4557802595834591, + "grad_norm": 0.07597804814577103, + "learning_rate": 0.00017488584474885846, + "loss": 0.8333, + "step": 755 + }, + { + "epoch": 0.4563839420464835, + "grad_norm": 0.12274730950593948, + "learning_rate": 0.00017484433374844336, + "loss": 0.8916, + "step": 756 + }, + { + "epoch": 0.456987624509508, + "grad_norm": 0.07644704729318619, + "learning_rate": 0.00017480282274802823, + "loss": 1.2536, + "step": 757 + }, + { + "epoch": 0.4575913069725325, + "grad_norm": 0.0743841826915741, + "learning_rate": 0.00017476131174761313, + "loss": 0.8892, + "step": 758 + }, + { + "epoch": 0.4581949894355569, + "grad_norm": 0.07549133896827698, + "learning_rate": 0.00017471980074719803, + "loss": 0.9356, + "step": 759 + }, + { + "epoch": 0.45879867189858137, + "grad_norm": 0.10283336043357849, + "learning_rate": 0.0001746782897467829, + "loss": 0.8478, + "step": 760 + }, + { + "epoch": 0.4594023543616058, + "grad_norm": 0.0773346945643425, + "learning_rate": 0.00017463677874636778, + "loss": 0.8264, + "step": 761 + }, + { + "epoch": 0.46000603682463026, + "grad_norm": 0.08246394246816635, + "learning_rate": 0.0001745952677459527, + "loss": 0.8789, + "step": 762 + }, + { + "epoch": 0.4606097192876547, + "grad_norm": 0.0820598155260086, + "learning_rate": 0.00017455375674553758, + "loss": 0.8355, + "step": 763 + }, + { + "epoch": 0.46121340175067915, + "grad_norm": 0.08660144358873367, + "learning_rate": 0.00017451224574512245, + "loss": 0.9191, + "step": 764 + }, + { + "epoch": 0.4618170842137036, + "grad_norm": 0.07887571305036545, + "learning_rate": 0.00017447073474470735, + "loss": 0.8801, + "step": 765 + }, + { + "epoch": 0.46242076667672805, + "grad_norm": 0.07366171479225159, + "learning_rate": 0.00017442922374429225, + "loss": 0.9114, + "step": 766 + }, + { + "epoch": 0.4630244491397525, + "grad_norm": 0.07119464874267578, + "learning_rate": 0.00017438771274387712, + "loss": 0.7584, + "step": 767 + }, + { + "epoch": 0.46362813160277694, + "grad_norm": 0.07550892978906631, + "learning_rate": 0.00017434620174346202, + "loss": 0.9126, + "step": 768 + }, + { + "epoch": 0.4642318140658014, + "grad_norm": 0.07863172888755798, + "learning_rate": 0.00017430469074304692, + "loss": 0.8234, + "step": 769 + }, + { + "epoch": 0.46483549652882583, + "grad_norm": 0.08048754185438156, + "learning_rate": 0.0001742631797426318, + "loss": 0.7634, + "step": 770 + }, + { + "epoch": 0.4654391789918503, + "grad_norm": 0.0739179328083992, + "learning_rate": 0.0001742216687422167, + "loss": 0.782, + "step": 771 + }, + { + "epoch": 0.4660428614548747, + "grad_norm": 0.07369952648878098, + "learning_rate": 0.0001741801577418016, + "loss": 0.9965, + "step": 772 + }, + { + "epoch": 0.46664654391789917, + "grad_norm": 0.0785854384303093, + "learning_rate": 0.00017413864674138647, + "loss": 0.9077, + "step": 773 + }, + { + "epoch": 0.4672502263809236, + "grad_norm": 0.07083092629909515, + "learning_rate": 0.00017409713574097137, + "loss": 0.8157, + "step": 774 + }, + { + "epoch": 0.46785390884394806, + "grad_norm": 0.07597315311431885, + "learning_rate": 0.00017405562474055624, + "loss": 1.1588, + "step": 775 + }, + { + "epoch": 0.4684575913069725, + "grad_norm": 0.07124720513820648, + "learning_rate": 0.00017401411374014114, + "loss": 0.8374, + "step": 776 + }, + { + "epoch": 0.46906127376999696, + "grad_norm": 0.0762951672077179, + "learning_rate": 0.00017397260273972604, + "loss": 0.7713, + "step": 777 + }, + { + "epoch": 0.46966495623302146, + "grad_norm": 0.09306569397449493, + "learning_rate": 0.00017393109173931092, + "loss": 0.7561, + "step": 778 + }, + { + "epoch": 0.4702686386960459, + "grad_norm": 0.09182307124137878, + "learning_rate": 0.00017388958073889582, + "loss": 0.816, + "step": 779 + }, + { + "epoch": 0.47087232115907035, + "grad_norm": 0.08006258308887482, + "learning_rate": 0.00017384806973848072, + "loss": 0.7516, + "step": 780 + }, + { + "epoch": 0.4714760036220948, + "grad_norm": 0.07114402204751968, + "learning_rate": 0.0001738065587380656, + "loss": 0.8256, + "step": 781 + }, + { + "epoch": 0.47207968608511924, + "grad_norm": 0.08175129443407059, + "learning_rate": 0.0001737650477376505, + "loss": 0.9031, + "step": 782 + }, + { + "epoch": 0.4726833685481437, + "grad_norm": 0.06829746812582016, + "learning_rate": 0.0001737235367372354, + "loss": 0.7038, + "step": 783 + }, + { + "epoch": 0.47328705101116814, + "grad_norm": 0.07961759716272354, + "learning_rate": 0.00017368202573682026, + "loss": 0.8112, + "step": 784 + }, + { + "epoch": 0.4738907334741926, + "grad_norm": 0.08282366394996643, + "learning_rate": 0.00017364051473640514, + "loss": 0.7348, + "step": 785 + }, + { + "epoch": 0.47449441593721703, + "grad_norm": 0.07623188942670822, + "learning_rate": 0.00017359900373599006, + "loss": 0.7162, + "step": 786 + }, + { + "epoch": 0.4750980984002415, + "grad_norm": 0.07791578769683838, + "learning_rate": 0.00017355749273557494, + "loss": 0.7827, + "step": 787 + }, + { + "epoch": 0.4757017808632659, + "grad_norm": 0.07546679675579071, + "learning_rate": 0.0001735159817351598, + "loss": 0.733, + "step": 788 + }, + { + "epoch": 0.47630546332629037, + "grad_norm": 0.08498945832252502, + "learning_rate": 0.0001734744707347447, + "loss": 0.8845, + "step": 789 + }, + { + "epoch": 0.4769091457893148, + "grad_norm": 0.08883391320705414, + "learning_rate": 0.0001734329597343296, + "loss": 0.802, + "step": 790 + }, + { + "epoch": 0.47751282825233926, + "grad_norm": 0.08220940828323364, + "learning_rate": 0.00017339144873391448, + "loss": 0.7421, + "step": 791 + }, + { + "epoch": 0.4781165107153637, + "grad_norm": 0.0887073203921318, + "learning_rate": 0.00017334993773349938, + "loss": 0.7697, + "step": 792 + }, + { + "epoch": 0.47872019317838815, + "grad_norm": 0.09844100475311279, + "learning_rate": 0.00017330842673308428, + "loss": 0.7502, + "step": 793 + }, + { + "epoch": 0.4793238756414126, + "grad_norm": 0.10373784601688385, + "learning_rate": 0.00017326691573266915, + "loss": 0.7348, + "step": 794 + }, + { + "epoch": 0.47992755810443705, + "grad_norm": 0.09787629544734955, + "learning_rate": 0.00017322540473225405, + "loss": 0.7995, + "step": 795 + }, + { + "epoch": 0.4805312405674615, + "grad_norm": 0.10174565017223358, + "learning_rate": 0.00017318389373183895, + "loss": 0.6651, + "step": 796 + }, + { + "epoch": 0.48113492303048594, + "grad_norm": 0.10913451015949249, + "learning_rate": 0.00017314238273142385, + "loss": 0.6945, + "step": 797 + }, + { + "epoch": 0.48173860549351044, + "grad_norm": 0.109565369784832, + "learning_rate": 0.00017310087173100873, + "loss": 0.6027, + "step": 798 + }, + { + "epoch": 0.4823422879565349, + "grad_norm": 0.12080711126327515, + "learning_rate": 0.0001730593607305936, + "loss": 0.533, + "step": 799 + }, + { + "epoch": 0.48294597041955933, + "grad_norm": 0.11455202102661133, + "learning_rate": 0.00017301784973017853, + "loss": 0.4213, + "step": 800 + }, + { + "epoch": 0.4835496528825838, + "grad_norm": 0.08588794618844986, + "learning_rate": 0.0001729763387297634, + "loss": 0.8639, + "step": 801 + }, + { + "epoch": 0.4841533353456082, + "grad_norm": 0.08318472653627396, + "learning_rate": 0.00017293482772934827, + "loss": 0.9486, + "step": 802 + }, + { + "epoch": 0.4847570178086327, + "grad_norm": 0.08030420541763306, + "learning_rate": 0.00017289331672893317, + "loss": 0.8172, + "step": 803 + }, + { + "epoch": 0.4853607002716571, + "grad_norm": 0.08063428103923798, + "learning_rate": 0.00017285180572851807, + "loss": 0.7898, + "step": 804 + }, + { + "epoch": 0.48596438273468157, + "grad_norm": 0.07770851999521255, + "learning_rate": 0.00017281029472810295, + "loss": 1.0558, + "step": 805 + }, + { + "epoch": 0.486568065197706, + "grad_norm": 0.08149967342615128, + "learning_rate": 0.00017276878372768785, + "loss": 0.8185, + "step": 806 + }, + { + "epoch": 0.48717174766073046, + "grad_norm": 0.08028626441955566, + "learning_rate": 0.00017272727272727275, + "loss": 0.858, + "step": 807 + }, + { + "epoch": 0.4877754301237549, + "grad_norm": 0.07975967228412628, + "learning_rate": 0.00017268576172685762, + "loss": 0.861, + "step": 808 + }, + { + "epoch": 0.48837911258677935, + "grad_norm": 0.17568281292915344, + "learning_rate": 0.00017264425072644252, + "loss": 0.8738, + "step": 809 + }, + { + "epoch": 0.4889827950498038, + "grad_norm": 0.0747470036149025, + "learning_rate": 0.00017260273972602742, + "loss": 1.0316, + "step": 810 + }, + { + "epoch": 0.48958647751282824, + "grad_norm": 0.0755024403333664, + "learning_rate": 0.0001725612287256123, + "loss": 0.8248, + "step": 811 + }, + { + "epoch": 0.4901901599758527, + "grad_norm": 0.0713796466588974, + "learning_rate": 0.0001725197177251972, + "loss": 0.764, + "step": 812 + }, + { + "epoch": 0.49079384243887714, + "grad_norm": 0.07347027212381363, + "learning_rate": 0.00017247820672478207, + "loss": 0.8605, + "step": 813 + }, + { + "epoch": 0.4913975249019016, + "grad_norm": 0.07424753159284592, + "learning_rate": 0.00017243669572436697, + "loss": 0.7838, + "step": 814 + }, + { + "epoch": 0.49200120736492603, + "grad_norm": 0.08433883637189865, + "learning_rate": 0.00017239518472395187, + "loss": 0.8561, + "step": 815 + }, + { + "epoch": 0.4926048898279505, + "grad_norm": 0.07728437334299088, + "learning_rate": 0.00017235367372353674, + "loss": 0.8175, + "step": 816 + }, + { + "epoch": 0.4932085722909749, + "grad_norm": 0.0686701089143753, + "learning_rate": 0.00017231216272312164, + "loss": 0.9421, + "step": 817 + }, + { + "epoch": 0.49381225475399937, + "grad_norm": 0.08988066762685776, + "learning_rate": 0.00017227065172270654, + "loss": 0.8417, + "step": 818 + }, + { + "epoch": 0.49441593721702387, + "grad_norm": 0.07794896513223648, + "learning_rate": 0.0001722291407222914, + "loss": 0.9656, + "step": 819 + }, + { + "epoch": 0.4950196196800483, + "grad_norm": 0.0833602100610733, + "learning_rate": 0.00017218762972187628, + "loss": 0.8488, + "step": 820 + }, + { + "epoch": 0.49562330214307276, + "grad_norm": 0.08479736745357513, + "learning_rate": 0.0001721461187214612, + "loss": 0.8215, + "step": 821 + }, + { + "epoch": 0.4962269846060972, + "grad_norm": 0.07611726969480515, + "learning_rate": 0.00017210460772104608, + "loss": 0.7847, + "step": 822 + }, + { + "epoch": 0.49683066706912166, + "grad_norm": 0.07969488203525543, + "learning_rate": 0.00017206309672063096, + "loss": 0.7988, + "step": 823 + }, + { + "epoch": 0.4974343495321461, + "grad_norm": 0.07162702083587646, + "learning_rate": 0.00017202158572021588, + "loss": 0.8497, + "step": 824 + }, + { + "epoch": 0.49803803199517055, + "grad_norm": 0.0738188773393631, + "learning_rate": 0.00017198007471980076, + "loss": 0.9446, + "step": 825 + }, + { + "epoch": 0.498641714458195, + "grad_norm": 0.36350712180137634, + "learning_rate": 0.00017193856371938563, + "loss": 0.7566, + "step": 826 + }, + { + "epoch": 0.49924539692121944, + "grad_norm": 0.0755830928683281, + "learning_rate": 0.00017189705271897053, + "loss": 1.0117, + "step": 827 + }, + { + "epoch": 0.4998490793842439, + "grad_norm": 0.07795432209968567, + "learning_rate": 0.00017185554171855543, + "loss": 0.8203, + "step": 828 + }, + { + "epoch": 0.5004527618472684, + "grad_norm": 0.08904866874217987, + "learning_rate": 0.0001718140307181403, + "loss": 0.7696, + "step": 829 + }, + { + "epoch": 0.5010564443102928, + "grad_norm": 0.07873449474573135, + "learning_rate": 0.0001717725197177252, + "loss": 0.8633, + "step": 830 + }, + { + "epoch": 0.5016601267733173, + "grad_norm": 0.0743393748998642, + "learning_rate": 0.0001717310087173101, + "loss": 0.8497, + "step": 831 + }, + { + "epoch": 0.5022638092363417, + "grad_norm": 0.07956568151712418, + "learning_rate": 0.00017168949771689498, + "loss": 0.7725, + "step": 832 + }, + { + "epoch": 0.5028674916993662, + "grad_norm": 0.07717841863632202, + "learning_rate": 0.00017164798671647988, + "loss": 0.8644, + "step": 833 + }, + { + "epoch": 0.5034711741623906, + "grad_norm": 0.07158881425857544, + "learning_rate": 0.00017160647571606475, + "loss": 0.7757, + "step": 834 + }, + { + "epoch": 0.5040748566254151, + "grad_norm": 0.0767531618475914, + "learning_rate": 0.00017156496471564965, + "loss": 0.7311, + "step": 835 + }, + { + "epoch": 0.5046785390884395, + "grad_norm": 0.07811619341373444, + "learning_rate": 0.00017152345371523455, + "loss": 0.7659, + "step": 836 + }, + { + "epoch": 0.505282221551464, + "grad_norm": 0.07905134558677673, + "learning_rate": 0.00017148194271481942, + "loss": 0.7226, + "step": 837 + }, + { + "epoch": 0.5058859040144884, + "grad_norm": 0.08344464004039764, + "learning_rate": 0.00017144043171440432, + "loss": 0.7939, + "step": 838 + }, + { + "epoch": 0.5064895864775129, + "grad_norm": 0.07940410822629929, + "learning_rate": 0.00017139892071398922, + "loss": 0.7231, + "step": 839 + }, + { + "epoch": 0.5070932689405373, + "grad_norm": 0.08555116504430771, + "learning_rate": 0.0001713574097135741, + "loss": 0.7344, + "step": 840 + }, + { + "epoch": 0.5076969514035617, + "grad_norm": 0.09922269731760025, + "learning_rate": 0.000171315898713159, + "loss": 0.6382, + "step": 841 + }, + { + "epoch": 0.5083006338665862, + "grad_norm": 0.08947370946407318, + "learning_rate": 0.0001712743877127439, + "loss": 0.7337, + "step": 842 + }, + { + "epoch": 0.5089043163296106, + "grad_norm": 0.11347758769989014, + "learning_rate": 0.00017123287671232877, + "loss": 0.7492, + "step": 843 + }, + { + "epoch": 0.5095079987926351, + "grad_norm": 0.09824637323617935, + "learning_rate": 0.00017119136571191367, + "loss": 0.6762, + "step": 844 + }, + { + "epoch": 0.5101116812556595, + "grad_norm": 0.1099235787987709, + "learning_rate": 0.00017114985471149857, + "loss": 0.7085, + "step": 845 + }, + { + "epoch": 0.510715363718684, + "grad_norm": 0.11087027192115784, + "learning_rate": 0.00017110834371108344, + "loss": 0.7089, + "step": 846 + }, + { + "epoch": 0.5113190461817084, + "grad_norm": 0.13687917590141296, + "learning_rate": 0.00017106683271066834, + "loss": 0.6154, + "step": 847 + }, + { + "epoch": 0.5119227286447329, + "grad_norm": 0.11547640711069107, + "learning_rate": 0.00017102532171025321, + "loss": 0.5773, + "step": 848 + }, + { + "epoch": 0.5125264111077573, + "grad_norm": 0.12279754877090454, + "learning_rate": 0.00017098381070983811, + "loss": 0.5398, + "step": 849 + }, + { + "epoch": 0.5131300935707818, + "grad_norm": 0.12324254214763641, + "learning_rate": 0.00017094229970942301, + "loss": 0.4505, + "step": 850 + }, + { + "epoch": 0.5137337760338062, + "grad_norm": 0.07736624777317047, + "learning_rate": 0.0001709007887090079, + "loss": 0.9161, + "step": 851 + }, + { + "epoch": 0.5143374584968307, + "grad_norm": 0.08110132068395615, + "learning_rate": 0.0001708592777085928, + "loss": 0.7757, + "step": 852 + }, + { + "epoch": 0.5149411409598551, + "grad_norm": 0.08207719773054123, + "learning_rate": 0.0001708177667081777, + "loss": 0.9293, + "step": 853 + }, + { + "epoch": 0.5155448234228796, + "grad_norm": 0.07672082632780075, + "learning_rate": 0.00017077625570776256, + "loss": 0.8568, + "step": 854 + }, + { + "epoch": 0.516148505885904, + "grad_norm": 0.07514925301074982, + "learning_rate": 0.00017073474470734746, + "loss": 0.9212, + "step": 855 + }, + { + "epoch": 0.5167521883489284, + "grad_norm": 0.07040940970182419, + "learning_rate": 0.00017069323370693236, + "loss": 0.7008, + "step": 856 + }, + { + "epoch": 0.5173558708119529, + "grad_norm": 0.07006466388702393, + "learning_rate": 0.00017065172270651723, + "loss": 0.8403, + "step": 857 + }, + { + "epoch": 0.5179595532749773, + "grad_norm": 0.0859081968665123, + "learning_rate": 0.0001706102117061021, + "loss": 0.8658, + "step": 858 + }, + { + "epoch": 0.5185632357380018, + "grad_norm": 0.08266257494688034, + "learning_rate": 0.00017056870070568703, + "loss": 1.0393, + "step": 859 + }, + { + "epoch": 0.5191669182010262, + "grad_norm": 0.06764436513185501, + "learning_rate": 0.0001705271897052719, + "loss": 0.7812, + "step": 860 + }, + { + "epoch": 0.5197706006640507, + "grad_norm": 0.0848420113325119, + "learning_rate": 0.00017048567870485678, + "loss": 0.8968, + "step": 861 + }, + { + "epoch": 0.5203742831270751, + "grad_norm": 0.11803008615970612, + "learning_rate": 0.00017044416770444168, + "loss": 0.8015, + "step": 862 + }, + { + "epoch": 0.5209779655900996, + "grad_norm": 0.07445228844881058, + "learning_rate": 0.00017040265670402658, + "loss": 0.9567, + "step": 863 + }, + { + "epoch": 0.521581648053124, + "grad_norm": 0.0766722708940506, + "learning_rate": 0.00017036114570361145, + "loss": 1.2654, + "step": 864 + }, + { + "epoch": 0.5221853305161485, + "grad_norm": 0.0825088769197464, + "learning_rate": 0.00017031963470319635, + "loss": 1.1549, + "step": 865 + }, + { + "epoch": 0.5227890129791729, + "grad_norm": 0.0870489627122879, + "learning_rate": 0.00017027812370278125, + "loss": 0.945, + "step": 866 + }, + { + "epoch": 0.5233926954421974, + "grad_norm": 0.07320686429738998, + "learning_rate": 0.00017023661270236613, + "loss": 0.7858, + "step": 867 + }, + { + "epoch": 0.5239963779052218, + "grad_norm": 0.16626937687397003, + "learning_rate": 0.00017019510170195103, + "loss": 0.8496, + "step": 868 + }, + { + "epoch": 0.5246000603682464, + "grad_norm": 0.07288725674152374, + "learning_rate": 0.00017015359070153593, + "loss": 1.0189, + "step": 869 + }, + { + "epoch": 0.5252037428312708, + "grad_norm": 0.07230374217033386, + "learning_rate": 0.0001701120797011208, + "loss": 0.7273, + "step": 870 + }, + { + "epoch": 0.5258074252942952, + "grad_norm": 0.0818004459142685, + "learning_rate": 0.0001700705687007057, + "loss": 1.0229, + "step": 871 + }, + { + "epoch": 0.5264111077573197, + "grad_norm": 0.08718733489513397, + "learning_rate": 0.00017002905770029057, + "loss": 0.9639, + "step": 872 + }, + { + "epoch": 0.5270147902203441, + "grad_norm": 0.07159387320280075, + "learning_rate": 0.00016998754669987547, + "loss": 0.796, + "step": 873 + }, + { + "epoch": 0.5276184726833686, + "grad_norm": 0.08361268043518066, + "learning_rate": 0.00016994603569946037, + "loss": 0.8615, + "step": 874 + }, + { + "epoch": 0.528222155146393, + "grad_norm": 0.07266636192798615, + "learning_rate": 0.00016990452469904524, + "loss": 0.7398, + "step": 875 + }, + { + "epoch": 0.5288258376094175, + "grad_norm": 0.08292002975940704, + "learning_rate": 0.00016986301369863014, + "loss": 0.773, + "step": 876 + }, + { + "epoch": 0.5294295200724419, + "grad_norm": 0.08378379046916962, + "learning_rate": 0.00016982150269821504, + "loss": 0.882, + "step": 877 + }, + { + "epoch": 0.5300332025354664, + "grad_norm": 0.07400650531053543, + "learning_rate": 0.00016977999169779992, + "loss": 0.8248, + "step": 878 + }, + { + "epoch": 0.5306368849984908, + "grad_norm": 0.07494434714317322, + "learning_rate": 0.00016973848069738482, + "loss": 1.0832, + "step": 879 + }, + { + "epoch": 0.5312405674615153, + "grad_norm": 0.07507789880037308, + "learning_rate": 0.00016969696969696972, + "loss": 1.004, + "step": 880 + }, + { + "epoch": 0.5318442499245397, + "grad_norm": 0.08284460753202438, + "learning_rate": 0.0001696554586965546, + "loss": 0.7898, + "step": 881 + }, + { + "epoch": 0.5324479323875642, + "grad_norm": 0.07790551334619522, + "learning_rate": 0.00016961394769613946, + "loss": 1.0397, + "step": 882 + }, + { + "epoch": 0.5330516148505886, + "grad_norm": 0.07756227999925613, + "learning_rate": 0.0001695724366957244, + "loss": 0.7647, + "step": 883 + }, + { + "epoch": 0.533655297313613, + "grad_norm": 0.0776314064860344, + "learning_rate": 0.00016953092569530926, + "loss": 0.7354, + "step": 884 + }, + { + "epoch": 0.5342589797766375, + "grad_norm": 0.09555763006210327, + "learning_rate": 0.00016948941469489414, + "loss": 0.7396, + "step": 885 + }, + { + "epoch": 0.534862662239662, + "grad_norm": 0.08198418468236923, + "learning_rate": 0.00016944790369447904, + "loss": 0.8173, + "step": 886 + }, + { + "epoch": 0.5354663447026864, + "grad_norm": 0.0823531225323677, + "learning_rate": 0.00016940639269406394, + "loss": 0.7131, + "step": 887 + }, + { + "epoch": 0.5360700271657108, + "grad_norm": 0.08781224489212036, + "learning_rate": 0.00016936488169364884, + "loss": 0.6971, + "step": 888 + }, + { + "epoch": 0.5366737096287353, + "grad_norm": 0.0982750728726387, + "learning_rate": 0.0001693233706932337, + "loss": 0.8086, + "step": 889 + }, + { + "epoch": 0.5372773920917597, + "grad_norm": 0.08869129419326782, + "learning_rate": 0.0001692818596928186, + "loss": 0.7577, + "step": 890 + }, + { + "epoch": 0.5378810745547842, + "grad_norm": 0.08565336465835571, + "learning_rate": 0.0001692403486924035, + "loss": 0.7097, + "step": 891 + }, + { + "epoch": 0.5384847570178086, + "grad_norm": 0.08914349973201752, + "learning_rate": 0.00016919883769198838, + "loss": 0.6995, + "step": 892 + }, + { + "epoch": 0.5390884394808331, + "grad_norm": 0.10026069730520248, + "learning_rate": 0.00016915732669157328, + "loss": 0.7999, + "step": 893 + }, + { + "epoch": 0.5396921219438575, + "grad_norm": 0.1060984879732132, + "learning_rate": 0.00016911581569115818, + "loss": 0.7441, + "step": 894 + }, + { + "epoch": 0.540295804406882, + "grad_norm": 0.10654401034116745, + "learning_rate": 0.00016907430469074306, + "loss": 0.7129, + "step": 895 + }, + { + "epoch": 0.5408994868699064, + "grad_norm": 0.1091652438044548, + "learning_rate": 0.00016903279369032793, + "loss": 0.6522, + "step": 896 + }, + { + "epoch": 0.5415031693329309, + "grad_norm": 0.11495837569236755, + "learning_rate": 0.00016899128268991286, + "loss": 0.6387, + "step": 897 + }, + { + "epoch": 0.5421068517959553, + "grad_norm": 0.1222093477845192, + "learning_rate": 0.00016894977168949773, + "loss": 0.6086, + "step": 898 + }, + { + "epoch": 0.5427105342589797, + "grad_norm": 0.11351132392883301, + "learning_rate": 0.0001689082606890826, + "loss": 0.5575, + "step": 899 + }, + { + "epoch": 0.5433142167220042, + "grad_norm": 0.12081324309110641, + "learning_rate": 0.0001688667496886675, + "loss": 0.484, + "step": 900 + }, + { + "epoch": 0.5439178991850286, + "grad_norm": 0.10098535567522049, + "learning_rate": 0.0001688252386882524, + "loss": 0.8019, + "step": 901 + }, + { + "epoch": 0.5445215816480531, + "grad_norm": 0.07991302758455276, + "learning_rate": 0.00016878372768783727, + "loss": 0.8477, + "step": 902 + }, + { + "epoch": 0.5451252641110775, + "grad_norm": 0.07697251439094543, + "learning_rate": 0.00016874221668742217, + "loss": 0.7631, + "step": 903 + }, + { + "epoch": 0.545728946574102, + "grad_norm": 0.10373490303754807, + "learning_rate": 0.00016870070568700707, + "loss": 0.9223, + "step": 904 + }, + { + "epoch": 0.5463326290371264, + "grad_norm": 0.0777481198310852, + "learning_rate": 0.00016865919468659195, + "loss": 0.882, + "step": 905 + }, + { + "epoch": 0.5469363115001509, + "grad_norm": 0.08426318317651749, + "learning_rate": 0.00016861768368617685, + "loss": 1.0619, + "step": 906 + }, + { + "epoch": 0.5475399939631753, + "grad_norm": 0.07732190936803818, + "learning_rate": 0.00016857617268576175, + "loss": 0.7125, + "step": 907 + }, + { + "epoch": 0.5481436764261998, + "grad_norm": 0.07979828119277954, + "learning_rate": 0.00016853466168534662, + "loss": 0.8596, + "step": 908 + }, + { + "epoch": 0.5487473588892242, + "grad_norm": 0.10036496073007584, + "learning_rate": 0.00016849315068493152, + "loss": 0.8396, + "step": 909 + }, + { + "epoch": 0.5493510413522488, + "grad_norm": 0.07785385847091675, + "learning_rate": 0.0001684516396845164, + "loss": 0.7962, + "step": 910 + }, + { + "epoch": 0.5499547238152732, + "grad_norm": 0.07757363468408585, + "learning_rate": 0.0001684101286841013, + "loss": 0.9085, + "step": 911 + }, + { + "epoch": 0.5505584062782977, + "grad_norm": 0.07866904139518738, + "learning_rate": 0.0001683686176836862, + "loss": 0.7676, + "step": 912 + }, + { + "epoch": 0.5511620887413221, + "grad_norm": 0.08171574026346207, + "learning_rate": 0.00016832710668327107, + "loss": 0.8823, + "step": 913 + }, + { + "epoch": 0.5517657712043466, + "grad_norm": 0.09291453659534454, + "learning_rate": 0.00016828559568285597, + "loss": 0.8212, + "step": 914 + }, + { + "epoch": 0.552369453667371, + "grad_norm": 0.06771212071180344, + "learning_rate": 0.00016824408468244087, + "loss": 0.8255, + "step": 915 + }, + { + "epoch": 0.5529731361303954, + "grad_norm": 0.07624577730894089, + "learning_rate": 0.00016820257368202574, + "loss": 1.1116, + "step": 916 + }, + { + "epoch": 0.5535768185934199, + "grad_norm": 0.06806713342666626, + "learning_rate": 0.00016816106268161064, + "loss": 0.7341, + "step": 917 + }, + { + "epoch": 0.5541805010564443, + "grad_norm": 0.1294786036014557, + "learning_rate": 0.00016811955168119554, + "loss": 0.8429, + "step": 918 + }, + { + "epoch": 0.5547841835194688, + "grad_norm": 0.08023308962583542, + "learning_rate": 0.0001680780406807804, + "loss": 1.1129, + "step": 919 + }, + { + "epoch": 0.5553878659824932, + "grad_norm": 0.09215756505727768, + "learning_rate": 0.00016803652968036529, + "loss": 0.935, + "step": 920 + }, + { + "epoch": 0.5559915484455177, + "grad_norm": 0.11087724566459656, + "learning_rate": 0.0001679950186799502, + "loss": 0.764, + "step": 921 + }, + { + "epoch": 0.5565952309085421, + "grad_norm": 0.07170010358095169, + "learning_rate": 0.00016795350767953509, + "loss": 0.71, + "step": 922 + }, + { + "epoch": 0.5571989133715666, + "grad_norm": 0.07650398463010788, + "learning_rate": 0.00016791199667911996, + "loss": 0.7885, + "step": 923 + }, + { + "epoch": 0.557802595834591, + "grad_norm": 0.08096078783273697, + "learning_rate": 0.00016787048567870486, + "loss": 0.7443, + "step": 924 + }, + { + "epoch": 0.5584062782976155, + "grad_norm": 0.07324434071779251, + "learning_rate": 0.00016782897467828976, + "loss": 0.8045, + "step": 925 + }, + { + "epoch": 0.5590099607606399, + "grad_norm": 0.11291930824518204, + "learning_rate": 0.00016778746367787463, + "loss": 0.8388, + "step": 926 + }, + { + "epoch": 0.5596136432236644, + "grad_norm": 0.07567749172449112, + "learning_rate": 0.00016774595267745953, + "loss": 0.8203, + "step": 927 + }, + { + "epoch": 0.5602173256866888, + "grad_norm": 0.07367605715990067, + "learning_rate": 0.00016770444167704443, + "loss": 0.7428, + "step": 928 + }, + { + "epoch": 0.5608210081497133, + "grad_norm": 0.11504574120044708, + "learning_rate": 0.0001676629306766293, + "loss": 1.0331, + "step": 929 + }, + { + "epoch": 0.5614246906127377, + "grad_norm": 0.06944482028484344, + "learning_rate": 0.0001676214196762142, + "loss": 1.0409, + "step": 930 + }, + { + "epoch": 0.5620283730757621, + "grad_norm": 0.07487515360116959, + "learning_rate": 0.0001675799086757991, + "loss": 0.747, + "step": 931 + }, + { + "epoch": 0.5626320555387866, + "grad_norm": 0.09043581038713455, + "learning_rate": 0.000167538397675384, + "loss": 0.8308, + "step": 932 + }, + { + "epoch": 0.563235738001811, + "grad_norm": 0.07596288621425629, + "learning_rate": 0.00016749688667496888, + "loss": 0.6969, + "step": 933 + }, + { + "epoch": 0.5638394204648355, + "grad_norm": 0.08029479533433914, + "learning_rate": 0.00016745537567455375, + "loss": 0.7532, + "step": 934 + }, + { + "epoch": 0.5644431029278599, + "grad_norm": 0.08100748807191849, + "learning_rate": 0.00016741386467413868, + "loss": 0.7859, + "step": 935 + }, + { + "epoch": 0.5650467853908844, + "grad_norm": 0.08451302349567413, + "learning_rate": 0.00016737235367372355, + "loss": 0.7942, + "step": 936 + }, + { + "epoch": 0.5656504678539088, + "grad_norm": 0.0821182057261467, + "learning_rate": 0.00016733084267330842, + "loss": 0.7002, + "step": 937 + }, + { + "epoch": 0.5662541503169333, + "grad_norm": 0.08984285593032837, + "learning_rate": 0.00016728933167289332, + "loss": 0.7721, + "step": 938 + }, + { + "epoch": 0.5668578327799577, + "grad_norm": 0.09170673787593842, + "learning_rate": 0.00016724782067247822, + "loss": 0.7705, + "step": 939 + }, + { + "epoch": 0.5674615152429822, + "grad_norm": 0.09316025674343109, + "learning_rate": 0.0001672063096720631, + "loss": 0.7349, + "step": 940 + }, + { + "epoch": 0.5680651977060066, + "grad_norm": 0.09121593087911606, + "learning_rate": 0.000167164798671648, + "loss": 0.7074, + "step": 941 + }, + { + "epoch": 0.568668880169031, + "grad_norm": 0.099585622549057, + "learning_rate": 0.0001671232876712329, + "loss": 0.772, + "step": 942 + }, + { + "epoch": 0.5692725626320555, + "grad_norm": 0.09845862537622452, + "learning_rate": 0.00016708177667081777, + "loss": 0.6837, + "step": 943 + }, + { + "epoch": 0.56987624509508, + "grad_norm": 0.10467004030942917, + "learning_rate": 0.00016704026567040267, + "loss": 0.7469, + "step": 944 + }, + { + "epoch": 0.5704799275581044, + "grad_norm": 0.10466741770505905, + "learning_rate": 0.00016699875466998757, + "loss": 0.709, + "step": 945 + }, + { + "epoch": 0.5710836100211288, + "grad_norm": 0.11000916361808777, + "learning_rate": 0.00016695724366957244, + "loss": 0.6697, + "step": 946 + }, + { + "epoch": 0.5716872924841533, + "grad_norm": 0.1157151609659195, + "learning_rate": 0.00016691573266915734, + "loss": 0.6396, + "step": 947 + }, + { + "epoch": 0.5722909749471777, + "grad_norm": 0.11827955394983292, + "learning_rate": 0.00016687422166874222, + "loss": 0.6224, + "step": 948 + }, + { + "epoch": 0.5728946574102022, + "grad_norm": 0.12374337762594223, + "learning_rate": 0.00016683271066832712, + "loss": 0.5617, + "step": 949 + }, + { + "epoch": 0.5734983398732267, + "grad_norm": 0.12392137944698334, + "learning_rate": 0.00016679119966791202, + "loss": 0.4256, + "step": 950 + }, + { + "epoch": 0.5741020223362512, + "grad_norm": 0.09002061933279037, + "learning_rate": 0.0001667496886674969, + "loss": 0.8641, + "step": 951 + }, + { + "epoch": 0.5747057047992756, + "grad_norm": 0.0768359899520874, + "learning_rate": 0.0001667081776670818, + "loss": 0.6958, + "step": 952 + }, + { + "epoch": 0.5753093872623001, + "grad_norm": 0.07474057376384735, + "learning_rate": 0.0001666666666666667, + "loss": 0.9245, + "step": 953 + }, + { + "epoch": 0.5759130697253245, + "grad_norm": 0.0858532041311264, + "learning_rate": 0.00016662515566625156, + "loss": 0.801, + "step": 954 + }, + { + "epoch": 0.576516752188349, + "grad_norm": 0.08939801156520844, + "learning_rate": 0.00016658364466583643, + "loss": 0.8461, + "step": 955 + }, + { + "epoch": 0.5771204346513734, + "grad_norm": 0.07809244841337204, + "learning_rate": 0.00016654213366542136, + "loss": 0.9988, + "step": 956 + }, + { + "epoch": 0.5777241171143979, + "grad_norm": 0.08366213738918304, + "learning_rate": 0.00016650062266500623, + "loss": 0.8169, + "step": 957 + }, + { + "epoch": 0.5783277995774223, + "grad_norm": 0.08782380819320679, + "learning_rate": 0.0001664591116645911, + "loss": 0.8715, + "step": 958 + }, + { + "epoch": 0.5789314820404468, + "grad_norm": 0.0959751307964325, + "learning_rate": 0.00016641760066417603, + "loss": 0.9998, + "step": 959 + }, + { + "epoch": 0.5795351645034712, + "grad_norm": 0.07628454267978668, + "learning_rate": 0.0001663760896637609, + "loss": 0.8369, + "step": 960 + }, + { + "epoch": 0.5801388469664956, + "grad_norm": 0.0932275727391243, + "learning_rate": 0.00016633457866334578, + "loss": 1.0541, + "step": 961 + }, + { + "epoch": 0.5807425294295201, + "grad_norm": 0.09154736250638962, + "learning_rate": 0.00016629306766293068, + "loss": 0.7972, + "step": 962 + }, + { + "epoch": 0.5813462118925445, + "grad_norm": 0.09588005393743515, + "learning_rate": 0.00016625155666251558, + "loss": 0.9873, + "step": 963 + }, + { + "epoch": 0.581949894355569, + "grad_norm": 0.06718328595161438, + "learning_rate": 0.00016621004566210045, + "loss": 0.839, + "step": 964 + }, + { + "epoch": 0.5825535768185934, + "grad_norm": 0.09526249766349792, + "learning_rate": 0.00016616853466168535, + "loss": 0.8999, + "step": 965 + }, + { + "epoch": 0.5831572592816179, + "grad_norm": 0.0714835673570633, + "learning_rate": 0.00016612702366127025, + "loss": 0.7127, + "step": 966 + }, + { + "epoch": 0.5837609417446423, + "grad_norm": 0.07606557011604309, + "learning_rate": 0.00016608551266085513, + "loss": 0.8162, + "step": 967 + }, + { + "epoch": 0.5843646242076668, + "grad_norm": 0.07503029704093933, + "learning_rate": 0.00016604400166044003, + "loss": 0.9174, + "step": 968 + }, + { + "epoch": 0.5849683066706912, + "grad_norm": 0.08567387610673904, + "learning_rate": 0.0001660024906600249, + "loss": 0.9408, + "step": 969 + }, + { + "epoch": 0.5855719891337157, + "grad_norm": 0.08126292377710342, + "learning_rate": 0.0001659609796596098, + "loss": 0.9288, + "step": 970 + }, + { + "epoch": 0.5861756715967401, + "grad_norm": 0.07483746111392975, + "learning_rate": 0.0001659194686591947, + "loss": 0.8598, + "step": 971 + }, + { + "epoch": 0.5867793540597646, + "grad_norm": 0.07251130044460297, + "learning_rate": 0.00016587795765877957, + "loss": 0.9461, + "step": 972 + }, + { + "epoch": 0.587383036522789, + "grad_norm": 0.08908132463693619, + "learning_rate": 0.00016583644665836447, + "loss": 0.9473, + "step": 973 + }, + { + "epoch": 0.5879867189858135, + "grad_norm": 0.08767648786306381, + "learning_rate": 0.00016579493565794937, + "loss": 0.757, + "step": 974 + }, + { + "epoch": 0.5885904014488379, + "grad_norm": 0.07573854178190231, + "learning_rate": 0.00016575342465753425, + "loss": 0.7972, + "step": 975 + }, + { + "epoch": 0.5891940839118623, + "grad_norm": 0.074210524559021, + "learning_rate": 0.00016571191365711915, + "loss": 0.8437, + "step": 976 + }, + { + "epoch": 0.5897977663748868, + "grad_norm": 0.08098217099905014, + "learning_rate": 0.00016567040265670405, + "loss": 0.9021, + "step": 977 + }, + { + "epoch": 0.5904014488379112, + "grad_norm": 0.07945941388607025, + "learning_rate": 0.00016562889165628892, + "loss": 0.8371, + "step": 978 + }, + { + "epoch": 0.5910051313009357, + "grad_norm": 0.10039619356393814, + "learning_rate": 0.00016558738065587382, + "loss": 0.7602, + "step": 979 + }, + { + "epoch": 0.5916088137639601, + "grad_norm": 0.08295184373855591, + "learning_rate": 0.00016554586965545872, + "loss": 0.761, + "step": 980 + }, + { + "epoch": 0.5922124962269846, + "grad_norm": 0.08036410063505173, + "learning_rate": 0.0001655043586550436, + "loss": 0.9951, + "step": 981 + }, + { + "epoch": 0.592816178690009, + "grad_norm": 0.07915826141834259, + "learning_rate": 0.0001654628476546285, + "loss": 0.82, + "step": 982 + }, + { + "epoch": 0.5934198611530335, + "grad_norm": 0.07575780898332596, + "learning_rate": 0.00016542133665421336, + "loss": 0.7732, + "step": 983 + }, + { + "epoch": 0.5940235436160579, + "grad_norm": 0.08057866990566254, + "learning_rate": 0.00016537982565379826, + "loss": 0.7388, + "step": 984 + }, + { + "epoch": 0.5946272260790824, + "grad_norm": 0.07897903025150299, + "learning_rate": 0.00016533831465338316, + "loss": 0.733, + "step": 985 + }, + { + "epoch": 0.5952309085421068, + "grad_norm": 0.1108403429389, + "learning_rate": 0.00016529680365296804, + "loss": 0.7831, + "step": 986 + }, + { + "epoch": 0.5958345910051313, + "grad_norm": 0.08219526708126068, + "learning_rate": 0.00016525529265255294, + "loss": 0.754, + "step": 987 + }, + { + "epoch": 0.5964382734681557, + "grad_norm": 0.09057065844535828, + "learning_rate": 0.00016521378165213784, + "loss": 0.7478, + "step": 988 + }, + { + "epoch": 0.5970419559311801, + "grad_norm": 0.09317967295646667, + "learning_rate": 0.0001651722706517227, + "loss": 0.7583, + "step": 989 + }, + { + "epoch": 0.5976456383942047, + "grad_norm": 0.09358835965394974, + "learning_rate": 0.0001651307596513076, + "loss": 0.6958, + "step": 990 + }, + { + "epoch": 0.5982493208572291, + "grad_norm": 0.100360207259655, + "learning_rate": 0.0001650892486508925, + "loss": 0.7544, + "step": 991 + }, + { + "epoch": 0.5988530033202536, + "grad_norm": 0.09013186395168304, + "learning_rate": 0.00016504773765047738, + "loss": 0.7112, + "step": 992 + }, + { + "epoch": 0.599456685783278, + "grad_norm": 0.1008266881108284, + "learning_rate": 0.00016500622665006226, + "loss": 0.767, + "step": 993 + }, + { + "epoch": 0.6000603682463025, + "grad_norm": 0.09665394574403763, + "learning_rate": 0.00016496471564964718, + "loss": 0.6782, + "step": 994 + }, + { + "epoch": 0.6006640507093269, + "grad_norm": 0.10552255064249039, + "learning_rate": 0.00016492320464923206, + "loss": 0.6667, + "step": 995 + }, + { + "epoch": 0.6012677331723514, + "grad_norm": 0.11099070310592651, + "learning_rate": 0.00016488169364881693, + "loss": 0.7023, + "step": 996 + }, + { + "epoch": 0.6018714156353758, + "grad_norm": 0.10965953767299652, + "learning_rate": 0.00016484018264840183, + "loss": 0.6285, + "step": 997 + }, + { + "epoch": 0.6024750980984003, + "grad_norm": 0.11270109564065933, + "learning_rate": 0.00016479867164798673, + "loss": 0.612, + "step": 998 + }, + { + "epoch": 0.6030787805614247, + "grad_norm": 0.11348733305931091, + "learning_rate": 0.0001647571606475716, + "loss": 0.5591, + "step": 999 + }, + { + "epoch": 0.6036824630244492, + "grad_norm": 0.1182008683681488, + "learning_rate": 0.0001647156496471565, + "loss": 0.4379, + "step": 1000 + }, + { + "epoch": 0.6036824630244492, + "eval_loss": 0.8275489211082458, + "eval_runtime": 1218.7348, + "eval_samples_per_second": 2.289, + "eval_steps_per_second": 0.286, + "step": 1000 + }, + { + "epoch": 0.6042861454874736, + "grad_norm": 0.0789058580994606, + "learning_rate": 0.0001646741386467414, + "loss": 0.7819, + "step": 1001 + }, + { + "epoch": 0.6048898279504981, + "grad_norm": 0.0920637771487236, + "learning_rate": 0.00016463262764632627, + "loss": 1.4056, + "step": 1002 + }, + { + "epoch": 0.6054935104135225, + "grad_norm": 0.07369931042194366, + "learning_rate": 0.00016459111664591118, + "loss": 0.9731, + "step": 1003 + }, + { + "epoch": 0.606097192876547, + "grad_norm": 0.08552141487598419, + "learning_rate": 0.00016454960564549608, + "loss": 0.7628, + "step": 1004 + }, + { + "epoch": 0.6067008753395714, + "grad_norm": 0.07296184450387955, + "learning_rate": 0.00016450809464508095, + "loss": 0.7961, + "step": 1005 + }, + { + "epoch": 0.6073045578025958, + "grad_norm": 0.08106445521116257, + "learning_rate": 0.00016446658364466585, + "loss": 0.9692, + "step": 1006 + }, + { + "epoch": 0.6079082402656203, + "grad_norm": 0.08443611115217209, + "learning_rate": 0.00016442507264425072, + "loss": 1.3388, + "step": 1007 + }, + { + "epoch": 0.6085119227286447, + "grad_norm": 0.08953455090522766, + "learning_rate": 0.00016438356164383562, + "loss": 0.9006, + "step": 1008 + }, + { + "epoch": 0.6091156051916692, + "grad_norm": 0.09113611280918121, + "learning_rate": 0.00016434205064342052, + "loss": 0.9404, + "step": 1009 + }, + { + "epoch": 0.6097192876546936, + "grad_norm": 0.08092870563268661, + "learning_rate": 0.0001643005396430054, + "loss": 0.8747, + "step": 1010 + }, + { + "epoch": 0.6103229701177181, + "grad_norm": 0.07962828874588013, + "learning_rate": 0.0001642590286425903, + "loss": 1.0154, + "step": 1011 + }, + { + "epoch": 0.6109266525807425, + "grad_norm": 0.07279300689697266, + "learning_rate": 0.0001642175176421752, + "loss": 0.8086, + "step": 1012 + }, + { + "epoch": 0.611530335043767, + "grad_norm": 0.08411548286676407, + "learning_rate": 0.00016417600664176007, + "loss": 0.8582, + "step": 1013 + }, + { + "epoch": 0.6121340175067914, + "grad_norm": 0.0790749341249466, + "learning_rate": 0.00016413449564134497, + "loss": 0.7265, + "step": 1014 + }, + { + "epoch": 0.6127376999698159, + "grad_norm": 0.0777488648891449, + "learning_rate": 0.00016409298464092987, + "loss": 0.8086, + "step": 1015 + }, + { + "epoch": 0.6133413824328403, + "grad_norm": 0.09185469895601273, + "learning_rate": 0.00016405147364051474, + "loss": 0.8752, + "step": 1016 + }, + { + "epoch": 0.6139450648958648, + "grad_norm": 0.09490189701318741, + "learning_rate": 0.0001640099626400996, + "loss": 0.9276, + "step": 1017 + }, + { + "epoch": 0.6145487473588892, + "grad_norm": 0.10677061229944229, + "learning_rate": 0.00016396845163968454, + "loss": 1.1395, + "step": 1018 + }, + { + "epoch": 0.6151524298219136, + "grad_norm": 0.08552467823028564, + "learning_rate": 0.0001639269406392694, + "loss": 1.0333, + "step": 1019 + }, + { + "epoch": 0.6157561122849381, + "grad_norm": 0.10448099672794342, + "learning_rate": 0.0001638854296388543, + "loss": 1.003, + "step": 1020 + }, + { + "epoch": 0.6163597947479625, + "grad_norm": 0.08592557907104492, + "learning_rate": 0.00016384391863843919, + "loss": 0.7925, + "step": 1021 + }, + { + "epoch": 0.616963477210987, + "grad_norm": 0.082502081990242, + "learning_rate": 0.00016380240763802409, + "loss": 0.8321, + "step": 1022 + }, + { + "epoch": 0.6175671596740114, + "grad_norm": 0.08332712948322296, + "learning_rate": 0.00016376089663760899, + "loss": 0.9372, + "step": 1023 + }, + { + "epoch": 0.6181708421370359, + "grad_norm": 0.09017034620046616, + "learning_rate": 0.00016371938563719386, + "loss": 0.8795, + "step": 1024 + }, + { + "epoch": 0.6187745246000603, + "grad_norm": 0.08691825717687607, + "learning_rate": 0.00016367787463677876, + "loss": 0.7948, + "step": 1025 + }, + { + "epoch": 0.6193782070630848, + "grad_norm": 0.07839145511388779, + "learning_rate": 0.00016363636363636366, + "loss": 0.7879, + "step": 1026 + }, + { + "epoch": 0.6199818895261092, + "grad_norm": 0.09319109469652176, + "learning_rate": 0.00016359485263594853, + "loss": 0.8614, + "step": 1027 + }, + { + "epoch": 0.6205855719891337, + "grad_norm": 0.0755649283528328, + "learning_rate": 0.00016355334163553343, + "loss": 0.7723, + "step": 1028 + }, + { + "epoch": 0.6211892544521581, + "grad_norm": 0.08007936924695969, + "learning_rate": 0.00016351183063511833, + "loss": 0.7863, + "step": 1029 + }, + { + "epoch": 0.6217929369151826, + "grad_norm": 0.11180046945810318, + "learning_rate": 0.0001634703196347032, + "loss": 1.0614, + "step": 1030 + }, + { + "epoch": 0.6223966193782071, + "grad_norm": 0.09498503804206848, + "learning_rate": 0.00016342880863428808, + "loss": 0.8302, + "step": 1031 + }, + { + "epoch": 0.6230003018412316, + "grad_norm": 0.08199071139097214, + "learning_rate": 0.000163387297633873, + "loss": 1.0062, + "step": 1032 + }, + { + "epoch": 0.623603984304256, + "grad_norm": 0.08216526359319687, + "learning_rate": 0.00016334578663345788, + "loss": 0.6999, + "step": 1033 + }, + { + "epoch": 0.6242076667672805, + "grad_norm": 0.07529989629983902, + "learning_rate": 0.00016330427563304275, + "loss": 0.8036, + "step": 1034 + }, + { + "epoch": 0.6248113492303049, + "grad_norm": 0.08288833498954773, + "learning_rate": 0.00016326276463262765, + "loss": 0.7338, + "step": 1035 + }, + { + "epoch": 0.6254150316933293, + "grad_norm": 0.08039028942584991, + "learning_rate": 0.00016322125363221255, + "loss": 0.7554, + "step": 1036 + }, + { + "epoch": 0.6260187141563538, + "grad_norm": 0.08144210278987885, + "learning_rate": 0.00016317974263179742, + "loss": 0.7421, + "step": 1037 + }, + { + "epoch": 0.6266223966193782, + "grad_norm": 0.08654642850160599, + "learning_rate": 0.00016313823163138232, + "loss": 0.7317, + "step": 1038 + }, + { + "epoch": 0.6272260790824027, + "grad_norm": 0.08834250271320343, + "learning_rate": 0.00016309672063096722, + "loss": 0.7208, + "step": 1039 + }, + { + "epoch": 0.6278297615454271, + "grad_norm": 0.08896996825933456, + "learning_rate": 0.0001630552096305521, + "loss": 0.7637, + "step": 1040 + }, + { + "epoch": 0.6284334440084516, + "grad_norm": 0.09299125522375107, + "learning_rate": 0.000163013698630137, + "loss": 0.6993, + "step": 1041 + }, + { + "epoch": 0.629037126471476, + "grad_norm": 0.0911908969283104, + "learning_rate": 0.0001629721876297219, + "loss": 0.7455, + "step": 1042 + }, + { + "epoch": 0.6296408089345005, + "grad_norm": 0.0976100042462349, + "learning_rate": 0.00016293067662930677, + "loss": 0.6683, + "step": 1043 + }, + { + "epoch": 0.6302444913975249, + "grad_norm": 0.10183601081371307, + "learning_rate": 0.00016288916562889167, + "loss": 0.7513, + "step": 1044 + }, + { + "epoch": 0.6308481738605494, + "grad_norm": 0.10491038113832474, + "learning_rate": 0.00016284765462847654, + "loss": 0.7886, + "step": 1045 + }, + { + "epoch": 0.6314518563235738, + "grad_norm": 0.1056092381477356, + "learning_rate": 0.00016280614362806144, + "loss": 0.6803, + "step": 1046 + }, + { + "epoch": 0.6320555387865983, + "grad_norm": 0.1121145635843277, + "learning_rate": 0.00016276463262764634, + "loss": 0.6256, + "step": 1047 + }, + { + "epoch": 0.6326592212496227, + "grad_norm": 0.11745337396860123, + "learning_rate": 0.00016272312162723122, + "loss": 0.6546, + "step": 1048 + }, + { + "epoch": 0.6332629037126472, + "grad_norm": 0.12515030801296234, + "learning_rate": 0.00016268161062681612, + "loss": 0.5382, + "step": 1049 + }, + { + "epoch": 0.6338665861756716, + "grad_norm": 0.13032299280166626, + "learning_rate": 0.00016264009962640102, + "loss": 0.4321, + "step": 1050 + }, + { + "epoch": 0.634470268638696, + "grad_norm": 0.08480330556631088, + "learning_rate": 0.0001625985886259859, + "loss": 0.8055, + "step": 1051 + }, + { + "epoch": 0.6350739511017205, + "grad_norm": 0.08330792188644409, + "learning_rate": 0.00016255707762557076, + "loss": 1.1714, + "step": 1052 + }, + { + "epoch": 0.6356776335647449, + "grad_norm": 0.09432794153690338, + "learning_rate": 0.0001625155666251557, + "loss": 1.0071, + "step": 1053 + }, + { + "epoch": 0.6362813160277694, + "grad_norm": 0.084816575050354, + "learning_rate": 0.00016247405562474056, + "loss": 0.7843, + "step": 1054 + }, + { + "epoch": 0.6368849984907938, + "grad_norm": 0.09600014239549637, + "learning_rate": 0.00016243254462432543, + "loss": 0.8295, + "step": 1055 + }, + { + "epoch": 0.6374886809538183, + "grad_norm": 0.10093321651220322, + "learning_rate": 0.00016239103362391036, + "loss": 0.8685, + "step": 1056 + }, + { + "epoch": 0.6380923634168427, + "grad_norm": 0.07668054848909378, + "learning_rate": 0.00016234952262349523, + "loss": 1.042, + "step": 1057 + }, + { + "epoch": 0.6386960458798672, + "grad_norm": 0.12126658856868744, + "learning_rate": 0.0001623080116230801, + "loss": 0.8384, + "step": 1058 + }, + { + "epoch": 0.6392997283428916, + "grad_norm": 0.07691983133554459, + "learning_rate": 0.000162266500622665, + "loss": 0.9138, + "step": 1059 + }, + { + "epoch": 0.6399034108059161, + "grad_norm": 0.07935553044080734, + "learning_rate": 0.0001622249896222499, + "loss": 0.7028, + "step": 1060 + }, + { + "epoch": 0.6405070932689405, + "grad_norm": 0.0814066082239151, + "learning_rate": 0.00016218347862183478, + "loss": 0.8901, + "step": 1061 + }, + { + "epoch": 0.641110775731965, + "grad_norm": 0.09054214507341385, + "learning_rate": 0.00016214196762141968, + "loss": 0.9142, + "step": 1062 + }, + { + "epoch": 0.6417144581949894, + "grad_norm": 0.0810336098074913, + "learning_rate": 0.00016210045662100458, + "loss": 0.7975, + "step": 1063 + }, + { + "epoch": 0.6423181406580138, + "grad_norm": 0.07674489170312881, + "learning_rate": 0.00016205894562058945, + "loss": 0.9827, + "step": 1064 + }, + { + "epoch": 0.6429218231210383, + "grad_norm": 0.07564074546098709, + "learning_rate": 0.00016201743462017435, + "loss": 0.9164, + "step": 1065 + }, + { + "epoch": 0.6435255055840627, + "grad_norm": 0.082376629114151, + "learning_rate": 0.00016197592361975923, + "loss": 1.0141, + "step": 1066 + }, + { + "epoch": 0.6441291880470872, + "grad_norm": 0.09569506347179413, + "learning_rate": 0.00016193441261934415, + "loss": 1.031, + "step": 1067 + }, + { + "epoch": 0.6447328705101116, + "grad_norm": 0.07831252366304398, + "learning_rate": 0.00016189290161892903, + "loss": 0.7855, + "step": 1068 + }, + { + "epoch": 0.6453365529731361, + "grad_norm": 0.12247852981090546, + "learning_rate": 0.0001618513906185139, + "loss": 0.9584, + "step": 1069 + }, + { + "epoch": 0.6459402354361605, + "grad_norm": 0.07706090807914734, + "learning_rate": 0.00016180987961809883, + "loss": 0.9001, + "step": 1070 + }, + { + "epoch": 0.6465439178991851, + "grad_norm": 0.07961270213127136, + "learning_rate": 0.0001617683686176837, + "loss": 0.725, + "step": 1071 + }, + { + "epoch": 0.6471476003622095, + "grad_norm": 0.08146402984857559, + "learning_rate": 0.00016172685761726857, + "loss": 0.8213, + "step": 1072 + }, + { + "epoch": 0.647751282825234, + "grad_norm": 0.1310458779335022, + "learning_rate": 0.00016168534661685347, + "loss": 0.8039, + "step": 1073 + }, + { + "epoch": 0.6483549652882584, + "grad_norm": 0.09053052216768265, + "learning_rate": 0.00016164383561643837, + "loss": 0.8154, + "step": 1074 + }, + { + "epoch": 0.6489586477512829, + "grad_norm": 0.07922950387001038, + "learning_rate": 0.00016160232461602325, + "loss": 0.8074, + "step": 1075 + }, + { + "epoch": 0.6495623302143073, + "grad_norm": 0.07535174489021301, + "learning_rate": 0.00016156081361560815, + "loss": 0.7056, + "step": 1076 + }, + { + "epoch": 0.6501660126773318, + "grad_norm": 0.08822023123502731, + "learning_rate": 0.00016151930261519305, + "loss": 0.8269, + "step": 1077 + }, + { + "epoch": 0.6507696951403562, + "grad_norm": 0.12088391184806824, + "learning_rate": 0.00016147779161477792, + "loss": 1.0016, + "step": 1078 + }, + { + "epoch": 0.6513733776033807, + "grad_norm": 0.08213537931442261, + "learning_rate": 0.00016143628061436282, + "loss": 0.7239, + "step": 1079 + }, + { + "epoch": 0.6519770600664051, + "grad_norm": 0.08252882957458496, + "learning_rate": 0.0001613947696139477, + "loss": 0.8806, + "step": 1080 + }, + { + "epoch": 0.6525807425294295, + "grad_norm": 0.06710948050022125, + "learning_rate": 0.0001613532586135326, + "loss": 0.9193, + "step": 1081 + }, + { + "epoch": 0.653184424992454, + "grad_norm": 0.14059540629386902, + "learning_rate": 0.0001613117476131175, + "loss": 0.8201, + "step": 1082 + }, + { + "epoch": 0.6537881074554784, + "grad_norm": 0.08148171007633209, + "learning_rate": 0.00016127023661270236, + "loss": 0.7983, + "step": 1083 + }, + { + "epoch": 0.6543917899185029, + "grad_norm": 0.07753555476665497, + "learning_rate": 0.00016122872561228726, + "loss": 0.6901, + "step": 1084 + }, + { + "epoch": 0.6549954723815273, + "grad_norm": 0.08410129696130753, + "learning_rate": 0.00016118721461187216, + "loss": 0.7755, + "step": 1085 + }, + { + "epoch": 0.6555991548445518, + "grad_norm": 0.08018611371517181, + "learning_rate": 0.00016114570361145704, + "loss": 0.7569, + "step": 1086 + }, + { + "epoch": 0.6562028373075762, + "grad_norm": 0.08290990442037582, + "learning_rate": 0.00016110419261104194, + "loss": 0.7694, + "step": 1087 + }, + { + "epoch": 0.6568065197706007, + "grad_norm": 0.08168758451938629, + "learning_rate": 0.00016106268161062684, + "loss": 0.7458, + "step": 1088 + }, + { + "epoch": 0.6574102022336251, + "grad_norm": 0.09103231132030487, + "learning_rate": 0.0001610211706102117, + "loss": 0.7203, + "step": 1089 + }, + { + "epoch": 0.6580138846966496, + "grad_norm": 0.0930967926979065, + "learning_rate": 0.00016097965960979658, + "loss": 0.764, + "step": 1090 + }, + { + "epoch": 0.658617567159674, + "grad_norm": 0.0999932810664177, + "learning_rate": 0.0001609381486093815, + "loss": 0.7885, + "step": 1091 + }, + { + "epoch": 0.6592212496226985, + "grad_norm": 0.09345666319131851, + "learning_rate": 0.00016089663760896638, + "loss": 0.7567, + "step": 1092 + }, + { + "epoch": 0.6598249320857229, + "grad_norm": 0.09461873769760132, + "learning_rate": 0.00016085512660855126, + "loss": 0.6643, + "step": 1093 + }, + { + "epoch": 0.6604286145487474, + "grad_norm": 0.10697057098150253, + "learning_rate": 0.00016081361560813616, + "loss": 0.7219, + "step": 1094 + }, + { + "epoch": 0.6610322970117718, + "grad_norm": 0.11102855950593948, + "learning_rate": 0.00016077210460772106, + "loss": 0.6777, + "step": 1095 + }, + { + "epoch": 0.6616359794747962, + "grad_norm": 0.11301318556070328, + "learning_rate": 0.00016073059360730593, + "loss": 0.7065, + "step": 1096 + }, + { + "epoch": 0.6622396619378207, + "grad_norm": 0.11578387022018433, + "learning_rate": 0.00016068908260689083, + "loss": 0.731, + "step": 1097 + }, + { + "epoch": 0.6628433444008451, + "grad_norm": 0.11346578598022461, + "learning_rate": 0.00016064757160647573, + "loss": 0.579, + "step": 1098 + }, + { + "epoch": 0.6634470268638696, + "grad_norm": 0.1209607645869255, + "learning_rate": 0.0001606060606060606, + "loss": 0.5572, + "step": 1099 + }, + { + "epoch": 0.664050709326894, + "grad_norm": 0.11950193345546722, + "learning_rate": 0.0001605645496056455, + "loss": 0.3567, + "step": 1100 + }, + { + "epoch": 0.6646543917899185, + "grad_norm": 0.07838748395442963, + "learning_rate": 0.0001605230386052304, + "loss": 1.0775, + "step": 1101 + }, + { + "epoch": 0.6652580742529429, + "grad_norm": 0.1073046401143074, + "learning_rate": 0.00016048152760481528, + "loss": 0.693, + "step": 1102 + }, + { + "epoch": 0.6658617567159674, + "grad_norm": 0.0886329859495163, + "learning_rate": 0.00016044001660440018, + "loss": 0.7818, + "step": 1103 + }, + { + "epoch": 0.6664654391789918, + "grad_norm": 0.07882886379957199, + "learning_rate": 0.00016039850560398505, + "loss": 0.7407, + "step": 1104 + }, + { + "epoch": 0.6670691216420163, + "grad_norm": 0.09764014929533005, + "learning_rate": 0.00016035699460356995, + "loss": 0.8944, + "step": 1105 + }, + { + "epoch": 0.6676728041050407, + "grad_norm": 0.08643536269664764, + "learning_rate": 0.00016031548360315485, + "loss": 0.8511, + "step": 1106 + }, + { + "epoch": 0.6682764865680652, + "grad_norm": 0.08230648189783096, + "learning_rate": 0.00016027397260273972, + "loss": 0.9837, + "step": 1107 + }, + { + "epoch": 0.6688801690310896, + "grad_norm": 0.08523669838905334, + "learning_rate": 0.00016023246160232462, + "loss": 0.7902, + "step": 1108 + }, + { + "epoch": 0.669483851494114, + "grad_norm": 0.08347252011299133, + "learning_rate": 0.00016019095060190952, + "loss": 0.7791, + "step": 1109 + }, + { + "epoch": 0.6700875339571385, + "grad_norm": 0.08182717114686966, + "learning_rate": 0.0001601494396014944, + "loss": 0.8484, + "step": 1110 + }, + { + "epoch": 0.6706912164201629, + "grad_norm": 0.0879136249423027, + "learning_rate": 0.0001601079286010793, + "loss": 0.8469, + "step": 1111 + }, + { + "epoch": 0.6712948988831875, + "grad_norm": 0.08389199525117874, + "learning_rate": 0.0001600664176006642, + "loss": 0.9403, + "step": 1112 + }, + { + "epoch": 0.6718985813462119, + "grad_norm": 0.07789173722267151, + "learning_rate": 0.00016002490660024907, + "loss": 0.8443, + "step": 1113 + }, + { + "epoch": 0.6725022638092364, + "grad_norm": 0.07873908430337906, + "learning_rate": 0.00015998339559983397, + "loss": 0.8464, + "step": 1114 + }, + { + "epoch": 0.6731059462722608, + "grad_norm": 0.10451683402061462, + "learning_rate": 0.00015994188459941887, + "loss": 0.8507, + "step": 1115 + }, + { + "epoch": 0.6737096287352853, + "grad_norm": 0.10225295275449753, + "learning_rate": 0.00015990037359900374, + "loss": 1.1292, + "step": 1116 + }, + { + "epoch": 0.6743133111983097, + "grad_norm": 0.08053860068321228, + "learning_rate": 0.00015985886259858864, + "loss": 0.8511, + "step": 1117 + }, + { + "epoch": 0.6749169936613342, + "grad_norm": 0.15555590391159058, + "learning_rate": 0.00015981735159817351, + "loss": 0.6958, + "step": 1118 + }, + { + "epoch": 0.6755206761243586, + "grad_norm": 0.09447921067476273, + "learning_rate": 0.00015977584059775841, + "loss": 0.8228, + "step": 1119 + }, + { + "epoch": 0.6761243585873831, + "grad_norm": 0.11645516008138657, + "learning_rate": 0.00015973432959734331, + "loss": 0.8909, + "step": 1120 + }, + { + "epoch": 0.6767280410504075, + "grad_norm": 0.09540687501430511, + "learning_rate": 0.0001596928185969282, + "loss": 0.7483, + "step": 1121 + }, + { + "epoch": 0.677331723513432, + "grad_norm": 0.08288270980119705, + "learning_rate": 0.0001596513075965131, + "loss": 0.837, + "step": 1122 + }, + { + "epoch": 0.6779354059764564, + "grad_norm": 0.0898602157831192, + "learning_rate": 0.000159609796596098, + "loss": 0.7893, + "step": 1123 + }, + { + "epoch": 0.6785390884394809, + "grad_norm": 0.08173252642154694, + "learning_rate": 0.00015956828559568286, + "loss": 0.7965, + "step": 1124 + }, + { + "epoch": 0.6791427709025053, + "grad_norm": 0.08245470374822617, + "learning_rate": 0.00015952677459526776, + "loss": 0.9573, + "step": 1125 + }, + { + "epoch": 0.6797464533655297, + "grad_norm": 0.07909370958805084, + "learning_rate": 0.00015948526359485266, + "loss": 0.7852, + "step": 1126 + }, + { + "epoch": 0.6803501358285542, + "grad_norm": 0.07963291555643082, + "learning_rate": 0.00015944375259443753, + "loss": 0.8806, + "step": 1127 + }, + { + "epoch": 0.6809538182915786, + "grad_norm": 0.07449831813573837, + "learning_rate": 0.0001594022415940224, + "loss": 0.8094, + "step": 1128 + }, + { + "epoch": 0.6815575007546031, + "grad_norm": 0.07845521718263626, + "learning_rate": 0.00015936073059360733, + "loss": 0.8813, + "step": 1129 + }, + { + "epoch": 0.6821611832176275, + "grad_norm": 0.0760309100151062, + "learning_rate": 0.0001593192195931922, + "loss": 0.9867, + "step": 1130 + }, + { + "epoch": 0.682764865680652, + "grad_norm": 0.07355882227420807, + "learning_rate": 0.00015927770859277708, + "loss": 0.8448, + "step": 1131 + }, + { + "epoch": 0.6833685481436764, + "grad_norm": 0.07743940502405167, + "learning_rate": 0.00015923619759236198, + "loss": 1.0274, + "step": 1132 + }, + { + "epoch": 0.6839722306067009, + "grad_norm": 0.08236207067966461, + "learning_rate": 0.00015919468659194688, + "loss": 0.7691, + "step": 1133 + }, + { + "epoch": 0.6845759130697253, + "grad_norm": 0.07984744012355804, + "learning_rate": 0.00015915317559153175, + "loss": 0.7605, + "step": 1134 + }, + { + "epoch": 0.6851795955327498, + "grad_norm": 0.08415329456329346, + "learning_rate": 0.00015911166459111665, + "loss": 0.7576, + "step": 1135 + }, + { + "epoch": 0.6857832779957742, + "grad_norm": 0.08308225870132446, + "learning_rate": 0.00015907015359070155, + "loss": 0.6959, + "step": 1136 + }, + { + "epoch": 0.6863869604587987, + "grad_norm": 0.08338841050863266, + "learning_rate": 0.00015902864259028642, + "loss": 0.6998, + "step": 1137 + }, + { + "epoch": 0.6869906429218231, + "grad_norm": 0.08652734011411667, + "learning_rate": 0.00015898713158987132, + "loss": 0.7469, + "step": 1138 + }, + { + "epoch": 0.6875943253848475, + "grad_norm": 0.11346086114645004, + "learning_rate": 0.00015894562058945622, + "loss": 0.6995, + "step": 1139 + }, + { + "epoch": 0.688198007847872, + "grad_norm": 0.08581293374300003, + "learning_rate": 0.0001589041095890411, + "loss": 0.7138, + "step": 1140 + }, + { + "epoch": 0.6888016903108964, + "grad_norm": 0.0915643498301506, + "learning_rate": 0.000158862598588626, + "loss": 0.7836, + "step": 1141 + }, + { + "epoch": 0.6894053727739209, + "grad_norm": 0.09856559336185455, + "learning_rate": 0.00015882108758821087, + "loss": 0.7151, + "step": 1142 + }, + { + "epoch": 0.6900090552369453, + "grad_norm": 0.09949114173650742, + "learning_rate": 0.00015877957658779577, + "loss": 0.7292, + "step": 1143 + }, + { + "epoch": 0.6906127376999698, + "grad_norm": 0.10931304842233658, + "learning_rate": 0.00015873806558738067, + "loss": 0.777, + "step": 1144 + }, + { + "epoch": 0.6912164201629942, + "grad_norm": 0.10665369033813477, + "learning_rate": 0.00015869655458696554, + "loss": 0.6729, + "step": 1145 + }, + { + "epoch": 0.6918201026260187, + "grad_norm": 0.1097274124622345, + "learning_rate": 0.00015865504358655044, + "loss": 0.6532, + "step": 1146 + }, + { + "epoch": 0.6924237850890431, + "grad_norm": 0.12797416746616364, + "learning_rate": 0.00015861353258613534, + "loss": 0.6375, + "step": 1147 + }, + { + "epoch": 0.6930274675520676, + "grad_norm": 0.11834930628538132, + "learning_rate": 0.00015857202158572022, + "loss": 0.6374, + "step": 1148 + }, + { + "epoch": 0.693631150015092, + "grad_norm": 0.11373134702444077, + "learning_rate": 0.00015853051058530512, + "loss": 0.5455, + "step": 1149 + }, + { + "epoch": 0.6942348324781165, + "grad_norm": 0.13020096719264984, + "learning_rate": 0.00015848899958489002, + "loss": 0.4624, + "step": 1150 + }, + { + "epoch": 0.6948385149411409, + "grad_norm": 0.09528189897537231, + "learning_rate": 0.0001584474885844749, + "loss": 0.8045, + "step": 1151 + }, + { + "epoch": 0.6954421974041655, + "grad_norm": 0.09046769887208939, + "learning_rate": 0.00015840597758405976, + "loss": 0.7992, + "step": 1152 + }, + { + "epoch": 0.6960458798671899, + "grad_norm": 0.10954567044973373, + "learning_rate": 0.0001583644665836447, + "loss": 0.7242, + "step": 1153 + }, + { + "epoch": 0.6966495623302144, + "grad_norm": 0.08049467206001282, + "learning_rate": 0.00015832295558322956, + "loss": 0.9732, + "step": 1154 + }, + { + "epoch": 0.6972532447932388, + "grad_norm": 0.08102589845657349, + "learning_rate": 0.00015828144458281446, + "loss": 1.0654, + "step": 1155 + }, + { + "epoch": 0.6978569272562632, + "grad_norm": 0.07799682021141052, + "learning_rate": 0.00015823993358239934, + "loss": 0.879, + "step": 1156 + }, + { + "epoch": 0.6984606097192877, + "grad_norm": 0.08080107718706131, + "learning_rate": 0.00015819842258198424, + "loss": 0.8515, + "step": 1157 + }, + { + "epoch": 0.6990642921823121, + "grad_norm": 0.07894068211317062, + "learning_rate": 0.00015815691158156914, + "loss": 0.8024, + "step": 1158 + }, + { + "epoch": 0.6996679746453366, + "grad_norm": 0.0848923772573471, + "learning_rate": 0.000158115400581154, + "loss": 0.7698, + "step": 1159 + }, + { + "epoch": 0.700271657108361, + "grad_norm": 0.08654095232486725, + "learning_rate": 0.0001580738895807389, + "loss": 0.8809, + "step": 1160 + }, + { + "epoch": 0.7008753395713855, + "grad_norm": 0.07776283472776413, + "learning_rate": 0.0001580323785803238, + "loss": 0.7659, + "step": 1161 + }, + { + "epoch": 0.7014790220344099, + "grad_norm": 0.09321299195289612, + "learning_rate": 0.00015799086757990868, + "loss": 0.9147, + "step": 1162 + }, + { + "epoch": 0.7020827044974344, + "grad_norm": 0.10445485264062881, + "learning_rate": 0.00015794935657949358, + "loss": 0.7182, + "step": 1163 + }, + { + "epoch": 0.7026863869604588, + "grad_norm": 0.08144387602806091, + "learning_rate": 0.00015790784557907848, + "loss": 0.9075, + "step": 1164 + }, + { + "epoch": 0.7032900694234833, + "grad_norm": 0.08363913744688034, + "learning_rate": 0.00015786633457866335, + "loss": 0.7935, + "step": 1165 + }, + { + "epoch": 0.7038937518865077, + "grad_norm": 0.07295918464660645, + "learning_rate": 0.00015782482357824823, + "loss": 0.99, + "step": 1166 + }, + { + "epoch": 0.7044974343495322, + "grad_norm": 0.0843958631157875, + "learning_rate": 0.00015778331257783315, + "loss": 0.8588, + "step": 1167 + }, + { + "epoch": 0.7051011168125566, + "grad_norm": 0.08498518168926239, + "learning_rate": 0.00015774180157741803, + "loss": 0.7837, + "step": 1168 + }, + { + "epoch": 0.705704799275581, + "grad_norm": 0.08214520663022995, + "learning_rate": 0.0001577002905770029, + "loss": 0.8411, + "step": 1169 + }, + { + "epoch": 0.7063084817386055, + "grad_norm": 0.08001639693975449, + "learning_rate": 0.0001576587795765878, + "loss": 0.7665, + "step": 1170 + }, + { + "epoch": 0.70691216420163, + "grad_norm": 0.08271230757236481, + "learning_rate": 0.0001576172685761727, + "loss": 0.8341, + "step": 1171 + }, + { + "epoch": 0.7075158466646544, + "grad_norm": 0.08699033409357071, + "learning_rate": 0.00015757575757575757, + "loss": 0.7784, + "step": 1172 + }, + { + "epoch": 0.7081195291276788, + "grad_norm": 0.09423791617155075, + "learning_rate": 0.00015753424657534247, + "loss": 0.9262, + "step": 1173 + }, + { + "epoch": 0.7087232115907033, + "grad_norm": 0.07842440158128738, + "learning_rate": 0.00015749273557492737, + "loss": 0.9249, + "step": 1174 + }, + { + "epoch": 0.7093268940537277, + "grad_norm": 0.08183681219816208, + "learning_rate": 0.00015745122457451225, + "loss": 0.7941, + "step": 1175 + }, + { + "epoch": 0.7099305765167522, + "grad_norm": 0.08315775543451309, + "learning_rate": 0.00015740971357409715, + "loss": 0.8225, + "step": 1176 + }, + { + "epoch": 0.7105342589797766, + "grad_norm": 0.08027949929237366, + "learning_rate": 0.00015736820257368205, + "loss": 0.8441, + "step": 1177 + }, + { + "epoch": 0.7111379414428011, + "grad_norm": 0.08332298696041107, + "learning_rate": 0.00015732669157326692, + "loss": 0.9679, + "step": 1178 + }, + { + "epoch": 0.7117416239058255, + "grad_norm": 0.09133647382259369, + "learning_rate": 0.00015728518057285182, + "loss": 0.9468, + "step": 1179 + }, + { + "epoch": 0.71234530636885, + "grad_norm": 0.08058296889066696, + "learning_rate": 0.0001572436695724367, + "loss": 0.8374, + "step": 1180 + }, + { + "epoch": 0.7129489888318744, + "grad_norm": 0.08813148736953735, + "learning_rate": 0.0001572021585720216, + "loss": 0.9568, + "step": 1181 + }, + { + "epoch": 0.7135526712948989, + "grad_norm": 0.07533196359872818, + "learning_rate": 0.0001571606475716065, + "loss": 0.8627, + "step": 1182 + }, + { + "epoch": 0.7141563537579233, + "grad_norm": 0.08658763766288757, + "learning_rate": 0.00015711913657119137, + "loss": 0.7429, + "step": 1183 + }, + { + "epoch": 0.7147600362209477, + "grad_norm": 0.0844459980726242, + "learning_rate": 0.00015707762557077627, + "loss": 0.7966, + "step": 1184 + }, + { + "epoch": 0.7153637186839722, + "grad_norm": 0.08175187557935715, + "learning_rate": 0.00015703611457036117, + "loss": 0.7226, + "step": 1185 + }, + { + "epoch": 0.7159674011469966, + "grad_norm": 0.08582815527915955, + "learning_rate": 0.00015699460356994604, + "loss": 0.717, + "step": 1186 + }, + { + "epoch": 0.7165710836100211, + "grad_norm": 0.08503274619579315, + "learning_rate": 0.0001569530925695309, + "loss": 0.7526, + "step": 1187 + }, + { + "epoch": 0.7171747660730455, + "grad_norm": 0.08774889260530472, + "learning_rate": 0.00015691158156911584, + "loss": 0.7016, + "step": 1188 + }, + { + "epoch": 0.71777844853607, + "grad_norm": 0.10124623775482178, + "learning_rate": 0.0001568700705687007, + "loss": 0.6455, + "step": 1189 + }, + { + "epoch": 0.7183821309990944, + "grad_norm": 0.10203047096729279, + "learning_rate": 0.00015682855956828558, + "loss": 0.6779, + "step": 1190 + }, + { + "epoch": 0.7189858134621189, + "grad_norm": 0.09329473972320557, + "learning_rate": 0.0001567870485678705, + "loss": 0.7313, + "step": 1191 + }, + { + "epoch": 0.7195894959251434, + "grad_norm": 0.09363296627998352, + "learning_rate": 0.00015674553756745538, + "loss": 0.6968, + "step": 1192 + }, + { + "epoch": 0.7201931783881679, + "grad_norm": 0.10082225501537323, + "learning_rate": 0.00015670402656704026, + "loss": 0.737, + "step": 1193 + }, + { + "epoch": 0.7207968608511923, + "grad_norm": 0.10495683550834656, + "learning_rate": 0.00015666251556662516, + "loss": 0.7186, + "step": 1194 + }, + { + "epoch": 0.7214005433142168, + "grad_norm": 0.10491074621677399, + "learning_rate": 0.00015662100456621006, + "loss": 0.6949, + "step": 1195 + }, + { + "epoch": 0.7220042257772412, + "grad_norm": 0.12182420492172241, + "learning_rate": 0.00015657949356579493, + "loss": 0.667, + "step": 1196 + }, + { + "epoch": 0.7226079082402657, + "grad_norm": 0.12086670100688934, + "learning_rate": 0.00015653798256537983, + "loss": 0.5885, + "step": 1197 + }, + { + "epoch": 0.7232115907032901, + "grad_norm": 0.11433517187833786, + "learning_rate": 0.00015649647156496473, + "loss": 0.6147, + "step": 1198 + }, + { + "epoch": 0.7238152731663146, + "grad_norm": 0.12221046537160873, + "learning_rate": 0.00015645496056454963, + "loss": 0.5129, + "step": 1199 + }, + { + "epoch": 0.724418955629339, + "grad_norm": 0.12147875875234604, + "learning_rate": 0.0001564134495641345, + "loss": 0.4205, + "step": 1200 + }, + { + "epoch": 0.7250226380923634, + "grad_norm": 0.08808968216180801, + "learning_rate": 0.00015637193856371938, + "loss": 0.8195, + "step": 1201 + }, + { + "epoch": 0.7256263205553879, + "grad_norm": 0.08320089429616928, + "learning_rate": 0.0001563304275633043, + "loss": 0.8505, + "step": 1202 + }, + { + "epoch": 0.7262300030184123, + "grad_norm": 0.0852508470416069, + "learning_rate": 0.00015628891656288918, + "loss": 1.0964, + "step": 1203 + }, + { + "epoch": 0.7268336854814368, + "grad_norm": 0.09474988281726837, + "learning_rate": 0.00015624740556247405, + "loss": 0.8924, + "step": 1204 + }, + { + "epoch": 0.7274373679444612, + "grad_norm": 0.092880479991436, + "learning_rate": 0.00015620589456205898, + "loss": 0.8223, + "step": 1205 + }, + { + "epoch": 0.7280410504074857, + "grad_norm": 0.10932306200265884, + "learning_rate": 0.00015616438356164385, + "loss": 0.8786, + "step": 1206 + }, + { + "epoch": 0.7286447328705101, + "grad_norm": 0.0782761201262474, + "learning_rate": 0.00015612287256122872, + "loss": 0.7801, + "step": 1207 + }, + { + "epoch": 0.7292484153335346, + "grad_norm": 0.09566731005907059, + "learning_rate": 0.00015608136156081362, + "loss": 0.9474, + "step": 1208 + }, + { + "epoch": 0.729852097796559, + "grad_norm": 0.08776892721652985, + "learning_rate": 0.00015603985056039852, + "loss": 0.7888, + "step": 1209 + }, + { + "epoch": 0.7304557802595835, + "grad_norm": 0.07871197909116745, + "learning_rate": 0.0001559983395599834, + "loss": 0.7624, + "step": 1210 + }, + { + "epoch": 0.7310594627226079, + "grad_norm": 0.19957715272903442, + "learning_rate": 0.0001559568285595683, + "loss": 0.8708, + "step": 1211 + }, + { + "epoch": 0.7316631451856324, + "grad_norm": 0.0865522176027298, + "learning_rate": 0.0001559153175591532, + "loss": 0.9895, + "step": 1212 + }, + { + "epoch": 0.7322668276486568, + "grad_norm": 0.08585958927869797, + "learning_rate": 0.00015587380655873807, + "loss": 0.8575, + "step": 1213 + }, + { + "epoch": 0.7328705101116813, + "grad_norm": 0.07868848741054535, + "learning_rate": 0.00015583229555832297, + "loss": 0.768, + "step": 1214 + }, + { + "epoch": 0.7334741925747057, + "grad_norm": 0.08140038698911667, + "learning_rate": 0.00015579078455790784, + "loss": 0.7544, + "step": 1215 + }, + { + "epoch": 0.7340778750377301, + "grad_norm": 0.08099745959043503, + "learning_rate": 0.00015574927355749274, + "loss": 0.7234, + "step": 1216 + }, + { + "epoch": 0.7346815575007546, + "grad_norm": 0.08431069552898407, + "learning_rate": 0.00015570776255707764, + "loss": 0.8079, + "step": 1217 + }, + { + "epoch": 0.735285239963779, + "grad_norm": 0.08134233951568604, + "learning_rate": 0.00015566625155666251, + "loss": 1.0639, + "step": 1218 + }, + { + "epoch": 0.7358889224268035, + "grad_norm": 0.08087371289730072, + "learning_rate": 0.00015562474055624741, + "loss": 0.7534, + "step": 1219 + }, + { + "epoch": 0.7364926048898279, + "grad_norm": 0.08062789589166641, + "learning_rate": 0.00015558322955583231, + "loss": 0.8352, + "step": 1220 + }, + { + "epoch": 0.7370962873528524, + "grad_norm": 0.08649458736181259, + "learning_rate": 0.0001555417185554172, + "loss": 0.7991, + "step": 1221 + }, + { + "epoch": 0.7376999698158768, + "grad_norm": 0.0848165899515152, + "learning_rate": 0.0001555002075550021, + "loss": 0.8912, + "step": 1222 + }, + { + "epoch": 0.7383036522789013, + "grad_norm": 0.07689525932073593, + "learning_rate": 0.000155458696554587, + "loss": 0.8181, + "step": 1223 + }, + { + "epoch": 0.7389073347419257, + "grad_norm": 0.07876717299222946, + "learning_rate": 0.00015541718555417186, + "loss": 0.7244, + "step": 1224 + }, + { + "epoch": 0.7395110172049502, + "grad_norm": 0.07784243673086166, + "learning_rate": 0.00015537567455375673, + "loss": 1.539, + "step": 1225 + }, + { + "epoch": 0.7401146996679746, + "grad_norm": 0.08357193320989609, + "learning_rate": 0.00015533416355334166, + "loss": 0.8915, + "step": 1226 + }, + { + "epoch": 0.740718382130999, + "grad_norm": 0.07806842029094696, + "learning_rate": 0.00015529265255292653, + "loss": 0.801, + "step": 1227 + }, + { + "epoch": 0.7413220645940235, + "grad_norm": 0.08752121776342392, + "learning_rate": 0.0001552511415525114, + "loss": 0.7526, + "step": 1228 + }, + { + "epoch": 0.741925747057048, + "grad_norm": 0.0805983915925026, + "learning_rate": 0.0001552096305520963, + "loss": 0.9307, + "step": 1229 + }, + { + "epoch": 0.7425294295200724, + "grad_norm": 0.08306834846735, + "learning_rate": 0.0001551681195516812, + "loss": 0.7313, + "step": 1230 + }, + { + "epoch": 0.7431331119830968, + "grad_norm": 0.08837762475013733, + "learning_rate": 0.00015512660855126608, + "loss": 0.9053, + "step": 1231 + }, + { + "epoch": 0.7437367944461213, + "grad_norm": 0.07785294204950333, + "learning_rate": 0.00015508509755085098, + "loss": 0.7965, + "step": 1232 + }, + { + "epoch": 0.7443404769091458, + "grad_norm": 0.0776657834649086, + "learning_rate": 0.00015504358655043588, + "loss": 0.8275, + "step": 1233 + }, + { + "epoch": 0.7449441593721703, + "grad_norm": 0.08299911767244339, + "learning_rate": 0.00015500207555002075, + "loss": 0.7684, + "step": 1234 + }, + { + "epoch": 0.7455478418351947, + "grad_norm": 0.09403832256793976, + "learning_rate": 0.00015496056454960565, + "loss": 0.8317, + "step": 1235 + }, + { + "epoch": 0.7461515242982192, + "grad_norm": 0.08008915185928345, + "learning_rate": 0.00015491905354919055, + "loss": 0.6594, + "step": 1236 + }, + { + "epoch": 0.7467552067612436, + "grad_norm": 0.08133239299058914, + "learning_rate": 0.00015487754254877543, + "loss": 0.6942, + "step": 1237 + }, + { + "epoch": 0.7473588892242681, + "grad_norm": 0.08288057893514633, + "learning_rate": 0.00015483603154836033, + "loss": 0.7323, + "step": 1238 + }, + { + "epoch": 0.7479625716872925, + "grad_norm": 0.08818881958723068, + "learning_rate": 0.0001547945205479452, + "loss": 0.6966, + "step": 1239 + }, + { + "epoch": 0.748566254150317, + "grad_norm": 0.09948495030403137, + "learning_rate": 0.0001547530095475301, + "loss": 0.7126, + "step": 1240 + }, + { + "epoch": 0.7491699366133414, + "grad_norm": 0.12626221776008606, + "learning_rate": 0.000154711498547115, + "loss": 0.7647, + "step": 1241 + }, + { + "epoch": 0.7497736190763659, + "grad_norm": 0.10528393089771271, + "learning_rate": 0.00015466998754669987, + "loss": 0.8261, + "step": 1242 + }, + { + "epoch": 0.7503773015393903, + "grad_norm": 0.10515029728412628, + "learning_rate": 0.00015462847654628477, + "loss": 0.6662, + "step": 1243 + }, + { + "epoch": 0.7509809840024148, + "grad_norm": 0.10484447330236435, + "learning_rate": 0.00015458696554586967, + "loss": 0.7143, + "step": 1244 + }, + { + "epoch": 0.7515846664654392, + "grad_norm": 0.10932087153196335, + "learning_rate": 0.00015454545454545454, + "loss": 0.6561, + "step": 1245 + }, + { + "epoch": 0.7521883489284636, + "grad_norm": 0.10715696215629578, + "learning_rate": 0.00015450394354503944, + "loss": 0.6272, + "step": 1246 + }, + { + "epoch": 0.7527920313914881, + "grad_norm": 0.11597730964422226, + "learning_rate": 0.00015446243254462434, + "loss": 0.6093, + "step": 1247 + }, + { + "epoch": 0.7533957138545125, + "grad_norm": 0.11862548440694809, + "learning_rate": 0.00015442092154420922, + "loss": 0.6208, + "step": 1248 + }, + { + "epoch": 0.753999396317537, + "grad_norm": 0.12388130277395248, + "learning_rate": 0.00015437941054379412, + "loss": 0.5587, + "step": 1249 + }, + { + "epoch": 0.7546030787805614, + "grad_norm": 0.12081338465213776, + "learning_rate": 0.00015433789954337902, + "loss": 0.432, + "step": 1250 + }, + { + "epoch": 0.7552067612435859, + "grad_norm": 0.08639662712812424, + "learning_rate": 0.0001542963885429639, + "loss": 0.9072, + "step": 1251 + }, + { + "epoch": 0.7558104437066103, + "grad_norm": 0.0837317556142807, + "learning_rate": 0.0001542548775425488, + "loss": 1.1732, + "step": 1252 + }, + { + "epoch": 0.7564141261696348, + "grad_norm": 0.09014665335416794, + "learning_rate": 0.00015421336654213366, + "loss": 0.8012, + "step": 1253 + }, + { + "epoch": 0.7570178086326592, + "grad_norm": 0.09888771176338196, + "learning_rate": 0.00015417185554171856, + "loss": 0.8288, + "step": 1254 + }, + { + "epoch": 0.7576214910956837, + "grad_norm": 0.09830440580844879, + "learning_rate": 0.00015413034454130346, + "loss": 0.9115, + "step": 1255 + }, + { + "epoch": 0.7582251735587081, + "grad_norm": 0.08318249881267548, + "learning_rate": 0.00015408883354088834, + "loss": 0.8073, + "step": 1256 + }, + { + "epoch": 0.7588288560217326, + "grad_norm": 0.09623551368713379, + "learning_rate": 0.00015404732254047324, + "loss": 0.8798, + "step": 1257 + }, + { + "epoch": 0.759432538484757, + "grad_norm": 0.18773970007896423, + "learning_rate": 0.00015400581154005814, + "loss": 0.8237, + "step": 1258 + }, + { + "epoch": 0.7600362209477814, + "grad_norm": 0.07493336498737335, + "learning_rate": 0.000153964300539643, + "loss": 0.9224, + "step": 1259 + }, + { + "epoch": 0.7606399034108059, + "grad_norm": 0.08891238272190094, + "learning_rate": 0.0001539227895392279, + "loss": 0.8604, + "step": 1260 + }, + { + "epoch": 0.7612435858738303, + "grad_norm": 0.0810379758477211, + "learning_rate": 0.0001538812785388128, + "loss": 0.7966, + "step": 1261 + }, + { + "epoch": 0.7618472683368548, + "grad_norm": 0.08036971092224121, + "learning_rate": 0.00015383976753839768, + "loss": 0.7795, + "step": 1262 + }, + { + "epoch": 0.7624509507998792, + "grad_norm": 0.08294712007045746, + "learning_rate": 0.00015379825653798256, + "loss": 1.0023, + "step": 1263 + }, + { + "epoch": 0.7630546332629037, + "grad_norm": 0.0813121646642685, + "learning_rate": 0.00015375674553756748, + "loss": 1.0195, + "step": 1264 + }, + { + "epoch": 0.7636583157259281, + "grad_norm": 0.08313670009374619, + "learning_rate": 0.00015371523453715236, + "loss": 0.9885, + "step": 1265 + }, + { + "epoch": 0.7642619981889526, + "grad_norm": 0.07355551421642303, + "learning_rate": 0.00015367372353673723, + "loss": 0.8474, + "step": 1266 + }, + { + "epoch": 0.764865680651977, + "grad_norm": 0.13657139241695404, + "learning_rate": 0.00015363221253632213, + "loss": 0.9142, + "step": 1267 + }, + { + "epoch": 0.7654693631150015, + "grad_norm": 0.09950345009565353, + "learning_rate": 0.00015359070153590703, + "loss": 1.0635, + "step": 1268 + }, + { + "epoch": 0.7660730455780259, + "grad_norm": 0.07820272445678711, + "learning_rate": 0.0001535491905354919, + "loss": 1.1326, + "step": 1269 + }, + { + "epoch": 0.7666767280410504, + "grad_norm": 0.07532312721014023, + "learning_rate": 0.0001535076795350768, + "loss": 0.9895, + "step": 1270 + }, + { + "epoch": 0.7672804105040748, + "grad_norm": 0.09391959011554718, + "learning_rate": 0.0001534661685346617, + "loss": 0.7384, + "step": 1271 + }, + { + "epoch": 0.7678840929670993, + "grad_norm": 0.07820689678192139, + "learning_rate": 0.00015342465753424657, + "loss": 0.734, + "step": 1272 + }, + { + "epoch": 0.7684877754301238, + "grad_norm": 0.08966294676065445, + "learning_rate": 0.00015338314653383147, + "loss": 0.7925, + "step": 1273 + }, + { + "epoch": 0.7690914578931483, + "grad_norm": 0.08040351420640945, + "learning_rate": 0.00015334163553341637, + "loss": 0.7811, + "step": 1274 + }, + { + "epoch": 0.7696951403561727, + "grad_norm": 0.08416478335857391, + "learning_rate": 0.00015330012453300125, + "loss": 0.9939, + "step": 1275 + }, + { + "epoch": 0.7702988228191971, + "grad_norm": 0.08706527948379517, + "learning_rate": 0.00015325861353258615, + "loss": 0.8099, + "step": 1276 + }, + { + "epoch": 0.7709025052822216, + "grad_norm": 0.10363662242889404, + "learning_rate": 0.00015321710253217102, + "loss": 0.7835, + "step": 1277 + }, + { + "epoch": 0.771506187745246, + "grad_norm": 0.080405592918396, + "learning_rate": 0.00015317559153175592, + "loss": 0.7816, + "step": 1278 + }, + { + "epoch": 0.7721098702082705, + "grad_norm": 0.0954577773809433, + "learning_rate": 0.00015313408053134082, + "loss": 1.0492, + "step": 1279 + }, + { + "epoch": 0.7727135526712949, + "grad_norm": 0.08522694557905197, + "learning_rate": 0.0001530925695309257, + "loss": 0.7938, + "step": 1280 + }, + { + "epoch": 0.7733172351343194, + "grad_norm": 0.0870218351483345, + "learning_rate": 0.0001530510585305106, + "loss": 1.0509, + "step": 1281 + }, + { + "epoch": 0.7739209175973438, + "grad_norm": 0.07454058527946472, + "learning_rate": 0.0001530095475300955, + "loss": 0.8381, + "step": 1282 + }, + { + "epoch": 0.7745246000603683, + "grad_norm": 0.0701836496591568, + "learning_rate": 0.00015296803652968037, + "loss": 0.6133, + "step": 1283 + }, + { + "epoch": 0.7751282825233927, + "grad_norm": 0.08319991081953049, + "learning_rate": 0.00015292652552926524, + "loss": 0.7434, + "step": 1284 + }, + { + "epoch": 0.7757319649864172, + "grad_norm": 0.07945888489484787, + "learning_rate": 0.00015288501452885017, + "loss": 0.7266, + "step": 1285 + }, + { + "epoch": 0.7763356474494416, + "grad_norm": 0.0800202265381813, + "learning_rate": 0.00015284350352843504, + "loss": 0.7203, + "step": 1286 + }, + { + "epoch": 0.7769393299124661, + "grad_norm": 0.08976299315690994, + "learning_rate": 0.0001528019925280199, + "loss": 0.7075, + "step": 1287 + }, + { + "epoch": 0.7775430123754905, + "grad_norm": 0.08427184820175171, + "learning_rate": 0.00015276048152760484, + "loss": 0.6855, + "step": 1288 + }, + { + "epoch": 0.778146694838515, + "grad_norm": 0.09182146191596985, + "learning_rate": 0.0001527189705271897, + "loss": 0.7375, + "step": 1289 + }, + { + "epoch": 0.7787503773015394, + "grad_norm": 0.0932517945766449, + "learning_rate": 0.0001526774595267746, + "loss": 0.7168, + "step": 1290 + }, + { + "epoch": 0.7793540597645638, + "grad_norm": 0.11614131927490234, + "learning_rate": 0.00015263594852635949, + "loss": 0.7134, + "step": 1291 + }, + { + "epoch": 0.7799577422275883, + "grad_norm": 0.10372986644506454, + "learning_rate": 0.00015259443752594439, + "loss": 0.7539, + "step": 1292 + }, + { + "epoch": 0.7805614246906127, + "grad_norm": 0.10486335307359695, + "learning_rate": 0.00015255292652552929, + "loss": 0.7198, + "step": 1293 + }, + { + "epoch": 0.7811651071536372, + "grad_norm": 0.11475211381912231, + "learning_rate": 0.00015251141552511416, + "loss": 0.7086, + "step": 1294 + }, + { + "epoch": 0.7817687896166616, + "grad_norm": 0.10555735975503922, + "learning_rate": 0.00015246990452469906, + "loss": 0.6697, + "step": 1295 + }, + { + "epoch": 0.7823724720796861, + "grad_norm": 0.10920953750610352, + "learning_rate": 0.00015242839352428396, + "loss": 0.6179, + "step": 1296 + }, + { + "epoch": 0.7829761545427105, + "grad_norm": 0.13240636885166168, + "learning_rate": 0.00015238688252386883, + "loss": 0.6766, + "step": 1297 + }, + { + "epoch": 0.783579837005735, + "grad_norm": 0.1218361034989357, + "learning_rate": 0.0001523453715234537, + "loss": 0.6364, + "step": 1298 + }, + { + "epoch": 0.7841835194687594, + "grad_norm": 0.1165798008441925, + "learning_rate": 0.00015230386052303863, + "loss": 0.5109, + "step": 1299 + }, + { + "epoch": 0.7847872019317839, + "grad_norm": 0.12739436328411102, + "learning_rate": 0.0001522623495226235, + "loss": 0.4305, + "step": 1300 + }, + { + "epoch": 0.7853908843948083, + "grad_norm": 0.10032903403043747, + "learning_rate": 0.00015222083852220838, + "loss": 0.8067, + "step": 1301 + }, + { + "epoch": 0.7859945668578328, + "grad_norm": 0.08724936097860336, + "learning_rate": 0.0001521793275217933, + "loss": 0.9813, + "step": 1302 + }, + { + "epoch": 0.7865982493208572, + "grad_norm": 0.08223757147789001, + "learning_rate": 0.00015213781652137818, + "loss": 0.756, + "step": 1303 + }, + { + "epoch": 0.7872019317838816, + "grad_norm": 0.09227702766656876, + "learning_rate": 0.00015209630552096305, + "loss": 0.8074, + "step": 1304 + }, + { + "epoch": 0.7878056142469061, + "grad_norm": 0.08399201929569244, + "learning_rate": 0.00015205479452054795, + "loss": 1.0582, + "step": 1305 + }, + { + "epoch": 0.7884092967099305, + "grad_norm": 0.0827503651380539, + "learning_rate": 0.00015201328352013285, + "loss": 0.7878, + "step": 1306 + }, + { + "epoch": 0.789012979172955, + "grad_norm": 0.08233575522899628, + "learning_rate": 0.00015197177251971772, + "loss": 0.8512, + "step": 1307 + }, + { + "epoch": 0.7896166616359794, + "grad_norm": 0.09783722460269928, + "learning_rate": 0.00015193026151930262, + "loss": 0.8324, + "step": 1308 + }, + { + "epoch": 0.7902203440990039, + "grad_norm": 0.09064453095197678, + "learning_rate": 0.00015188875051888752, + "loss": 0.8361, + "step": 1309 + }, + { + "epoch": 0.7908240265620283, + "grad_norm": 0.09642300009727478, + "learning_rate": 0.0001518472395184724, + "loss": 0.7578, + "step": 1310 + }, + { + "epoch": 0.7914277090250528, + "grad_norm": 0.08258376270532608, + "learning_rate": 0.0001518057285180573, + "loss": 0.9612, + "step": 1311 + }, + { + "epoch": 0.7920313914880772, + "grad_norm": 0.08849532902240753, + "learning_rate": 0.00015176421751764217, + "loss": 0.798, + "step": 1312 + }, + { + "epoch": 0.7926350739511017, + "grad_norm": 0.08029762655496597, + "learning_rate": 0.00015172270651722707, + "loss": 0.8082, + "step": 1313 + }, + { + "epoch": 0.7932387564141262, + "grad_norm": 0.0844390019774437, + "learning_rate": 0.00015168119551681197, + "loss": 0.7453, + "step": 1314 + }, + { + "epoch": 0.7938424388771507, + "grad_norm": 0.09358351677656174, + "learning_rate": 0.00015163968451639684, + "loss": 0.8036, + "step": 1315 + }, + { + "epoch": 0.7944461213401751, + "grad_norm": 0.08466824889183044, + "learning_rate": 0.00015159817351598174, + "loss": 0.777, + "step": 1316 + }, + { + "epoch": 0.7950498038031996, + "grad_norm": 0.08159632235765457, + "learning_rate": 0.00015155666251556664, + "loss": 0.8004, + "step": 1317 + }, + { + "epoch": 0.795653486266224, + "grad_norm": 0.1605248749256134, + "learning_rate": 0.00015151515151515152, + "loss": 0.8841, + "step": 1318 + }, + { + "epoch": 0.7962571687292485, + "grad_norm": 0.08063754439353943, + "learning_rate": 0.00015147364051473642, + "loss": 0.9292, + "step": 1319 + }, + { + "epoch": 0.7968608511922729, + "grad_norm": 0.08406137675046921, + "learning_rate": 0.00015143212951432132, + "loss": 0.7878, + "step": 1320 + }, + { + "epoch": 0.7974645336552973, + "grad_norm": 0.07940443605184555, + "learning_rate": 0.0001513906185139062, + "loss": 0.8981, + "step": 1321 + }, + { + "epoch": 0.7980682161183218, + "grad_norm": 0.07914227992296219, + "learning_rate": 0.00015134910751349106, + "loss": 1.1571, + "step": 1322 + }, + { + "epoch": 0.7986718985813462, + "grad_norm": 0.07836727797985077, + "learning_rate": 0.000151307596513076, + "loss": 0.8384, + "step": 1323 + }, + { + "epoch": 0.7992755810443707, + "grad_norm": 0.08515416085720062, + "learning_rate": 0.00015126608551266086, + "loss": 0.7468, + "step": 1324 + }, + { + "epoch": 0.7998792635073951, + "grad_norm": 0.0839102566242218, + "learning_rate": 0.00015122457451224573, + "loss": 0.9835, + "step": 1325 + }, + { + "epoch": 0.8004829459704196, + "grad_norm": 0.2878398299217224, + "learning_rate": 0.00015118306351183063, + "loss": 1.232, + "step": 1326 + }, + { + "epoch": 0.801086628433444, + "grad_norm": 0.0905466228723526, + "learning_rate": 0.00015114155251141553, + "loss": 0.8908, + "step": 1327 + }, + { + "epoch": 0.8016903108964685, + "grad_norm": 0.08527706563472748, + "learning_rate": 0.0001511000415110004, + "loss": 0.8004, + "step": 1328 + }, + { + "epoch": 0.8022939933594929, + "grad_norm": 0.08341808617115021, + "learning_rate": 0.0001510585305105853, + "loss": 0.7974, + "step": 1329 + }, + { + "epoch": 0.8028976758225174, + "grad_norm": 0.07585755735635757, + "learning_rate": 0.0001510170195101702, + "loss": 0.7528, + "step": 1330 + }, + { + "epoch": 0.8035013582855418, + "grad_norm": 0.08576681464910507, + "learning_rate": 0.00015097550850975508, + "loss": 0.7634, + "step": 1331 + }, + { + "epoch": 0.8041050407485663, + "grad_norm": 0.10057312995195389, + "learning_rate": 0.00015093399750933998, + "loss": 0.6505, + "step": 1332 + }, + { + "epoch": 0.8047087232115907, + "grad_norm": 0.08735567331314087, + "learning_rate": 0.00015089248650892488, + "loss": 0.6856, + "step": 1333 + }, + { + "epoch": 0.8053124056746152, + "grad_norm": 0.09076007455587387, + "learning_rate": 0.00015085097550850978, + "loss": 0.7697, + "step": 1334 + }, + { + "epoch": 0.8059160881376396, + "grad_norm": 0.09162318706512451, + "learning_rate": 0.00015080946450809465, + "loss": 0.7425, + "step": 1335 + }, + { + "epoch": 0.806519770600664, + "grad_norm": 0.08863736689090729, + "learning_rate": 0.00015076795350767953, + "loss": 0.7748, + "step": 1336 + }, + { + "epoch": 0.8071234530636885, + "grad_norm": 0.10001536458730698, + "learning_rate": 0.00015072644250726445, + "loss": 0.7162, + "step": 1337 + }, + { + "epoch": 0.8077271355267129, + "grad_norm": 0.09326426684856415, + "learning_rate": 0.00015068493150684933, + "loss": 0.82, + "step": 1338 + }, + { + "epoch": 0.8083308179897374, + "grad_norm": 0.09727218747138977, + "learning_rate": 0.0001506434205064342, + "loss": 0.7534, + "step": 1339 + }, + { + "epoch": 0.8089345004527618, + "grad_norm": 0.09535179287195206, + "learning_rate": 0.0001506019095060191, + "loss": 0.7623, + "step": 1340 + }, + { + "epoch": 0.8095381829157863, + "grad_norm": 0.09758217632770538, + "learning_rate": 0.000150560398505604, + "loss": 0.7533, + "step": 1341 + }, + { + "epoch": 0.8101418653788107, + "grad_norm": 0.11768268793821335, + "learning_rate": 0.00015051888750518887, + "loss": 0.7481, + "step": 1342 + }, + { + "epoch": 0.8107455478418352, + "grad_norm": 0.09633111953735352, + "learning_rate": 0.00015047737650477377, + "loss": 0.7191, + "step": 1343 + }, + { + "epoch": 0.8113492303048596, + "grad_norm": 0.10011658817529678, + "learning_rate": 0.00015043586550435867, + "loss": 0.6665, + "step": 1344 + }, + { + "epoch": 0.8119529127678841, + "grad_norm": 0.11359147727489471, + "learning_rate": 0.00015039435450394354, + "loss": 0.7549, + "step": 1345 + }, + { + "epoch": 0.8125565952309085, + "grad_norm": 0.11092590540647507, + "learning_rate": 0.00015035284350352844, + "loss": 0.8078, + "step": 1346 + }, + { + "epoch": 0.813160277693933, + "grad_norm": 0.1197749450802803, + "learning_rate": 0.00015031133250311335, + "loss": 0.6463, + "step": 1347 + }, + { + "epoch": 0.8137639601569574, + "grad_norm": 0.12124069780111313, + "learning_rate": 0.00015026982150269822, + "loss": 0.6384, + "step": 1348 + }, + { + "epoch": 0.8143676426199818, + "grad_norm": 0.14109613001346588, + "learning_rate": 0.00015022831050228312, + "loss": 0.5564, + "step": 1349 + }, + { + "epoch": 0.8149713250830063, + "grad_norm": 0.12870782613754272, + "learning_rate": 0.000150186799501868, + "loss": 0.4514, + "step": 1350 + }, + { + "epoch": 0.8155750075460307, + "grad_norm": 0.14718082547187805, + "learning_rate": 0.0001501452885014529, + "loss": 0.8223, + "step": 1351 + }, + { + "epoch": 0.8161786900090552, + "grad_norm": 0.0827641561627388, + "learning_rate": 0.0001501037775010378, + "loss": 0.7125, + "step": 1352 + }, + { + "epoch": 0.8167823724720796, + "grad_norm": 0.0898633524775505, + "learning_rate": 0.00015006226650062266, + "loss": 1.0915, + "step": 1353 + }, + { + "epoch": 0.8173860549351042, + "grad_norm": 0.09281725436449051, + "learning_rate": 0.00015002075550020756, + "loss": 0.854, + "step": 1354 + }, + { + "epoch": 0.8179897373981286, + "grad_norm": 0.08282145857810974, + "learning_rate": 0.00014997924449979246, + "loss": 1.0761, + "step": 1355 + }, + { + "epoch": 0.8185934198611531, + "grad_norm": 0.08490285277366638, + "learning_rate": 0.00014993773349937734, + "loss": 1.1083, + "step": 1356 + }, + { + "epoch": 0.8191971023241775, + "grad_norm": 0.09798530489206314, + "learning_rate": 0.00014989622249896224, + "loss": 1.0771, + "step": 1357 + }, + { + "epoch": 0.819800784787202, + "grad_norm": 0.08119674026966095, + "learning_rate": 0.00014985471149854714, + "loss": 0.8048, + "step": 1358 + }, + { + "epoch": 0.8204044672502264, + "grad_norm": 0.08817379921674728, + "learning_rate": 0.000149813200498132, + "loss": 0.8062, + "step": 1359 + }, + { + "epoch": 0.8210081497132509, + "grad_norm": 0.08717963844537735, + "learning_rate": 0.00014977168949771688, + "loss": 0.8923, + "step": 1360 + }, + { + "epoch": 0.8216118321762753, + "grad_norm": 0.08484237641096115, + "learning_rate": 0.0001497301784973018, + "loss": 0.882, + "step": 1361 + }, + { + "epoch": 0.8222155146392998, + "grad_norm": 0.0800343006849289, + "learning_rate": 0.00014968866749688668, + "loss": 0.8224, + "step": 1362 + }, + { + "epoch": 0.8228191971023242, + "grad_norm": 0.08844833076000214, + "learning_rate": 0.00014964715649647156, + "loss": 0.7582, + "step": 1363 + }, + { + "epoch": 0.8234228795653487, + "grad_norm": 0.08437283337116241, + "learning_rate": 0.00014960564549605646, + "loss": 0.847, + "step": 1364 + }, + { + "epoch": 0.8240265620283731, + "grad_norm": 0.08416473865509033, + "learning_rate": 0.00014956413449564136, + "loss": 0.8066, + "step": 1365 + }, + { + "epoch": 0.8246302444913975, + "grad_norm": 0.0806473046541214, + "learning_rate": 0.00014952262349522623, + "loss": 0.797, + "step": 1366 + }, + { + "epoch": 0.825233926954422, + "grad_norm": 0.08505918830633163, + "learning_rate": 0.00014948111249481113, + "loss": 0.826, + "step": 1367 + }, + { + "epoch": 0.8258376094174464, + "grad_norm": 0.08299072831869125, + "learning_rate": 0.00014943960149439603, + "loss": 0.7974, + "step": 1368 + }, + { + "epoch": 0.8264412918804709, + "grad_norm": 0.0950680747628212, + "learning_rate": 0.0001493980904939809, + "loss": 0.8961, + "step": 1369 + }, + { + "epoch": 0.8270449743434953, + "grad_norm": 0.08349784463644028, + "learning_rate": 0.0001493565794935658, + "loss": 0.8791, + "step": 1370 + }, + { + "epoch": 0.8276486568065198, + "grad_norm": 0.08619910478591919, + "learning_rate": 0.0001493150684931507, + "loss": 0.8367, + "step": 1371 + }, + { + "epoch": 0.8282523392695442, + "grad_norm": 0.09480497241020203, + "learning_rate": 0.00014927355749273557, + "loss": 0.8101, + "step": 1372 + }, + { + "epoch": 0.8288560217325687, + "grad_norm": 0.0833049863576889, + "learning_rate": 0.00014923204649232047, + "loss": 0.8616, + "step": 1373 + }, + { + "epoch": 0.8294597041955931, + "grad_norm": 0.08111268281936646, + "learning_rate": 0.00014919053549190535, + "loss": 0.7388, + "step": 1374 + }, + { + "epoch": 0.8300633866586176, + "grad_norm": 0.08618942648172379, + "learning_rate": 0.00014914902449149025, + "loss": 0.8678, + "step": 1375 + }, + { + "epoch": 0.830667069121642, + "grad_norm": 0.08400508016347885, + "learning_rate": 0.00014910751349107515, + "loss": 0.7766, + "step": 1376 + }, + { + "epoch": 0.8312707515846665, + "grad_norm": 0.0798826590180397, + "learning_rate": 0.00014906600249066002, + "loss": 0.8156, + "step": 1377 + }, + { + "epoch": 0.8318744340476909, + "grad_norm": 0.08481590449810028, + "learning_rate": 0.00014902449149024492, + "loss": 1.117, + "step": 1378 + }, + { + "epoch": 0.8324781165107153, + "grad_norm": 0.09439216554164886, + "learning_rate": 0.00014898298048982982, + "loss": 0.8822, + "step": 1379 + }, + { + "epoch": 0.8330817989737398, + "grad_norm": 0.09862873703241348, + "learning_rate": 0.0001489414694894147, + "loss": 0.8087, + "step": 1380 + }, + { + "epoch": 0.8336854814367642, + "grad_norm": 0.08466385304927826, + "learning_rate": 0.0001488999584889996, + "loss": 0.8568, + "step": 1381 + }, + { + "epoch": 0.8342891638997887, + "grad_norm": 0.08554793149232864, + "learning_rate": 0.0001488584474885845, + "loss": 0.95, + "step": 1382 + }, + { + "epoch": 0.8348928463628131, + "grad_norm": 0.07983773201704025, + "learning_rate": 0.00014881693648816937, + "loss": 0.7137, + "step": 1383 + }, + { + "epoch": 0.8354965288258376, + "grad_norm": 0.07740309834480286, + "learning_rate": 0.00014877542548775427, + "loss": 0.7139, + "step": 1384 + }, + { + "epoch": 0.836100211288862, + "grad_norm": 0.08271116763353348, + "learning_rate": 0.00014873391448733917, + "loss": 0.7001, + "step": 1385 + }, + { + "epoch": 0.8367038937518865, + "grad_norm": 0.08564220368862152, + "learning_rate": 0.00014869240348692404, + "loss": 0.7624, + "step": 1386 + }, + { + "epoch": 0.8373075762149109, + "grad_norm": 0.08788985759019852, + "learning_rate": 0.00014865089248650894, + "loss": 0.7563, + "step": 1387 + }, + { + "epoch": 0.8379112586779354, + "grad_norm": 0.09162076562643051, + "learning_rate": 0.0001486093814860938, + "loss": 0.7008, + "step": 1388 + }, + { + "epoch": 0.8385149411409598, + "grad_norm": 0.08919322490692139, + "learning_rate": 0.0001485678704856787, + "loss": 0.7408, + "step": 1389 + }, + { + "epoch": 0.8391186236039843, + "grad_norm": 0.4382922053337097, + "learning_rate": 0.0001485263594852636, + "loss": 0.748, + "step": 1390 + }, + { + "epoch": 0.8397223060670087, + "grad_norm": 0.09150999784469604, + "learning_rate": 0.00014848484848484849, + "loss": 0.7801, + "step": 1391 + }, + { + "epoch": 0.8403259885300332, + "grad_norm": 0.09508758038282394, + "learning_rate": 0.00014844333748443339, + "loss": 0.6773, + "step": 1392 + }, + { + "epoch": 0.8409296709930576, + "grad_norm": 0.10179366171360016, + "learning_rate": 0.00014840182648401829, + "loss": 0.7492, + "step": 1393 + }, + { + "epoch": 0.8415333534560822, + "grad_norm": 0.10912247747182846, + "learning_rate": 0.00014836031548360316, + "loss": 0.7271, + "step": 1394 + }, + { + "epoch": 0.8421370359191066, + "grad_norm": 0.10175374895334244, + "learning_rate": 0.00014831880448318803, + "loss": 0.6465, + "step": 1395 + }, + { + "epoch": 0.842740718382131, + "grad_norm": 0.10953323543071747, + "learning_rate": 0.00014827729348277296, + "loss": 0.6639, + "step": 1396 + }, + { + "epoch": 0.8433444008451555, + "grad_norm": 0.11571014672517776, + "learning_rate": 0.00014823578248235783, + "loss": 0.6389, + "step": 1397 + }, + { + "epoch": 0.8439480833081799, + "grad_norm": 0.1150619387626648, + "learning_rate": 0.0001481942714819427, + "loss": 0.5918, + "step": 1398 + }, + { + "epoch": 0.8445517657712044, + "grad_norm": 0.12481655180454254, + "learning_rate": 0.00014815276048152763, + "loss": 0.5285, + "step": 1399 + }, + { + "epoch": 0.8451554482342288, + "grad_norm": 0.12378332763910294, + "learning_rate": 0.0001481112494811125, + "loss": 0.4092, + "step": 1400 + }, + { + "epoch": 0.8457591306972533, + "grad_norm": 0.0824832022190094, + "learning_rate": 0.00014806973848069738, + "loss": 1.0004, + "step": 1401 + }, + { + "epoch": 0.8463628131602777, + "grad_norm": 0.0863024890422821, + "learning_rate": 0.00014802822748028228, + "loss": 0.8013, + "step": 1402 + }, + { + "epoch": 0.8469664956233022, + "grad_norm": 0.08444255590438843, + "learning_rate": 0.00014798671647986718, + "loss": 0.8442, + "step": 1403 + }, + { + "epoch": 0.8475701780863266, + "grad_norm": 0.10533567517995834, + "learning_rate": 0.00014794520547945205, + "loss": 0.987, + "step": 1404 + }, + { + "epoch": 0.8481738605493511, + "grad_norm": 0.09298814088106155, + "learning_rate": 0.00014790369447903695, + "loss": 0.9405, + "step": 1405 + }, + { + "epoch": 0.8487775430123755, + "grad_norm": 0.09649625420570374, + "learning_rate": 0.00014786218347862185, + "loss": 0.9325, + "step": 1406 + }, + { + "epoch": 0.8493812254754, + "grad_norm": 0.10233578085899353, + "learning_rate": 0.00014782067247820672, + "loss": 0.9722, + "step": 1407 + }, + { + "epoch": 0.8499849079384244, + "grad_norm": 0.07888230681419373, + "learning_rate": 0.00014777916147779162, + "loss": 0.6847, + "step": 1408 + }, + { + "epoch": 0.8505885904014489, + "grad_norm": 0.1061205044388771, + "learning_rate": 0.0001477376504773765, + "loss": 0.7159, + "step": 1409 + }, + { + "epoch": 0.8511922728644733, + "grad_norm": 0.09067028015851974, + "learning_rate": 0.0001476961394769614, + "loss": 1.0638, + "step": 1410 + }, + { + "epoch": 0.8517959553274977, + "grad_norm": 0.07997080683708191, + "learning_rate": 0.0001476546284765463, + "loss": 1.0731, + "step": 1411 + }, + { + "epoch": 0.8523996377905222, + "grad_norm": 0.08588366955518723, + "learning_rate": 0.00014761311747613117, + "loss": 0.8226, + "step": 1412 + }, + { + "epoch": 0.8530033202535466, + "grad_norm": 0.08147002011537552, + "learning_rate": 0.00014757160647571607, + "loss": 0.8404, + "step": 1413 + }, + { + "epoch": 0.8536070027165711, + "grad_norm": 0.08258868753910065, + "learning_rate": 0.00014753009547530097, + "loss": 0.8363, + "step": 1414 + }, + { + "epoch": 0.8542106851795955, + "grad_norm": 0.11236279457807541, + "learning_rate": 0.00014748858447488584, + "loss": 0.8376, + "step": 1415 + }, + { + "epoch": 0.85481436764262, + "grad_norm": 0.07888762652873993, + "learning_rate": 0.00014744707347447074, + "loss": 0.8395, + "step": 1416 + }, + { + "epoch": 0.8554180501056444, + "grad_norm": 0.08469484001398087, + "learning_rate": 0.00014740556247405564, + "loss": 1.0579, + "step": 1417 + }, + { + "epoch": 0.8560217325686689, + "grad_norm": 0.07748138904571533, + "learning_rate": 0.00014736405147364052, + "loss": 0.7358, + "step": 1418 + }, + { + "epoch": 0.8566254150316933, + "grad_norm": 0.08709147572517395, + "learning_rate": 0.0001473225404732254, + "loss": 0.9408, + "step": 1419 + }, + { + "epoch": 0.8572290974947178, + "grad_norm": 0.08583850413560867, + "learning_rate": 0.00014728102947281032, + "loss": 0.822, + "step": 1420 + }, + { + "epoch": 0.8578327799577422, + "grad_norm": 0.09203623980283737, + "learning_rate": 0.0001472395184723952, + "loss": 0.8535, + "step": 1421 + }, + { + "epoch": 0.8584364624207667, + "grad_norm": 0.08235791325569153, + "learning_rate": 0.0001471980074719801, + "loss": 0.8012, + "step": 1422 + }, + { + "epoch": 0.8590401448837911, + "grad_norm": 0.09391656517982483, + "learning_rate": 0.00014715649647156496, + "loss": 0.888, + "step": 1423 + }, + { + "epoch": 0.8596438273468155, + "grad_norm": 0.0862961858510971, + "learning_rate": 0.00014711498547114986, + "loss": 0.8323, + "step": 1424 + }, + { + "epoch": 0.86024750980984, + "grad_norm": 0.08743558079004288, + "learning_rate": 0.00014707347447073476, + "loss": 0.7275, + "step": 1425 + }, + { + "epoch": 0.8608511922728644, + "grad_norm": 0.07890239357948303, + "learning_rate": 0.00014703196347031963, + "loss": 0.6607, + "step": 1426 + }, + { + "epoch": 0.8614548747358889, + "grad_norm": 0.07826490700244904, + "learning_rate": 0.00014699045246990453, + "loss": 0.7179, + "step": 1427 + }, + { + "epoch": 0.8620585571989133, + "grad_norm": 0.08540347218513489, + "learning_rate": 0.00014694894146948943, + "loss": 0.7166, + "step": 1428 + }, + { + "epoch": 0.8626622396619378, + "grad_norm": 0.08672580122947693, + "learning_rate": 0.0001469074304690743, + "loss": 1.0812, + "step": 1429 + }, + { + "epoch": 0.8632659221249622, + "grad_norm": 0.08532918244600296, + "learning_rate": 0.0001468659194686592, + "loss": 0.7676, + "step": 1430 + }, + { + "epoch": 0.8638696045879867, + "grad_norm": 0.08678078651428223, + "learning_rate": 0.0001468244084682441, + "loss": 0.8672, + "step": 1431 + }, + { + "epoch": 0.8644732870510111, + "grad_norm": 0.08257097750902176, + "learning_rate": 0.00014678289746782898, + "loss": 0.7108, + "step": 1432 + }, + { + "epoch": 0.8650769695140356, + "grad_norm": 0.09253834187984467, + "learning_rate": 0.00014674138646741385, + "loss": 0.8333, + "step": 1433 + }, + { + "epoch": 0.86568065197706, + "grad_norm": 0.08253934979438782, + "learning_rate": 0.00014669987546699878, + "loss": 0.7099, + "step": 1434 + }, + { + "epoch": 0.8662843344400846, + "grad_norm": 0.08953447639942169, + "learning_rate": 0.00014665836446658365, + "loss": 0.7593, + "step": 1435 + }, + { + "epoch": 0.866888016903109, + "grad_norm": 0.09555850178003311, + "learning_rate": 0.00014661685346616853, + "loss": 0.6711, + "step": 1436 + }, + { + "epoch": 0.8674916993661335, + "grad_norm": 0.08465403318405151, + "learning_rate": 0.00014657534246575343, + "loss": 0.7276, + "step": 1437 + }, + { + "epoch": 0.8680953818291579, + "grad_norm": 0.08792877197265625, + "learning_rate": 0.00014653383146533833, + "loss": 0.7075, + "step": 1438 + }, + { + "epoch": 0.8686990642921824, + "grad_norm": 0.09290292859077454, + "learning_rate": 0.0001464923204649232, + "loss": 0.7923, + "step": 1439 + }, + { + "epoch": 0.8693027467552068, + "grad_norm": 0.09828725457191467, + "learning_rate": 0.0001464508094645081, + "loss": 0.7282, + "step": 1440 + }, + { + "epoch": 0.8699064292182312, + "grad_norm": 0.1370926946401596, + "learning_rate": 0.000146409298464093, + "loss": 0.7228, + "step": 1441 + }, + { + "epoch": 0.8705101116812557, + "grad_norm": 0.09725096821784973, + "learning_rate": 0.00014636778746367787, + "loss": 0.7643, + "step": 1442 + }, + { + "epoch": 0.8711137941442801, + "grad_norm": 0.09183010458946228, + "learning_rate": 0.00014632627646326277, + "loss": 0.7243, + "step": 1443 + }, + { + "epoch": 0.8717174766073046, + "grad_norm": 0.0972151830792427, + "learning_rate": 0.00014628476546284767, + "loss": 0.6837, + "step": 1444 + }, + { + "epoch": 0.872321159070329, + "grad_norm": 0.11110269278287888, + "learning_rate": 0.00014624325446243255, + "loss": 0.7487, + "step": 1445 + }, + { + "epoch": 0.8729248415333535, + "grad_norm": 0.10543739795684814, + "learning_rate": 0.00014620174346201745, + "loss": 0.6177, + "step": 1446 + }, + { + "epoch": 0.8735285239963779, + "grad_norm": 0.13775797188282013, + "learning_rate": 0.00014616023246160232, + "loss": 0.6354, + "step": 1447 + }, + { + "epoch": 0.8741322064594024, + "grad_norm": 0.11157894879579544, + "learning_rate": 0.00014611872146118722, + "loss": 0.5814, + "step": 1448 + }, + { + "epoch": 0.8747358889224268, + "grad_norm": 0.11887330561876297, + "learning_rate": 0.00014607721046077212, + "loss": 0.5033, + "step": 1449 + }, + { + "epoch": 0.8753395713854513, + "grad_norm": 0.1345004439353943, + "learning_rate": 0.000146035699460357, + "loss": 0.4091, + "step": 1450 + }, + { + "epoch": 0.8759432538484757, + "grad_norm": 0.1368480920791626, + "learning_rate": 0.0001459941884599419, + "loss": 0.7745, + "step": 1451 + }, + { + "epoch": 0.8765469363115002, + "grad_norm": 0.09147506207227707, + "learning_rate": 0.0001459526774595268, + "loss": 1.1362, + "step": 1452 + }, + { + "epoch": 0.8771506187745246, + "grad_norm": 0.07865995168685913, + "learning_rate": 0.00014591116645911166, + "loss": 0.7771, + "step": 1453 + }, + { + "epoch": 0.877754301237549, + "grad_norm": 0.0899992287158966, + "learning_rate": 0.00014586965545869656, + "loss": 0.779, + "step": 1454 + }, + { + "epoch": 0.8783579837005735, + "grad_norm": 0.0916723981499672, + "learning_rate": 0.00014582814445828146, + "loss": 1.2033, + "step": 1455 + }, + { + "epoch": 0.8789616661635979, + "grad_norm": 0.09812314063310623, + "learning_rate": 0.00014578663345786634, + "loss": 0.9668, + "step": 1456 + }, + { + "epoch": 0.8795653486266224, + "grad_norm": 0.09160758554935455, + "learning_rate": 0.0001457451224574512, + "loss": 0.8744, + "step": 1457 + }, + { + "epoch": 0.8801690310896468, + "grad_norm": 0.0805237665772438, + "learning_rate": 0.00014570361145703614, + "loss": 0.9906, + "step": 1458 + }, + { + "epoch": 0.8807727135526713, + "grad_norm": 0.09051381051540375, + "learning_rate": 0.000145662100456621, + "loss": 1.0932, + "step": 1459 + }, + { + "epoch": 0.8813763960156957, + "grad_norm": 0.08794981986284256, + "learning_rate": 0.00014562058945620588, + "loss": 0.7806, + "step": 1460 + }, + { + "epoch": 0.8819800784787202, + "grad_norm": 0.0870475247502327, + "learning_rate": 0.00014557907845579078, + "loss": 1.0775, + "step": 1461 + }, + { + "epoch": 0.8825837609417446, + "grad_norm": 0.0890730544924736, + "learning_rate": 0.00014553756745537568, + "loss": 0.7489, + "step": 1462 + }, + { + "epoch": 0.8831874434047691, + "grad_norm": 0.08005908131599426, + "learning_rate": 0.00014549605645496056, + "loss": 1.0614, + "step": 1463 + }, + { + "epoch": 0.8837911258677935, + "grad_norm": 0.10012607276439667, + "learning_rate": 0.00014545454545454546, + "loss": 0.8, + "step": 1464 + }, + { + "epoch": 0.884394808330818, + "grad_norm": 0.09003414958715439, + "learning_rate": 0.00014541303445413036, + "loss": 0.7296, + "step": 1465 + }, + { + "epoch": 0.8849984907938424, + "grad_norm": 0.09428632259368896, + "learning_rate": 0.00014537152345371523, + "loss": 0.7256, + "step": 1466 + }, + { + "epoch": 0.8856021732568669, + "grad_norm": 0.08243349939584732, + "learning_rate": 0.00014533001245330013, + "loss": 0.7458, + "step": 1467 + }, + { + "epoch": 0.8862058557198913, + "grad_norm": 0.07649943977594376, + "learning_rate": 0.00014528850145288503, + "loss": 0.7372, + "step": 1468 + }, + { + "epoch": 0.8868095381829157, + "grad_norm": 0.08396594226360321, + "learning_rate": 0.00014524699045246993, + "loss": 0.8017, + "step": 1469 + }, + { + "epoch": 0.8874132206459402, + "grad_norm": 0.10132227838039398, + "learning_rate": 0.0001452054794520548, + "loss": 1.2413, + "step": 1470 + }, + { + "epoch": 0.8880169031089646, + "grad_norm": 0.0780363529920578, + "learning_rate": 0.00014516396845163968, + "loss": 0.8076, + "step": 1471 + }, + { + "epoch": 0.8886205855719891, + "grad_norm": 0.09692296385765076, + "learning_rate": 0.0001451224574512246, + "loss": 0.9112, + "step": 1472 + }, + { + "epoch": 0.8892242680350135, + "grad_norm": 0.08603715151548386, + "learning_rate": 0.00014508094645080948, + "loss": 0.8524, + "step": 1473 + }, + { + "epoch": 0.889827950498038, + "grad_norm": 0.0896432101726532, + "learning_rate": 0.00014503943545039435, + "loss": 1.0479, + "step": 1474 + }, + { + "epoch": 0.8904316329610625, + "grad_norm": 0.0839807540178299, + "learning_rate": 0.00014499792444997925, + "loss": 0.794, + "step": 1475 + }, + { + "epoch": 0.891035315424087, + "grad_norm": 0.08278023451566696, + "learning_rate": 0.00014495641344956415, + "loss": 0.8375, + "step": 1476 + }, + { + "epoch": 0.8916389978871114, + "grad_norm": 0.07943416386842728, + "learning_rate": 0.00014491490244914902, + "loss": 1.0645, + "step": 1477 + }, + { + "epoch": 0.8922426803501359, + "grad_norm": 0.08287323266267776, + "learning_rate": 0.00014487339144873392, + "loss": 0.7824, + "step": 1478 + }, + { + "epoch": 0.8928463628131603, + "grad_norm": 0.09486839175224304, + "learning_rate": 0.00014483188044831882, + "loss": 0.9189, + "step": 1479 + }, + { + "epoch": 0.8934500452761848, + "grad_norm": 0.09007449448108673, + "learning_rate": 0.0001447903694479037, + "loss": 0.7276, + "step": 1480 + }, + { + "epoch": 0.8940537277392092, + "grad_norm": 0.08944438397884369, + "learning_rate": 0.0001447488584474886, + "loss": 0.8821, + "step": 1481 + }, + { + "epoch": 0.8946574102022337, + "grad_norm": 0.08388552069664001, + "learning_rate": 0.0001447073474470735, + "loss": 0.7537, + "step": 1482 + }, + { + "epoch": 0.8952610926652581, + "grad_norm": 0.09146512299776077, + "learning_rate": 0.00014466583644665837, + "loss": 0.7041, + "step": 1483 + }, + { + "epoch": 0.8958647751282826, + "grad_norm": 0.09139760583639145, + "learning_rate": 0.00014462432544624327, + "loss": 0.7686, + "step": 1484 + }, + { + "epoch": 0.896468457591307, + "grad_norm": 0.08465917408466339, + "learning_rate": 0.00014458281444582814, + "loss": 0.7442, + "step": 1485 + }, + { + "epoch": 0.8970721400543314, + "grad_norm": 0.08978710323572159, + "learning_rate": 0.00014454130344541304, + "loss": 0.7162, + "step": 1486 + }, + { + "epoch": 0.8976758225173559, + "grad_norm": 0.09368009865283966, + "learning_rate": 0.00014449979244499794, + "loss": 0.6723, + "step": 1487 + }, + { + "epoch": 0.8982795049803803, + "grad_norm": 0.09958908706903458, + "learning_rate": 0.0001444582814445828, + "loss": 0.7469, + "step": 1488 + }, + { + "epoch": 0.8988831874434048, + "grad_norm": 0.09646685421466827, + "learning_rate": 0.0001444167704441677, + "loss": 0.7735, + "step": 1489 + }, + { + "epoch": 0.8994868699064292, + "grad_norm": 0.09356389194726944, + "learning_rate": 0.00014437525944375261, + "loss": 0.7194, + "step": 1490 + }, + { + "epoch": 0.9000905523694537, + "grad_norm": 0.09738507866859436, + "learning_rate": 0.0001443337484433375, + "loss": 0.6941, + "step": 1491 + }, + { + "epoch": 0.9006942348324781, + "grad_norm": 0.10621945559978485, + "learning_rate": 0.0001442922374429224, + "loss": 0.636, + "step": 1492 + }, + { + "epoch": 0.9012979172955026, + "grad_norm": 0.10333646088838577, + "learning_rate": 0.0001442507264425073, + "loss": 0.6574, + "step": 1493 + }, + { + "epoch": 0.901901599758527, + "grad_norm": 0.1097961962223053, + "learning_rate": 0.00014420921544209216, + "loss": 0.7928, + "step": 1494 + }, + { + "epoch": 0.9025052822215515, + "grad_norm": 0.11894567310810089, + "learning_rate": 0.00014416770444167703, + "loss": 0.6904, + "step": 1495 + }, + { + "epoch": 0.9031089646845759, + "grad_norm": 0.12034012377262115, + "learning_rate": 0.00014412619344126196, + "loss": 0.6091, + "step": 1496 + }, + { + "epoch": 0.9037126471476004, + "grad_norm": 0.11718066781759262, + "learning_rate": 0.00014408468244084683, + "loss": 0.6336, + "step": 1497 + }, + { + "epoch": 0.9043163296106248, + "grad_norm": 0.130398690700531, + "learning_rate": 0.0001440431714404317, + "loss": 0.5771, + "step": 1498 + }, + { + "epoch": 0.9049200120736492, + "grad_norm": 0.12476742267608643, + "learning_rate": 0.0001440016604400166, + "loss": 0.5312, + "step": 1499 + }, + { + "epoch": 0.9055236945366737, + "grad_norm": 0.12364601343870163, + "learning_rate": 0.0001439601494396015, + "loss": 0.3942, + "step": 1500 + }, + { + "epoch": 0.9055236945366737, + "eval_loss": 0.8090887069702148, + "eval_runtime": 1219.063, + "eval_samples_per_second": 2.289, + "eval_steps_per_second": 0.286, + "step": 1500 + }, + { + "epoch": 0.9061273769996981, + "grad_norm": 0.0810943990945816, + "learning_rate": 0.00014391863843918638, + "loss": 1.0772, + "step": 1501 + }, + { + "epoch": 0.9067310594627226, + "grad_norm": 0.08426682651042938, + "learning_rate": 0.00014387712743877128, + "loss": 0.7147, + "step": 1502 + }, + { + "epoch": 0.907334741925747, + "grad_norm": 0.09331586956977844, + "learning_rate": 0.00014383561643835618, + "loss": 0.8483, + "step": 1503 + }, + { + "epoch": 0.9079384243887715, + "grad_norm": 0.09169352054595947, + "learning_rate": 0.00014379410543794105, + "loss": 0.763, + "step": 1504 + }, + { + "epoch": 0.9085421068517959, + "grad_norm": 0.08313553780317307, + "learning_rate": 0.00014375259443752595, + "loss": 0.7842, + "step": 1505 + }, + { + "epoch": 0.9091457893148204, + "grad_norm": 0.08828964829444885, + "learning_rate": 0.00014371108343711085, + "loss": 0.9434, + "step": 1506 + }, + { + "epoch": 0.9097494717778448, + "grad_norm": 0.08548730611801147, + "learning_rate": 0.00014366957243669572, + "loss": 0.7584, + "step": 1507 + }, + { + "epoch": 0.9103531542408693, + "grad_norm": 0.09586931765079498, + "learning_rate": 0.00014362806143628062, + "loss": 1.016, + "step": 1508 + }, + { + "epoch": 0.9109568367038937, + "grad_norm": 0.08847955614328384, + "learning_rate": 0.0001435865504358655, + "loss": 0.886, + "step": 1509 + }, + { + "epoch": 0.9115605191669182, + "grad_norm": 0.14398765563964844, + "learning_rate": 0.0001435450394354504, + "loss": 0.7924, + "step": 1510 + }, + { + "epoch": 0.9121642016299426, + "grad_norm": 0.08921834826469421, + "learning_rate": 0.0001435035284350353, + "loss": 0.8978, + "step": 1511 + }, + { + "epoch": 0.912767884092967, + "grad_norm": 0.07638692855834961, + "learning_rate": 0.00014346201743462017, + "loss": 0.6995, + "step": 1512 + }, + { + "epoch": 0.9133715665559915, + "grad_norm": 0.08616790175437927, + "learning_rate": 0.00014342050643420507, + "loss": 0.9325, + "step": 1513 + }, + { + "epoch": 0.913975249019016, + "grad_norm": 0.08327416330575943, + "learning_rate": 0.00014337899543378997, + "loss": 1.0005, + "step": 1514 + }, + { + "epoch": 0.9145789314820404, + "grad_norm": 0.07625047117471695, + "learning_rate": 0.00014333748443337484, + "loss": 0.9611, + "step": 1515 + }, + { + "epoch": 0.915182613945065, + "grad_norm": 0.08822344988584518, + "learning_rate": 0.00014329597343295974, + "loss": 0.8797, + "step": 1516 + }, + { + "epoch": 0.9157862964080894, + "grad_norm": 0.08142781257629395, + "learning_rate": 0.00014325446243254464, + "loss": 0.8249, + "step": 1517 + }, + { + "epoch": 0.9163899788711138, + "grad_norm": 0.07951053231954575, + "learning_rate": 0.00014321295143212952, + "loss": 0.8624, + "step": 1518 + }, + { + "epoch": 0.9169936613341383, + "grad_norm": 0.08235177397727966, + "learning_rate": 0.00014317144043171442, + "loss": 0.8208, + "step": 1519 + }, + { + "epoch": 0.9175973437971627, + "grad_norm": 0.09271861612796783, + "learning_rate": 0.00014312992943129932, + "loss": 0.789, + "step": 1520 + }, + { + "epoch": 0.9182010262601872, + "grad_norm": 0.09429887682199478, + "learning_rate": 0.0001430884184308842, + "loss": 1.1317, + "step": 1521 + }, + { + "epoch": 0.9188047087232116, + "grad_norm": 0.08084212988615036, + "learning_rate": 0.0001430469074304691, + "loss": 0.7235, + "step": 1522 + }, + { + "epoch": 0.9194083911862361, + "grad_norm": 0.08684766292572021, + "learning_rate": 0.00014300539643005396, + "loss": 0.9076, + "step": 1523 + }, + { + "epoch": 0.9200120736492605, + "grad_norm": 0.07625840604305267, + "learning_rate": 0.00014296388542963886, + "loss": 0.743, + "step": 1524 + }, + { + "epoch": 0.920615756112285, + "grad_norm": 0.0968519002199173, + "learning_rate": 0.00014292237442922376, + "loss": 0.8984, + "step": 1525 + }, + { + "epoch": 0.9212194385753094, + "grad_norm": 0.08472032099962234, + "learning_rate": 0.00014288086342880864, + "loss": 1.0295, + "step": 1526 + }, + { + "epoch": 0.9218231210383339, + "grad_norm": 0.0939970538020134, + "learning_rate": 0.00014283935242839354, + "loss": 0.9342, + "step": 1527 + }, + { + "epoch": 0.9224268035013583, + "grad_norm": 0.14263185858726501, + "learning_rate": 0.00014279784142797844, + "loss": 0.8136, + "step": 1528 + }, + { + "epoch": 0.9230304859643828, + "grad_norm": 0.08136036992073059, + "learning_rate": 0.0001427563304275633, + "loss": 0.8533, + "step": 1529 + }, + { + "epoch": 0.9236341684274072, + "grad_norm": 0.0846790298819542, + "learning_rate": 0.00014271481942714818, + "loss": 0.857, + "step": 1530 + }, + { + "epoch": 0.9242378508904316, + "grad_norm": 0.08513128757476807, + "learning_rate": 0.0001426733084267331, + "loss": 0.8139, + "step": 1531 + }, + { + "epoch": 0.9248415333534561, + "grad_norm": 0.07980025559663773, + "learning_rate": 0.00014263179742631798, + "loss": 0.745, + "step": 1532 + }, + { + "epoch": 0.9254452158164805, + "grad_norm": 0.0855269581079483, + "learning_rate": 0.00014259028642590285, + "loss": 0.7256, + "step": 1533 + }, + { + "epoch": 0.926048898279505, + "grad_norm": 0.08665366470813751, + "learning_rate": 0.00014254877542548778, + "loss": 0.7378, + "step": 1534 + }, + { + "epoch": 0.9266525807425294, + "grad_norm": 0.098544642329216, + "learning_rate": 0.00014250726442507265, + "loss": 0.7067, + "step": 1535 + }, + { + "epoch": 0.9272562632055539, + "grad_norm": 0.08651740849018097, + "learning_rate": 0.00014246575342465753, + "loss": 0.7731, + "step": 1536 + }, + { + "epoch": 0.9278599456685783, + "grad_norm": 0.09180111438035965, + "learning_rate": 0.00014242424242424243, + "loss": 0.7735, + "step": 1537 + }, + { + "epoch": 0.9284636281316028, + "grad_norm": 0.09831391274929047, + "learning_rate": 0.00014238273142382733, + "loss": 0.7889, + "step": 1538 + }, + { + "epoch": 0.9290673105946272, + "grad_norm": 0.09092969447374344, + "learning_rate": 0.0001423412204234122, + "loss": 0.8703, + "step": 1539 + }, + { + "epoch": 0.9296709930576517, + "grad_norm": 0.08774839341640472, + "learning_rate": 0.0001422997094229971, + "loss": 0.6438, + "step": 1540 + }, + { + "epoch": 0.9302746755206761, + "grad_norm": 0.10328055173158646, + "learning_rate": 0.000142258198422582, + "loss": 0.7747, + "step": 1541 + }, + { + "epoch": 0.9308783579837006, + "grad_norm": 0.0955677479505539, + "learning_rate": 0.00014221668742216687, + "loss": 0.7044, + "step": 1542 + }, + { + "epoch": 0.931482040446725, + "grad_norm": 0.12321915477514267, + "learning_rate": 0.00014217517642175177, + "loss": 0.7503, + "step": 1543 + }, + { + "epoch": 0.9320857229097494, + "grad_norm": 0.10311946272850037, + "learning_rate": 0.00014213366542133665, + "loss": 0.7091, + "step": 1544 + }, + { + "epoch": 0.9326894053727739, + "grad_norm": 0.10908188670873642, + "learning_rate": 0.00014209215442092155, + "loss": 0.6301, + "step": 1545 + }, + { + "epoch": 0.9332930878357983, + "grad_norm": 0.10754991322755814, + "learning_rate": 0.00014205064342050645, + "loss": 0.6588, + "step": 1546 + }, + { + "epoch": 0.9338967702988228, + "grad_norm": 0.11673349887132645, + "learning_rate": 0.00014200913242009132, + "loss": 0.6268, + "step": 1547 + }, + { + "epoch": 0.9345004527618472, + "grad_norm": 0.12704375386238098, + "learning_rate": 0.00014196762141967622, + "loss": 0.6035, + "step": 1548 + }, + { + "epoch": 0.9351041352248717, + "grad_norm": 0.1293480098247528, + "learning_rate": 0.00014192611041926112, + "loss": 0.5336, + "step": 1549 + }, + { + "epoch": 0.9357078176878961, + "grad_norm": 0.13309147953987122, + "learning_rate": 0.000141884599418846, + "loss": 0.4451, + "step": 1550 + }, + { + "epoch": 0.9363115001509206, + "grad_norm": 0.1289544254541397, + "learning_rate": 0.0001418430884184309, + "loss": 0.818, + "step": 1551 + }, + { + "epoch": 0.936915182613945, + "grad_norm": 0.10981032997369766, + "learning_rate": 0.0001418015774180158, + "loss": 0.8971, + "step": 1552 + }, + { + "epoch": 0.9375188650769695, + "grad_norm": 0.07926082611083984, + "learning_rate": 0.00014176006641760067, + "loss": 0.7756, + "step": 1553 + }, + { + "epoch": 0.9381225475399939, + "grad_norm": 0.08216461539268494, + "learning_rate": 0.00014171855541718554, + "loss": 0.7566, + "step": 1554 + }, + { + "epoch": 0.9387262300030184, + "grad_norm": 0.08312779664993286, + "learning_rate": 0.00014167704441677047, + "loss": 0.7627, + "step": 1555 + }, + { + "epoch": 0.9393299124660429, + "grad_norm": 0.08494763821363449, + "learning_rate": 0.00014163553341635534, + "loss": 1.0068, + "step": 1556 + }, + { + "epoch": 0.9399335949290674, + "grad_norm": 0.10769182443618774, + "learning_rate": 0.00014159402241594024, + "loss": 0.8372, + "step": 1557 + }, + { + "epoch": 0.9405372773920918, + "grad_norm": 0.08497872948646545, + "learning_rate": 0.0001415525114155251, + "loss": 0.8532, + "step": 1558 + }, + { + "epoch": 0.9411409598551163, + "grad_norm": 0.07787720859050751, + "learning_rate": 0.00014151100041511, + "loss": 0.7097, + "step": 1559 + }, + { + "epoch": 0.9417446423181407, + "grad_norm": 0.10002875328063965, + "learning_rate": 0.0001414694894146949, + "loss": 0.8718, + "step": 1560 + }, + { + "epoch": 0.9423483247811651, + "grad_norm": 0.09802395850419998, + "learning_rate": 0.00014142797841427978, + "loss": 0.8319, + "step": 1561 + }, + { + "epoch": 0.9429520072441896, + "grad_norm": 0.08751332759857178, + "learning_rate": 0.00014138646741386468, + "loss": 0.7035, + "step": 1562 + }, + { + "epoch": 0.943555689707214, + "grad_norm": 0.08736349642276764, + "learning_rate": 0.00014134495641344958, + "loss": 0.8215, + "step": 1563 + }, + { + "epoch": 0.9441593721702385, + "grad_norm": 0.09416454285383224, + "learning_rate": 0.00014130344541303446, + "loss": 0.8634, + "step": 1564 + }, + { + "epoch": 0.9447630546332629, + "grad_norm": 0.0884031280875206, + "learning_rate": 0.00014126193441261936, + "loss": 0.848, + "step": 1565 + }, + { + "epoch": 0.9453667370962874, + "grad_norm": 0.08755529671907425, + "learning_rate": 0.00014122042341220426, + "loss": 0.8167, + "step": 1566 + }, + { + "epoch": 0.9459704195593118, + "grad_norm": 0.10254927724599838, + "learning_rate": 0.00014117891241178913, + "loss": 0.8119, + "step": 1567 + }, + { + "epoch": 0.9465741020223363, + "grad_norm": 0.0907716229557991, + "learning_rate": 0.000141137401411374, + "loss": 1.0695, + "step": 1568 + }, + { + "epoch": 0.9471777844853607, + "grad_norm": 0.08859268575906754, + "learning_rate": 0.00014109589041095893, + "loss": 0.8692, + "step": 1569 + }, + { + "epoch": 0.9477814669483852, + "grad_norm": 0.0878458023071289, + "learning_rate": 0.0001410543794105438, + "loss": 0.758, + "step": 1570 + }, + { + "epoch": 0.9483851494114096, + "grad_norm": 0.10177980363368988, + "learning_rate": 0.00014101286841012868, + "loss": 0.7831, + "step": 1571 + }, + { + "epoch": 0.9489888318744341, + "grad_norm": 0.08606535941362381, + "learning_rate": 0.00014097135740971358, + "loss": 1.0743, + "step": 1572 + }, + { + "epoch": 0.9495925143374585, + "grad_norm": 0.09055022895336151, + "learning_rate": 0.00014092984640929848, + "loss": 0.9108, + "step": 1573 + }, + { + "epoch": 0.950196196800483, + "grad_norm": 0.08120472729206085, + "learning_rate": 0.00014088833540888335, + "loss": 0.7569, + "step": 1574 + }, + { + "epoch": 0.9507998792635074, + "grad_norm": 0.08704983443021774, + "learning_rate": 0.00014084682440846825, + "loss": 0.8356, + "step": 1575 + }, + { + "epoch": 0.9514035617265318, + "grad_norm": 0.08406449109315872, + "learning_rate": 0.00014080531340805315, + "loss": 0.7055, + "step": 1576 + }, + { + "epoch": 0.9520072441895563, + "grad_norm": 0.08247263729572296, + "learning_rate": 0.00014076380240763802, + "loss": 0.8572, + "step": 1577 + }, + { + "epoch": 0.9526109266525807, + "grad_norm": 0.13809391856193542, + "learning_rate": 0.00014072229140722292, + "loss": 0.7313, + "step": 1578 + }, + { + "epoch": 0.9532146091156052, + "grad_norm": 0.09925238788127899, + "learning_rate": 0.00014068078040680782, + "loss": 0.7794, + "step": 1579 + }, + { + "epoch": 0.9538182915786296, + "grad_norm": 0.08268136531114578, + "learning_rate": 0.0001406392694063927, + "loss": 0.7268, + "step": 1580 + }, + { + "epoch": 0.9544219740416541, + "grad_norm": 0.08466971665620804, + "learning_rate": 0.0001405977584059776, + "loss": 0.8175, + "step": 1581 + }, + { + "epoch": 0.9550256565046785, + "grad_norm": 0.08671456575393677, + "learning_rate": 0.00014055624740556247, + "loss": 0.7747, + "step": 1582 + }, + { + "epoch": 0.955629338967703, + "grad_norm": 0.2081962525844574, + "learning_rate": 0.00014051473640514737, + "loss": 0.8782, + "step": 1583 + }, + { + "epoch": 0.9562330214307274, + "grad_norm": 0.11799836158752441, + "learning_rate": 0.00014047322540473227, + "loss": 0.8098, + "step": 1584 + }, + { + "epoch": 0.9568367038937519, + "grad_norm": 0.08406732231378555, + "learning_rate": 0.00014043171440431714, + "loss": 0.7716, + "step": 1585 + }, + { + "epoch": 0.9574403863567763, + "grad_norm": 0.08422478288412094, + "learning_rate": 0.00014039020340390204, + "loss": 0.7237, + "step": 1586 + }, + { + "epoch": 0.9580440688198008, + "grad_norm": 0.09907133132219315, + "learning_rate": 0.00014034869240348694, + "loss": 0.7298, + "step": 1587 + }, + { + "epoch": 0.9586477512828252, + "grad_norm": 0.08633650094270706, + "learning_rate": 0.00014030718140307181, + "loss": 0.7262, + "step": 1588 + }, + { + "epoch": 0.9592514337458496, + "grad_norm": 0.0880611315369606, + "learning_rate": 0.00014026567040265671, + "loss": 0.6691, + "step": 1589 + }, + { + "epoch": 0.9598551162088741, + "grad_norm": 0.0872715413570404, + "learning_rate": 0.00014022415940224161, + "loss": 0.6593, + "step": 1590 + }, + { + "epoch": 0.9604587986718985, + "grad_norm": 0.09152337163686752, + "learning_rate": 0.0001401826484018265, + "loss": 0.7065, + "step": 1591 + }, + { + "epoch": 0.961062481134923, + "grad_norm": 0.1341097503900528, + "learning_rate": 0.00014014113740141136, + "loss": 0.723, + "step": 1592 + }, + { + "epoch": 0.9616661635979474, + "grad_norm": 0.09733890742063522, + "learning_rate": 0.0001400996264009963, + "loss": 0.7194, + "step": 1593 + }, + { + "epoch": 0.9622698460609719, + "grad_norm": 0.10491717606782913, + "learning_rate": 0.00014005811540058116, + "loss": 0.6986, + "step": 1594 + }, + { + "epoch": 0.9628735285239963, + "grad_norm": 0.10709907114505768, + "learning_rate": 0.00014001660440016603, + "loss": 0.7316, + "step": 1595 + }, + { + "epoch": 0.9634772109870209, + "grad_norm": 0.1096893846988678, + "learning_rate": 0.00013997509339975093, + "loss": 0.6352, + "step": 1596 + }, + { + "epoch": 0.9640808934500453, + "grad_norm": 0.11475121229887009, + "learning_rate": 0.00013993358239933583, + "loss": 0.6632, + "step": 1597 + }, + { + "epoch": 0.9646845759130698, + "grad_norm": 0.12020900100469589, + "learning_rate": 0.0001398920713989207, + "loss": 0.6824, + "step": 1598 + }, + { + "epoch": 0.9652882583760942, + "grad_norm": 0.12162651866674423, + "learning_rate": 0.0001398505603985056, + "loss": 0.5713, + "step": 1599 + }, + { + "epoch": 0.9658919408391187, + "grad_norm": 0.13266035914421082, + "learning_rate": 0.0001398090493980905, + "loss": 0.4436, + "step": 1600 + }, + { + "epoch": 0.9664956233021431, + "grad_norm": 0.0847954973578453, + "learning_rate": 0.0001397675383976754, + "loss": 0.8099, + "step": 1601 + }, + { + "epoch": 0.9670993057651676, + "grad_norm": 0.08815553784370422, + "learning_rate": 0.00013972602739726028, + "loss": 0.8847, + "step": 1602 + }, + { + "epoch": 0.967702988228192, + "grad_norm": 0.0948781669139862, + "learning_rate": 0.00013968451639684518, + "loss": 1.0048, + "step": 1603 + }, + { + "epoch": 0.9683066706912165, + "grad_norm": 0.08522479981184006, + "learning_rate": 0.00013964300539643008, + "loss": 1.0174, + "step": 1604 + }, + { + "epoch": 0.9689103531542409, + "grad_norm": 0.07847335189580917, + "learning_rate": 0.00013960149439601495, + "loss": 0.7368, + "step": 1605 + }, + { + "epoch": 0.9695140356172653, + "grad_norm": 0.10585256665945053, + "learning_rate": 0.00013955998339559983, + "loss": 0.9218, + "step": 1606 + }, + { + "epoch": 0.9701177180802898, + "grad_norm": 0.08924368768930435, + "learning_rate": 0.00013951847239518475, + "loss": 0.8461, + "step": 1607 + }, + { + "epoch": 0.9707214005433142, + "grad_norm": 0.07704899460077286, + "learning_rate": 0.00013947696139476963, + "loss": 0.7675, + "step": 1608 + }, + { + "epoch": 0.9713250830063387, + "grad_norm": 0.08050254732370377, + "learning_rate": 0.0001394354503943545, + "loss": 0.845, + "step": 1609 + }, + { + "epoch": 0.9719287654693631, + "grad_norm": 0.08915068209171295, + "learning_rate": 0.0001393939393939394, + "loss": 0.8523, + "step": 1610 + }, + { + "epoch": 0.9725324479323876, + "grad_norm": 0.07807064801454544, + "learning_rate": 0.0001393524283935243, + "loss": 0.6999, + "step": 1611 + }, + { + "epoch": 0.973136130395412, + "grad_norm": 0.1043185442686081, + "learning_rate": 0.00013931091739310917, + "loss": 0.8859, + "step": 1612 + }, + { + "epoch": 0.9737398128584365, + "grad_norm": 0.1474182903766632, + "learning_rate": 0.00013926940639269407, + "loss": 1.1711, + "step": 1613 + }, + { + "epoch": 0.9743434953214609, + "grad_norm": 0.08791965246200562, + "learning_rate": 0.00013922789539227897, + "loss": 0.9619, + "step": 1614 + }, + { + "epoch": 0.9749471777844854, + "grad_norm": 0.08670973777770996, + "learning_rate": 0.00013918638439186384, + "loss": 1.1479, + "step": 1615 + }, + { + "epoch": 0.9755508602475098, + "grad_norm": 0.09029529243707657, + "learning_rate": 0.00013914487339144874, + "loss": 0.7685, + "step": 1616 + }, + { + "epoch": 0.9761545427105343, + "grad_norm": 0.15448486804962158, + "learning_rate": 0.00013910336239103364, + "loss": 0.9595, + "step": 1617 + }, + { + "epoch": 0.9767582251735587, + "grad_norm": 0.08685984462499619, + "learning_rate": 0.00013906185139061852, + "loss": 0.7231, + "step": 1618 + }, + { + "epoch": 0.9773619076365831, + "grad_norm": 0.09811729937791824, + "learning_rate": 0.00013902034039020342, + "loss": 0.9077, + "step": 1619 + }, + { + "epoch": 0.9779655900996076, + "grad_norm": 0.10184766352176666, + "learning_rate": 0.0001389788293897883, + "loss": 0.8866, + "step": 1620 + }, + { + "epoch": 0.978569272562632, + "grad_norm": 0.13190968334674835, + "learning_rate": 0.0001389373183893732, + "loss": 0.8538, + "step": 1621 + }, + { + "epoch": 0.9791729550256565, + "grad_norm": 0.0916040688753128, + "learning_rate": 0.0001388958073889581, + "loss": 0.8823, + "step": 1622 + }, + { + "epoch": 0.9797766374886809, + "grad_norm": 0.08365904539823532, + "learning_rate": 0.00013885429638854296, + "loss": 0.7825, + "step": 1623 + }, + { + "epoch": 0.9803803199517054, + "grad_norm": 0.09449176490306854, + "learning_rate": 0.00013881278538812786, + "loss": 0.9603, + "step": 1624 + }, + { + "epoch": 0.9809840024147298, + "grad_norm": 0.08852092921733856, + "learning_rate": 0.00013877127438771276, + "loss": 0.7159, + "step": 1625 + }, + { + "epoch": 0.9815876848777543, + "grad_norm": 0.10536504536867142, + "learning_rate": 0.00013872976338729764, + "loss": 1.0687, + "step": 1626 + }, + { + "epoch": 0.9821913673407787, + "grad_norm": 0.07926955074071884, + "learning_rate": 0.0001386882523868825, + "loss": 1.2306, + "step": 1627 + }, + { + "epoch": 0.9827950498038032, + "grad_norm": 0.08607269078493118, + "learning_rate": 0.00013864674138646744, + "loss": 0.8071, + "step": 1628 + }, + { + "epoch": 0.9833987322668276, + "grad_norm": 0.08337133377790451, + "learning_rate": 0.0001386052303860523, + "loss": 0.9241, + "step": 1629 + }, + { + "epoch": 0.9840024147298521, + "grad_norm": 0.0874747484922409, + "learning_rate": 0.00013856371938563718, + "loss": 0.7326, + "step": 1630 + }, + { + "epoch": 0.9846060971928765, + "grad_norm": 0.0751953125, + "learning_rate": 0.0001385222083852221, + "loss": 0.7567, + "step": 1631 + }, + { + "epoch": 0.985209779655901, + "grad_norm": 0.09537570923566818, + "learning_rate": 0.00013848069738480698, + "loss": 1.0592, + "step": 1632 + }, + { + "epoch": 0.9858134621189254, + "grad_norm": 0.0916895717382431, + "learning_rate": 0.00013843918638439186, + "loss": 0.7127, + "step": 1633 + }, + { + "epoch": 0.9864171445819498, + "grad_norm": 0.08885890990495682, + "learning_rate": 0.00013839767538397676, + "loss": 0.7207, + "step": 1634 + }, + { + "epoch": 0.9870208270449743, + "grad_norm": 0.08471481502056122, + "learning_rate": 0.00013835616438356166, + "loss": 0.7205, + "step": 1635 + }, + { + "epoch": 0.9876245095079987, + "grad_norm": 0.0972660556435585, + "learning_rate": 0.00013831465338314653, + "loss": 0.793, + "step": 1636 + }, + { + "epoch": 0.9882281919710233, + "grad_norm": 0.09207335114479065, + "learning_rate": 0.00013827314238273143, + "loss": 0.7134, + "step": 1637 + }, + { + "epoch": 0.9888318744340477, + "grad_norm": 0.0968717560172081, + "learning_rate": 0.00013823163138231633, + "loss": 0.6872, + "step": 1638 + }, + { + "epoch": 0.9894355568970722, + "grad_norm": 0.09645961970090866, + "learning_rate": 0.0001381901203819012, + "loss": 0.7423, + "step": 1639 + }, + { + "epoch": 0.9900392393600966, + "grad_norm": 0.09619560837745667, + "learning_rate": 0.0001381486093814861, + "loss": 0.7063, + "step": 1640 + }, + { + "epoch": 0.9906429218231211, + "grad_norm": 0.09428079426288605, + "learning_rate": 0.00013810709838107097, + "loss": 0.7305, + "step": 1641 + }, + { + "epoch": 0.9912466042861455, + "grad_norm": 0.10160063952207565, + "learning_rate": 0.00013806558738065587, + "loss": 0.7281, + "step": 1642 + }, + { + "epoch": 0.99185028674917, + "grad_norm": 0.10067980736494064, + "learning_rate": 0.00013802407638024077, + "loss": 0.7407, + "step": 1643 + }, + { + "epoch": 0.9924539692121944, + "grad_norm": 0.18993158638477325, + "learning_rate": 0.00013798256537982565, + "loss": 0.6791, + "step": 1644 + }, + { + "epoch": 0.9930576516752189, + "grad_norm": 0.10360643267631531, + "learning_rate": 0.00013794105437941057, + "loss": 0.7026, + "step": 1645 + }, + { + "epoch": 0.9936613341382433, + "grad_norm": 0.11464804410934448, + "learning_rate": 0.00013789954337899545, + "loss": 0.6614, + "step": 1646 + }, + { + "epoch": 0.9942650166012678, + "grad_norm": 0.11702211946249008, + "learning_rate": 0.00013785803237858032, + "loss": 0.6386, + "step": 1647 + }, + { + "epoch": 0.9948686990642922, + "grad_norm": 0.12298808991909027, + "learning_rate": 0.00013781652137816522, + "loss": 0.574, + "step": 1648 + }, + { + "epoch": 0.9954723815273167, + "grad_norm": 0.12374621629714966, + "learning_rate": 0.00013777501037775012, + "loss": 0.5813, + "step": 1649 + }, + { + "epoch": 0.9960760639903411, + "grad_norm": 0.12134044617414474, + "learning_rate": 0.000137733499377335, + "loss": 0.4442, + "step": 1650 + }, + { + "epoch": 0.9966797464533655, + "grad_norm": 0.08615315705537796, + "learning_rate": 0.0001376919883769199, + "loss": 0.839, + "step": 1651 + }, + { + "epoch": 0.99728342891639, + "grad_norm": 0.08225715905427933, + "learning_rate": 0.0001376504773765048, + "loss": 0.8809, + "step": 1652 + }, + { + "epoch": 0.9978871113794144, + "grad_norm": 0.0893816277384758, + "learning_rate": 0.00013760896637608967, + "loss": 0.8767, + "step": 1653 + }, + { + "epoch": 0.9984907938424389, + "grad_norm": 0.09268541634082794, + "learning_rate": 0.00013756745537567457, + "loss": 0.8083, + "step": 1654 + }, + { + "epoch": 0.9990944763054633, + "grad_norm": 0.10186032205820084, + "learning_rate": 0.00013752594437525944, + "loss": 0.783, + "step": 1655 + }, + { + "epoch": 0.9996981587684878, + "grad_norm": 0.11257860064506531, + "learning_rate": 0.00013748443337484434, + "loss": 0.6633, + "step": 1656 + }, + { + "epoch": 1.0, + "grad_norm": 0.1917642205953598, + "learning_rate": 0.00013744292237442924, + "loss": 0.476, + "step": 1657 + }, + { + "epoch": 1.0006036824630244, + "grad_norm": 0.09141898155212402, + "learning_rate": 0.0001374014113740141, + "loss": 0.8386, + "step": 1658 + }, + { + "epoch": 1.001207364926049, + "grad_norm": 0.09631163626909256, + "learning_rate": 0.000137359900373599, + "loss": 0.6534, + "step": 1659 + }, + { + "epoch": 1.0018110473890733, + "grad_norm": 0.09060285240411758, + "learning_rate": 0.0001373183893731839, + "loss": 0.9631, + "step": 1660 + }, + { + "epoch": 1.0024147298520978, + "grad_norm": 0.10698094218969345, + "learning_rate": 0.00013727687837276878, + "loss": 0.7317, + "step": 1661 + }, + { + "epoch": 1.0030184123151222, + "grad_norm": 0.08738933503627777, + "learning_rate": 0.00013723536737235369, + "loss": 0.689, + "step": 1662 + }, + { + "epoch": 1.0036220947781467, + "grad_norm": 0.09262979030609131, + "learning_rate": 0.00013719385637193859, + "loss": 0.7741, + "step": 1663 + }, + { + "epoch": 1.0042257772411711, + "grad_norm": 0.08418719470500946, + "learning_rate": 0.00013715234537152346, + "loss": 0.7623, + "step": 1664 + }, + { + "epoch": 1.0048294597041956, + "grad_norm": 0.09748540818691254, + "learning_rate": 0.00013711083437110833, + "loss": 0.8195, + "step": 1665 + }, + { + "epoch": 1.00543314216722, + "grad_norm": 0.08836773782968521, + "learning_rate": 0.00013706932337069326, + "loss": 0.6536, + "step": 1666 + }, + { + "epoch": 1.0060368246302445, + "grad_norm": 0.0891902819275856, + "learning_rate": 0.00013702781237027813, + "loss": 0.647, + "step": 1667 + }, + { + "epoch": 1.006640507093269, + "grad_norm": 0.08402203768491745, + "learning_rate": 0.000136986301369863, + "loss": 0.9145, + "step": 1668 + }, + { + "epoch": 1.0072441895562934, + "grad_norm": 0.08421420305967331, + "learning_rate": 0.0001369447903694479, + "loss": 0.6114, + "step": 1669 + }, + { + "epoch": 1.0078478720193178, + "grad_norm": 0.09319291263818741, + "learning_rate": 0.0001369032793690328, + "loss": 0.6923, + "step": 1670 + }, + { + "epoch": 1.0084515544823422, + "grad_norm": 0.0897219106554985, + "learning_rate": 0.00013686176836861768, + "loss": 0.7721, + "step": 1671 + }, + { + "epoch": 1.0090552369453667, + "grad_norm": 0.08546704798936844, + "learning_rate": 0.00013682025736820258, + "loss": 0.7558, + "step": 1672 + }, + { + "epoch": 1.0096589194083911, + "grad_norm": 0.10542399436235428, + "learning_rate": 0.00013677874636778748, + "loss": 0.7204, + "step": 1673 + }, + { + "epoch": 1.0102626018714156, + "grad_norm": 0.0866287350654602, + "learning_rate": 0.00013673723536737235, + "loss": 0.6816, + "step": 1674 + }, + { + "epoch": 1.01086628433444, + "grad_norm": 0.0957920253276825, + "learning_rate": 0.00013669572436695725, + "loss": 0.7554, + "step": 1675 + }, + { + "epoch": 1.0114699667974645, + "grad_norm": 0.07835828512907028, + "learning_rate": 0.00013665421336654215, + "loss": 0.8906, + "step": 1676 + }, + { + "epoch": 1.012073649260489, + "grad_norm": 0.09102047979831696, + "learning_rate": 0.00013661270236612702, + "loss": 0.707, + "step": 1677 + }, + { + "epoch": 1.0126773317235134, + "grad_norm": 0.10025649517774582, + "learning_rate": 0.00013657119136571192, + "loss": 1.0865, + "step": 1678 + }, + { + "epoch": 1.0132810141865378, + "grad_norm": 0.08916833996772766, + "learning_rate": 0.0001365296803652968, + "loss": 0.7058, + "step": 1679 + }, + { + "epoch": 1.0138846966495623, + "grad_norm": 0.0850253701210022, + "learning_rate": 0.0001364881693648817, + "loss": 0.9324, + "step": 1680 + }, + { + "epoch": 1.0144883791125867, + "grad_norm": 0.09609053283929825, + "learning_rate": 0.0001364466583644666, + "loss": 0.7521, + "step": 1681 + }, + { + "epoch": 1.0150920615756112, + "grad_norm": 0.0949166864156723, + "learning_rate": 0.00013640514736405147, + "loss": 0.7548, + "step": 1682 + }, + { + "epoch": 1.0156957440386356, + "grad_norm": 0.08850863575935364, + "learning_rate": 0.00013636363636363637, + "loss": 0.7067, + "step": 1683 + }, + { + "epoch": 1.01629942650166, + "grad_norm": 0.09187474101781845, + "learning_rate": 0.00013632212536322127, + "loss": 0.7852, + "step": 1684 + }, + { + "epoch": 1.0169031089646845, + "grad_norm": 0.09210950136184692, + "learning_rate": 0.00013628061436280614, + "loss": 0.8244, + "step": 1685 + }, + { + "epoch": 1.017506791427709, + "grad_norm": 0.08923452347517014, + "learning_rate": 0.00013623910336239104, + "loss": 0.9229, + "step": 1686 + }, + { + "epoch": 1.0181104738907334, + "grad_norm": 0.08911499381065369, + "learning_rate": 0.00013619759236197594, + "loss": 0.7053, + "step": 1687 + }, + { + "epoch": 1.0187141563537578, + "grad_norm": 0.09007196873426437, + "learning_rate": 0.00013615608136156081, + "loss": 0.8376, + "step": 1688 + }, + { + "epoch": 1.0193178388167823, + "grad_norm": 0.08651362359523773, + "learning_rate": 0.0001361145703611457, + "loss": 0.6759, + "step": 1689 + }, + { + "epoch": 1.0199215212798067, + "grad_norm": 0.08241482824087143, + "learning_rate": 0.00013607305936073061, + "loss": 0.6885, + "step": 1690 + }, + { + "epoch": 1.0205252037428312, + "grad_norm": 0.10755082219839096, + "learning_rate": 0.0001360315483603155, + "loss": 0.6961, + "step": 1691 + }, + { + "epoch": 1.0211288862058556, + "grad_norm": 0.09004219621419907, + "learning_rate": 0.0001359900373599004, + "loss": 0.6243, + "step": 1692 + }, + { + "epoch": 1.02173256866888, + "grad_norm": 0.0957816019654274, + "learning_rate": 0.00013594852635948526, + "loss": 0.6462, + "step": 1693 + }, + { + "epoch": 1.0223362511319045, + "grad_norm": 0.1017783135175705, + "learning_rate": 0.00013590701535907016, + "loss": 0.6332, + "step": 1694 + }, + { + "epoch": 1.022939933594929, + "grad_norm": 0.09232212603092194, + "learning_rate": 0.00013586550435865506, + "loss": 0.6461, + "step": 1695 + }, + { + "epoch": 1.0235436160579534, + "grad_norm": 0.09460336714982986, + "learning_rate": 0.00013582399335823993, + "loss": 0.6641, + "step": 1696 + }, + { + "epoch": 1.0241472985209779, + "grad_norm": 0.09913098812103271, + "learning_rate": 0.00013578248235782483, + "loss": 0.6559, + "step": 1697 + }, + { + "epoch": 1.0247509809840025, + "grad_norm": 0.10010208189487457, + "learning_rate": 0.00013574097135740973, + "loss": 0.5921, + "step": 1698 + }, + { + "epoch": 1.025354663447027, + "grad_norm": 0.10247639566659927, + "learning_rate": 0.0001356994603569946, + "loss": 0.6158, + "step": 1699 + }, + { + "epoch": 1.0259583459100514, + "grad_norm": 0.10905825346708298, + "learning_rate": 0.0001356579493565795, + "loss": 0.66, + "step": 1700 + }, + { + "epoch": 1.0265620283730759, + "grad_norm": 0.11569482833147049, + "learning_rate": 0.0001356164383561644, + "loss": 0.6431, + "step": 1701 + }, + { + "epoch": 1.0271657108361003, + "grad_norm": 0.14849194884300232, + "learning_rate": 0.00013557492735574928, + "loss": 0.6045, + "step": 1702 + }, + { + "epoch": 1.0277693932991248, + "grad_norm": 0.11826399713754654, + "learning_rate": 0.00013553341635533415, + "loss": 0.6559, + "step": 1703 + }, + { + "epoch": 1.0283730757621492, + "grad_norm": 0.1232730820775032, + "learning_rate": 0.00013549190535491908, + "loss": 0.5124, + "step": 1704 + }, + { + "epoch": 1.0289767582251736, + "grad_norm": 0.121260866522789, + "learning_rate": 0.00013545039435450395, + "loss": 0.4503, + "step": 1705 + }, + { + "epoch": 1.029580440688198, + "grad_norm": 0.1328078657388687, + "learning_rate": 0.00013540888335408883, + "loss": 0.4254, + "step": 1706 + }, + { + "epoch": 1.0301841231512225, + "grad_norm": 0.13635770976543427, + "learning_rate": 0.00013536737235367373, + "loss": 0.3007, + "step": 1707 + }, + { + "epoch": 1.030787805614247, + "grad_norm": 0.10616832226514816, + "learning_rate": 0.00013532586135325863, + "loss": 0.7256, + "step": 1708 + }, + { + "epoch": 1.0313914880772714, + "grad_norm": 0.10195182263851166, + "learning_rate": 0.0001352843503528435, + "loss": 0.7402, + "step": 1709 + }, + { + "epoch": 1.0319951705402959, + "grad_norm": 0.11944833397865295, + "learning_rate": 0.0001352428393524284, + "loss": 0.757, + "step": 1710 + }, + { + "epoch": 1.0325988530033203, + "grad_norm": 0.12061698734760284, + "learning_rate": 0.0001352013283520133, + "loss": 1.0738, + "step": 1711 + }, + { + "epoch": 1.0332025354663448, + "grad_norm": 0.10394060611724854, + "learning_rate": 0.00013515981735159817, + "loss": 0.7147, + "step": 1712 + }, + { + "epoch": 1.0338062179293692, + "grad_norm": 0.10730050504207611, + "learning_rate": 0.00013511830635118307, + "loss": 0.9586, + "step": 1713 + }, + { + "epoch": 1.0344099003923937, + "grad_norm": 0.09942850470542908, + "learning_rate": 0.00013507679535076797, + "loss": 1.0808, + "step": 1714 + }, + { + "epoch": 1.0350135828554181, + "grad_norm": 0.10033493489027023, + "learning_rate": 0.00013503528435035284, + "loss": 0.7759, + "step": 1715 + }, + { + "epoch": 1.0356172653184426, + "grad_norm": 0.08915812522172928, + "learning_rate": 0.00013499377334993774, + "loss": 0.7716, + "step": 1716 + }, + { + "epoch": 1.036220947781467, + "grad_norm": 0.09855514019727707, + "learning_rate": 0.00013495226234952262, + "loss": 0.8139, + "step": 1717 + }, + { + "epoch": 1.0368246302444915, + "grad_norm": 0.0998401790857315, + "learning_rate": 0.00013491075134910752, + "loss": 0.8308, + "step": 1718 + }, + { + "epoch": 1.037428312707516, + "grad_norm": 0.09024068713188171, + "learning_rate": 0.00013486924034869242, + "loss": 0.7503, + "step": 1719 + }, + { + "epoch": 1.0380319951705403, + "grad_norm": 0.09240993112325668, + "learning_rate": 0.0001348277293482773, + "loss": 0.9344, + "step": 1720 + }, + { + "epoch": 1.0386356776335648, + "grad_norm": 0.09110064059495926, + "learning_rate": 0.0001347862183478622, + "loss": 0.7452, + "step": 1721 + }, + { + "epoch": 1.0392393600965892, + "grad_norm": 0.09389975666999817, + "learning_rate": 0.0001347447073474471, + "loss": 1.061, + "step": 1722 + }, + { + "epoch": 1.0398430425596137, + "grad_norm": 0.10916675627231598, + "learning_rate": 0.00013470319634703196, + "loss": 0.7749, + "step": 1723 + }, + { + "epoch": 1.0404467250226381, + "grad_norm": 0.08076412975788116, + "learning_rate": 0.00013466168534661686, + "loss": 0.5778, + "step": 1724 + }, + { + "epoch": 1.0410504074856626, + "grad_norm": 0.09830320626497269, + "learning_rate": 0.00013462017434620176, + "loss": 0.7115, + "step": 1725 + }, + { + "epoch": 1.041654089948687, + "grad_norm": 0.09751685708761215, + "learning_rate": 0.00013457866334578664, + "loss": 0.7439, + "step": 1726 + }, + { + "epoch": 1.0422577724117115, + "grad_norm": 0.09098626673221588, + "learning_rate": 0.0001345371523453715, + "loss": 0.7062, + "step": 1727 + }, + { + "epoch": 1.042861454874736, + "grad_norm": 0.10907137393951416, + "learning_rate": 0.00013449564134495644, + "loss": 0.888, + "step": 1728 + }, + { + "epoch": 1.0434651373377604, + "grad_norm": 0.09241855889558792, + "learning_rate": 0.0001344541303445413, + "loss": 0.6564, + "step": 1729 + }, + { + "epoch": 1.0440688198007848, + "grad_norm": 0.092192642390728, + "learning_rate": 0.00013441261934412618, + "loss": 0.6394, + "step": 1730 + }, + { + "epoch": 1.0446725022638093, + "grad_norm": 0.08987250179052353, + "learning_rate": 0.00013437110834371108, + "loss": 0.8383, + "step": 1731 + }, + { + "epoch": 1.0452761847268337, + "grad_norm": 0.09562289714813232, + "learning_rate": 0.00013432959734329598, + "loss": 0.6214, + "step": 1732 + }, + { + "epoch": 1.0458798671898581, + "grad_norm": 0.0972490981221199, + "learning_rate": 0.00013428808634288086, + "loss": 0.7911, + "step": 1733 + }, + { + "epoch": 1.0464835496528826, + "grad_norm": 0.10719991475343704, + "learning_rate": 0.00013424657534246576, + "loss": 0.8041, + "step": 1734 + }, + { + "epoch": 1.047087232115907, + "grad_norm": 0.09392597526311874, + "learning_rate": 0.00013420506434205066, + "loss": 0.9184, + "step": 1735 + }, + { + "epoch": 1.0476909145789315, + "grad_norm": 0.09040381014347076, + "learning_rate": 0.00013416355334163556, + "loss": 0.8159, + "step": 1736 + }, + { + "epoch": 1.048294597041956, + "grad_norm": 0.09064770489931107, + "learning_rate": 0.00013412204234122043, + "loss": 0.768, + "step": 1737 + }, + { + "epoch": 1.0488982795049804, + "grad_norm": 0.0895996019244194, + "learning_rate": 0.00013408053134080533, + "loss": 0.6926, + "step": 1738 + }, + { + "epoch": 1.0495019619680048, + "grad_norm": 0.10096043348312378, + "learning_rate": 0.00013403902034039023, + "loss": 0.8543, + "step": 1739 + }, + { + "epoch": 1.0501056444310293, + "grad_norm": 0.09393858164548874, + "learning_rate": 0.0001339975093399751, + "loss": 0.6274, + "step": 1740 + }, + { + "epoch": 1.0507093268940537, + "grad_norm": 0.0904972106218338, + "learning_rate": 0.00013395599833955997, + "loss": 0.636, + "step": 1741 + }, + { + "epoch": 1.0513130093570782, + "grad_norm": 0.09535270184278488, + "learning_rate": 0.0001339144873391449, + "loss": 0.6623, + "step": 1742 + }, + { + "epoch": 1.0519166918201026, + "grad_norm": 0.09259801357984543, + "learning_rate": 0.00013387297633872977, + "loss": 0.659, + "step": 1743 + }, + { + "epoch": 1.052520374283127, + "grad_norm": 0.09762239456176758, + "learning_rate": 0.00013383146533831465, + "loss": 0.6494, + "step": 1744 + }, + { + "epoch": 1.0531240567461515, + "grad_norm": 0.104673370718956, + "learning_rate": 0.00013378995433789955, + "loss": 0.6844, + "step": 1745 + }, + { + "epoch": 1.053727739209176, + "grad_norm": 0.108694888651371, + "learning_rate": 0.00013374844333748445, + "loss": 0.6817, + "step": 1746 + }, + { + "epoch": 1.0543314216722004, + "grad_norm": 0.10462528467178345, + "learning_rate": 0.00013370693233706932, + "loss": 0.6151, + "step": 1747 + }, + { + "epoch": 1.0549351041352248, + "grad_norm": 0.11397890001535416, + "learning_rate": 0.00013366542133665422, + "loss": 0.6114, + "step": 1748 + }, + { + "epoch": 1.0555387865982493, + "grad_norm": 0.10102736949920654, + "learning_rate": 0.00013362391033623912, + "loss": 0.6314, + "step": 1749 + }, + { + "epoch": 1.0561424690612737, + "grad_norm": 0.12406179308891296, + "learning_rate": 0.000133582399335824, + "loss": 0.6278, + "step": 1750 + }, + { + "epoch": 1.0567461515242982, + "grad_norm": 0.11435554176568985, + "learning_rate": 0.0001335408883354089, + "loss": 0.5789, + "step": 1751 + }, + { + "epoch": 1.0573498339873226, + "grad_norm": 0.12788087129592896, + "learning_rate": 0.0001334993773349938, + "loss": 0.5205, + "step": 1752 + }, + { + "epoch": 1.057953516450347, + "grad_norm": 0.1364794224500656, + "learning_rate": 0.00013345786633457867, + "loss": 0.5034, + "step": 1753 + }, + { + "epoch": 1.0585571989133715, + "grad_norm": 0.13836418092250824, + "learning_rate": 0.00013341635533416357, + "loss": 0.5269, + "step": 1754 + }, + { + "epoch": 1.059160881376396, + "grad_norm": 0.14710794389247894, + "learning_rate": 0.00013337484433374844, + "loss": 0.4156, + "step": 1755 + }, + { + "epoch": 1.0597645638394204, + "grad_norm": 0.13123385608196259, + "learning_rate": 0.00013333333333333334, + "loss": 0.32, + "step": 1756 + }, + { + "epoch": 1.0603682463024449, + "grad_norm": 0.15206588804721832, + "learning_rate": 0.00013329182233291824, + "loss": 0.2739, + "step": 1757 + }, + { + "epoch": 1.0609719287654693, + "grad_norm": 0.13307802379131317, + "learning_rate": 0.0001332503113325031, + "loss": 0.8787, + "step": 1758 + }, + { + "epoch": 1.0615756112284938, + "grad_norm": 0.11052402853965759, + "learning_rate": 0.000133208800332088, + "loss": 0.7224, + "step": 1759 + }, + { + "epoch": 1.0621792936915182, + "grad_norm": 0.17755720019340515, + "learning_rate": 0.0001331672893316729, + "loss": 0.9626, + "step": 1760 + }, + { + "epoch": 1.0627829761545426, + "grad_norm": 0.10982788354158401, + "learning_rate": 0.00013312577833125779, + "loss": 0.789, + "step": 1761 + }, + { + "epoch": 1.063386658617567, + "grad_norm": 0.10073138773441315, + "learning_rate": 0.00013308426733084266, + "loss": 0.7448, + "step": 1762 + }, + { + "epoch": 1.0639903410805915, + "grad_norm": 0.09374914318323135, + "learning_rate": 0.00013304275633042759, + "loss": 0.7442, + "step": 1763 + }, + { + "epoch": 1.064594023543616, + "grad_norm": 0.10434511303901672, + "learning_rate": 0.00013300124533001246, + "loss": 0.8159, + "step": 1764 + }, + { + "epoch": 1.0651977060066404, + "grad_norm": 0.11345966905355453, + "learning_rate": 0.00013295973432959733, + "loss": 0.804, + "step": 1765 + }, + { + "epoch": 1.0658013884696649, + "grad_norm": 0.10115978866815567, + "learning_rate": 0.00013291822332918226, + "loss": 0.802, + "step": 1766 + }, + { + "epoch": 1.0664050709326893, + "grad_norm": 0.10045889765024185, + "learning_rate": 0.00013287671232876713, + "loss": 0.7314, + "step": 1767 + }, + { + "epoch": 1.0670087533957138, + "grad_norm": 0.11507212370634079, + "learning_rate": 0.000132835201328352, + "loss": 1.0092, + "step": 1768 + }, + { + "epoch": 1.0676124358587382, + "grad_norm": 0.09701373428106308, + "learning_rate": 0.0001327936903279369, + "loss": 0.9924, + "step": 1769 + }, + { + "epoch": 1.0682161183217627, + "grad_norm": 0.10776803642511368, + "learning_rate": 0.0001327521793275218, + "loss": 0.7889, + "step": 1770 + }, + { + "epoch": 1.068819800784787, + "grad_norm": 0.09671414643526077, + "learning_rate": 0.00013271066832710668, + "loss": 0.7996, + "step": 1771 + }, + { + "epoch": 1.0694234832478116, + "grad_norm": 0.10002996027469635, + "learning_rate": 0.00013266915732669158, + "loss": 0.7149, + "step": 1772 + }, + { + "epoch": 1.070027165710836, + "grad_norm": 0.09639476984739304, + "learning_rate": 0.00013262764632627648, + "loss": 0.9136, + "step": 1773 + }, + { + "epoch": 1.0706308481738604, + "grad_norm": 0.11277900636196136, + "learning_rate": 0.00013258613532586135, + "loss": 0.7176, + "step": 1774 + }, + { + "epoch": 1.071234530636885, + "grad_norm": 0.09514438360929489, + "learning_rate": 0.00013254462432544625, + "loss": 0.7988, + "step": 1775 + }, + { + "epoch": 1.0718382130999093, + "grad_norm": 0.09252556413412094, + "learning_rate": 0.00013250311332503112, + "loss": 0.7631, + "step": 1776 + }, + { + "epoch": 1.072441895562934, + "grad_norm": 0.10669627785682678, + "learning_rate": 0.00013246160232461602, + "loss": 0.8116, + "step": 1777 + }, + { + "epoch": 1.0730455780259582, + "grad_norm": 0.10841196030378342, + "learning_rate": 0.00013242009132420092, + "loss": 0.7423, + "step": 1778 + }, + { + "epoch": 1.073649260488983, + "grad_norm": 0.10168831795454025, + "learning_rate": 0.0001323785803237858, + "loss": 0.7702, + "step": 1779 + }, + { + "epoch": 1.0742529429520071, + "grad_norm": 0.10742917656898499, + "learning_rate": 0.00013233706932337072, + "loss": 0.8964, + "step": 1780 + }, + { + "epoch": 1.0748566254150318, + "grad_norm": 0.09869187325239182, + "learning_rate": 0.0001322955583229556, + "loss": 0.7048, + "step": 1781 + }, + { + "epoch": 1.0754603078780562, + "grad_norm": 0.1356370896100998, + "learning_rate": 0.00013225404732254047, + "loss": 0.9056, + "step": 1782 + }, + { + "epoch": 1.0760639903410807, + "grad_norm": 0.0955684557557106, + "learning_rate": 0.00013221253632212537, + "loss": 0.715, + "step": 1783 + }, + { + "epoch": 1.0766676728041051, + "grad_norm": 0.09964524954557419, + "learning_rate": 0.00013217102532171027, + "loss": 0.7039, + "step": 1784 + }, + { + "epoch": 1.0772713552671296, + "grad_norm": 0.10292885452508926, + "learning_rate": 0.00013212951432129514, + "loss": 0.9838, + "step": 1785 + }, + { + "epoch": 1.077875037730154, + "grad_norm": 0.101532481610775, + "learning_rate": 0.00013208800332088004, + "loss": 0.6629, + "step": 1786 + }, + { + "epoch": 1.0784787201931785, + "grad_norm": 0.09781339764595032, + "learning_rate": 0.00013204649232046494, + "loss": 0.789, + "step": 1787 + }, + { + "epoch": 1.079082402656203, + "grad_norm": 0.10302070528268814, + "learning_rate": 0.00013200498132004982, + "loss": 0.7772, + "step": 1788 + }, + { + "epoch": 1.0796860851192274, + "grad_norm": 0.10221746563911438, + "learning_rate": 0.00013196347031963472, + "loss": 0.8008, + "step": 1789 + }, + { + "epoch": 1.0802897675822518, + "grad_norm": 0.09660208970308304, + "learning_rate": 0.0001319219593192196, + "loss": 0.8582, + "step": 1790 + }, + { + "epoch": 1.0808934500452763, + "grad_norm": 0.09637465327978134, + "learning_rate": 0.0001318804483188045, + "loss": 0.7061, + "step": 1791 + }, + { + "epoch": 1.0814971325083007, + "grad_norm": 0.09803172200918198, + "learning_rate": 0.0001318389373183894, + "loss": 0.6043, + "step": 1792 + }, + { + "epoch": 1.0821008149713252, + "grad_norm": 0.10259698331356049, + "learning_rate": 0.00013179742631797426, + "loss": 0.6007, + "step": 1793 + }, + { + "epoch": 1.0827044974343496, + "grad_norm": 0.10633745789527893, + "learning_rate": 0.00013175591531755916, + "loss": 0.6679, + "step": 1794 + }, + { + "epoch": 1.083308179897374, + "grad_norm": 0.10431138426065445, + "learning_rate": 0.00013171440431714406, + "loss": 0.6999, + "step": 1795 + }, + { + "epoch": 1.0839118623603985, + "grad_norm": 0.10492059588432312, + "learning_rate": 0.00013167289331672893, + "loss": 0.7314, + "step": 1796 + }, + { + "epoch": 1.084515544823423, + "grad_norm": 0.1053004339337349, + "learning_rate": 0.00013163138231631383, + "loss": 0.5873, + "step": 1797 + }, + { + "epoch": 1.0851192272864474, + "grad_norm": 0.10796685516834259, + "learning_rate": 0.00013158987131589873, + "loss": 0.6122, + "step": 1798 + }, + { + "epoch": 1.0857229097494718, + "grad_norm": 0.11339765042066574, + "learning_rate": 0.0001315483603154836, + "loss": 0.6486, + "step": 1799 + }, + { + "epoch": 1.0863265922124963, + "grad_norm": 0.11134713143110275, + "learning_rate": 0.00013150684931506848, + "loss": 0.5925, + "step": 1800 + }, + { + "epoch": 1.0869302746755207, + "grad_norm": 0.12589313089847565, + "learning_rate": 0.0001314653383146534, + "loss": 0.6238, + "step": 1801 + }, + { + "epoch": 1.0875339571385452, + "grad_norm": 0.1222606673836708, + "learning_rate": 0.00013142382731423828, + "loss": 0.5478, + "step": 1802 + }, + { + "epoch": 1.0881376396015696, + "grad_norm": 0.1365615576505661, + "learning_rate": 0.00013138231631382315, + "loss": 0.5323, + "step": 1803 + }, + { + "epoch": 1.088741322064594, + "grad_norm": 0.1305444985628128, + "learning_rate": 0.00013134080531340805, + "loss": 0.5246, + "step": 1804 + }, + { + "epoch": 1.0893450045276185, + "grad_norm": 0.1343401074409485, + "learning_rate": 0.00013129929431299295, + "loss": 0.4212, + "step": 1805 + }, + { + "epoch": 1.089948686990643, + "grad_norm": 0.14103184640407562, + "learning_rate": 0.00013125778331257783, + "loss": 0.4255, + "step": 1806 + }, + { + "epoch": 1.0905523694536674, + "grad_norm": 0.15704935789108276, + "learning_rate": 0.00013121627231216273, + "loss": 0.3559, + "step": 1807 + }, + { + "epoch": 1.0911560519166918, + "grad_norm": 0.4550192058086395, + "learning_rate": 0.00013117476131174763, + "loss": 0.7013, + "step": 1808 + }, + { + "epoch": 1.0917597343797163, + "grad_norm": 0.12131811678409576, + "learning_rate": 0.0001311332503113325, + "loss": 0.7005, + "step": 1809 + }, + { + "epoch": 1.0923634168427407, + "grad_norm": 0.11735139787197113, + "learning_rate": 0.0001310917393109174, + "loss": 0.6871, + "step": 1810 + }, + { + "epoch": 1.0929670993057652, + "grad_norm": 0.11844879388809204, + "learning_rate": 0.0001310502283105023, + "loss": 1.2869, + "step": 1811 + }, + { + "epoch": 1.0935707817687896, + "grad_norm": 0.11029795557260513, + "learning_rate": 0.00013100871731008717, + "loss": 0.7609, + "step": 1812 + }, + { + "epoch": 1.094174464231814, + "grad_norm": 0.11960664391517639, + "learning_rate": 0.00013096720630967207, + "loss": 0.6971, + "step": 1813 + }, + { + "epoch": 1.0947781466948385, + "grad_norm": 0.09530165046453476, + "learning_rate": 0.00013092569530925695, + "loss": 0.7224, + "step": 1814 + }, + { + "epoch": 1.095381829157863, + "grad_norm": 0.1267603486776352, + "learning_rate": 0.00013088418430884185, + "loss": 0.815, + "step": 1815 + }, + { + "epoch": 1.0959855116208874, + "grad_norm": 0.11272140592336655, + "learning_rate": 0.00013084267330842675, + "loss": 0.78, + "step": 1816 + }, + { + "epoch": 1.0965891940839119, + "grad_norm": 0.10290253907442093, + "learning_rate": 0.00013080116230801162, + "loss": 0.7218, + "step": 1817 + }, + { + "epoch": 1.0971928765469363, + "grad_norm": 0.09555505961179733, + "learning_rate": 0.00013075965130759652, + "loss": 0.7541, + "step": 1818 + }, + { + "epoch": 1.0977965590099608, + "grad_norm": 0.10414687544107437, + "learning_rate": 0.00013071814030718142, + "loss": 0.9603, + "step": 1819 + }, + { + "epoch": 1.0984002414729852, + "grad_norm": 0.09657555818557739, + "learning_rate": 0.0001306766293067663, + "loss": 0.7862, + "step": 1820 + }, + { + "epoch": 1.0990039239360097, + "grad_norm": 0.09514836966991425, + "learning_rate": 0.0001306351183063512, + "loss": 0.7408, + "step": 1821 + }, + { + "epoch": 1.099607606399034, + "grad_norm": 0.09766734391450882, + "learning_rate": 0.0001305936073059361, + "loss": 0.7459, + "step": 1822 + }, + { + "epoch": 1.1002112888620585, + "grad_norm": 0.08910214155912399, + "learning_rate": 0.00013055209630552096, + "loss": 1.0229, + "step": 1823 + }, + { + "epoch": 1.100814971325083, + "grad_norm": 0.10597441345453262, + "learning_rate": 0.00013051058530510586, + "loss": 0.7877, + "step": 1824 + }, + { + "epoch": 1.1014186537881074, + "grad_norm": 0.10066650807857513, + "learning_rate": 0.00013046907430469076, + "loss": 0.6925, + "step": 1825 + }, + { + "epoch": 1.1020223362511319, + "grad_norm": 0.11692957580089569, + "learning_rate": 0.00013042756330427564, + "loss": 0.8301, + "step": 1826 + }, + { + "epoch": 1.1026260187141563, + "grad_norm": 0.10520220547914505, + "learning_rate": 0.00013038605230386054, + "loss": 0.8868, + "step": 1827 + }, + { + "epoch": 1.1032297011771808, + "grad_norm": 0.10756828635931015, + "learning_rate": 0.0001303445413034454, + "loss": 0.7164, + "step": 1828 + }, + { + "epoch": 1.1038333836402052, + "grad_norm": 0.10687938332557678, + "learning_rate": 0.0001303030303030303, + "loss": 0.8075, + "step": 1829 + }, + { + "epoch": 1.1044370661032297, + "grad_norm": 0.09681583940982819, + "learning_rate": 0.0001302615193026152, + "loss": 0.7073, + "step": 1830 + }, + { + "epoch": 1.1050407485662541, + "grad_norm": 0.0960078164935112, + "learning_rate": 0.00013022000830220008, + "loss": 0.7665, + "step": 1831 + }, + { + "epoch": 1.1056444310292786, + "grad_norm": 0.10860790312290192, + "learning_rate": 0.00013017849730178498, + "loss": 0.7155, + "step": 1832 + }, + { + "epoch": 1.106248113492303, + "grad_norm": 0.10167111456394196, + "learning_rate": 0.00013013698630136988, + "loss": 0.7887, + "step": 1833 + }, + { + "epoch": 1.1068517959553275, + "grad_norm": 0.09937427192926407, + "learning_rate": 0.00013009547530095476, + "loss": 0.7305, + "step": 1834 + }, + { + "epoch": 1.107455478418352, + "grad_norm": 0.09677271544933319, + "learning_rate": 0.00013005396430053966, + "loss": 0.7064, + "step": 1835 + }, + { + "epoch": 1.1080591608813763, + "grad_norm": 0.10012287646532059, + "learning_rate": 0.00013001245330012456, + "loss": 0.6711, + "step": 1836 + }, + { + "epoch": 1.1086628433444008, + "grad_norm": 0.10501682013273239, + "learning_rate": 0.00012997094229970943, + "loss": 0.8093, + "step": 1837 + }, + { + "epoch": 1.1092665258074252, + "grad_norm": 0.09496523439884186, + "learning_rate": 0.0001299294312992943, + "loss": 0.7329, + "step": 1838 + }, + { + "epoch": 1.1098702082704497, + "grad_norm": 0.09581634402275085, + "learning_rate": 0.00012988792029887923, + "loss": 0.6305, + "step": 1839 + }, + { + "epoch": 1.1104738907334741, + "grad_norm": 0.09801549464464188, + "learning_rate": 0.0001298464092984641, + "loss": 0.5904, + "step": 1840 + }, + { + "epoch": 1.1110775731964986, + "grad_norm": 0.0966857448220253, + "learning_rate": 0.00012980489829804898, + "loss": 0.7596, + "step": 1841 + }, + { + "epoch": 1.111681255659523, + "grad_norm": 0.10019635409116745, + "learning_rate": 0.00012976338729763388, + "loss": 0.6856, + "step": 1842 + }, + { + "epoch": 1.1122849381225475, + "grad_norm": 0.09953883290290833, + "learning_rate": 0.00012972187629721878, + "loss": 0.7132, + "step": 1843 + }, + { + "epoch": 1.112888620585572, + "grad_norm": 0.09991202503442764, + "learning_rate": 0.00012968036529680365, + "loss": 0.613, + "step": 1844 + }, + { + "epoch": 1.1134923030485964, + "grad_norm": 0.09988027811050415, + "learning_rate": 0.00012963885429638855, + "loss": 0.6142, + "step": 1845 + }, + { + "epoch": 1.1140959855116208, + "grad_norm": 0.10957928746938705, + "learning_rate": 0.00012959734329597345, + "loss": 0.6573, + "step": 1846 + }, + { + "epoch": 1.1146996679746453, + "grad_norm": 0.11322573572397232, + "learning_rate": 0.00012955583229555832, + "loss": 0.6891, + "step": 1847 + }, + { + "epoch": 1.1153033504376697, + "grad_norm": 0.11455134302377701, + "learning_rate": 0.00012951432129514322, + "loss": 0.6676, + "step": 1848 + }, + { + "epoch": 1.1159070329006942, + "grad_norm": 0.14914073050022125, + "learning_rate": 0.00012947281029472812, + "loss": 0.6206, + "step": 1849 + }, + { + "epoch": 1.1165107153637186, + "grad_norm": 0.12033107876777649, + "learning_rate": 0.000129431299294313, + "loss": 0.5269, + "step": 1850 + }, + { + "epoch": 1.117114397826743, + "grad_norm": 0.13541612029075623, + "learning_rate": 0.0001293897882938979, + "loss": 0.6263, + "step": 1851 + }, + { + "epoch": 1.1177180802897675, + "grad_norm": 0.12948794662952423, + "learning_rate": 0.00012934827729348277, + "loss": 0.6229, + "step": 1852 + }, + { + "epoch": 1.118321762752792, + "grad_norm": 0.13281860947608948, + "learning_rate": 0.00012930676629306767, + "loss": 0.6001, + "step": 1853 + }, + { + "epoch": 1.1189254452158164, + "grad_norm": 0.14206023514270782, + "learning_rate": 0.00012926525529265257, + "loss": 0.5866, + "step": 1854 + }, + { + "epoch": 1.1195291276788408, + "grad_norm": 0.15054769814014435, + "learning_rate": 0.00012922374429223744, + "loss": 0.4866, + "step": 1855 + }, + { + "epoch": 1.1201328101418653, + "grad_norm": 0.15093187987804413, + "learning_rate": 0.00012918223329182234, + "loss": 0.4194, + "step": 1856 + }, + { + "epoch": 1.1207364926048897, + "grad_norm": 0.14579585194587708, + "learning_rate": 0.00012914072229140724, + "loss": 0.3209, + "step": 1857 + }, + { + "epoch": 1.1213401750679144, + "grad_norm": 0.11571165919303894, + "learning_rate": 0.0001290992112909921, + "loss": 0.8016, + "step": 1858 + }, + { + "epoch": 1.1219438575309386, + "grad_norm": 0.13167442381381989, + "learning_rate": 0.00012905770029057699, + "loss": 0.7073, + "step": 1859 + }, + { + "epoch": 1.1225475399939633, + "grad_norm": 0.09710317105054855, + "learning_rate": 0.0001290161892901619, + "loss": 0.6618, + "step": 1860 + }, + { + "epoch": 1.1231512224569875, + "grad_norm": 0.1127190887928009, + "learning_rate": 0.00012897467828974679, + "loss": 0.7702, + "step": 1861 + }, + { + "epoch": 1.1237549049200122, + "grad_norm": 0.09531667083501816, + "learning_rate": 0.00012893316728933166, + "loss": 0.7331, + "step": 1862 + }, + { + "epoch": 1.1243585873830366, + "grad_norm": 0.10616014897823334, + "learning_rate": 0.0001288916562889166, + "loss": 0.6404, + "step": 1863 + }, + { + "epoch": 1.124962269846061, + "grad_norm": 0.09906172007322311, + "learning_rate": 0.00012885014528850146, + "loss": 0.7295, + "step": 1864 + }, + { + "epoch": 1.1255659523090855, + "grad_norm": 0.10826185345649719, + "learning_rate": 0.00012880863428808633, + "loss": 0.7036, + "step": 1865 + }, + { + "epoch": 1.12616963477211, + "grad_norm": 0.09856102615594864, + "learning_rate": 0.00012876712328767123, + "loss": 0.6587, + "step": 1866 + }, + { + "epoch": 1.1267733172351344, + "grad_norm": 0.11088103801012039, + "learning_rate": 0.00012872561228725613, + "loss": 0.8501, + "step": 1867 + }, + { + "epoch": 1.1273769996981589, + "grad_norm": 0.10369934886693954, + "learning_rate": 0.00012868410128684103, + "loss": 0.9292, + "step": 1868 + }, + { + "epoch": 1.1279806821611833, + "grad_norm": 0.10459262132644653, + "learning_rate": 0.0001286425902864259, + "loss": 0.8489, + "step": 1869 + }, + { + "epoch": 1.1285843646242077, + "grad_norm": 0.10528787970542908, + "learning_rate": 0.0001286010792860108, + "loss": 0.7793, + "step": 1870 + }, + { + "epoch": 1.1291880470872322, + "grad_norm": 0.10074903815984726, + "learning_rate": 0.0001285595682855957, + "loss": 0.7929, + "step": 1871 + }, + { + "epoch": 1.1297917295502566, + "grad_norm": 0.09653479605913162, + "learning_rate": 0.00012851805728518058, + "loss": 0.8025, + "step": 1872 + }, + { + "epoch": 1.130395412013281, + "grad_norm": 0.10775607079267502, + "learning_rate": 0.00012847654628476545, + "loss": 0.9679, + "step": 1873 + }, + { + "epoch": 1.1309990944763055, + "grad_norm": 0.10731203109025955, + "learning_rate": 0.00012843503528435038, + "loss": 0.7715, + "step": 1874 + }, + { + "epoch": 1.13160277693933, + "grad_norm": 0.1124483272433281, + "learning_rate": 0.00012839352428393525, + "loss": 0.8295, + "step": 1875 + }, + { + "epoch": 1.1322064594023544, + "grad_norm": 0.21262244880199432, + "learning_rate": 0.00012835201328352012, + "loss": 1.0585, + "step": 1876 + }, + { + "epoch": 1.1328101418653789, + "grad_norm": 0.10024654865264893, + "learning_rate": 0.00012831050228310505, + "loss": 0.7567, + "step": 1877 + }, + { + "epoch": 1.1334138243284033, + "grad_norm": 0.11772340536117554, + "learning_rate": 0.00012826899128268992, + "loss": 0.8871, + "step": 1878 + }, + { + "epoch": 1.1340175067914278, + "grad_norm": 0.10074224323034286, + "learning_rate": 0.0001282274802822748, + "loss": 0.9096, + "step": 1879 + }, + { + "epoch": 1.1346211892544522, + "grad_norm": 0.1045253723859787, + "learning_rate": 0.0001281859692818597, + "loss": 0.6523, + "step": 1880 + }, + { + "epoch": 1.1352248717174767, + "grad_norm": 0.10628263652324677, + "learning_rate": 0.0001281444582814446, + "loss": 0.6945, + "step": 1881 + }, + { + "epoch": 1.135828554180501, + "grad_norm": 0.09413371235132217, + "learning_rate": 0.00012810294728102947, + "loss": 0.6471, + "step": 1882 + }, + { + "epoch": 1.1364322366435256, + "grad_norm": 0.10187766700983047, + "learning_rate": 0.00012806143628061437, + "loss": 0.7088, + "step": 1883 + }, + { + "epoch": 1.13703591910655, + "grad_norm": 0.10155726969242096, + "learning_rate": 0.00012801992528019927, + "loss": 0.8189, + "step": 1884 + }, + { + "epoch": 1.1376396015695744, + "grad_norm": 0.101055808365345, + "learning_rate": 0.00012797841427978414, + "loss": 0.753, + "step": 1885 + }, + { + "epoch": 1.138243284032599, + "grad_norm": 0.10111107677221298, + "learning_rate": 0.00012793690327936904, + "loss": 0.8097, + "step": 1886 + }, + { + "epoch": 1.1388469664956233, + "grad_norm": 0.10023274272680283, + "learning_rate": 0.00012789539227895392, + "loss": 0.8272, + "step": 1887 + }, + { + "epoch": 1.1394506489586478, + "grad_norm": 0.09850174188613892, + "learning_rate": 0.00012785388127853882, + "loss": 0.9205, + "step": 1888 + }, + { + "epoch": 1.1400543314216722, + "grad_norm": 0.10670360922813416, + "learning_rate": 0.00012781237027812372, + "loss": 0.7138, + "step": 1889 + }, + { + "epoch": 1.1406580138846967, + "grad_norm": 0.10679405182600021, + "learning_rate": 0.0001277708592777086, + "loss": 0.8036, + "step": 1890 + }, + { + "epoch": 1.1412616963477211, + "grad_norm": 0.10706860572099686, + "learning_rate": 0.0001277293482772935, + "loss": 0.6126, + "step": 1891 + }, + { + "epoch": 1.1418653788107456, + "grad_norm": 0.10065101832151413, + "learning_rate": 0.0001276878372768784, + "loss": 0.6491, + "step": 1892 + }, + { + "epoch": 1.14246906127377, + "grad_norm": 0.10702386498451233, + "learning_rate": 0.00012764632627646326, + "loss": 0.7233, + "step": 1893 + }, + { + "epoch": 1.1430727437367945, + "grad_norm": 0.09997487813234329, + "learning_rate": 0.00012760481527604816, + "loss": 0.6635, + "step": 1894 + }, + { + "epoch": 1.143676426199819, + "grad_norm": 0.09972809255123138, + "learning_rate": 0.00012756330427563306, + "loss": 0.5934, + "step": 1895 + }, + { + "epoch": 1.1442801086628434, + "grad_norm": 0.11297734826803207, + "learning_rate": 0.00012752179327521794, + "loss": 0.6151, + "step": 1896 + }, + { + "epoch": 1.1448837911258678, + "grad_norm": 0.15789483487606049, + "learning_rate": 0.0001274802822748028, + "loss": 0.6286, + "step": 1897 + }, + { + "epoch": 1.1454874735888922, + "grad_norm": 0.10323326289653778, + "learning_rate": 0.00012743877127438774, + "loss": 0.5922, + "step": 1898 + }, + { + "epoch": 1.1460911560519167, + "grad_norm": 0.11414547264575958, + "learning_rate": 0.0001273972602739726, + "loss": 0.6048, + "step": 1899 + }, + { + "epoch": 1.1466948385149411, + "grad_norm": 0.11642369627952576, + "learning_rate": 0.00012735574927355748, + "loss": 0.6391, + "step": 1900 + }, + { + "epoch": 1.1472985209779656, + "grad_norm": 0.12589019536972046, + "learning_rate": 0.00012731423827314238, + "loss": 0.6153, + "step": 1901 + }, + { + "epoch": 1.14790220344099, + "grad_norm": 0.12789148092269897, + "learning_rate": 0.00012727272727272728, + "loss": 0.6229, + "step": 1902 + }, + { + "epoch": 1.1485058859040145, + "grad_norm": 0.14213663339614868, + "learning_rate": 0.00012723121627231215, + "loss": 0.579, + "step": 1903 + }, + { + "epoch": 1.149109568367039, + "grad_norm": 0.14148949086666107, + "learning_rate": 0.00012718970527189705, + "loss": 0.5596, + "step": 1904 + }, + { + "epoch": 1.1497132508300634, + "grad_norm": 0.14953409135341644, + "learning_rate": 0.00012714819427148195, + "loss": 0.4723, + "step": 1905 + }, + { + "epoch": 1.1503169332930878, + "grad_norm": 0.1479395627975464, + "learning_rate": 0.00012710668327106683, + "loss": 0.4351, + "step": 1906 + }, + { + "epoch": 1.1509206157561123, + "grad_norm": 0.13544803857803345, + "learning_rate": 0.00012706517227065173, + "loss": 0.3325, + "step": 1907 + }, + { + "epoch": 1.1515242982191367, + "grad_norm": 0.1175408810377121, + "learning_rate": 0.00012702366127023663, + "loss": 0.6767, + "step": 1908 + }, + { + "epoch": 1.1521279806821612, + "grad_norm": 0.209543377161026, + "learning_rate": 0.0001269821502698215, + "loss": 0.9705, + "step": 1909 + }, + { + "epoch": 1.1527316631451856, + "grad_norm": 0.10693584382534027, + "learning_rate": 0.0001269406392694064, + "loss": 0.7096, + "step": 1910 + }, + { + "epoch": 1.15333534560821, + "grad_norm": 0.10303391516208649, + "learning_rate": 0.00012689912826899127, + "loss": 0.7388, + "step": 1911 + }, + { + "epoch": 1.1539390280712345, + "grad_norm": 0.1123664528131485, + "learning_rate": 0.00012685761726857617, + "loss": 0.7372, + "step": 1912 + }, + { + "epoch": 1.154542710534259, + "grad_norm": 0.10753045976161957, + "learning_rate": 0.00012681610626816107, + "loss": 0.7714, + "step": 1913 + }, + { + "epoch": 1.1551463929972834, + "grad_norm": 0.10717937350273132, + "learning_rate": 0.00012677459526774595, + "loss": 0.6893, + "step": 1914 + }, + { + "epoch": 1.1557500754603078, + "grad_norm": 0.09875158965587616, + "learning_rate": 0.00012673308426733085, + "loss": 0.7437, + "step": 1915 + }, + { + "epoch": 1.1563537579233323, + "grad_norm": 0.12408756464719772, + "learning_rate": 0.00012669157326691575, + "loss": 0.7512, + "step": 1916 + }, + { + "epoch": 1.1569574403863567, + "grad_norm": 0.10806753486394882, + "learning_rate": 0.00012665006226650062, + "loss": 0.6687, + "step": 1917 + }, + { + "epoch": 1.1575611228493812, + "grad_norm": 0.10482674092054367, + "learning_rate": 0.00012660855126608552, + "loss": 0.6417, + "step": 1918 + }, + { + "epoch": 1.1581648053124056, + "grad_norm": 0.10658420622348785, + "learning_rate": 0.00012656704026567042, + "loss": 0.876, + "step": 1919 + }, + { + "epoch": 1.15876848777543, + "grad_norm": 0.10012143105268478, + "learning_rate": 0.0001265255292652553, + "loss": 0.6473, + "step": 1920 + }, + { + "epoch": 1.1593721702384545, + "grad_norm": 0.12289892137050629, + "learning_rate": 0.0001264840182648402, + "loss": 0.7602, + "step": 1921 + }, + { + "epoch": 1.159975852701479, + "grad_norm": 0.0990784764289856, + "learning_rate": 0.0001264425072644251, + "loss": 0.6372, + "step": 1922 + }, + { + "epoch": 1.1605795351645034, + "grad_norm": 0.0976581797003746, + "learning_rate": 0.00012640099626400997, + "loss": 0.7036, + "step": 1923 + }, + { + "epoch": 1.1611832176275279, + "grad_norm": 0.11021608114242554, + "learning_rate": 0.00012635948526359487, + "loss": 0.7666, + "step": 1924 + }, + { + "epoch": 1.1617869000905523, + "grad_norm": 0.2441258430480957, + "learning_rate": 0.00012631797426317974, + "loss": 0.7026, + "step": 1925 + }, + { + "epoch": 1.1623905825535767, + "grad_norm": 0.10685604065656662, + "learning_rate": 0.00012627646326276464, + "loss": 0.8768, + "step": 1926 + }, + { + "epoch": 1.1629942650166012, + "grad_norm": 0.10796701908111572, + "learning_rate": 0.00012623495226234954, + "loss": 0.9038, + "step": 1927 + }, + { + "epoch": 1.1635979474796256, + "grad_norm": 0.1107090562582016, + "learning_rate": 0.0001261934412619344, + "loss": 0.7874, + "step": 1928 + }, + { + "epoch": 1.16420162994265, + "grad_norm": 0.10373709350824356, + "learning_rate": 0.0001261519302615193, + "loss": 0.7946, + "step": 1929 + }, + { + "epoch": 1.1648053124056745, + "grad_norm": 0.09496215730905533, + "learning_rate": 0.0001261104192611042, + "loss": 0.688, + "step": 1930 + }, + { + "epoch": 1.165408994868699, + "grad_norm": 0.10575946420431137, + "learning_rate": 0.00012606890826068908, + "loss": 0.7762, + "step": 1931 + }, + { + "epoch": 1.1660126773317234, + "grad_norm": 0.0949690192937851, + "learning_rate": 0.00012602739726027398, + "loss": 0.6066, + "step": 1932 + }, + { + "epoch": 1.1666163597947479, + "grad_norm": 0.12045732140541077, + "learning_rate": 0.00012598588625985888, + "loss": 0.9106, + "step": 1933 + }, + { + "epoch": 1.1672200422577723, + "grad_norm": 0.10367399454116821, + "learning_rate": 0.00012594437525944376, + "loss": 0.7499, + "step": 1934 + }, + { + "epoch": 1.167823724720797, + "grad_norm": 0.09977395832538605, + "learning_rate": 0.00012590286425902863, + "loss": 0.6043, + "step": 1935 + }, + { + "epoch": 1.1684274071838212, + "grad_norm": 0.10659980773925781, + "learning_rate": 0.00012586135325861356, + "loss": 0.8781, + "step": 1936 + }, + { + "epoch": 1.1690310896468459, + "grad_norm": 0.0970248430967331, + "learning_rate": 0.00012581984225819843, + "loss": 0.6159, + "step": 1937 + }, + { + "epoch": 1.16963477210987, + "grad_norm": 0.10996770858764648, + "learning_rate": 0.0001257783312577833, + "loss": 0.7535, + "step": 1938 + }, + { + "epoch": 1.1702384545728948, + "grad_norm": 0.10588741302490234, + "learning_rate": 0.0001257368202573682, + "loss": 0.6802, + "step": 1939 + }, + { + "epoch": 1.170842137035919, + "grad_norm": 0.1041964441537857, + "learning_rate": 0.0001256953092569531, + "loss": 0.731, + "step": 1940 + }, + { + "epoch": 1.1714458194989437, + "grad_norm": 0.10040424019098282, + "learning_rate": 0.00012565379825653798, + "loss": 0.645, + "step": 1941 + }, + { + "epoch": 1.1720495019619679, + "grad_norm": 0.11066184937953949, + "learning_rate": 0.00012561228725612288, + "loss": 0.6855, + "step": 1942 + }, + { + "epoch": 1.1726531844249926, + "grad_norm": 0.10693058371543884, + "learning_rate": 0.00012557077625570778, + "loss": 0.6097, + "step": 1943 + }, + { + "epoch": 1.1732568668880168, + "grad_norm": 0.10633829981088638, + "learning_rate": 0.00012552926525529265, + "loss": 0.6503, + "step": 1944 + }, + { + "epoch": 1.1738605493510414, + "grad_norm": 0.10232945531606674, + "learning_rate": 0.00012548775425487755, + "loss": 0.6123, + "step": 1945 + }, + { + "epoch": 1.174464231814066, + "grad_norm": 0.11123668402433395, + "learning_rate": 0.00012544624325446245, + "loss": 0.574, + "step": 1946 + }, + { + "epoch": 1.1750679142770903, + "grad_norm": 0.11907467991113663, + "learning_rate": 0.00012540473225404732, + "loss": 0.6328, + "step": 1947 + }, + { + "epoch": 1.1756715967401148, + "grad_norm": 0.11772100627422333, + "learning_rate": 0.00012536322125363222, + "loss": 0.5881, + "step": 1948 + }, + { + "epoch": 1.1762752792031392, + "grad_norm": 0.11829902976751328, + "learning_rate": 0.0001253217102532171, + "loss": 0.6201, + "step": 1949 + }, + { + "epoch": 1.1768789616661637, + "grad_norm": 0.13115333020687103, + "learning_rate": 0.000125280199252802, + "loss": 0.5381, + "step": 1950 + }, + { + "epoch": 1.1774826441291881, + "grad_norm": 0.1208442747592926, + "learning_rate": 0.0001252386882523869, + "loss": 0.5937, + "step": 1951 + }, + { + "epoch": 1.1780863265922126, + "grad_norm": 0.13914504647254944, + "learning_rate": 0.00012519717725197177, + "loss": 0.5468, + "step": 1952 + }, + { + "epoch": 1.178690009055237, + "grad_norm": 0.14542445540428162, + "learning_rate": 0.00012515566625155667, + "loss": 0.5401, + "step": 1953 + }, + { + "epoch": 1.1792936915182615, + "grad_norm": 0.14165444672107697, + "learning_rate": 0.00012511415525114157, + "loss": 0.4952, + "step": 1954 + }, + { + "epoch": 1.179897373981286, + "grad_norm": 0.14939822256565094, + "learning_rate": 0.00012507264425072644, + "loss": 0.4343, + "step": 1955 + }, + { + "epoch": 1.1805010564443104, + "grad_norm": 0.14716513454914093, + "learning_rate": 0.00012503113325031131, + "loss": 0.3985, + "step": 1956 + }, + { + "epoch": 1.1811047389073348, + "grad_norm": 0.157403826713562, + "learning_rate": 0.00012498962224989624, + "loss": 0.3312, + "step": 1957 + }, + { + "epoch": 1.1817084213703593, + "grad_norm": 0.12419522553682327, + "learning_rate": 0.00012494811124948111, + "loss": 0.9177, + "step": 1958 + }, + { + "epoch": 1.1823121038333837, + "grad_norm": 0.10016956180334091, + "learning_rate": 0.00012490660024906601, + "loss": 0.8362, + "step": 1959 + }, + { + "epoch": 1.1829157862964081, + "grad_norm": 0.12086569517850876, + "learning_rate": 0.00012486508924865091, + "loss": 0.7991, + "step": 1960 + }, + { + "epoch": 1.1835194687594326, + "grad_norm": 0.11325091868638992, + "learning_rate": 0.0001248235782482358, + "loss": 0.7633, + "step": 1961 + }, + { + "epoch": 1.184123151222457, + "grad_norm": 0.11552873998880386, + "learning_rate": 0.0001247820672478207, + "loss": 0.7515, + "step": 1962 + }, + { + "epoch": 1.1847268336854815, + "grad_norm": 0.09600497037172318, + "learning_rate": 0.00012474055624740556, + "loss": 0.698, + "step": 1963 + }, + { + "epoch": 1.185330516148506, + "grad_norm": 0.0974835678935051, + "learning_rate": 0.00012469904524699046, + "loss": 0.6602, + "step": 1964 + }, + { + "epoch": 1.1859341986115304, + "grad_norm": 0.23812483251094818, + "learning_rate": 0.00012465753424657536, + "loss": 0.7079, + "step": 1965 + }, + { + "epoch": 1.1865378810745548, + "grad_norm": 0.09787070006132126, + "learning_rate": 0.00012461602324616023, + "loss": 0.752, + "step": 1966 + }, + { + "epoch": 1.1871415635375793, + "grad_norm": 0.10244850814342499, + "learning_rate": 0.00012457451224574513, + "loss": 0.8414, + "step": 1967 + }, + { + "epoch": 1.1877452460006037, + "grad_norm": 0.11027634143829346, + "learning_rate": 0.00012453300124533003, + "loss": 0.8229, + "step": 1968 + }, + { + "epoch": 1.1883489284636282, + "grad_norm": 0.1009831577539444, + "learning_rate": 0.0001244914902449149, + "loss": 0.9911, + "step": 1969 + }, + { + "epoch": 1.1889526109266526, + "grad_norm": 0.10704313963651657, + "learning_rate": 0.00012444997924449978, + "loss": 0.7212, + "step": 1970 + }, + { + "epoch": 1.189556293389677, + "grad_norm": 0.10770639777183533, + "learning_rate": 0.0001244084682440847, + "loss": 0.7613, + "step": 1971 + }, + { + "epoch": 1.1901599758527015, + "grad_norm": 0.10859589278697968, + "learning_rate": 0.00012436695724366958, + "loss": 0.8025, + "step": 1972 + }, + { + "epoch": 1.190763658315726, + "grad_norm": 0.09854567050933838, + "learning_rate": 0.00012432544624325445, + "loss": 0.6613, + "step": 1973 + }, + { + "epoch": 1.1913673407787504, + "grad_norm": 0.10922684520483017, + "learning_rate": 0.00012428393524283938, + "loss": 0.6675, + "step": 1974 + }, + { + "epoch": 1.1919710232417748, + "grad_norm": 0.11279135942459106, + "learning_rate": 0.00012424242424242425, + "loss": 0.7562, + "step": 1975 + }, + { + "epoch": 1.1925747057047993, + "grad_norm": 0.10514727234840393, + "learning_rate": 0.00012420091324200913, + "loss": 0.6954, + "step": 1976 + }, + { + "epoch": 1.1931783881678237, + "grad_norm": 0.0970945954322815, + "learning_rate": 0.00012415940224159403, + "loss": 0.8071, + "step": 1977 + }, + { + "epoch": 1.1937820706308482, + "grad_norm": 0.11609134823083878, + "learning_rate": 0.00012411789124117893, + "loss": 0.8732, + "step": 1978 + }, + { + "epoch": 1.1943857530938726, + "grad_norm": 0.10586526989936829, + "learning_rate": 0.0001240763802407638, + "loss": 0.6324, + "step": 1979 + }, + { + "epoch": 1.194989435556897, + "grad_norm": 0.11197761446237564, + "learning_rate": 0.0001240348692403487, + "loss": 0.7699, + "step": 1980 + }, + { + "epoch": 1.1955931180199215, + "grad_norm": 0.11370430141687393, + "learning_rate": 0.0001239933582399336, + "loss": 0.8104, + "step": 1981 + }, + { + "epoch": 1.196196800482946, + "grad_norm": 0.0997292697429657, + "learning_rate": 0.00012395184723951847, + "loss": 0.7494, + "step": 1982 + }, + { + "epoch": 1.1968004829459704, + "grad_norm": 0.10209706425666809, + "learning_rate": 0.00012391033623910337, + "loss": 0.6899, + "step": 1983 + }, + { + "epoch": 1.1974041654089949, + "grad_norm": 0.09956377744674683, + "learning_rate": 0.00012386882523868824, + "loss": 0.7469, + "step": 1984 + }, + { + "epoch": 1.1980078478720193, + "grad_norm": 0.12185216695070267, + "learning_rate": 0.00012382731423827314, + "loss": 1.0278, + "step": 1985 + }, + { + "epoch": 1.1986115303350438, + "grad_norm": 0.16837285459041595, + "learning_rate": 0.00012378580323785804, + "loss": 0.8619, + "step": 1986 + }, + { + "epoch": 1.1992152127980682, + "grad_norm": 0.10831692814826965, + "learning_rate": 0.00012374429223744292, + "loss": 0.6662, + "step": 1987 + }, + { + "epoch": 1.1998188952610926, + "grad_norm": 0.09968318045139313, + "learning_rate": 0.00012370278123702782, + "loss": 0.7561, + "step": 1988 + }, + { + "epoch": 1.200422577724117, + "grad_norm": 0.09389069676399231, + "learning_rate": 0.00012366127023661272, + "loss": 0.6967, + "step": 1989 + }, + { + "epoch": 1.2010262601871415, + "grad_norm": 0.10120224207639694, + "learning_rate": 0.0001236197592361976, + "loss": 0.7008, + "step": 1990 + }, + { + "epoch": 1.201629942650166, + "grad_norm": 0.1032261773943901, + "learning_rate": 0.0001235782482357825, + "loss": 0.6327, + "step": 1991 + }, + { + "epoch": 1.2022336251131904, + "grad_norm": 0.10337409377098083, + "learning_rate": 0.0001235367372353674, + "loss": 0.6639, + "step": 1992 + }, + { + "epoch": 1.2028373075762149, + "grad_norm": 0.1101367250084877, + "learning_rate": 0.00012349522623495226, + "loss": 0.6232, + "step": 1993 + }, + { + "epoch": 1.2034409900392393, + "grad_norm": 0.1051281988620758, + "learning_rate": 0.00012345371523453714, + "loss": 0.6032, + "step": 1994 + }, + { + "epoch": 1.2040446725022638, + "grad_norm": 0.10944084078073502, + "learning_rate": 0.00012341220423412206, + "loss": 0.635, + "step": 1995 + }, + { + "epoch": 1.2046483549652882, + "grad_norm": 0.10979338735342026, + "learning_rate": 0.00012337069323370694, + "loss": 0.5944, + "step": 1996 + }, + { + "epoch": 1.2052520374283127, + "grad_norm": 0.11819171160459518, + "learning_rate": 0.0001233291822332918, + "loss": 0.6785, + "step": 1997 + }, + { + "epoch": 1.205855719891337, + "grad_norm": 0.12030988931655884, + "learning_rate": 0.0001232876712328767, + "loss": 0.6699, + "step": 1998 + }, + { + "epoch": 1.2064594023543616, + "grad_norm": 0.12456244975328445, + "learning_rate": 0.0001232461602324616, + "loss": 0.6065, + "step": 1999 + }, + { + "epoch": 1.207063084817386, + "grad_norm": 0.12571309506893158, + "learning_rate": 0.00012320464923204648, + "loss": 0.5504, + "step": 2000 + }, + { + "epoch": 1.207063084817386, + "eval_loss": 0.8049178123474121, + "eval_runtime": 1222.2521, + "eval_samples_per_second": 2.283, + "eval_steps_per_second": 0.286, + "step": 2000 + }, + { + "epoch": 1.2076667672804104, + "grad_norm": 0.12835955619812012, + "learning_rate": 0.00012316313823163138, + "loss": 0.6229, + "step": 2001 + }, + { + "epoch": 1.208270449743435, + "grad_norm": 0.12921744585037231, + "learning_rate": 0.00012312162723121628, + "loss": 0.5465, + "step": 2002 + }, + { + "epoch": 1.2088741322064593, + "grad_norm": 0.139405757188797, + "learning_rate": 0.00012308011623080118, + "loss": 0.5361, + "step": 2003 + }, + { + "epoch": 1.2094778146694838, + "grad_norm": 0.14658528566360474, + "learning_rate": 0.00012303860523038605, + "loss": 0.4847, + "step": 2004 + }, + { + "epoch": 1.2100814971325082, + "grad_norm": 0.15193478763103485, + "learning_rate": 0.00012299709422997095, + "loss": 0.4933, + "step": 2005 + }, + { + "epoch": 1.2106851795955327, + "grad_norm": 0.15982936322689056, + "learning_rate": 0.00012295558322955586, + "loss": 0.4008, + "step": 2006 + }, + { + "epoch": 1.2112888620585571, + "grad_norm": 0.15444207191467285, + "learning_rate": 0.00012291407222914073, + "loss": 0.3094, + "step": 2007 + }, + { + "epoch": 1.2118925445215816, + "grad_norm": 0.11019429564476013, + "learning_rate": 0.0001228725612287256, + "loss": 0.808, + "step": 2008 + }, + { + "epoch": 1.212496226984606, + "grad_norm": 0.11978921294212341, + "learning_rate": 0.00012283105022831053, + "loss": 0.7044, + "step": 2009 + }, + { + "epoch": 1.2130999094476305, + "grad_norm": 0.10817541182041168, + "learning_rate": 0.0001227895392278954, + "loss": 0.7133, + "step": 2010 + }, + { + "epoch": 1.213703591910655, + "grad_norm": 0.10823164880275726, + "learning_rate": 0.00012274802822748027, + "loss": 0.7497, + "step": 2011 + }, + { + "epoch": 1.2143072743736794, + "grad_norm": 0.10769373923540115, + "learning_rate": 0.0001227065172270652, + "loss": 0.9813, + "step": 2012 + }, + { + "epoch": 1.2149109568367038, + "grad_norm": 0.11639753729104996, + "learning_rate": 0.00012266500622665007, + "loss": 0.8259, + "step": 2013 + }, + { + "epoch": 1.2155146392997283, + "grad_norm": 0.10166247934103012, + "learning_rate": 0.00012262349522623495, + "loss": 0.6783, + "step": 2014 + }, + { + "epoch": 1.2161183217627527, + "grad_norm": 0.10975832492113113, + "learning_rate": 0.00012258198422581985, + "loss": 0.8338, + "step": 2015 + }, + { + "epoch": 1.2167220042257774, + "grad_norm": 0.12426348030567169, + "learning_rate": 0.00012254047322540475, + "loss": 0.7086, + "step": 2016 + }, + { + "epoch": 1.2173256866888016, + "grad_norm": 0.09932038187980652, + "learning_rate": 0.00012249896222498962, + "loss": 0.7193, + "step": 2017 + }, + { + "epoch": 1.2179293691518263, + "grad_norm": 0.11152757704257965, + "learning_rate": 0.00012245745122457452, + "loss": 0.7668, + "step": 2018 + }, + { + "epoch": 1.2185330516148505, + "grad_norm": 0.1197136715054512, + "learning_rate": 0.00012241594022415942, + "loss": 1.1513, + "step": 2019 + }, + { + "epoch": 1.2191367340778752, + "grad_norm": 0.12218881398439407, + "learning_rate": 0.0001223744292237443, + "loss": 0.5988, + "step": 2020 + }, + { + "epoch": 1.2197404165408994, + "grad_norm": 0.10389691591262817, + "learning_rate": 0.0001223329182233292, + "loss": 0.7079, + "step": 2021 + }, + { + "epoch": 1.220344099003924, + "grad_norm": 0.10646260529756546, + "learning_rate": 0.00012229140722291407, + "loss": 0.6731, + "step": 2022 + }, + { + "epoch": 1.2209477814669483, + "grad_norm": 0.11292260885238647, + "learning_rate": 0.00012224989622249897, + "loss": 0.7529, + "step": 2023 + }, + { + "epoch": 1.221551463929973, + "grad_norm": 0.11751693487167358, + "learning_rate": 0.00012220838522208387, + "loss": 0.873, + "step": 2024 + }, + { + "epoch": 1.2221551463929972, + "grad_norm": 0.10794904083013535, + "learning_rate": 0.00012216687422166874, + "loss": 0.8017, + "step": 2025 + }, + { + "epoch": 1.2227588288560218, + "grad_norm": 0.10329126566648483, + "learning_rate": 0.00012212536322125364, + "loss": 0.8863, + "step": 2026 + }, + { + "epoch": 1.2233625113190463, + "grad_norm": 0.10396264493465424, + "learning_rate": 0.00012208385222083854, + "loss": 0.6399, + "step": 2027 + }, + { + "epoch": 1.2239661937820707, + "grad_norm": 0.12621258199214935, + "learning_rate": 0.00012204234122042341, + "loss": 0.7813, + "step": 2028 + }, + { + "epoch": 1.2245698762450952, + "grad_norm": 0.11045660823583603, + "learning_rate": 0.0001220008302200083, + "loss": 0.7743, + "step": 2029 + }, + { + "epoch": 1.2251735587081196, + "grad_norm": 0.11149439960718155, + "learning_rate": 0.0001219593192195932, + "loss": 0.788, + "step": 2030 + }, + { + "epoch": 1.225777241171144, + "grad_norm": 0.11239415407180786, + "learning_rate": 0.00012191780821917808, + "loss": 1.0028, + "step": 2031 + }, + { + "epoch": 1.2263809236341685, + "grad_norm": 0.11321971565485, + "learning_rate": 0.00012187629721876297, + "loss": 0.7841, + "step": 2032 + }, + { + "epoch": 1.226984606097193, + "grad_norm": 0.10027159005403519, + "learning_rate": 0.00012183478621834787, + "loss": 0.728, + "step": 2033 + }, + { + "epoch": 1.2275882885602174, + "grad_norm": 0.09943227469921112, + "learning_rate": 0.00012179327521793276, + "loss": 0.7397, + "step": 2034 + }, + { + "epoch": 1.2281919710232418, + "grad_norm": 0.09892190247774124, + "learning_rate": 0.00012175176421751764, + "loss": 0.806, + "step": 2035 + }, + { + "epoch": 1.2287956534862663, + "grad_norm": 0.10850216448307037, + "learning_rate": 0.00012171025321710254, + "loss": 0.8292, + "step": 2036 + }, + { + "epoch": 1.2293993359492907, + "grad_norm": 0.102396160364151, + "learning_rate": 0.00012166874221668743, + "loss": 0.7506, + "step": 2037 + }, + { + "epoch": 1.2300030184123152, + "grad_norm": 0.12359751015901566, + "learning_rate": 0.0001216272312162723, + "loss": 0.6433, + "step": 2038 + }, + { + "epoch": 1.2306067008753396, + "grad_norm": 0.10474561899900436, + "learning_rate": 0.00012158572021585722, + "loss": 0.7758, + "step": 2039 + }, + { + "epoch": 1.231210383338364, + "grad_norm": 0.10111495852470398, + "learning_rate": 0.00012154420921544209, + "loss": 0.6707, + "step": 2040 + }, + { + "epoch": 1.2318140658013885, + "grad_norm": 0.10861996561288834, + "learning_rate": 0.00012150269821502698, + "loss": 0.6847, + "step": 2041 + }, + { + "epoch": 1.232417748264413, + "grad_norm": 0.10541880130767822, + "learning_rate": 0.00012146118721461188, + "loss": 0.6607, + "step": 2042 + }, + { + "epoch": 1.2330214307274374, + "grad_norm": 0.11882983893156052, + "learning_rate": 0.00012141967621419676, + "loss": 0.6679, + "step": 2043 + }, + { + "epoch": 1.2336251131904619, + "grad_norm": 0.10306091606616974, + "learning_rate": 0.00012137816521378165, + "loss": 0.6325, + "step": 2044 + }, + { + "epoch": 1.2342287956534863, + "grad_norm": 0.11227353662252426, + "learning_rate": 0.00012133665421336655, + "loss": 0.6578, + "step": 2045 + }, + { + "epoch": 1.2348324781165108, + "grad_norm": 0.11392517387866974, + "learning_rate": 0.00012129514321295144, + "loss": 0.6137, + "step": 2046 + }, + { + "epoch": 1.2354361605795352, + "grad_norm": 0.117860347032547, + "learning_rate": 0.00012125363221253634, + "loss": 0.6464, + "step": 2047 + }, + { + "epoch": 1.2360398430425596, + "grad_norm": 0.11536695808172226, + "learning_rate": 0.00012121212121212122, + "loss": 0.5613, + "step": 2048 + }, + { + "epoch": 1.236643525505584, + "grad_norm": 0.12926429510116577, + "learning_rate": 0.00012117061021170611, + "loss": 0.588, + "step": 2049 + }, + { + "epoch": 1.2372472079686085, + "grad_norm": 0.12854550778865814, + "learning_rate": 0.00012112909921129101, + "loss": 0.643, + "step": 2050 + }, + { + "epoch": 1.237850890431633, + "grad_norm": 0.12910380959510803, + "learning_rate": 0.0001210875882108759, + "loss": 0.6016, + "step": 2051 + }, + { + "epoch": 1.2384545728946574, + "grad_norm": 0.1356167048215866, + "learning_rate": 0.00012104607721046077, + "loss": 0.5593, + "step": 2052 + }, + { + "epoch": 1.2390582553576819, + "grad_norm": 0.13904690742492676, + "learning_rate": 0.00012100456621004568, + "loss": 0.5238, + "step": 2053 + }, + { + "epoch": 1.2396619378207063, + "grad_norm": 0.14844262599945068, + "learning_rate": 0.00012096305520963056, + "loss": 0.5047, + "step": 2054 + }, + { + "epoch": 1.2402656202837308, + "grad_norm": 0.14652326703071594, + "learning_rate": 0.00012092154420921544, + "loss": 0.4741, + "step": 2055 + }, + { + "epoch": 1.2408693027467552, + "grad_norm": 0.15764220058918, + "learning_rate": 0.00012088003320880034, + "loss": 0.3982, + "step": 2056 + }, + { + "epoch": 1.2414729852097797, + "grad_norm": 0.146280899643898, + "learning_rate": 0.00012083852220838523, + "loss": 0.3012, + "step": 2057 + }, + { + "epoch": 1.2420766676728041, + "grad_norm": 0.13532495498657227, + "learning_rate": 0.00012079701120797011, + "loss": 0.7918, + "step": 2058 + }, + { + "epoch": 1.2426803501358286, + "grad_norm": 0.11222409456968307, + "learning_rate": 0.00012075550020755501, + "loss": 0.7279, + "step": 2059 + }, + { + "epoch": 1.243284032598853, + "grad_norm": 0.12232893705368042, + "learning_rate": 0.0001207139892071399, + "loss": 0.8694, + "step": 2060 + }, + { + "epoch": 1.2438877150618775, + "grad_norm": 0.11951932311058044, + "learning_rate": 0.00012067247820672479, + "loss": 0.6634, + "step": 2061 + }, + { + "epoch": 1.244491397524902, + "grad_norm": 0.10638052970170975, + "learning_rate": 0.00012063096720630969, + "loss": 0.7437, + "step": 2062 + }, + { + "epoch": 1.2450950799879263, + "grad_norm": 0.10910008102655411, + "learning_rate": 0.00012058945620589457, + "loss": 0.8246, + "step": 2063 + }, + { + "epoch": 1.2456987624509508, + "grad_norm": 0.12002308666706085, + "learning_rate": 0.00012054794520547945, + "loss": 0.8136, + "step": 2064 + }, + { + "epoch": 1.2463024449139752, + "grad_norm": 0.10318660736083984, + "learning_rate": 0.00012050643420506436, + "loss": 0.9517, + "step": 2065 + }, + { + "epoch": 1.2469061273769997, + "grad_norm": 0.10524383932352066, + "learning_rate": 0.00012046492320464923, + "loss": 0.8019, + "step": 2066 + }, + { + "epoch": 1.2475098098400241, + "grad_norm": 0.10615126043558121, + "learning_rate": 0.00012042341220423412, + "loss": 0.732, + "step": 2067 + }, + { + "epoch": 1.2481134923030486, + "grad_norm": 0.10820569843053818, + "learning_rate": 0.00012038190120381902, + "loss": 0.9747, + "step": 2068 + }, + { + "epoch": 1.248717174766073, + "grad_norm": 0.10168500244617462, + "learning_rate": 0.0001203403902034039, + "loss": 0.7427, + "step": 2069 + }, + { + "epoch": 1.2493208572290975, + "grad_norm": 0.11007564514875412, + "learning_rate": 0.0001202988792029888, + "loss": 0.6818, + "step": 2070 + }, + { + "epoch": 1.249924539692122, + "grad_norm": 0.10033000260591507, + "learning_rate": 0.0001202573682025737, + "loss": 0.7134, + "step": 2071 + }, + { + "epoch": 1.2505282221551464, + "grad_norm": 0.11808007955551147, + "learning_rate": 0.00012021585720215858, + "loss": 0.7718, + "step": 2072 + }, + { + "epoch": 1.2511319046181708, + "grad_norm": 0.11681666970252991, + "learning_rate": 0.00012017434620174347, + "loss": 0.7327, + "step": 2073 + }, + { + "epoch": 1.2517355870811953, + "grad_norm": 0.11521414667367935, + "learning_rate": 0.00012013283520132837, + "loss": 0.754, + "step": 2074 + }, + { + "epoch": 1.2523392695442197, + "grad_norm": 0.09546779096126556, + "learning_rate": 0.00012009132420091325, + "loss": 0.7399, + "step": 2075 + }, + { + "epoch": 1.2529429520072441, + "grad_norm": 0.10306226462125778, + "learning_rate": 0.00012004981320049813, + "loss": 0.682, + "step": 2076 + }, + { + "epoch": 1.2535466344702686, + "grad_norm": 0.11089253425598145, + "learning_rate": 0.00012000830220008304, + "loss": 0.868, + "step": 2077 + }, + { + "epoch": 1.254150316933293, + "grad_norm": 0.12641939520835876, + "learning_rate": 0.00011996679119966791, + "loss": 0.8043, + "step": 2078 + }, + { + "epoch": 1.2547539993963175, + "grad_norm": 0.11239389330148697, + "learning_rate": 0.0001199252801992528, + "loss": 0.6766, + "step": 2079 + }, + { + "epoch": 1.255357681859342, + "grad_norm": 0.11025764048099518, + "learning_rate": 0.0001198837691988377, + "loss": 0.7516, + "step": 2080 + }, + { + "epoch": 1.2559613643223664, + "grad_norm": 0.10319259762763977, + "learning_rate": 0.00011984225819842259, + "loss": 0.791, + "step": 2081 + }, + { + "epoch": 1.2565650467853908, + "grad_norm": 0.1244928389787674, + "learning_rate": 0.00011980074719800747, + "loss": 0.8266, + "step": 2082 + }, + { + "epoch": 1.2571687292484153, + "grad_norm": 0.09494119137525558, + "learning_rate": 0.00011975923619759237, + "loss": 0.8277, + "step": 2083 + }, + { + "epoch": 1.2577724117114397, + "grad_norm": 0.11088497191667557, + "learning_rate": 0.00011971772519717726, + "loss": 0.7471, + "step": 2084 + }, + { + "epoch": 1.2583760941744642, + "grad_norm": 0.10349318385124207, + "learning_rate": 0.00011967621419676213, + "loss": 0.8844, + "step": 2085 + }, + { + "epoch": 1.2589797766374886, + "grad_norm": 0.09668376296758652, + "learning_rate": 0.00011963470319634704, + "loss": 0.7249, + "step": 2086 + }, + { + "epoch": 1.259583459100513, + "grad_norm": 0.11646587401628494, + "learning_rate": 0.00011959319219593193, + "loss": 0.9458, + "step": 2087 + }, + { + "epoch": 1.2601871415635375, + "grad_norm": 0.11103519797325134, + "learning_rate": 0.0001195516811955168, + "loss": 0.8437, + "step": 2088 + }, + { + "epoch": 1.260790824026562, + "grad_norm": 0.12291695177555084, + "learning_rate": 0.00011951017019510172, + "loss": 0.8271, + "step": 2089 + }, + { + "epoch": 1.2613945064895864, + "grad_norm": 0.10509349405765533, + "learning_rate": 0.00011946865919468659, + "loss": 0.7225, + "step": 2090 + }, + { + "epoch": 1.2619981889526108, + "grad_norm": 0.09915387630462646, + "learning_rate": 0.0001194271481942715, + "loss": 0.5739, + "step": 2091 + }, + { + "epoch": 1.2626018714156353, + "grad_norm": 0.10708332806825638, + "learning_rate": 0.00011938563719385638, + "loss": 0.6239, + "step": 2092 + }, + { + "epoch": 1.26320555387866, + "grad_norm": 0.10963302105665207, + "learning_rate": 0.00011934412619344126, + "loss": 0.6454, + "step": 2093 + }, + { + "epoch": 1.2638092363416842, + "grad_norm": 0.10599739104509354, + "learning_rate": 0.00011930261519302616, + "loss": 0.6203, + "step": 2094 + }, + { + "epoch": 1.2644129188047089, + "grad_norm": 0.11157579720020294, + "learning_rate": 0.00011926110419261105, + "loss": 0.689, + "step": 2095 + }, + { + "epoch": 1.265016601267733, + "grad_norm": 0.11258841305971146, + "learning_rate": 0.00011921959319219594, + "loss": 0.5814, + "step": 2096 + }, + { + "epoch": 1.2656202837307577, + "grad_norm": 0.11933767050504684, + "learning_rate": 0.00011917808219178084, + "loss": 0.6181, + "step": 2097 + }, + { + "epoch": 1.266223966193782, + "grad_norm": 0.1172049269080162, + "learning_rate": 0.00011913657119136572, + "loss": 0.6099, + "step": 2098 + }, + { + "epoch": 1.2668276486568066, + "grad_norm": 0.1134132593870163, + "learning_rate": 0.0001190950601909506, + "loss": 0.6055, + "step": 2099 + }, + { + "epoch": 1.2674313311198309, + "grad_norm": 0.12405924499034882, + "learning_rate": 0.00011905354919053551, + "loss": 0.5902, + "step": 2100 + }, + { + "epoch": 1.2680350135828555, + "grad_norm": 0.13423797488212585, + "learning_rate": 0.0001190120381901204, + "loss": 0.6814, + "step": 2101 + }, + { + "epoch": 1.2686386960458798, + "grad_norm": 0.13356977701187134, + "learning_rate": 0.00011897052718970527, + "loss": 0.6092, + "step": 2102 + }, + { + "epoch": 1.2692423785089044, + "grad_norm": 0.13142277300357819, + "learning_rate": 0.00011892901618929018, + "loss": 0.4849, + "step": 2103 + }, + { + "epoch": 1.2698460609719286, + "grad_norm": 0.1493036448955536, + "learning_rate": 0.00011888750518887506, + "loss": 0.5734, + "step": 2104 + }, + { + "epoch": 1.2704497434349533, + "grad_norm": 0.14654318988323212, + "learning_rate": 0.00011884599418845994, + "loss": 0.4648, + "step": 2105 + }, + { + "epoch": 1.2710534258979775, + "grad_norm": 0.1532881259918213, + "learning_rate": 0.00011880448318804484, + "loss": 0.3987, + "step": 2106 + }, + { + "epoch": 1.2716571083610022, + "grad_norm": 0.16462919116020203, + "learning_rate": 0.00011876297218762973, + "loss": 0.3358, + "step": 2107 + }, + { + "epoch": 1.2722607908240264, + "grad_norm": 0.11268685758113861, + "learning_rate": 0.00011872146118721462, + "loss": 0.6965, + "step": 2108 + }, + { + "epoch": 1.272864473287051, + "grad_norm": 0.11173847317695618, + "learning_rate": 0.00011867995018679952, + "loss": 0.9126, + "step": 2109 + }, + { + "epoch": 1.2734681557500753, + "grad_norm": 0.10636541247367859, + "learning_rate": 0.0001186384391863844, + "loss": 0.8093, + "step": 2110 + }, + { + "epoch": 1.2740718382131, + "grad_norm": 0.11559230089187622, + "learning_rate": 0.00011859692818596927, + "loss": 0.7025, + "step": 2111 + }, + { + "epoch": 1.2746755206761244, + "grad_norm": 0.11961853504180908, + "learning_rate": 0.00011855541718555419, + "loss": 0.7638, + "step": 2112 + }, + { + "epoch": 1.2752792031391489, + "grad_norm": 0.11241693049669266, + "learning_rate": 0.00011851390618513906, + "loss": 0.7422, + "step": 2113 + }, + { + "epoch": 1.2758828856021733, + "grad_norm": 0.11459380388259888, + "learning_rate": 0.00011847239518472395, + "loss": 1.0325, + "step": 2114 + }, + { + "epoch": 1.2764865680651978, + "grad_norm": 0.1107303723692894, + "learning_rate": 0.00011843088418430886, + "loss": 0.6566, + "step": 2115 + }, + { + "epoch": 1.2770902505282222, + "grad_norm": 0.11201652884483337, + "learning_rate": 0.00011838937318389373, + "loss": 0.7305, + "step": 2116 + }, + { + "epoch": 1.2776939329912467, + "grad_norm": 0.1064920499920845, + "learning_rate": 0.00011834786218347862, + "loss": 0.6963, + "step": 2117 + }, + { + "epoch": 1.2782976154542711, + "grad_norm": 0.11608701199293137, + "learning_rate": 0.00011830635118306352, + "loss": 0.7162, + "step": 2118 + }, + { + "epoch": 1.2789012979172956, + "grad_norm": 0.10472101718187332, + "learning_rate": 0.00011826484018264841, + "loss": 0.683, + "step": 2119 + }, + { + "epoch": 1.27950498038032, + "grad_norm": 0.12958820164203644, + "learning_rate": 0.0001182233291822333, + "loss": 0.8274, + "step": 2120 + }, + { + "epoch": 1.2801086628433445, + "grad_norm": 0.1009947806596756, + "learning_rate": 0.0001181818181818182, + "loss": 0.6998, + "step": 2121 + }, + { + "epoch": 1.280712345306369, + "grad_norm": 0.11905679106712341, + "learning_rate": 0.00011814030718140308, + "loss": 0.7364, + "step": 2122 + }, + { + "epoch": 1.2813160277693934, + "grad_norm": 0.10903461277484894, + "learning_rate": 0.00011809879618098795, + "loss": 0.7088, + "step": 2123 + }, + { + "epoch": 1.2819197102324178, + "grad_norm": 0.11340314149856567, + "learning_rate": 0.00011805728518057287, + "loss": 0.8662, + "step": 2124 + }, + { + "epoch": 1.2825233926954422, + "grad_norm": 0.10403577238321304, + "learning_rate": 0.00011801577418015774, + "loss": 0.8047, + "step": 2125 + }, + { + "epoch": 1.2831270751584667, + "grad_norm": 0.12345468252897263, + "learning_rate": 0.00011797426317974263, + "loss": 0.9449, + "step": 2126 + }, + { + "epoch": 1.2837307576214911, + "grad_norm": 0.11308707296848297, + "learning_rate": 0.00011793275217932753, + "loss": 0.7631, + "step": 2127 + }, + { + "epoch": 1.2843344400845156, + "grad_norm": 0.10067742317914963, + "learning_rate": 0.00011789124117891241, + "loss": 0.7624, + "step": 2128 + }, + { + "epoch": 1.28493812254754, + "grad_norm": 0.11228195577859879, + "learning_rate": 0.0001178497301784973, + "loss": 1.1616, + "step": 2129 + }, + { + "epoch": 1.2855418050105645, + "grad_norm": 0.10720618814229965, + "learning_rate": 0.0001178082191780822, + "loss": 1.3342, + "step": 2130 + }, + { + "epoch": 1.286145487473589, + "grad_norm": 0.10839608311653137, + "learning_rate": 0.00011776670817766709, + "loss": 0.9353, + "step": 2131 + }, + { + "epoch": 1.2867491699366134, + "grad_norm": 0.10751800239086151, + "learning_rate": 0.00011772519717725197, + "loss": 0.7667, + "step": 2132 + }, + { + "epoch": 1.2873528523996378, + "grad_norm": 0.10154768079519272, + "learning_rate": 0.00011768368617683687, + "loss": 0.794, + "step": 2133 + }, + { + "epoch": 1.2879565348626623, + "grad_norm": 0.10651742666959763, + "learning_rate": 0.00011764217517642176, + "loss": 0.7428, + "step": 2134 + }, + { + "epoch": 1.2885602173256867, + "grad_norm": 0.1084497794508934, + "learning_rate": 0.00011760066417600663, + "loss": 0.7594, + "step": 2135 + }, + { + "epoch": 1.2891638997887112, + "grad_norm": 0.11117968708276749, + "learning_rate": 0.00011755915317559155, + "loss": 0.7812, + "step": 2136 + }, + { + "epoch": 1.2897675822517356, + "grad_norm": 0.112984798848629, + "learning_rate": 0.00011751764217517642, + "loss": 0.6982, + "step": 2137 + }, + { + "epoch": 1.29037126471476, + "grad_norm": 0.10472138971090317, + "learning_rate": 0.00011747613117476133, + "loss": 0.6542, + "step": 2138 + }, + { + "epoch": 1.2909749471777845, + "grad_norm": 0.12380527704954147, + "learning_rate": 0.0001174346201743462, + "loss": 0.7346, + "step": 2139 + }, + { + "epoch": 1.291578629640809, + "grad_norm": 0.10141772031784058, + "learning_rate": 0.00011739310917393109, + "loss": 0.6896, + "step": 2140 + }, + { + "epoch": 1.2921823121038334, + "grad_norm": 0.10354811698198318, + "learning_rate": 0.00011735159817351599, + "loss": 0.6743, + "step": 2141 + }, + { + "epoch": 1.2927859945668578, + "grad_norm": 0.10275600850582123, + "learning_rate": 0.00011731008717310088, + "loss": 0.6862, + "step": 2142 + }, + { + "epoch": 1.2933896770298823, + "grad_norm": 0.10765067487955093, + "learning_rate": 0.00011726857617268576, + "loss": 0.6372, + "step": 2143 + }, + { + "epoch": 1.2939933594929067, + "grad_norm": 0.10741368681192398, + "learning_rate": 0.00011722706517227066, + "loss": 0.6173, + "step": 2144 + }, + { + "epoch": 1.2945970419559312, + "grad_norm": 0.11568807065486908, + "learning_rate": 0.00011718555417185555, + "loss": 0.7006, + "step": 2145 + }, + { + "epoch": 1.2952007244189556, + "grad_norm": 0.12116066366434097, + "learning_rate": 0.00011714404317144044, + "loss": 0.678, + "step": 2146 + }, + { + "epoch": 1.29580440688198, + "grad_norm": 0.1318727433681488, + "learning_rate": 0.00011710253217102534, + "loss": 0.6605, + "step": 2147 + }, + { + "epoch": 1.2964080893450045, + "grad_norm": 0.1201576441526413, + "learning_rate": 0.00011706102117061022, + "loss": 0.6411, + "step": 2148 + }, + { + "epoch": 1.297011771808029, + "grad_norm": 0.12639600038528442, + "learning_rate": 0.0001170195101701951, + "loss": 0.6679, + "step": 2149 + }, + { + "epoch": 1.2976154542710534, + "grad_norm": 0.12824396789073944, + "learning_rate": 0.00011697799916978001, + "loss": 0.5963, + "step": 2150 + }, + { + "epoch": 1.2982191367340778, + "grad_norm": 0.1317749321460724, + "learning_rate": 0.00011693648816936488, + "loss": 0.6115, + "step": 2151 + }, + { + "epoch": 1.2988228191971023, + "grad_norm": 0.14059890806674957, + "learning_rate": 0.00011689497716894977, + "loss": 0.5661, + "step": 2152 + }, + { + "epoch": 1.2994265016601267, + "grad_norm": 0.14957548677921295, + "learning_rate": 0.00011685346616853467, + "loss": 0.616, + "step": 2153 + }, + { + "epoch": 1.3000301841231512, + "grad_norm": 0.15705609321594238, + "learning_rate": 0.00011681195516811956, + "loss": 0.5566, + "step": 2154 + }, + { + "epoch": 1.3006338665861756, + "grad_norm": 0.14872965216636658, + "learning_rate": 0.00011677044416770444, + "loss": 0.4887, + "step": 2155 + }, + { + "epoch": 1.3012375490492, + "grad_norm": 0.1517983227968216, + "learning_rate": 0.00011672893316728934, + "loss": 0.4465, + "step": 2156 + }, + { + "epoch": 1.3018412315122245, + "grad_norm": 0.15609890222549438, + "learning_rate": 0.00011668742216687423, + "loss": 0.4065, + "step": 2157 + }, + { + "epoch": 1.302444913975249, + "grad_norm": 0.11183691769838333, + "learning_rate": 0.00011664591116645912, + "loss": 0.6092, + "step": 2158 + }, + { + "epoch": 1.3030485964382734, + "grad_norm": 0.10590233653783798, + "learning_rate": 0.00011660440016604402, + "loss": 0.9179, + "step": 2159 + }, + { + "epoch": 1.3036522789012979, + "grad_norm": 0.11257438361644745, + "learning_rate": 0.0001165628891656289, + "loss": 0.7511, + "step": 2160 + }, + { + "epoch": 1.3042559613643223, + "grad_norm": 0.11735875904560089, + "learning_rate": 0.00011652137816521377, + "loss": 0.8342, + "step": 2161 + }, + { + "epoch": 1.3048596438273468, + "grad_norm": 0.12115509063005447, + "learning_rate": 0.00011647986716479869, + "loss": 0.6921, + "step": 2162 + }, + { + "epoch": 1.3054633262903712, + "grad_norm": 0.10376877337694168, + "learning_rate": 0.00011643835616438356, + "loss": 0.5839, + "step": 2163 + }, + { + "epoch": 1.3060670087533957, + "grad_norm": 0.10715942084789276, + "learning_rate": 0.00011639684516396845, + "loss": 0.7511, + "step": 2164 + }, + { + "epoch": 1.30667069121642, + "grad_norm": 0.1192198321223259, + "learning_rate": 0.00011635533416355335, + "loss": 0.7023, + "step": 2165 + }, + { + "epoch": 1.3072743736794445, + "grad_norm": 0.10451126098632812, + "learning_rate": 0.00011631382316313823, + "loss": 0.642, + "step": 2166 + }, + { + "epoch": 1.307878056142469, + "grad_norm": 0.10327422618865967, + "learning_rate": 0.00011627231216272312, + "loss": 0.7792, + "step": 2167 + }, + { + "epoch": 1.3084817386054934, + "grad_norm": 0.10710068792104721, + "learning_rate": 0.00011623080116230802, + "loss": 0.705, + "step": 2168 + }, + { + "epoch": 1.3090854210685179, + "grad_norm": 0.10589960962533951, + "learning_rate": 0.00011618929016189291, + "loss": 0.6591, + "step": 2169 + }, + { + "epoch": 1.3096891035315423, + "grad_norm": 0.10254230350255966, + "learning_rate": 0.0001161477791614778, + "loss": 0.723, + "step": 2170 + }, + { + "epoch": 1.3102927859945668, + "grad_norm": 0.10744060575962067, + "learning_rate": 0.0001161062681610627, + "loss": 0.7218, + "step": 2171 + }, + { + "epoch": 1.3108964684575912, + "grad_norm": 0.11566371470689774, + "learning_rate": 0.00011606475716064758, + "loss": 0.6746, + "step": 2172 + }, + { + "epoch": 1.3115001509206157, + "grad_norm": 0.10865778475999832, + "learning_rate": 0.00011602324616023245, + "loss": 0.706, + "step": 2173 + }, + { + "epoch": 1.3121038333836403, + "grad_norm": 0.11026459187269211, + "learning_rate": 0.00011598173515981737, + "loss": 0.7467, + "step": 2174 + }, + { + "epoch": 1.3127075158466646, + "grad_norm": 0.1640511453151703, + "learning_rate": 0.00011594022415940224, + "loss": 0.754, + "step": 2175 + }, + { + "epoch": 1.3133111983096892, + "grad_norm": 0.11436708271503448, + "learning_rate": 0.00011589871315898713, + "loss": 0.8731, + "step": 2176 + }, + { + "epoch": 1.3139148807727135, + "grad_norm": 0.11194545775651932, + "learning_rate": 0.00011585720215857203, + "loss": 0.8425, + "step": 2177 + }, + { + "epoch": 1.3145185632357381, + "grad_norm": 0.10926983505487442, + "learning_rate": 0.00011581569115815691, + "loss": 0.695, + "step": 2178 + }, + { + "epoch": 1.3151222456987623, + "grad_norm": 0.11056378483772278, + "learning_rate": 0.0001157741801577418, + "loss": 0.7545, + "step": 2179 + }, + { + "epoch": 1.315725928161787, + "grad_norm": 0.11646270751953125, + "learning_rate": 0.0001157326691573267, + "loss": 0.697, + "step": 2180 + }, + { + "epoch": 1.3163296106248112, + "grad_norm": 0.096307173371315, + "learning_rate": 0.00011569115815691159, + "loss": 0.7044, + "step": 2181 + }, + { + "epoch": 1.316933293087836, + "grad_norm": 0.10772639513015747, + "learning_rate": 0.00011564964715649649, + "loss": 0.6897, + "step": 2182 + }, + { + "epoch": 1.3175369755508601, + "grad_norm": 0.10433688759803772, + "learning_rate": 0.00011560813615608137, + "loss": 0.7438, + "step": 2183 + }, + { + "epoch": 1.3181406580138848, + "grad_norm": 0.10850182920694351, + "learning_rate": 0.00011556662515566626, + "loss": 0.712, + "step": 2184 + }, + { + "epoch": 1.318744340476909, + "grad_norm": 0.11556683480739594, + "learning_rate": 0.00011552511415525116, + "loss": 0.7897, + "step": 2185 + }, + { + "epoch": 1.3193480229399337, + "grad_norm": 0.11677438765764236, + "learning_rate": 0.00011548360315483605, + "loss": 0.9773, + "step": 2186 + }, + { + "epoch": 1.319951705402958, + "grad_norm": 0.10189380496740341, + "learning_rate": 0.00011544209215442092, + "loss": 0.7935, + "step": 2187 + }, + { + "epoch": 1.3205553878659826, + "grad_norm": 0.10953009128570557, + "learning_rate": 0.00011540058115400583, + "loss": 0.7774, + "step": 2188 + }, + { + "epoch": 1.3211590703290068, + "grad_norm": 0.10907643288373947, + "learning_rate": 0.0001153590701535907, + "loss": 0.6565, + "step": 2189 + }, + { + "epoch": 1.3217627527920315, + "grad_norm": 0.10476800054311752, + "learning_rate": 0.00011531755915317559, + "loss": 0.6466, + "step": 2190 + }, + { + "epoch": 1.3223664352550557, + "grad_norm": 0.11148995906114578, + "learning_rate": 0.00011527604815276049, + "loss": 0.5893, + "step": 2191 + }, + { + "epoch": 1.3229701177180804, + "grad_norm": 0.10962202399969101, + "learning_rate": 0.00011523453715234538, + "loss": 0.6037, + "step": 2192 + }, + { + "epoch": 1.3235738001811048, + "grad_norm": 0.11205828934907913, + "learning_rate": 0.00011519302615193026, + "loss": 0.6462, + "step": 2193 + }, + { + "epoch": 1.3241774826441293, + "grad_norm": 0.11916244029998779, + "learning_rate": 0.00011515151515151516, + "loss": 0.6124, + "step": 2194 + }, + { + "epoch": 1.3247811651071537, + "grad_norm": 0.11426917463541031, + "learning_rate": 0.00011511000415110005, + "loss": 0.6288, + "step": 2195 + }, + { + "epoch": 1.3253848475701782, + "grad_norm": 0.11942315101623535, + "learning_rate": 0.00011506849315068494, + "loss": 0.6146, + "step": 2196 + }, + { + "epoch": 1.3259885300332026, + "grad_norm": 0.1271522343158722, + "learning_rate": 0.00011502698215026984, + "loss": 0.6174, + "step": 2197 + }, + { + "epoch": 1.326592212496227, + "grad_norm": 0.12273116409778595, + "learning_rate": 0.00011498547114985472, + "loss": 0.6165, + "step": 2198 + }, + { + "epoch": 1.3271958949592515, + "grad_norm": 0.12854193150997162, + "learning_rate": 0.0001149439601494396, + "loss": 0.6234, + "step": 2199 + }, + { + "epoch": 1.327799577422276, + "grad_norm": 0.12989364564418793, + "learning_rate": 0.00011490244914902451, + "loss": 0.5936, + "step": 2200 + }, + { + "epoch": 1.3284032598853004, + "grad_norm": 0.13210608065128326, + "learning_rate": 0.00011486093814860938, + "loss": 0.5591, + "step": 2201 + }, + { + "epoch": 1.3290069423483248, + "grad_norm": 0.1452115774154663, + "learning_rate": 0.00011481942714819427, + "loss": 0.5945, + "step": 2202 + }, + { + "epoch": 1.3296106248113493, + "grad_norm": 0.14136159420013428, + "learning_rate": 0.00011477791614777917, + "loss": 0.5191, + "step": 2203 + }, + { + "epoch": 1.3302143072743737, + "grad_norm": 0.1478489339351654, + "learning_rate": 0.00011473640514736406, + "loss": 0.4752, + "step": 2204 + }, + { + "epoch": 1.3308179897373982, + "grad_norm": 0.15822367370128632, + "learning_rate": 0.00011469489414694894, + "loss": 0.4601, + "step": 2205 + }, + { + "epoch": 1.3314216722004226, + "grad_norm": 0.1557387262582779, + "learning_rate": 0.00011465338314653384, + "loss": 0.4091, + "step": 2206 + }, + { + "epoch": 1.332025354663447, + "grad_norm": 0.15302599966526031, + "learning_rate": 0.00011461187214611873, + "loss": 0.3174, + "step": 2207 + }, + { + "epoch": 1.3326290371264715, + "grad_norm": 0.10820896923542023, + "learning_rate": 0.0001145703611457036, + "loss": 0.852, + "step": 2208 + }, + { + "epoch": 1.333232719589496, + "grad_norm": 0.10538190603256226, + "learning_rate": 0.00011452885014528852, + "loss": 0.6777, + "step": 2209 + }, + { + "epoch": 1.3338364020525204, + "grad_norm": 0.11455769091844559, + "learning_rate": 0.0001144873391448734, + "loss": 0.7658, + "step": 2210 + }, + { + "epoch": 1.3344400845155449, + "grad_norm": 0.12761631608009338, + "learning_rate": 0.00011444582814445828, + "loss": 0.7603, + "step": 2211 + }, + { + "epoch": 1.3350437669785693, + "grad_norm": 0.12457510828971863, + "learning_rate": 0.00011440431714404319, + "loss": 0.8002, + "step": 2212 + }, + { + "epoch": 1.3356474494415937, + "grad_norm": 0.11467002332210541, + "learning_rate": 0.00011436280614362806, + "loss": 0.7013, + "step": 2213 + }, + { + "epoch": 1.3362511319046182, + "grad_norm": 0.10846070200204849, + "learning_rate": 0.00011432129514321295, + "loss": 0.8422, + "step": 2214 + }, + { + "epoch": 1.3368548143676426, + "grad_norm": 0.11062701046466827, + "learning_rate": 0.00011427978414279785, + "loss": 0.691, + "step": 2215 + }, + { + "epoch": 1.337458496830667, + "grad_norm": 0.10322251170873642, + "learning_rate": 0.00011423827314238273, + "loss": 0.7036, + "step": 2216 + }, + { + "epoch": 1.3380621792936915, + "grad_norm": 0.09952748566865921, + "learning_rate": 0.00011419676214196762, + "loss": 0.6875, + "step": 2217 + }, + { + "epoch": 1.338665861756716, + "grad_norm": 0.11490236967802048, + "learning_rate": 0.00011415525114155252, + "loss": 0.6921, + "step": 2218 + }, + { + "epoch": 1.3392695442197404, + "grad_norm": 0.1203780472278595, + "learning_rate": 0.00011411374014113741, + "loss": 0.7415, + "step": 2219 + }, + { + "epoch": 1.3398732266827649, + "grad_norm": 0.11478865891695023, + "learning_rate": 0.00011407222914072228, + "loss": 0.7476, + "step": 2220 + }, + { + "epoch": 1.3404769091457893, + "grad_norm": 0.10916067659854889, + "learning_rate": 0.0001140307181403072, + "loss": 0.7539, + "step": 2221 + }, + { + "epoch": 1.3410805916088138, + "grad_norm": 0.11967265605926514, + "learning_rate": 0.00011398920713989207, + "loss": 0.834, + "step": 2222 + }, + { + "epoch": 1.3416842740718382, + "grad_norm": 0.12789706885814667, + "learning_rate": 0.00011394769613947695, + "loss": 0.7649, + "step": 2223 + }, + { + "epoch": 1.3422879565348627, + "grad_norm": 0.11024732142686844, + "learning_rate": 0.00011390618513906187, + "loss": 0.8246, + "step": 2224 + }, + { + "epoch": 1.342891638997887, + "grad_norm": 0.10574118793010712, + "learning_rate": 0.00011386467413864674, + "loss": 0.6421, + "step": 2225 + }, + { + "epoch": 1.3434953214609116, + "grad_norm": 0.1084161251783371, + "learning_rate": 0.00011382316313823165, + "loss": 0.8431, + "step": 2226 + }, + { + "epoch": 1.344099003923936, + "grad_norm": 0.112635038793087, + "learning_rate": 0.00011378165213781653, + "loss": 0.7057, + "step": 2227 + }, + { + "epoch": 1.3447026863869604, + "grad_norm": 0.11063949763774872, + "learning_rate": 0.00011374014113740141, + "loss": 0.7764, + "step": 2228 + }, + { + "epoch": 1.345306368849985, + "grad_norm": 0.1124119758605957, + "learning_rate": 0.00011369863013698631, + "loss": 0.9151, + "step": 2229 + }, + { + "epoch": 1.3459100513130093, + "grad_norm": 0.1197538748383522, + "learning_rate": 0.0001136571191365712, + "loss": 0.6366, + "step": 2230 + }, + { + "epoch": 1.3465137337760338, + "grad_norm": 0.1143086701631546, + "learning_rate": 0.00011361560813615609, + "loss": 1.0755, + "step": 2231 + }, + { + "epoch": 1.3471174162390582, + "grad_norm": 0.1160961166024208, + "learning_rate": 0.00011357409713574099, + "loss": 0.7346, + "step": 2232 + }, + { + "epoch": 1.3477210987020827, + "grad_norm": 0.1389804482460022, + "learning_rate": 0.00011353258613532587, + "loss": 0.7935, + "step": 2233 + }, + { + "epoch": 1.3483247811651071, + "grad_norm": 0.10732017457485199, + "learning_rate": 0.00011349107513491075, + "loss": 1.1968, + "step": 2234 + }, + { + "epoch": 1.3489284636281316, + "grad_norm": 0.10339702665805817, + "learning_rate": 0.00011344956413449566, + "loss": 0.674, + "step": 2235 + }, + { + "epoch": 1.349532146091156, + "grad_norm": 0.10818582028150558, + "learning_rate": 0.00011340805313408053, + "loss": 0.7626, + "step": 2236 + }, + { + "epoch": 1.3501358285541805, + "grad_norm": 0.1044488251209259, + "learning_rate": 0.00011336654213366542, + "loss": 0.719, + "step": 2237 + }, + { + "epoch": 1.350739511017205, + "grad_norm": 0.20112541317939758, + "learning_rate": 0.00011332503113325033, + "loss": 0.5892, + "step": 2238 + }, + { + "epoch": 1.3513431934802294, + "grad_norm": 0.10670458525419235, + "learning_rate": 0.0001132835201328352, + "loss": 0.6436, + "step": 2239 + }, + { + "epoch": 1.3519468759432538, + "grad_norm": 0.11629335582256317, + "learning_rate": 0.00011324200913242009, + "loss": 1.0747, + "step": 2240 + }, + { + "epoch": 1.3525505584062782, + "grad_norm": 0.14042681455612183, + "learning_rate": 0.00011320049813200499, + "loss": 0.6851, + "step": 2241 + }, + { + "epoch": 1.3531542408693027, + "grad_norm": 0.10726059973239899, + "learning_rate": 0.00011315898713158988, + "loss": 0.6776, + "step": 2242 + }, + { + "epoch": 1.3537579233323271, + "grad_norm": 0.10431487113237381, + "learning_rate": 0.00011311747613117476, + "loss": 0.5565, + "step": 2243 + }, + { + "epoch": 1.3543616057953516, + "grad_norm": 0.11104051023721695, + "learning_rate": 0.00011307596513075966, + "loss": 0.6523, + "step": 2244 + }, + { + "epoch": 1.354965288258376, + "grad_norm": 0.1104440838098526, + "learning_rate": 0.00011303445413034455, + "loss": 0.6657, + "step": 2245 + }, + { + "epoch": 1.3555689707214005, + "grad_norm": 0.11780980974435806, + "learning_rate": 0.00011299294312992942, + "loss": 0.6437, + "step": 2246 + }, + { + "epoch": 1.356172653184425, + "grad_norm": 0.11362163722515106, + "learning_rate": 0.00011295143212951434, + "loss": 0.6607, + "step": 2247 + }, + { + "epoch": 1.3567763356474494, + "grad_norm": 0.12305210530757904, + "learning_rate": 0.00011290992112909921, + "loss": 0.6784, + "step": 2248 + }, + { + "epoch": 1.3573800181104738, + "grad_norm": 0.12743736803531647, + "learning_rate": 0.0001128684101286841, + "loss": 0.6501, + "step": 2249 + }, + { + "epoch": 1.3579837005734983, + "grad_norm": 0.12665079534053802, + "learning_rate": 0.000112826899128269, + "loss": 0.6922, + "step": 2250 + }, + { + "epoch": 1.3585873830365227, + "grad_norm": 0.12639720737934113, + "learning_rate": 0.00011278538812785388, + "loss": 0.5699, + "step": 2251 + }, + { + "epoch": 1.3591910654995472, + "grad_norm": 0.12889716029167175, + "learning_rate": 0.00011274387712743877, + "loss": 0.5657, + "step": 2252 + }, + { + "epoch": 1.3597947479625718, + "grad_norm": 0.14307141304016113, + "learning_rate": 0.00011270236612702367, + "loss": 0.5307, + "step": 2253 + }, + { + "epoch": 1.360398430425596, + "grad_norm": 0.13598784804344177, + "learning_rate": 0.00011266085512660856, + "loss": 0.4857, + "step": 2254 + }, + { + "epoch": 1.3610021128886207, + "grad_norm": 0.150955468416214, + "learning_rate": 0.00011261934412619344, + "loss": 0.4596, + "step": 2255 + }, + { + "epoch": 1.361605795351645, + "grad_norm": 0.14729629456996918, + "learning_rate": 0.00011257783312577834, + "loss": 0.3831, + "step": 2256 + }, + { + "epoch": 1.3622094778146696, + "grad_norm": 0.15139052271842957, + "learning_rate": 0.00011253632212536323, + "loss": 0.3392, + "step": 2257 + }, + { + "epoch": 1.3628131602776938, + "grad_norm": 0.10566865652799606, + "learning_rate": 0.0001124948111249481, + "loss": 0.6181, + "step": 2258 + }, + { + "epoch": 1.3634168427407185, + "grad_norm": 0.1218709796667099, + "learning_rate": 0.00011245330012453302, + "loss": 0.7976, + "step": 2259 + }, + { + "epoch": 1.3640205252037427, + "grad_norm": 0.11592400819063187, + "learning_rate": 0.00011241178912411789, + "loss": 0.8621, + "step": 2260 + }, + { + "epoch": 1.3646242076667674, + "grad_norm": 0.1122564896941185, + "learning_rate": 0.00011237027812370278, + "loss": 0.6828, + "step": 2261 + }, + { + "epoch": 1.3652278901297916, + "grad_norm": 0.1253998577594757, + "learning_rate": 0.00011232876712328768, + "loss": 0.7367, + "step": 2262 + }, + { + "epoch": 1.3658315725928163, + "grad_norm": 0.11441956460475922, + "learning_rate": 0.00011228725612287256, + "loss": 0.7129, + "step": 2263 + }, + { + "epoch": 1.3664352550558405, + "grad_norm": 0.18277932703495026, + "learning_rate": 0.00011224574512245745, + "loss": 0.7095, + "step": 2264 + }, + { + "epoch": 1.3670389375188652, + "grad_norm": 0.10515805333852768, + "learning_rate": 0.00011220423412204235, + "loss": 0.6439, + "step": 2265 + }, + { + "epoch": 1.3676426199818894, + "grad_norm": 0.10004261136054993, + "learning_rate": 0.00011216272312162724, + "loss": 0.7665, + "step": 2266 + }, + { + "epoch": 1.368246302444914, + "grad_norm": 0.1079486608505249, + "learning_rate": 0.00011212121212121212, + "loss": 0.6854, + "step": 2267 + }, + { + "epoch": 1.3688499849079383, + "grad_norm": 0.11932551115751266, + "learning_rate": 0.00011207970112079702, + "loss": 0.7683, + "step": 2268 + }, + { + "epoch": 1.369453667370963, + "grad_norm": 0.13196608424186707, + "learning_rate": 0.00011203819012038191, + "loss": 0.8149, + "step": 2269 + }, + { + "epoch": 1.3700573498339872, + "grad_norm": 0.1305970698595047, + "learning_rate": 0.00011199667911996681, + "loss": 0.8266, + "step": 2270 + }, + { + "epoch": 1.3706610322970119, + "grad_norm": 0.10973062366247177, + "learning_rate": 0.0001119551681195517, + "loss": 0.6498, + "step": 2271 + }, + { + "epoch": 1.371264714760036, + "grad_norm": 0.11326909810304642, + "learning_rate": 0.00011191365711913657, + "loss": 0.7068, + "step": 2272 + }, + { + "epoch": 1.3718683972230608, + "grad_norm": 0.1138983964920044, + "learning_rate": 0.00011187214611872148, + "loss": 0.8917, + "step": 2273 + }, + { + "epoch": 1.3724720796860852, + "grad_norm": 0.10338523238897324, + "learning_rate": 0.00011183063511830635, + "loss": 0.6048, + "step": 2274 + }, + { + "epoch": 1.3730757621491096, + "grad_norm": 0.13740071654319763, + "learning_rate": 0.00011178912411789124, + "loss": 0.7794, + "step": 2275 + }, + { + "epoch": 1.373679444612134, + "grad_norm": 0.11318357288837433, + "learning_rate": 0.00011174761311747614, + "loss": 0.7224, + "step": 2276 + }, + { + "epoch": 1.3742831270751585, + "grad_norm": 0.10989907383918762, + "learning_rate": 0.00011170610211706103, + "loss": 0.7604, + "step": 2277 + }, + { + "epoch": 1.374886809538183, + "grad_norm": 0.11369361728429794, + "learning_rate": 0.00011166459111664591, + "loss": 0.6629, + "step": 2278 + }, + { + "epoch": 1.3754904920012074, + "grad_norm": 0.11444233357906342, + "learning_rate": 0.00011162308011623081, + "loss": 0.6644, + "step": 2279 + }, + { + "epoch": 1.3760941744642319, + "grad_norm": 0.11095111072063446, + "learning_rate": 0.0001115815691158157, + "loss": 0.8084, + "step": 2280 + }, + { + "epoch": 1.3766978569272563, + "grad_norm": 0.11327888071537018, + "learning_rate": 0.00011154005811540059, + "loss": 0.9121, + "step": 2281 + }, + { + "epoch": 1.3773015393902808, + "grad_norm": 0.11659125983715057, + "learning_rate": 0.00011149854711498549, + "loss": 0.7475, + "step": 2282 + }, + { + "epoch": 1.3779052218533052, + "grad_norm": 0.11424364149570465, + "learning_rate": 0.00011145703611457037, + "loss": 0.9011, + "step": 2283 + }, + { + "epoch": 1.3785089043163297, + "grad_norm": 0.11273805052042007, + "learning_rate": 0.00011141552511415525, + "loss": 0.6689, + "step": 2284 + }, + { + "epoch": 1.379112586779354, + "grad_norm": 0.1099931076169014, + "learning_rate": 0.00011137401411374016, + "loss": 0.7201, + "step": 2285 + }, + { + "epoch": 1.3797162692423786, + "grad_norm": 0.11520043015480042, + "learning_rate": 0.00011133250311332503, + "loss": 0.6714, + "step": 2286 + }, + { + "epoch": 1.380319951705403, + "grad_norm": 0.10492314398288727, + "learning_rate": 0.00011129099211290992, + "loss": 0.8564, + "step": 2287 + }, + { + "epoch": 1.3809236341684274, + "grad_norm": 0.10921221226453781, + "learning_rate": 0.00011124948111249482, + "loss": 0.7088, + "step": 2288 + }, + { + "epoch": 1.381527316631452, + "grad_norm": 0.11280515789985657, + "learning_rate": 0.0001112079701120797, + "loss": 1.1316, + "step": 2289 + }, + { + "epoch": 1.3821309990944763, + "grad_norm": 0.10194700956344604, + "learning_rate": 0.00011116645911166459, + "loss": 1.0434, + "step": 2290 + }, + { + "epoch": 1.3827346815575008, + "grad_norm": 0.11088079959154129, + "learning_rate": 0.00011112494811124949, + "loss": 0.663, + "step": 2291 + }, + { + "epoch": 1.3833383640205252, + "grad_norm": 0.10625745356082916, + "learning_rate": 0.00011108343711083438, + "loss": 0.6609, + "step": 2292 + }, + { + "epoch": 1.3839420464835497, + "grad_norm": 0.11164417862892151, + "learning_rate": 0.00011104192611041927, + "loss": 0.646, + "step": 2293 + }, + { + "epoch": 1.3845457289465741, + "grad_norm": 0.11619970202445984, + "learning_rate": 0.00011100041511000417, + "loss": 0.6225, + "step": 2294 + }, + { + "epoch": 1.3851494114095986, + "grad_norm": 0.11362091451883316, + "learning_rate": 0.00011095890410958905, + "loss": 0.6535, + "step": 2295 + }, + { + "epoch": 1.385753093872623, + "grad_norm": 0.11212702840566635, + "learning_rate": 0.00011091739310917392, + "loss": 0.585, + "step": 2296 + }, + { + "epoch": 1.3863567763356475, + "grad_norm": 0.12629151344299316, + "learning_rate": 0.00011087588210875884, + "loss": 0.6642, + "step": 2297 + }, + { + "epoch": 1.386960458798672, + "grad_norm": 0.12351097911596298, + "learning_rate": 0.00011083437110834371, + "loss": 0.6866, + "step": 2298 + }, + { + "epoch": 1.3875641412616964, + "grad_norm": 0.11870943009853363, + "learning_rate": 0.0001107928601079286, + "loss": 0.6017, + "step": 2299 + }, + { + "epoch": 1.3881678237247208, + "grad_norm": 0.12148743122816086, + "learning_rate": 0.0001107513491075135, + "loss": 0.574, + "step": 2300 + }, + { + "epoch": 1.3887715061877453, + "grad_norm": 0.1272771656513214, + "learning_rate": 0.00011070983810709838, + "loss": 0.5676, + "step": 2301 + }, + { + "epoch": 1.3893751886507697, + "grad_norm": 0.13474443554878235, + "learning_rate": 0.00011066832710668327, + "loss": 0.5647, + "step": 2302 + }, + { + "epoch": 1.3899788711137941, + "grad_norm": 0.16455814242362976, + "learning_rate": 0.00011062681610626817, + "loss": 0.513, + "step": 2303 + }, + { + "epoch": 1.3905825535768186, + "grad_norm": 0.15098370611667633, + "learning_rate": 0.00011058530510585306, + "loss": 0.5227, + "step": 2304 + }, + { + "epoch": 1.391186236039843, + "grad_norm": 0.14636565744876862, + "learning_rate": 0.00011054379410543793, + "loss": 0.4745, + "step": 2305 + }, + { + "epoch": 1.3917899185028675, + "grad_norm": 0.15192949771881104, + "learning_rate": 0.00011050228310502284, + "loss": 0.4152, + "step": 2306 + }, + { + "epoch": 1.392393600965892, + "grad_norm": 0.15223491191864014, + "learning_rate": 0.00011046077210460773, + "loss": 0.3253, + "step": 2307 + }, + { + "epoch": 1.3929972834289164, + "grad_norm": 0.10637323558330536, + "learning_rate": 0.0001104192611041926, + "loss": 0.6033, + "step": 2308 + }, + { + "epoch": 1.3936009658919408, + "grad_norm": 0.11009783297777176, + "learning_rate": 0.00011037775010377752, + "loss": 0.6934, + "step": 2309 + }, + { + "epoch": 1.3942046483549653, + "grad_norm": 0.10806847363710403, + "learning_rate": 0.00011033623910336239, + "loss": 0.6502, + "step": 2310 + }, + { + "epoch": 1.3948083308179897, + "grad_norm": 0.11237577348947525, + "learning_rate": 0.00011029472810294728, + "loss": 0.684, + "step": 2311 + }, + { + "epoch": 1.3954120132810142, + "grad_norm": 0.15474088490009308, + "learning_rate": 0.00011025321710253218, + "loss": 0.7796, + "step": 2312 + }, + { + "epoch": 1.3960156957440386, + "grad_norm": 0.10955344140529633, + "learning_rate": 0.00011021170610211706, + "loss": 0.6851, + "step": 2313 + }, + { + "epoch": 1.396619378207063, + "grad_norm": 0.12131255120038986, + "learning_rate": 0.00011017019510170196, + "loss": 0.7527, + "step": 2314 + }, + { + "epoch": 1.3972230606700875, + "grad_norm": 0.10648848861455917, + "learning_rate": 0.00011012868410128685, + "loss": 0.7382, + "step": 2315 + }, + { + "epoch": 1.397826743133112, + "grad_norm": 0.10780826210975647, + "learning_rate": 0.00011008717310087174, + "loss": 0.873, + "step": 2316 + }, + { + "epoch": 1.3984304255961364, + "grad_norm": 0.11691659688949585, + "learning_rate": 0.00011004566210045664, + "loss": 1.0688, + "step": 2317 + }, + { + "epoch": 1.3990341080591608, + "grad_norm": 0.1263553500175476, + "learning_rate": 0.00011000415110004152, + "loss": 0.7616, + "step": 2318 + }, + { + "epoch": 1.3996377905221853, + "grad_norm": 0.1272163987159729, + "learning_rate": 0.0001099626400996264, + "loss": 0.8121, + "step": 2319 + }, + { + "epoch": 1.4002414729852097, + "grad_norm": 0.267022967338562, + "learning_rate": 0.00010992112909921131, + "loss": 0.8719, + "step": 2320 + }, + { + "epoch": 1.4008451554482342, + "grad_norm": 0.10211081057786942, + "learning_rate": 0.0001098796180987962, + "loss": 1.0426, + "step": 2321 + }, + { + "epoch": 1.4014488379112586, + "grad_norm": 0.1085173562169075, + "learning_rate": 0.00010983810709838107, + "loss": 0.6862, + "step": 2322 + }, + { + "epoch": 1.402052520374283, + "grad_norm": 0.1208515390753746, + "learning_rate": 0.00010979659609796598, + "loss": 0.7391, + "step": 2323 + }, + { + "epoch": 1.4026562028373075, + "grad_norm": 0.10681469738483429, + "learning_rate": 0.00010975508509755085, + "loss": 0.7787, + "step": 2324 + }, + { + "epoch": 1.403259885300332, + "grad_norm": 0.1286618411540985, + "learning_rate": 0.00010971357409713574, + "loss": 0.7521, + "step": 2325 + }, + { + "epoch": 1.4038635677633564, + "grad_norm": 0.10427875816822052, + "learning_rate": 0.00010967206309672064, + "loss": 0.7142, + "step": 2326 + }, + { + "epoch": 1.4044672502263809, + "grad_norm": 0.10928861796855927, + "learning_rate": 0.00010963055209630553, + "loss": 0.8745, + "step": 2327 + }, + { + "epoch": 1.4050709326894053, + "grad_norm": 0.09785252064466476, + "learning_rate": 0.00010958904109589041, + "loss": 0.6134, + "step": 2328 + }, + { + "epoch": 1.4056746151524298, + "grad_norm": 0.10797867923974991, + "learning_rate": 0.00010954753009547531, + "loss": 1.0501, + "step": 2329 + }, + { + "epoch": 1.4062782976154542, + "grad_norm": 0.10527419298887253, + "learning_rate": 0.0001095060190950602, + "loss": 0.7023, + "step": 2330 + }, + { + "epoch": 1.4068819800784786, + "grad_norm": 0.10159947723150253, + "learning_rate": 0.00010946450809464507, + "loss": 0.733, + "step": 2331 + }, + { + "epoch": 1.407485662541503, + "grad_norm": 0.1160796582698822, + "learning_rate": 0.00010942299709422999, + "loss": 0.6167, + "step": 2332 + }, + { + "epoch": 1.4080893450045275, + "grad_norm": 0.10533631592988968, + "learning_rate": 0.00010938148609381486, + "loss": 0.9244, + "step": 2333 + }, + { + "epoch": 1.4086930274675522, + "grad_norm": 0.1092139482498169, + "learning_rate": 0.00010933997509339975, + "loss": 0.7907, + "step": 2334 + }, + { + "epoch": 1.4092967099305764, + "grad_norm": 0.12767547369003296, + "learning_rate": 0.00010929846409298466, + "loss": 0.6244, + "step": 2335 + }, + { + "epoch": 1.409900392393601, + "grad_norm": 0.1770038902759552, + "learning_rate": 0.00010925695309256953, + "loss": 0.8401, + "step": 2336 + }, + { + "epoch": 1.4105040748566253, + "grad_norm": 0.10718080401420593, + "learning_rate": 0.00010921544209215442, + "loss": 0.6228, + "step": 2337 + }, + { + "epoch": 1.41110775731965, + "grad_norm": 0.1158752515912056, + "learning_rate": 0.00010917393109173932, + "loss": 0.7027, + "step": 2338 + }, + { + "epoch": 1.4117114397826742, + "grad_norm": 0.11484018713235855, + "learning_rate": 0.0001091324200913242, + "loss": 0.7482, + "step": 2339 + }, + { + "epoch": 1.4123151222456989, + "grad_norm": 0.10645020753145218, + "learning_rate": 0.00010909090909090909, + "loss": 0.7191, + "step": 2340 + }, + { + "epoch": 1.412918804708723, + "grad_norm": 0.11423823237419128, + "learning_rate": 0.00010904939809049399, + "loss": 0.6348, + "step": 2341 + }, + { + "epoch": 1.4135224871717478, + "grad_norm": 0.10967587679624557, + "learning_rate": 0.00010900788709007888, + "loss": 0.6462, + "step": 2342 + }, + { + "epoch": 1.414126169634772, + "grad_norm": 0.1130829006433487, + "learning_rate": 0.00010896637608966375, + "loss": 0.6599, + "step": 2343 + }, + { + "epoch": 1.4147298520977967, + "grad_norm": 0.10537155717611313, + "learning_rate": 0.00010892486508924867, + "loss": 0.6092, + "step": 2344 + }, + { + "epoch": 1.415333534560821, + "grad_norm": 0.11640435457229614, + "learning_rate": 0.00010888335408883354, + "loss": 0.6485, + "step": 2345 + }, + { + "epoch": 1.4159372170238456, + "grad_norm": 0.13120651245117188, + "learning_rate": 0.00010884184308841842, + "loss": 0.8094, + "step": 2346 + }, + { + "epoch": 1.4165408994868698, + "grad_norm": 0.12212494760751724, + "learning_rate": 0.00010880033208800332, + "loss": 0.642, + "step": 2347 + }, + { + "epoch": 1.4171445819498945, + "grad_norm": 0.12393233925104141, + "learning_rate": 0.00010875882108758821, + "loss": 0.6366, + "step": 2348 + }, + { + "epoch": 1.4177482644129187, + "grad_norm": 0.13876676559448242, + "learning_rate": 0.0001087173100871731, + "loss": 0.6859, + "step": 2349 + }, + { + "epoch": 1.4183519468759433, + "grad_norm": 0.13311347365379333, + "learning_rate": 0.000108675799086758, + "loss": 0.6349, + "step": 2350 + }, + { + "epoch": 1.4189556293389676, + "grad_norm": 0.13839563727378845, + "learning_rate": 0.00010863428808634288, + "loss": 0.5701, + "step": 2351 + }, + { + "epoch": 1.4195593118019922, + "grad_norm": 0.14265376329421997, + "learning_rate": 0.00010859277708592777, + "loss": 0.5708, + "step": 2352 + }, + { + "epoch": 1.4201629942650165, + "grad_norm": 0.1401066929101944, + "learning_rate": 0.00010855126608551267, + "loss": 0.5612, + "step": 2353 + }, + { + "epoch": 1.4207666767280411, + "grad_norm": 0.14841662347316742, + "learning_rate": 0.00010850975508509756, + "loss": 0.5377, + "step": 2354 + }, + { + "epoch": 1.4213703591910656, + "grad_norm": 0.15106201171875, + "learning_rate": 0.00010846824408468243, + "loss": 0.5121, + "step": 2355 + }, + { + "epoch": 1.42197404165409, + "grad_norm": 0.15050731599330902, + "learning_rate": 0.00010842673308426734, + "loss": 0.3922, + "step": 2356 + }, + { + "epoch": 1.4225777241171145, + "grad_norm": 0.1529148668050766, + "learning_rate": 0.00010838522208385222, + "loss": 0.3001, + "step": 2357 + }, + { + "epoch": 1.423181406580139, + "grad_norm": 0.10441195219755173, + "learning_rate": 0.0001083437110834371, + "loss": 0.694, + "step": 2358 + }, + { + "epoch": 1.4237850890431634, + "grad_norm": 0.11137279868125916, + "learning_rate": 0.000108302200083022, + "loss": 0.9643, + "step": 2359 + }, + { + "epoch": 1.4243887715061878, + "grad_norm": 0.12313207983970642, + "learning_rate": 0.00010826068908260689, + "loss": 0.6247, + "step": 2360 + }, + { + "epoch": 1.4249924539692123, + "grad_norm": 0.11363215744495392, + "learning_rate": 0.00010821917808219179, + "loss": 0.7187, + "step": 2361 + }, + { + "epoch": 1.4255961364322367, + "grad_norm": 0.11889949440956116, + "learning_rate": 0.00010817766708177668, + "loss": 0.9015, + "step": 2362 + }, + { + "epoch": 1.4261998188952612, + "grad_norm": 0.1457277536392212, + "learning_rate": 0.00010813615608136156, + "loss": 0.7921, + "step": 2363 + }, + { + "epoch": 1.4268035013582856, + "grad_norm": 0.11033974587917328, + "learning_rate": 0.00010809464508094646, + "loss": 0.7434, + "step": 2364 + }, + { + "epoch": 1.42740718382131, + "grad_norm": 0.1310395896434784, + "learning_rate": 0.00010805313408053135, + "loss": 0.8429, + "step": 2365 + }, + { + "epoch": 1.4280108662843345, + "grad_norm": 0.11207132786512375, + "learning_rate": 0.00010801162308011624, + "loss": 1.0315, + "step": 2366 + }, + { + "epoch": 1.428614548747359, + "grad_norm": 0.10667724162340164, + "learning_rate": 0.00010797011207970114, + "loss": 0.5591, + "step": 2367 + }, + { + "epoch": 1.4292182312103834, + "grad_norm": 0.11251533776521683, + "learning_rate": 0.00010792860107928602, + "loss": 0.7458, + "step": 2368 + }, + { + "epoch": 1.4298219136734078, + "grad_norm": 0.11157943308353424, + "learning_rate": 0.0001078870900788709, + "loss": 0.7979, + "step": 2369 + }, + { + "epoch": 1.4304255961364323, + "grad_norm": 0.10930392146110535, + "learning_rate": 0.00010784557907845581, + "loss": 0.7159, + "step": 2370 + }, + { + "epoch": 1.4310292785994567, + "grad_norm": 0.11821290105581284, + "learning_rate": 0.00010780406807804068, + "loss": 0.8176, + "step": 2371 + }, + { + "epoch": 1.4316329610624812, + "grad_norm": 0.11287893354892731, + "learning_rate": 0.00010776255707762557, + "loss": 0.7326, + "step": 2372 + }, + { + "epoch": 1.4322366435255056, + "grad_norm": 0.11231239885091782, + "learning_rate": 0.00010772104607721047, + "loss": 0.7556, + "step": 2373 + }, + { + "epoch": 1.43284032598853, + "grad_norm": 0.10399778932332993, + "learning_rate": 0.00010767953507679535, + "loss": 0.7313, + "step": 2374 + }, + { + "epoch": 1.4334440084515545, + "grad_norm": 0.112164206802845, + "learning_rate": 0.00010763802407638024, + "loss": 0.7218, + "step": 2375 + }, + { + "epoch": 1.434047690914579, + "grad_norm": 0.10718587040901184, + "learning_rate": 0.00010759651307596514, + "loss": 0.7168, + "step": 2376 + }, + { + "epoch": 1.4346513733776034, + "grad_norm": 0.1262284815311432, + "learning_rate": 0.00010755500207555003, + "loss": 0.7782, + "step": 2377 + }, + { + "epoch": 1.4352550558406278, + "grad_norm": 0.11654514819383621, + "learning_rate": 0.00010751349107513491, + "loss": 0.7666, + "step": 2378 + }, + { + "epoch": 1.4358587383036523, + "grad_norm": 0.10238191485404968, + "learning_rate": 0.00010747198007471981, + "loss": 0.7126, + "step": 2379 + }, + { + "epoch": 1.4364624207666767, + "grad_norm": 0.11885840445756912, + "learning_rate": 0.0001074304690743047, + "loss": 0.9197, + "step": 2380 + }, + { + "epoch": 1.4370661032297012, + "grad_norm": 0.11124838888645172, + "learning_rate": 0.00010738895807388957, + "loss": 0.6942, + "step": 2381 + }, + { + "epoch": 1.4376697856927256, + "grad_norm": 0.10520321130752563, + "learning_rate": 0.00010734744707347449, + "loss": 1.0973, + "step": 2382 + }, + { + "epoch": 1.43827346815575, + "grad_norm": 0.10521334409713745, + "learning_rate": 0.00010730593607305936, + "loss": 0.6247, + "step": 2383 + }, + { + "epoch": 1.4388771506187745, + "grad_norm": 0.1028476133942604, + "learning_rate": 0.00010726442507264425, + "loss": 0.9452, + "step": 2384 + }, + { + "epoch": 1.439480833081799, + "grad_norm": 0.10759977251291275, + "learning_rate": 0.00010722291407222915, + "loss": 0.7051, + "step": 2385 + }, + { + "epoch": 1.4400845155448234, + "grad_norm": 0.13091117143630981, + "learning_rate": 0.00010718140307181403, + "loss": 0.8791, + "step": 2386 + }, + { + "epoch": 1.4406881980078479, + "grad_norm": 0.11425816267728806, + "learning_rate": 0.00010713989207139892, + "loss": 0.7791, + "step": 2387 + }, + { + "epoch": 1.4412918804708723, + "grad_norm": 0.10497016459703445, + "learning_rate": 0.00010709838107098382, + "loss": 0.6761, + "step": 2388 + }, + { + "epoch": 1.4418955629338968, + "grad_norm": 0.12967948615550995, + "learning_rate": 0.0001070568700705687, + "loss": 0.9618, + "step": 2389 + }, + { + "epoch": 1.4424992453969212, + "grad_norm": 0.11426857113838196, + "learning_rate": 0.00010701535907015359, + "loss": 0.761, + "step": 2390 + }, + { + "epoch": 1.4431029278599457, + "grad_norm": 0.36786845326423645, + "learning_rate": 0.00010697384806973849, + "loss": 0.722, + "step": 2391 + }, + { + "epoch": 1.44370661032297, + "grad_norm": 0.11083753407001495, + "learning_rate": 0.00010693233706932338, + "loss": 0.6985, + "step": 2392 + }, + { + "epoch": 1.4443102927859945, + "grad_norm": 0.11332450807094574, + "learning_rate": 0.00010689082606890825, + "loss": 0.6547, + "step": 2393 + }, + { + "epoch": 1.444913975249019, + "grad_norm": 0.11285028606653214, + "learning_rate": 0.00010684931506849317, + "loss": 0.6623, + "step": 2394 + }, + { + "epoch": 1.4455176577120434, + "grad_norm": 0.12340085208415985, + "learning_rate": 0.00010680780406807804, + "loss": 0.7428, + "step": 2395 + }, + { + "epoch": 1.4461213401750679, + "grad_norm": 0.11893676221370697, + "learning_rate": 0.00010676629306766293, + "loss": 0.6858, + "step": 2396 + }, + { + "epoch": 1.4467250226380923, + "grad_norm": 0.11241084337234497, + "learning_rate": 0.00010672478206724783, + "loss": 0.6083, + "step": 2397 + }, + { + "epoch": 1.4473287051011168, + "grad_norm": 0.12452124804258347, + "learning_rate": 0.00010668327106683271, + "loss": 0.6005, + "step": 2398 + }, + { + "epoch": 1.4479323875641412, + "grad_norm": 0.3717198073863983, + "learning_rate": 0.0001066417600664176, + "loss": 0.651, + "step": 2399 + }, + { + "epoch": 1.4485360700271657, + "grad_norm": 0.13178178668022156, + "learning_rate": 0.0001066002490660025, + "loss": 0.5641, + "step": 2400 + }, + { + "epoch": 1.4491397524901901, + "grad_norm": 0.13347181677818298, + "learning_rate": 0.00010655873806558738, + "loss": 0.565, + "step": 2401 + }, + { + "epoch": 1.4497434349532146, + "grad_norm": 0.13921667635440826, + "learning_rate": 0.00010651722706517227, + "loss": 0.5777, + "step": 2402 + }, + { + "epoch": 1.450347117416239, + "grad_norm": 0.14336366951465607, + "learning_rate": 0.00010647571606475717, + "loss": 0.484, + "step": 2403 + }, + { + "epoch": 1.4509507998792635, + "grad_norm": 0.16313937306404114, + "learning_rate": 0.00010643420506434206, + "loss": 0.5522, + "step": 2404 + }, + { + "epoch": 1.451554482342288, + "grad_norm": 0.16395455598831177, + "learning_rate": 0.00010639269406392696, + "loss": 0.5376, + "step": 2405 + }, + { + "epoch": 1.4521581648053123, + "grad_norm": 0.15675240755081177, + "learning_rate": 0.00010635118306351184, + "loss": 0.4248, + "step": 2406 + }, + { + "epoch": 1.4527618472683368, + "grad_norm": 0.16776950657367706, + "learning_rate": 0.00010630967206309672, + "loss": 0.3375, + "step": 2407 + }, + { + "epoch": 1.4533655297313612, + "grad_norm": 0.10852830857038498, + "learning_rate": 0.00010626816106268163, + "loss": 0.6665, + "step": 2408 + }, + { + "epoch": 1.4539692121943857, + "grad_norm": 0.115415558218956, + "learning_rate": 0.0001062266500622665, + "loss": 0.7669, + "step": 2409 + }, + { + "epoch": 1.4545728946574101, + "grad_norm": 0.12048669904470444, + "learning_rate": 0.00010618513906185139, + "loss": 0.7165, + "step": 2410 + }, + { + "epoch": 1.4551765771204346, + "grad_norm": 0.15835049748420715, + "learning_rate": 0.00010614362806143629, + "loss": 0.7294, + "step": 2411 + }, + { + "epoch": 1.455780259583459, + "grad_norm": 0.11197509616613388, + "learning_rate": 0.00010610211706102118, + "loss": 0.9129, + "step": 2412 + }, + { + "epoch": 1.4563839420464835, + "grad_norm": 0.11068537831306458, + "learning_rate": 0.00010606060606060606, + "loss": 0.7544, + "step": 2413 + }, + { + "epoch": 1.456987624509508, + "grad_norm": 0.1250140517950058, + "learning_rate": 0.00010601909506019096, + "loss": 0.8565, + "step": 2414 + }, + { + "epoch": 1.4575913069725326, + "grad_norm": 0.11533838510513306, + "learning_rate": 0.00010597758405977585, + "loss": 0.7451, + "step": 2415 + }, + { + "epoch": 1.4581949894355568, + "grad_norm": 0.11751358211040497, + "learning_rate": 0.00010593607305936074, + "loss": 0.852, + "step": 2416 + }, + { + "epoch": 1.4587986718985815, + "grad_norm": 0.1017642691731453, + "learning_rate": 0.00010589456205894564, + "loss": 0.5441, + "step": 2417 + }, + { + "epoch": 1.4594023543616057, + "grad_norm": 0.10722334682941437, + "learning_rate": 0.00010585305105853052, + "loss": 0.8963, + "step": 2418 + }, + { + "epoch": 1.4600060368246304, + "grad_norm": 0.11673782765865326, + "learning_rate": 0.0001058115400581154, + "loss": 0.8146, + "step": 2419 + }, + { + "epoch": 1.4606097192876546, + "grad_norm": 0.12002434581518173, + "learning_rate": 0.00010577002905770031, + "loss": 0.741, + "step": 2420 + }, + { + "epoch": 1.4612134017506793, + "grad_norm": 0.11690894514322281, + "learning_rate": 0.00010572851805728518, + "loss": 0.7929, + "step": 2421 + }, + { + "epoch": 1.4618170842137035, + "grad_norm": 0.12483794242143631, + "learning_rate": 0.00010568700705687007, + "loss": 0.8771, + "step": 2422 + }, + { + "epoch": 1.4624207666767282, + "grad_norm": 0.11409945040941238, + "learning_rate": 0.00010564549605645497, + "loss": 0.6703, + "step": 2423 + }, + { + "epoch": 1.4630244491397524, + "grad_norm": 0.11765747517347336, + "learning_rate": 0.00010560398505603986, + "loss": 0.6562, + "step": 2424 + }, + { + "epoch": 1.463628131602777, + "grad_norm": 0.10898533463478088, + "learning_rate": 0.00010556247405562474, + "loss": 0.7042, + "step": 2425 + }, + { + "epoch": 1.4642318140658013, + "grad_norm": 0.12223546206951141, + "learning_rate": 0.00010552096305520964, + "loss": 0.6604, + "step": 2426 + }, + { + "epoch": 1.464835496528826, + "grad_norm": 0.10916266590356827, + "learning_rate": 0.00010547945205479453, + "loss": 0.7925, + "step": 2427 + }, + { + "epoch": 1.4654391789918502, + "grad_norm": 0.1135077029466629, + "learning_rate": 0.0001054379410543794, + "loss": 0.7005, + "step": 2428 + }, + { + "epoch": 1.4660428614548748, + "grad_norm": 0.11547960340976715, + "learning_rate": 0.00010539643005396431, + "loss": 0.7297, + "step": 2429 + }, + { + "epoch": 1.466646543917899, + "grad_norm": 0.11045881360769272, + "learning_rate": 0.0001053549190535492, + "loss": 0.7663, + "step": 2430 + }, + { + "epoch": 1.4672502263809237, + "grad_norm": 0.12327514588832855, + "learning_rate": 0.00010531340805313407, + "loss": 0.6968, + "step": 2431 + }, + { + "epoch": 1.467853908843948, + "grad_norm": 0.11502881348133087, + "learning_rate": 0.00010527189705271899, + "loss": 0.7197, + "step": 2432 + }, + { + "epoch": 1.4684575913069726, + "grad_norm": 0.11779715120792389, + "learning_rate": 0.00010523038605230386, + "loss": 0.7248, + "step": 2433 + }, + { + "epoch": 1.4690612737699968, + "grad_norm": 0.11766118556261063, + "learning_rate": 0.00010518887505188875, + "loss": 0.7546, + "step": 2434 + }, + { + "epoch": 1.4696649562330215, + "grad_norm": 0.11458957195281982, + "learning_rate": 0.00010514736405147365, + "loss": 0.7825, + "step": 2435 + }, + { + "epoch": 1.470268638696046, + "grad_norm": 0.11159491539001465, + "learning_rate": 0.00010510585305105853, + "loss": 0.8465, + "step": 2436 + }, + { + "epoch": 1.4708723211590704, + "grad_norm": 0.10881160199642181, + "learning_rate": 0.00010506434205064342, + "loss": 0.7381, + "step": 2437 + }, + { + "epoch": 1.4714760036220949, + "grad_norm": 0.11549453437328339, + "learning_rate": 0.00010502283105022832, + "loss": 0.7784, + "step": 2438 + }, + { + "epoch": 1.4720796860851193, + "grad_norm": 0.11238040775060654, + "learning_rate": 0.0001049813200498132, + "loss": 0.8611, + "step": 2439 + }, + { + "epoch": 1.4726833685481437, + "grad_norm": 0.1089930310845375, + "learning_rate": 0.00010493980904939808, + "loss": 0.7186, + "step": 2440 + }, + { + "epoch": 1.4732870510111682, + "grad_norm": 0.10445185750722885, + "learning_rate": 0.00010489829804898299, + "loss": 0.6156, + "step": 2441 + }, + { + "epoch": 1.4738907334741926, + "grad_norm": 0.10923328250646591, + "learning_rate": 0.00010485678704856787, + "loss": 0.6738, + "step": 2442 + }, + { + "epoch": 1.474494415937217, + "grad_norm": 0.11018381267786026, + "learning_rate": 0.00010481527604815275, + "loss": 0.6475, + "step": 2443 + }, + { + "epoch": 1.4750980984002415, + "grad_norm": 0.11723464727401733, + "learning_rate": 0.00010477376504773767, + "loss": 0.7307, + "step": 2444 + }, + { + "epoch": 1.475701780863266, + "grad_norm": 0.11130370199680328, + "learning_rate": 0.00010473225404732254, + "loss": 0.6028, + "step": 2445 + }, + { + "epoch": 1.4763054633262904, + "grad_norm": 0.11799994856119156, + "learning_rate": 0.00010469074304690743, + "loss": 0.5884, + "step": 2446 + }, + { + "epoch": 1.4769091457893149, + "grad_norm": 0.12066012620925903, + "learning_rate": 0.00010464923204649233, + "loss": 0.6483, + "step": 2447 + }, + { + "epoch": 1.4775128282523393, + "grad_norm": 0.12716256082057953, + "learning_rate": 0.00010460772104607721, + "loss": 0.6412, + "step": 2448 + }, + { + "epoch": 1.4781165107153638, + "grad_norm": 0.1431380957365036, + "learning_rate": 0.00010456621004566211, + "loss": 0.6544, + "step": 2449 + }, + { + "epoch": 1.4787201931783882, + "grad_norm": 0.1343405693769455, + "learning_rate": 0.000104524699045247, + "loss": 0.5788, + "step": 2450 + }, + { + "epoch": 1.4793238756414127, + "grad_norm": 0.1583533138036728, + "learning_rate": 0.00010448318804483189, + "loss": 0.6123, + "step": 2451 + }, + { + "epoch": 1.479927558104437, + "grad_norm": 0.1386100798845291, + "learning_rate": 0.00010444167704441679, + "loss": 0.6458, + "step": 2452 + }, + { + "epoch": 1.4805312405674615, + "grad_norm": 0.1477377563714981, + "learning_rate": 0.00010440016604400167, + "loss": 0.5432, + "step": 2453 + }, + { + "epoch": 1.481134923030486, + "grad_norm": 0.14655126631259918, + "learning_rate": 0.00010435865504358654, + "loss": 0.4965, + "step": 2454 + }, + { + "epoch": 1.4817386054935104, + "grad_norm": 0.15142911672592163, + "learning_rate": 0.00010431714404317146, + "loss": 0.4564, + "step": 2455 + }, + { + "epoch": 1.4823422879565349, + "grad_norm": 0.1687132865190506, + "learning_rate": 0.00010427563304275633, + "loss": 0.4261, + "step": 2456 + }, + { + "epoch": 1.4829459704195593, + "grad_norm": 0.15374094247817993, + "learning_rate": 0.00010423412204234122, + "loss": 0.2757, + "step": 2457 + }, + { + "epoch": 1.4835496528825838, + "grad_norm": 0.11385196447372437, + "learning_rate": 0.00010419261104192613, + "loss": 0.6181, + "step": 2458 + }, + { + "epoch": 1.4841533353456082, + "grad_norm": 0.12344599515199661, + "learning_rate": 0.000104151100041511, + "loss": 0.5809, + "step": 2459 + }, + { + "epoch": 1.4847570178086327, + "grad_norm": 0.12225256860256195, + "learning_rate": 0.00010410958904109589, + "loss": 0.8097, + "step": 2460 + }, + { + "epoch": 1.4853607002716571, + "grad_norm": 0.1283671259880066, + "learning_rate": 0.00010406807804068079, + "loss": 0.8238, + "step": 2461 + }, + { + "epoch": 1.4859643827346816, + "grad_norm": 0.12050528824329376, + "learning_rate": 0.00010402656704026568, + "loss": 0.7068, + "step": 2462 + }, + { + "epoch": 1.486568065197706, + "grad_norm": 0.12876033782958984, + "learning_rate": 0.00010398505603985056, + "loss": 0.7713, + "step": 2463 + }, + { + "epoch": 1.4871717476607305, + "grad_norm": 0.12785589694976807, + "learning_rate": 0.00010394354503943546, + "loss": 0.9149, + "step": 2464 + }, + { + "epoch": 1.487775430123755, + "grad_norm": 0.10582014918327332, + "learning_rate": 0.00010390203403902035, + "loss": 0.71, + "step": 2465 + }, + { + "epoch": 1.4883791125867794, + "grad_norm": 0.10887051373720169, + "learning_rate": 0.00010386052303860522, + "loss": 1.0357, + "step": 2466 + }, + { + "epoch": 1.4889827950498038, + "grad_norm": 0.10681470483541489, + "learning_rate": 0.00010381901203819014, + "loss": 0.7339, + "step": 2467 + }, + { + "epoch": 1.4895864775128282, + "grad_norm": 0.13924428820610046, + "learning_rate": 0.00010377750103777501, + "loss": 0.6848, + "step": 2468 + }, + { + "epoch": 1.4901901599758527, + "grad_norm": 0.10944285243749619, + "learning_rate": 0.0001037359900373599, + "loss": 0.7028, + "step": 2469 + }, + { + "epoch": 1.4907938424388771, + "grad_norm": 0.11315060406923294, + "learning_rate": 0.0001036944790369448, + "loss": 0.7433, + "step": 2470 + }, + { + "epoch": 1.4913975249019016, + "grad_norm": 0.11983177065849304, + "learning_rate": 0.00010365296803652968, + "loss": 0.7025, + "step": 2471 + }, + { + "epoch": 1.492001207364926, + "grad_norm": 0.11219801008701324, + "learning_rate": 0.00010361145703611457, + "loss": 0.7046, + "step": 2472 + }, + { + "epoch": 1.4926048898279505, + "grad_norm": 0.12625454366207123, + "learning_rate": 0.00010356994603569947, + "loss": 0.7736, + "step": 2473 + }, + { + "epoch": 1.493208572290975, + "grad_norm": 0.12454172223806381, + "learning_rate": 0.00010352843503528436, + "loss": 0.7124, + "step": 2474 + }, + { + "epoch": 1.4938122547539994, + "grad_norm": 0.11191127449274063, + "learning_rate": 0.00010348692403486924, + "loss": 0.8456, + "step": 2475 + }, + { + "epoch": 1.4944159372170238, + "grad_norm": 0.10700973868370056, + "learning_rate": 0.00010344541303445414, + "loss": 0.8491, + "step": 2476 + }, + { + "epoch": 1.4950196196800483, + "grad_norm": 0.7383321523666382, + "learning_rate": 0.00010340390203403903, + "loss": 0.8734, + "step": 2477 + }, + { + "epoch": 1.4956233021430727, + "grad_norm": 0.11059489101171494, + "learning_rate": 0.0001033623910336239, + "loss": 0.6143, + "step": 2478 + }, + { + "epoch": 1.4962269846060972, + "grad_norm": 0.11313536763191223, + "learning_rate": 0.00010332088003320882, + "loss": 0.6963, + "step": 2479 + }, + { + "epoch": 1.4968306670691216, + "grad_norm": 0.11421024799346924, + "learning_rate": 0.00010327936903279369, + "loss": 0.7297, + "step": 2480 + }, + { + "epoch": 1.497434349532146, + "grad_norm": 0.11083009093999863, + "learning_rate": 0.00010323785803237857, + "loss": 0.6467, + "step": 2481 + }, + { + "epoch": 1.4980380319951705, + "grad_norm": 0.110516257584095, + "learning_rate": 0.00010319634703196347, + "loss": 1.0344, + "step": 2482 + }, + { + "epoch": 1.498641714458195, + "grad_norm": 0.12053235620260239, + "learning_rate": 0.00010315483603154836, + "loss": 0.7158, + "step": 2483 + }, + { + "epoch": 1.4992453969212194, + "grad_norm": 0.11220195889472961, + "learning_rate": 0.00010311332503113325, + "loss": 0.9486, + "step": 2484 + }, + { + "epoch": 1.4998490793842438, + "grad_norm": 0.121260866522789, + "learning_rate": 0.00010307181403071815, + "loss": 0.6877, + "step": 2485 + }, + { + "epoch": 1.5004527618472685, + "grad_norm": 0.11453468352556229, + "learning_rate": 0.00010303030303030303, + "loss": 0.6868, + "step": 2486 + }, + { + "epoch": 1.5010564443102927, + "grad_norm": 0.11923350393772125, + "learning_rate": 0.00010298879202988792, + "loss": 0.7115, + "step": 2487 + }, + { + "epoch": 1.5016601267733174, + "grad_norm": 0.11905788630247116, + "learning_rate": 0.00010294728102947282, + "loss": 0.894, + "step": 2488 + }, + { + "epoch": 1.5022638092363416, + "grad_norm": 0.12174365669488907, + "learning_rate": 0.00010290577002905771, + "loss": 0.8226, + "step": 2489 + }, + { + "epoch": 1.5028674916993663, + "grad_norm": 0.1140153631567955, + "learning_rate": 0.00010286425902864258, + "loss": 0.744, + "step": 2490 + }, + { + "epoch": 1.5034711741623905, + "grad_norm": 0.11370176821947098, + "learning_rate": 0.0001028227480282275, + "loss": 0.6642, + "step": 2491 + }, + { + "epoch": 1.5040748566254152, + "grad_norm": 0.11257719993591309, + "learning_rate": 0.00010278123702781237, + "loss": 0.6352, + "step": 2492 + }, + { + "epoch": 1.5046785390884394, + "grad_norm": 0.11130896955728531, + "learning_rate": 0.00010273972602739728, + "loss": 0.6231, + "step": 2493 + }, + { + "epoch": 1.505282221551464, + "grad_norm": 0.11869868636131287, + "learning_rate": 0.00010269821502698215, + "loss": 0.695, + "step": 2494 + }, + { + "epoch": 1.5058859040144883, + "grad_norm": 0.12181151658296585, + "learning_rate": 0.00010265670402656704, + "loss": 0.6434, + "step": 2495 + }, + { + "epoch": 1.506489586477513, + "grad_norm": 0.12335172295570374, + "learning_rate": 0.00010261519302615194, + "loss": 0.6727, + "step": 2496 + }, + { + "epoch": 1.5070932689405372, + "grad_norm": 0.12331590801477432, + "learning_rate": 0.00010257368202573683, + "loss": 0.6231, + "step": 2497 + }, + { + "epoch": 1.5076969514035619, + "grad_norm": 0.13102783262729645, + "learning_rate": 0.00010253217102532171, + "loss": 0.5846, + "step": 2498 + }, + { + "epoch": 1.508300633866586, + "grad_norm": 0.1286454200744629, + "learning_rate": 0.00010249066002490661, + "loss": 0.6141, + "step": 2499 + }, + { + "epoch": 1.5089043163296108, + "grad_norm": 0.1343620866537094, + "learning_rate": 0.0001024491490244915, + "loss": 0.6316, + "step": 2500 + }, + { + "epoch": 1.5089043163296108, + "eval_loss": 0.795173168182373, + "eval_runtime": 1219.0487, + "eval_samples_per_second": 2.289, + "eval_steps_per_second": 0.286, + "step": 2500 + }, + { + "epoch": 1.509507998792635, + "grad_norm": 0.1346864253282547, + "learning_rate": 0.00010240763802407639, + "loss": 0.6343, + "step": 2501 + }, + { + "epoch": 1.5101116812556596, + "grad_norm": 0.1306980401277542, + "learning_rate": 0.00010236612702366129, + "loss": 0.5615, + "step": 2502 + }, + { + "epoch": 1.5107153637186839, + "grad_norm": 0.1505928337574005, + "learning_rate": 0.00010232461602324617, + "loss": 0.5591, + "step": 2503 + }, + { + "epoch": 1.5113190461817085, + "grad_norm": 0.1430431306362152, + "learning_rate": 0.00010228310502283104, + "loss": 0.496, + "step": 2504 + }, + { + "epoch": 1.5119227286447328, + "grad_norm": 0.1660575270652771, + "learning_rate": 0.00010224159402241596, + "loss": 0.4632, + "step": 2505 + }, + { + "epoch": 1.5125264111077574, + "grad_norm": 0.17529623210430145, + "learning_rate": 0.00010220008302200083, + "loss": 0.4824, + "step": 2506 + }, + { + "epoch": 1.5131300935707817, + "grad_norm": 0.15798205137252808, + "learning_rate": 0.00010215857202158572, + "loss": 0.3551, + "step": 2507 + }, + { + "epoch": 1.5137337760338063, + "grad_norm": 0.12132739275693893, + "learning_rate": 0.00010211706102117062, + "loss": 0.9828, + "step": 2508 + }, + { + "epoch": 1.5143374584968305, + "grad_norm": 0.12069849669933319, + "learning_rate": 0.0001020755500207555, + "loss": 0.7057, + "step": 2509 + }, + { + "epoch": 1.5149411409598552, + "grad_norm": 0.1271534413099289, + "learning_rate": 0.00010203403902034039, + "loss": 0.7686, + "step": 2510 + }, + { + "epoch": 1.5155448234228794, + "grad_norm": 0.10612853616476059, + "learning_rate": 0.00010199252801992529, + "loss": 0.7055, + "step": 2511 + }, + { + "epoch": 1.516148505885904, + "grad_norm": 0.11403614282608032, + "learning_rate": 0.00010195101701951018, + "loss": 0.673, + "step": 2512 + }, + { + "epoch": 1.5167521883489283, + "grad_norm": 0.11351923644542694, + "learning_rate": 0.00010190950601909506, + "loss": 0.6534, + "step": 2513 + }, + { + "epoch": 1.517355870811953, + "grad_norm": 0.12284346669912338, + "learning_rate": 0.00010186799501867996, + "loss": 0.7367, + "step": 2514 + }, + { + "epoch": 1.5179595532749772, + "grad_norm": 0.11047890037298203, + "learning_rate": 0.00010182648401826485, + "loss": 0.7392, + "step": 2515 + }, + { + "epoch": 1.518563235738002, + "grad_norm": 0.13320279121398926, + "learning_rate": 0.00010178497301784972, + "loss": 0.7075, + "step": 2516 + }, + { + "epoch": 1.5191669182010261, + "grad_norm": 0.1211395412683487, + "learning_rate": 0.00010174346201743464, + "loss": 0.7255, + "step": 2517 + }, + { + "epoch": 1.5197706006640508, + "grad_norm": 0.10632450878620148, + "learning_rate": 0.00010170195101701951, + "loss": 0.6936, + "step": 2518 + }, + { + "epoch": 1.520374283127075, + "grad_norm": 0.10908479243516922, + "learning_rate": 0.0001016604400166044, + "loss": 0.783, + "step": 2519 + }, + { + "epoch": 1.5209779655900997, + "grad_norm": 0.11272008717060089, + "learning_rate": 0.0001016189290161893, + "loss": 0.8508, + "step": 2520 + }, + { + "epoch": 1.521581648053124, + "grad_norm": 0.11356555670499802, + "learning_rate": 0.00010157741801577418, + "loss": 0.7808, + "step": 2521 + }, + { + "epoch": 1.5221853305161486, + "grad_norm": 0.1155843734741211, + "learning_rate": 0.00010153590701535907, + "loss": 0.9635, + "step": 2522 + }, + { + "epoch": 1.5227890129791728, + "grad_norm": 0.10973259806632996, + "learning_rate": 0.00010149439601494397, + "loss": 0.7365, + "step": 2523 + }, + { + "epoch": 1.5233926954421975, + "grad_norm": 0.1167822778224945, + "learning_rate": 0.00010145288501452886, + "loss": 0.7307, + "step": 2524 + }, + { + "epoch": 1.5239963779052217, + "grad_norm": 0.11712843924760818, + "learning_rate": 0.00010141137401411374, + "loss": 0.6832, + "step": 2525 + }, + { + "epoch": 1.5246000603682464, + "grad_norm": 0.11597023159265518, + "learning_rate": 0.00010136986301369864, + "loss": 0.7971, + "step": 2526 + }, + { + "epoch": 1.5252037428312708, + "grad_norm": 0.12328234314918518, + "learning_rate": 0.00010132835201328353, + "loss": 0.9113, + "step": 2527 + }, + { + "epoch": 1.5258074252942952, + "grad_norm": 0.1276886761188507, + "learning_rate": 0.0001012868410128684, + "loss": 0.659, + "step": 2528 + }, + { + "epoch": 1.5264111077573197, + "grad_norm": 0.11608514934778214, + "learning_rate": 0.00010124533001245332, + "loss": 0.6754, + "step": 2529 + }, + { + "epoch": 1.5270147902203441, + "grad_norm": 0.10984505712985992, + "learning_rate": 0.00010120381901203819, + "loss": 0.9727, + "step": 2530 + }, + { + "epoch": 1.5276184726833686, + "grad_norm": 0.1020493283867836, + "learning_rate": 0.00010116230801162307, + "loss": 0.6945, + "step": 2531 + }, + { + "epoch": 1.528222155146393, + "grad_norm": 0.11108864098787308, + "learning_rate": 0.00010112079701120797, + "loss": 0.6623, + "step": 2532 + }, + { + "epoch": 1.5288258376094175, + "grad_norm": 0.11104828864336014, + "learning_rate": 0.00010107928601079286, + "loss": 0.7971, + "step": 2533 + }, + { + "epoch": 1.529429520072442, + "grad_norm": 0.11642879247665405, + "learning_rate": 0.00010103777501037775, + "loss": 0.7904, + "step": 2534 + }, + { + "epoch": 1.5300332025354664, + "grad_norm": 0.11537434905767441, + "learning_rate": 0.00010099626400996265, + "loss": 0.7779, + "step": 2535 + }, + { + "epoch": 1.5306368849984908, + "grad_norm": 0.11406029760837555, + "learning_rate": 0.00010095475300954753, + "loss": 0.6764, + "step": 2536 + }, + { + "epoch": 1.5312405674615153, + "grad_norm": 0.12164659798145294, + "learning_rate": 0.00010091324200913243, + "loss": 0.6798, + "step": 2537 + }, + { + "epoch": 1.5318442499245397, + "grad_norm": 0.11121895164251328, + "learning_rate": 0.00010087173100871732, + "loss": 0.9903, + "step": 2538 + }, + { + "epoch": 1.5324479323875642, + "grad_norm": 0.13991357386112213, + "learning_rate": 0.00010083022000830221, + "loss": 0.6855, + "step": 2539 + }, + { + "epoch": 1.5330516148505886, + "grad_norm": 0.11784554272890091, + "learning_rate": 0.00010078870900788711, + "loss": 0.6766, + "step": 2540 + }, + { + "epoch": 1.533655297313613, + "grad_norm": 0.12097886949777603, + "learning_rate": 0.000100747198007472, + "loss": 0.7029, + "step": 2541 + }, + { + "epoch": 1.5342589797766375, + "grad_norm": 0.12290269136428833, + "learning_rate": 0.00010070568700705687, + "loss": 0.6882, + "step": 2542 + }, + { + "epoch": 1.534862662239662, + "grad_norm": 0.11816665530204773, + "learning_rate": 0.00010066417600664178, + "loss": 0.6425, + "step": 2543 + }, + { + "epoch": 1.5354663447026864, + "grad_norm": 0.11926699429750443, + "learning_rate": 0.00010062266500622665, + "loss": 0.6564, + "step": 2544 + }, + { + "epoch": 1.5360700271657108, + "grad_norm": 0.13197208940982819, + "learning_rate": 0.00010058115400581154, + "loss": 0.6045, + "step": 2545 + }, + { + "epoch": 1.5366737096287353, + "grad_norm": 0.12189357727766037, + "learning_rate": 0.00010053964300539644, + "loss": 0.6137, + "step": 2546 + }, + { + "epoch": 1.5372773920917597, + "grad_norm": 0.12850208580493927, + "learning_rate": 0.00010049813200498133, + "loss": 0.6533, + "step": 2547 + }, + { + "epoch": 1.5378810745547842, + "grad_norm": 0.13015426695346832, + "learning_rate": 0.00010045662100456621, + "loss": 0.597, + "step": 2548 + }, + { + "epoch": 1.5384847570178086, + "grad_norm": 0.13129258155822754, + "learning_rate": 0.00010041511000415111, + "loss": 0.5412, + "step": 2549 + }, + { + "epoch": 1.539088439480833, + "grad_norm": 0.1495341658592224, + "learning_rate": 0.000100373599003736, + "loss": 0.6265, + "step": 2550 + }, + { + "epoch": 1.5396921219438575, + "grad_norm": 0.1443036049604416, + "learning_rate": 0.00010033208800332087, + "loss": 0.5974, + "step": 2551 + }, + { + "epoch": 1.540295804406882, + "grad_norm": 0.13055108487606049, + "learning_rate": 0.00010029057700290579, + "loss": 0.5123, + "step": 2552 + }, + { + "epoch": 1.5408994868699064, + "grad_norm": 0.15532496571540833, + "learning_rate": 0.00010024906600249067, + "loss": 0.5273, + "step": 2553 + }, + { + "epoch": 1.5415031693329309, + "grad_norm": 0.15928137302398682, + "learning_rate": 0.00010020755500207555, + "loss": 0.5119, + "step": 2554 + }, + { + "epoch": 1.5421068517959553, + "grad_norm": 0.1558419167995453, + "learning_rate": 0.00010016604400166046, + "loss": 0.4332, + "step": 2555 + }, + { + "epoch": 1.5427105342589797, + "grad_norm": 0.1683838963508606, + "learning_rate": 0.00010012453300124533, + "loss": 0.3724, + "step": 2556 + }, + { + "epoch": 1.5433142167220042, + "grad_norm": 0.15534067153930664, + "learning_rate": 0.00010008302200083022, + "loss": 0.3148, + "step": 2557 + }, + { + "epoch": 1.5439178991850286, + "grad_norm": 0.10773934423923492, + "learning_rate": 0.00010004151100041512, + "loss": 0.7099, + "step": 2558 + }, + { + "epoch": 1.544521581648053, + "grad_norm": 0.1566229909658432, + "learning_rate": 0.0001, + "loss": 0.677, + "step": 2559 + }, + { + "epoch": 1.5451252641110775, + "grad_norm": 0.12028037011623383, + "learning_rate": 9.99584889995849e-05, + "loss": 0.8106, + "step": 2560 + }, + { + "epoch": 1.545728946574102, + "grad_norm": 0.1171046793460846, + "learning_rate": 9.991697799916978e-05, + "loss": 0.7519, + "step": 2561 + }, + { + "epoch": 1.5463326290371264, + "grad_norm": 0.11625587195158005, + "learning_rate": 9.987546699875468e-05, + "loss": 0.7559, + "step": 2562 + }, + { + "epoch": 1.5469363115001509, + "grad_norm": 0.11146189272403717, + "learning_rate": 9.983395599833956e-05, + "loss": 0.6246, + "step": 2563 + }, + { + "epoch": 1.5475399939631753, + "grad_norm": 0.1186312735080719, + "learning_rate": 9.979244499792445e-05, + "loss": 0.8725, + "step": 2564 + }, + { + "epoch": 1.5481436764261998, + "grad_norm": 0.11686369776725769, + "learning_rate": 9.975093399750934e-05, + "loss": 0.869, + "step": 2565 + }, + { + "epoch": 1.5487473588892242, + "grad_norm": 0.11561167985200882, + "learning_rate": 9.970942299709424e-05, + "loss": 0.7023, + "step": 2566 + }, + { + "epoch": 1.5493510413522489, + "grad_norm": 0.12024765461683273, + "learning_rate": 9.966791199667912e-05, + "loss": 0.9882, + "step": 2567 + }, + { + "epoch": 1.549954723815273, + "grad_norm": 0.10977562516927719, + "learning_rate": 9.962640099626401e-05, + "loss": 0.7127, + "step": 2568 + }, + { + "epoch": 1.5505584062782978, + "grad_norm": 0.10764128714799881, + "learning_rate": 9.958488999584891e-05, + "loss": 0.617, + "step": 2569 + }, + { + "epoch": 1.551162088741322, + "grad_norm": 0.10783983021974564, + "learning_rate": 9.954337899543378e-05, + "loss": 0.6401, + "step": 2570 + }, + { + "epoch": 1.5517657712043467, + "grad_norm": 0.12389583140611649, + "learning_rate": 9.950186799501868e-05, + "loss": 0.6535, + "step": 2571 + }, + { + "epoch": 1.552369453667371, + "grad_norm": 0.11414086818695068, + "learning_rate": 9.946035699460357e-05, + "loss": 0.7015, + "step": 2572 + }, + { + "epoch": 1.5529731361303956, + "grad_norm": 0.11838074028491974, + "learning_rate": 9.941884599418847e-05, + "loss": 0.9455, + "step": 2573 + }, + { + "epoch": 1.5535768185934198, + "grad_norm": 0.11571443825960159, + "learning_rate": 9.937733499377336e-05, + "loss": 1.1223, + "step": 2574 + }, + { + "epoch": 1.5541805010564445, + "grad_norm": 0.11583063751459122, + "learning_rate": 9.933582399335824e-05, + "loss": 0.7297, + "step": 2575 + }, + { + "epoch": 1.5547841835194687, + "grad_norm": 0.11269959062337875, + "learning_rate": 9.929431299294314e-05, + "loss": 0.7613, + "step": 2576 + }, + { + "epoch": 1.5553878659824933, + "grad_norm": 0.1328345686197281, + "learning_rate": 9.925280199252802e-05, + "loss": 0.7758, + "step": 2577 + }, + { + "epoch": 1.5559915484455176, + "grad_norm": 0.12824542820453644, + "learning_rate": 9.921129099211292e-05, + "loss": 0.7867, + "step": 2578 + }, + { + "epoch": 1.5565952309085422, + "grad_norm": 0.10252919793128967, + "learning_rate": 9.91697799916978e-05, + "loss": 0.6038, + "step": 2579 + }, + { + "epoch": 1.5571989133715665, + "grad_norm": 0.11735723912715912, + "learning_rate": 9.912826899128269e-05, + "loss": 0.718, + "step": 2580 + }, + { + "epoch": 1.5578025958345911, + "grad_norm": 0.12741461396217346, + "learning_rate": 9.908675799086759e-05, + "loss": 0.62, + "step": 2581 + }, + { + "epoch": 1.5584062782976154, + "grad_norm": 0.11475570499897003, + "learning_rate": 9.904524699045248e-05, + "loss": 0.7747, + "step": 2582 + }, + { + "epoch": 1.55900996076064, + "grad_norm": 0.11514543741941452, + "learning_rate": 9.900373599003736e-05, + "loss": 0.7556, + "step": 2583 + }, + { + "epoch": 1.5596136432236642, + "grad_norm": 0.10353957116603851, + "learning_rate": 9.896222498962225e-05, + "loss": 0.6466, + "step": 2584 + }, + { + "epoch": 1.560217325686689, + "grad_norm": 0.10758772492408752, + "learning_rate": 9.892071398920715e-05, + "loss": 1.2809, + "step": 2585 + }, + { + "epoch": 1.5608210081497131, + "grad_norm": 0.11749143153429031, + "learning_rate": 9.887920298879203e-05, + "loss": 0.6888, + "step": 2586 + }, + { + "epoch": 1.5614246906127378, + "grad_norm": 0.1256055384874344, + "learning_rate": 9.883769198837692e-05, + "loss": 0.7231, + "step": 2587 + }, + { + "epoch": 1.562028373075762, + "grad_norm": 0.11417461931705475, + "learning_rate": 9.879618098796182e-05, + "loss": 0.7842, + "step": 2588 + }, + { + "epoch": 1.5626320555387867, + "grad_norm": 0.10440021008253098, + "learning_rate": 9.87546699875467e-05, + "loss": 0.6953, + "step": 2589 + }, + { + "epoch": 1.563235738001811, + "grad_norm": 0.10997917503118515, + "learning_rate": 9.87131589871316e-05, + "loss": 0.6638, + "step": 2590 + }, + { + "epoch": 1.5638394204648356, + "grad_norm": 0.1075371578335762, + "learning_rate": 9.867164798671648e-05, + "loss": 0.5918, + "step": 2591 + }, + { + "epoch": 1.5644431029278598, + "grad_norm": 0.12016437947750092, + "learning_rate": 9.863013698630137e-05, + "loss": 0.7686, + "step": 2592 + }, + { + "epoch": 1.5650467853908845, + "grad_norm": 0.119369737803936, + "learning_rate": 9.858862598588627e-05, + "loss": 0.6284, + "step": 2593 + }, + { + "epoch": 1.5656504678539087, + "grad_norm": 0.11695661395788193, + "learning_rate": 9.854711498547115e-05, + "loss": 0.6282, + "step": 2594 + }, + { + "epoch": 1.5662541503169334, + "grad_norm": 0.11359156668186188, + "learning_rate": 9.850560398505605e-05, + "loss": 0.648, + "step": 2595 + }, + { + "epoch": 1.5668578327799576, + "grad_norm": 0.11783526092767715, + "learning_rate": 9.846409298464093e-05, + "loss": 0.5769, + "step": 2596 + }, + { + "epoch": 1.5674615152429823, + "grad_norm": 0.12867143750190735, + "learning_rate": 9.842258198422583e-05, + "loss": 0.7181, + "step": 2597 + }, + { + "epoch": 1.5680651977060065, + "grad_norm": 0.12808844447135925, + "learning_rate": 9.838107098381071e-05, + "loss": 0.6346, + "step": 2598 + }, + { + "epoch": 1.5686688801690312, + "grad_norm": 0.12455364316701889, + "learning_rate": 9.83395599833956e-05, + "loss": 0.6019, + "step": 2599 + }, + { + "epoch": 1.5692725626320554, + "grad_norm": 0.1380644142627716, + "learning_rate": 9.82980489829805e-05, + "loss": 0.6872, + "step": 2600 + }, + { + "epoch": 1.56987624509508, + "grad_norm": 0.14129561185836792, + "learning_rate": 9.825653798256539e-05, + "loss": 0.5949, + "step": 2601 + }, + { + "epoch": 1.5704799275581043, + "grad_norm": 0.13899260759353638, + "learning_rate": 9.821502698215027e-05, + "loss": 0.5492, + "step": 2602 + }, + { + "epoch": 1.571083610021129, + "grad_norm": 0.13930819928646088, + "learning_rate": 9.817351598173516e-05, + "loss": 0.5476, + "step": 2603 + }, + { + "epoch": 1.5716872924841532, + "grad_norm": 0.14664794504642487, + "learning_rate": 9.813200498132006e-05, + "loss": 0.4718, + "step": 2604 + }, + { + "epoch": 1.5722909749471778, + "grad_norm": 0.1534009575843811, + "learning_rate": 9.809049398090495e-05, + "loss": 0.4487, + "step": 2605 + }, + { + "epoch": 1.572894657410202, + "grad_norm": 0.16464664041996002, + "learning_rate": 9.804898298048983e-05, + "loss": 0.4443, + "step": 2606 + }, + { + "epoch": 1.5734983398732267, + "grad_norm": 0.15683351457118988, + "learning_rate": 9.800747198007473e-05, + "loss": 0.3545, + "step": 2607 + }, + { + "epoch": 1.5741020223362512, + "grad_norm": 0.142044797539711, + "learning_rate": 9.79659609796596e-05, + "loss": 0.7322, + "step": 2608 + }, + { + "epoch": 1.5747057047992756, + "grad_norm": 0.13781249523162842, + "learning_rate": 9.79244499792445e-05, + "loss": 0.7452, + "step": 2609 + }, + { + "epoch": 1.5753093872623, + "grad_norm": 0.12717604637145996, + "learning_rate": 9.788293897882939e-05, + "loss": 0.749, + "step": 2610 + }, + { + "epoch": 1.5759130697253245, + "grad_norm": 0.11919858306646347, + "learning_rate": 9.784142797841428e-05, + "loss": 0.7633, + "step": 2611 + }, + { + "epoch": 1.576516752188349, + "grad_norm": 0.12019964307546616, + "learning_rate": 9.779991697799918e-05, + "loss": 0.732, + "step": 2612 + }, + { + "epoch": 1.5771204346513734, + "grad_norm": 0.1292140632867813, + "learning_rate": 9.775840597758406e-05, + "loss": 0.7751, + "step": 2613 + }, + { + "epoch": 1.5777241171143979, + "grad_norm": 0.11317221075296402, + "learning_rate": 9.771689497716895e-05, + "loss": 0.7186, + "step": 2614 + }, + { + "epoch": 1.5783277995774223, + "grad_norm": 0.10719773173332214, + "learning_rate": 9.767538397675384e-05, + "loss": 0.7511, + "step": 2615 + }, + { + "epoch": 1.5789314820404468, + "grad_norm": 0.11181782931089401, + "learning_rate": 9.763387297633874e-05, + "loss": 0.7537, + "step": 2616 + }, + { + "epoch": 1.5795351645034712, + "grad_norm": 0.11379014700651169, + "learning_rate": 9.759236197592362e-05, + "loss": 0.7367, + "step": 2617 + }, + { + "epoch": 1.5801388469664956, + "grad_norm": 0.10861673206090927, + "learning_rate": 9.755085097550851e-05, + "loss": 0.6676, + "step": 2618 + }, + { + "epoch": 1.58074252942952, + "grad_norm": 0.11700250953435898, + "learning_rate": 9.750933997509341e-05, + "loss": 0.6992, + "step": 2619 + }, + { + "epoch": 1.5813462118925445, + "grad_norm": 0.10913542658090591, + "learning_rate": 9.74678289746783e-05, + "loss": 0.7337, + "step": 2620 + }, + { + "epoch": 1.581949894355569, + "grad_norm": 0.10216758400201797, + "learning_rate": 9.742631797426318e-05, + "loss": 0.658, + "step": 2621 + }, + { + "epoch": 1.5825535768185934, + "grad_norm": 0.10746050626039505, + "learning_rate": 9.738480697384807e-05, + "loss": 0.673, + "step": 2622 + }, + { + "epoch": 1.5831572592816179, + "grad_norm": 0.10291888564825058, + "learning_rate": 9.734329597343297e-05, + "loss": 0.7131, + "step": 2623 + }, + { + "epoch": 1.5837609417446423, + "grad_norm": 0.11497275531291962, + "learning_rate": 9.730178497301786e-05, + "loss": 0.7278, + "step": 2624 + }, + { + "epoch": 1.5843646242076668, + "grad_norm": 0.11286524683237076, + "learning_rate": 9.726027397260274e-05, + "loss": 0.7349, + "step": 2625 + }, + { + "epoch": 1.5849683066706912, + "grad_norm": 0.12478066235780716, + "learning_rate": 9.721876297218764e-05, + "loss": 0.686, + "step": 2626 + }, + { + "epoch": 1.5855719891337157, + "grad_norm": 0.1130586788058281, + "learning_rate": 9.717725197177252e-05, + "loss": 0.7609, + "step": 2627 + }, + { + "epoch": 1.58617567159674, + "grad_norm": 0.11195287853479385, + "learning_rate": 9.713574097135742e-05, + "loss": 0.6909, + "step": 2628 + }, + { + "epoch": 1.5867793540597646, + "grad_norm": 0.127894327044487, + "learning_rate": 9.70942299709423e-05, + "loss": 0.7867, + "step": 2629 + }, + { + "epoch": 1.587383036522789, + "grad_norm": 0.10859086364507675, + "learning_rate": 9.705271897052719e-05, + "loss": 0.7013, + "step": 2630 + }, + { + "epoch": 1.5879867189858135, + "grad_norm": 0.10940494388341904, + "learning_rate": 9.701120797011209e-05, + "loss": 0.9159, + "step": 2631 + }, + { + "epoch": 1.588590401448838, + "grad_norm": 0.1171516478061676, + "learning_rate": 9.696969696969698e-05, + "loss": 0.6883, + "step": 2632 + }, + { + "epoch": 1.5891940839118623, + "grad_norm": 0.14016135036945343, + "learning_rate": 9.692818596928186e-05, + "loss": 0.6465, + "step": 2633 + }, + { + "epoch": 1.5897977663748868, + "grad_norm": 0.10493374615907669, + "learning_rate": 9.688667496886675e-05, + "loss": 0.6847, + "step": 2634 + }, + { + "epoch": 1.5904014488379112, + "grad_norm": 0.12011600285768509, + "learning_rate": 9.684516396845165e-05, + "loss": 0.7248, + "step": 2635 + }, + { + "epoch": 1.5910051313009357, + "grad_norm": 0.1263171285390854, + "learning_rate": 9.680365296803654e-05, + "loss": 0.646, + "step": 2636 + }, + { + "epoch": 1.5916088137639601, + "grad_norm": 0.1184513196349144, + "learning_rate": 9.676214196762142e-05, + "loss": 0.6105, + "step": 2637 + }, + { + "epoch": 1.5922124962269846, + "grad_norm": 0.13153387606143951, + "learning_rate": 9.672063096720632e-05, + "loss": 0.7279, + "step": 2638 + }, + { + "epoch": 1.592816178690009, + "grad_norm": 0.12811937928199768, + "learning_rate": 9.667911996679121e-05, + "loss": 0.7132, + "step": 2639 + }, + { + "epoch": 1.5934198611530335, + "grad_norm": 0.10892543196678162, + "learning_rate": 9.66376089663761e-05, + "loss": 0.8649, + "step": 2640 + }, + { + "epoch": 1.594023543616058, + "grad_norm": 0.1102840006351471, + "learning_rate": 9.659609796596098e-05, + "loss": 0.685, + "step": 2641 + }, + { + "epoch": 1.5946272260790824, + "grad_norm": 0.11115845292806625, + "learning_rate": 9.655458696554588e-05, + "loss": 0.5798, + "step": 2642 + }, + { + "epoch": 1.5952309085421068, + "grad_norm": 0.10934832692146301, + "learning_rate": 9.651307596513077e-05, + "loss": 0.6023, + "step": 2643 + }, + { + "epoch": 1.5958345910051313, + "grad_norm": 0.12175562977790833, + "learning_rate": 9.647156496471565e-05, + "loss": 0.7045, + "step": 2644 + }, + { + "epoch": 1.5964382734681557, + "grad_norm": 0.12231041491031647, + "learning_rate": 9.643005396430055e-05, + "loss": 0.6613, + "step": 2645 + }, + { + "epoch": 1.5970419559311801, + "grad_norm": 0.11286479979753494, + "learning_rate": 9.638854296388543e-05, + "loss": 0.6132, + "step": 2646 + }, + { + "epoch": 1.5976456383942046, + "grad_norm": 0.13253097236156464, + "learning_rate": 9.634703196347033e-05, + "loss": 0.65, + "step": 2647 + }, + { + "epoch": 1.5982493208572293, + "grad_norm": 0.1287652999162674, + "learning_rate": 9.630552096305521e-05, + "loss": 0.6567, + "step": 2648 + }, + { + "epoch": 1.5988530033202535, + "grad_norm": 0.15067052841186523, + "learning_rate": 9.62640099626401e-05, + "loss": 0.6427, + "step": 2649 + }, + { + "epoch": 1.5994566857832782, + "grad_norm": 0.13497310876846313, + "learning_rate": 9.6222498962225e-05, + "loss": 0.6178, + "step": 2650 + }, + { + "epoch": 1.6000603682463024, + "grad_norm": 0.13561968505382538, + "learning_rate": 9.618098796180989e-05, + "loss": 0.5794, + "step": 2651 + }, + { + "epoch": 1.600664050709327, + "grad_norm": 0.1425342857837677, + "learning_rate": 9.613947696139477e-05, + "loss": 0.5064, + "step": 2652 + }, + { + "epoch": 1.6012677331723513, + "grad_norm": 0.13813789188861847, + "learning_rate": 9.609796596097966e-05, + "loss": 0.4899, + "step": 2653 + }, + { + "epoch": 1.601871415635376, + "grad_norm": 0.14464135468006134, + "learning_rate": 9.605645496056456e-05, + "loss": 0.4724, + "step": 2654 + }, + { + "epoch": 1.6024750980984002, + "grad_norm": 0.15810330212116241, + "learning_rate": 9.601494396014945e-05, + "loss": 0.4372, + "step": 2655 + }, + { + "epoch": 1.6030787805614248, + "grad_norm": 0.17366820573806763, + "learning_rate": 9.597343295973433e-05, + "loss": 0.4398, + "step": 2656 + }, + { + "epoch": 1.603682463024449, + "grad_norm": 0.159001424908638, + "learning_rate": 9.593192195931923e-05, + "loss": 0.2789, + "step": 2657 + }, + { + "epoch": 1.6042861454874737, + "grad_norm": 0.2292790710926056, + "learning_rate": 9.58904109589041e-05, + "loss": 0.7525, + "step": 2658 + }, + { + "epoch": 1.604889827950498, + "grad_norm": 0.10119739919900894, + "learning_rate": 9.5848899958489e-05, + "loss": 0.717, + "step": 2659 + }, + { + "epoch": 1.6054935104135226, + "grad_norm": 0.10911940038204193, + "learning_rate": 9.580738895807389e-05, + "loss": 0.6715, + "step": 2660 + }, + { + "epoch": 1.6060971928765468, + "grad_norm": 0.12329068779945374, + "learning_rate": 9.576587795765879e-05, + "loss": 0.7417, + "step": 2661 + }, + { + "epoch": 1.6067008753395715, + "grad_norm": 0.12208274751901627, + "learning_rate": 9.572436695724368e-05, + "loss": 0.7459, + "step": 2662 + }, + { + "epoch": 1.6073045578025957, + "grad_norm": 0.10721976310014725, + "learning_rate": 9.568285595682856e-05, + "loss": 0.6844, + "step": 2663 + }, + { + "epoch": 1.6079082402656204, + "grad_norm": 0.1042000949382782, + "learning_rate": 9.564134495641347e-05, + "loss": 0.7077, + "step": 2664 + }, + { + "epoch": 1.6085119227286446, + "grad_norm": 0.1249273270368576, + "learning_rate": 9.559983395599834e-05, + "loss": 0.7202, + "step": 2665 + }, + { + "epoch": 1.6091156051916693, + "grad_norm": 0.11742489039897919, + "learning_rate": 9.555832295558324e-05, + "loss": 0.6975, + "step": 2666 + }, + { + "epoch": 1.6097192876546935, + "grad_norm": 0.1132553368806839, + "learning_rate": 9.551681195516812e-05, + "loss": 0.9097, + "step": 2667 + }, + { + "epoch": 1.6103229701177182, + "grad_norm": 0.1129176989197731, + "learning_rate": 9.547530095475301e-05, + "loss": 0.6419, + "step": 2668 + }, + { + "epoch": 1.6109266525807424, + "grad_norm": 0.1597258597612381, + "learning_rate": 9.543378995433791e-05, + "loss": 0.7293, + "step": 2669 + }, + { + "epoch": 1.611530335043767, + "grad_norm": 0.11292175203561783, + "learning_rate": 9.53922789539228e-05, + "loss": 0.8514, + "step": 2670 + }, + { + "epoch": 1.6121340175067913, + "grad_norm": 0.11544401198625565, + "learning_rate": 9.535076795350768e-05, + "loss": 0.6806, + "step": 2671 + }, + { + "epoch": 1.612737699969816, + "grad_norm": 0.10773265361785889, + "learning_rate": 9.530925695309257e-05, + "loss": 0.7818, + "step": 2672 + }, + { + "epoch": 1.6133413824328402, + "grad_norm": 0.11048367619514465, + "learning_rate": 9.526774595267747e-05, + "loss": 0.6819, + "step": 2673 + }, + { + "epoch": 1.6139450648958649, + "grad_norm": 0.12018372118473053, + "learning_rate": 9.522623495226234e-05, + "loss": 0.6918, + "step": 2674 + }, + { + "epoch": 1.614548747358889, + "grad_norm": 0.1126914694905281, + "learning_rate": 9.518472395184724e-05, + "loss": 1.157, + "step": 2675 + }, + { + "epoch": 1.6151524298219138, + "grad_norm": 0.11121730506420135, + "learning_rate": 9.514321295143214e-05, + "loss": 0.7578, + "step": 2676 + }, + { + "epoch": 1.615756112284938, + "grad_norm": 0.1760278046131134, + "learning_rate": 9.510170195101702e-05, + "loss": 0.7201, + "step": 2677 + }, + { + "epoch": 1.6163597947479627, + "grad_norm": 0.1170695498585701, + "learning_rate": 9.506019095060192e-05, + "loss": 0.7593, + "step": 2678 + }, + { + "epoch": 1.6169634772109869, + "grad_norm": 0.12168256938457489, + "learning_rate": 9.50186799501868e-05, + "loss": 0.6685, + "step": 2679 + }, + { + "epoch": 1.6175671596740115, + "grad_norm": 0.1391884833574295, + "learning_rate": 9.497716894977169e-05, + "loss": 0.8251, + "step": 2680 + }, + { + "epoch": 1.6181708421370358, + "grad_norm": 0.11195441335439682, + "learning_rate": 9.493565794935658e-05, + "loss": 0.6997, + "step": 2681 + }, + { + "epoch": 1.6187745246000604, + "grad_norm": 0.11107617616653442, + "learning_rate": 9.489414694894148e-05, + "loss": 0.6234, + "step": 2682 + }, + { + "epoch": 1.6193782070630847, + "grad_norm": 0.1288166344165802, + "learning_rate": 9.485263594852638e-05, + "loss": 1.0108, + "step": 2683 + }, + { + "epoch": 1.6199818895261093, + "grad_norm": 0.11174750328063965, + "learning_rate": 9.481112494811125e-05, + "loss": 0.633, + "step": 2684 + }, + { + "epoch": 1.6205855719891336, + "grad_norm": 0.11930475383996964, + "learning_rate": 9.476961394769615e-05, + "loss": 0.7875, + "step": 2685 + }, + { + "epoch": 1.6211892544521582, + "grad_norm": 0.1079435870051384, + "learning_rate": 9.472810294728104e-05, + "loss": 0.9182, + "step": 2686 + }, + { + "epoch": 1.6217929369151824, + "grad_norm": 0.10946819186210632, + "learning_rate": 9.468659194686592e-05, + "loss": 0.6537, + "step": 2687 + }, + { + "epoch": 1.6223966193782071, + "grad_norm": 0.1106157973408699, + "learning_rate": 9.464508094645081e-05, + "loss": 0.5829, + "step": 2688 + }, + { + "epoch": 1.6230003018412316, + "grad_norm": 0.1129320040345192, + "learning_rate": 9.460356994603571e-05, + "loss": 0.7042, + "step": 2689 + }, + { + "epoch": 1.623603984304256, + "grad_norm": 0.10891727358102798, + "learning_rate": 9.45620589456206e-05, + "loss": 0.6194, + "step": 2690 + }, + { + "epoch": 1.6242076667672805, + "grad_norm": 0.1271827220916748, + "learning_rate": 9.452054794520548e-05, + "loss": 0.6827, + "step": 2691 + }, + { + "epoch": 1.624811349230305, + "grad_norm": 0.10997018963098526, + "learning_rate": 9.447903694479038e-05, + "loss": 0.6144, + "step": 2692 + }, + { + "epoch": 1.6254150316933293, + "grad_norm": 0.11678522080183029, + "learning_rate": 9.443752594437525e-05, + "loss": 0.6143, + "step": 2693 + }, + { + "epoch": 1.6260187141563538, + "grad_norm": 0.13937309384346008, + "learning_rate": 9.439601494396015e-05, + "loss": 0.7645, + "step": 2694 + }, + { + "epoch": 1.6266223966193782, + "grad_norm": 0.12499664723873138, + "learning_rate": 9.435450394354504e-05, + "loss": 0.6144, + "step": 2695 + }, + { + "epoch": 1.6272260790824027, + "grad_norm": 0.12473144382238388, + "learning_rate": 9.431299294312993e-05, + "loss": 0.5597, + "step": 2696 + }, + { + "epoch": 1.6278297615454271, + "grad_norm": 0.13701920211315155, + "learning_rate": 9.427148194271483e-05, + "loss": 0.6018, + "step": 2697 + }, + { + "epoch": 1.6284334440084516, + "grad_norm": 0.12637366354465485, + "learning_rate": 9.422997094229971e-05, + "loss": 0.6128, + "step": 2698 + }, + { + "epoch": 1.629037126471476, + "grad_norm": 0.12743206322193146, + "learning_rate": 9.41884599418846e-05, + "loss": 0.6142, + "step": 2699 + }, + { + "epoch": 1.6296408089345005, + "grad_norm": 0.1346701979637146, + "learning_rate": 9.414694894146949e-05, + "loss": 0.6122, + "step": 2700 + }, + { + "epoch": 1.630244491397525, + "grad_norm": 0.13191986083984375, + "learning_rate": 9.410543794105439e-05, + "loss": 0.566, + "step": 2701 + }, + { + "epoch": 1.6308481738605494, + "grad_norm": 0.14156706631183624, + "learning_rate": 9.406392694063927e-05, + "loss": 0.5651, + "step": 2702 + }, + { + "epoch": 1.6314518563235738, + "grad_norm": 0.1478833556175232, + "learning_rate": 9.402241594022416e-05, + "loss": 0.5533, + "step": 2703 + }, + { + "epoch": 1.6320555387865983, + "grad_norm": 0.1488913893699646, + "learning_rate": 9.398090493980906e-05, + "loss": 0.5218, + "step": 2704 + }, + { + "epoch": 1.6326592212496227, + "grad_norm": 0.1752762794494629, + "learning_rate": 9.393939393939395e-05, + "loss": 0.4899, + "step": 2705 + }, + { + "epoch": 1.6332629037126472, + "grad_norm": 0.17121976613998413, + "learning_rate": 9.389788293897883e-05, + "loss": 0.4118, + "step": 2706 + }, + { + "epoch": 1.6338665861756716, + "grad_norm": 0.16428853571414948, + "learning_rate": 9.385637193856372e-05, + "loss": 0.3176, + "step": 2707 + }, + { + "epoch": 1.634470268638696, + "grad_norm": 0.11714405566453934, + "learning_rate": 9.381486093814862e-05, + "loss": 0.8348, + "step": 2708 + }, + { + "epoch": 1.6350739511017205, + "grad_norm": 0.11836113035678864, + "learning_rate": 9.37733499377335e-05, + "loss": 0.7564, + "step": 2709 + }, + { + "epoch": 1.635677633564745, + "grad_norm": 0.11199770867824554, + "learning_rate": 9.373183893731839e-05, + "loss": 0.6204, + "step": 2710 + }, + { + "epoch": 1.6362813160277694, + "grad_norm": 0.12327376753091812, + "learning_rate": 9.369032793690329e-05, + "loss": 0.7127, + "step": 2711 + }, + { + "epoch": 1.6368849984907938, + "grad_norm": 0.12553828954696655, + "learning_rate": 9.364881693648817e-05, + "loss": 0.7299, + "step": 2712 + }, + { + "epoch": 1.6374886809538183, + "grad_norm": 0.10863874852657318, + "learning_rate": 9.360730593607307e-05, + "loss": 0.669, + "step": 2713 + }, + { + "epoch": 1.6380923634168427, + "grad_norm": 0.12236540764570236, + "learning_rate": 9.356579493565795e-05, + "loss": 0.977, + "step": 2714 + }, + { + "epoch": 1.6386960458798672, + "grad_norm": 0.1284298151731491, + "learning_rate": 9.352428393524284e-05, + "loss": 0.9, + "step": 2715 + }, + { + "epoch": 1.6392997283428916, + "grad_norm": 0.12449759244918823, + "learning_rate": 9.348277293482774e-05, + "loss": 0.5975, + "step": 2716 + }, + { + "epoch": 1.639903410805916, + "grad_norm": 0.11096858978271484, + "learning_rate": 9.344126193441262e-05, + "loss": 0.7423, + "step": 2717 + }, + { + "epoch": 1.6405070932689405, + "grad_norm": 0.11665850877761841, + "learning_rate": 9.339975093399751e-05, + "loss": 0.7947, + "step": 2718 + }, + { + "epoch": 1.641110775731965, + "grad_norm": 0.1213374212384224, + "learning_rate": 9.33582399335824e-05, + "loss": 0.7891, + "step": 2719 + }, + { + "epoch": 1.6417144581949894, + "grad_norm": 0.11723334342241287, + "learning_rate": 9.33167289331673e-05, + "loss": 0.7008, + "step": 2720 + }, + { + "epoch": 1.6423181406580138, + "grad_norm": 0.39810943603515625, + "learning_rate": 9.327521793275218e-05, + "loss": 0.7724, + "step": 2721 + }, + { + "epoch": 1.6429218231210383, + "grad_norm": 0.1107415184378624, + "learning_rate": 9.323370693233707e-05, + "loss": 0.6663, + "step": 2722 + }, + { + "epoch": 1.6435255055840627, + "grad_norm": 0.14163663983345032, + "learning_rate": 9.319219593192197e-05, + "loss": 0.8732, + "step": 2723 + }, + { + "epoch": 1.6441291880470872, + "grad_norm": 0.11508408188819885, + "learning_rate": 9.315068493150684e-05, + "loss": 0.7499, + "step": 2724 + }, + { + "epoch": 1.6447328705101116, + "grad_norm": 0.11651081591844559, + "learning_rate": 9.310917393109174e-05, + "loss": 0.6448, + "step": 2725 + }, + { + "epoch": 1.645336552973136, + "grad_norm": 0.11762251704931259, + "learning_rate": 9.306766293067663e-05, + "loss": 0.6679, + "step": 2726 + }, + { + "epoch": 1.6459402354361605, + "grad_norm": 0.11654434353113174, + "learning_rate": 9.302615193026153e-05, + "loss": 0.9322, + "step": 2727 + }, + { + "epoch": 1.646543917899185, + "grad_norm": 0.11348237842321396, + "learning_rate": 9.298464092984642e-05, + "loss": 0.7319, + "step": 2728 + }, + { + "epoch": 1.6471476003622096, + "grad_norm": 0.19426052272319794, + "learning_rate": 9.29431299294313e-05, + "loss": 0.7589, + "step": 2729 + }, + { + "epoch": 1.6477512828252339, + "grad_norm": 0.1138731986284256, + "learning_rate": 9.29016189290162e-05, + "loss": 0.6751, + "step": 2730 + }, + { + "epoch": 1.6483549652882585, + "grad_norm": 0.11389704793691635, + "learning_rate": 9.286010792860108e-05, + "loss": 0.8278, + "step": 2731 + }, + { + "epoch": 1.6489586477512828, + "grad_norm": 0.13111789524555206, + "learning_rate": 9.281859692818598e-05, + "loss": 0.8243, + "step": 2732 + }, + { + "epoch": 1.6495623302143074, + "grad_norm": 0.10878365486860275, + "learning_rate": 9.277708592777086e-05, + "loss": 0.9464, + "step": 2733 + }, + { + "epoch": 1.6501660126773317, + "grad_norm": 0.11654222756624222, + "learning_rate": 9.273557492735575e-05, + "loss": 0.7815, + "step": 2734 + }, + { + "epoch": 1.6507696951403563, + "grad_norm": 0.10811632871627808, + "learning_rate": 9.269406392694065e-05, + "loss": 0.7112, + "step": 2735 + }, + { + "epoch": 1.6513733776033805, + "grad_norm": 0.1144120842218399, + "learning_rate": 9.265255292652554e-05, + "loss": 0.6915, + "step": 2736 + }, + { + "epoch": 1.6519770600664052, + "grad_norm": 0.12281805276870728, + "learning_rate": 9.261104192611042e-05, + "loss": 0.7846, + "step": 2737 + }, + { + "epoch": 1.6525807425294294, + "grad_norm": 0.11404890567064285, + "learning_rate": 9.256953092569531e-05, + "loss": 0.6841, + "step": 2738 + }, + { + "epoch": 1.653184424992454, + "grad_norm": 0.10799200087785721, + "learning_rate": 9.252801992528021e-05, + "loss": 0.7854, + "step": 2739 + }, + { + "epoch": 1.6537881074554783, + "grad_norm": 0.11798325926065445, + "learning_rate": 9.24865089248651e-05, + "loss": 0.6409, + "step": 2740 + }, + { + "epoch": 1.654391789918503, + "grad_norm": 0.11283228546380997, + "learning_rate": 9.244499792444998e-05, + "loss": 0.7441, + "step": 2741 + }, + { + "epoch": 1.6549954723815272, + "grad_norm": 0.11868225038051605, + "learning_rate": 9.240348692403488e-05, + "loss": 0.7046, + "step": 2742 + }, + { + "epoch": 1.655599154844552, + "grad_norm": 0.11451592296361923, + "learning_rate": 9.236197592361975e-05, + "loss": 0.677, + "step": 2743 + }, + { + "epoch": 1.6562028373075761, + "grad_norm": 0.11685509234666824, + "learning_rate": 9.232046492320465e-05, + "loss": 0.6602, + "step": 2744 + }, + { + "epoch": 1.6568065197706008, + "grad_norm": 0.11831925809383392, + "learning_rate": 9.227895392278954e-05, + "loss": 0.7313, + "step": 2745 + }, + { + "epoch": 1.657410202233625, + "grad_norm": 0.11828596144914627, + "learning_rate": 9.223744292237443e-05, + "loss": 0.5914, + "step": 2746 + }, + { + "epoch": 1.6580138846966497, + "grad_norm": 0.12427350878715515, + "learning_rate": 9.219593192195933e-05, + "loss": 0.6278, + "step": 2747 + }, + { + "epoch": 1.658617567159674, + "grad_norm": 0.12681810557842255, + "learning_rate": 9.215442092154421e-05, + "loss": 0.6496, + "step": 2748 + }, + { + "epoch": 1.6592212496226986, + "grad_norm": 0.1341797262430191, + "learning_rate": 9.211290992112911e-05, + "loss": 0.6372, + "step": 2749 + }, + { + "epoch": 1.6598249320857228, + "grad_norm": 0.1370631456375122, + "learning_rate": 9.207139892071399e-05, + "loss": 0.6669, + "step": 2750 + }, + { + "epoch": 1.6604286145487475, + "grad_norm": 0.14159443974494934, + "learning_rate": 9.202988792029889e-05, + "loss": 0.5644, + "step": 2751 + }, + { + "epoch": 1.6610322970117717, + "grad_norm": 0.13964007794857025, + "learning_rate": 9.198837691988377e-05, + "loss": 0.6035, + "step": 2752 + }, + { + "epoch": 1.6616359794747964, + "grad_norm": 0.154313325881958, + "learning_rate": 9.194686591946866e-05, + "loss": 0.5195, + "step": 2753 + }, + { + "epoch": 1.6622396619378206, + "grad_norm": 0.14789967238903046, + "learning_rate": 9.190535491905356e-05, + "loss": 0.5039, + "step": 2754 + }, + { + "epoch": 1.6628433444008452, + "grad_norm": 0.1544695347547531, + "learning_rate": 9.186384391863845e-05, + "loss": 0.4487, + "step": 2755 + }, + { + "epoch": 1.6634470268638695, + "grad_norm": 0.15225286781787872, + "learning_rate": 9.182233291822333e-05, + "loss": 0.4129, + "step": 2756 + }, + { + "epoch": 1.6640507093268941, + "grad_norm": 0.16880232095718384, + "learning_rate": 9.178082191780822e-05, + "loss": 0.3468, + "step": 2757 + }, + { + "epoch": 1.6646543917899184, + "grad_norm": 0.10654531419277191, + "learning_rate": 9.173931091739312e-05, + "loss": 0.6731, + "step": 2758 + }, + { + "epoch": 1.665258074252943, + "grad_norm": 0.12524224817752838, + "learning_rate": 9.1697799916978e-05, + "loss": 0.7284, + "step": 2759 + }, + { + "epoch": 1.6658617567159673, + "grad_norm": 0.11452171951532364, + "learning_rate": 9.165628891656289e-05, + "loss": 0.7422, + "step": 2760 + }, + { + "epoch": 1.6670691216420161, + "grad_norm": 0.11638887971639633, + "learning_rate": 9.161477791614779e-05, + "loss": 0.7024, + "step": 2761 + }, + { + "epoch": 1.6676728041050408, + "grad_norm": 0.1171191856265068, + "learning_rate": 9.157326691573267e-05, + "loss": 0.6618, + "step": 2762 + }, + { + "epoch": 1.668276486568065, + "grad_norm": 0.11434746533632278, + "learning_rate": 9.153175591531757e-05, + "loss": 0.876, + "step": 2763 + }, + { + "epoch": 1.6688801690310897, + "grad_norm": 0.11305181682109833, + "learning_rate": 9.149024491490245e-05, + "loss": 0.8247, + "step": 2764 + }, + { + "epoch": 1.669483851494114, + "grad_norm": 0.10942018032073975, + "learning_rate": 9.144873391448734e-05, + "loss": 0.6116, + "step": 2765 + }, + { + "epoch": 1.6700875339571386, + "grad_norm": 0.14481772482395172, + "learning_rate": 9.140722291407224e-05, + "loss": 0.7061, + "step": 2766 + }, + { + "epoch": 1.6706912164201628, + "grad_norm": 0.12722566723823547, + "learning_rate": 9.136571191365713e-05, + "loss": 0.6226, + "step": 2767 + }, + { + "epoch": 1.6712948988831875, + "grad_norm": 0.12751980125904083, + "learning_rate": 9.132420091324201e-05, + "loss": 0.7537, + "step": 2768 + }, + { + "epoch": 1.671898581346212, + "grad_norm": 0.1445055454969406, + "learning_rate": 9.12826899128269e-05, + "loss": 0.7353, + "step": 2769 + }, + { + "epoch": 1.6725022638092364, + "grad_norm": 0.15946505963802338, + "learning_rate": 9.12411789124118e-05, + "loss": 0.6124, + "step": 2770 + }, + { + "epoch": 1.6731059462722608, + "grad_norm": 0.11702273786067963, + "learning_rate": 9.119966791199668e-05, + "loss": 0.7526, + "step": 2771 + }, + { + "epoch": 1.6737096287352853, + "grad_norm": 0.12233686447143555, + "learning_rate": 9.115815691158157e-05, + "loss": 0.6447, + "step": 2772 + }, + { + "epoch": 1.6743133111983097, + "grad_norm": 0.12842793762683868, + "learning_rate": 9.111664591116647e-05, + "loss": 0.8035, + "step": 2773 + }, + { + "epoch": 1.6749169936613342, + "grad_norm": 0.27307209372520447, + "learning_rate": 9.107513491075136e-05, + "loss": 0.8121, + "step": 2774 + }, + { + "epoch": 1.6755206761243586, + "grad_norm": 0.12133491784334183, + "learning_rate": 9.103362391033624e-05, + "loss": 0.6252, + "step": 2775 + }, + { + "epoch": 1.676124358587383, + "grad_norm": 0.12983252108097076, + "learning_rate": 9.099211290992113e-05, + "loss": 0.6484, + "step": 2776 + }, + { + "epoch": 1.6767280410504075, + "grad_norm": 0.12295020371675491, + "learning_rate": 9.095060190950603e-05, + "loss": 0.6794, + "step": 2777 + }, + { + "epoch": 1.677331723513432, + "grad_norm": 0.12450043857097626, + "learning_rate": 9.090909090909092e-05, + "loss": 0.7732, + "step": 2778 + }, + { + "epoch": 1.6779354059764564, + "grad_norm": 0.15557287633419037, + "learning_rate": 9.08675799086758e-05, + "loss": 0.601, + "step": 2779 + }, + { + "epoch": 1.6785390884394809, + "grad_norm": 0.1215049996972084, + "learning_rate": 9.08260689082607e-05, + "loss": 0.6817, + "step": 2780 + }, + { + "epoch": 1.6791427709025053, + "grad_norm": 0.11101005971431732, + "learning_rate": 9.078455790784558e-05, + "loss": 0.5621, + "step": 2781 + }, + { + "epoch": 1.6797464533655297, + "grad_norm": 0.11614114046096802, + "learning_rate": 9.074304690743048e-05, + "loss": 0.6261, + "step": 2782 + }, + { + "epoch": 1.6803501358285542, + "grad_norm": 0.11843936145305634, + "learning_rate": 9.070153590701536e-05, + "loss": 0.8192, + "step": 2783 + }, + { + "epoch": 1.6809538182915786, + "grad_norm": 0.12000729143619537, + "learning_rate": 9.066002490660025e-05, + "loss": 0.6353, + "step": 2784 + }, + { + "epoch": 1.681557500754603, + "grad_norm": 0.11501980572938919, + "learning_rate": 9.061851390618515e-05, + "loss": 0.6151, + "step": 2785 + }, + { + "epoch": 1.6821611832176275, + "grad_norm": 0.13076992332935333, + "learning_rate": 9.057700290577004e-05, + "loss": 0.7825, + "step": 2786 + }, + { + "epoch": 1.682764865680652, + "grad_norm": 0.13332267105579376, + "learning_rate": 9.053549190535492e-05, + "loss": 0.8015, + "step": 2787 + }, + { + "epoch": 1.6833685481436764, + "grad_norm": 0.12166225165128708, + "learning_rate": 9.049398090493981e-05, + "loss": 0.6739, + "step": 2788 + }, + { + "epoch": 1.6839722306067009, + "grad_norm": 0.13338027894496918, + "learning_rate": 9.045246990452471e-05, + "loss": 0.6793, + "step": 2789 + }, + { + "epoch": 1.6845759130697253, + "grad_norm": 0.11463173478841782, + "learning_rate": 9.041095890410958e-05, + "loss": 0.535, + "step": 2790 + }, + { + "epoch": 1.6851795955327498, + "grad_norm": 0.11622385680675507, + "learning_rate": 9.036944790369448e-05, + "loss": 0.6049, + "step": 2791 + }, + { + "epoch": 1.6857832779957742, + "grad_norm": 0.1293102502822876, + "learning_rate": 9.032793690327938e-05, + "loss": 0.6065, + "step": 2792 + }, + { + "epoch": 1.6863869604587987, + "grad_norm": 0.12283488363027573, + "learning_rate": 9.028642590286426e-05, + "loss": 0.5595, + "step": 2793 + }, + { + "epoch": 1.686990642921823, + "grad_norm": 0.13364657759666443, + "learning_rate": 9.024491490244916e-05, + "loss": 0.5544, + "step": 2794 + }, + { + "epoch": 1.6875943253848475, + "grad_norm": 0.1329110860824585, + "learning_rate": 9.020340390203404e-05, + "loss": 0.584, + "step": 2795 + }, + { + "epoch": 1.688198007847872, + "grad_norm": 0.13042087852954865, + "learning_rate": 9.016189290161894e-05, + "loss": 0.5376, + "step": 2796 + }, + { + "epoch": 1.6888016903108964, + "grad_norm": 0.13576170802116394, + "learning_rate": 9.012038190120381e-05, + "loss": 0.4989, + "step": 2797 + }, + { + "epoch": 1.6894053727739209, + "grad_norm": 0.14754711091518402, + "learning_rate": 9.007887090078871e-05, + "loss": 0.5445, + "step": 2798 + }, + { + "epoch": 1.6900090552369453, + "grad_norm": 0.14908315241336823, + "learning_rate": 9.003735990037361e-05, + "loss": 0.5666, + "step": 2799 + }, + { + "epoch": 1.6906127376999698, + "grad_norm": 0.1546728014945984, + "learning_rate": 8.999584889995849e-05, + "loss": 0.472, + "step": 2800 + }, + { + "epoch": 1.6912164201629942, + "grad_norm": 0.16218960285186768, + "learning_rate": 8.995433789954339e-05, + "loss": 0.3952, + "step": 2801 + }, + { + "epoch": 1.6918201026260187, + "grad_norm": 0.1644655466079712, + "learning_rate": 8.991282689912827e-05, + "loss": 0.4062, + "step": 2802 + }, + { + "epoch": 1.6924237850890431, + "grad_norm": 0.1662607192993164, + "learning_rate": 8.987131589871316e-05, + "loss": 0.3727, + "step": 2803 + }, + { + "epoch": 1.6930274675520676, + "grad_norm": 0.1667158454656601, + "learning_rate": 8.982980489829805e-05, + "loss": 0.3083, + "step": 2804 + }, + { + "epoch": 1.693631150015092, + "grad_norm": 0.16382873058319092, + "learning_rate": 8.978829389788295e-05, + "loss": 0.2529, + "step": 2805 + }, + { + "epoch": 1.6942348324781165, + "grad_norm": 0.1675097644329071, + "learning_rate": 8.974678289746783e-05, + "loss": 0.2192, + "step": 2806 + }, + { + "epoch": 1.694838514941141, + "grad_norm": 0.13541610538959503, + "learning_rate": 8.970527189705272e-05, + "loss": 0.6514, + "step": 2807 + }, + { + "epoch": 1.6954421974041654, + "grad_norm": 0.13940273225307465, + "learning_rate": 8.966376089663762e-05, + "loss": 0.6512, + "step": 2808 + }, + { + "epoch": 1.69604587986719, + "grad_norm": 0.12449961155653, + "learning_rate": 8.962224989622249e-05, + "loss": 0.6299, + "step": 2809 + }, + { + "epoch": 1.6966495623302142, + "grad_norm": 0.14902494847774506, + "learning_rate": 8.958073889580739e-05, + "loss": 0.8532, + "step": 2810 + }, + { + "epoch": 1.697253244793239, + "grad_norm": 0.14396223425865173, + "learning_rate": 8.953922789539228e-05, + "loss": 0.782, + "step": 2811 + }, + { + "epoch": 1.6978569272562631, + "grad_norm": 0.1220279112458229, + "learning_rate": 8.949771689497717e-05, + "loss": 0.6884, + "step": 2812 + }, + { + "epoch": 1.6984606097192878, + "grad_norm": 0.12583515048027039, + "learning_rate": 8.945620589456207e-05, + "loss": 0.7678, + "step": 2813 + }, + { + "epoch": 1.699064292182312, + "grad_norm": 0.12447942793369293, + "learning_rate": 8.941469489414695e-05, + "loss": 0.6479, + "step": 2814 + }, + { + "epoch": 1.6996679746453367, + "grad_norm": 0.1457148939371109, + "learning_rate": 8.937318389373184e-05, + "loss": 0.6261, + "step": 2815 + }, + { + "epoch": 1.700271657108361, + "grad_norm": 0.11962111294269562, + "learning_rate": 8.933167289331673e-05, + "loss": 0.6798, + "step": 2816 + }, + { + "epoch": 1.7008753395713856, + "grad_norm": 0.13384521007537842, + "learning_rate": 8.929016189290163e-05, + "loss": 0.7494, + "step": 2817 + }, + { + "epoch": 1.7014790220344098, + "grad_norm": 0.12299605458974838, + "learning_rate": 8.924865089248651e-05, + "loss": 0.6161, + "step": 2818 + }, + { + "epoch": 1.7020827044974345, + "grad_norm": 0.13254275918006897, + "learning_rate": 8.92071398920714e-05, + "loss": 0.693, + "step": 2819 + }, + { + "epoch": 1.7026863869604587, + "grad_norm": 0.13070107996463776, + "learning_rate": 8.91656288916563e-05, + "loss": 0.5099, + "step": 2820 + }, + { + "epoch": 1.7032900694234834, + "grad_norm": 0.13564686477184296, + "learning_rate": 8.912411789124119e-05, + "loss": 0.6848, + "step": 2821 + }, + { + "epoch": 1.7038937518865076, + "grad_norm": 0.12641029059886932, + "learning_rate": 8.908260689082607e-05, + "loss": 0.655, + "step": 2822 + }, + { + "epoch": 1.7044974343495323, + "grad_norm": 0.11833371967077255, + "learning_rate": 8.904109589041096e-05, + "loss": 0.6897, + "step": 2823 + }, + { + "epoch": 1.7051011168125565, + "grad_norm": 0.1325460523366928, + "learning_rate": 8.899958488999586e-05, + "loss": 0.677, + "step": 2824 + }, + { + "epoch": 1.7057047992755812, + "grad_norm": 0.13542653620243073, + "learning_rate": 8.895807388958074e-05, + "loss": 0.636, + "step": 2825 + }, + { + "epoch": 1.7063084817386054, + "grad_norm": 0.1281089037656784, + "learning_rate": 8.891656288916563e-05, + "loss": 0.6886, + "step": 2826 + }, + { + "epoch": 1.70691216420163, + "grad_norm": 0.11729513853788376, + "learning_rate": 8.887505188875053e-05, + "loss": 0.6064, + "step": 2827 + }, + { + "epoch": 1.7075158466646543, + "grad_norm": 0.12129988521337509, + "learning_rate": 8.88335408883354e-05, + "loss": 0.6609, + "step": 2828 + }, + { + "epoch": 1.708119529127679, + "grad_norm": 0.12838177382946014, + "learning_rate": 8.87920298879203e-05, + "loss": 0.615, + "step": 2829 + }, + { + "epoch": 1.7087232115907032, + "grad_norm": 0.12746667861938477, + "learning_rate": 8.875051888750519e-05, + "loss": 0.6424, + "step": 2830 + }, + { + "epoch": 1.7093268940537278, + "grad_norm": 0.12252197414636612, + "learning_rate": 8.870900788709008e-05, + "loss": 0.588, + "step": 2831 + }, + { + "epoch": 1.709930576516752, + "grad_norm": 0.11774613708257675, + "learning_rate": 8.866749688667498e-05, + "loss": 0.8513, + "step": 2832 + }, + { + "epoch": 1.7105342589797767, + "grad_norm": 0.13314837217330933, + "learning_rate": 8.862598588625986e-05, + "loss": 0.7945, + "step": 2833 + }, + { + "epoch": 1.711137941442801, + "grad_norm": 0.14223603904247284, + "learning_rate": 8.858447488584475e-05, + "loss": 0.6475, + "step": 2834 + }, + { + "epoch": 1.7117416239058256, + "grad_norm": 0.1323416829109192, + "learning_rate": 8.854296388542964e-05, + "loss": 0.9064, + "step": 2835 + }, + { + "epoch": 1.7123453063688499, + "grad_norm": 0.13644298911094666, + "learning_rate": 8.850145288501454e-05, + "loss": 0.6499, + "step": 2836 + }, + { + "epoch": 1.7129489888318745, + "grad_norm": 0.12705856561660767, + "learning_rate": 8.845994188459942e-05, + "loss": 0.846, + "step": 2837 + }, + { + "epoch": 1.7135526712948987, + "grad_norm": 0.13383372128009796, + "learning_rate": 8.841843088418431e-05, + "loss": 0.671, + "step": 2838 + }, + { + "epoch": 1.7141563537579234, + "grad_norm": 0.12026913464069366, + "learning_rate": 8.837691988376921e-05, + "loss": 0.6495, + "step": 2839 + }, + { + "epoch": 1.7147600362209476, + "grad_norm": 0.12273270636796951, + "learning_rate": 8.83354088833541e-05, + "loss": 0.679, + "step": 2840 + }, + { + "epoch": 1.7153637186839723, + "grad_norm": 0.1200748011469841, + "learning_rate": 8.829389788293898e-05, + "loss": 0.6017, + "step": 2841 + }, + { + "epoch": 1.7159674011469965, + "grad_norm": 0.11923687905073166, + "learning_rate": 8.825238688252387e-05, + "loss": 0.5635, + "step": 2842 + }, + { + "epoch": 1.7165710836100212, + "grad_norm": 0.1298258900642395, + "learning_rate": 8.821087588210877e-05, + "loss": 0.5879, + "step": 2843 + }, + { + "epoch": 1.7171747660730454, + "grad_norm": 0.13525626063346863, + "learning_rate": 8.816936488169366e-05, + "loss": 0.5725, + "step": 2844 + }, + { + "epoch": 1.71777844853607, + "grad_norm": 0.13261830806732178, + "learning_rate": 8.812785388127854e-05, + "loss": 0.511, + "step": 2845 + }, + { + "epoch": 1.7183821309990943, + "grad_norm": 0.1352899670600891, + "learning_rate": 8.808634288086344e-05, + "loss": 0.5725, + "step": 2846 + }, + { + "epoch": 1.718985813462119, + "grad_norm": 0.12940122187137604, + "learning_rate": 8.804483188044831e-05, + "loss": 0.5499, + "step": 2847 + }, + { + "epoch": 1.7195894959251434, + "grad_norm": 0.16278207302093506, + "learning_rate": 8.800332088003321e-05, + "loss": 0.6085, + "step": 2848 + }, + { + "epoch": 1.7201931783881679, + "grad_norm": 0.16731347143650055, + "learning_rate": 8.79618098796181e-05, + "loss": 0.57, + "step": 2849 + }, + { + "epoch": 1.7207968608511923, + "grad_norm": 0.15214033424854279, + "learning_rate": 8.792029887920299e-05, + "loss": 0.5269, + "step": 2850 + }, + { + "epoch": 1.7214005433142168, + "grad_norm": 0.1611013114452362, + "learning_rate": 8.787878787878789e-05, + "loss": 0.5705, + "step": 2851 + }, + { + "epoch": 1.7220042257772412, + "grad_norm": 0.16054251790046692, + "learning_rate": 8.783727687837277e-05, + "loss": 0.4106, + "step": 2852 + }, + { + "epoch": 1.7226079082402657, + "grad_norm": 0.16081520915031433, + "learning_rate": 8.779576587795766e-05, + "loss": 0.3376, + "step": 2853 + }, + { + "epoch": 1.72321159070329, + "grad_norm": 0.18261933326721191, + "learning_rate": 8.775425487754255e-05, + "loss": 0.3801, + "step": 2854 + }, + { + "epoch": 1.7238152731663146, + "grad_norm": 0.1675270050764084, + "learning_rate": 8.771274387712745e-05, + "loss": 0.3022, + "step": 2855 + }, + { + "epoch": 1.724418955629339, + "grad_norm": 0.17773354053497314, + "learning_rate": 8.767123287671233e-05, + "loss": 0.2563, + "step": 2856 + }, + { + "epoch": 1.7250226380923634, + "grad_norm": 0.12368903309106827, + "learning_rate": 8.762972187629722e-05, + "loss": 0.6228, + "step": 2857 + }, + { + "epoch": 1.725626320555388, + "grad_norm": 0.12369142472743988, + "learning_rate": 8.758821087588212e-05, + "loss": 0.5617, + "step": 2858 + }, + { + "epoch": 1.7262300030184123, + "grad_norm": 0.1579550802707672, + "learning_rate": 8.7546699875467e-05, + "loss": 0.6571, + "step": 2859 + }, + { + "epoch": 1.7268336854814368, + "grad_norm": 0.13555946946144104, + "learning_rate": 8.75051888750519e-05, + "loss": 0.7208, + "step": 2860 + }, + { + "epoch": 1.7274373679444612, + "grad_norm": 0.14086709916591644, + "learning_rate": 8.746367787463678e-05, + "loss": 0.801, + "step": 2861 + }, + { + "epoch": 1.7280410504074857, + "grad_norm": 0.16081978380680084, + "learning_rate": 8.742216687422168e-05, + "loss": 0.6317, + "step": 2862 + }, + { + "epoch": 1.7286447328705101, + "grad_norm": 0.1376749724149704, + "learning_rate": 8.738065587380657e-05, + "loss": 0.6912, + "step": 2863 + }, + { + "epoch": 1.7292484153335346, + "grad_norm": 0.1608782410621643, + "learning_rate": 8.733914487339145e-05, + "loss": 0.6217, + "step": 2864 + }, + { + "epoch": 1.729852097796559, + "grad_norm": 0.1334768831729889, + "learning_rate": 8.729763387297635e-05, + "loss": 0.9018, + "step": 2865 + }, + { + "epoch": 1.7304557802595835, + "grad_norm": 0.1475462019443512, + "learning_rate": 8.725612287256123e-05, + "loss": 0.5775, + "step": 2866 + }, + { + "epoch": 1.731059462722608, + "grad_norm": 0.26409459114074707, + "learning_rate": 8.721461187214613e-05, + "loss": 0.6337, + "step": 2867 + }, + { + "epoch": 1.7316631451856324, + "grad_norm": 0.1443539261817932, + "learning_rate": 8.717310087173101e-05, + "loss": 0.619, + "step": 2868 + }, + { + "epoch": 1.7322668276486568, + "grad_norm": 0.1351659595966339, + "learning_rate": 8.71315898713159e-05, + "loss": 0.6511, + "step": 2869 + }, + { + "epoch": 1.7328705101116813, + "grad_norm": 0.1312374323606491, + "learning_rate": 8.70900788709008e-05, + "loss": 0.6636, + "step": 2870 + }, + { + "epoch": 1.7334741925747057, + "grad_norm": 0.13947537541389465, + "learning_rate": 8.704856787048569e-05, + "loss": 0.682, + "step": 2871 + }, + { + "epoch": 1.7340778750377301, + "grad_norm": 0.1194947212934494, + "learning_rate": 8.700705687007057e-05, + "loss": 0.6474, + "step": 2872 + }, + { + "epoch": 1.7346815575007546, + "grad_norm": 0.11692965775728226, + "learning_rate": 8.696554586965546e-05, + "loss": 0.6318, + "step": 2873 + }, + { + "epoch": 1.735285239963779, + "grad_norm": 0.13894155621528625, + "learning_rate": 8.692403486924036e-05, + "loss": 0.642, + "step": 2874 + }, + { + "epoch": 1.7358889224268035, + "grad_norm": 0.1313973218202591, + "learning_rate": 8.688252386882524e-05, + "loss": 0.8355, + "step": 2875 + }, + { + "epoch": 1.736492604889828, + "grad_norm": 0.15785734355449677, + "learning_rate": 8.684101286841013e-05, + "loss": 0.7546, + "step": 2876 + }, + { + "epoch": 1.7370962873528524, + "grad_norm": 0.12201385200023651, + "learning_rate": 8.679950186799503e-05, + "loss": 0.5731, + "step": 2877 + }, + { + "epoch": 1.7376999698158768, + "grad_norm": 0.13196420669555664, + "learning_rate": 8.67579908675799e-05, + "loss": 0.6438, + "step": 2878 + }, + { + "epoch": 1.7383036522789013, + "grad_norm": 0.1272536814212799, + "learning_rate": 8.67164798671648e-05, + "loss": 0.5866, + "step": 2879 + }, + { + "epoch": 1.7389073347419257, + "grad_norm": 0.1259777694940567, + "learning_rate": 8.667496886674969e-05, + "loss": 0.677, + "step": 2880 + }, + { + "epoch": 1.7395110172049502, + "grad_norm": 0.13071627914905548, + "learning_rate": 8.663345786633458e-05, + "loss": 0.7244, + "step": 2881 + }, + { + "epoch": 1.7401146996679746, + "grad_norm": 0.12744002044200897, + "learning_rate": 8.659194686591948e-05, + "loss": 0.9371, + "step": 2882 + }, + { + "epoch": 1.740718382130999, + "grad_norm": 0.137882798910141, + "learning_rate": 8.655043586550436e-05, + "loss": 0.6717, + "step": 2883 + }, + { + "epoch": 1.7413220645940235, + "grad_norm": 0.13243485987186432, + "learning_rate": 8.650892486508926e-05, + "loss": 0.7217, + "step": 2884 + }, + { + "epoch": 1.741925747057048, + "grad_norm": 0.13238345086574554, + "learning_rate": 8.646741386467414e-05, + "loss": 0.7141, + "step": 2885 + }, + { + "epoch": 1.7425294295200724, + "grad_norm": 0.13204292953014374, + "learning_rate": 8.642590286425904e-05, + "loss": 0.7224, + "step": 2886 + }, + { + "epoch": 1.7431331119830968, + "grad_norm": 0.12200792878866196, + "learning_rate": 8.638439186384392e-05, + "loss": 0.585, + "step": 2887 + }, + { + "epoch": 1.7437367944461213, + "grad_norm": 0.12141503393650055, + "learning_rate": 8.634288086342881e-05, + "loss": 0.9708, + "step": 2888 + }, + { + "epoch": 1.7443404769091457, + "grad_norm": 0.14114977419376373, + "learning_rate": 8.630136986301371e-05, + "loss": 0.5942, + "step": 2889 + }, + { + "epoch": 1.7449441593721704, + "grad_norm": 0.12927395105361938, + "learning_rate": 8.62598588625986e-05, + "loss": 0.6257, + "step": 2890 + }, + { + "epoch": 1.7455478418351946, + "grad_norm": 0.1255059540271759, + "learning_rate": 8.621834786218348e-05, + "loss": 0.5921, + "step": 2891 + }, + { + "epoch": 1.7461515242982193, + "grad_norm": 0.1200440302491188, + "learning_rate": 8.617683686176837e-05, + "loss": 0.5587, + "step": 2892 + }, + { + "epoch": 1.7467552067612435, + "grad_norm": 0.12957656383514404, + "learning_rate": 8.613532586135327e-05, + "loss": 0.5773, + "step": 2893 + }, + { + "epoch": 1.7473588892242682, + "grad_norm": 0.13232208788394928, + "learning_rate": 8.609381486093814e-05, + "loss": 0.681, + "step": 2894 + }, + { + "epoch": 1.7479625716872924, + "grad_norm": 0.12879258394241333, + "learning_rate": 8.605230386052304e-05, + "loss": 0.5747, + "step": 2895 + }, + { + "epoch": 1.748566254150317, + "grad_norm": 0.1347392499446869, + "learning_rate": 8.601079286010794e-05, + "loss": 0.5334, + "step": 2896 + }, + { + "epoch": 1.7491699366133413, + "grad_norm": 0.16375549137592316, + "learning_rate": 8.596928185969282e-05, + "loss": 0.5834, + "step": 2897 + }, + { + "epoch": 1.749773619076366, + "grad_norm": 0.14500340819358826, + "learning_rate": 8.592777085927772e-05, + "loss": 0.539, + "step": 2898 + }, + { + "epoch": 1.7503773015393902, + "grad_norm": 0.16076421737670898, + "learning_rate": 8.58862598588626e-05, + "loss": 0.5071, + "step": 2899 + }, + { + "epoch": 1.7509809840024149, + "grad_norm": 0.14887742698192596, + "learning_rate": 8.584474885844749e-05, + "loss": 0.4643, + "step": 2900 + }, + { + "epoch": 1.751584666465439, + "grad_norm": 0.16193953156471252, + "learning_rate": 8.580323785803237e-05, + "loss": 0.494, + "step": 2901 + }, + { + "epoch": 1.7521883489284638, + "grad_norm": 0.1675998717546463, + "learning_rate": 8.576172685761727e-05, + "loss": 0.3624, + "step": 2902 + }, + { + "epoch": 1.752792031391488, + "grad_norm": 0.18320401012897491, + "learning_rate": 8.572021585720216e-05, + "loss": 0.4361, + "step": 2903 + }, + { + "epoch": 1.7533957138545126, + "grad_norm": 0.1881968080997467, + "learning_rate": 8.567870485678705e-05, + "loss": 0.3647, + "step": 2904 + }, + { + "epoch": 1.7539993963175369, + "grad_norm": 0.2042665183544159, + "learning_rate": 8.563719385637195e-05, + "loss": 0.3493, + "step": 2905 + }, + { + "epoch": 1.7546030787805615, + "grad_norm": 0.18004922568798065, + "learning_rate": 8.559568285595683e-05, + "loss": 0.2261, + "step": 2906 + }, + { + "epoch": 1.7552067612435858, + "grad_norm": 0.15333791077136993, + "learning_rate": 8.555417185554172e-05, + "loss": 1.0165, + "step": 2907 + }, + { + "epoch": 1.7558104437066104, + "grad_norm": 0.1424827128648758, + "learning_rate": 8.551266085512661e-05, + "loss": 0.6105, + "step": 2908 + }, + { + "epoch": 1.7564141261696347, + "grad_norm": 0.13473393023014069, + "learning_rate": 8.547114985471151e-05, + "loss": 0.6586, + "step": 2909 + }, + { + "epoch": 1.7570178086326593, + "grad_norm": 0.1452847421169281, + "learning_rate": 8.54296388542964e-05, + "loss": 0.7158, + "step": 2910 + }, + { + "epoch": 1.7576214910956836, + "grad_norm": 0.1423470824956894, + "learning_rate": 8.538812785388128e-05, + "loss": 0.6961, + "step": 2911 + }, + { + "epoch": 1.7582251735587082, + "grad_norm": 0.12046578526496887, + "learning_rate": 8.534661685346618e-05, + "loss": 0.8319, + "step": 2912 + }, + { + "epoch": 1.7588288560217324, + "grad_norm": 0.13641320168972015, + "learning_rate": 8.530510585305105e-05, + "loss": 0.6154, + "step": 2913 + }, + { + "epoch": 1.7594325384847571, + "grad_norm": 0.1376158744096756, + "learning_rate": 8.526359485263595e-05, + "loss": 0.6259, + "step": 2914 + }, + { + "epoch": 1.7600362209477813, + "grad_norm": 0.1257615089416504, + "learning_rate": 8.522208385222084e-05, + "loss": 0.6727, + "step": 2915 + }, + { + "epoch": 1.760639903410806, + "grad_norm": 0.12842705845832825, + "learning_rate": 8.518057285180573e-05, + "loss": 0.8247, + "step": 2916 + }, + { + "epoch": 1.7612435858738302, + "grad_norm": 0.12923744320869446, + "learning_rate": 8.513906185139063e-05, + "loss": 0.7409, + "step": 2917 + }, + { + "epoch": 1.761847268336855, + "grad_norm": 0.17179526388645172, + "learning_rate": 8.509755085097551e-05, + "loss": 0.6223, + "step": 2918 + }, + { + "epoch": 1.7624509507998791, + "grad_norm": 0.1274694800376892, + "learning_rate": 8.50560398505604e-05, + "loss": 0.6175, + "step": 2919 + }, + { + "epoch": 1.7630546332629038, + "grad_norm": 0.1250637173652649, + "learning_rate": 8.501452885014529e-05, + "loss": 0.8064, + "step": 2920 + }, + { + "epoch": 1.763658315725928, + "grad_norm": 0.12304326891899109, + "learning_rate": 8.497301784973019e-05, + "loss": 0.6699, + "step": 2921 + }, + { + "epoch": 1.7642619981889527, + "grad_norm": 0.1374034285545349, + "learning_rate": 8.493150684931507e-05, + "loss": 0.6121, + "step": 2922 + }, + { + "epoch": 1.764865680651977, + "grad_norm": 0.13162750005722046, + "learning_rate": 8.488999584889996e-05, + "loss": 0.7042, + "step": 2923 + }, + { + "epoch": 1.7654693631150016, + "grad_norm": 0.13185490667819977, + "learning_rate": 8.484848484848486e-05, + "loss": 0.6263, + "step": 2924 + }, + { + "epoch": 1.7660730455780258, + "grad_norm": 0.1368507295846939, + "learning_rate": 8.480697384806973e-05, + "loss": 0.695, + "step": 2925 + }, + { + "epoch": 1.7666767280410505, + "grad_norm": 0.12848018109798431, + "learning_rate": 8.476546284765463e-05, + "loss": 0.5866, + "step": 2926 + }, + { + "epoch": 1.7672804105040747, + "grad_norm": 0.12568457424640656, + "learning_rate": 8.472395184723952e-05, + "loss": 0.7202, + "step": 2927 + }, + { + "epoch": 1.7678840929670994, + "grad_norm": 0.14278379082679749, + "learning_rate": 8.468244084682442e-05, + "loss": 0.8685, + "step": 2928 + }, + { + "epoch": 1.7684877754301238, + "grad_norm": 0.1360083371400833, + "learning_rate": 8.46409298464093e-05, + "loss": 0.6535, + "step": 2929 + }, + { + "epoch": 1.7690914578931483, + "grad_norm": 0.12538328766822815, + "learning_rate": 8.459941884599419e-05, + "loss": 0.614, + "step": 2930 + }, + { + "epoch": 1.7696951403561727, + "grad_norm": 0.13750702142715454, + "learning_rate": 8.455790784557909e-05, + "loss": 0.6894, + "step": 2931 + }, + { + "epoch": 1.7702988228191971, + "grad_norm": 0.12563706934452057, + "learning_rate": 8.451639684516396e-05, + "loss": 0.6747, + "step": 2932 + }, + { + "epoch": 1.7709025052822216, + "grad_norm": 0.14879825711250305, + "learning_rate": 8.447488584474886e-05, + "loss": 0.747, + "step": 2933 + }, + { + "epoch": 1.771506187745246, + "grad_norm": 0.12372935563325882, + "learning_rate": 8.443337484433375e-05, + "loss": 0.5759, + "step": 2934 + }, + { + "epoch": 1.7721098702082705, + "grad_norm": 0.13434575498104095, + "learning_rate": 8.439186384391864e-05, + "loss": 0.8362, + "step": 2935 + }, + { + "epoch": 1.772713552671295, + "grad_norm": 0.12838785350322723, + "learning_rate": 8.435035284350354e-05, + "loss": 0.8738, + "step": 2936 + }, + { + "epoch": 1.7733172351343194, + "grad_norm": 0.12814517319202423, + "learning_rate": 8.430884184308842e-05, + "loss": 0.7118, + "step": 2937 + }, + { + "epoch": 1.7739209175973438, + "grad_norm": 0.11639917641878128, + "learning_rate": 8.426733084267331e-05, + "loss": 0.7066, + "step": 2938 + }, + { + "epoch": 1.7745246000603683, + "grad_norm": 0.1350032538175583, + "learning_rate": 8.42258198422582e-05, + "loss": 0.7287, + "step": 2939 + }, + { + "epoch": 1.7751282825233927, + "grad_norm": 0.12561342120170593, + "learning_rate": 8.41843088418431e-05, + "loss": 0.6189, + "step": 2940 + }, + { + "epoch": 1.7757319649864172, + "grad_norm": 0.13215911388397217, + "learning_rate": 8.414279784142798e-05, + "loss": 0.5686, + "step": 2941 + }, + { + "epoch": 1.7763356474494416, + "grad_norm": 0.12100309878587723, + "learning_rate": 8.410128684101287e-05, + "loss": 0.5502, + "step": 2942 + }, + { + "epoch": 1.776939329912466, + "grad_norm": 0.1172775849699974, + "learning_rate": 8.405977584059777e-05, + "loss": 0.5211, + "step": 2943 + }, + { + "epoch": 1.7775430123754905, + "grad_norm": 0.13135451078414917, + "learning_rate": 8.401826484018264e-05, + "loss": 0.6114, + "step": 2944 + }, + { + "epoch": 1.778146694838515, + "grad_norm": 0.16509322822093964, + "learning_rate": 8.397675383976754e-05, + "loss": 0.5613, + "step": 2945 + }, + { + "epoch": 1.7787503773015394, + "grad_norm": 0.14752458035945892, + "learning_rate": 8.393524283935243e-05, + "loss": 0.5444, + "step": 2946 + }, + { + "epoch": 1.7793540597645638, + "grad_norm": 0.15376845002174377, + "learning_rate": 8.389373183893732e-05, + "loss": 0.5722, + "step": 2947 + }, + { + "epoch": 1.7799577422275883, + "grad_norm": 0.14886072278022766, + "learning_rate": 8.385222083852222e-05, + "loss": 0.5796, + "step": 2948 + }, + { + "epoch": 1.7805614246906127, + "grad_norm": 0.14707441627979279, + "learning_rate": 8.38107098381071e-05, + "loss": 0.5235, + "step": 2949 + }, + { + "epoch": 1.7811651071536372, + "grad_norm": 0.16511821746826172, + "learning_rate": 8.3769198837692e-05, + "loss": 0.6322, + "step": 2950 + }, + { + "epoch": 1.7817687896166616, + "grad_norm": 0.1711074262857437, + "learning_rate": 8.372768783727688e-05, + "loss": 0.4643, + "step": 2951 + }, + { + "epoch": 1.782372472079686, + "grad_norm": 0.1574665904045105, + "learning_rate": 8.368617683686178e-05, + "loss": 0.3831, + "step": 2952 + }, + { + "epoch": 1.7829761545427105, + "grad_norm": 0.17709819972515106, + "learning_rate": 8.364466583644666e-05, + "loss": 0.3542, + "step": 2953 + }, + { + "epoch": 1.783579837005735, + "grad_norm": 0.1887846142053604, + "learning_rate": 8.360315483603155e-05, + "loss": 0.3804, + "step": 2954 + }, + { + "epoch": 1.7841835194687594, + "grad_norm": 0.19664205610752106, + "learning_rate": 8.356164383561645e-05, + "loss": 0.3055, + "step": 2955 + }, + { + "epoch": 1.7847872019317839, + "grad_norm": 0.1803441047668457, + "learning_rate": 8.352013283520133e-05, + "loss": 0.2447, + "step": 2956 + }, + { + "epoch": 1.7853908843948083, + "grad_norm": 0.13248445093631744, + "learning_rate": 8.347862183478622e-05, + "loss": 0.76, + "step": 2957 + }, + { + "epoch": 1.7859945668578328, + "grad_norm": 0.12137053906917572, + "learning_rate": 8.343711083437111e-05, + "loss": 0.6176, + "step": 2958 + }, + { + "epoch": 1.7865982493208572, + "grad_norm": 0.1513027399778366, + "learning_rate": 8.339559983395601e-05, + "loss": 0.611, + "step": 2959 + }, + { + "epoch": 1.7872019317838816, + "grad_norm": 0.15577901899814606, + "learning_rate": 8.33540888335409e-05, + "loss": 0.6373, + "step": 2960 + }, + { + "epoch": 1.787805614246906, + "grad_norm": 0.14755643904209137, + "learning_rate": 8.331257783312578e-05, + "loss": 1.0106, + "step": 2961 + }, + { + "epoch": 1.7884092967099305, + "grad_norm": 0.127186581492424, + "learning_rate": 8.327106683271068e-05, + "loss": 0.7419, + "step": 2962 + }, + { + "epoch": 1.789012979172955, + "grad_norm": 0.14396396279335022, + "learning_rate": 8.322955583229555e-05, + "loss": 0.6834, + "step": 2963 + }, + { + "epoch": 1.7896166616359794, + "grad_norm": 0.12821994721889496, + "learning_rate": 8.318804483188045e-05, + "loss": 0.9047, + "step": 2964 + }, + { + "epoch": 1.7902203440990039, + "grad_norm": 0.12685029208660126, + "learning_rate": 8.314653383146534e-05, + "loss": 0.5295, + "step": 2965 + }, + { + "epoch": 1.7908240265620283, + "grad_norm": 0.16082331538200378, + "learning_rate": 8.310502283105023e-05, + "loss": 0.6977, + "step": 2966 + }, + { + "epoch": 1.7914277090250528, + "grad_norm": 0.13325533270835876, + "learning_rate": 8.306351183063513e-05, + "loss": 1.0607, + "step": 2967 + }, + { + "epoch": 1.7920313914880772, + "grad_norm": 0.12702691555023193, + "learning_rate": 8.302200083022001e-05, + "loss": 0.6663, + "step": 2968 + }, + { + "epoch": 1.7926350739511017, + "grad_norm": 0.13634531199932098, + "learning_rate": 8.29804898298049e-05, + "loss": 0.6418, + "step": 2969 + }, + { + "epoch": 1.7932387564141261, + "grad_norm": 0.11988352239131927, + "learning_rate": 8.293897882938979e-05, + "loss": 0.7847, + "step": 2970 + }, + { + "epoch": 1.7938424388771508, + "grad_norm": 0.13620884716510773, + "learning_rate": 8.289746782897469e-05, + "loss": 0.6376, + "step": 2971 + }, + { + "epoch": 1.794446121340175, + "grad_norm": 0.13789290189743042, + "learning_rate": 8.285595682855957e-05, + "loss": 1.144, + "step": 2972 + }, + { + "epoch": 1.7950498038031997, + "grad_norm": 0.12296643108129501, + "learning_rate": 8.281444582814446e-05, + "loss": 0.6537, + "step": 2973 + }, + { + "epoch": 1.795653486266224, + "grad_norm": 0.13792656362056732, + "learning_rate": 8.277293482772936e-05, + "loss": 0.628, + "step": 2974 + }, + { + "epoch": 1.7962571687292486, + "grad_norm": 0.13332706689834595, + "learning_rate": 8.273142382731425e-05, + "loss": 0.584, + "step": 2975 + }, + { + "epoch": 1.7968608511922728, + "grad_norm": 0.11899926513433456, + "learning_rate": 8.268991282689913e-05, + "loss": 0.6707, + "step": 2976 + }, + { + "epoch": 1.7974645336552975, + "grad_norm": 0.3628057539463043, + "learning_rate": 8.264840182648402e-05, + "loss": 0.6138, + "step": 2977 + }, + { + "epoch": 1.7980682161183217, + "grad_norm": 0.1424243003129959, + "learning_rate": 8.260689082606892e-05, + "loss": 0.708, + "step": 2978 + }, + { + "epoch": 1.7986718985813464, + "grad_norm": 0.13742388784885406, + "learning_rate": 8.25653798256538e-05, + "loss": 0.7981, + "step": 2979 + }, + { + "epoch": 1.7992755810443706, + "grad_norm": 0.12499826401472092, + "learning_rate": 8.252386882523869e-05, + "loss": 0.7833, + "step": 2980 + }, + { + "epoch": 1.7998792635073952, + "grad_norm": 0.13351675868034363, + "learning_rate": 8.248235782482359e-05, + "loss": 0.9222, + "step": 2981 + }, + { + "epoch": 1.8004829459704195, + "grad_norm": 0.13743843138217926, + "learning_rate": 8.244084682440846e-05, + "loss": 0.6614, + "step": 2982 + }, + { + "epoch": 1.8010866284334441, + "grad_norm": 0.14949648082256317, + "learning_rate": 8.239933582399336e-05, + "loss": 0.5954, + "step": 2983 + }, + { + "epoch": 1.8016903108964684, + "grad_norm": 0.1315966248512268, + "learning_rate": 8.235782482357825e-05, + "loss": 0.7275, + "step": 2984 + }, + { + "epoch": 1.802293993359493, + "grad_norm": 0.16671594977378845, + "learning_rate": 8.231631382316314e-05, + "loss": 0.7025, + "step": 2985 + }, + { + "epoch": 1.8028976758225173, + "grad_norm": 0.12334276735782623, + "learning_rate": 8.227480282274804e-05, + "loss": 0.6606, + "step": 2986 + }, + { + "epoch": 1.803501358285542, + "grad_norm": 0.11940109729766846, + "learning_rate": 8.223329182233292e-05, + "loss": 0.6042, + "step": 2987 + }, + { + "epoch": 1.8041050407485661, + "grad_norm": 0.1273908019065857, + "learning_rate": 8.219178082191781e-05, + "loss": 0.6492, + "step": 2988 + }, + { + "epoch": 1.8047087232115908, + "grad_norm": 0.12909753620624542, + "learning_rate": 8.21502698215027e-05, + "loss": 0.5704, + "step": 2989 + }, + { + "epoch": 1.805312405674615, + "grad_norm": 0.1314334124326706, + "learning_rate": 8.21087588210876e-05, + "loss": 0.548, + "step": 2990 + }, + { + "epoch": 1.8059160881376397, + "grad_norm": 0.14163191616535187, + "learning_rate": 8.206724782067248e-05, + "loss": 0.6069, + "step": 2991 + }, + { + "epoch": 1.806519770600664, + "grad_norm": 0.14366085827350616, + "learning_rate": 8.202573682025737e-05, + "loss": 0.5977, + "step": 2992 + }, + { + "epoch": 1.8071234530636886, + "grad_norm": 0.1461668461561203, + "learning_rate": 8.198422581984227e-05, + "loss": 0.6007, + "step": 2993 + }, + { + "epoch": 1.8077271355267128, + "grad_norm": 0.14245660603046417, + "learning_rate": 8.194271481942716e-05, + "loss": 0.5456, + "step": 2994 + }, + { + "epoch": 1.8083308179897375, + "grad_norm": 0.15712513029575348, + "learning_rate": 8.190120381901204e-05, + "loss": 0.522, + "step": 2995 + }, + { + "epoch": 1.8089345004527617, + "grad_norm": 0.14800165593624115, + "learning_rate": 8.185969281859693e-05, + "loss": 0.541, + "step": 2996 + }, + { + "epoch": 1.8095381829157864, + "grad_norm": 0.1499776840209961, + "learning_rate": 8.181818181818183e-05, + "loss": 0.5566, + "step": 2997 + }, + { + "epoch": 1.8101418653788106, + "grad_norm": 0.15904314815998077, + "learning_rate": 8.177667081776672e-05, + "loss": 0.533, + "step": 2998 + }, + { + "epoch": 1.8107455478418353, + "grad_norm": 0.17498090863227844, + "learning_rate": 8.17351598173516e-05, + "loss": 0.535, + "step": 2999 + }, + { + "epoch": 1.8113492303048595, + "grad_norm": 0.16436955332756042, + "learning_rate": 8.16936488169365e-05, + "loss": 0.5246, + "step": 3000 + }, + { + "epoch": 1.8113492303048595, + "eval_loss": 0.6211307644844055, + "eval_runtime": 1059.4646, + "eval_samples_per_second": 2.633, + "eval_steps_per_second": 0.329, + "step": 3000 + }, + { + "epoch": 1.8119529127678842, + "grad_norm": 0.173828125, + "learning_rate": 8.165213781652138e-05, + "loss": 0.4371, + "step": 3001 + }, + { + "epoch": 1.8125565952309084, + "grad_norm": 0.203125, + "learning_rate": 8.161062681610628e-05, + "loss": 0.4749, + "step": 3002 + }, + { + "epoch": 1.813160277693933, + "grad_norm": 0.1767578125, + "learning_rate": 8.156911581569116e-05, + "loss": 0.3594, + "step": 3003 + }, + { + "epoch": 1.8137639601569573, + "grad_norm": 0.1884765625, + "learning_rate": 8.152760481527605e-05, + "loss": 0.3817, + "step": 3004 + }, + { + "epoch": 1.814367642619982, + "grad_norm": 0.185546875, + "learning_rate": 8.148609381486095e-05, + "loss": 0.3289, + "step": 3005 + }, + { + "epoch": 1.8149713250830062, + "grad_norm": 0.185546875, + "learning_rate": 8.144458281444583e-05, + "loss": 0.276, + "step": 3006 + }, + { + "epoch": 1.8155750075460309, + "grad_norm": 0.154296875, + "learning_rate": 8.140307181403072e-05, + "loss": 0.6628, + "step": 3007 + }, + { + "epoch": 1.816178690009055, + "grad_norm": 0.1298828125, + "learning_rate": 8.136156081361561e-05, + "loss": 0.6329, + "step": 3008 + }, + { + "epoch": 1.8167823724720797, + "grad_norm": 0.134765625, + "learning_rate": 8.132004981320051e-05, + "loss": 0.6207, + "step": 3009 + }, + { + "epoch": 1.8173860549351042, + "grad_norm": 0.1376953125, + "learning_rate": 8.127853881278538e-05, + "loss": 0.7047, + "step": 3010 + }, + { + "epoch": 1.8179897373981286, + "grad_norm": 0.171875, + "learning_rate": 8.123702781237028e-05, + "loss": 0.717, + "step": 3011 + }, + { + "epoch": 1.818593419861153, + "grad_norm": 0.134765625, + "learning_rate": 8.119551681195518e-05, + "loss": 0.5587, + "step": 3012 + }, + { + "epoch": 1.8191971023241775, + "grad_norm": 0.14453125, + "learning_rate": 8.115400581154005e-05, + "loss": 0.7154, + "step": 3013 + }, + { + "epoch": 1.819800784787202, + "grad_norm": 0.1337890625, + "learning_rate": 8.111249481112495e-05, + "loss": 0.6538, + "step": 3014 + }, + { + "epoch": 1.8204044672502264, + "grad_norm": 0.1328125, + "learning_rate": 8.107098381070984e-05, + "loss": 0.6695, + "step": 3015 + }, + { + "epoch": 1.8210081497132509, + "grad_norm": 0.134765625, + "learning_rate": 8.102947281029473e-05, + "loss": 0.8783, + "step": 3016 + }, + { + "epoch": 1.8216118321762753, + "grad_norm": 0.13671875, + "learning_rate": 8.098796180987961e-05, + "loss": 0.6434, + "step": 3017 + }, + { + "epoch": 1.8222155146392998, + "grad_norm": 0.1220703125, + "learning_rate": 8.094645080946451e-05, + "loss": 0.6594, + "step": 3018 + }, + { + "epoch": 1.8228191971023242, + "grad_norm": 0.142578125, + "learning_rate": 8.090493980904941e-05, + "loss": 0.6878, + "step": 3019 + }, + { + "epoch": 1.8234228795653487, + "grad_norm": 0.125, + "learning_rate": 8.086342880863429e-05, + "loss": 0.7494, + "step": 3020 + }, + { + "epoch": 1.824026562028373, + "grad_norm": 0.1337890625, + "learning_rate": 8.082191780821919e-05, + "loss": 0.9943, + "step": 3021 + }, + { + "epoch": 1.8246302444913975, + "grad_norm": 0.1279296875, + "learning_rate": 8.078040680780407e-05, + "loss": 0.636, + "step": 3022 + }, + { + "epoch": 1.825233926954422, + "grad_norm": 0.140625, + "learning_rate": 8.073889580738896e-05, + "loss": 0.7553, + "step": 3023 + }, + { + "epoch": 1.8258376094174464, + "grad_norm": 0.1337890625, + "learning_rate": 8.069738480697385e-05, + "loss": 0.6998, + "step": 3024 + }, + { + "epoch": 1.8264412918804709, + "grad_norm": 0.1318359375, + "learning_rate": 8.065587380655875e-05, + "loss": 0.6978, + "step": 3025 + }, + { + "epoch": 1.8270449743434953, + "grad_norm": 0.11767578125, + "learning_rate": 8.061436280614363e-05, + "loss": 0.6634, + "step": 3026 + }, + { + "epoch": 1.8276486568065198, + "grad_norm": 0.1357421875, + "learning_rate": 8.057285180572852e-05, + "loss": 0.6892, + "step": 3027 + }, + { + "epoch": 1.8282523392695442, + "grad_norm": 0.138671875, + "learning_rate": 8.053134080531342e-05, + "loss": 0.7321, + "step": 3028 + }, + { + "epoch": 1.8288560217325687, + "grad_norm": 0.123046875, + "learning_rate": 8.048982980489829e-05, + "loss": 0.6714, + "step": 3029 + }, + { + "epoch": 1.8294597041955931, + "grad_norm": 0.1298828125, + "learning_rate": 8.044831880448319e-05, + "loss": 0.7105, + "step": 3030 + }, + { + "epoch": 1.8300633866586176, + "grad_norm": 0.12890625, + "learning_rate": 8.040680780406808e-05, + "loss": 0.7032, + "step": 3031 + }, + { + "epoch": 1.830667069121642, + "grad_norm": 0.1201171875, + "learning_rate": 8.036529680365296e-05, + "loss": 0.6389, + "step": 3032 + }, + { + "epoch": 1.8312707515846665, + "grad_norm": 0.1220703125, + "learning_rate": 8.032378580323786e-05, + "loss": 0.8036, + "step": 3033 + }, + { + "epoch": 1.831874434047691, + "grad_norm": 0.1455078125, + "learning_rate": 8.028227480282275e-05, + "loss": 0.6143, + "step": 3034 + }, + { + "epoch": 1.8324781165107153, + "grad_norm": 0.134765625, + "learning_rate": 8.024076380240764e-05, + "loss": 0.6233, + "step": 3035 + }, + { + "epoch": 1.8330817989737398, + "grad_norm": 0.138671875, + "learning_rate": 8.019925280199252e-05, + "loss": 0.5732, + "step": 3036 + }, + { + "epoch": 1.8336854814367642, + "grad_norm": 0.125, + "learning_rate": 8.015774180157742e-05, + "loss": 0.7028, + "step": 3037 + }, + { + "epoch": 1.8342891638997887, + "grad_norm": 0.1494140625, + "learning_rate": 8.011623080116231e-05, + "loss": 0.7947, + "step": 3038 + }, + { + "epoch": 1.8348928463628131, + "grad_norm": 0.12890625, + "learning_rate": 8.00747198007472e-05, + "loss": 0.6311, + "step": 3039 + }, + { + "epoch": 1.8354965288258376, + "grad_norm": 0.1376953125, + "learning_rate": 8.00332088003321e-05, + "loss": 0.6732, + "step": 3040 + }, + { + "epoch": 1.836100211288862, + "grad_norm": 0.1259765625, + "learning_rate": 7.999169779991698e-05, + "loss": 0.5363, + "step": 3041 + }, + { + "epoch": 1.8367038937518865, + "grad_norm": 0.126953125, + "learning_rate": 7.995018679950187e-05, + "loss": 0.5187, + "step": 3042 + }, + { + "epoch": 1.837307576214911, + "grad_norm": 0.1376953125, + "learning_rate": 7.990867579908676e-05, + "loss": 0.6697, + "step": 3043 + }, + { + "epoch": 1.8379112586779354, + "grad_norm": 0.12890625, + "learning_rate": 7.986716479867166e-05, + "loss": 0.5733, + "step": 3044 + }, + { + "epoch": 1.8385149411409598, + "grad_norm": 0.142578125, + "learning_rate": 7.982565379825654e-05, + "loss": 0.5313, + "step": 3045 + }, + { + "epoch": 1.8391186236039843, + "grad_norm": 0.1474609375, + "learning_rate": 7.978414279784143e-05, + "loss": 0.5485, + "step": 3046 + }, + { + "epoch": 1.8397223060670087, + "grad_norm": 0.15625, + "learning_rate": 7.974263179742633e-05, + "loss": 0.5707, + "step": 3047 + }, + { + "epoch": 1.8403259885300332, + "grad_norm": 0.1435546875, + "learning_rate": 7.97011207970112e-05, + "loss": 0.5065, + "step": 3048 + }, + { + "epoch": 1.8409296709930576, + "grad_norm": 0.158203125, + "learning_rate": 7.96596097965961e-05, + "loss": 0.5392, + "step": 3049 + }, + { + "epoch": 1.8415333534560823, + "grad_norm": 0.1533203125, + "learning_rate": 7.961809879618099e-05, + "loss": 0.4614, + "step": 3050 + }, + { + "epoch": 1.8421370359191065, + "grad_norm": 0.171875, + "learning_rate": 7.957658779576588e-05, + "loss": 0.4686, + "step": 3051 + }, + { + "epoch": 1.8427407183821312, + "grad_norm": 0.169921875, + "learning_rate": 7.953507679535078e-05, + "loss": 0.429, + "step": 3052 + }, + { + "epoch": 1.8433444008451554, + "grad_norm": 0.173828125, + "learning_rate": 7.949356579493566e-05, + "loss": 0.3595, + "step": 3053 + }, + { + "epoch": 1.84394808330818, + "grad_norm": 0.1806640625, + "learning_rate": 7.945205479452055e-05, + "loss": 0.3526, + "step": 3054 + }, + { + "epoch": 1.8445517657712043, + "grad_norm": 0.17578125, + "learning_rate": 7.941054379410544e-05, + "loss": 0.3008, + "step": 3055 + }, + { + "epoch": 1.845155448234229, + "grad_norm": 0.17578125, + "learning_rate": 7.936903279369034e-05, + "loss": 0.222, + "step": 3056 + }, + { + "epoch": 1.8457591306972532, + "grad_norm": 0.1513671875, + "learning_rate": 7.932752179327522e-05, + "loss": 0.99, + "step": 3057 + }, + { + "epoch": 1.8463628131602778, + "grad_norm": 0.1328125, + "learning_rate": 7.928601079286011e-05, + "loss": 0.6689, + "step": 3058 + }, + { + "epoch": 1.846966495623302, + "grad_norm": 0.12890625, + "learning_rate": 7.924449979244501e-05, + "loss": 0.5613, + "step": 3059 + }, + { + "epoch": 1.8475701780863267, + "grad_norm": 0.125, + "learning_rate": 7.920298879202988e-05, + "loss": 0.6002, + "step": 3060 + }, + { + "epoch": 1.848173860549351, + "grad_norm": 0.15234375, + "learning_rate": 7.916147779161478e-05, + "loss": 0.6666, + "step": 3061 + }, + { + "epoch": 1.8487775430123756, + "grad_norm": 0.13671875, + "learning_rate": 7.911996679119967e-05, + "loss": 0.6661, + "step": 3062 + }, + { + "epoch": 1.8493812254753998, + "grad_norm": 0.1328125, + "learning_rate": 7.907845579078457e-05, + "loss": 0.588, + "step": 3063 + }, + { + "epoch": 1.8499849079384245, + "grad_norm": 0.14453125, + "learning_rate": 7.903694479036945e-05, + "loss": 0.9441, + "step": 3064 + }, + { + "epoch": 1.8505885904014487, + "grad_norm": 0.134765625, + "learning_rate": 7.899543378995434e-05, + "loss": 0.9012, + "step": 3065 + }, + { + "epoch": 1.8511922728644734, + "grad_norm": 0.1240234375, + "learning_rate": 7.895392278953924e-05, + "loss": 0.8596, + "step": 3066 + }, + { + "epoch": 1.8517959553274976, + "grad_norm": 0.1337890625, + "learning_rate": 7.891241178912411e-05, + "loss": 0.6868, + "step": 3067 + }, + { + "epoch": 1.8523996377905223, + "grad_norm": 0.1474609375, + "learning_rate": 7.887090078870901e-05, + "loss": 0.6323, + "step": 3068 + }, + { + "epoch": 1.8530033202535465, + "grad_norm": 0.1279296875, + "learning_rate": 7.88293897882939e-05, + "loss": 0.5647, + "step": 3069 + }, + { + "epoch": 1.8536070027165712, + "grad_norm": 0.1279296875, + "learning_rate": 7.878787878787879e-05, + "loss": 0.6628, + "step": 3070 + }, + { + "epoch": 1.8542106851795954, + "grad_norm": 0.1416015625, + "learning_rate": 7.874636778746369e-05, + "loss": 0.6885, + "step": 3071 + }, + { + "epoch": 1.85481436764262, + "grad_norm": 0.1201171875, + "learning_rate": 7.870485678704857e-05, + "loss": 0.5712, + "step": 3072 + }, + { + "epoch": 1.8554180501056443, + "grad_norm": 0.1318359375, + "learning_rate": 7.866334578663346e-05, + "loss": 0.6282, + "step": 3073 + }, + { + "epoch": 1.856021732568669, + "grad_norm": 0.1279296875, + "learning_rate": 7.862183478621835e-05, + "loss": 0.6369, + "step": 3074 + }, + { + "epoch": 1.8566254150316932, + "grad_norm": 0.134765625, + "learning_rate": 7.858032378580325e-05, + "loss": 0.6647, + "step": 3075 + }, + { + "epoch": 1.8572290974947179, + "grad_norm": 0.1474609375, + "learning_rate": 7.853881278538813e-05, + "loss": 0.9152, + "step": 3076 + }, + { + "epoch": 1.857832779957742, + "grad_norm": 0.12255859375, + "learning_rate": 7.849730178497302e-05, + "loss": 0.6018, + "step": 3077 + }, + { + "epoch": 1.8584364624207668, + "grad_norm": 0.14453125, + "learning_rate": 7.845579078455792e-05, + "loss": 0.9967, + "step": 3078 + }, + { + "epoch": 1.859040144883791, + "grad_norm": 0.11328125, + "learning_rate": 7.841427978414279e-05, + "loss": 0.6408, + "step": 3079 + }, + { + "epoch": 1.8596438273468157, + "grad_norm": 0.1279296875, + "learning_rate": 7.837276878372769e-05, + "loss": 0.7113, + "step": 3080 + }, + { + "epoch": 1.8602475098098399, + "grad_norm": 0.1357421875, + "learning_rate": 7.833125778331258e-05, + "loss": 0.6914, + "step": 3081 + }, + { + "epoch": 1.8608511922728646, + "grad_norm": 0.125, + "learning_rate": 7.828974678289747e-05, + "loss": 0.8815, + "step": 3082 + }, + { + "epoch": 1.8614548747358888, + "grad_norm": 0.1298828125, + "learning_rate": 7.824823578248237e-05, + "loss": 0.9352, + "step": 3083 + }, + { + "epoch": 1.8620585571989134, + "grad_norm": 0.140625, + "learning_rate": 7.820672478206725e-05, + "loss": 0.6636, + "step": 3084 + }, + { + "epoch": 1.8626622396619377, + "grad_norm": 0.13671875, + "learning_rate": 7.816521378165215e-05, + "loss": 0.6623, + "step": 3085 + }, + { + "epoch": 1.8632659221249623, + "grad_norm": 0.1318359375, + "learning_rate": 7.812370278123702e-05, + "loss": 0.8932, + "step": 3086 + }, + { + "epoch": 1.8638696045879866, + "grad_norm": 0.130859375, + "learning_rate": 7.808219178082192e-05, + "loss": 0.5604, + "step": 3087 + }, + { + "epoch": 1.8644732870510112, + "grad_norm": 0.1455078125, + "learning_rate": 7.804068078040681e-05, + "loss": 0.6196, + "step": 3088 + }, + { + "epoch": 1.8650769695140355, + "grad_norm": 0.1318359375, + "learning_rate": 7.79991697799917e-05, + "loss": 0.6475, + "step": 3089 + }, + { + "epoch": 1.8656806519770601, + "grad_norm": 0.134765625, + "learning_rate": 7.79576587795766e-05, + "loss": 0.6318, + "step": 3090 + }, + { + "epoch": 1.8662843344400846, + "grad_norm": 0.134765625, + "learning_rate": 7.791614777916148e-05, + "loss": 0.5979, + "step": 3091 + }, + { + "epoch": 1.866888016903109, + "grad_norm": 0.130859375, + "learning_rate": 7.787463677874637e-05, + "loss": 0.5466, + "step": 3092 + }, + { + "epoch": 1.8674916993661335, + "grad_norm": 0.13671875, + "learning_rate": 7.783312577833126e-05, + "loss": 0.6918, + "step": 3093 + }, + { + "epoch": 1.868095381829158, + "grad_norm": 0.1552734375, + "learning_rate": 7.779161477791616e-05, + "loss": 0.8258, + "step": 3094 + }, + { + "epoch": 1.8686990642921824, + "grad_norm": 0.169921875, + "learning_rate": 7.775010377750104e-05, + "loss": 0.5167, + "step": 3095 + }, + { + "epoch": 1.8693027467552068, + "grad_norm": 0.13671875, + "learning_rate": 7.770859277708593e-05, + "loss": 0.5426, + "step": 3096 + }, + { + "epoch": 1.8699064292182312, + "grad_norm": 0.1416015625, + "learning_rate": 7.766708177667083e-05, + "loss": 0.5094, + "step": 3097 + }, + { + "epoch": 1.8705101116812557, + "grad_norm": 0.14453125, + "learning_rate": 7.76255707762557e-05, + "loss": 0.5342, + "step": 3098 + }, + { + "epoch": 1.8711137941442801, + "grad_norm": 0.1455078125, + "learning_rate": 7.75840597758406e-05, + "loss": 0.5153, + "step": 3099 + }, + { + "epoch": 1.8717174766073046, + "grad_norm": 0.1630859375, + "learning_rate": 7.754254877542549e-05, + "loss": 0.5314, + "step": 3100 + }, + { + "epoch": 1.872321159070329, + "grad_norm": 0.1513671875, + "learning_rate": 7.750103777501038e-05, + "loss": 0.4814, + "step": 3101 + }, + { + "epoch": 1.8729248415333535, + "grad_norm": 0.162109375, + "learning_rate": 7.745952677459528e-05, + "loss": 0.4209, + "step": 3102 + }, + { + "epoch": 1.873528523996378, + "grad_norm": 0.177734375, + "learning_rate": 7.741801577418016e-05, + "loss": 0.4853, + "step": 3103 + }, + { + "epoch": 1.8741322064594024, + "grad_norm": 0.1875, + "learning_rate": 7.737650477376505e-05, + "loss": 0.427, + "step": 3104 + }, + { + "epoch": 1.8747358889224268, + "grad_norm": 0.1728515625, + "learning_rate": 7.733499377334994e-05, + "loss": 0.2704, + "step": 3105 + }, + { + "epoch": 1.8753395713854513, + "grad_norm": 0.1748046875, + "learning_rate": 7.729348277293484e-05, + "loss": 0.2182, + "step": 3106 + }, + { + "epoch": 1.8759432538484757, + "grad_norm": 0.1376953125, + "learning_rate": 7.725197177251972e-05, + "loss": 0.6389, + "step": 3107 + }, + { + "epoch": 1.8765469363115002, + "grad_norm": 0.1357421875, + "learning_rate": 7.721046077210461e-05, + "loss": 0.7304, + "step": 3108 + }, + { + "epoch": 1.8771506187745246, + "grad_norm": 0.1318359375, + "learning_rate": 7.716894977168951e-05, + "loss": 0.622, + "step": 3109 + }, + { + "epoch": 1.877754301237549, + "grad_norm": 0.140625, + "learning_rate": 7.71274387712744e-05, + "loss": 0.6608, + "step": 3110 + }, + { + "epoch": 1.8783579837005735, + "grad_norm": 0.13671875, + "learning_rate": 7.708592777085928e-05, + "loss": 0.6979, + "step": 3111 + }, + { + "epoch": 1.878961666163598, + "grad_norm": 0.14453125, + "learning_rate": 7.704441677044417e-05, + "loss": 0.64, + "step": 3112 + }, + { + "epoch": 1.8795653486266224, + "grad_norm": 0.1396484375, + "learning_rate": 7.700290577002907e-05, + "loss": 0.846, + "step": 3113 + }, + { + "epoch": 1.8801690310896468, + "grad_norm": 0.134765625, + "learning_rate": 7.696139476961395e-05, + "loss": 0.6574, + "step": 3114 + }, + { + "epoch": 1.8807727135526713, + "grad_norm": 0.12890625, + "learning_rate": 7.691988376919884e-05, + "loss": 0.6364, + "step": 3115 + }, + { + "epoch": 1.8813763960156957, + "grad_norm": 0.1591796875, + "learning_rate": 7.687837276878374e-05, + "loss": 0.916, + "step": 3116 + }, + { + "epoch": 1.8819800784787202, + "grad_norm": 0.134765625, + "learning_rate": 7.683686176836861e-05, + "loss": 0.7091, + "step": 3117 + }, + { + "epoch": 1.8825837609417446, + "grad_norm": 0.1435546875, + "learning_rate": 7.679535076795351e-05, + "loss": 0.72, + "step": 3118 + }, + { + "epoch": 1.883187443404769, + "grad_norm": 0.1318359375, + "learning_rate": 7.67538397675384e-05, + "loss": 0.6347, + "step": 3119 + }, + { + "epoch": 1.8837911258677935, + "grad_norm": 0.12451171875, + "learning_rate": 7.671232876712329e-05, + "loss": 0.6204, + "step": 3120 + }, + { + "epoch": 1.884394808330818, + "grad_norm": 0.126953125, + "learning_rate": 7.667081776670819e-05, + "loss": 0.5928, + "step": 3121 + }, + { + "epoch": 1.8849984907938424, + "grad_norm": 0.1240234375, + "learning_rate": 7.662930676629307e-05, + "loss": 0.6527, + "step": 3122 + }, + { + "epoch": 1.8856021732568669, + "grad_norm": 0.1279296875, + "learning_rate": 7.658779576587796e-05, + "loss": 0.6877, + "step": 3123 + }, + { + "epoch": 1.8862058557198913, + "grad_norm": 0.146484375, + "learning_rate": 7.654628476546285e-05, + "loss": 0.792, + "step": 3124 + }, + { + "epoch": 1.8868095381829157, + "grad_norm": 0.1328125, + "learning_rate": 7.650477376504775e-05, + "loss": 0.6725, + "step": 3125 + }, + { + "epoch": 1.8874132206459402, + "grad_norm": 0.1298828125, + "learning_rate": 7.646326276463262e-05, + "loss": 0.7279, + "step": 3126 + }, + { + "epoch": 1.8880169031089646, + "grad_norm": 0.1318359375, + "learning_rate": 7.642175176421752e-05, + "loss": 0.6495, + "step": 3127 + }, + { + "epoch": 1.888620585571989, + "grad_norm": 0.130859375, + "learning_rate": 7.638024076380242e-05, + "loss": 0.516, + "step": 3128 + }, + { + "epoch": 1.8892242680350135, + "grad_norm": 0.1279296875, + "learning_rate": 7.63387297633873e-05, + "loss": 0.9977, + "step": 3129 + }, + { + "epoch": 1.889827950498038, + "grad_norm": 0.12060546875, + "learning_rate": 7.629721876297219e-05, + "loss": 0.5642, + "step": 3130 + }, + { + "epoch": 1.8904316329610626, + "grad_norm": 0.1455078125, + "learning_rate": 7.625570776255708e-05, + "loss": 0.9672, + "step": 3131 + }, + { + "epoch": 1.8910353154240869, + "grad_norm": 0.126953125, + "learning_rate": 7.621419676214198e-05, + "loss": 0.6098, + "step": 3132 + }, + { + "epoch": 1.8916389978871115, + "grad_norm": 0.1474609375, + "learning_rate": 7.617268576172685e-05, + "loss": 0.5991, + "step": 3133 + }, + { + "epoch": 1.8922426803501358, + "grad_norm": 0.1240234375, + "learning_rate": 7.613117476131175e-05, + "loss": 0.585, + "step": 3134 + }, + { + "epoch": 1.8928463628131604, + "grad_norm": 0.12158203125, + "learning_rate": 7.608966376089665e-05, + "loss": 0.6032, + "step": 3135 + }, + { + "epoch": 1.8934500452761847, + "grad_norm": 0.1396484375, + "learning_rate": 7.604815276048153e-05, + "loss": 0.5951, + "step": 3136 + }, + { + "epoch": 1.8940537277392093, + "grad_norm": 0.1220703125, + "learning_rate": 7.600664176006643e-05, + "loss": 0.5891, + "step": 3137 + }, + { + "epoch": 1.8946574102022335, + "grad_norm": 0.134765625, + "learning_rate": 7.596513075965131e-05, + "loss": 0.6856, + "step": 3138 + }, + { + "epoch": 1.8952610926652582, + "grad_norm": 0.1279296875, + "learning_rate": 7.59236197592362e-05, + "loss": 0.5943, + "step": 3139 + }, + { + "epoch": 1.8958647751282824, + "grad_norm": 0.1279296875, + "learning_rate": 7.588210875882108e-05, + "loss": 0.5702, + "step": 3140 + }, + { + "epoch": 1.896468457591307, + "grad_norm": 0.1357421875, + "learning_rate": 7.584059775840598e-05, + "loss": 0.6605, + "step": 3141 + }, + { + "epoch": 1.8970721400543313, + "grad_norm": 0.1298828125, + "learning_rate": 7.579908675799087e-05, + "loss": 0.5738, + "step": 3142 + }, + { + "epoch": 1.897675822517356, + "grad_norm": 0.146484375, + "learning_rate": 7.575757575757576e-05, + "loss": 0.6339, + "step": 3143 + }, + { + "epoch": 1.8982795049803802, + "grad_norm": 0.13671875, + "learning_rate": 7.571606475716066e-05, + "loss": 0.6243, + "step": 3144 + }, + { + "epoch": 1.898883187443405, + "grad_norm": 0.1611328125, + "learning_rate": 7.567455375674553e-05, + "loss": 0.5782, + "step": 3145 + }, + { + "epoch": 1.8994868699064291, + "grad_norm": 0.1435546875, + "learning_rate": 7.563304275633043e-05, + "loss": 0.4687, + "step": 3146 + }, + { + "epoch": 1.9000905523694538, + "grad_norm": 0.14453125, + "learning_rate": 7.559153175591532e-05, + "loss": 0.4899, + "step": 3147 + }, + { + "epoch": 1.900694234832478, + "grad_norm": 0.150390625, + "learning_rate": 7.55500207555002e-05, + "loss": 0.4926, + "step": 3148 + }, + { + "epoch": 1.9012979172955027, + "grad_norm": 0.1552734375, + "learning_rate": 7.55085097550851e-05, + "loss": 0.511, + "step": 3149 + }, + { + "epoch": 1.901901599758527, + "grad_norm": 0.1728515625, + "learning_rate": 7.546699875466999e-05, + "loss": 0.556, + "step": 3150 + }, + { + "epoch": 1.9025052822215516, + "grad_norm": 0.1669921875, + "learning_rate": 7.542548775425489e-05, + "loss": 0.4537, + "step": 3151 + }, + { + "epoch": 1.9031089646845758, + "grad_norm": 0.1669921875, + "learning_rate": 7.538397675383976e-05, + "loss": 0.4078, + "step": 3152 + }, + { + "epoch": 1.9037126471476005, + "grad_norm": 0.1875, + "learning_rate": 7.534246575342466e-05, + "loss": 0.4259, + "step": 3153 + }, + { + "epoch": 1.9043163296106247, + "grad_norm": 0.18359375, + "learning_rate": 7.530095475300955e-05, + "loss": 0.3296, + "step": 3154 + }, + { + "epoch": 1.9049200120736494, + "grad_norm": 0.193359375, + "learning_rate": 7.525944375259444e-05, + "loss": 0.3254, + "step": 3155 + }, + { + "epoch": 1.9055236945366736, + "grad_norm": 0.1748046875, + "learning_rate": 7.521793275217934e-05, + "loss": 0.21, + "step": 3156 + }, + { + "epoch": 1.9061273769996983, + "grad_norm": 0.1357421875, + "learning_rate": 7.517642175176422e-05, + "loss": 0.604, + "step": 3157 + }, + { + "epoch": 1.9067310594627225, + "grad_norm": 0.134765625, + "learning_rate": 7.513491075134911e-05, + "loss": 0.6113, + "step": 3158 + }, + { + "epoch": 1.9073347419257471, + "grad_norm": 0.1494140625, + "learning_rate": 7.5093399750934e-05, + "loss": 0.6336, + "step": 3159 + }, + { + "epoch": 1.9079384243887714, + "grad_norm": 0.1376953125, + "learning_rate": 7.50518887505189e-05, + "loss": 0.6376, + "step": 3160 + }, + { + "epoch": 1.908542106851796, + "grad_norm": 0.13671875, + "learning_rate": 7.501037775010378e-05, + "loss": 0.6476, + "step": 3161 + }, + { + "epoch": 1.9091457893148203, + "grad_norm": 0.1240234375, + "learning_rate": 7.496886674968867e-05, + "loss": 0.6033, + "step": 3162 + }, + { + "epoch": 1.909749471777845, + "grad_norm": 0.1376953125, + "learning_rate": 7.492735574927357e-05, + "loss": 0.6158, + "step": 3163 + }, + { + "epoch": 1.9103531542408692, + "grad_norm": 0.134765625, + "learning_rate": 7.488584474885844e-05, + "loss": 0.6094, + "step": 3164 + }, + { + "epoch": 1.9109568367038938, + "grad_norm": 0.140625, + "learning_rate": 7.484433374844334e-05, + "loss": 0.6744, + "step": 3165 + }, + { + "epoch": 1.911560519166918, + "grad_norm": 0.341796875, + "learning_rate": 7.480282274802823e-05, + "loss": 0.6104, + "step": 3166 + }, + { + "epoch": 1.9121642016299427, + "grad_norm": 0.1396484375, + "learning_rate": 7.476131174761311e-05, + "loss": 0.6194, + "step": 3167 + }, + { + "epoch": 1.912767884092967, + "grad_norm": 0.126953125, + "learning_rate": 7.471980074719801e-05, + "loss": 0.8527, + "step": 3168 + }, + { + "epoch": 1.9133715665559916, + "grad_norm": 0.1572265625, + "learning_rate": 7.46782897467829e-05, + "loss": 0.6211, + "step": 3169 + }, + { + "epoch": 1.9139752490190158, + "grad_norm": 0.146484375, + "learning_rate": 7.463677874636779e-05, + "loss": 0.6325, + "step": 3170 + }, + { + "epoch": 1.9145789314820405, + "grad_norm": 0.1513671875, + "learning_rate": 7.459526774595267e-05, + "loss": 0.7137, + "step": 3171 + }, + { + "epoch": 1.915182613945065, + "grad_norm": 0.15234375, + "learning_rate": 7.455375674553757e-05, + "loss": 0.6791, + "step": 3172 + }, + { + "epoch": 1.9157862964080894, + "grad_norm": 0.1181640625, + "learning_rate": 7.451224574512246e-05, + "loss": 0.6181, + "step": 3173 + }, + { + "epoch": 1.9163899788711138, + "grad_norm": 0.15234375, + "learning_rate": 7.447073474470735e-05, + "loss": 0.6748, + "step": 3174 + }, + { + "epoch": 1.9169936613341383, + "grad_norm": 0.1533203125, + "learning_rate": 7.442922374429225e-05, + "loss": 0.7586, + "step": 3175 + }, + { + "epoch": 1.9175973437971627, + "grad_norm": 0.1376953125, + "learning_rate": 7.438771274387713e-05, + "loss": 0.6438, + "step": 3176 + }, + { + "epoch": 1.9182010262601872, + "grad_norm": 0.130859375, + "learning_rate": 7.434620174346202e-05, + "loss": 0.8398, + "step": 3177 + }, + { + "epoch": 1.9188047087232116, + "grad_norm": 0.1494140625, + "learning_rate": 7.43046907430469e-05, + "loss": 0.6768, + "step": 3178 + }, + { + "epoch": 1.919408391186236, + "grad_norm": 0.1484375, + "learning_rate": 7.42631797426318e-05, + "loss": 0.707, + "step": 3179 + }, + { + "epoch": 1.9200120736492605, + "grad_norm": 0.1318359375, + "learning_rate": 7.422166874221669e-05, + "loss": 0.9903, + "step": 3180 + }, + { + "epoch": 1.920615756112285, + "grad_norm": 0.1318359375, + "learning_rate": 7.418015774180158e-05, + "loss": 0.8726, + "step": 3181 + }, + { + "epoch": 1.9212194385753094, + "grad_norm": 0.125, + "learning_rate": 7.413864674138648e-05, + "loss": 0.6792, + "step": 3182 + }, + { + "epoch": 1.9218231210383339, + "grad_norm": 0.1396484375, + "learning_rate": 7.409713574097135e-05, + "loss": 0.6856, + "step": 3183 + }, + { + "epoch": 1.9224268035013583, + "grad_norm": 0.134765625, + "learning_rate": 7.405562474055625e-05, + "loss": 0.6322, + "step": 3184 + }, + { + "epoch": 1.9230304859643828, + "grad_norm": 0.1435546875, + "learning_rate": 7.401411374014114e-05, + "loss": 0.7408, + "step": 3185 + }, + { + "epoch": 1.9236341684274072, + "grad_norm": 0.1328125, + "learning_rate": 7.397260273972603e-05, + "loss": 0.6326, + "step": 3186 + }, + { + "epoch": 1.9242378508904316, + "grad_norm": 0.1328125, + "learning_rate": 7.393109173931093e-05, + "loss": 1.1231, + "step": 3187 + }, + { + "epoch": 1.924841533353456, + "grad_norm": 0.1337890625, + "learning_rate": 7.388958073889581e-05, + "loss": 0.5966, + "step": 3188 + }, + { + "epoch": 1.9254452158164805, + "grad_norm": 0.1357421875, + "learning_rate": 7.38480697384807e-05, + "loss": 0.6252, + "step": 3189 + }, + { + "epoch": 1.926048898279505, + "grad_norm": 0.14453125, + "learning_rate": 7.380655873806558e-05, + "loss": 0.6251, + "step": 3190 + }, + { + "epoch": 1.9266525807425294, + "grad_norm": 0.1435546875, + "learning_rate": 7.376504773765048e-05, + "loss": 0.6284, + "step": 3191 + }, + { + "epoch": 1.9272562632055539, + "grad_norm": 0.1318359375, + "learning_rate": 7.372353673723537e-05, + "loss": 0.5263, + "step": 3192 + }, + { + "epoch": 1.9278599456685783, + "grad_norm": 0.1513671875, + "learning_rate": 7.368202573682026e-05, + "loss": 0.5516, + "step": 3193 + }, + { + "epoch": 1.9284636281316028, + "grad_norm": 0.138671875, + "learning_rate": 7.364051473640516e-05, + "loss": 0.5675, + "step": 3194 + }, + { + "epoch": 1.9290673105946272, + "grad_norm": 0.146484375, + "learning_rate": 7.359900373599004e-05, + "loss": 0.5492, + "step": 3195 + }, + { + "epoch": 1.9296709930576517, + "grad_norm": 0.14453125, + "learning_rate": 7.355749273557493e-05, + "loss": 0.5312, + "step": 3196 + }, + { + "epoch": 1.930274675520676, + "grad_norm": 0.150390625, + "learning_rate": 7.351598173515982e-05, + "loss": 0.5808, + "step": 3197 + }, + { + "epoch": 1.9308783579837006, + "grad_norm": 0.1533203125, + "learning_rate": 7.347447073474472e-05, + "loss": 0.5105, + "step": 3198 + }, + { + "epoch": 1.931482040446725, + "grad_norm": 0.1513671875, + "learning_rate": 7.34329597343296e-05, + "loss": 0.514, + "step": 3199 + }, + { + "epoch": 1.9320857229097494, + "grad_norm": 0.17578125, + "learning_rate": 7.339144873391449e-05, + "loss": 0.5566, + "step": 3200 + }, + { + "epoch": 1.932689405372774, + "grad_norm": 0.1748046875, + "learning_rate": 7.334993773349939e-05, + "loss": 0.4912, + "step": 3201 + }, + { + "epoch": 1.9332930878357983, + "grad_norm": 0.181640625, + "learning_rate": 7.330842673308426e-05, + "loss": 0.4308, + "step": 3202 + }, + { + "epoch": 1.9338967702988228, + "grad_norm": 0.16796875, + "learning_rate": 7.326691573266916e-05, + "loss": 0.3551, + "step": 3203 + }, + { + "epoch": 1.9345004527618472, + "grad_norm": 0.181640625, + "learning_rate": 7.322540473225405e-05, + "loss": 0.3277, + "step": 3204 + }, + { + "epoch": 1.9351041352248717, + "grad_norm": 0.171875, + "learning_rate": 7.318389373183894e-05, + "loss": 0.2927, + "step": 3205 + }, + { + "epoch": 1.9357078176878961, + "grad_norm": 0.189453125, + "learning_rate": 7.314238273142384e-05, + "loss": 0.2479, + "step": 3206 + }, + { + "epoch": 1.9363115001509206, + "grad_norm": 0.1650390625, + "learning_rate": 7.310087173100872e-05, + "loss": 0.5836, + "step": 3207 + }, + { + "epoch": 1.936915182613945, + "grad_norm": 0.140625, + "learning_rate": 7.305936073059361e-05, + "loss": 0.5894, + "step": 3208 + }, + { + "epoch": 1.9375188650769695, + "grad_norm": 0.1259765625, + "learning_rate": 7.30178497301785e-05, + "loss": 0.6468, + "step": 3209 + }, + { + "epoch": 1.938122547539994, + "grad_norm": 0.14453125, + "learning_rate": 7.29763387297634e-05, + "loss": 0.6436, + "step": 3210 + }, + { + "epoch": 1.9387262300030184, + "grad_norm": 0.140625, + "learning_rate": 7.293482772934828e-05, + "loss": 0.6471, + "step": 3211 + }, + { + "epoch": 1.939329912466043, + "grad_norm": 0.1435546875, + "learning_rate": 7.289331672893317e-05, + "loss": 0.6474, + "step": 3212 + }, + { + "epoch": 1.9399335949290673, + "grad_norm": 0.1376953125, + "learning_rate": 7.285180572851807e-05, + "loss": 0.8865, + "step": 3213 + }, + { + "epoch": 1.940537277392092, + "grad_norm": 0.12060546875, + "learning_rate": 7.281029472810294e-05, + "loss": 0.9, + "step": 3214 + }, + { + "epoch": 1.9411409598551161, + "grad_norm": 0.1337890625, + "learning_rate": 7.276878372768784e-05, + "loss": 0.6733, + "step": 3215 + }, + { + "epoch": 1.9417446423181408, + "grad_norm": 0.13671875, + "learning_rate": 7.272727272727273e-05, + "loss": 0.6217, + "step": 3216 + }, + { + "epoch": 1.942348324781165, + "grad_norm": 0.1396484375, + "learning_rate": 7.268576172685761e-05, + "loss": 0.5774, + "step": 3217 + }, + { + "epoch": 1.9429520072441897, + "grad_norm": 0.1298828125, + "learning_rate": 7.264425072644251e-05, + "loss": 0.7204, + "step": 3218 + }, + { + "epoch": 1.943555689707214, + "grad_norm": 0.1552734375, + "learning_rate": 7.26027397260274e-05, + "loss": 0.709, + "step": 3219 + }, + { + "epoch": 1.9441593721702386, + "grad_norm": 0.1416015625, + "learning_rate": 7.25612287256123e-05, + "loss": 0.685, + "step": 3220 + }, + { + "epoch": 1.9447630546332628, + "grad_norm": 0.14453125, + "learning_rate": 7.251971772519717e-05, + "loss": 0.6819, + "step": 3221 + }, + { + "epoch": 1.9453667370962875, + "grad_norm": 0.1396484375, + "learning_rate": 7.247820672478207e-05, + "loss": 0.6286, + "step": 3222 + }, + { + "epoch": 1.9459704195593117, + "grad_norm": 0.126953125, + "learning_rate": 7.243669572436696e-05, + "loss": 0.6284, + "step": 3223 + }, + { + "epoch": 1.9465741020223364, + "grad_norm": 0.138671875, + "learning_rate": 7.239518472395185e-05, + "loss": 0.7257, + "step": 3224 + }, + { + "epoch": 1.9471777844853606, + "grad_norm": 0.1259765625, + "learning_rate": 7.235367372353675e-05, + "loss": 0.6584, + "step": 3225 + }, + { + "epoch": 1.9477814669483853, + "grad_norm": 0.1337890625, + "learning_rate": 7.231216272312163e-05, + "loss": 0.6025, + "step": 3226 + }, + { + "epoch": 1.9483851494114095, + "grad_norm": 0.1748046875, + "learning_rate": 7.227065172270652e-05, + "loss": 0.6027, + "step": 3227 + }, + { + "epoch": 1.9489888318744342, + "grad_norm": 0.173828125, + "learning_rate": 7.22291407222914e-05, + "loss": 0.7539, + "step": 3228 + }, + { + "epoch": 1.9495925143374584, + "grad_norm": 0.150390625, + "learning_rate": 7.218762972187631e-05, + "loss": 0.6194, + "step": 3229 + }, + { + "epoch": 1.950196196800483, + "grad_norm": 0.1474609375, + "learning_rate": 7.21461187214612e-05, + "loss": 0.7173, + "step": 3230 + }, + { + "epoch": 1.9507998792635073, + "grad_norm": 0.154296875, + "learning_rate": 7.210460772104608e-05, + "loss": 0.7522, + "step": 3231 + }, + { + "epoch": 1.951403561726532, + "grad_norm": 0.1689453125, + "learning_rate": 7.206309672063098e-05, + "loss": 0.7973, + "step": 3232 + }, + { + "epoch": 1.9520072441895562, + "grad_norm": 0.1337890625, + "learning_rate": 7.202158572021585e-05, + "loss": 0.7033, + "step": 3233 + }, + { + "epoch": 1.9526109266525808, + "grad_norm": 0.1533203125, + "learning_rate": 7.198007471980075e-05, + "loss": 0.6033, + "step": 3234 + }, + { + "epoch": 1.953214609115605, + "grad_norm": 0.140625, + "learning_rate": 7.193856371938564e-05, + "loss": 0.5346, + "step": 3235 + }, + { + "epoch": 1.9538182915786297, + "grad_norm": 0.146484375, + "learning_rate": 7.189705271897053e-05, + "loss": 0.666, + "step": 3236 + }, + { + "epoch": 1.954421974041654, + "grad_norm": 0.12890625, + "learning_rate": 7.185554171855543e-05, + "loss": 0.6379, + "step": 3237 + }, + { + "epoch": 1.9550256565046786, + "grad_norm": 0.2041015625, + "learning_rate": 7.181403071814031e-05, + "loss": 0.5682, + "step": 3238 + }, + { + "epoch": 1.9556293389677029, + "grad_norm": 0.134765625, + "learning_rate": 7.17725197177252e-05, + "loss": 0.697, + "step": 3239 + }, + { + "epoch": 1.9562330214307275, + "grad_norm": 0.12890625, + "learning_rate": 7.173100871731009e-05, + "loss": 0.5934, + "step": 3240 + }, + { + "epoch": 1.9568367038937517, + "grad_norm": 0.134765625, + "learning_rate": 7.168949771689499e-05, + "loss": 0.5425, + "step": 3241 + }, + { + "epoch": 1.9574403863567764, + "grad_norm": 0.134765625, + "learning_rate": 7.164798671647987e-05, + "loss": 0.5661, + "step": 3242 + }, + { + "epoch": 1.9580440688198006, + "grad_norm": 0.138671875, + "learning_rate": 7.160647571606476e-05, + "loss": 0.598, + "step": 3243 + }, + { + "epoch": 1.9586477512828253, + "grad_norm": 0.140625, + "learning_rate": 7.156496471564966e-05, + "loss": 0.5434, + "step": 3244 + }, + { + "epoch": 1.9592514337458495, + "grad_norm": 0.14453125, + "learning_rate": 7.152345371523454e-05, + "loss": 0.5406, + "step": 3245 + }, + { + "epoch": 1.9598551162088742, + "grad_norm": 0.15625, + "learning_rate": 7.148194271481943e-05, + "loss": 0.5284, + "step": 3246 + }, + { + "epoch": 1.9604587986718984, + "grad_norm": 0.154296875, + "learning_rate": 7.144043171440432e-05, + "loss": 0.5847, + "step": 3247 + }, + { + "epoch": 1.961062481134923, + "grad_norm": 0.1533203125, + "learning_rate": 7.139892071398922e-05, + "loss": 0.5552, + "step": 3248 + }, + { + "epoch": 1.9616661635979473, + "grad_norm": 0.1611328125, + "learning_rate": 7.135740971357409e-05, + "loss": 0.4877, + "step": 3249 + }, + { + "epoch": 1.962269846060972, + "grad_norm": 0.1669921875, + "learning_rate": 7.131589871315899e-05, + "loss": 0.5421, + "step": 3250 + }, + { + "epoch": 1.9628735285239962, + "grad_norm": 0.1796875, + "learning_rate": 7.127438771274389e-05, + "loss": 0.4781, + "step": 3251 + }, + { + "epoch": 1.9634772109870209, + "grad_norm": 0.1650390625, + "learning_rate": 7.123287671232876e-05, + "loss": 0.4282, + "step": 3252 + }, + { + "epoch": 1.9640808934500453, + "grad_norm": 0.1806640625, + "learning_rate": 7.119136571191366e-05, + "loss": 0.4592, + "step": 3253 + }, + { + "epoch": 1.9646845759130698, + "grad_norm": 0.18359375, + "learning_rate": 7.114985471149855e-05, + "loss": 0.3757, + "step": 3254 + }, + { + "epoch": 1.9652882583760942, + "grad_norm": 0.2041015625, + "learning_rate": 7.110834371108344e-05, + "loss": 0.3974, + "step": 3255 + }, + { + "epoch": 1.9658919408391187, + "grad_norm": 0.1767578125, + "learning_rate": 7.106683271066832e-05, + "loss": 0.249, + "step": 3256 + }, + { + "epoch": 1.9664956233021431, + "grad_norm": 0.1279296875, + "learning_rate": 7.102532171025322e-05, + "loss": 0.643, + "step": 3257 + }, + { + "epoch": 1.9670993057651676, + "grad_norm": 0.1259765625, + "learning_rate": 7.098381070983811e-05, + "loss": 0.5703, + "step": 3258 + }, + { + "epoch": 1.967702988228192, + "grad_norm": 0.138671875, + "learning_rate": 7.0942299709423e-05, + "loss": 0.6712, + "step": 3259 + }, + { + "epoch": 1.9683066706912165, + "grad_norm": 0.138671875, + "learning_rate": 7.09007887090079e-05, + "loss": 0.6473, + "step": 3260 + }, + { + "epoch": 1.968910353154241, + "grad_norm": 0.1552734375, + "learning_rate": 7.085927770859277e-05, + "loss": 0.7151, + "step": 3261 + }, + { + "epoch": 1.9695140356172653, + "grad_norm": 0.1416015625, + "learning_rate": 7.081776670817767e-05, + "loss": 0.6657, + "step": 3262 + }, + { + "epoch": 1.9701177180802898, + "grad_norm": 0.15625, + "learning_rate": 7.077625570776256e-05, + "loss": 0.7846, + "step": 3263 + }, + { + "epoch": 1.9707214005433142, + "grad_norm": 0.1611328125, + "learning_rate": 7.073474470734746e-05, + "loss": 0.6625, + "step": 3264 + }, + { + "epoch": 1.9713250830063387, + "grad_norm": 0.14453125, + "learning_rate": 7.069323370693234e-05, + "loss": 0.5718, + "step": 3265 + }, + { + "epoch": 1.9719287654693631, + "grad_norm": 0.1416015625, + "learning_rate": 7.065172270651723e-05, + "loss": 0.855, + "step": 3266 + }, + { + "epoch": 1.9725324479323876, + "grad_norm": 0.140625, + "learning_rate": 7.061021170610213e-05, + "loss": 0.8964, + "step": 3267 + }, + { + "epoch": 1.973136130395412, + "grad_norm": 0.146484375, + "learning_rate": 7.0568700705687e-05, + "loss": 0.9168, + "step": 3268 + }, + { + "epoch": 1.9737398128584365, + "grad_norm": 0.1455078125, + "learning_rate": 7.05271897052719e-05, + "loss": 0.6553, + "step": 3269 + }, + { + "epoch": 1.974343495321461, + "grad_norm": 0.1396484375, + "learning_rate": 7.048567870485679e-05, + "loss": 0.6188, + "step": 3270 + }, + { + "epoch": 1.9749471777844854, + "grad_norm": 0.1337890625, + "learning_rate": 7.044416770444167e-05, + "loss": 0.5445, + "step": 3271 + }, + { + "epoch": 1.9755508602475098, + "grad_norm": 0.1396484375, + "learning_rate": 7.040265670402657e-05, + "loss": 0.6776, + "step": 3272 + }, + { + "epoch": 1.9761545427105343, + "grad_norm": 0.140625, + "learning_rate": 7.036114570361146e-05, + "loss": 0.6244, + "step": 3273 + }, + { + "epoch": 1.9767582251735587, + "grad_norm": 0.1416015625, + "learning_rate": 7.031963470319635e-05, + "loss": 0.6687, + "step": 3274 + }, + { + "epoch": 1.9773619076365831, + "grad_norm": 0.11962890625, + "learning_rate": 7.027812370278123e-05, + "loss": 0.5678, + "step": 3275 + }, + { + "epoch": 1.9779655900996076, + "grad_norm": 0.1298828125, + "learning_rate": 7.023661270236613e-05, + "loss": 0.7597, + "step": 3276 + }, + { + "epoch": 1.978569272562632, + "grad_norm": 0.1279296875, + "learning_rate": 7.019510170195102e-05, + "loss": 0.6088, + "step": 3277 + }, + { + "epoch": 1.9791729550256565, + "grad_norm": 0.12109375, + "learning_rate": 7.015359070153591e-05, + "loss": 0.8278, + "step": 3278 + }, + { + "epoch": 1.979776637488681, + "grad_norm": 0.138671875, + "learning_rate": 7.011207970112081e-05, + "loss": 1.011, + "step": 3279 + }, + { + "epoch": 1.9803803199517054, + "grad_norm": 0.1259765625, + "learning_rate": 7.007056870070568e-05, + "loss": 0.5667, + "step": 3280 + }, + { + "epoch": 1.9809840024147298, + "grad_norm": 0.126953125, + "learning_rate": 7.002905770029058e-05, + "loss": 0.7015, + "step": 3281 + }, + { + "epoch": 1.9815876848777543, + "grad_norm": 0.1318359375, + "learning_rate": 6.998754669987547e-05, + "loss": 0.614, + "step": 3282 + }, + { + "epoch": 1.9821913673407787, + "grad_norm": 0.146484375, + "learning_rate": 6.994603569946035e-05, + "loss": 0.6907, + "step": 3283 + }, + { + "epoch": 1.9827950498038032, + "grad_norm": 0.12890625, + "learning_rate": 6.990452469904525e-05, + "loss": 0.5913, + "step": 3284 + }, + { + "epoch": 1.9833987322668276, + "grad_norm": 0.134765625, + "learning_rate": 6.986301369863014e-05, + "loss": 0.6824, + "step": 3285 + }, + { + "epoch": 1.984002414729852, + "grad_norm": 0.1181640625, + "learning_rate": 6.982150269821504e-05, + "loss": 0.6013, + "step": 3286 + }, + { + "epoch": 1.9846060971928765, + "grad_norm": 0.14453125, + "learning_rate": 6.977999169779991e-05, + "loss": 0.8288, + "step": 3287 + }, + { + "epoch": 1.985209779655901, + "grad_norm": 0.1318359375, + "learning_rate": 6.973848069738481e-05, + "loss": 0.6383, + "step": 3288 + }, + { + "epoch": 1.9858134621189254, + "grad_norm": 0.1259765625, + "learning_rate": 6.96969696969697e-05, + "loss": 0.7101, + "step": 3289 + }, + { + "epoch": 1.9864171445819498, + "grad_norm": 0.123046875, + "learning_rate": 6.965545869655459e-05, + "loss": 0.7906, + "step": 3290 + }, + { + "epoch": 1.9870208270449743, + "grad_norm": 0.1259765625, + "learning_rate": 6.961394769613949e-05, + "loss": 0.5729, + "step": 3291 + }, + { + "epoch": 1.9876245095079987, + "grad_norm": 0.1376953125, + "learning_rate": 6.957243669572437e-05, + "loss": 0.6291, + "step": 3292 + }, + { + "epoch": 1.9882281919710234, + "grad_norm": 0.1513671875, + "learning_rate": 6.953092569530926e-05, + "loss": 0.6279, + "step": 3293 + }, + { + "epoch": 1.9888318744340476, + "grad_norm": 0.1376953125, + "learning_rate": 6.948941469489415e-05, + "loss": 0.5872, + "step": 3294 + }, + { + "epoch": 1.9894355568970723, + "grad_norm": 0.150390625, + "learning_rate": 6.944790369447905e-05, + "loss": 0.5892, + "step": 3295 + }, + { + "epoch": 1.9900392393600965, + "grad_norm": 0.140625, + "learning_rate": 6.940639269406393e-05, + "loss": 0.5677, + "step": 3296 + }, + { + "epoch": 1.9906429218231212, + "grad_norm": 0.1513671875, + "learning_rate": 6.936488169364882e-05, + "loss": 0.5897, + "step": 3297 + }, + { + "epoch": 1.9912466042861454, + "grad_norm": 0.14453125, + "learning_rate": 6.932337069323372e-05, + "loss": 0.5832, + "step": 3298 + }, + { + "epoch": 1.99185028674917, + "grad_norm": 0.1533203125, + "learning_rate": 6.928185969281859e-05, + "loss": 0.5267, + "step": 3299 + }, + { + "epoch": 1.9924539692121943, + "grad_norm": 0.1572265625, + "learning_rate": 6.924034869240349e-05, + "loss": 0.491, + "step": 3300 + }, + { + "epoch": 1.993057651675219, + "grad_norm": 0.177734375, + "learning_rate": 6.919883769198838e-05, + "loss": 0.4929, + "step": 3301 + }, + { + "epoch": 1.9936613341382432, + "grad_norm": 0.171875, + "learning_rate": 6.915732669157326e-05, + "loss": 0.4367, + "step": 3302 + }, + { + "epoch": 1.9942650166012679, + "grad_norm": 0.1845703125, + "learning_rate": 6.911581569115816e-05, + "loss": 0.3382, + "step": 3303 + }, + { + "epoch": 1.994868699064292, + "grad_norm": 0.1953125, + "learning_rate": 6.907430469074305e-05, + "loss": 0.3819, + "step": 3304 + }, + { + "epoch": 1.9954723815273168, + "grad_norm": 0.1875, + "learning_rate": 6.903279369032794e-05, + "loss": 0.3094, + "step": 3305 + }, + { + "epoch": 1.996076063990341, + "grad_norm": 0.220703125, + "learning_rate": 6.899128268991282e-05, + "loss": 0.2772, + "step": 3306 + }, + { + "epoch": 1.9966797464533657, + "grad_norm": 0.1416015625, + "learning_rate": 6.894977168949772e-05, + "loss": 0.8034, + "step": 3307 + }, + { + "epoch": 1.9972834289163899, + "grad_norm": 0.1484375, + "learning_rate": 6.890826068908261e-05, + "loss": 0.6767, + "step": 3308 + }, + { + "epoch": 1.9978871113794145, + "grad_norm": 0.142578125, + "learning_rate": 6.88667496886675e-05, + "loss": 0.7063, + "step": 3309 + }, + { + "epoch": 1.9984907938424388, + "grad_norm": 0.23828125, + "learning_rate": 6.88252386882524e-05, + "loss": 0.5946, + "step": 3310 + }, + { + "epoch": 1.9990944763054634, + "grad_norm": 0.1298828125, + "learning_rate": 6.878372768783728e-05, + "loss": 0.7957, + "step": 3311 + }, + { + "epoch": 1.9996981587684877, + "grad_norm": 0.1494140625, + "learning_rate": 6.874221668742217e-05, + "loss": 0.4724, + "step": 3312 + }, + { + "epoch": 2.0006036824630247, + "grad_norm": 0.279296875, + "learning_rate": 6.870070568700706e-05, + "loss": 0.9675, + "step": 3313 + }, + { + "epoch": 2.001207364926049, + "grad_norm": 0.1337890625, + "learning_rate": 6.865919468659196e-05, + "loss": 0.6052, + "step": 3314 + }, + { + "epoch": 2.0018110473890736, + "grad_norm": 0.140625, + "learning_rate": 6.861768368617684e-05, + "loss": 0.675, + "step": 3315 + }, + { + "epoch": 2.002414729852098, + "grad_norm": 0.12451171875, + "learning_rate": 6.857617268576173e-05, + "loss": 0.7636, + "step": 3316 + }, + { + "epoch": 2.0030184123151225, + "grad_norm": 0.1337890625, + "learning_rate": 6.853466168534663e-05, + "loss": 0.5467, + "step": 3317 + }, + { + "epoch": 2.0036220947781467, + "grad_norm": 0.1328125, + "learning_rate": 6.84931506849315e-05, + "loss": 0.6109, + "step": 3318 + }, + { + "epoch": 2.0042257772411713, + "grad_norm": 0.1376953125, + "learning_rate": 6.84516396845164e-05, + "loss": 0.7048, + "step": 3319 + }, + { + "epoch": 2.0048294597041956, + "grad_norm": 0.12060546875, + "learning_rate": 6.841012868410129e-05, + "loss": 1.0679, + "step": 3320 + }, + { + "epoch": 2.0054331421672202, + "grad_norm": 0.1494140625, + "learning_rate": 6.836861768368617e-05, + "loss": 0.5886, + "step": 3321 + }, + { + "epoch": 2.0060368246302445, + "grad_norm": 0.142578125, + "learning_rate": 6.832710668327108e-05, + "loss": 0.6399, + "step": 3322 + }, + { + "epoch": 2.006640507093269, + "grad_norm": 0.134765625, + "learning_rate": 6.828559568285596e-05, + "loss": 0.6309, + "step": 3323 + }, + { + "epoch": 2.0072441895562934, + "grad_norm": 0.130859375, + "learning_rate": 6.824408468244085e-05, + "loss": 0.5685, + "step": 3324 + }, + { + "epoch": 2.007847872019318, + "grad_norm": 0.1279296875, + "learning_rate": 6.820257368202573e-05, + "loss": 0.5241, + "step": 3325 + }, + { + "epoch": 2.0084515544823422, + "grad_norm": 0.1533203125, + "learning_rate": 6.816106268161063e-05, + "loss": 0.6275, + "step": 3326 + }, + { + "epoch": 2.009055236945367, + "grad_norm": 0.1494140625, + "learning_rate": 6.811955168119552e-05, + "loss": 0.853, + "step": 3327 + }, + { + "epoch": 2.009658919408391, + "grad_norm": 0.1337890625, + "learning_rate": 6.807804068078041e-05, + "loss": 0.5848, + "step": 3328 + }, + { + "epoch": 2.010262601871416, + "grad_norm": 0.134765625, + "learning_rate": 6.803652968036531e-05, + "loss": 0.8565, + "step": 3329 + }, + { + "epoch": 2.01086628433444, + "grad_norm": 0.1484375, + "learning_rate": 6.79950186799502e-05, + "loss": 0.7739, + "step": 3330 + }, + { + "epoch": 2.0114699667974647, + "grad_norm": 0.1396484375, + "learning_rate": 6.795350767953508e-05, + "loss": 0.6635, + "step": 3331 + }, + { + "epoch": 2.012073649260489, + "grad_norm": 0.12890625, + "learning_rate": 6.791199667911997e-05, + "loss": 0.5807, + "step": 3332 + }, + { + "epoch": 2.0126773317235136, + "grad_norm": 0.1376953125, + "learning_rate": 6.787048567870487e-05, + "loss": 0.6374, + "step": 3333 + }, + { + "epoch": 2.013281014186538, + "grad_norm": 0.125, + "learning_rate": 6.782897467828975e-05, + "loss": 0.5274, + "step": 3334 + }, + { + "epoch": 2.0138846966495625, + "grad_norm": 0.1728515625, + "learning_rate": 6.778746367787464e-05, + "loss": 0.6502, + "step": 3335 + }, + { + "epoch": 2.0144883791125867, + "grad_norm": 0.1455078125, + "learning_rate": 6.774595267745954e-05, + "loss": 0.4959, + "step": 3336 + }, + { + "epoch": 2.0150920615756114, + "grad_norm": 0.1337890625, + "learning_rate": 6.770444167704441e-05, + "loss": 0.6071, + "step": 3337 + }, + { + "epoch": 2.0156957440386356, + "grad_norm": 0.1357421875, + "learning_rate": 6.766293067662931e-05, + "loss": 0.5929, + "step": 3338 + }, + { + "epoch": 2.0162994265016603, + "grad_norm": 0.15234375, + "learning_rate": 6.76214196762142e-05, + "loss": 0.6494, + "step": 3339 + }, + { + "epoch": 2.0169031089646845, + "grad_norm": 0.150390625, + "learning_rate": 6.757990867579909e-05, + "loss": 1.0521, + "step": 3340 + }, + { + "epoch": 2.017506791427709, + "grad_norm": 0.1337890625, + "learning_rate": 6.753839767538399e-05, + "loss": 0.6239, + "step": 3341 + }, + { + "epoch": 2.0181104738907334, + "grad_norm": 0.134765625, + "learning_rate": 6.749688667496887e-05, + "loss": 0.6032, + "step": 3342 + }, + { + "epoch": 2.018714156353758, + "grad_norm": 0.130859375, + "learning_rate": 6.745537567455376e-05, + "loss": 0.5677, + "step": 3343 + }, + { + "epoch": 2.0193178388167823, + "grad_norm": 0.1318359375, + "learning_rate": 6.741386467413865e-05, + "loss": 0.5771, + "step": 3344 + }, + { + "epoch": 2.019921521279807, + "grad_norm": 0.1357421875, + "learning_rate": 6.737235367372355e-05, + "loss": 0.5857, + "step": 3345 + }, + { + "epoch": 2.020525203742831, + "grad_norm": 0.138671875, + "learning_rate": 6.733084267330843e-05, + "loss": 0.548, + "step": 3346 + }, + { + "epoch": 2.021128886205856, + "grad_norm": 0.13671875, + "learning_rate": 6.728933167289332e-05, + "loss": 0.5641, + "step": 3347 + }, + { + "epoch": 2.02173256866888, + "grad_norm": 0.126953125, + "learning_rate": 6.724782067247822e-05, + "loss": 0.5014, + "step": 3348 + }, + { + "epoch": 2.0223362511319047, + "grad_norm": 0.1494140625, + "learning_rate": 6.720630967206309e-05, + "loss": 0.5817, + "step": 3349 + }, + { + "epoch": 2.022939933594929, + "grad_norm": 0.1416015625, + "learning_rate": 6.716479867164799e-05, + "loss": 0.5907, + "step": 3350 + }, + { + "epoch": 2.0235436160579536, + "grad_norm": 0.1484375, + "learning_rate": 6.712328767123288e-05, + "loss": 0.5136, + "step": 3351 + }, + { + "epoch": 2.024147298520978, + "grad_norm": 0.150390625, + "learning_rate": 6.708177667081778e-05, + "loss": 0.5174, + "step": 3352 + }, + { + "epoch": 2.0247509809840025, + "grad_norm": 0.1494140625, + "learning_rate": 6.704026567040266e-05, + "loss": 0.4647, + "step": 3353 + }, + { + "epoch": 2.0253546634470267, + "grad_norm": 0.1591796875, + "learning_rate": 6.699875466998755e-05, + "loss": 0.4701, + "step": 3354 + }, + { + "epoch": 2.0259583459100514, + "grad_norm": 0.1640625, + "learning_rate": 6.695724366957245e-05, + "loss": 0.4639, + "step": 3355 + }, + { + "epoch": 2.0265620283730756, + "grad_norm": 0.169921875, + "learning_rate": 6.691573266915732e-05, + "loss": 0.4349, + "step": 3356 + }, + { + "epoch": 2.0271657108361003, + "grad_norm": 0.181640625, + "learning_rate": 6.687422166874222e-05, + "loss": 0.4562, + "step": 3357 + }, + { + "epoch": 2.0277693932991245, + "grad_norm": 0.1728515625, + "learning_rate": 6.683271066832711e-05, + "loss": 0.3564, + "step": 3358 + }, + { + "epoch": 2.028373075762149, + "grad_norm": 0.201171875, + "learning_rate": 6.6791199667912e-05, + "loss": 0.3151, + "step": 3359 + }, + { + "epoch": 2.0289767582251734, + "grad_norm": 0.1943359375, + "learning_rate": 6.67496886674969e-05, + "loss": 0.2686, + "step": 3360 + }, + { + "epoch": 2.029580440688198, + "grad_norm": 0.2080078125, + "learning_rate": 6.670817766708178e-05, + "loss": 0.2815, + "step": 3361 + }, + { + "epoch": 2.0301841231512223, + "grad_norm": 0.1923828125, + "learning_rate": 6.666666666666667e-05, + "loss": 0.2146, + "step": 3362 + }, + { + "epoch": 2.030787805614247, + "grad_norm": 0.154296875, + "learning_rate": 6.662515566625156e-05, + "loss": 0.6592, + "step": 3363 + }, + { + "epoch": 2.031391488077271, + "grad_norm": 0.158203125, + "learning_rate": 6.658364466583646e-05, + "loss": 0.6675, + "step": 3364 + }, + { + "epoch": 2.031995170540296, + "grad_norm": 0.1494140625, + "learning_rate": 6.654213366542133e-05, + "loss": 0.6109, + "step": 3365 + }, + { + "epoch": 2.03259885300332, + "grad_norm": 0.14453125, + "learning_rate": 6.650062266500623e-05, + "loss": 0.5674, + "step": 3366 + }, + { + "epoch": 2.0332025354663448, + "grad_norm": 0.1416015625, + "learning_rate": 6.645911166459113e-05, + "loss": 0.5863, + "step": 3367 + }, + { + "epoch": 2.033806217929369, + "grad_norm": 0.1435546875, + "learning_rate": 6.6417600664176e-05, + "loss": 0.7262, + "step": 3368 + }, + { + "epoch": 2.0344099003923937, + "grad_norm": 0.142578125, + "learning_rate": 6.63760896637609e-05, + "loss": 0.8392, + "step": 3369 + }, + { + "epoch": 2.035013582855418, + "grad_norm": 0.1376953125, + "learning_rate": 6.633457866334579e-05, + "loss": 0.5431, + "step": 3370 + }, + { + "epoch": 2.0356172653184426, + "grad_norm": 0.1298828125, + "learning_rate": 6.629306766293068e-05, + "loss": 0.4897, + "step": 3371 + }, + { + "epoch": 2.036220947781467, + "grad_norm": 0.1484375, + "learning_rate": 6.625155666251556e-05, + "loss": 0.7426, + "step": 3372 + }, + { + "epoch": 2.0368246302444915, + "grad_norm": 0.138671875, + "learning_rate": 6.621004566210046e-05, + "loss": 0.8515, + "step": 3373 + }, + { + "epoch": 2.0374283127075157, + "grad_norm": 0.1259765625, + "learning_rate": 6.616853466168536e-05, + "loss": 0.4825, + "step": 3374 + }, + { + "epoch": 2.0380319951705403, + "grad_norm": 0.1376953125, + "learning_rate": 6.612702366127023e-05, + "loss": 0.6163, + "step": 3375 + }, + { + "epoch": 2.0386356776335646, + "grad_norm": 0.1435546875, + "learning_rate": 6.608551266085513e-05, + "loss": 0.4817, + "step": 3376 + }, + { + "epoch": 2.0392393600965892, + "grad_norm": 0.142578125, + "learning_rate": 6.604400166044002e-05, + "loss": 0.7172, + "step": 3377 + }, + { + "epoch": 2.0398430425596135, + "grad_norm": 0.140625, + "learning_rate": 6.600249066002491e-05, + "loss": 0.6068, + "step": 3378 + }, + { + "epoch": 2.040446725022638, + "grad_norm": 0.1455078125, + "learning_rate": 6.59609796596098e-05, + "loss": 0.8924, + "step": 3379 + }, + { + "epoch": 2.0410504074856624, + "grad_norm": 0.134765625, + "learning_rate": 6.59194686591947e-05, + "loss": 0.8522, + "step": 3380 + }, + { + "epoch": 2.041654089948687, + "grad_norm": 0.1298828125, + "learning_rate": 6.587795765877958e-05, + "loss": 0.7332, + "step": 3381 + }, + { + "epoch": 2.0422577724117112, + "grad_norm": 0.1416015625, + "learning_rate": 6.583644665836447e-05, + "loss": 0.5677, + "step": 3382 + }, + { + "epoch": 2.042861454874736, + "grad_norm": 0.15234375, + "learning_rate": 6.579493565794937e-05, + "loss": 0.8008, + "step": 3383 + }, + { + "epoch": 2.04346513733776, + "grad_norm": 0.1376953125, + "learning_rate": 6.575342465753424e-05, + "loss": 0.6705, + "step": 3384 + }, + { + "epoch": 2.044068819800785, + "grad_norm": 0.1376953125, + "learning_rate": 6.571191365711914e-05, + "loss": 0.5933, + "step": 3385 + }, + { + "epoch": 2.044672502263809, + "grad_norm": 0.1845703125, + "learning_rate": 6.567040265670403e-05, + "loss": 0.8533, + "step": 3386 + }, + { + "epoch": 2.0452761847268337, + "grad_norm": 0.1689453125, + "learning_rate": 6.562889165628891e-05, + "loss": 0.7038, + "step": 3387 + }, + { + "epoch": 2.045879867189858, + "grad_norm": 0.1357421875, + "learning_rate": 6.558738065587381e-05, + "loss": 0.584, + "step": 3388 + }, + { + "epoch": 2.0464835496528826, + "grad_norm": 0.14453125, + "learning_rate": 6.55458696554587e-05, + "loss": 0.9452, + "step": 3389 + }, + { + "epoch": 2.047087232115907, + "grad_norm": 0.1611328125, + "learning_rate": 6.550435865504359e-05, + "loss": 0.5957, + "step": 3390 + }, + { + "epoch": 2.0476909145789315, + "grad_norm": 0.1298828125, + "learning_rate": 6.546284765462847e-05, + "loss": 0.6208, + "step": 3391 + }, + { + "epoch": 2.0482945970419557, + "grad_norm": 0.15234375, + "learning_rate": 6.542133665421337e-05, + "loss": 0.7178, + "step": 3392 + }, + { + "epoch": 2.0488982795049804, + "grad_norm": 0.255859375, + "learning_rate": 6.537982565379826e-05, + "loss": 0.7035, + "step": 3393 + }, + { + "epoch": 2.049501961968005, + "grad_norm": 0.154296875, + "learning_rate": 6.533831465338315e-05, + "loss": 0.7374, + "step": 3394 + }, + { + "epoch": 2.0501056444310293, + "grad_norm": 0.130859375, + "learning_rate": 6.529680365296805e-05, + "loss": 0.5717, + "step": 3395 + }, + { + "epoch": 2.050709326894054, + "grad_norm": 0.1357421875, + "learning_rate": 6.525529265255293e-05, + "loss": 0.5723, + "step": 3396 + }, + { + "epoch": 2.051313009357078, + "grad_norm": 0.1591796875, + "learning_rate": 6.521378165213782e-05, + "loss": 0.6278, + "step": 3397 + }, + { + "epoch": 2.051916691820103, + "grad_norm": 0.140625, + "learning_rate": 6.51722706517227e-05, + "loss": 0.5791, + "step": 3398 + }, + { + "epoch": 2.052520374283127, + "grad_norm": 0.1416015625, + "learning_rate": 6.51307596513076e-05, + "loss": 0.5202, + "step": 3399 + }, + { + "epoch": 2.0531240567461517, + "grad_norm": 0.1455078125, + "learning_rate": 6.508924865089249e-05, + "loss": 0.4888, + "step": 3400 + }, + { + "epoch": 2.053727739209176, + "grad_norm": 0.1416015625, + "learning_rate": 6.504773765047738e-05, + "loss": 0.5308, + "step": 3401 + }, + { + "epoch": 2.0543314216722006, + "grad_norm": 0.1552734375, + "learning_rate": 6.500622665006228e-05, + "loss": 0.4807, + "step": 3402 + }, + { + "epoch": 2.054935104135225, + "grad_norm": 0.1708984375, + "learning_rate": 6.496471564964715e-05, + "loss": 0.5628, + "step": 3403 + }, + { + "epoch": 2.0555387865982495, + "grad_norm": 0.17578125, + "learning_rate": 6.492320464923205e-05, + "loss": 0.5691, + "step": 3404 + }, + { + "epoch": 2.0561424690612737, + "grad_norm": 0.15625, + "learning_rate": 6.488169364881694e-05, + "loss": 0.4539, + "step": 3405 + }, + { + "epoch": 2.0567461515242984, + "grad_norm": 0.162109375, + "learning_rate": 6.484018264840182e-05, + "loss": 0.598, + "step": 3406 + }, + { + "epoch": 2.0573498339873226, + "grad_norm": 0.1689453125, + "learning_rate": 6.479867164798672e-05, + "loss": 0.417, + "step": 3407 + }, + { + "epoch": 2.0579535164503473, + "grad_norm": 0.1845703125, + "learning_rate": 6.475716064757161e-05, + "loss": 0.3549, + "step": 3408 + }, + { + "epoch": 2.0585571989133715, + "grad_norm": 0.19921875, + "learning_rate": 6.47156496471565e-05, + "loss": 0.4513, + "step": 3409 + }, + { + "epoch": 2.059160881376396, + "grad_norm": 0.18359375, + "learning_rate": 6.467413864674138e-05, + "loss": 0.3228, + "step": 3410 + }, + { + "epoch": 2.0597645638394204, + "grad_norm": 0.2109375, + "learning_rate": 6.463262764632628e-05, + "loss": 0.3144, + "step": 3411 + }, + { + "epoch": 2.060368246302445, + "grad_norm": 0.1826171875, + "learning_rate": 6.459111664591117e-05, + "loss": 0.2016, + "step": 3412 + }, + { + "epoch": 2.0609719287654693, + "grad_norm": 0.1630859375, + "learning_rate": 6.454960564549606e-05, + "loss": 0.5913, + "step": 3413 + }, + { + "epoch": 2.061575611228494, + "grad_norm": 0.14453125, + "learning_rate": 6.450809464508096e-05, + "loss": 1.0476, + "step": 3414 + }, + { + "epoch": 2.062179293691518, + "grad_norm": 0.14453125, + "learning_rate": 6.446658364466583e-05, + "loss": 0.6198, + "step": 3415 + }, + { + "epoch": 2.062782976154543, + "grad_norm": 0.1533203125, + "learning_rate": 6.442507264425073e-05, + "loss": 0.5562, + "step": 3416 + }, + { + "epoch": 2.063386658617567, + "grad_norm": 0.150390625, + "learning_rate": 6.438356164383562e-05, + "loss": 0.646, + "step": 3417 + }, + { + "epoch": 2.0639903410805918, + "grad_norm": 0.1416015625, + "learning_rate": 6.434205064342052e-05, + "loss": 0.6686, + "step": 3418 + }, + { + "epoch": 2.064594023543616, + "grad_norm": 0.1650390625, + "learning_rate": 6.43005396430054e-05, + "loss": 0.8418, + "step": 3419 + }, + { + "epoch": 2.0651977060066407, + "grad_norm": 0.142578125, + "learning_rate": 6.425902864259029e-05, + "loss": 1.04, + "step": 3420 + }, + { + "epoch": 2.065801388469665, + "grad_norm": 0.1669921875, + "learning_rate": 6.421751764217519e-05, + "loss": 0.6007, + "step": 3421 + }, + { + "epoch": 2.0664050709326895, + "grad_norm": 0.1435546875, + "learning_rate": 6.417600664176006e-05, + "loss": 0.6781, + "step": 3422 + }, + { + "epoch": 2.0670087533957138, + "grad_norm": 0.1455078125, + "learning_rate": 6.413449564134496e-05, + "loss": 0.6264, + "step": 3423 + }, + { + "epoch": 2.0676124358587384, + "grad_norm": 0.341796875, + "learning_rate": 6.409298464092985e-05, + "loss": 0.7689, + "step": 3424 + }, + { + "epoch": 2.0682161183217627, + "grad_norm": 0.1318359375, + "learning_rate": 6.405147364051474e-05, + "loss": 0.5546, + "step": 3425 + }, + { + "epoch": 2.0688198007847873, + "grad_norm": 0.130859375, + "learning_rate": 6.400996264009964e-05, + "loss": 0.7563, + "step": 3426 + }, + { + "epoch": 2.0694234832478116, + "grad_norm": 0.13671875, + "learning_rate": 6.396845163968452e-05, + "loss": 0.6584, + "step": 3427 + }, + { + "epoch": 2.0700271657108362, + "grad_norm": 0.1455078125, + "learning_rate": 6.392694063926941e-05, + "loss": 0.6751, + "step": 3428 + }, + { + "epoch": 2.0706308481738604, + "grad_norm": 0.130859375, + "learning_rate": 6.38854296388543e-05, + "loss": 0.607, + "step": 3429 + }, + { + "epoch": 2.071234530636885, + "grad_norm": 0.1689453125, + "learning_rate": 6.38439186384392e-05, + "loss": 0.6411, + "step": 3430 + }, + { + "epoch": 2.0718382130999093, + "grad_norm": 0.1240234375, + "learning_rate": 6.380240763802408e-05, + "loss": 0.6765, + "step": 3431 + }, + { + "epoch": 2.072441895562934, + "grad_norm": 0.14453125, + "learning_rate": 6.376089663760897e-05, + "loss": 1.0625, + "step": 3432 + }, + { + "epoch": 2.0730455780259582, + "grad_norm": 0.13671875, + "learning_rate": 6.371938563719387e-05, + "loss": 0.8033, + "step": 3433 + }, + { + "epoch": 2.073649260488983, + "grad_norm": 0.134765625, + "learning_rate": 6.367787463677874e-05, + "loss": 0.5758, + "step": 3434 + }, + { + "epoch": 2.074252942952007, + "grad_norm": 0.154296875, + "learning_rate": 6.363636363636364e-05, + "loss": 0.8254, + "step": 3435 + }, + { + "epoch": 2.074856625415032, + "grad_norm": 0.1357421875, + "learning_rate": 6.359485263594853e-05, + "loss": 0.5519, + "step": 3436 + }, + { + "epoch": 2.075460307878056, + "grad_norm": 0.1435546875, + "learning_rate": 6.355334163553341e-05, + "loss": 0.5636, + "step": 3437 + }, + { + "epoch": 2.0760639903410807, + "grad_norm": 0.1640625, + "learning_rate": 6.351183063511831e-05, + "loss": 0.7238, + "step": 3438 + }, + { + "epoch": 2.076667672804105, + "grad_norm": 0.1279296875, + "learning_rate": 6.34703196347032e-05, + "loss": 0.6205, + "step": 3439 + }, + { + "epoch": 2.0772713552671296, + "grad_norm": 0.138671875, + "learning_rate": 6.342880863428809e-05, + "loss": 0.5848, + "step": 3440 + }, + { + "epoch": 2.077875037730154, + "grad_norm": 0.140625, + "learning_rate": 6.338729763387297e-05, + "loss": 0.7892, + "step": 3441 + }, + { + "epoch": 2.0784787201931785, + "grad_norm": 0.1494140625, + "learning_rate": 6.334578663345787e-05, + "loss": 0.6707, + "step": 3442 + }, + { + "epoch": 2.0790824026562027, + "grad_norm": 0.1357421875, + "learning_rate": 6.330427563304276e-05, + "loss": 0.7932, + "step": 3443 + }, + { + "epoch": 2.0796860851192274, + "grad_norm": 0.1474609375, + "learning_rate": 6.326276463262765e-05, + "loss": 0.662, + "step": 3444 + }, + { + "epoch": 2.0802897675822516, + "grad_norm": 0.1572265625, + "learning_rate": 6.322125363221255e-05, + "loss": 0.7459, + "step": 3445 + }, + { + "epoch": 2.0808934500452763, + "grad_norm": 0.1455078125, + "learning_rate": 6.317974263179743e-05, + "loss": 0.5336, + "step": 3446 + }, + { + "epoch": 2.0814971325083005, + "grad_norm": 0.146484375, + "learning_rate": 6.313823163138232e-05, + "loss": 0.619, + "step": 3447 + }, + { + "epoch": 2.082100814971325, + "grad_norm": 0.146484375, + "learning_rate": 6.30967206309672e-05, + "loss": 0.5999, + "step": 3448 + }, + { + "epoch": 2.0827044974343494, + "grad_norm": 0.1337890625, + "learning_rate": 6.30552096305521e-05, + "loss": 0.5155, + "step": 3449 + }, + { + "epoch": 2.083308179897374, + "grad_norm": 0.162109375, + "learning_rate": 6.301369863013699e-05, + "loss": 0.5144, + "step": 3450 + }, + { + "epoch": 2.0839118623603983, + "grad_norm": 0.1513671875, + "learning_rate": 6.297218762972188e-05, + "loss": 0.5631, + "step": 3451 + }, + { + "epoch": 2.084515544823423, + "grad_norm": 0.15234375, + "learning_rate": 6.293067662930678e-05, + "loss": 0.6323, + "step": 3452 + }, + { + "epoch": 2.085119227286447, + "grad_norm": 0.162109375, + "learning_rate": 6.288916562889165e-05, + "loss": 0.447, + "step": 3453 + }, + { + "epoch": 2.085722909749472, + "grad_norm": 0.16015625, + "learning_rate": 6.284765462847655e-05, + "loss": 0.5228, + "step": 3454 + }, + { + "epoch": 2.086326592212496, + "grad_norm": 0.166015625, + "learning_rate": 6.280614362806144e-05, + "loss": 0.4278, + "step": 3455 + }, + { + "epoch": 2.0869302746755207, + "grad_norm": 0.177734375, + "learning_rate": 6.276463262764632e-05, + "loss": 0.4298, + "step": 3456 + }, + { + "epoch": 2.087533957138545, + "grad_norm": 0.18359375, + "learning_rate": 6.272312162723122e-05, + "loss": 0.4683, + "step": 3457 + }, + { + "epoch": 2.0881376396015696, + "grad_norm": 0.18359375, + "learning_rate": 6.268161062681611e-05, + "loss": 0.4114, + "step": 3458 + }, + { + "epoch": 2.088741322064594, + "grad_norm": 0.2021484375, + "learning_rate": 6.2640099626401e-05, + "loss": 0.4019, + "step": 3459 + }, + { + "epoch": 2.0893450045276185, + "grad_norm": 0.203125, + "learning_rate": 6.259858862598588e-05, + "loss": 0.3359, + "step": 3460 + }, + { + "epoch": 2.0899486869906427, + "grad_norm": 0.205078125, + "learning_rate": 6.255707762557078e-05, + "loss": 0.2717, + "step": 3461 + }, + { + "epoch": 2.0905523694536674, + "grad_norm": 0.1962890625, + "learning_rate": 6.251556662515566e-05, + "loss": 0.1975, + "step": 3462 + }, + { + "epoch": 2.0911560519166916, + "grad_norm": 0.142578125, + "learning_rate": 6.247405562474056e-05, + "loss": 0.9222, + "step": 3463 + }, + { + "epoch": 2.0917597343797163, + "grad_norm": 0.142578125, + "learning_rate": 6.243254462432546e-05, + "loss": 0.5585, + "step": 3464 + }, + { + "epoch": 2.0923634168427405, + "grad_norm": 0.1630859375, + "learning_rate": 6.239103362391034e-05, + "loss": 0.653, + "step": 3465 + }, + { + "epoch": 2.092967099305765, + "grad_norm": 0.1474609375, + "learning_rate": 6.234952262349523e-05, + "loss": 0.6304, + "step": 3466 + }, + { + "epoch": 2.0935707817687894, + "grad_norm": 0.154296875, + "learning_rate": 6.230801162308012e-05, + "loss": 0.6062, + "step": 3467 + }, + { + "epoch": 2.094174464231814, + "grad_norm": 0.15625, + "learning_rate": 6.226650062266502e-05, + "loss": 0.5847, + "step": 3468 + }, + { + "epoch": 2.0947781466948383, + "grad_norm": 0.1455078125, + "learning_rate": 6.222498962224989e-05, + "loss": 0.6474, + "step": 3469 + }, + { + "epoch": 2.095381829157863, + "grad_norm": 0.1337890625, + "learning_rate": 6.218347862183479e-05, + "loss": 0.6167, + "step": 3470 + }, + { + "epoch": 2.095985511620887, + "grad_norm": 0.1630859375, + "learning_rate": 6.214196762141969e-05, + "loss": 0.6131, + "step": 3471 + }, + { + "epoch": 2.096589194083912, + "grad_norm": 0.1728515625, + "learning_rate": 6.210045662100456e-05, + "loss": 0.7248, + "step": 3472 + }, + { + "epoch": 2.097192876546936, + "grad_norm": 0.177734375, + "learning_rate": 6.205894562058946e-05, + "loss": 0.5809, + "step": 3473 + }, + { + "epoch": 2.0977965590099608, + "grad_norm": 0.150390625, + "learning_rate": 6.201743462017435e-05, + "loss": 0.6752, + "step": 3474 + }, + { + "epoch": 2.0984002414729854, + "grad_norm": 0.1513671875, + "learning_rate": 6.197592361975924e-05, + "loss": 0.6793, + "step": 3475 + }, + { + "epoch": 2.0990039239360097, + "grad_norm": 0.1455078125, + "learning_rate": 6.193441261934412e-05, + "loss": 0.6169, + "step": 3476 + }, + { + "epoch": 2.0996076063990343, + "grad_norm": 0.1484375, + "learning_rate": 6.189290161892902e-05, + "loss": 0.5645, + "step": 3477 + }, + { + "epoch": 2.1002112888620585, + "grad_norm": 0.1376953125, + "learning_rate": 6.185139061851391e-05, + "loss": 0.5911, + "step": 3478 + }, + { + "epoch": 2.100814971325083, + "grad_norm": 0.146484375, + "learning_rate": 6.18098796180988e-05, + "loss": 0.5954, + "step": 3479 + }, + { + "epoch": 2.1014186537881074, + "grad_norm": 0.134765625, + "learning_rate": 6.17683686176837e-05, + "loss": 0.5374, + "step": 3480 + }, + { + "epoch": 2.102022336251132, + "grad_norm": 0.14453125, + "learning_rate": 6.172685761726857e-05, + "loss": 0.5509, + "step": 3481 + }, + { + "epoch": 2.1026260187141563, + "grad_norm": 0.1376953125, + "learning_rate": 6.168534661685347e-05, + "loss": 0.6542, + "step": 3482 + }, + { + "epoch": 2.103229701177181, + "grad_norm": 0.1494140625, + "learning_rate": 6.164383561643835e-05, + "loss": 0.7288, + "step": 3483 + }, + { + "epoch": 2.1038333836402052, + "grad_norm": 0.1376953125, + "learning_rate": 6.160232461602324e-05, + "loss": 0.6775, + "step": 3484 + }, + { + "epoch": 2.10443706610323, + "grad_norm": 0.1396484375, + "learning_rate": 6.156081361560814e-05, + "loss": 0.6655, + "step": 3485 + }, + { + "epoch": 2.105040748566254, + "grad_norm": 0.142578125, + "learning_rate": 6.151930261519303e-05, + "loss": 0.5989, + "step": 3486 + }, + { + "epoch": 2.105644431029279, + "grad_norm": 0.1416015625, + "learning_rate": 6.147779161477793e-05, + "loss": 0.6795, + "step": 3487 + }, + { + "epoch": 2.106248113492303, + "grad_norm": 0.1396484375, + "learning_rate": 6.14362806143628e-05, + "loss": 0.6076, + "step": 3488 + }, + { + "epoch": 2.1068517959553277, + "grad_norm": 0.1337890625, + "learning_rate": 6.13947696139477e-05, + "loss": 0.5628, + "step": 3489 + }, + { + "epoch": 2.107455478418352, + "grad_norm": 0.1455078125, + "learning_rate": 6.13532586135326e-05, + "loss": 0.6294, + "step": 3490 + }, + { + "epoch": 2.1080591608813766, + "grad_norm": 0.1435546875, + "learning_rate": 6.131174761311747e-05, + "loss": 0.6435, + "step": 3491 + }, + { + "epoch": 2.108662843344401, + "grad_norm": 0.13671875, + "learning_rate": 6.127023661270237e-05, + "loss": 0.6564, + "step": 3492 + }, + { + "epoch": 2.1092665258074255, + "grad_norm": 0.1435546875, + "learning_rate": 6.122872561228726e-05, + "loss": 0.6102, + "step": 3493 + }, + { + "epoch": 2.1098702082704497, + "grad_norm": 0.1328125, + "learning_rate": 6.118721461187215e-05, + "loss": 0.5999, + "step": 3494 + }, + { + "epoch": 2.1104738907334744, + "grad_norm": 0.1416015625, + "learning_rate": 6.114570361145703e-05, + "loss": 0.6853, + "step": 3495 + }, + { + "epoch": 2.1110775731964986, + "grad_norm": 0.1357421875, + "learning_rate": 6.110419261104193e-05, + "loss": 0.5913, + "step": 3496 + }, + { + "epoch": 2.1116812556595232, + "grad_norm": 0.1328125, + "learning_rate": 6.106268161062682e-05, + "loss": 0.5238, + "step": 3497 + }, + { + "epoch": 2.1122849381225475, + "grad_norm": 0.1494140625, + "learning_rate": 6.1021170610211706e-05, + "loss": 0.5639, + "step": 3498 + }, + { + "epoch": 2.112888620585572, + "grad_norm": 0.1455078125, + "learning_rate": 6.09796596097966e-05, + "loss": 0.5235, + "step": 3499 + }, + { + "epoch": 2.1134923030485964, + "grad_norm": 0.1484375, + "learning_rate": 6.0938148609381486e-05, + "loss": 0.5606, + "step": 3500 + }, + { + "epoch": 2.1134923030485964, + "eval_loss": 0.617886483669281, + "eval_runtime": 1059.7393, + "eval_samples_per_second": 2.633, + "eval_steps_per_second": 0.329, + "step": 3500 + }, + { + "epoch": 2.114095985511621, + "grad_norm": 0.1513671875, + "learning_rate": 6.089663760896638e-05, + "loss": 0.5319, + "step": 3501 + }, + { + "epoch": 2.1146996679746453, + "grad_norm": 0.1474609375, + "learning_rate": 6.085512660855127e-05, + "loss": 0.4808, + "step": 3502 + }, + { + "epoch": 2.11530335043767, + "grad_norm": 0.1630859375, + "learning_rate": 6.081361560813615e-05, + "loss": 0.5266, + "step": 3503 + }, + { + "epoch": 2.115907032900694, + "grad_norm": 0.1806640625, + "learning_rate": 6.0772104607721045e-05, + "loss": 0.5523, + "step": 3504 + }, + { + "epoch": 2.116510715363719, + "grad_norm": 0.1708984375, + "learning_rate": 6.073059360730594e-05, + "loss": 0.4785, + "step": 3505 + }, + { + "epoch": 2.117114397826743, + "grad_norm": 0.1669921875, + "learning_rate": 6.0689082606890825e-05, + "loss": 0.4426, + "step": 3506 + }, + { + "epoch": 2.1177180802897677, + "grad_norm": 0.177734375, + "learning_rate": 6.064757160647572e-05, + "loss": 0.4489, + "step": 3507 + }, + { + "epoch": 2.118321762752792, + "grad_norm": 0.19921875, + "learning_rate": 6.060606060606061e-05, + "loss": 0.4412, + "step": 3508 + }, + { + "epoch": 2.1189254452158166, + "grad_norm": 0.1845703125, + "learning_rate": 6.0564549605645505e-05, + "loss": 0.3421, + "step": 3509 + }, + { + "epoch": 2.119529127678841, + "grad_norm": 0.1982421875, + "learning_rate": 6.0523038605230384e-05, + "loss": 0.3627, + "step": 3510 + }, + { + "epoch": 2.1201328101418655, + "grad_norm": 0.1845703125, + "learning_rate": 6.048152760481528e-05, + "loss": 0.265, + "step": 3511 + }, + { + "epoch": 2.1207364926048897, + "grad_norm": 0.1982421875, + "learning_rate": 6.044001660440017e-05, + "loss": 0.2138, + "step": 3512 + }, + { + "epoch": 2.1213401750679144, + "grad_norm": 0.138671875, + "learning_rate": 6.039850560398506e-05, + "loss": 0.574, + "step": 3513 + }, + { + "epoch": 2.1219438575309386, + "grad_norm": 0.158203125, + "learning_rate": 6.035699460356995e-05, + "loss": 0.754, + "step": 3514 + }, + { + "epoch": 2.1225475399939633, + "grad_norm": 0.1396484375, + "learning_rate": 6.0315483603154844e-05, + "loss": 0.6438, + "step": 3515 + }, + { + "epoch": 2.1231512224569875, + "grad_norm": 0.1376953125, + "learning_rate": 6.0273972602739724e-05, + "loss": 0.6454, + "step": 3516 + }, + { + "epoch": 2.123754904920012, + "grad_norm": 0.1357421875, + "learning_rate": 6.023246160232462e-05, + "loss": 0.5961, + "step": 3517 + }, + { + "epoch": 2.1243585873830364, + "grad_norm": 0.1572265625, + "learning_rate": 6.019095060190951e-05, + "loss": 0.6337, + "step": 3518 + }, + { + "epoch": 2.124962269846061, + "grad_norm": 0.1357421875, + "learning_rate": 6.01494396014944e-05, + "loss": 0.58, + "step": 3519 + }, + { + "epoch": 2.1255659523090853, + "grad_norm": 0.173828125, + "learning_rate": 6.010792860107929e-05, + "loss": 0.8145, + "step": 3520 + }, + { + "epoch": 2.12616963477211, + "grad_norm": 0.12890625, + "learning_rate": 6.006641760066418e-05, + "loss": 0.5888, + "step": 3521 + }, + { + "epoch": 2.126773317235134, + "grad_norm": 0.1494140625, + "learning_rate": 6.002490660024906e-05, + "loss": 0.5517, + "step": 3522 + }, + { + "epoch": 2.127376999698159, + "grad_norm": 0.171875, + "learning_rate": 5.9983395599833956e-05, + "loss": 0.7016, + "step": 3523 + }, + { + "epoch": 2.127980682161183, + "grad_norm": 0.154296875, + "learning_rate": 5.994188459941885e-05, + "loss": 0.6212, + "step": 3524 + }, + { + "epoch": 2.1285843646242077, + "grad_norm": 0.140625, + "learning_rate": 5.9900373599003736e-05, + "loss": 0.8909, + "step": 3525 + }, + { + "epoch": 2.129188047087232, + "grad_norm": 0.150390625, + "learning_rate": 5.985886259858863e-05, + "loss": 0.8154, + "step": 3526 + }, + { + "epoch": 2.1297917295502566, + "grad_norm": 0.1533203125, + "learning_rate": 5.981735159817352e-05, + "loss": 0.6854, + "step": 3527 + }, + { + "epoch": 2.130395412013281, + "grad_norm": 0.1416015625, + "learning_rate": 5.97758405977584e-05, + "loss": 0.5829, + "step": 3528 + }, + { + "epoch": 2.1309990944763055, + "grad_norm": 0.134765625, + "learning_rate": 5.9734329597343295e-05, + "loss": 0.7917, + "step": 3529 + }, + { + "epoch": 2.1316027769393298, + "grad_norm": 0.138671875, + "learning_rate": 5.969281859692819e-05, + "loss": 0.6034, + "step": 3530 + }, + { + "epoch": 2.1322064594023544, + "grad_norm": 0.140625, + "learning_rate": 5.965130759651308e-05, + "loss": 0.6514, + "step": 3531 + }, + { + "epoch": 2.1328101418653787, + "grad_norm": 0.1298828125, + "learning_rate": 5.960979659609797e-05, + "loss": 0.5277, + "step": 3532 + }, + { + "epoch": 2.1334138243284033, + "grad_norm": 0.1552734375, + "learning_rate": 5.956828559568286e-05, + "loss": 0.6354, + "step": 3533 + }, + { + "epoch": 2.1340175067914275, + "grad_norm": 0.140625, + "learning_rate": 5.9526774595267755e-05, + "loss": 0.7139, + "step": 3534 + }, + { + "epoch": 2.134621189254452, + "grad_norm": 0.1484375, + "learning_rate": 5.9485263594852635e-05, + "loss": 0.6376, + "step": 3535 + }, + { + "epoch": 2.1352248717174764, + "grad_norm": 0.1640625, + "learning_rate": 5.944375259443753e-05, + "loss": 0.6315, + "step": 3536 + }, + { + "epoch": 2.135828554180501, + "grad_norm": 0.1318359375, + "learning_rate": 5.940224159402242e-05, + "loss": 0.5425, + "step": 3537 + }, + { + "epoch": 2.1364322366435253, + "grad_norm": 0.1337890625, + "learning_rate": 5.936073059360731e-05, + "loss": 0.6503, + "step": 3538 + }, + { + "epoch": 2.13703591910655, + "grad_norm": 0.1494140625, + "learning_rate": 5.93192195931922e-05, + "loss": 0.6238, + "step": 3539 + }, + { + "epoch": 2.137639601569574, + "grad_norm": 0.185546875, + "learning_rate": 5.9277708592777094e-05, + "loss": 0.6117, + "step": 3540 + }, + { + "epoch": 2.138243284032599, + "grad_norm": 0.1337890625, + "learning_rate": 5.9236197592361974e-05, + "loss": 0.7165, + "step": 3541 + }, + { + "epoch": 2.138846966495623, + "grad_norm": 0.1513671875, + "learning_rate": 5.919468659194687e-05, + "loss": 0.7887, + "step": 3542 + }, + { + "epoch": 2.139450648958648, + "grad_norm": 0.15625, + "learning_rate": 5.915317559153176e-05, + "loss": 0.7581, + "step": 3543 + }, + { + "epoch": 2.140054331421672, + "grad_norm": 0.142578125, + "learning_rate": 5.911166459111665e-05, + "loss": 0.6213, + "step": 3544 + }, + { + "epoch": 2.1406580138846967, + "grad_norm": 0.142578125, + "learning_rate": 5.907015359070154e-05, + "loss": 0.7873, + "step": 3545 + }, + { + "epoch": 2.141261696347721, + "grad_norm": 0.142578125, + "learning_rate": 5.902864259028643e-05, + "loss": 0.6151, + "step": 3546 + }, + { + "epoch": 2.1418653788107456, + "grad_norm": 0.1357421875, + "learning_rate": 5.898713158987131e-05, + "loss": 0.5618, + "step": 3547 + }, + { + "epoch": 2.14246906127377, + "grad_norm": 0.1416015625, + "learning_rate": 5.8945620589456206e-05, + "loss": 0.5792, + "step": 3548 + }, + { + "epoch": 2.1430727437367945, + "grad_norm": 0.1572265625, + "learning_rate": 5.89041095890411e-05, + "loss": 0.5894, + "step": 3549 + }, + { + "epoch": 2.1436764261998187, + "grad_norm": 0.15234375, + "learning_rate": 5.8862598588625986e-05, + "loss": 0.5326, + "step": 3550 + }, + { + "epoch": 2.1442801086628434, + "grad_norm": 0.1513671875, + "learning_rate": 5.882108758821088e-05, + "loss": 0.5008, + "step": 3551 + }, + { + "epoch": 2.144883791125868, + "grad_norm": 0.1435546875, + "learning_rate": 5.877957658779577e-05, + "loss": 0.4708, + "step": 3552 + }, + { + "epoch": 2.1454874735888922, + "grad_norm": 0.158203125, + "learning_rate": 5.8738065587380666e-05, + "loss": 0.4923, + "step": 3553 + }, + { + "epoch": 2.1460911560519165, + "grad_norm": 0.1650390625, + "learning_rate": 5.8696554586965546e-05, + "loss": 0.4978, + "step": 3554 + }, + { + "epoch": 2.146694838514941, + "grad_norm": 0.1708984375, + "learning_rate": 5.865504358655044e-05, + "loss": 0.4812, + "step": 3555 + }, + { + "epoch": 2.147298520977966, + "grad_norm": 0.1650390625, + "learning_rate": 5.861353258613533e-05, + "loss": 0.4216, + "step": 3556 + }, + { + "epoch": 2.14790220344099, + "grad_norm": 0.177734375, + "learning_rate": 5.857202158572022e-05, + "loss": 0.4385, + "step": 3557 + }, + { + "epoch": 2.1485058859040143, + "grad_norm": 0.177734375, + "learning_rate": 5.853051058530511e-05, + "loss": 0.3954, + "step": 3558 + }, + { + "epoch": 2.149109568367039, + "grad_norm": 0.189453125, + "learning_rate": 5.8488999584890005e-05, + "loss": 0.3699, + "step": 3559 + }, + { + "epoch": 2.1497132508300636, + "grad_norm": 0.2001953125, + "learning_rate": 5.8447488584474885e-05, + "loss": 0.332, + "step": 3560 + }, + { + "epoch": 2.150316933293088, + "grad_norm": 0.208984375, + "learning_rate": 5.840597758405978e-05, + "loss": 0.2789, + "step": 3561 + }, + { + "epoch": 2.1509206157561125, + "grad_norm": 0.1904296875, + "learning_rate": 5.836446658364467e-05, + "loss": 0.2135, + "step": 3562 + }, + { + "epoch": 2.1515242982191367, + "grad_norm": 0.15234375, + "learning_rate": 5.832295558322956e-05, + "loss": 0.6707, + "step": 3563 + }, + { + "epoch": 2.1521279806821614, + "grad_norm": 0.138671875, + "learning_rate": 5.828144458281445e-05, + "loss": 0.5971, + "step": 3564 + }, + { + "epoch": 2.1527316631451856, + "grad_norm": 0.1416015625, + "learning_rate": 5.8239933582399344e-05, + "loss": 0.7411, + "step": 3565 + }, + { + "epoch": 2.1533353456082103, + "grad_norm": 0.150390625, + "learning_rate": 5.8198422581984224e-05, + "loss": 0.7833, + "step": 3566 + }, + { + "epoch": 2.1539390280712345, + "grad_norm": 0.1474609375, + "learning_rate": 5.815691158156912e-05, + "loss": 0.519, + "step": 3567 + }, + { + "epoch": 2.154542710534259, + "grad_norm": 0.138671875, + "learning_rate": 5.811540058115401e-05, + "loss": 0.5645, + "step": 3568 + }, + { + "epoch": 2.1551463929972834, + "grad_norm": 0.1748046875, + "learning_rate": 5.80738895807389e-05, + "loss": 0.5951, + "step": 3569 + }, + { + "epoch": 2.155750075460308, + "grad_norm": 0.150390625, + "learning_rate": 5.803237858032379e-05, + "loss": 0.7003, + "step": 3570 + }, + { + "epoch": 2.1563537579233323, + "grad_norm": 0.1689453125, + "learning_rate": 5.7990867579908683e-05, + "loss": 0.7031, + "step": 3571 + }, + { + "epoch": 2.156957440386357, + "grad_norm": 0.158203125, + "learning_rate": 5.794935657949356e-05, + "loss": 0.8678, + "step": 3572 + }, + { + "epoch": 2.157561122849381, + "grad_norm": 0.16015625, + "learning_rate": 5.7907845579078456e-05, + "loss": 0.6421, + "step": 3573 + }, + { + "epoch": 2.158164805312406, + "grad_norm": 0.15234375, + "learning_rate": 5.786633457866335e-05, + "loss": 0.631, + "step": 3574 + }, + { + "epoch": 2.15876848777543, + "grad_norm": 0.15234375, + "learning_rate": 5.782482357824824e-05, + "loss": 0.5944, + "step": 3575 + }, + { + "epoch": 2.1593721702384547, + "grad_norm": 0.1484375, + "learning_rate": 5.778331257783313e-05, + "loss": 0.6048, + "step": 3576 + }, + { + "epoch": 2.159975852701479, + "grad_norm": 0.146484375, + "learning_rate": 5.774180157741802e-05, + "loss": 0.6322, + "step": 3577 + }, + { + "epoch": 2.1605795351645036, + "grad_norm": 0.134765625, + "learning_rate": 5.7700290577002916e-05, + "loss": 0.8564, + "step": 3578 + }, + { + "epoch": 2.161183217627528, + "grad_norm": 0.1533203125, + "learning_rate": 5.7658779576587796e-05, + "loss": 0.6573, + "step": 3579 + }, + { + "epoch": 2.1617869000905525, + "grad_norm": 0.1455078125, + "learning_rate": 5.761726857617269e-05, + "loss": 0.6145, + "step": 3580 + }, + { + "epoch": 2.1623905825535767, + "grad_norm": 0.15234375, + "learning_rate": 5.757575757575758e-05, + "loss": 0.6561, + "step": 3581 + }, + { + "epoch": 2.1629942650166014, + "grad_norm": 0.150390625, + "learning_rate": 5.753424657534247e-05, + "loss": 0.5926, + "step": 3582 + }, + { + "epoch": 2.1635979474796256, + "grad_norm": 0.15234375, + "learning_rate": 5.749273557492736e-05, + "loss": 0.5751, + "step": 3583 + }, + { + "epoch": 2.1642016299426503, + "grad_norm": 0.1416015625, + "learning_rate": 5.7451224574512255e-05, + "loss": 0.6293, + "step": 3584 + }, + { + "epoch": 2.1648053124056745, + "grad_norm": 0.13671875, + "learning_rate": 5.7409713574097135e-05, + "loss": 0.6692, + "step": 3585 + }, + { + "epoch": 2.165408994868699, + "grad_norm": 0.12890625, + "learning_rate": 5.736820257368203e-05, + "loss": 0.7354, + "step": 3586 + }, + { + "epoch": 2.1660126773317234, + "grad_norm": 0.1513671875, + "learning_rate": 5.732669157326692e-05, + "loss": 0.6266, + "step": 3587 + }, + { + "epoch": 2.166616359794748, + "grad_norm": 0.14453125, + "learning_rate": 5.72851805728518e-05, + "loss": 0.6857, + "step": 3588 + }, + { + "epoch": 2.1672200422577723, + "grad_norm": 0.1435546875, + "learning_rate": 5.72436695724367e-05, + "loss": 0.7397, + "step": 3589 + }, + { + "epoch": 2.167823724720797, + "grad_norm": 0.1318359375, + "learning_rate": 5.7202158572021594e-05, + "loss": 0.5876, + "step": 3590 + }, + { + "epoch": 2.168427407183821, + "grad_norm": 0.1357421875, + "learning_rate": 5.7160647571606474e-05, + "loss": 0.5402, + "step": 3591 + }, + { + "epoch": 2.169031089646846, + "grad_norm": 0.142578125, + "learning_rate": 5.711913657119137e-05, + "loss": 0.6663, + "step": 3592 + }, + { + "epoch": 2.16963477210987, + "grad_norm": 0.166015625, + "learning_rate": 5.707762557077626e-05, + "loss": 0.8411, + "step": 3593 + }, + { + "epoch": 2.1702384545728948, + "grad_norm": 0.14453125, + "learning_rate": 5.703611457036114e-05, + "loss": 0.5573, + "step": 3594 + }, + { + "epoch": 2.170842137035919, + "grad_norm": 0.150390625, + "learning_rate": 5.6994603569946034e-05, + "loss": 0.5112, + "step": 3595 + }, + { + "epoch": 2.1714458194989437, + "grad_norm": 0.1318359375, + "learning_rate": 5.6953092569530934e-05, + "loss": 0.5262, + "step": 3596 + }, + { + "epoch": 2.172049501961968, + "grad_norm": 0.140625, + "learning_rate": 5.691158156911583e-05, + "loss": 0.5988, + "step": 3597 + }, + { + "epoch": 2.1726531844249926, + "grad_norm": 0.1435546875, + "learning_rate": 5.687007056870071e-05, + "loss": 0.5237, + "step": 3598 + }, + { + "epoch": 2.173256866888017, + "grad_norm": 0.1494140625, + "learning_rate": 5.68285595682856e-05, + "loss": 0.7144, + "step": 3599 + }, + { + "epoch": 2.1738605493510414, + "grad_norm": 0.158203125, + "learning_rate": 5.678704856787049e-05, + "loss": 0.5637, + "step": 3600 + }, + { + "epoch": 2.1744642318140657, + "grad_norm": 0.15625, + "learning_rate": 5.674553756745537e-05, + "loss": 0.5446, + "step": 3601 + }, + { + "epoch": 2.1750679142770903, + "grad_norm": 0.162109375, + "learning_rate": 5.6704026567040266e-05, + "loss": 0.5356, + "step": 3602 + }, + { + "epoch": 2.1756715967401146, + "grad_norm": 0.1650390625, + "learning_rate": 5.6662515566625166e-05, + "loss": 0.4565, + "step": 3603 + }, + { + "epoch": 2.1762752792031392, + "grad_norm": 0.1708984375, + "learning_rate": 5.6621004566210046e-05, + "loss": 0.4962, + "step": 3604 + }, + { + "epoch": 2.1768789616661635, + "grad_norm": 0.1650390625, + "learning_rate": 5.657949356579494e-05, + "loss": 0.4551, + "step": 3605 + }, + { + "epoch": 2.177482644129188, + "grad_norm": 0.189453125, + "learning_rate": 5.653798256537983e-05, + "loss": 0.5525, + "step": 3606 + }, + { + "epoch": 2.1780863265922124, + "grad_norm": 0.185546875, + "learning_rate": 5.649647156496471e-05, + "loss": 0.4038, + "step": 3607 + }, + { + "epoch": 2.178690009055237, + "grad_norm": 0.1845703125, + "learning_rate": 5.6454960564549605e-05, + "loss": 0.3714, + "step": 3608 + }, + { + "epoch": 2.1792936915182612, + "grad_norm": 0.1875, + "learning_rate": 5.64134495641345e-05, + "loss": 0.3403, + "step": 3609 + }, + { + "epoch": 2.179897373981286, + "grad_norm": 0.220703125, + "learning_rate": 5.6371938563719385e-05, + "loss": 0.3037, + "step": 3610 + }, + { + "epoch": 2.18050105644431, + "grad_norm": 0.197265625, + "learning_rate": 5.633042756330428e-05, + "loss": 0.23, + "step": 3611 + }, + { + "epoch": 2.181104738907335, + "grad_norm": 0.197265625, + "learning_rate": 5.628891656288917e-05, + "loss": 0.2189, + "step": 3612 + }, + { + "epoch": 2.181708421370359, + "grad_norm": 0.1494140625, + "learning_rate": 5.624740556247405e-05, + "loss": 0.85, + "step": 3613 + }, + { + "epoch": 2.1823121038333837, + "grad_norm": 0.150390625, + "learning_rate": 5.6205894562058945e-05, + "loss": 0.6516, + "step": 3614 + }, + { + "epoch": 2.182915786296408, + "grad_norm": 0.142578125, + "learning_rate": 5.616438356164384e-05, + "loss": 0.6577, + "step": 3615 + }, + { + "epoch": 2.1835194687594326, + "grad_norm": 0.1416015625, + "learning_rate": 5.6122872561228724e-05, + "loss": 0.5393, + "step": 3616 + }, + { + "epoch": 2.184123151222457, + "grad_norm": 0.134765625, + "learning_rate": 5.608136156081362e-05, + "loss": 0.5632, + "step": 3617 + }, + { + "epoch": 2.1847268336854815, + "grad_norm": 0.171875, + "learning_rate": 5.603985056039851e-05, + "loss": 0.6989, + "step": 3618 + }, + { + "epoch": 2.1853305161485057, + "grad_norm": 0.1630859375, + "learning_rate": 5.5998339559983404e-05, + "loss": 0.5195, + "step": 3619 + }, + { + "epoch": 2.1859341986115304, + "grad_norm": 0.1435546875, + "learning_rate": 5.5956828559568284e-05, + "loss": 0.6129, + "step": 3620 + }, + { + "epoch": 2.1865378810745546, + "grad_norm": 0.1611328125, + "learning_rate": 5.591531755915318e-05, + "loss": 0.6626, + "step": 3621 + }, + { + "epoch": 2.1871415635375793, + "grad_norm": 0.14453125, + "learning_rate": 5.587380655873807e-05, + "loss": 0.8414, + "step": 3622 + }, + { + "epoch": 2.1877452460006035, + "grad_norm": 0.140625, + "learning_rate": 5.583229555832296e-05, + "loss": 0.5994, + "step": 3623 + }, + { + "epoch": 2.188348928463628, + "grad_norm": 0.1455078125, + "learning_rate": 5.579078455790785e-05, + "loss": 0.5107, + "step": 3624 + }, + { + "epoch": 2.1889526109266524, + "grad_norm": 0.1669921875, + "learning_rate": 5.574927355749274e-05, + "loss": 0.612, + "step": 3625 + }, + { + "epoch": 2.189556293389677, + "grad_norm": 0.1474609375, + "learning_rate": 5.570776255707762e-05, + "loss": 0.6054, + "step": 3626 + }, + { + "epoch": 2.1901599758527013, + "grad_norm": 0.154296875, + "learning_rate": 5.5666251556662516e-05, + "loss": 0.6852, + "step": 3627 + }, + { + "epoch": 2.190763658315726, + "grad_norm": 0.1396484375, + "learning_rate": 5.562474055624741e-05, + "loss": 0.6717, + "step": 3628 + }, + { + "epoch": 2.19136734077875, + "grad_norm": 0.125, + "learning_rate": 5.5583229555832296e-05, + "loss": 0.5286, + "step": 3629 + }, + { + "epoch": 2.191971023241775, + "grad_norm": 0.1279296875, + "learning_rate": 5.554171855541719e-05, + "loss": 0.627, + "step": 3630 + }, + { + "epoch": 2.192574705704799, + "grad_norm": 0.1357421875, + "learning_rate": 5.550020755500208e-05, + "loss": 0.5631, + "step": 3631 + }, + { + "epoch": 2.1931783881678237, + "grad_norm": 0.13671875, + "learning_rate": 5.545869655458696e-05, + "loss": 0.644, + "step": 3632 + }, + { + "epoch": 2.1937820706308484, + "grad_norm": 0.142578125, + "learning_rate": 5.5417185554171856e-05, + "loss": 0.6144, + "step": 3633 + }, + { + "epoch": 2.1943857530938726, + "grad_norm": 0.1318359375, + "learning_rate": 5.537567455375675e-05, + "loss": 0.5612, + "step": 3634 + }, + { + "epoch": 2.194989435556897, + "grad_norm": 0.130859375, + "learning_rate": 5.5334163553341635e-05, + "loss": 0.8231, + "step": 3635 + }, + { + "epoch": 2.1955931180199215, + "grad_norm": 0.1650390625, + "learning_rate": 5.529265255292653e-05, + "loss": 0.6544, + "step": 3636 + }, + { + "epoch": 2.196196800482946, + "grad_norm": 0.13671875, + "learning_rate": 5.525114155251142e-05, + "loss": 0.5665, + "step": 3637 + }, + { + "epoch": 2.1968004829459704, + "grad_norm": 0.1494140625, + "learning_rate": 5.52096305520963e-05, + "loss": 0.6914, + "step": 3638 + }, + { + "epoch": 2.1974041654089946, + "grad_norm": 0.1484375, + "learning_rate": 5.5168119551681195e-05, + "loss": 0.6857, + "step": 3639 + }, + { + "epoch": 2.1980078478720193, + "grad_norm": 0.140625, + "learning_rate": 5.512660855126609e-05, + "loss": 0.5609, + "step": 3640 + }, + { + "epoch": 2.198611530335044, + "grad_norm": 0.15234375, + "learning_rate": 5.508509755085098e-05, + "loss": 0.5702, + "step": 3641 + }, + { + "epoch": 2.199215212798068, + "grad_norm": 0.14453125, + "learning_rate": 5.504358655043587e-05, + "loss": 0.6845, + "step": 3642 + }, + { + "epoch": 2.199818895261093, + "grad_norm": 0.1298828125, + "learning_rate": 5.500207555002076e-05, + "loss": 0.8029, + "step": 3643 + }, + { + "epoch": 2.200422577724117, + "grad_norm": 0.1279296875, + "learning_rate": 5.4960564549605654e-05, + "loss": 0.582, + "step": 3644 + }, + { + "epoch": 2.2010262601871418, + "grad_norm": 0.216796875, + "learning_rate": 5.4919053549190534e-05, + "loss": 0.6569, + "step": 3645 + }, + { + "epoch": 2.201629942650166, + "grad_norm": 0.1435546875, + "learning_rate": 5.487754254877543e-05, + "loss": 0.5649, + "step": 3646 + }, + { + "epoch": 2.2022336251131907, + "grad_norm": 0.142578125, + "learning_rate": 5.483603154836032e-05, + "loss": 0.5068, + "step": 3647 + }, + { + "epoch": 2.202837307576215, + "grad_norm": 0.146484375, + "learning_rate": 5.479452054794521e-05, + "loss": 0.5033, + "step": 3648 + }, + { + "epoch": 2.2034409900392395, + "grad_norm": 0.1591796875, + "learning_rate": 5.47530095475301e-05, + "loss": 0.5824, + "step": 3649 + }, + { + "epoch": 2.2040446725022638, + "grad_norm": 0.14453125, + "learning_rate": 5.4711498547114994e-05, + "loss": 0.5361, + "step": 3650 + }, + { + "epoch": 2.2046483549652884, + "grad_norm": 0.169921875, + "learning_rate": 5.466998754669987e-05, + "loss": 0.5451, + "step": 3651 + }, + { + "epoch": 2.2052520374283127, + "grad_norm": 0.1484375, + "learning_rate": 5.4628476546284766e-05, + "loss": 0.4929, + "step": 3652 + }, + { + "epoch": 2.2058557198913373, + "grad_norm": 0.16796875, + "learning_rate": 5.458696554586966e-05, + "loss": 0.5349, + "step": 3653 + }, + { + "epoch": 2.2064594023543616, + "grad_norm": 0.1767578125, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.5587, + "step": 3654 + }, + { + "epoch": 2.2070630848173862, + "grad_norm": 0.1845703125, + "learning_rate": 5.450394354503944e-05, + "loss": 0.4678, + "step": 3655 + }, + { + "epoch": 2.2076667672804104, + "grad_norm": 0.181640625, + "learning_rate": 5.446243254462433e-05, + "loss": 0.4529, + "step": 3656 + }, + { + "epoch": 2.208270449743435, + "grad_norm": 0.2041015625, + "learning_rate": 5.442092154420921e-05, + "loss": 0.4281, + "step": 3657 + }, + { + "epoch": 2.2088741322064593, + "grad_norm": 0.1787109375, + "learning_rate": 5.4379410543794106e-05, + "loss": 0.3754, + "step": 3658 + }, + { + "epoch": 2.209477814669484, + "grad_norm": 0.3203125, + "learning_rate": 5.4337899543379e-05, + "loss": 0.3509, + "step": 3659 + }, + { + "epoch": 2.2100814971325082, + "grad_norm": 0.201171875, + "learning_rate": 5.4296388542963885e-05, + "loss": 0.368, + "step": 3660 + }, + { + "epoch": 2.210685179595533, + "grad_norm": 0.2041015625, + "learning_rate": 5.425487754254878e-05, + "loss": 0.2797, + "step": 3661 + }, + { + "epoch": 2.211288862058557, + "grad_norm": 0.189453125, + "learning_rate": 5.421336654213367e-05, + "loss": 0.19, + "step": 3662 + }, + { + "epoch": 2.211892544521582, + "grad_norm": 0.1552734375, + "learning_rate": 5.417185554171855e-05, + "loss": 1.0725, + "step": 3663 + }, + { + "epoch": 2.212496226984606, + "grad_norm": 0.234375, + "learning_rate": 5.4130344541303445e-05, + "loss": 0.655, + "step": 3664 + }, + { + "epoch": 2.2130999094476307, + "grad_norm": 0.154296875, + "learning_rate": 5.408883354088834e-05, + "loss": 0.693, + "step": 3665 + }, + { + "epoch": 2.213703591910655, + "grad_norm": 0.146484375, + "learning_rate": 5.404732254047323e-05, + "loss": 0.8072, + "step": 3666 + }, + { + "epoch": 2.2143072743736796, + "grad_norm": 0.1572265625, + "learning_rate": 5.400581154005812e-05, + "loss": 0.686, + "step": 3667 + }, + { + "epoch": 2.214910956836704, + "grad_norm": 0.1474609375, + "learning_rate": 5.396430053964301e-05, + "loss": 0.9237, + "step": 3668 + }, + { + "epoch": 2.2155146392997285, + "grad_norm": 0.1494140625, + "learning_rate": 5.3922789539227904e-05, + "loss": 0.5754, + "step": 3669 + }, + { + "epoch": 2.2161183217627527, + "grad_norm": 0.1474609375, + "learning_rate": 5.3881278538812784e-05, + "loss": 0.6025, + "step": 3670 + }, + { + "epoch": 2.2167220042257774, + "grad_norm": 0.1513671875, + "learning_rate": 5.383976753839768e-05, + "loss": 0.7161, + "step": 3671 + }, + { + "epoch": 2.2173256866888016, + "grad_norm": 0.134765625, + "learning_rate": 5.379825653798257e-05, + "loss": 0.8643, + "step": 3672 + }, + { + "epoch": 2.2179293691518263, + "grad_norm": 0.142578125, + "learning_rate": 5.375674553756746e-05, + "loss": 0.7475, + "step": 3673 + }, + { + "epoch": 2.2185330516148505, + "grad_norm": 0.1240234375, + "learning_rate": 5.371523453715235e-05, + "loss": 0.5257, + "step": 3674 + }, + { + "epoch": 2.219136734077875, + "grad_norm": 0.1474609375, + "learning_rate": 5.3673723536737244e-05, + "loss": 0.6046, + "step": 3675 + }, + { + "epoch": 2.2197404165408994, + "grad_norm": 0.146484375, + "learning_rate": 5.3632212536322123e-05, + "loss": 0.5928, + "step": 3676 + }, + { + "epoch": 2.220344099003924, + "grad_norm": 0.154296875, + "learning_rate": 5.359070153590702e-05, + "loss": 0.7284, + "step": 3677 + }, + { + "epoch": 2.2209477814669483, + "grad_norm": 0.1494140625, + "learning_rate": 5.354919053549191e-05, + "loss": 0.6325, + "step": 3678 + }, + { + "epoch": 2.221551463929973, + "grad_norm": 0.162109375, + "learning_rate": 5.3507679535076796e-05, + "loss": 0.7032, + "step": 3679 + }, + { + "epoch": 2.222155146392997, + "grad_norm": 0.1455078125, + "learning_rate": 5.346616853466169e-05, + "loss": 0.5552, + "step": 3680 + }, + { + "epoch": 2.222758828856022, + "grad_norm": 0.154296875, + "learning_rate": 5.342465753424658e-05, + "loss": 0.6397, + "step": 3681 + }, + { + "epoch": 2.223362511319046, + "grad_norm": 0.1318359375, + "learning_rate": 5.338314653383146e-05, + "loss": 0.6766, + "step": 3682 + }, + { + "epoch": 2.2239661937820707, + "grad_norm": 0.15234375, + "learning_rate": 5.3341635533416356e-05, + "loss": 0.6068, + "step": 3683 + }, + { + "epoch": 2.224569876245095, + "grad_norm": 0.1513671875, + "learning_rate": 5.330012453300125e-05, + "loss": 0.7731, + "step": 3684 + }, + { + "epoch": 2.2251735587081196, + "grad_norm": 0.1552734375, + "learning_rate": 5.3258613532586136e-05, + "loss": 0.6333, + "step": 3685 + }, + { + "epoch": 2.225777241171144, + "grad_norm": 0.1435546875, + "learning_rate": 5.321710253217103e-05, + "loss": 0.6319, + "step": 3686 + }, + { + "epoch": 2.2263809236341685, + "grad_norm": 0.142578125, + "learning_rate": 5.317559153175592e-05, + "loss": 0.6213, + "step": 3687 + }, + { + "epoch": 2.2269846060971927, + "grad_norm": 0.1396484375, + "learning_rate": 5.3134080531340815e-05, + "loss": 0.6248, + "step": 3688 + }, + { + "epoch": 2.2275882885602174, + "grad_norm": 0.1357421875, + "learning_rate": 5.3092569530925695e-05, + "loss": 0.6109, + "step": 3689 + }, + { + "epoch": 2.2281919710232416, + "grad_norm": 0.1396484375, + "learning_rate": 5.305105853051059e-05, + "loss": 0.8635, + "step": 3690 + }, + { + "epoch": 2.2287956534862663, + "grad_norm": 0.1455078125, + "learning_rate": 5.300954753009548e-05, + "loss": 0.5466, + "step": 3691 + }, + { + "epoch": 2.2293993359492905, + "grad_norm": 0.169921875, + "learning_rate": 5.296803652968037e-05, + "loss": 0.7101, + "step": 3692 + }, + { + "epoch": 2.230003018412315, + "grad_norm": 0.1435546875, + "learning_rate": 5.292652552926526e-05, + "loss": 0.6589, + "step": 3693 + }, + { + "epoch": 2.2306067008753394, + "grad_norm": 0.1474609375, + "learning_rate": 5.2885014528850155e-05, + "loss": 0.6498, + "step": 3694 + }, + { + "epoch": 2.231210383338364, + "grad_norm": 0.1435546875, + "learning_rate": 5.2843503528435034e-05, + "loss": 0.5665, + "step": 3695 + }, + { + "epoch": 2.2318140658013883, + "grad_norm": 0.146484375, + "learning_rate": 5.280199252801993e-05, + "loss": 0.5706, + "step": 3696 + }, + { + "epoch": 2.232417748264413, + "grad_norm": 0.140625, + "learning_rate": 5.276048152760482e-05, + "loss": 0.55, + "step": 3697 + }, + { + "epoch": 2.233021430727437, + "grad_norm": 0.162109375, + "learning_rate": 5.27189705271897e-05, + "loss": 0.5897, + "step": 3698 + }, + { + "epoch": 2.233625113190462, + "grad_norm": 0.193359375, + "learning_rate": 5.26774595267746e-05, + "loss": 0.6028, + "step": 3699 + }, + { + "epoch": 2.234228795653486, + "grad_norm": 0.162109375, + "learning_rate": 5.2635948526359494e-05, + "loss": 0.5925, + "step": 3700 + }, + { + "epoch": 2.2348324781165108, + "grad_norm": 0.1455078125, + "learning_rate": 5.2594437525944374e-05, + "loss": 0.5198, + "step": 3701 + }, + { + "epoch": 2.235436160579535, + "grad_norm": 0.158203125, + "learning_rate": 5.255292652552927e-05, + "loss": 0.5255, + "step": 3702 + }, + { + "epoch": 2.2360398430425596, + "grad_norm": 0.16015625, + "learning_rate": 5.251141552511416e-05, + "loss": 0.5007, + "step": 3703 + }, + { + "epoch": 2.236643525505584, + "grad_norm": 0.171875, + "learning_rate": 5.246990452469904e-05, + "loss": 0.5518, + "step": 3704 + }, + { + "epoch": 2.2372472079686085, + "grad_norm": 0.173828125, + "learning_rate": 5.242839352428393e-05, + "loss": 0.487, + "step": 3705 + }, + { + "epoch": 2.2378508904316328, + "grad_norm": 0.1845703125, + "learning_rate": 5.238688252386883e-05, + "loss": 0.4804, + "step": 3706 + }, + { + "epoch": 2.2384545728946574, + "grad_norm": 0.181640625, + "learning_rate": 5.234537152345371e-05, + "loss": 0.4207, + "step": 3707 + }, + { + "epoch": 2.2390582553576817, + "grad_norm": 0.1787109375, + "learning_rate": 5.2303860523038606e-05, + "loss": 0.391, + "step": 3708 + }, + { + "epoch": 2.2396619378207063, + "grad_norm": 0.1982421875, + "learning_rate": 5.22623495226235e-05, + "loss": 0.3689, + "step": 3709 + }, + { + "epoch": 2.2402656202837306, + "grad_norm": 0.2080078125, + "learning_rate": 5.222083852220839e-05, + "loss": 0.34, + "step": 3710 + }, + { + "epoch": 2.240869302746755, + "grad_norm": 0.2109375, + "learning_rate": 5.217932752179327e-05, + "loss": 0.2817, + "step": 3711 + }, + { + "epoch": 2.2414729852097794, + "grad_norm": 0.2080078125, + "learning_rate": 5.2137816521378166e-05, + "loss": 0.2022, + "step": 3712 + }, + { + "epoch": 2.242076667672804, + "grad_norm": 0.140625, + "learning_rate": 5.2096305520963066e-05, + "loss": 0.5713, + "step": 3713 + }, + { + "epoch": 2.242680350135829, + "grad_norm": 0.1533203125, + "learning_rate": 5.2054794520547945e-05, + "loss": 0.5972, + "step": 3714 + }, + { + "epoch": 2.243284032598853, + "grad_norm": 0.1572265625, + "learning_rate": 5.201328352013284e-05, + "loss": 1.3534, + "step": 3715 + }, + { + "epoch": 2.2438877150618772, + "grad_norm": 0.13671875, + "learning_rate": 5.197177251971773e-05, + "loss": 0.544, + "step": 3716 + }, + { + "epoch": 2.244491397524902, + "grad_norm": 0.14453125, + "learning_rate": 5.193026151930261e-05, + "loss": 0.7116, + "step": 3717 + }, + { + "epoch": 2.2450950799879266, + "grad_norm": 0.1474609375, + "learning_rate": 5.1888750518887505e-05, + "loss": 0.4888, + "step": 3718 + }, + { + "epoch": 2.245698762450951, + "grad_norm": 0.1494140625, + "learning_rate": 5.18472395184724e-05, + "loss": 0.6472, + "step": 3719 + }, + { + "epoch": 2.246302444913975, + "grad_norm": 0.181640625, + "learning_rate": 5.1805728518057285e-05, + "loss": 0.7131, + "step": 3720 + }, + { + "epoch": 2.2469061273769997, + "grad_norm": 0.1357421875, + "learning_rate": 5.176421751764218e-05, + "loss": 0.6801, + "step": 3721 + }, + { + "epoch": 2.2475098098400244, + "grad_norm": 0.146484375, + "learning_rate": 5.172270651722707e-05, + "loss": 0.5962, + "step": 3722 + }, + { + "epoch": 2.2481134923030486, + "grad_norm": 0.1455078125, + "learning_rate": 5.168119551681195e-05, + "loss": 0.6611, + "step": 3723 + }, + { + "epoch": 2.2487171747660732, + "grad_norm": 0.1337890625, + "learning_rate": 5.1639684516396844e-05, + "loss": 0.6505, + "step": 3724 + }, + { + "epoch": 2.2493208572290975, + "grad_norm": 0.1318359375, + "learning_rate": 5.159817351598174e-05, + "loss": 0.5929, + "step": 3725 + }, + { + "epoch": 2.249924539692122, + "grad_norm": 0.1455078125, + "learning_rate": 5.1556662515566624e-05, + "loss": 0.6366, + "step": 3726 + }, + { + "epoch": 2.2505282221551464, + "grad_norm": 0.158203125, + "learning_rate": 5.151515151515152e-05, + "loss": 0.7009, + "step": 3727 + }, + { + "epoch": 2.251131904618171, + "grad_norm": 0.1572265625, + "learning_rate": 5.147364051473641e-05, + "loss": 0.6801, + "step": 3728 + }, + { + "epoch": 2.2517355870811953, + "grad_norm": 0.146484375, + "learning_rate": 5.143212951432129e-05, + "loss": 0.7714, + "step": 3729 + }, + { + "epoch": 2.25233926954422, + "grad_norm": 0.1357421875, + "learning_rate": 5.139061851390618e-05, + "loss": 0.5927, + "step": 3730 + }, + { + "epoch": 2.252942952007244, + "grad_norm": 0.1484375, + "learning_rate": 5.1349107513491076e-05, + "loss": 0.581, + "step": 3731 + }, + { + "epoch": 2.253546634470269, + "grad_norm": 0.1484375, + "learning_rate": 5.130759651307597e-05, + "loss": 0.6646, + "step": 3732 + }, + { + "epoch": 2.254150316933293, + "grad_norm": 0.138671875, + "learning_rate": 5.1266085512660856e-05, + "loss": 0.6749, + "step": 3733 + }, + { + "epoch": 2.2547539993963177, + "grad_norm": 0.1513671875, + "learning_rate": 5.122457451224575e-05, + "loss": 0.7835, + "step": 3734 + }, + { + "epoch": 2.255357681859342, + "grad_norm": 0.13671875, + "learning_rate": 5.118306351183064e-05, + "loss": 0.5993, + "step": 3735 + }, + { + "epoch": 2.2559613643223666, + "grad_norm": 0.146484375, + "learning_rate": 5.114155251141552e-05, + "loss": 0.8175, + "step": 3736 + }, + { + "epoch": 2.256565046785391, + "grad_norm": 0.1767578125, + "learning_rate": 5.1100041511000416e-05, + "loss": 0.5801, + "step": 3737 + }, + { + "epoch": 2.2571687292484155, + "grad_norm": 0.1533203125, + "learning_rate": 5.105853051058531e-05, + "loss": 0.6434, + "step": 3738 + }, + { + "epoch": 2.2577724117114397, + "grad_norm": 0.197265625, + "learning_rate": 5.1017019510170195e-05, + "loss": 0.6335, + "step": 3739 + }, + { + "epoch": 2.2583760941744644, + "grad_norm": 0.16015625, + "learning_rate": 5.097550850975509e-05, + "loss": 0.8821, + "step": 3740 + }, + { + "epoch": 2.2589797766374886, + "grad_norm": 0.1611328125, + "learning_rate": 5.093399750933998e-05, + "loss": 0.6041, + "step": 3741 + }, + { + "epoch": 2.2595834591005133, + "grad_norm": 0.19140625, + "learning_rate": 5.089248650892486e-05, + "loss": 0.8298, + "step": 3742 + }, + { + "epoch": 2.2601871415635375, + "grad_norm": 0.142578125, + "learning_rate": 5.0850975508509755e-05, + "loss": 0.7614, + "step": 3743 + }, + { + "epoch": 2.260790824026562, + "grad_norm": 0.1689453125, + "learning_rate": 5.080946450809465e-05, + "loss": 0.508, + "step": 3744 + }, + { + "epoch": 2.2613945064895864, + "grad_norm": 0.275390625, + "learning_rate": 5.0767953507679535e-05, + "loss": 0.7969, + "step": 3745 + }, + { + "epoch": 2.261998188952611, + "grad_norm": 0.1337890625, + "learning_rate": 5.072644250726443e-05, + "loss": 0.4903, + "step": 3746 + }, + { + "epoch": 2.2626018714156353, + "grad_norm": 0.1484375, + "learning_rate": 5.068493150684932e-05, + "loss": 0.5706, + "step": 3747 + }, + { + "epoch": 2.26320555387866, + "grad_norm": 0.1474609375, + "learning_rate": 5.06434205064342e-05, + "loss": 0.5735, + "step": 3748 + }, + { + "epoch": 2.263809236341684, + "grad_norm": 0.1455078125, + "learning_rate": 5.0601909506019094e-05, + "loss": 0.568, + "step": 3749 + }, + { + "epoch": 2.264412918804709, + "grad_norm": 0.16796875, + "learning_rate": 5.056039850560399e-05, + "loss": 0.5466, + "step": 3750 + }, + { + "epoch": 2.265016601267733, + "grad_norm": 0.154296875, + "learning_rate": 5.0518887505188874e-05, + "loss": 0.5017, + "step": 3751 + }, + { + "epoch": 2.2656202837307577, + "grad_norm": 0.1611328125, + "learning_rate": 5.047737650477377e-05, + "loss": 0.4916, + "step": 3752 + }, + { + "epoch": 2.266223966193782, + "grad_norm": 0.1611328125, + "learning_rate": 5.043586550435866e-05, + "loss": 0.5218, + "step": 3753 + }, + { + "epoch": 2.2668276486568066, + "grad_norm": 0.1640625, + "learning_rate": 5.0394354503943554e-05, + "loss": 0.4766, + "step": 3754 + }, + { + "epoch": 2.267431331119831, + "grad_norm": 0.1708984375, + "learning_rate": 5.0352843503528433e-05, + "loss": 0.5308, + "step": 3755 + }, + { + "epoch": 2.2680350135828555, + "grad_norm": 0.1728515625, + "learning_rate": 5.031133250311333e-05, + "loss": 0.4462, + "step": 3756 + }, + { + "epoch": 2.2686386960458798, + "grad_norm": 0.1806640625, + "learning_rate": 5.026982150269822e-05, + "loss": 0.3928, + "step": 3757 + }, + { + "epoch": 2.2692423785089044, + "grad_norm": 0.193359375, + "learning_rate": 5.0228310502283106e-05, + "loss": 0.3917, + "step": 3758 + }, + { + "epoch": 2.2698460609719286, + "grad_norm": 0.1875, + "learning_rate": 5.0186799501868e-05, + "loss": 0.349, + "step": 3759 + }, + { + "epoch": 2.2704497434349533, + "grad_norm": 0.203125, + "learning_rate": 5.014528850145289e-05, + "loss": 0.3521, + "step": 3760 + }, + { + "epoch": 2.2710534258979775, + "grad_norm": 0.2138671875, + "learning_rate": 5.010377750103777e-05, + "loss": 0.2588, + "step": 3761 + }, + { + "epoch": 2.271657108361002, + "grad_norm": 0.1767578125, + "learning_rate": 5.0062266500622666e-05, + "loss": 0.1769, + "step": 3762 + }, + { + "epoch": 2.2722607908240264, + "grad_norm": 0.1728515625, + "learning_rate": 5.002075550020756e-05, + "loss": 0.6932, + "step": 3763 + }, + { + "epoch": 2.272864473287051, + "grad_norm": 0.1533203125, + "learning_rate": 4.997924449979245e-05, + "loss": 0.6689, + "step": 3764 + }, + { + "epoch": 2.2734681557500753, + "grad_norm": 0.1552734375, + "learning_rate": 4.993773349937734e-05, + "loss": 1.0518, + "step": 3765 + }, + { + "epoch": 2.2740718382131, + "grad_norm": 0.1474609375, + "learning_rate": 4.9896222498962225e-05, + "loss": 0.6322, + "step": 3766 + }, + { + "epoch": 2.274675520676124, + "grad_norm": 0.1533203125, + "learning_rate": 4.985471149854712e-05, + "loss": 0.6422, + "step": 3767 + }, + { + "epoch": 2.275279203139149, + "grad_norm": 0.1484375, + "learning_rate": 4.9813200498132005e-05, + "loss": 0.6127, + "step": 3768 + }, + { + "epoch": 2.275882885602173, + "grad_norm": 0.1416015625, + "learning_rate": 4.977168949771689e-05, + "loss": 0.6145, + "step": 3769 + }, + { + "epoch": 2.276486568065198, + "grad_norm": 0.146484375, + "learning_rate": 4.9730178497301785e-05, + "loss": 0.5599, + "step": 3770 + }, + { + "epoch": 2.277090250528222, + "grad_norm": 0.142578125, + "learning_rate": 4.968866749688668e-05, + "loss": 0.8418, + "step": 3771 + }, + { + "epoch": 2.2776939329912467, + "grad_norm": 0.1376953125, + "learning_rate": 4.964715649647157e-05, + "loss": 0.562, + "step": 3772 + }, + { + "epoch": 2.278297615454271, + "grad_norm": 0.1494140625, + "learning_rate": 4.960564549605646e-05, + "loss": 0.6149, + "step": 3773 + }, + { + "epoch": 2.2789012979172956, + "grad_norm": 0.1494140625, + "learning_rate": 4.9564134495641344e-05, + "loss": 0.6435, + "step": 3774 + }, + { + "epoch": 2.27950498038032, + "grad_norm": 0.1494140625, + "learning_rate": 4.952262349522624e-05, + "loss": 0.7169, + "step": 3775 + }, + { + "epoch": 2.2801086628433445, + "grad_norm": 0.1357421875, + "learning_rate": 4.9481112494811124e-05, + "loss": 0.9838, + "step": 3776 + }, + { + "epoch": 2.2807123453063687, + "grad_norm": 0.150390625, + "learning_rate": 4.943960149439602e-05, + "loss": 0.6183, + "step": 3777 + }, + { + "epoch": 2.2813160277693934, + "grad_norm": 0.138671875, + "learning_rate": 4.939809049398091e-05, + "loss": 0.5653, + "step": 3778 + }, + { + "epoch": 2.2819197102324176, + "grad_norm": 0.1298828125, + "learning_rate": 4.93565794935658e-05, + "loss": 0.8278, + "step": 3779 + }, + { + "epoch": 2.2825233926954422, + "grad_norm": 0.142578125, + "learning_rate": 4.9315068493150684e-05, + "loss": 0.6817, + "step": 3780 + }, + { + "epoch": 2.2831270751584665, + "grad_norm": 0.154296875, + "learning_rate": 4.927355749273558e-05, + "loss": 0.548, + "step": 3781 + }, + { + "epoch": 2.283730757621491, + "grad_norm": 0.142578125, + "learning_rate": 4.923204649232046e-05, + "loss": 0.6265, + "step": 3782 + }, + { + "epoch": 2.2843344400845154, + "grad_norm": 0.1484375, + "learning_rate": 4.9190535491905357e-05, + "loss": 0.5528, + "step": 3783 + }, + { + "epoch": 2.28493812254754, + "grad_norm": 0.1513671875, + "learning_rate": 4.914902449149025e-05, + "loss": 0.9304, + "step": 3784 + }, + { + "epoch": 2.2855418050105643, + "grad_norm": 0.154296875, + "learning_rate": 4.9107513491075136e-05, + "loss": 0.6049, + "step": 3785 + }, + { + "epoch": 2.286145487473589, + "grad_norm": 0.140625, + "learning_rate": 4.906600249066003e-05, + "loss": 0.6067, + "step": 3786 + }, + { + "epoch": 2.2867491699366136, + "grad_norm": 0.1416015625, + "learning_rate": 4.9024491490244916e-05, + "loss": 0.5666, + "step": 3787 + }, + { + "epoch": 2.287352852399638, + "grad_norm": 0.1533203125, + "learning_rate": 4.89829804898298e-05, + "loss": 0.6094, + "step": 3788 + }, + { + "epoch": 2.287956534862662, + "grad_norm": 0.140625, + "learning_rate": 4.8941469489414696e-05, + "loss": 0.4883, + "step": 3789 + }, + { + "epoch": 2.2885602173256867, + "grad_norm": 0.140625, + "learning_rate": 4.889995848899959e-05, + "loss": 0.5744, + "step": 3790 + }, + { + "epoch": 2.2891638997887114, + "grad_norm": 0.134765625, + "learning_rate": 4.8858447488584476e-05, + "loss": 0.6248, + "step": 3791 + }, + { + "epoch": 2.2897675822517356, + "grad_norm": 0.1474609375, + "learning_rate": 4.881693648816937e-05, + "loss": 0.8144, + "step": 3792 + }, + { + "epoch": 2.29037126471476, + "grad_norm": 0.14453125, + "learning_rate": 4.8775425487754255e-05, + "loss": 0.6317, + "step": 3793 + }, + { + "epoch": 2.2909749471777845, + "grad_norm": 0.15234375, + "learning_rate": 4.873391448733915e-05, + "loss": 0.6998, + "step": 3794 + }, + { + "epoch": 2.291578629640809, + "grad_norm": 0.1357421875, + "learning_rate": 4.8692403486924035e-05, + "loss": 0.5846, + "step": 3795 + }, + { + "epoch": 2.2921823121038334, + "grad_norm": 0.13671875, + "learning_rate": 4.865089248650893e-05, + "loss": 0.5276, + "step": 3796 + }, + { + "epoch": 2.2927859945668576, + "grad_norm": 0.16015625, + "learning_rate": 4.860938148609382e-05, + "loss": 0.5752, + "step": 3797 + }, + { + "epoch": 2.2933896770298823, + "grad_norm": 0.1513671875, + "learning_rate": 4.856787048567871e-05, + "loss": 0.5444, + "step": 3798 + }, + { + "epoch": 2.293993359492907, + "grad_norm": 0.140625, + "learning_rate": 4.8526359485263595e-05, + "loss": 0.5088, + "step": 3799 + }, + { + "epoch": 2.294597041955931, + "grad_norm": 0.16015625, + "learning_rate": 4.848484848484849e-05, + "loss": 0.5237, + "step": 3800 + }, + { + "epoch": 2.2952007244189554, + "grad_norm": 0.162109375, + "learning_rate": 4.8443337484433374e-05, + "loss": 0.5165, + "step": 3801 + }, + { + "epoch": 2.29580440688198, + "grad_norm": 0.1669921875, + "learning_rate": 4.840182648401827e-05, + "loss": 0.5137, + "step": 3802 + }, + { + "epoch": 2.2964080893450047, + "grad_norm": 0.16015625, + "learning_rate": 4.836031548360316e-05, + "loss": 0.5314, + "step": 3803 + }, + { + "epoch": 2.297011771808029, + "grad_norm": 0.17578125, + "learning_rate": 4.831880448318805e-05, + "loss": 0.5597, + "step": 3804 + }, + { + "epoch": 2.297615454271053, + "grad_norm": 0.1748046875, + "learning_rate": 4.827729348277294e-05, + "loss": 0.492, + "step": 3805 + }, + { + "epoch": 2.298219136734078, + "grad_norm": 0.1826171875, + "learning_rate": 4.823578248235783e-05, + "loss": 0.4438, + "step": 3806 + }, + { + "epoch": 2.2988228191971025, + "grad_norm": 0.1806640625, + "learning_rate": 4.8194271481942714e-05, + "loss": 0.3985, + "step": 3807 + }, + { + "epoch": 2.2994265016601267, + "grad_norm": 0.1884765625, + "learning_rate": 4.815276048152761e-05, + "loss": 0.513, + "step": 3808 + }, + { + "epoch": 2.3000301841231514, + "grad_norm": 0.2109375, + "learning_rate": 4.81112494811125e-05, + "loss": 0.3765, + "step": 3809 + }, + { + "epoch": 2.3006338665861756, + "grad_norm": 0.1943359375, + "learning_rate": 4.8069738480697387e-05, + "loss": 0.2697, + "step": 3810 + }, + { + "epoch": 2.3012375490492003, + "grad_norm": 0.185546875, + "learning_rate": 4.802822748028228e-05, + "loss": 0.2245, + "step": 3811 + }, + { + "epoch": 2.3018412315122245, + "grad_norm": 0.2001953125, + "learning_rate": 4.7986716479867166e-05, + "loss": 0.1918, + "step": 3812 + }, + { + "epoch": 2.302444913975249, + "grad_norm": 0.1484375, + "learning_rate": 4.794520547945205e-05, + "loss": 0.7845, + "step": 3813 + }, + { + "epoch": 2.3030485964382734, + "grad_norm": 0.12255859375, + "learning_rate": 4.7903694479036946e-05, + "loss": 0.5505, + "step": 3814 + }, + { + "epoch": 2.303652278901298, + "grad_norm": 0.1533203125, + "learning_rate": 4.786218347862184e-05, + "loss": 0.601, + "step": 3815 + }, + { + "epoch": 2.3042559613643223, + "grad_norm": 0.15234375, + "learning_rate": 4.782067247820673e-05, + "loss": 0.5139, + "step": 3816 + }, + { + "epoch": 2.304859643827347, + "grad_norm": 0.1318359375, + "learning_rate": 4.777916147779162e-05, + "loss": 0.6804, + "step": 3817 + }, + { + "epoch": 2.305463326290371, + "grad_norm": 0.1435546875, + "learning_rate": 4.7737650477376505e-05, + "loss": 0.5875, + "step": 3818 + }, + { + "epoch": 2.306067008753396, + "grad_norm": 0.1748046875, + "learning_rate": 4.76961394769614e-05, + "loss": 0.6261, + "step": 3819 + }, + { + "epoch": 2.30667069121642, + "grad_norm": 0.1396484375, + "learning_rate": 4.7654628476546285e-05, + "loss": 0.597, + "step": 3820 + }, + { + "epoch": 2.3072743736794448, + "grad_norm": 0.14453125, + "learning_rate": 4.761311747613117e-05, + "loss": 0.4979, + "step": 3821 + }, + { + "epoch": 2.307878056142469, + "grad_norm": 0.13671875, + "learning_rate": 4.757160647571607e-05, + "loss": 0.5344, + "step": 3822 + }, + { + "epoch": 2.3084817386054937, + "grad_norm": 0.154296875, + "learning_rate": 4.753009547530096e-05, + "loss": 0.575, + "step": 3823 + }, + { + "epoch": 2.309085421068518, + "grad_norm": 0.154296875, + "learning_rate": 4.7488584474885845e-05, + "loss": 0.6303, + "step": 3824 + }, + { + "epoch": 2.3096891035315426, + "grad_norm": 0.1474609375, + "learning_rate": 4.744707347447074e-05, + "loss": 0.65, + "step": 3825 + }, + { + "epoch": 2.3102927859945668, + "grad_norm": 0.1611328125, + "learning_rate": 4.7405562474055624e-05, + "loss": 0.6018, + "step": 3826 + }, + { + "epoch": 2.3108964684575914, + "grad_norm": 0.16015625, + "learning_rate": 4.736405147364052e-05, + "loss": 0.6526, + "step": 3827 + }, + { + "epoch": 2.3115001509206157, + "grad_norm": 0.140625, + "learning_rate": 4.7322540473225404e-05, + "loss": 0.5563, + "step": 3828 + }, + { + "epoch": 2.3121038333836403, + "grad_norm": 0.13671875, + "learning_rate": 4.72810294728103e-05, + "loss": 0.6763, + "step": 3829 + }, + { + "epoch": 2.3127075158466646, + "grad_norm": 0.1484375, + "learning_rate": 4.723951847239519e-05, + "loss": 0.5513, + "step": 3830 + }, + { + "epoch": 2.3133111983096892, + "grad_norm": 0.146484375, + "learning_rate": 4.719800747198008e-05, + "loss": 0.7, + "step": 3831 + }, + { + "epoch": 2.3139148807727135, + "grad_norm": 0.140625, + "learning_rate": 4.7156496471564964e-05, + "loss": 0.6488, + "step": 3832 + }, + { + "epoch": 2.314518563235738, + "grad_norm": 0.15625, + "learning_rate": 4.711498547114986e-05, + "loss": 0.6514, + "step": 3833 + }, + { + "epoch": 2.3151222456987623, + "grad_norm": 0.1435546875, + "learning_rate": 4.7073474470734743e-05, + "loss": 0.5488, + "step": 3834 + }, + { + "epoch": 2.315725928161787, + "grad_norm": 0.146484375, + "learning_rate": 4.703196347031964e-05, + "loss": 0.6489, + "step": 3835 + }, + { + "epoch": 2.3163296106248112, + "grad_norm": 0.146484375, + "learning_rate": 4.699045246990453e-05, + "loss": 0.5646, + "step": 3836 + }, + { + "epoch": 2.316933293087836, + "grad_norm": 0.1494140625, + "learning_rate": 4.6948941469489416e-05, + "loss": 0.6441, + "step": 3837 + }, + { + "epoch": 2.31753697555086, + "grad_norm": 0.1435546875, + "learning_rate": 4.690743046907431e-05, + "loss": 0.665, + "step": 3838 + }, + { + "epoch": 2.318140658013885, + "grad_norm": 0.12890625, + "learning_rate": 4.6865919468659196e-05, + "loss": 0.5684, + "step": 3839 + }, + { + "epoch": 2.318744340476909, + "grad_norm": 0.1376953125, + "learning_rate": 4.682440846824408e-05, + "loss": 0.5394, + "step": 3840 + }, + { + "epoch": 2.3193480229399337, + "grad_norm": 0.154296875, + "learning_rate": 4.6782897467828976e-05, + "loss": 0.5669, + "step": 3841 + }, + { + "epoch": 2.319951705402958, + "grad_norm": 0.1572265625, + "learning_rate": 4.674138646741387e-05, + "loss": 0.6004, + "step": 3842 + }, + { + "epoch": 2.3205553878659826, + "grad_norm": 0.1533203125, + "learning_rate": 4.6699875466998756e-05, + "loss": 0.6637, + "step": 3843 + }, + { + "epoch": 2.321159070329007, + "grad_norm": 0.1337890625, + "learning_rate": 4.665836446658365e-05, + "loss": 0.5828, + "step": 3844 + }, + { + "epoch": 2.3217627527920315, + "grad_norm": 0.13671875, + "learning_rate": 4.6616853466168535e-05, + "loss": 0.5728, + "step": 3845 + }, + { + "epoch": 2.3223664352550557, + "grad_norm": 0.1376953125, + "learning_rate": 4.657534246575342e-05, + "loss": 0.5247, + "step": 3846 + }, + { + "epoch": 2.3229701177180804, + "grad_norm": 0.1416015625, + "learning_rate": 4.6533831465338315e-05, + "loss": 0.5778, + "step": 3847 + }, + { + "epoch": 2.3235738001811046, + "grad_norm": 0.1416015625, + "learning_rate": 4.649232046492321e-05, + "loss": 0.4809, + "step": 3848 + }, + { + "epoch": 2.3241774826441293, + "grad_norm": 0.1591796875, + "learning_rate": 4.64508094645081e-05, + "loss": 0.5791, + "step": 3849 + }, + { + "epoch": 2.3247811651071535, + "grad_norm": 0.154296875, + "learning_rate": 4.640929846409299e-05, + "loss": 0.4888, + "step": 3850 + }, + { + "epoch": 2.325384847570178, + "grad_norm": 0.1630859375, + "learning_rate": 4.6367787463677875e-05, + "loss": 0.5856, + "step": 3851 + }, + { + "epoch": 2.3259885300332024, + "grad_norm": 0.1650390625, + "learning_rate": 4.632627646326277e-05, + "loss": 0.5226, + "step": 3852 + }, + { + "epoch": 2.326592212496227, + "grad_norm": 0.16796875, + "learning_rate": 4.6284765462847654e-05, + "loss": 0.494, + "step": 3853 + }, + { + "epoch": 2.3271958949592513, + "grad_norm": 0.1689453125, + "learning_rate": 4.624325446243255e-05, + "loss": 0.4667, + "step": 3854 + }, + { + "epoch": 2.327799577422276, + "grad_norm": 0.177734375, + "learning_rate": 4.620174346201744e-05, + "loss": 0.4776, + "step": 3855 + }, + { + "epoch": 2.3284032598853, + "grad_norm": 0.1806640625, + "learning_rate": 4.616023246160233e-05, + "loss": 0.4472, + "step": 3856 + }, + { + "epoch": 2.329006942348325, + "grad_norm": 0.1884765625, + "learning_rate": 4.6118721461187214e-05, + "loss": 0.3986, + "step": 3857 + }, + { + "epoch": 2.329610624811349, + "grad_norm": 0.19140625, + "learning_rate": 4.607721046077211e-05, + "loss": 0.3597, + "step": 3858 + }, + { + "epoch": 2.3302143072743737, + "grad_norm": 0.201171875, + "learning_rate": 4.6035699460356994e-05, + "loss": 0.3309, + "step": 3859 + }, + { + "epoch": 2.330817989737398, + "grad_norm": 0.2119140625, + "learning_rate": 4.599418845994189e-05, + "loss": 0.2997, + "step": 3860 + }, + { + "epoch": 2.3314216722004226, + "grad_norm": 0.2080078125, + "learning_rate": 4.595267745952678e-05, + "loss": 0.2452, + "step": 3861 + }, + { + "epoch": 2.332025354663447, + "grad_norm": 0.21484375, + "learning_rate": 4.5911166459111667e-05, + "loss": 0.1978, + "step": 3862 + }, + { + "epoch": 2.3326290371264715, + "grad_norm": 0.14453125, + "learning_rate": 4.586965545869656e-05, + "loss": 0.6661, + "step": 3863 + }, + { + "epoch": 2.3332327195894957, + "grad_norm": 0.1552734375, + "learning_rate": 4.5828144458281446e-05, + "loss": 0.5825, + "step": 3864 + }, + { + "epoch": 2.3338364020525204, + "grad_norm": 0.158203125, + "learning_rate": 4.578663345786633e-05, + "loss": 0.6307, + "step": 3865 + }, + { + "epoch": 2.3344400845155446, + "grad_norm": 0.36328125, + "learning_rate": 4.5745122457451226e-05, + "loss": 0.555, + "step": 3866 + }, + { + "epoch": 2.3350437669785693, + "grad_norm": 0.150390625, + "learning_rate": 4.570361145703612e-05, + "loss": 0.5548, + "step": 3867 + }, + { + "epoch": 2.335647449441594, + "grad_norm": 0.1435546875, + "learning_rate": 4.5662100456621006e-05, + "loss": 0.7686, + "step": 3868 + }, + { + "epoch": 2.336251131904618, + "grad_norm": 0.1591796875, + "learning_rate": 4.56205894562059e-05, + "loss": 0.8087, + "step": 3869 + }, + { + "epoch": 2.3368548143676424, + "grad_norm": 0.1572265625, + "learning_rate": 4.5579078455790786e-05, + "loss": 0.7414, + "step": 3870 + }, + { + "epoch": 2.337458496830667, + "grad_norm": 0.1494140625, + "learning_rate": 4.553756745537568e-05, + "loss": 0.5163, + "step": 3871 + }, + { + "epoch": 2.3380621792936918, + "grad_norm": 0.1455078125, + "learning_rate": 4.5496056454960565e-05, + "loss": 0.6429, + "step": 3872 + }, + { + "epoch": 2.338665861756716, + "grad_norm": 0.1533203125, + "learning_rate": 4.545454545454546e-05, + "loss": 0.6131, + "step": 3873 + }, + { + "epoch": 2.33926954421974, + "grad_norm": 0.13671875, + "learning_rate": 4.541303445413035e-05, + "loss": 0.546, + "step": 3874 + }, + { + "epoch": 2.339873226682765, + "grad_norm": 0.1484375, + "learning_rate": 4.537152345371524e-05, + "loss": 0.7007, + "step": 3875 + }, + { + "epoch": 2.3404769091457895, + "grad_norm": 0.1435546875, + "learning_rate": 4.5330012453300125e-05, + "loss": 0.503, + "step": 3876 + }, + { + "epoch": 2.3410805916088138, + "grad_norm": 0.15625, + "learning_rate": 4.528850145288502e-05, + "loss": 0.6473, + "step": 3877 + }, + { + "epoch": 2.341684274071838, + "grad_norm": 0.1474609375, + "learning_rate": 4.5246990452469905e-05, + "loss": 0.6571, + "step": 3878 + }, + { + "epoch": 2.3422879565348627, + "grad_norm": 0.1416015625, + "learning_rate": 4.520547945205479e-05, + "loss": 0.5842, + "step": 3879 + }, + { + "epoch": 2.3428916389978873, + "grad_norm": 0.1416015625, + "learning_rate": 4.516396845163969e-05, + "loss": 0.5991, + "step": 3880 + }, + { + "epoch": 2.3434953214609116, + "grad_norm": 0.1455078125, + "learning_rate": 4.512245745122458e-05, + "loss": 0.6484, + "step": 3881 + }, + { + "epoch": 2.3440990039239358, + "grad_norm": 0.1513671875, + "learning_rate": 4.508094645080947e-05, + "loss": 0.603, + "step": 3882 + }, + { + "epoch": 2.3447026863869604, + "grad_norm": 0.146484375, + "learning_rate": 4.503943545039436e-05, + "loss": 0.6013, + "step": 3883 + }, + { + "epoch": 2.345306368849985, + "grad_norm": 0.150390625, + "learning_rate": 4.4997924449979244e-05, + "loss": 0.6968, + "step": 3884 + }, + { + "epoch": 2.3459100513130093, + "grad_norm": 0.1533203125, + "learning_rate": 4.495641344956414e-05, + "loss": 0.7278, + "step": 3885 + }, + { + "epoch": 2.3465137337760336, + "grad_norm": 0.1455078125, + "learning_rate": 4.4914902449149024e-05, + "loss": 0.5987, + "step": 3886 + }, + { + "epoch": 2.3471174162390582, + "grad_norm": 0.1533203125, + "learning_rate": 4.487339144873392e-05, + "loss": 0.6939, + "step": 3887 + }, + { + "epoch": 2.347721098702083, + "grad_norm": 0.1630859375, + "learning_rate": 4.483188044831881e-05, + "loss": 0.6199, + "step": 3888 + }, + { + "epoch": 2.348324781165107, + "grad_norm": 0.16015625, + "learning_rate": 4.4790369447903697e-05, + "loss": 0.653, + "step": 3889 + }, + { + "epoch": 2.348928463628132, + "grad_norm": 0.1376953125, + "learning_rate": 4.474885844748858e-05, + "loss": 0.5988, + "step": 3890 + }, + { + "epoch": 2.349532146091156, + "grad_norm": 0.1513671875, + "learning_rate": 4.4707347447073476e-05, + "loss": 0.603, + "step": 3891 + }, + { + "epoch": 2.3501358285541807, + "grad_norm": 0.142578125, + "learning_rate": 4.466583644665836e-05, + "loss": 0.5959, + "step": 3892 + }, + { + "epoch": 2.350739511017205, + "grad_norm": 0.13671875, + "learning_rate": 4.4624325446243256e-05, + "loss": 0.6052, + "step": 3893 + }, + { + "epoch": 2.3513431934802296, + "grad_norm": 0.1484375, + "learning_rate": 4.458281444582815e-05, + "loss": 1.0572, + "step": 3894 + }, + { + "epoch": 2.351946875943254, + "grad_norm": 0.1640625, + "learning_rate": 4.4541303445413036e-05, + "loss": 0.6273, + "step": 3895 + }, + { + "epoch": 2.3525505584062785, + "grad_norm": 0.15625, + "learning_rate": 4.449979244499793e-05, + "loss": 0.6219, + "step": 3896 + }, + { + "epoch": 2.3531542408693027, + "grad_norm": 0.142578125, + "learning_rate": 4.4458281444582815e-05, + "loss": 0.5061, + "step": 3897 + }, + { + "epoch": 2.3537579233323274, + "grad_norm": 0.150390625, + "learning_rate": 4.44167704441677e-05, + "loss": 0.5157, + "step": 3898 + }, + { + "epoch": 2.3543616057953516, + "grad_norm": 0.1455078125, + "learning_rate": 4.4375259443752595e-05, + "loss": 0.5173, + "step": 3899 + }, + { + "epoch": 2.3549652882583763, + "grad_norm": 0.15234375, + "learning_rate": 4.433374844333749e-05, + "loss": 0.5796, + "step": 3900 + }, + { + "epoch": 2.3555689707214005, + "grad_norm": 0.1650390625, + "learning_rate": 4.4292237442922375e-05, + "loss": 0.5505, + "step": 3901 + }, + { + "epoch": 2.356172653184425, + "grad_norm": 0.169921875, + "learning_rate": 4.425072644250727e-05, + "loss": 0.5425, + "step": 3902 + }, + { + "epoch": 2.3567763356474494, + "grad_norm": 0.1708984375, + "learning_rate": 4.4209215442092155e-05, + "loss": 0.4759, + "step": 3903 + }, + { + "epoch": 2.357380018110474, + "grad_norm": 0.1787109375, + "learning_rate": 4.416770444167705e-05, + "loss": 0.5116, + "step": 3904 + }, + { + "epoch": 2.3579837005734983, + "grad_norm": 0.16796875, + "learning_rate": 4.4126193441261934e-05, + "loss": 0.3866, + "step": 3905 + }, + { + "epoch": 2.358587383036523, + "grad_norm": 0.171875, + "learning_rate": 4.408468244084683e-05, + "loss": 0.4082, + "step": 3906 + }, + { + "epoch": 2.359191065499547, + "grad_norm": 0.19140625, + "learning_rate": 4.404317144043172e-05, + "loss": 0.4422, + "step": 3907 + }, + { + "epoch": 2.359794747962572, + "grad_norm": 0.1865234375, + "learning_rate": 4.400166044001661e-05, + "loss": 0.4258, + "step": 3908 + }, + { + "epoch": 2.360398430425596, + "grad_norm": 0.1826171875, + "learning_rate": 4.3960149439601494e-05, + "loss": 0.3006, + "step": 3909 + }, + { + "epoch": 2.3610021128886207, + "grad_norm": 0.1923828125, + "learning_rate": 4.391863843918639e-05, + "loss": 0.2906, + "step": 3910 + }, + { + "epoch": 2.361605795351645, + "grad_norm": 0.2060546875, + "learning_rate": 4.3877127438771274e-05, + "loss": 0.2704, + "step": 3911 + }, + { + "epoch": 2.3622094778146696, + "grad_norm": 0.1787109375, + "learning_rate": 4.383561643835617e-05, + "loss": 0.191, + "step": 3912 + }, + { + "epoch": 2.362813160277694, + "grad_norm": 0.146484375, + "learning_rate": 4.379410543794106e-05, + "loss": 0.6088, + "step": 3913 + }, + { + "epoch": 2.3634168427407185, + "grad_norm": 0.1376953125, + "learning_rate": 4.375259443752595e-05, + "loss": 0.7435, + "step": 3914 + }, + { + "epoch": 2.3640205252037427, + "grad_norm": 0.15234375, + "learning_rate": 4.371108343711084e-05, + "loss": 0.6249, + "step": 3915 + }, + { + "epoch": 2.3646242076667674, + "grad_norm": 0.15234375, + "learning_rate": 4.3669572436695726e-05, + "loss": 0.6178, + "step": 3916 + }, + { + "epoch": 2.3652278901297916, + "grad_norm": 0.1455078125, + "learning_rate": 4.362806143628061e-05, + "loss": 0.6073, + "step": 3917 + }, + { + "epoch": 2.3658315725928163, + "grad_norm": 0.1474609375, + "learning_rate": 4.3586550435865506e-05, + "loss": 0.647, + "step": 3918 + }, + { + "epoch": 2.3664352550558405, + "grad_norm": 0.14453125, + "learning_rate": 4.35450394354504e-05, + "loss": 0.6807, + "step": 3919 + }, + { + "epoch": 2.367038937518865, + "grad_norm": 0.1357421875, + "learning_rate": 4.3503528435035286e-05, + "loss": 0.5945, + "step": 3920 + }, + { + "epoch": 2.3676426199818894, + "grad_norm": 0.1416015625, + "learning_rate": 4.346201743462018e-05, + "loss": 0.6119, + "step": 3921 + }, + { + "epoch": 2.368246302444914, + "grad_norm": 0.1640625, + "learning_rate": 4.3420506434205066e-05, + "loss": 0.6251, + "step": 3922 + }, + { + "epoch": 2.3688499849079383, + "grad_norm": 0.162109375, + "learning_rate": 4.337899543378995e-05, + "loss": 0.5501, + "step": 3923 + }, + { + "epoch": 2.369453667370963, + "grad_norm": 0.14453125, + "learning_rate": 4.3337484433374845e-05, + "loss": 0.6416, + "step": 3924 + }, + { + "epoch": 2.370057349833987, + "grad_norm": 0.1533203125, + "learning_rate": 4.329597343295974e-05, + "loss": 0.6151, + "step": 3925 + }, + { + "epoch": 2.370661032297012, + "grad_norm": 0.1611328125, + "learning_rate": 4.325446243254463e-05, + "loss": 0.7269, + "step": 3926 + }, + { + "epoch": 2.371264714760036, + "grad_norm": 0.1435546875, + "learning_rate": 4.321295143212952e-05, + "loss": 0.5769, + "step": 3927 + }, + { + "epoch": 2.3718683972230608, + "grad_norm": 0.140625, + "learning_rate": 4.3171440431714405e-05, + "loss": 0.6409, + "step": 3928 + }, + { + "epoch": 2.372472079686085, + "grad_norm": 0.14453125, + "learning_rate": 4.31299294312993e-05, + "loss": 0.6153, + "step": 3929 + }, + { + "epoch": 2.3730757621491096, + "grad_norm": 0.14453125, + "learning_rate": 4.3088418430884185e-05, + "loss": 0.5554, + "step": 3930 + }, + { + "epoch": 2.373679444612134, + "grad_norm": 0.1669921875, + "learning_rate": 4.304690743046907e-05, + "loss": 0.5879, + "step": 3931 + }, + { + "epoch": 2.3742831270751585, + "grad_norm": 0.1494140625, + "learning_rate": 4.300539643005397e-05, + "loss": 0.5892, + "step": 3932 + }, + { + "epoch": 2.3748868095381828, + "grad_norm": 0.1689453125, + "learning_rate": 4.296388542963886e-05, + "loss": 0.621, + "step": 3933 + }, + { + "epoch": 2.3754904920012074, + "grad_norm": 0.15234375, + "learning_rate": 4.2922374429223744e-05, + "loss": 1.0716, + "step": 3934 + }, + { + "epoch": 2.3760941744642317, + "grad_norm": 0.1533203125, + "learning_rate": 4.288086342880864e-05, + "loss": 0.6236, + "step": 3935 + }, + { + "epoch": 2.3766978569272563, + "grad_norm": 0.142578125, + "learning_rate": 4.2839352428393524e-05, + "loss": 0.6244, + "step": 3936 + }, + { + "epoch": 2.3773015393902805, + "grad_norm": 0.150390625, + "learning_rate": 4.279784142797842e-05, + "loss": 0.6411, + "step": 3937 + }, + { + "epoch": 2.377905221853305, + "grad_norm": 0.1396484375, + "learning_rate": 4.2756330427563304e-05, + "loss": 0.517, + "step": 3938 + }, + { + "epoch": 2.3785089043163294, + "grad_norm": 0.14453125, + "learning_rate": 4.27148194271482e-05, + "loss": 0.8545, + "step": 3939 + }, + { + "epoch": 2.379112586779354, + "grad_norm": 0.154296875, + "learning_rate": 4.267330842673309e-05, + "loss": 0.9618, + "step": 3940 + }, + { + "epoch": 2.3797162692423783, + "grad_norm": 0.1494140625, + "learning_rate": 4.2631797426317977e-05, + "loss": 0.6273, + "step": 3941 + }, + { + "epoch": 2.380319951705403, + "grad_norm": 0.13671875, + "learning_rate": 4.259028642590286e-05, + "loss": 0.9013, + "step": 3942 + }, + { + "epoch": 2.3809236341684272, + "grad_norm": 0.1494140625, + "learning_rate": 4.2548775425487756e-05, + "loss": 0.8099, + "step": 3943 + }, + { + "epoch": 2.381527316631452, + "grad_norm": 0.1494140625, + "learning_rate": 4.250726442507264e-05, + "loss": 0.633, + "step": 3944 + }, + { + "epoch": 2.382130999094476, + "grad_norm": 0.1640625, + "learning_rate": 4.2465753424657536e-05, + "loss": 0.5849, + "step": 3945 + }, + { + "epoch": 2.382734681557501, + "grad_norm": 0.1376953125, + "learning_rate": 4.242424242424243e-05, + "loss": 0.5141, + "step": 3946 + }, + { + "epoch": 2.383338364020525, + "grad_norm": 0.142578125, + "learning_rate": 4.2382731423827316e-05, + "loss": 0.5301, + "step": 3947 + }, + { + "epoch": 2.3839420464835497, + "grad_norm": 0.142578125, + "learning_rate": 4.234122042341221e-05, + "loss": 0.5945, + "step": 3948 + }, + { + "epoch": 2.3845457289465744, + "grad_norm": 0.146484375, + "learning_rate": 4.2299709422997096e-05, + "loss": 0.5041, + "step": 3949 + }, + { + "epoch": 2.3851494114095986, + "grad_norm": 0.15625, + "learning_rate": 4.225819842258198e-05, + "loss": 0.5563, + "step": 3950 + }, + { + "epoch": 2.385753093872623, + "grad_norm": 0.1611328125, + "learning_rate": 4.2216687422166875e-05, + "loss": 0.5933, + "step": 3951 + }, + { + "epoch": 2.3863567763356475, + "grad_norm": 0.1650390625, + "learning_rate": 4.217517642175177e-05, + "loss": 0.5283, + "step": 3952 + }, + { + "epoch": 2.386960458798672, + "grad_norm": 0.171875, + "learning_rate": 4.2133665421336655e-05, + "loss": 0.5161, + "step": 3953 + }, + { + "epoch": 2.3875641412616964, + "grad_norm": 0.1708984375, + "learning_rate": 4.209215442092155e-05, + "loss": 0.5154, + "step": 3954 + }, + { + "epoch": 2.3881678237247206, + "grad_norm": 0.1630859375, + "learning_rate": 4.2050643420506435e-05, + "loss": 0.4278, + "step": 3955 + }, + { + "epoch": 2.3887715061877453, + "grad_norm": 0.177734375, + "learning_rate": 4.200913242009132e-05, + "loss": 0.5358, + "step": 3956 + }, + { + "epoch": 2.38937518865077, + "grad_norm": 0.1796875, + "learning_rate": 4.1967621419676215e-05, + "loss": 0.4306, + "step": 3957 + }, + { + "epoch": 2.389978871113794, + "grad_norm": 0.1904296875, + "learning_rate": 4.192611041926111e-05, + "loss": 0.4228, + "step": 3958 + }, + { + "epoch": 2.3905825535768184, + "grad_norm": 0.1826171875, + "learning_rate": 4.1884599418846e-05, + "loss": 0.3486, + "step": 3959 + }, + { + "epoch": 2.391186236039843, + "grad_norm": 0.205078125, + "learning_rate": 4.184308841843089e-05, + "loss": 0.3173, + "step": 3960 + }, + { + "epoch": 2.3917899185028677, + "grad_norm": 0.212890625, + "learning_rate": 4.1801577418015774e-05, + "loss": 0.328, + "step": 3961 + }, + { + "epoch": 2.392393600965892, + "grad_norm": 0.2001953125, + "learning_rate": 4.176006641760067e-05, + "loss": 0.2109, + "step": 3962 + }, + { + "epoch": 2.392997283428916, + "grad_norm": 0.14453125, + "learning_rate": 4.1718555417185554e-05, + "loss": 0.7375, + "step": 3963 + }, + { + "epoch": 2.393600965891941, + "grad_norm": 0.146484375, + "learning_rate": 4.167704441677045e-05, + "loss": 0.515, + "step": 3964 + }, + { + "epoch": 2.3942046483549655, + "grad_norm": 0.150390625, + "learning_rate": 4.163553341635534e-05, + "loss": 0.6026, + "step": 3965 + }, + { + "epoch": 2.3948083308179897, + "grad_norm": 0.154296875, + "learning_rate": 4.159402241594023e-05, + "loss": 0.5353, + "step": 3966 + }, + { + "epoch": 2.395412013281014, + "grad_norm": 0.1376953125, + "learning_rate": 4.155251141552511e-05, + "loss": 0.6007, + "step": 3967 + }, + { + "epoch": 2.3960156957440386, + "grad_norm": 0.134765625, + "learning_rate": 4.1511000415110007e-05, + "loss": 0.5639, + "step": 3968 + }, + { + "epoch": 2.3966193782070633, + "grad_norm": 0.1396484375, + "learning_rate": 4.146948941469489e-05, + "loss": 0.5642, + "step": 3969 + }, + { + "epoch": 2.3972230606700875, + "grad_norm": 0.146484375, + "learning_rate": 4.1427978414279786e-05, + "loss": 0.584, + "step": 3970 + }, + { + "epoch": 2.397826743133112, + "grad_norm": 0.1416015625, + "learning_rate": 4.138646741386468e-05, + "loss": 0.5897, + "step": 3971 + }, + { + "epoch": 2.3984304255961364, + "grad_norm": 0.1484375, + "learning_rate": 4.1344956413449566e-05, + "loss": 0.7358, + "step": 3972 + }, + { + "epoch": 2.399034108059161, + "grad_norm": 0.177734375, + "learning_rate": 4.130344541303446e-05, + "loss": 0.5912, + "step": 3973 + }, + { + "epoch": 2.3996377905221853, + "grad_norm": 0.158203125, + "learning_rate": 4.1261934412619346e-05, + "loss": 0.8172, + "step": 3974 + }, + { + "epoch": 2.40024147298521, + "grad_norm": 0.1376953125, + "learning_rate": 4.122042341220423e-05, + "loss": 0.8762, + "step": 3975 + }, + { + "epoch": 2.400845155448234, + "grad_norm": 0.1484375, + "learning_rate": 4.1178912411789126e-05, + "loss": 0.6193, + "step": 3976 + }, + { + "epoch": 2.401448837911259, + "grad_norm": 0.1591796875, + "learning_rate": 4.113740141137402e-05, + "loss": 0.6507, + "step": 3977 + }, + { + "epoch": 2.402052520374283, + "grad_norm": 0.158203125, + "learning_rate": 4.1095890410958905e-05, + "loss": 0.6294, + "step": 3978 + }, + { + "epoch": 2.4026562028373077, + "grad_norm": 0.1669921875, + "learning_rate": 4.10543794105438e-05, + "loss": 0.6852, + "step": 3979 + }, + { + "epoch": 2.403259885300332, + "grad_norm": 0.189453125, + "learning_rate": 4.1012868410128685e-05, + "loss": 0.6219, + "step": 3980 + }, + { + "epoch": 2.4038635677633566, + "grad_norm": 0.138671875, + "learning_rate": 4.097135740971358e-05, + "loss": 0.5864, + "step": 3981 + }, + { + "epoch": 2.404467250226381, + "grad_norm": 0.1572265625, + "learning_rate": 4.0929846409298465e-05, + "loss": 0.5816, + "step": 3982 + }, + { + "epoch": 2.4050709326894055, + "grad_norm": 0.1689453125, + "learning_rate": 4.088833540888336e-05, + "loss": 0.7003, + "step": 3983 + }, + { + "epoch": 2.4056746151524298, + "grad_norm": 0.140625, + "learning_rate": 4.084682440846825e-05, + "loss": 0.6031, + "step": 3984 + }, + { + "epoch": 2.4062782976154544, + "grad_norm": 0.1435546875, + "learning_rate": 4.080531340805314e-05, + "loss": 0.9704, + "step": 3985 + }, + { + "epoch": 2.4068819800784786, + "grad_norm": 0.142578125, + "learning_rate": 4.0763802407638024e-05, + "loss": 0.5736, + "step": 3986 + }, + { + "epoch": 2.4074856625415033, + "grad_norm": 0.1455078125, + "learning_rate": 4.072229140722292e-05, + "loss": 0.8293, + "step": 3987 + }, + { + "epoch": 2.4080893450045275, + "grad_norm": 0.1494140625, + "learning_rate": 4.0680780406807804e-05, + "loss": 0.5689, + "step": 3988 + }, + { + "epoch": 2.408693027467552, + "grad_norm": 0.1396484375, + "learning_rate": 4.063926940639269e-05, + "loss": 0.6113, + "step": 3989 + }, + { + "epoch": 2.4092967099305764, + "grad_norm": 0.1298828125, + "learning_rate": 4.059775840597759e-05, + "loss": 0.5141, + "step": 3990 + }, + { + "epoch": 2.409900392393601, + "grad_norm": 0.1435546875, + "learning_rate": 4.055624740556248e-05, + "loss": 0.8989, + "step": 3991 + }, + { + "epoch": 2.4105040748566253, + "grad_norm": 0.140625, + "learning_rate": 4.0514736405147363e-05, + "loss": 0.5546, + "step": 3992 + }, + { + "epoch": 2.41110775731965, + "grad_norm": 0.1435546875, + "learning_rate": 4.047322540473226e-05, + "loss": 0.5768, + "step": 3993 + }, + { + "epoch": 2.411711439782674, + "grad_norm": 0.373046875, + "learning_rate": 4.043171440431714e-05, + "loss": 0.5518, + "step": 3994 + }, + { + "epoch": 2.412315122245699, + "grad_norm": 0.1328125, + "learning_rate": 4.0390203403902036e-05, + "loss": 0.5835, + "step": 3995 + }, + { + "epoch": 2.412918804708723, + "grad_norm": 0.14453125, + "learning_rate": 4.034869240348692e-05, + "loss": 0.8638, + "step": 3996 + }, + { + "epoch": 2.4135224871717478, + "grad_norm": 0.138671875, + "learning_rate": 4.0307181403071816e-05, + "loss": 0.5474, + "step": 3997 + }, + { + "epoch": 2.414126169634772, + "grad_norm": 0.13671875, + "learning_rate": 4.026567040265671e-05, + "loss": 0.5179, + "step": 3998 + }, + { + "epoch": 2.4147298520977967, + "grad_norm": 0.150390625, + "learning_rate": 4.0224159402241596e-05, + "loss": 0.5755, + "step": 3999 + }, + { + "epoch": 2.415333534560821, + "grad_norm": 0.15234375, + "learning_rate": 4.018264840182648e-05, + "loss": 0.4952, + "step": 4000 + }, + { + "epoch": 2.415333534560821, + "eval_loss": 0.6154825687408447, + "eval_runtime": 1059.7325, + "eval_samples_per_second": 2.633, + "eval_steps_per_second": 0.329, + "step": 4000 + }, + { + "epoch": 2.4159372170238456, + "grad_norm": 0.1552734375, + "learning_rate": 4.0141137401411376e-05, + "loss": 0.5103, + "step": 4001 + }, + { + "epoch": 2.41654089948687, + "grad_norm": 0.1650390625, + "learning_rate": 4.009962640099626e-05, + "loss": 0.5184, + "step": 4002 + }, + { + "epoch": 2.4171445819498945, + "grad_norm": 0.1640625, + "learning_rate": 4.0058115400581155e-05, + "loss": 0.4408, + "step": 4003 + }, + { + "epoch": 2.4177482644129187, + "grad_norm": 0.1591796875, + "learning_rate": 4.001660440016605e-05, + "loss": 0.531, + "step": 4004 + }, + { + "epoch": 2.4183519468759433, + "grad_norm": 0.1845703125, + "learning_rate": 3.9975093399750935e-05, + "loss": 0.5268, + "step": 4005 + }, + { + "epoch": 2.4189556293389676, + "grad_norm": 0.1748046875, + "learning_rate": 3.993358239933583e-05, + "loss": 0.4319, + "step": 4006 + }, + { + "epoch": 2.4195593118019922, + "grad_norm": 0.1748046875, + "learning_rate": 3.9892071398920715e-05, + "loss": 0.3856, + "step": 4007 + }, + { + "epoch": 2.4201629942650165, + "grad_norm": 0.1982421875, + "learning_rate": 3.98505603985056e-05, + "loss": 0.4184, + "step": 4008 + }, + { + "epoch": 2.420766676728041, + "grad_norm": 0.2099609375, + "learning_rate": 3.9809049398090495e-05, + "loss": 0.4034, + "step": 4009 + }, + { + "epoch": 2.4213703591910654, + "grad_norm": 0.193359375, + "learning_rate": 3.976753839767539e-05, + "loss": 0.2712, + "step": 4010 + }, + { + "epoch": 2.42197404165409, + "grad_norm": 0.1875, + "learning_rate": 3.9726027397260274e-05, + "loss": 0.2546, + "step": 4011 + }, + { + "epoch": 2.4225777241171143, + "grad_norm": 0.185546875, + "learning_rate": 3.968451639684517e-05, + "loss": 0.2069, + "step": 4012 + }, + { + "epoch": 2.423181406580139, + "grad_norm": 0.16015625, + "learning_rate": 3.9643005396430054e-05, + "loss": 0.6653, + "step": 4013 + }, + { + "epoch": 2.423785089043163, + "grad_norm": 0.1484375, + "learning_rate": 3.960149439601494e-05, + "loss": 0.5395, + "step": 4014 + }, + { + "epoch": 2.424388771506188, + "grad_norm": 0.1357421875, + "learning_rate": 3.9559983395599834e-05, + "loss": 0.6241, + "step": 4015 + }, + { + "epoch": 2.424992453969212, + "grad_norm": 0.1357421875, + "learning_rate": 3.951847239518473e-05, + "loss": 0.628, + "step": 4016 + }, + { + "epoch": 2.4255961364322367, + "grad_norm": 0.1640625, + "learning_rate": 3.947696139476962e-05, + "loss": 0.6188, + "step": 4017 + }, + { + "epoch": 2.426199818895261, + "grad_norm": 0.15234375, + "learning_rate": 3.943545039435451e-05, + "loss": 0.7127, + "step": 4018 + }, + { + "epoch": 2.4268035013582856, + "grad_norm": 0.154296875, + "learning_rate": 3.939393939393939e-05, + "loss": 0.8119, + "step": 4019 + }, + { + "epoch": 2.42740718382131, + "grad_norm": 0.1484375, + "learning_rate": 3.935242839352429e-05, + "loss": 0.5477, + "step": 4020 + }, + { + "epoch": 2.4280108662843345, + "grad_norm": 0.138671875, + "learning_rate": 3.931091739310917e-05, + "loss": 0.557, + "step": 4021 + }, + { + "epoch": 2.4286145487473587, + "grad_norm": 0.1435546875, + "learning_rate": 3.9269406392694066e-05, + "loss": 0.8675, + "step": 4022 + }, + { + "epoch": 2.4292182312103834, + "grad_norm": 0.1328125, + "learning_rate": 3.922789539227896e-05, + "loss": 0.7214, + "step": 4023 + }, + { + "epoch": 2.4298219136734076, + "grad_norm": 0.14453125, + "learning_rate": 3.9186384391863846e-05, + "loss": 0.6073, + "step": 4024 + }, + { + "epoch": 2.4304255961364323, + "grad_norm": 0.1376953125, + "learning_rate": 3.914487339144873e-05, + "loss": 0.6403, + "step": 4025 + }, + { + "epoch": 2.4310292785994565, + "grad_norm": 0.146484375, + "learning_rate": 3.9103362391033626e-05, + "loss": 0.7011, + "step": 4026 + }, + { + "epoch": 2.431632961062481, + "grad_norm": 0.1416015625, + "learning_rate": 3.906185139061851e-05, + "loss": 0.6148, + "step": 4027 + }, + { + "epoch": 2.4322366435255054, + "grad_norm": 0.1533203125, + "learning_rate": 3.9020340390203406e-05, + "loss": 0.743, + "step": 4028 + }, + { + "epoch": 2.43284032598853, + "grad_norm": 0.1494140625, + "learning_rate": 3.89788293897883e-05, + "loss": 0.6212, + "step": 4029 + }, + { + "epoch": 2.4334440084515547, + "grad_norm": 0.1513671875, + "learning_rate": 3.8937318389373185e-05, + "loss": 0.5285, + "step": 4030 + }, + { + "epoch": 2.434047690914579, + "grad_norm": 0.140625, + "learning_rate": 3.889580738895808e-05, + "loss": 0.7145, + "step": 4031 + }, + { + "epoch": 2.434651373377603, + "grad_norm": 0.1865234375, + "learning_rate": 3.8854296388542965e-05, + "loss": 0.6085, + "step": 4032 + }, + { + "epoch": 2.435255055840628, + "grad_norm": 0.1474609375, + "learning_rate": 3.881278538812785e-05, + "loss": 0.8197, + "step": 4033 + }, + { + "epoch": 2.4358587383036525, + "grad_norm": 0.142578125, + "learning_rate": 3.8771274387712745e-05, + "loss": 0.5681, + "step": 4034 + }, + { + "epoch": 2.4364624207666767, + "grad_norm": 0.1640625, + "learning_rate": 3.872976338729764e-05, + "loss": 0.6784, + "step": 4035 + }, + { + "epoch": 2.437066103229701, + "grad_norm": 0.15234375, + "learning_rate": 3.8688252386882525e-05, + "loss": 1.1645, + "step": 4036 + }, + { + "epoch": 2.4376697856927256, + "grad_norm": 0.15234375, + "learning_rate": 3.864674138646742e-05, + "loss": 0.8407, + "step": 4037 + }, + { + "epoch": 2.4382734681557503, + "grad_norm": 0.17578125, + "learning_rate": 3.8605230386052304e-05, + "loss": 0.5816, + "step": 4038 + }, + { + "epoch": 2.4388771506187745, + "grad_norm": 0.208984375, + "learning_rate": 3.85637193856372e-05, + "loss": 0.9043, + "step": 4039 + }, + { + "epoch": 2.4394808330817987, + "grad_norm": 0.1435546875, + "learning_rate": 3.8522208385222084e-05, + "loss": 0.5051, + "step": 4040 + }, + { + "epoch": 2.4400845155448234, + "grad_norm": 0.1650390625, + "learning_rate": 3.848069738480698e-05, + "loss": 0.5917, + "step": 4041 + }, + { + "epoch": 2.440688198007848, + "grad_norm": 0.15625, + "learning_rate": 3.843918638439187e-05, + "loss": 0.6743, + "step": 4042 + }, + { + "epoch": 2.4412918804708723, + "grad_norm": 0.1455078125, + "learning_rate": 3.839767538397676e-05, + "loss": 0.6768, + "step": 4043 + }, + { + "epoch": 2.4418955629338965, + "grad_norm": 0.1416015625, + "learning_rate": 3.8356164383561644e-05, + "loss": 0.7161, + "step": 4044 + }, + { + "epoch": 2.442499245396921, + "grad_norm": 0.1416015625, + "learning_rate": 3.831465338314654e-05, + "loss": 0.5666, + "step": 4045 + }, + { + "epoch": 2.443102927859946, + "grad_norm": 0.1435546875, + "learning_rate": 3.827314238273142e-05, + "loss": 0.5713, + "step": 4046 + }, + { + "epoch": 2.44370661032297, + "grad_norm": 0.142578125, + "learning_rate": 3.823163138231631e-05, + "loss": 0.5426, + "step": 4047 + }, + { + "epoch": 2.4443102927859943, + "grad_norm": 0.1533203125, + "learning_rate": 3.819012038190121e-05, + "loss": 0.5295, + "step": 4048 + }, + { + "epoch": 2.444913975249019, + "grad_norm": 0.158203125, + "learning_rate": 3.8148609381486096e-05, + "loss": 0.5478, + "step": 4049 + }, + { + "epoch": 2.4455176577120437, + "grad_norm": 0.1748046875, + "learning_rate": 3.810709838107099e-05, + "loss": 0.5445, + "step": 4050 + }, + { + "epoch": 2.446121340175068, + "grad_norm": 0.185546875, + "learning_rate": 3.8065587380655876e-05, + "loss": 0.5635, + "step": 4051 + }, + { + "epoch": 2.4467250226380926, + "grad_norm": 0.1640625, + "learning_rate": 3.802407638024076e-05, + "loss": 0.5435, + "step": 4052 + }, + { + "epoch": 2.4473287051011168, + "grad_norm": 0.1572265625, + "learning_rate": 3.7982565379825656e-05, + "loss": 0.4776, + "step": 4053 + }, + { + "epoch": 2.4479323875641414, + "grad_norm": 0.171875, + "learning_rate": 3.794105437941054e-05, + "loss": 0.4868, + "step": 4054 + }, + { + "epoch": 2.4485360700271657, + "grad_norm": 0.1787109375, + "learning_rate": 3.7899543378995436e-05, + "loss": 0.4523, + "step": 4055 + }, + { + "epoch": 2.4491397524901903, + "grad_norm": 0.1962890625, + "learning_rate": 3.785803237858033e-05, + "loss": 0.3794, + "step": 4056 + }, + { + "epoch": 2.4497434349532146, + "grad_norm": 0.193359375, + "learning_rate": 3.7816521378165215e-05, + "loss": 0.4109, + "step": 4057 + }, + { + "epoch": 2.4503471174162392, + "grad_norm": 0.2001953125, + "learning_rate": 3.77750103777501e-05, + "loss": 0.3978, + "step": 4058 + }, + { + "epoch": 2.4509507998792635, + "grad_norm": 0.189453125, + "learning_rate": 3.7733499377334995e-05, + "loss": 0.3167, + "step": 4059 + }, + { + "epoch": 2.451554482342288, + "grad_norm": 0.2041015625, + "learning_rate": 3.769198837691988e-05, + "loss": 0.3251, + "step": 4060 + }, + { + "epoch": 2.4521581648053123, + "grad_norm": 0.189453125, + "learning_rate": 3.7650477376504775e-05, + "loss": 0.241, + "step": 4061 + }, + { + "epoch": 2.452761847268337, + "grad_norm": 0.20703125, + "learning_rate": 3.760896637608967e-05, + "loss": 0.2344, + "step": 4062 + }, + { + "epoch": 2.4533655297313612, + "grad_norm": 0.1435546875, + "learning_rate": 3.7567455375674554e-05, + "loss": 0.6431, + "step": 4063 + }, + { + "epoch": 2.453969212194386, + "grad_norm": 0.14453125, + "learning_rate": 3.752594437525945e-05, + "loss": 0.5608, + "step": 4064 + }, + { + "epoch": 2.45457289465741, + "grad_norm": 0.158203125, + "learning_rate": 3.7484433374844334e-05, + "loss": 0.6186, + "step": 4065 + }, + { + "epoch": 2.455176577120435, + "grad_norm": 0.1337890625, + "learning_rate": 3.744292237442922e-05, + "loss": 0.5959, + "step": 4066 + }, + { + "epoch": 2.455780259583459, + "grad_norm": 0.1552734375, + "learning_rate": 3.7401411374014114e-05, + "loss": 0.5077, + "step": 4067 + }, + { + "epoch": 2.4563839420464837, + "grad_norm": 0.1513671875, + "learning_rate": 3.735990037359901e-05, + "loss": 0.578, + "step": 4068 + }, + { + "epoch": 2.456987624509508, + "grad_norm": 0.154296875, + "learning_rate": 3.7318389373183894e-05, + "loss": 0.6015, + "step": 4069 + }, + { + "epoch": 2.4575913069725326, + "grad_norm": 0.146484375, + "learning_rate": 3.727687837276879e-05, + "loss": 0.6267, + "step": 4070 + }, + { + "epoch": 2.458194989435557, + "grad_norm": 0.1494140625, + "learning_rate": 3.7235367372353673e-05, + "loss": 0.6992, + "step": 4071 + }, + { + "epoch": 2.4587986718985815, + "grad_norm": 0.15234375, + "learning_rate": 3.719385637193857e-05, + "loss": 0.6082, + "step": 4072 + }, + { + "epoch": 2.4594023543616057, + "grad_norm": 0.150390625, + "learning_rate": 3.715234537152345e-05, + "loss": 0.6808, + "step": 4073 + }, + { + "epoch": 2.4600060368246304, + "grad_norm": 0.154296875, + "learning_rate": 3.7110834371108346e-05, + "loss": 0.6117, + "step": 4074 + }, + { + "epoch": 2.4606097192876546, + "grad_norm": 0.146484375, + "learning_rate": 3.706932337069324e-05, + "loss": 0.9315, + "step": 4075 + }, + { + "epoch": 2.4612134017506793, + "grad_norm": 0.1513671875, + "learning_rate": 3.7027812370278126e-05, + "loss": 0.647, + "step": 4076 + }, + { + "epoch": 2.4618170842137035, + "grad_norm": 0.1611328125, + "learning_rate": 3.698630136986301e-05, + "loss": 0.7295, + "step": 4077 + }, + { + "epoch": 2.462420766676728, + "grad_norm": 0.1494140625, + "learning_rate": 3.6944790369447906e-05, + "loss": 0.6171, + "step": 4078 + }, + { + "epoch": 2.4630244491397524, + "grad_norm": 0.1474609375, + "learning_rate": 3.690327936903279e-05, + "loss": 0.698, + "step": 4079 + }, + { + "epoch": 2.463628131602777, + "grad_norm": 0.1435546875, + "learning_rate": 3.6861768368617686e-05, + "loss": 0.5728, + "step": 4080 + }, + { + "epoch": 2.4642318140658013, + "grad_norm": 0.13671875, + "learning_rate": 3.682025736820258e-05, + "loss": 0.6312, + "step": 4081 + }, + { + "epoch": 2.464835496528826, + "grad_norm": 0.1533203125, + "learning_rate": 3.6778746367787465e-05, + "loss": 0.6361, + "step": 4082 + }, + { + "epoch": 2.46543917899185, + "grad_norm": 0.138671875, + "learning_rate": 3.673723536737236e-05, + "loss": 0.6073, + "step": 4083 + }, + { + "epoch": 2.466042861454875, + "grad_norm": 0.16796875, + "learning_rate": 3.6695724366957245e-05, + "loss": 0.5762, + "step": 4084 + }, + { + "epoch": 2.466646543917899, + "grad_norm": 0.146484375, + "learning_rate": 3.665421336654213e-05, + "loss": 0.704, + "step": 4085 + }, + { + "epoch": 2.4672502263809237, + "grad_norm": 0.2255859375, + "learning_rate": 3.6612702366127025e-05, + "loss": 0.5739, + "step": 4086 + }, + { + "epoch": 2.467853908843948, + "grad_norm": 0.1640625, + "learning_rate": 3.657119136571192e-05, + "loss": 0.6475, + "step": 4087 + }, + { + "epoch": 2.4684575913069726, + "grad_norm": 0.1591796875, + "learning_rate": 3.6529680365296805e-05, + "loss": 0.6407, + "step": 4088 + }, + { + "epoch": 2.469061273769997, + "grad_norm": 0.1357421875, + "learning_rate": 3.64881693648817e-05, + "loss": 0.6102, + "step": 4089 + }, + { + "epoch": 2.4696649562330215, + "grad_norm": 0.138671875, + "learning_rate": 3.6446658364466584e-05, + "loss": 0.5091, + "step": 4090 + }, + { + "epoch": 2.4702686386960457, + "grad_norm": 0.146484375, + "learning_rate": 3.640514736405147e-05, + "loss": 0.5409, + "step": 4091 + }, + { + "epoch": 2.4708723211590704, + "grad_norm": 0.142578125, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.7817, + "step": 4092 + }, + { + "epoch": 2.4714760036220946, + "grad_norm": 0.1513671875, + "learning_rate": 3.632212536322126e-05, + "loss": 0.604, + "step": 4093 + }, + { + "epoch": 2.4720796860851193, + "grad_norm": 0.1396484375, + "learning_rate": 3.628061436280615e-05, + "loss": 0.6486, + "step": 4094 + }, + { + "epoch": 2.4726833685481435, + "grad_norm": 0.1552734375, + "learning_rate": 3.623910336239104e-05, + "loss": 0.5922, + "step": 4095 + }, + { + "epoch": 2.473287051011168, + "grad_norm": 0.1494140625, + "learning_rate": 3.6197592361975924e-05, + "loss": 0.5334, + "step": 4096 + }, + { + "epoch": 2.4738907334741924, + "grad_norm": 0.166015625, + "learning_rate": 3.615608136156082e-05, + "loss": 0.5537, + "step": 4097 + }, + { + "epoch": 2.474494415937217, + "grad_norm": 0.146484375, + "learning_rate": 3.61145703611457e-05, + "loss": 0.5546, + "step": 4098 + }, + { + "epoch": 2.4750980984002413, + "grad_norm": 0.1572265625, + "learning_rate": 3.60730593607306e-05, + "loss": 0.4821, + "step": 4099 + }, + { + "epoch": 2.475701780863266, + "grad_norm": 0.1650390625, + "learning_rate": 3.603154836031549e-05, + "loss": 0.5078, + "step": 4100 + }, + { + "epoch": 2.47630546332629, + "grad_norm": 0.1552734375, + "learning_rate": 3.5990037359900376e-05, + "loss": 0.4714, + "step": 4101 + }, + { + "epoch": 2.476909145789315, + "grad_norm": 0.17578125, + "learning_rate": 3.594852635948526e-05, + "loss": 0.5351, + "step": 4102 + }, + { + "epoch": 2.477512828252339, + "grad_norm": 0.169921875, + "learning_rate": 3.5907015359070156e-05, + "loss": 0.5389, + "step": 4103 + }, + { + "epoch": 2.4781165107153638, + "grad_norm": 0.1796875, + "learning_rate": 3.586550435865504e-05, + "loss": 0.5224, + "step": 4104 + }, + { + "epoch": 2.478720193178388, + "grad_norm": 0.18359375, + "learning_rate": 3.5823993358239936e-05, + "loss": 0.4301, + "step": 4105 + }, + { + "epoch": 2.4793238756414127, + "grad_norm": 0.181640625, + "learning_rate": 3.578248235782483e-05, + "loss": 0.4746, + "step": 4106 + }, + { + "epoch": 2.479927558104437, + "grad_norm": 0.19140625, + "learning_rate": 3.5740971357409716e-05, + "loss": 0.3572, + "step": 4107 + }, + { + "epoch": 2.4805312405674615, + "grad_norm": 0.2041015625, + "learning_rate": 3.569946035699461e-05, + "loss": 0.4036, + "step": 4108 + }, + { + "epoch": 2.4811349230304858, + "grad_norm": 0.21484375, + "learning_rate": 3.5657949356579495e-05, + "loss": 0.3364, + "step": 4109 + }, + { + "epoch": 2.4817386054935104, + "grad_norm": 0.1962890625, + "learning_rate": 3.561643835616438e-05, + "loss": 0.3264, + "step": 4110 + }, + { + "epoch": 2.482342287956535, + "grad_norm": 0.2119140625, + "learning_rate": 3.5574927355749275e-05, + "loss": 0.2525, + "step": 4111 + }, + { + "epoch": 2.4829459704195593, + "grad_norm": 0.353515625, + "learning_rate": 3.553341635533416e-05, + "loss": 0.2355, + "step": 4112 + }, + { + "epoch": 2.4835496528825836, + "grad_norm": 0.13671875, + "learning_rate": 3.5491905354919055e-05, + "loss": 0.6404, + "step": 4113 + }, + { + "epoch": 2.4841533353456082, + "grad_norm": 0.15625, + "learning_rate": 3.545039435450395e-05, + "loss": 0.6651, + "step": 4114 + }, + { + "epoch": 2.484757017808633, + "grad_norm": 0.146484375, + "learning_rate": 3.5408883354088835e-05, + "loss": 0.685, + "step": 4115 + }, + { + "epoch": 2.485360700271657, + "grad_norm": 0.1357421875, + "learning_rate": 3.536737235367373e-05, + "loss": 0.5109, + "step": 4116 + }, + { + "epoch": 2.4859643827346813, + "grad_norm": 0.140625, + "learning_rate": 3.5325861353258614e-05, + "loss": 0.7178, + "step": 4117 + }, + { + "epoch": 2.486568065197706, + "grad_norm": 0.1513671875, + "learning_rate": 3.52843503528435e-05, + "loss": 0.5863, + "step": 4118 + }, + { + "epoch": 2.4871717476607307, + "grad_norm": 0.146484375, + "learning_rate": 3.5242839352428394e-05, + "loss": 0.6693, + "step": 4119 + }, + { + "epoch": 2.487775430123755, + "grad_norm": 0.1513671875, + "learning_rate": 3.520132835201329e-05, + "loss": 0.6018, + "step": 4120 + }, + { + "epoch": 2.488379112586779, + "grad_norm": 0.1494140625, + "learning_rate": 3.5159817351598174e-05, + "loss": 0.6141, + "step": 4121 + }, + { + "epoch": 2.488982795049804, + "grad_norm": 0.1552734375, + "learning_rate": 3.511830635118307e-05, + "loss": 0.7566, + "step": 4122 + }, + { + "epoch": 2.4895864775128285, + "grad_norm": 0.1513671875, + "learning_rate": 3.5076795350767954e-05, + "loss": 0.6085, + "step": 4123 + }, + { + "epoch": 2.4901901599758527, + "grad_norm": 0.14453125, + "learning_rate": 3.503528435035284e-05, + "loss": 0.6169, + "step": 4124 + }, + { + "epoch": 2.490793842438877, + "grad_norm": 0.1494140625, + "learning_rate": 3.499377334993773e-05, + "loss": 1.1005, + "step": 4125 + }, + { + "epoch": 2.4913975249019016, + "grad_norm": 0.1611328125, + "learning_rate": 3.4952262349522627e-05, + "loss": 0.6369, + "step": 4126 + }, + { + "epoch": 2.4920012073649263, + "grad_norm": 0.140625, + "learning_rate": 3.491075134910752e-05, + "loss": 0.7739, + "step": 4127 + }, + { + "epoch": 2.4926048898279505, + "grad_norm": 0.1416015625, + "learning_rate": 3.4869240348692406e-05, + "loss": 0.6243, + "step": 4128 + }, + { + "epoch": 2.4932085722909747, + "grad_norm": 0.158203125, + "learning_rate": 3.482772934827729e-05, + "loss": 0.6468, + "step": 4129 + }, + { + "epoch": 2.4938122547539994, + "grad_norm": 0.1474609375, + "learning_rate": 3.4786218347862186e-05, + "loss": 0.6467, + "step": 4130 + }, + { + "epoch": 2.494415937217024, + "grad_norm": 0.1474609375, + "learning_rate": 3.474470734744707e-05, + "loss": 0.6936, + "step": 4131 + }, + { + "epoch": 2.4950196196800483, + "grad_norm": 0.138671875, + "learning_rate": 3.4703196347031966e-05, + "loss": 0.6583, + "step": 4132 + }, + { + "epoch": 2.495623302143073, + "grad_norm": 0.13671875, + "learning_rate": 3.466168534661686e-05, + "loss": 0.55, + "step": 4133 + }, + { + "epoch": 2.496226984606097, + "grad_norm": 0.14453125, + "learning_rate": 3.4620174346201746e-05, + "loss": 0.6066, + "step": 4134 + }, + { + "epoch": 2.496830667069122, + "grad_norm": 0.1328125, + "learning_rate": 3.457866334578663e-05, + "loss": 0.6148, + "step": 4135 + }, + { + "epoch": 2.497434349532146, + "grad_norm": 0.1455078125, + "learning_rate": 3.4537152345371525e-05, + "loss": 0.6258, + "step": 4136 + }, + { + "epoch": 2.4980380319951707, + "grad_norm": 0.142578125, + "learning_rate": 3.449564134495641e-05, + "loss": 0.6906, + "step": 4137 + }, + { + "epoch": 2.498641714458195, + "grad_norm": 0.1435546875, + "learning_rate": 3.4454130344541305e-05, + "loss": 0.601, + "step": 4138 + }, + { + "epoch": 2.4992453969212196, + "grad_norm": 0.1435546875, + "learning_rate": 3.44126193441262e-05, + "loss": 0.6109, + "step": 4139 + }, + { + "epoch": 2.499849079384244, + "grad_norm": 0.138671875, + "learning_rate": 3.4371108343711085e-05, + "loss": 0.6139, + "step": 4140 + }, + { + "epoch": 2.5004527618472685, + "grad_norm": 0.1357421875, + "learning_rate": 3.432959734329598e-05, + "loss": 0.4505, + "step": 4141 + }, + { + "epoch": 2.5010564443102927, + "grad_norm": 0.1376953125, + "learning_rate": 3.4288086342880864e-05, + "loss": 0.5743, + "step": 4142 + }, + { + "epoch": 2.5016601267733174, + "grad_norm": 0.138671875, + "learning_rate": 3.424657534246575e-05, + "loss": 0.6745, + "step": 4143 + }, + { + "epoch": 2.5022638092363416, + "grad_norm": 0.1396484375, + "learning_rate": 3.4205064342050644e-05, + "loss": 0.7918, + "step": 4144 + }, + { + "epoch": 2.5028674916993663, + "grad_norm": 0.1533203125, + "learning_rate": 3.416355334163554e-05, + "loss": 0.5862, + "step": 4145 + }, + { + "epoch": 2.5034711741623905, + "grad_norm": 0.1435546875, + "learning_rate": 3.4122042341220424e-05, + "loss": 0.6077, + "step": 4146 + }, + { + "epoch": 2.504074856625415, + "grad_norm": 0.142578125, + "learning_rate": 3.408053134080532e-05, + "loss": 0.5726, + "step": 4147 + }, + { + "epoch": 2.5046785390884394, + "grad_norm": 0.1435546875, + "learning_rate": 3.4039020340390204e-05, + "loss": 0.5069, + "step": 4148 + }, + { + "epoch": 2.505282221551464, + "grad_norm": 0.1474609375, + "learning_rate": 3.39975093399751e-05, + "loss": 0.5979, + "step": 4149 + }, + { + "epoch": 2.5058859040144883, + "grad_norm": 0.1552734375, + "learning_rate": 3.3955998339559983e-05, + "loss": 0.5018, + "step": 4150 + }, + { + "epoch": 2.506489586477513, + "grad_norm": 0.158203125, + "learning_rate": 3.391448733914488e-05, + "loss": 0.4882, + "step": 4151 + }, + { + "epoch": 2.507093268940537, + "grad_norm": 0.158203125, + "learning_rate": 3.387297633872977e-05, + "loss": 0.5086, + "step": 4152 + }, + { + "epoch": 2.507696951403562, + "grad_norm": 0.1669921875, + "learning_rate": 3.3831465338314656e-05, + "loss": 0.523, + "step": 4153 + }, + { + "epoch": 2.508300633866586, + "grad_norm": 0.1923828125, + "learning_rate": 3.378995433789954e-05, + "loss": 0.4776, + "step": 4154 + }, + { + "epoch": 2.5089043163296108, + "grad_norm": 0.1796875, + "learning_rate": 3.3748443337484436e-05, + "loss": 0.5318, + "step": 4155 + }, + { + "epoch": 2.509507998792635, + "grad_norm": 0.1875, + "learning_rate": 3.370693233706932e-05, + "loss": 0.4694, + "step": 4156 + }, + { + "epoch": 2.5101116812556596, + "grad_norm": 0.1865234375, + "learning_rate": 3.3665421336654216e-05, + "loss": 0.4135, + "step": 4157 + }, + { + "epoch": 2.510715363718684, + "grad_norm": 0.18359375, + "learning_rate": 3.362391033623911e-05, + "loss": 0.3618, + "step": 4158 + }, + { + "epoch": 2.5113190461817085, + "grad_norm": 0.20703125, + "learning_rate": 3.3582399335823996e-05, + "loss": 0.412, + "step": 4159 + }, + { + "epoch": 2.5119227286447328, + "grad_norm": 0.2080078125, + "learning_rate": 3.354088833540889e-05, + "loss": 0.3636, + "step": 4160 + }, + { + "epoch": 2.5125264111077574, + "grad_norm": 0.2021484375, + "learning_rate": 3.3499377334993775e-05, + "loss": 0.2725, + "step": 4161 + }, + { + "epoch": 2.5131300935707817, + "grad_norm": 0.2177734375, + "learning_rate": 3.345786633457866e-05, + "loss": 0.2317, + "step": 4162 + }, + { + "epoch": 2.5137337760338063, + "grad_norm": 0.1396484375, + "learning_rate": 3.3416355334163555e-05, + "loss": 0.5467, + "step": 4163 + }, + { + "epoch": 2.5143374584968305, + "grad_norm": 0.1455078125, + "learning_rate": 3.337484433374845e-05, + "loss": 0.6671, + "step": 4164 + }, + { + "epoch": 2.514941140959855, + "grad_norm": 0.1533203125, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.653, + "step": 4165 + }, + { + "epoch": 2.5155448234228794, + "grad_norm": 0.1533203125, + "learning_rate": 3.329182233291823e-05, + "loss": 0.6325, + "step": 4166 + }, + { + "epoch": 2.516148505885904, + "grad_norm": 0.1533203125, + "learning_rate": 3.3250311332503115e-05, + "loss": 0.593, + "step": 4167 + }, + { + "epoch": 2.5167521883489283, + "grad_norm": 0.162109375, + "learning_rate": 3.3208800332088e-05, + "loss": 0.6592, + "step": 4168 + }, + { + "epoch": 2.517355870811953, + "grad_norm": 0.15234375, + "learning_rate": 3.3167289331672894e-05, + "loss": 0.6527, + "step": 4169 + }, + { + "epoch": 2.5179595532749772, + "grad_norm": 0.1474609375, + "learning_rate": 3.312577833125778e-05, + "loss": 0.6054, + "step": 4170 + }, + { + "epoch": 2.518563235738002, + "grad_norm": 0.150390625, + "learning_rate": 3.308426733084268e-05, + "loss": 0.6671, + "step": 4171 + }, + { + "epoch": 2.519166918201026, + "grad_norm": 0.1318359375, + "learning_rate": 3.304275633042757e-05, + "loss": 0.527, + "step": 4172 + }, + { + "epoch": 2.519770600664051, + "grad_norm": 0.1474609375, + "learning_rate": 3.3001245330012454e-05, + "loss": 0.5836, + "step": 4173 + }, + { + "epoch": 2.520374283127075, + "grad_norm": 0.1416015625, + "learning_rate": 3.295973432959735e-05, + "loss": 0.5882, + "step": 4174 + }, + { + "epoch": 2.5209779655900997, + "grad_norm": 0.14453125, + "learning_rate": 3.2918223329182234e-05, + "loss": 0.6148, + "step": 4175 + }, + { + "epoch": 2.521581648053124, + "grad_norm": 0.16796875, + "learning_rate": 3.287671232876712e-05, + "loss": 0.6586, + "step": 4176 + }, + { + "epoch": 2.5221853305161486, + "grad_norm": 0.13671875, + "learning_rate": 3.2835201328352013e-05, + "loss": 0.496, + "step": 4177 + }, + { + "epoch": 2.522789012979173, + "grad_norm": 0.1396484375, + "learning_rate": 3.279369032793691e-05, + "loss": 0.6182, + "step": 4178 + }, + { + "epoch": 2.5233926954421975, + "grad_norm": 0.15625, + "learning_rate": 3.275217932752179e-05, + "loss": 0.5696, + "step": 4179 + }, + { + "epoch": 2.5239963779052217, + "grad_norm": 0.1474609375, + "learning_rate": 3.2710668327106686e-05, + "loss": 0.643, + "step": 4180 + }, + { + "epoch": 2.5246000603682464, + "grad_norm": 0.1474609375, + "learning_rate": 3.266915732669157e-05, + "loss": 0.6102, + "step": 4181 + }, + { + "epoch": 2.5252037428312706, + "grad_norm": 0.15234375, + "learning_rate": 3.2627646326276466e-05, + "loss": 0.5995, + "step": 4182 + }, + { + "epoch": 2.5258074252942952, + "grad_norm": 0.1630859375, + "learning_rate": 3.258613532586135e-05, + "loss": 0.5864, + "step": 4183 + }, + { + "epoch": 2.52641110775732, + "grad_norm": 0.1474609375, + "learning_rate": 3.2544624325446246e-05, + "loss": 0.5564, + "step": 4184 + }, + { + "epoch": 2.527014790220344, + "grad_norm": 0.15234375, + "learning_rate": 3.250311332503114e-05, + "loss": 0.5808, + "step": 4185 + }, + { + "epoch": 2.5276184726833684, + "grad_norm": 0.2119140625, + "learning_rate": 3.2461602324616026e-05, + "loss": 0.5563, + "step": 4186 + }, + { + "epoch": 2.528222155146393, + "grad_norm": 0.140625, + "learning_rate": 3.242009132420091e-05, + "loss": 0.5779, + "step": 4187 + }, + { + "epoch": 2.5288258376094177, + "grad_norm": 0.14453125, + "learning_rate": 3.2378580323785805e-05, + "loss": 0.6378, + "step": 4188 + }, + { + "epoch": 2.529429520072442, + "grad_norm": 0.1484375, + "learning_rate": 3.233706932337069e-05, + "loss": 0.5168, + "step": 4189 + }, + { + "epoch": 2.530033202535466, + "grad_norm": 0.146484375, + "learning_rate": 3.2295558322955585e-05, + "loss": 0.6191, + "step": 4190 + }, + { + "epoch": 2.530636884998491, + "grad_norm": 0.1474609375, + "learning_rate": 3.225404732254048e-05, + "loss": 0.6251, + "step": 4191 + }, + { + "epoch": 2.5312405674615155, + "grad_norm": 0.1533203125, + "learning_rate": 3.2212536322125365e-05, + "loss": 0.592, + "step": 4192 + }, + { + "epoch": 2.5318442499245397, + "grad_norm": 0.1474609375, + "learning_rate": 3.217102532171026e-05, + "loss": 0.5686, + "step": 4193 + }, + { + "epoch": 2.532447932387564, + "grad_norm": 0.138671875, + "learning_rate": 3.2129514321295145e-05, + "loss": 0.7272, + "step": 4194 + }, + { + "epoch": 2.5330516148505886, + "grad_norm": 0.1328125, + "learning_rate": 3.208800332088003e-05, + "loss": 0.684, + "step": 4195 + }, + { + "epoch": 2.5336552973136133, + "grad_norm": 0.1455078125, + "learning_rate": 3.2046492320464924e-05, + "loss": 0.5861, + "step": 4196 + }, + { + "epoch": 2.5342589797766375, + "grad_norm": 0.1484375, + "learning_rate": 3.200498132004982e-05, + "loss": 0.555, + "step": 4197 + }, + { + "epoch": 2.5348626622396617, + "grad_norm": 0.154296875, + "learning_rate": 3.1963470319634704e-05, + "loss": 0.5677, + "step": 4198 + }, + { + "epoch": 2.5354663447026864, + "grad_norm": 0.1533203125, + "learning_rate": 3.19219593192196e-05, + "loss": 0.5287, + "step": 4199 + }, + { + "epoch": 2.536070027165711, + "grad_norm": 0.1640625, + "learning_rate": 3.1880448318804484e-05, + "loss": 0.5441, + "step": 4200 + }, + { + "epoch": 2.5366737096287353, + "grad_norm": 0.16015625, + "learning_rate": 3.183893731838937e-05, + "loss": 0.4927, + "step": 4201 + }, + { + "epoch": 2.5372773920917595, + "grad_norm": 0.1572265625, + "learning_rate": 3.1797426317974264e-05, + "loss": 0.4615, + "step": 4202 + }, + { + "epoch": 2.537881074554784, + "grad_norm": 0.16015625, + "learning_rate": 3.175591531755916e-05, + "loss": 0.4678, + "step": 4203 + }, + { + "epoch": 2.538484757017809, + "grad_norm": 0.1689453125, + "learning_rate": 3.171440431714404e-05, + "loss": 0.48, + "step": 4204 + }, + { + "epoch": 2.539088439480833, + "grad_norm": 0.1796875, + "learning_rate": 3.1672893316728937e-05, + "loss": 0.5278, + "step": 4205 + }, + { + "epoch": 2.5396921219438573, + "grad_norm": 0.1826171875, + "learning_rate": 3.163138231631382e-05, + "loss": 0.4566, + "step": 4206 + }, + { + "epoch": 2.540295804406882, + "grad_norm": 0.1796875, + "learning_rate": 3.1589871315898716e-05, + "loss": 0.4041, + "step": 4207 + }, + { + "epoch": 2.5408994868699066, + "grad_norm": 0.2021484375, + "learning_rate": 3.15483603154836e-05, + "loss": 0.4396, + "step": 4208 + }, + { + "epoch": 2.541503169332931, + "grad_norm": 0.1962890625, + "learning_rate": 3.1506849315068496e-05, + "loss": 0.4315, + "step": 4209 + }, + { + "epoch": 2.542106851795955, + "grad_norm": 0.201171875, + "learning_rate": 3.146533831465339e-05, + "loss": 0.318, + "step": 4210 + }, + { + "epoch": 2.5427105342589797, + "grad_norm": 0.201171875, + "learning_rate": 3.1423827314238276e-05, + "loss": 0.2812, + "step": 4211 + }, + { + "epoch": 2.5433142167220044, + "grad_norm": 0.1953125, + "learning_rate": 3.138231631382316e-05, + "loss": 0.2069, + "step": 4212 + }, + { + "epoch": 2.5439178991850286, + "grad_norm": 0.1357421875, + "learning_rate": 3.1340805313408056e-05, + "loss": 0.5955, + "step": 4213 + }, + { + "epoch": 2.544521581648053, + "grad_norm": 0.1357421875, + "learning_rate": 3.129929431299294e-05, + "loss": 0.5779, + "step": 4214 + }, + { + "epoch": 2.5451252641110775, + "grad_norm": 0.142578125, + "learning_rate": 3.125778331257783e-05, + "loss": 0.6802, + "step": 4215 + }, + { + "epoch": 2.545728946574102, + "grad_norm": 0.1630859375, + "learning_rate": 3.121627231216273e-05, + "loss": 0.8038, + "step": 4216 + }, + { + "epoch": 2.5463326290371264, + "grad_norm": 0.1435546875, + "learning_rate": 3.1174761311747615e-05, + "loss": 0.8096, + "step": 4217 + }, + { + "epoch": 2.5469363115001507, + "grad_norm": 0.140625, + "learning_rate": 3.113325031133251e-05, + "loss": 1.203, + "step": 4218 + }, + { + "epoch": 2.5475399939631753, + "grad_norm": 0.1435546875, + "learning_rate": 3.1091739310917395e-05, + "loss": 0.7647, + "step": 4219 + }, + { + "epoch": 2.5481436764262, + "grad_norm": 0.1455078125, + "learning_rate": 3.105022831050228e-05, + "loss": 0.5275, + "step": 4220 + }, + { + "epoch": 2.548747358889224, + "grad_norm": 0.13671875, + "learning_rate": 3.1008717310087175e-05, + "loss": 0.658, + "step": 4221 + }, + { + "epoch": 2.549351041352249, + "grad_norm": 0.150390625, + "learning_rate": 3.096720630967206e-05, + "loss": 0.6599, + "step": 4222 + }, + { + "epoch": 2.549954723815273, + "grad_norm": 0.1513671875, + "learning_rate": 3.0925695309256954e-05, + "loss": 0.6281, + "step": 4223 + }, + { + "epoch": 2.5505584062782978, + "grad_norm": 0.1572265625, + "learning_rate": 3.088418430884185e-05, + "loss": 0.6384, + "step": 4224 + }, + { + "epoch": 2.551162088741322, + "grad_norm": 0.14453125, + "learning_rate": 3.0842673308426734e-05, + "loss": 0.6109, + "step": 4225 + }, + { + "epoch": 2.5517657712043467, + "grad_norm": 0.154296875, + "learning_rate": 3.080116230801162e-05, + "loss": 0.7078, + "step": 4226 + }, + { + "epoch": 2.552369453667371, + "grad_norm": 0.1455078125, + "learning_rate": 3.0759651307596514e-05, + "loss": 0.5598, + "step": 4227 + }, + { + "epoch": 2.5529731361303956, + "grad_norm": 0.1650390625, + "learning_rate": 3.07181403071814e-05, + "loss": 0.8109, + "step": 4228 + }, + { + "epoch": 2.55357681859342, + "grad_norm": 0.1904296875, + "learning_rate": 3.06766293067663e-05, + "loss": 0.6628, + "step": 4229 + }, + { + "epoch": 2.5541805010564445, + "grad_norm": 0.1533203125, + "learning_rate": 3.063511830635119e-05, + "loss": 0.6804, + "step": 4230 + }, + { + "epoch": 2.5547841835194687, + "grad_norm": 0.1435546875, + "learning_rate": 3.059360730593607e-05, + "loss": 0.5877, + "step": 4231 + }, + { + "epoch": 2.5553878659824933, + "grad_norm": 0.1630859375, + "learning_rate": 3.0552096305520966e-05, + "loss": 0.656, + "step": 4232 + }, + { + "epoch": 2.5559915484455176, + "grad_norm": 0.1513671875, + "learning_rate": 3.0510585305105853e-05, + "loss": 0.676, + "step": 4233 + }, + { + "epoch": 2.5565952309085422, + "grad_norm": 0.1435546875, + "learning_rate": 3.0469074304690743e-05, + "loss": 0.5534, + "step": 4234 + }, + { + "epoch": 2.5571989133715665, + "grad_norm": 0.1513671875, + "learning_rate": 3.0427563304275636e-05, + "loss": 0.6103, + "step": 4235 + }, + { + "epoch": 2.557802595834591, + "grad_norm": 0.150390625, + "learning_rate": 3.0386052303860523e-05, + "loss": 0.6122, + "step": 4236 + }, + { + "epoch": 2.5584062782976154, + "grad_norm": 0.14453125, + "learning_rate": 3.0344541303445412e-05, + "loss": 0.6039, + "step": 4237 + }, + { + "epoch": 2.55900996076064, + "grad_norm": 0.185546875, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.5834, + "step": 4238 + }, + { + "epoch": 2.5596136432236642, + "grad_norm": 0.1474609375, + "learning_rate": 3.0261519302615192e-05, + "loss": 0.6298, + "step": 4239 + }, + { + "epoch": 2.560217325686689, + "grad_norm": 0.1396484375, + "learning_rate": 3.0220008302200085e-05, + "loss": 0.5479, + "step": 4240 + }, + { + "epoch": 2.560821008149713, + "grad_norm": 0.1533203125, + "learning_rate": 3.0178497301784975e-05, + "loss": 0.5281, + "step": 4241 + }, + { + "epoch": 2.561424690612738, + "grad_norm": 0.169921875, + "learning_rate": 3.0136986301369862e-05, + "loss": 0.7109, + "step": 4242 + }, + { + "epoch": 2.562028373075762, + "grad_norm": 0.1416015625, + "learning_rate": 3.0095475300954755e-05, + "loss": 0.6514, + "step": 4243 + }, + { + "epoch": 2.5626320555387867, + "grad_norm": 0.162109375, + "learning_rate": 3.0053964300539645e-05, + "loss": 1.1654, + "step": 4244 + }, + { + "epoch": 2.563235738001811, + "grad_norm": 0.158203125, + "learning_rate": 3.001245330012453e-05, + "loss": 0.5638, + "step": 4245 + }, + { + "epoch": 2.5638394204648356, + "grad_norm": 0.1376953125, + "learning_rate": 2.9970942299709425e-05, + "loss": 0.5556, + "step": 4246 + }, + { + "epoch": 2.56444310292786, + "grad_norm": 0.1552734375, + "learning_rate": 2.9929431299294315e-05, + "loss": 0.6126, + "step": 4247 + }, + { + "epoch": 2.5650467853908845, + "grad_norm": 0.150390625, + "learning_rate": 2.98879202988792e-05, + "loss": 0.5597, + "step": 4248 + }, + { + "epoch": 2.5656504678539087, + "grad_norm": 0.1552734375, + "learning_rate": 2.9846409298464094e-05, + "loss": 0.5701, + "step": 4249 + }, + { + "epoch": 2.5662541503169334, + "grad_norm": 0.1533203125, + "learning_rate": 2.9804898298048984e-05, + "loss": 0.5262, + "step": 4250 + }, + { + "epoch": 2.5668578327799576, + "grad_norm": 0.1650390625, + "learning_rate": 2.9763387297633877e-05, + "loss": 0.6605, + "step": 4251 + }, + { + "epoch": 2.5674615152429823, + "grad_norm": 0.1728515625, + "learning_rate": 2.9721876297218764e-05, + "loss": 0.5195, + "step": 4252 + }, + { + "epoch": 2.5680651977060065, + "grad_norm": 0.1708984375, + "learning_rate": 2.9680365296803654e-05, + "loss": 0.5135, + "step": 4253 + }, + { + "epoch": 2.568668880169031, + "grad_norm": 0.1708984375, + "learning_rate": 2.9638854296388547e-05, + "loss": 0.55, + "step": 4254 + }, + { + "epoch": 2.5692725626320554, + "grad_norm": 0.173828125, + "learning_rate": 2.9597343295973434e-05, + "loss": 0.4785, + "step": 4255 + }, + { + "epoch": 2.56987624509508, + "grad_norm": 0.171875, + "learning_rate": 2.9555832295558323e-05, + "loss": 0.4183, + "step": 4256 + }, + { + "epoch": 2.5704799275581043, + "grad_norm": 0.1875, + "learning_rate": 2.9514321295143217e-05, + "loss": 0.4437, + "step": 4257 + }, + { + "epoch": 2.571083610021129, + "grad_norm": 0.19140625, + "learning_rate": 2.9472810294728103e-05, + "loss": 0.4089, + "step": 4258 + }, + { + "epoch": 2.571687292484153, + "grad_norm": 0.2001953125, + "learning_rate": 2.9431299294312993e-05, + "loss": 0.355, + "step": 4259 + }, + { + "epoch": 2.572290974947178, + "grad_norm": 0.2265625, + "learning_rate": 2.9389788293897886e-05, + "loss": 0.4149, + "step": 4260 + }, + { + "epoch": 2.572894657410202, + "grad_norm": 0.1923828125, + "learning_rate": 2.9348277293482773e-05, + "loss": 0.2763, + "step": 4261 + }, + { + "epoch": 2.5734983398732267, + "grad_norm": 0.212890625, + "learning_rate": 2.9306766293067666e-05, + "loss": 0.2371, + "step": 4262 + }, + { + "epoch": 2.574102022336251, + "grad_norm": 0.1591796875, + "learning_rate": 2.9265255292652556e-05, + "loss": 0.7012, + "step": 4263 + }, + { + "epoch": 2.5747057047992756, + "grad_norm": 0.302734375, + "learning_rate": 2.9223744292237442e-05, + "loss": 0.7176, + "step": 4264 + }, + { + "epoch": 2.5753093872623003, + "grad_norm": 0.1416015625, + "learning_rate": 2.9182233291822336e-05, + "loss": 0.6191, + "step": 4265 + }, + { + "epoch": 2.5759130697253245, + "grad_norm": 0.1611328125, + "learning_rate": 2.9140722291407226e-05, + "loss": 0.7192, + "step": 4266 + }, + { + "epoch": 2.5765167521883487, + "grad_norm": 0.1435546875, + "learning_rate": 2.9099211290992112e-05, + "loss": 0.6505, + "step": 4267 + }, + { + "epoch": 2.5771204346513734, + "grad_norm": 0.1416015625, + "learning_rate": 2.9057700290577005e-05, + "loss": 0.6055, + "step": 4268 + }, + { + "epoch": 2.577724117114398, + "grad_norm": 0.1572265625, + "learning_rate": 2.9016189290161895e-05, + "loss": 0.7176, + "step": 4269 + }, + { + "epoch": 2.5783277995774223, + "grad_norm": 0.1474609375, + "learning_rate": 2.897467828974678e-05, + "loss": 0.7773, + "step": 4270 + }, + { + "epoch": 2.5789314820404465, + "grad_norm": 0.1416015625, + "learning_rate": 2.8933167289331675e-05, + "loss": 0.5553, + "step": 4271 + }, + { + "epoch": 2.579535164503471, + "grad_norm": 0.1484375, + "learning_rate": 2.8891656288916565e-05, + "loss": 0.7196, + "step": 4272 + }, + { + "epoch": 2.580138846966496, + "grad_norm": 0.138671875, + "learning_rate": 2.8850145288501458e-05, + "loss": 0.5476, + "step": 4273 + }, + { + "epoch": 2.58074252942952, + "grad_norm": 0.1650390625, + "learning_rate": 2.8808634288086344e-05, + "loss": 0.6434, + "step": 4274 + }, + { + "epoch": 2.5813462118925443, + "grad_norm": 0.15234375, + "learning_rate": 2.8767123287671234e-05, + "loss": 0.8742, + "step": 4275 + }, + { + "epoch": 2.581949894355569, + "grad_norm": 0.1484375, + "learning_rate": 2.8725612287256128e-05, + "loss": 0.6174, + "step": 4276 + }, + { + "epoch": 2.5825535768185937, + "grad_norm": 0.146484375, + "learning_rate": 2.8684101286841014e-05, + "loss": 0.6064, + "step": 4277 + }, + { + "epoch": 2.583157259281618, + "grad_norm": 0.1435546875, + "learning_rate": 2.86425902864259e-05, + "loss": 0.5538, + "step": 4278 + }, + { + "epoch": 2.583760941744642, + "grad_norm": 0.1474609375, + "learning_rate": 2.8601079286010797e-05, + "loss": 0.5924, + "step": 4279 + }, + { + "epoch": 2.5843646242076668, + "grad_norm": 0.1455078125, + "learning_rate": 2.8559568285595684e-05, + "loss": 0.5754, + "step": 4280 + }, + { + "epoch": 2.5849683066706914, + "grad_norm": 0.142578125, + "learning_rate": 2.851805728518057e-05, + "loss": 0.5529, + "step": 4281 + }, + { + "epoch": 2.5855719891337157, + "grad_norm": 0.181640625, + "learning_rate": 2.8476546284765467e-05, + "loss": 0.7275, + "step": 4282 + }, + { + "epoch": 2.58617567159674, + "grad_norm": 0.1455078125, + "learning_rate": 2.8435035284350353e-05, + "loss": 0.6515, + "step": 4283 + }, + { + "epoch": 2.5867793540597646, + "grad_norm": 0.1484375, + "learning_rate": 2.8393524283935247e-05, + "loss": 0.6439, + "step": 4284 + }, + { + "epoch": 2.5873830365227892, + "grad_norm": 0.1533203125, + "learning_rate": 2.8352013283520133e-05, + "loss": 0.5381, + "step": 4285 + }, + { + "epoch": 2.5879867189858135, + "grad_norm": 0.1513671875, + "learning_rate": 2.8310502283105023e-05, + "loss": 0.865, + "step": 4286 + }, + { + "epoch": 2.5885904014488377, + "grad_norm": 0.138671875, + "learning_rate": 2.8268991282689916e-05, + "loss": 0.7643, + "step": 4287 + }, + { + "epoch": 2.5891940839118623, + "grad_norm": 0.1611328125, + "learning_rate": 2.8227480282274803e-05, + "loss": 0.6419, + "step": 4288 + }, + { + "epoch": 2.589797766374887, + "grad_norm": 0.16015625, + "learning_rate": 2.8185969281859693e-05, + "loss": 0.6687, + "step": 4289 + }, + { + "epoch": 2.5904014488379112, + "grad_norm": 0.13671875, + "learning_rate": 2.8144458281444586e-05, + "loss": 0.7835, + "step": 4290 + }, + { + "epoch": 2.5910051313009355, + "grad_norm": 0.16015625, + "learning_rate": 2.8102947281029472e-05, + "loss": 0.6415, + "step": 4291 + }, + { + "epoch": 2.59160881376396, + "grad_norm": 0.154296875, + "learning_rate": 2.8061436280614362e-05, + "loss": 0.7106, + "step": 4292 + }, + { + "epoch": 2.592212496226985, + "grad_norm": 0.1455078125, + "learning_rate": 2.8019925280199255e-05, + "loss": 0.617, + "step": 4293 + }, + { + "epoch": 2.592816178690009, + "grad_norm": 0.1494140625, + "learning_rate": 2.7978414279784142e-05, + "loss": 0.595, + "step": 4294 + }, + { + "epoch": 2.5934198611530332, + "grad_norm": 0.1357421875, + "learning_rate": 2.7936903279369035e-05, + "loss": 0.5337, + "step": 4295 + }, + { + "epoch": 2.594023543616058, + "grad_norm": 0.150390625, + "learning_rate": 2.7895392278953925e-05, + "loss": 0.5098, + "step": 4296 + }, + { + "epoch": 2.5946272260790826, + "grad_norm": 0.1533203125, + "learning_rate": 2.785388127853881e-05, + "loss": 0.519, + "step": 4297 + }, + { + "epoch": 2.595230908542107, + "grad_norm": 0.1533203125, + "learning_rate": 2.7812370278123705e-05, + "loss": 0.5765, + "step": 4298 + }, + { + "epoch": 2.595834591005131, + "grad_norm": 0.1552734375, + "learning_rate": 2.7770859277708595e-05, + "loss": 0.5621, + "step": 4299 + }, + { + "epoch": 2.5964382734681557, + "grad_norm": 0.154296875, + "learning_rate": 2.772934827729348e-05, + "loss": 0.5247, + "step": 4300 + }, + { + "epoch": 2.5970419559311804, + "grad_norm": 0.1494140625, + "learning_rate": 2.7687837276878374e-05, + "loss": 0.455, + "step": 4301 + }, + { + "epoch": 2.5976456383942046, + "grad_norm": 0.154296875, + "learning_rate": 2.7646326276463264e-05, + "loss": 0.4566, + "step": 4302 + }, + { + "epoch": 2.5982493208572293, + "grad_norm": 0.1669921875, + "learning_rate": 2.760481527604815e-05, + "loss": 0.4368, + "step": 4303 + }, + { + "epoch": 2.5988530033202535, + "grad_norm": 0.169921875, + "learning_rate": 2.7563304275633044e-05, + "loss": 0.5235, + "step": 4304 + }, + { + "epoch": 2.599456685783278, + "grad_norm": 0.1708984375, + "learning_rate": 2.7521793275217934e-05, + "loss": 0.4767, + "step": 4305 + }, + { + "epoch": 2.6000603682463024, + "grad_norm": 0.2412109375, + "learning_rate": 2.7480282274802827e-05, + "loss": 0.3947, + "step": 4306 + }, + { + "epoch": 2.600664050709327, + "grad_norm": 0.1904296875, + "learning_rate": 2.7438771274387714e-05, + "loss": 0.3915, + "step": 4307 + }, + { + "epoch": 2.6012677331723513, + "grad_norm": 0.1953125, + "learning_rate": 2.7397260273972603e-05, + "loss": 0.3913, + "step": 4308 + }, + { + "epoch": 2.601871415635376, + "grad_norm": 0.2001953125, + "learning_rate": 2.7355749273557497e-05, + "loss": 0.3407, + "step": 4309 + }, + { + "epoch": 2.6024750980984, + "grad_norm": 0.205078125, + "learning_rate": 2.7314238273142383e-05, + "loss": 0.3446, + "step": 4310 + }, + { + "epoch": 2.603078780561425, + "grad_norm": 0.2255859375, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.3252, + "step": 4311 + }, + { + "epoch": 2.603682463024449, + "grad_norm": 0.1962890625, + "learning_rate": 2.7231216272312166e-05, + "loss": 0.1949, + "step": 4312 + }, + { + "epoch": 2.6042861454874737, + "grad_norm": 0.1533203125, + "learning_rate": 2.7189705271897053e-05, + "loss": 0.6236, + "step": 4313 + }, + { + "epoch": 2.604889827950498, + "grad_norm": 0.1435546875, + "learning_rate": 2.7148194271481943e-05, + "loss": 0.5858, + "step": 4314 + }, + { + "epoch": 2.6054935104135226, + "grad_norm": 0.1416015625, + "learning_rate": 2.7106683271066836e-05, + "loss": 0.6003, + "step": 4315 + }, + { + "epoch": 2.606097192876547, + "grad_norm": 0.1513671875, + "learning_rate": 2.7065172270651722e-05, + "loss": 0.5897, + "step": 4316 + }, + { + "epoch": 2.6067008753395715, + "grad_norm": 0.140625, + "learning_rate": 2.7023661270236616e-05, + "loss": 0.6271, + "step": 4317 + }, + { + "epoch": 2.6073045578025957, + "grad_norm": 0.1513671875, + "learning_rate": 2.6982150269821506e-05, + "loss": 0.5715, + "step": 4318 + }, + { + "epoch": 2.6079082402656204, + "grad_norm": 0.12890625, + "learning_rate": 2.6940639269406392e-05, + "loss": 0.5873, + "step": 4319 + }, + { + "epoch": 2.6085119227286446, + "grad_norm": 0.1376953125, + "learning_rate": 2.6899128268991285e-05, + "loss": 0.5097, + "step": 4320 + }, + { + "epoch": 2.6091156051916693, + "grad_norm": 0.1357421875, + "learning_rate": 2.6857617268576175e-05, + "loss": 0.8605, + "step": 4321 + }, + { + "epoch": 2.6097192876546935, + "grad_norm": 0.154296875, + "learning_rate": 2.6816106268161062e-05, + "loss": 0.6028, + "step": 4322 + }, + { + "epoch": 2.610322970117718, + "grad_norm": 0.1708984375, + "learning_rate": 2.6774595267745955e-05, + "loss": 0.6827, + "step": 4323 + }, + { + "epoch": 2.6109266525807424, + "grad_norm": 0.15234375, + "learning_rate": 2.6733084267330845e-05, + "loss": 0.6564, + "step": 4324 + }, + { + "epoch": 2.611530335043767, + "grad_norm": 0.1494140625, + "learning_rate": 2.669157326691573e-05, + "loss": 0.5687, + "step": 4325 + }, + { + "epoch": 2.6121340175067913, + "grad_norm": 0.1796875, + "learning_rate": 2.6650062266500625e-05, + "loss": 0.856, + "step": 4326 + }, + { + "epoch": 2.612737699969816, + "grad_norm": 0.1474609375, + "learning_rate": 2.6608551266085514e-05, + "loss": 0.6517, + "step": 4327 + }, + { + "epoch": 2.61334138243284, + "grad_norm": 0.1484375, + "learning_rate": 2.6567040265670408e-05, + "loss": 0.6049, + "step": 4328 + }, + { + "epoch": 2.613945064895865, + "grad_norm": 0.1455078125, + "learning_rate": 2.6525529265255294e-05, + "loss": 0.6425, + "step": 4329 + }, + { + "epoch": 2.614548747358889, + "grad_norm": 0.1533203125, + "learning_rate": 2.6484018264840184e-05, + "loss": 0.6713, + "step": 4330 + }, + { + "epoch": 2.6151524298219138, + "grad_norm": 0.16015625, + "learning_rate": 2.6442507264425077e-05, + "loss": 0.7973, + "step": 4331 + }, + { + "epoch": 2.615756112284938, + "grad_norm": 0.158203125, + "learning_rate": 2.6400996264009964e-05, + "loss": 0.8084, + "step": 4332 + }, + { + "epoch": 2.6163597947479627, + "grad_norm": 0.16015625, + "learning_rate": 2.635948526359485e-05, + "loss": 0.7568, + "step": 4333 + }, + { + "epoch": 2.616963477210987, + "grad_norm": 0.142578125, + "learning_rate": 2.6317974263179747e-05, + "loss": 0.6364, + "step": 4334 + }, + { + "epoch": 2.6175671596740115, + "grad_norm": 0.150390625, + "learning_rate": 2.6276463262764633e-05, + "loss": 0.7264, + "step": 4335 + }, + { + "epoch": 2.6181708421370358, + "grad_norm": 0.1513671875, + "learning_rate": 2.623495226234952e-05, + "loss": 0.6437, + "step": 4336 + }, + { + "epoch": 2.6187745246000604, + "grad_norm": 0.1884765625, + "learning_rate": 2.6193441261934417e-05, + "loss": 0.9275, + "step": 4337 + }, + { + "epoch": 2.6193782070630847, + "grad_norm": 0.16015625, + "learning_rate": 2.6151930261519303e-05, + "loss": 0.6317, + "step": 4338 + }, + { + "epoch": 2.6199818895261093, + "grad_norm": 0.1533203125, + "learning_rate": 2.6110419261104196e-05, + "loss": 0.5622, + "step": 4339 + }, + { + "epoch": 2.6205855719891336, + "grad_norm": 0.1484375, + "learning_rate": 2.6068908260689083e-05, + "loss": 0.6879, + "step": 4340 + }, + { + "epoch": 2.6211892544521582, + "grad_norm": 0.1494140625, + "learning_rate": 2.6027397260273973e-05, + "loss": 0.8795, + "step": 4341 + }, + { + "epoch": 2.6217929369151824, + "grad_norm": 0.1416015625, + "learning_rate": 2.5985886259858866e-05, + "loss": 0.5918, + "step": 4342 + }, + { + "epoch": 2.622396619378207, + "grad_norm": 0.13671875, + "learning_rate": 2.5944375259443752e-05, + "loss": 0.5913, + "step": 4343 + }, + { + "epoch": 2.6230003018412313, + "grad_norm": 0.1435546875, + "learning_rate": 2.5902864259028642e-05, + "loss": 0.7423, + "step": 4344 + }, + { + "epoch": 2.623603984304256, + "grad_norm": 0.14453125, + "learning_rate": 2.5861353258613536e-05, + "loss": 0.6315, + "step": 4345 + }, + { + "epoch": 2.6242076667672807, + "grad_norm": 0.140625, + "learning_rate": 2.5819842258198422e-05, + "loss": 0.5559, + "step": 4346 + }, + { + "epoch": 2.624811349230305, + "grad_norm": 0.142578125, + "learning_rate": 2.5778331257783312e-05, + "loss": 0.4504, + "step": 4347 + }, + { + "epoch": 2.625415031693329, + "grad_norm": 0.150390625, + "learning_rate": 2.5736820257368205e-05, + "loss": 0.5707, + "step": 4348 + }, + { + "epoch": 2.626018714156354, + "grad_norm": 0.1611328125, + "learning_rate": 2.569530925695309e-05, + "loss": 0.5686, + "step": 4349 + }, + { + "epoch": 2.6266223966193785, + "grad_norm": 0.169921875, + "learning_rate": 2.5653798256537985e-05, + "loss": 0.5619, + "step": 4350 + }, + { + "epoch": 2.6272260790824027, + "grad_norm": 0.16796875, + "learning_rate": 2.5612287256122875e-05, + "loss": 0.467, + "step": 4351 + }, + { + "epoch": 2.627829761545427, + "grad_norm": 0.1748046875, + "learning_rate": 2.557077625570776e-05, + "loss": 0.5095, + "step": 4352 + }, + { + "epoch": 2.6284334440084516, + "grad_norm": 0.169921875, + "learning_rate": 2.5529265255292654e-05, + "loss": 0.5222, + "step": 4353 + }, + { + "epoch": 2.6290371264714762, + "grad_norm": 0.166015625, + "learning_rate": 2.5487754254877544e-05, + "loss": 0.5631, + "step": 4354 + }, + { + "epoch": 2.6296408089345005, + "grad_norm": 0.1787109375, + "learning_rate": 2.544624325446243e-05, + "loss": 0.4773, + "step": 4355 + }, + { + "epoch": 2.6302444913975247, + "grad_norm": 0.1953125, + "learning_rate": 2.5404732254047324e-05, + "loss": 0.5103, + "step": 4356 + }, + { + "epoch": 2.6308481738605494, + "grad_norm": 0.1796875, + "learning_rate": 2.5363221253632214e-05, + "loss": 0.4466, + "step": 4357 + }, + { + "epoch": 2.631451856323574, + "grad_norm": 0.205078125, + "learning_rate": 2.53217102532171e-05, + "loss": 0.41, + "step": 4358 + }, + { + "epoch": 2.6320555387865983, + "grad_norm": 0.1962890625, + "learning_rate": 2.5280199252801994e-05, + "loss": 0.3226, + "step": 4359 + }, + { + "epoch": 2.6326592212496225, + "grad_norm": 0.1904296875, + "learning_rate": 2.5238688252386884e-05, + "loss": 0.2631, + "step": 4360 + }, + { + "epoch": 2.633262903712647, + "grad_norm": 0.21484375, + "learning_rate": 2.5197177251971777e-05, + "loss": 0.2883, + "step": 4361 + }, + { + "epoch": 2.633866586175672, + "grad_norm": 0.224609375, + "learning_rate": 2.5155666251556663e-05, + "loss": 0.2396, + "step": 4362 + }, + { + "epoch": 2.634470268638696, + "grad_norm": 0.14453125, + "learning_rate": 2.5114155251141553e-05, + "loss": 0.6584, + "step": 4363 + }, + { + "epoch": 2.6350739511017203, + "grad_norm": 0.130859375, + "learning_rate": 2.5072644250726446e-05, + "loss": 0.5349, + "step": 4364 + }, + { + "epoch": 2.635677633564745, + "grad_norm": 0.15234375, + "learning_rate": 2.5031133250311333e-05, + "loss": 0.6576, + "step": 4365 + }, + { + "epoch": 2.6362813160277696, + "grad_norm": 0.140625, + "learning_rate": 2.4989622249896226e-05, + "loss": 0.61, + "step": 4366 + }, + { + "epoch": 2.636884998490794, + "grad_norm": 0.1533203125, + "learning_rate": 2.4948111249481113e-05, + "loss": 0.7938, + "step": 4367 + }, + { + "epoch": 2.637488680953818, + "grad_norm": 0.15234375, + "learning_rate": 2.4906600249066003e-05, + "loss": 0.5602, + "step": 4368 + }, + { + "epoch": 2.6380923634168427, + "grad_norm": 0.158203125, + "learning_rate": 2.4865089248650892e-05, + "loss": 0.6619, + "step": 4369 + }, + { + "epoch": 2.6386960458798674, + "grad_norm": 0.1552734375, + "learning_rate": 2.4823578248235786e-05, + "loss": 0.6877, + "step": 4370 + }, + { + "epoch": 2.6392997283428916, + "grad_norm": 0.1357421875, + "learning_rate": 2.4782067247820672e-05, + "loss": 0.5548, + "step": 4371 + }, + { + "epoch": 2.639903410805916, + "grad_norm": 0.1533203125, + "learning_rate": 2.4740556247405562e-05, + "loss": 0.5933, + "step": 4372 + }, + { + "epoch": 2.6405070932689405, + "grad_norm": 0.1806640625, + "learning_rate": 2.4699045246990455e-05, + "loss": 0.6843, + "step": 4373 + }, + { + "epoch": 2.641110775731965, + "grad_norm": 0.1474609375, + "learning_rate": 2.4657534246575342e-05, + "loss": 0.6951, + "step": 4374 + }, + { + "epoch": 2.6417144581949894, + "grad_norm": 0.140625, + "learning_rate": 2.461602324616023e-05, + "loss": 0.6361, + "step": 4375 + }, + { + "epoch": 2.6423181406580136, + "grad_norm": 0.15234375, + "learning_rate": 2.4574512245745125e-05, + "loss": 0.6971, + "step": 4376 + }, + { + "epoch": 2.6429218231210383, + "grad_norm": 0.1494140625, + "learning_rate": 2.4533001245330015e-05, + "loss": 0.6643, + "step": 4377 + }, + { + "epoch": 2.643525505584063, + "grad_norm": 0.1416015625, + "learning_rate": 2.44914902449149e-05, + "loss": 0.5958, + "step": 4378 + }, + { + "epoch": 2.644129188047087, + "grad_norm": 0.1904296875, + "learning_rate": 2.4449979244499795e-05, + "loss": 0.5614, + "step": 4379 + }, + { + "epoch": 2.6447328705101114, + "grad_norm": 0.1591796875, + "learning_rate": 2.4408468244084684e-05, + "loss": 0.6388, + "step": 4380 + }, + { + "epoch": 2.645336552973136, + "grad_norm": 0.140625, + "learning_rate": 2.4366957243669574e-05, + "loss": 0.6615, + "step": 4381 + }, + { + "epoch": 2.6459402354361607, + "grad_norm": 0.1533203125, + "learning_rate": 2.4325446243254464e-05, + "loss": 0.5931, + "step": 4382 + }, + { + "epoch": 2.646543917899185, + "grad_norm": 0.134765625, + "learning_rate": 2.4283935242839354e-05, + "loss": 0.618, + "step": 4383 + }, + { + "epoch": 2.6471476003622096, + "grad_norm": 0.150390625, + "learning_rate": 2.4242424242424244e-05, + "loss": 0.7889, + "step": 4384 + }, + { + "epoch": 2.647751282825234, + "grad_norm": 0.158203125, + "learning_rate": 2.4200913242009134e-05, + "loss": 0.5992, + "step": 4385 + }, + { + "epoch": 2.6483549652882585, + "grad_norm": 0.1552734375, + "learning_rate": 2.4159402241594024e-05, + "loss": 0.6226, + "step": 4386 + }, + { + "epoch": 2.6489586477512828, + "grad_norm": 0.1650390625, + "learning_rate": 2.4117891241178914e-05, + "loss": 0.6265, + "step": 4387 + }, + { + "epoch": 2.6495623302143074, + "grad_norm": 0.16015625, + "learning_rate": 2.4076380240763803e-05, + "loss": 0.5859, + "step": 4388 + }, + { + "epoch": 2.6501660126773317, + "grad_norm": 0.1474609375, + "learning_rate": 2.4034869240348693e-05, + "loss": 0.6647, + "step": 4389 + }, + { + "epoch": 2.6507696951403563, + "grad_norm": 0.1533203125, + "learning_rate": 2.3993358239933583e-05, + "loss": 0.6158, + "step": 4390 + }, + { + "epoch": 2.6513733776033805, + "grad_norm": 0.1484375, + "learning_rate": 2.3951847239518473e-05, + "loss": 0.7374, + "step": 4391 + }, + { + "epoch": 2.651977060066405, + "grad_norm": 0.1455078125, + "learning_rate": 2.3910336239103366e-05, + "loss": 0.7737, + "step": 4392 + }, + { + "epoch": 2.6525807425294294, + "grad_norm": 0.181640625, + "learning_rate": 2.3868825238688253e-05, + "loss": 0.5964, + "step": 4393 + }, + { + "epoch": 2.653184424992454, + "grad_norm": 0.1455078125, + "learning_rate": 2.3827314238273143e-05, + "loss": 0.7454, + "step": 4394 + }, + { + "epoch": 2.6537881074554783, + "grad_norm": 0.15234375, + "learning_rate": 2.3785803237858036e-05, + "loss": 0.5554, + "step": 4395 + }, + { + "epoch": 2.654391789918503, + "grad_norm": 0.1552734375, + "learning_rate": 2.3744292237442922e-05, + "loss": 0.6501, + "step": 4396 + }, + { + "epoch": 2.654995472381527, + "grad_norm": 0.1474609375, + "learning_rate": 2.3702781237027812e-05, + "loss": 0.5483, + "step": 4397 + }, + { + "epoch": 2.655599154844552, + "grad_norm": 0.146484375, + "learning_rate": 2.3661270236612702e-05, + "loss": 0.5345, + "step": 4398 + }, + { + "epoch": 2.656202837307576, + "grad_norm": 0.142578125, + "learning_rate": 2.3619759236197595e-05, + "loss": 0.4837, + "step": 4399 + }, + { + "epoch": 2.656806519770601, + "grad_norm": 0.150390625, + "learning_rate": 2.3578248235782482e-05, + "loss": 0.5075, + "step": 4400 + }, + { + "epoch": 2.657410202233625, + "grad_norm": 0.158203125, + "learning_rate": 2.3536737235367372e-05, + "loss": 0.5213, + "step": 4401 + }, + { + "epoch": 2.6580138846966497, + "grad_norm": 0.1572265625, + "learning_rate": 2.3495226234952265e-05, + "loss": 0.488, + "step": 4402 + }, + { + "epoch": 2.658617567159674, + "grad_norm": 0.171875, + "learning_rate": 2.3453715234537155e-05, + "loss": 0.5311, + "step": 4403 + }, + { + "epoch": 2.6592212496226986, + "grad_norm": 0.169921875, + "learning_rate": 2.341220423412204e-05, + "loss": 0.4742, + "step": 4404 + }, + { + "epoch": 2.659824932085723, + "grad_norm": 0.1767578125, + "learning_rate": 2.3370693233706935e-05, + "loss": 0.498, + "step": 4405 + }, + { + "epoch": 2.6604286145487475, + "grad_norm": 0.1796875, + "learning_rate": 2.3329182233291824e-05, + "loss": 0.4055, + "step": 4406 + }, + { + "epoch": 2.6610322970117717, + "grad_norm": 0.18359375, + "learning_rate": 2.328767123287671e-05, + "loss": 0.3957, + "step": 4407 + }, + { + "epoch": 2.6616359794747964, + "grad_norm": 0.193359375, + "learning_rate": 2.3246160232461604e-05, + "loss": 0.3804, + "step": 4408 + }, + { + "epoch": 2.6622396619378206, + "grad_norm": 0.1845703125, + "learning_rate": 2.3204649232046494e-05, + "loss": 0.337, + "step": 4409 + }, + { + "epoch": 2.6628433444008452, + "grad_norm": 0.197265625, + "learning_rate": 2.3163138231631384e-05, + "loss": 0.3131, + "step": 4410 + }, + { + "epoch": 2.6634470268638695, + "grad_norm": 0.203125, + "learning_rate": 2.3121627231216274e-05, + "loss": 0.2671, + "step": 4411 + }, + { + "epoch": 2.664050709326894, + "grad_norm": 0.201171875, + "learning_rate": 2.3080116230801164e-05, + "loss": 0.2049, + "step": 4412 + }, + { + "epoch": 2.6646543917899184, + "grad_norm": 0.1435546875, + "learning_rate": 2.3038605230386054e-05, + "loss": 0.7446, + "step": 4413 + }, + { + "epoch": 2.665258074252943, + "grad_norm": 0.1484375, + "learning_rate": 2.2997094229970943e-05, + "loss": 0.6451, + "step": 4414 + }, + { + "epoch": 2.6658617567159673, + "grad_norm": 0.15234375, + "learning_rate": 2.2955583229555833e-05, + "loss": 0.6584, + "step": 4415 + }, + { + "epoch": 2.666465439178992, + "grad_norm": 0.1474609375, + "learning_rate": 2.2914072229140723e-05, + "loss": 0.595, + "step": 4416 + }, + { + "epoch": 2.667069121642016, + "grad_norm": 0.158203125, + "learning_rate": 2.2872561228725613e-05, + "loss": 0.7, + "step": 4417 + }, + { + "epoch": 2.667672804105041, + "grad_norm": 0.2265625, + "learning_rate": 2.2831050228310503e-05, + "loss": 0.6295, + "step": 4418 + }, + { + "epoch": 2.668276486568065, + "grad_norm": 0.1376953125, + "learning_rate": 2.2789539227895393e-05, + "loss": 0.5823, + "step": 4419 + }, + { + "epoch": 2.6688801690310897, + "grad_norm": 0.138671875, + "learning_rate": 2.2748028227480283e-05, + "loss": 0.6256, + "step": 4420 + }, + { + "epoch": 2.669483851494114, + "grad_norm": 0.1533203125, + "learning_rate": 2.2706517227065176e-05, + "loss": 0.6366, + "step": 4421 + }, + { + "epoch": 2.6700875339571386, + "grad_norm": 0.1552734375, + "learning_rate": 2.2665006226650062e-05, + "loss": 0.6766, + "step": 4422 + }, + { + "epoch": 2.670691216420163, + "grad_norm": 0.150390625, + "learning_rate": 2.2623495226234952e-05, + "loss": 0.6968, + "step": 4423 + }, + { + "epoch": 2.6712948988831875, + "grad_norm": 0.146484375, + "learning_rate": 2.2581984225819846e-05, + "loss": 0.6291, + "step": 4424 + }, + { + "epoch": 2.6718985813462117, + "grad_norm": 0.1396484375, + "learning_rate": 2.2540473225404735e-05, + "loss": 0.649, + "step": 4425 + }, + { + "epoch": 2.6725022638092364, + "grad_norm": 0.1591796875, + "learning_rate": 2.2498962224989622e-05, + "loss": 0.6302, + "step": 4426 + }, + { + "epoch": 2.673105946272261, + "grad_norm": 0.154296875, + "learning_rate": 2.2457451224574512e-05, + "loss": 0.6207, + "step": 4427 + }, + { + "epoch": 2.6737096287352853, + "grad_norm": 0.1484375, + "learning_rate": 2.2415940224159405e-05, + "loss": 0.6337, + "step": 4428 + }, + { + "epoch": 2.6743133111983095, + "grad_norm": 0.146484375, + "learning_rate": 2.237442922374429e-05, + "loss": 0.655, + "step": 4429 + }, + { + "epoch": 2.674916993661334, + "grad_norm": 0.1396484375, + "learning_rate": 2.233291822332918e-05, + "loss": 0.6017, + "step": 4430 + }, + { + "epoch": 2.675520676124359, + "grad_norm": 0.15625, + "learning_rate": 2.2291407222914075e-05, + "loss": 0.7195, + "step": 4431 + }, + { + "epoch": 2.676124358587383, + "grad_norm": 0.1591796875, + "learning_rate": 2.2249896222498965e-05, + "loss": 0.4655, + "step": 4432 + }, + { + "epoch": 2.6767280410504073, + "grad_norm": 0.154296875, + "learning_rate": 2.220838522208385e-05, + "loss": 0.6356, + "step": 4433 + }, + { + "epoch": 2.677331723513432, + "grad_norm": 0.298828125, + "learning_rate": 2.2166874221668744e-05, + "loss": 0.6569, + "step": 4434 + }, + { + "epoch": 2.6779354059764566, + "grad_norm": 0.1484375, + "learning_rate": 2.2125363221253634e-05, + "loss": 0.654, + "step": 4435 + }, + { + "epoch": 2.678539088439481, + "grad_norm": 0.1396484375, + "learning_rate": 2.2083852220838524e-05, + "loss": 0.5911, + "step": 4436 + }, + { + "epoch": 2.679142770902505, + "grad_norm": 0.140625, + "learning_rate": 2.2042341220423414e-05, + "loss": 0.6751, + "step": 4437 + }, + { + "epoch": 2.6797464533655297, + "grad_norm": 0.146484375, + "learning_rate": 2.2000830220008304e-05, + "loss": 0.6674, + "step": 4438 + }, + { + "epoch": 2.6803501358285544, + "grad_norm": 0.1396484375, + "learning_rate": 2.1959319219593194e-05, + "loss": 0.8804, + "step": 4439 + }, + { + "epoch": 2.6809538182915786, + "grad_norm": 0.1474609375, + "learning_rate": 2.1917808219178083e-05, + "loss": 0.5718, + "step": 4440 + }, + { + "epoch": 2.681557500754603, + "grad_norm": 0.1630859375, + "learning_rate": 2.1876297218762973e-05, + "loss": 0.6883, + "step": 4441 + }, + { + "epoch": 2.6821611832176275, + "grad_norm": 0.1552734375, + "learning_rate": 2.1834786218347863e-05, + "loss": 0.7524, + "step": 4442 + }, + { + "epoch": 2.682764865680652, + "grad_norm": 0.1279296875, + "learning_rate": 2.1793275217932753e-05, + "loss": 0.5331, + "step": 4443 + }, + { + "epoch": 2.6833685481436764, + "grad_norm": 0.1396484375, + "learning_rate": 2.1751764217517643e-05, + "loss": 0.786, + "step": 4444 + }, + { + "epoch": 2.6839722306067006, + "grad_norm": 0.146484375, + "learning_rate": 2.1710253217102533e-05, + "loss": 0.6069, + "step": 4445 + }, + { + "epoch": 2.6845759130697253, + "grad_norm": 0.146484375, + "learning_rate": 2.1668742216687423e-05, + "loss": 0.6164, + "step": 4446 + }, + { + "epoch": 2.68517959553275, + "grad_norm": 0.1494140625, + "learning_rate": 2.1627231216272316e-05, + "loss": 0.5618, + "step": 4447 + }, + { + "epoch": 2.685783277995774, + "grad_norm": 0.1455078125, + "learning_rate": 2.1585720215857202e-05, + "loss": 0.535, + "step": 4448 + }, + { + "epoch": 2.6863869604587984, + "grad_norm": 0.1611328125, + "learning_rate": 2.1544209215442092e-05, + "loss": 0.6048, + "step": 4449 + }, + { + "epoch": 2.686990642921823, + "grad_norm": 0.1591796875, + "learning_rate": 2.1502698215026986e-05, + "loss": 0.5397, + "step": 4450 + }, + { + "epoch": 2.6875943253848478, + "grad_norm": 0.1708984375, + "learning_rate": 2.1461187214611872e-05, + "loss": 0.5, + "step": 4451 + }, + { + "epoch": 2.688198007847872, + "grad_norm": 0.1689453125, + "learning_rate": 2.1419676214196762e-05, + "loss": 0.514, + "step": 4452 + }, + { + "epoch": 2.688801690310896, + "grad_norm": 0.1689453125, + "learning_rate": 2.1378165213781652e-05, + "loss": 0.4886, + "step": 4453 + }, + { + "epoch": 2.689405372773921, + "grad_norm": 0.1689453125, + "learning_rate": 2.1336654213366545e-05, + "loss": 0.4477, + "step": 4454 + }, + { + "epoch": 2.6900090552369456, + "grad_norm": 0.181640625, + "learning_rate": 2.129514321295143e-05, + "loss": 0.5355, + "step": 4455 + }, + { + "epoch": 2.69061273769997, + "grad_norm": 0.185546875, + "learning_rate": 2.125363221253632e-05, + "loss": 0.414, + "step": 4456 + }, + { + "epoch": 2.691216420162994, + "grad_norm": 0.1962890625, + "learning_rate": 2.1212121212121215e-05, + "loss": 0.4451, + "step": 4457 + }, + { + "epoch": 2.6918201026260187, + "grad_norm": 0.212890625, + "learning_rate": 2.1170610211706105e-05, + "loss": 0.4357, + "step": 4458 + }, + { + "epoch": 2.6924237850890433, + "grad_norm": 0.208984375, + "learning_rate": 2.112909921129099e-05, + "loss": 0.3608, + "step": 4459 + }, + { + "epoch": 2.6930274675520676, + "grad_norm": 0.201171875, + "learning_rate": 2.1087588210875884e-05, + "loss": 0.2876, + "step": 4460 + }, + { + "epoch": 2.693631150015092, + "grad_norm": 0.2001953125, + "learning_rate": 2.1046077210460774e-05, + "loss": 0.2317, + "step": 4461 + }, + { + "epoch": 2.6942348324781165, + "grad_norm": 0.1953125, + "learning_rate": 2.100456621004566e-05, + "loss": 0.1918, + "step": 4462 + }, + { + "epoch": 2.694838514941141, + "grad_norm": 0.15234375, + "learning_rate": 2.0963055209630554e-05, + "loss": 0.6023, + "step": 4463 + }, + { + "epoch": 2.6954421974041654, + "grad_norm": 0.15234375, + "learning_rate": 2.0921544209215444e-05, + "loss": 0.6662, + "step": 4464 + }, + { + "epoch": 2.69604587986719, + "grad_norm": 0.1416015625, + "learning_rate": 2.0880033208800334e-05, + "loss": 0.5831, + "step": 4465 + }, + { + "epoch": 2.6966495623302142, + "grad_norm": 0.158203125, + "learning_rate": 2.0838522208385224e-05, + "loss": 0.6397, + "step": 4466 + }, + { + "epoch": 2.697253244793239, + "grad_norm": 0.16796875, + "learning_rate": 2.0797011207970113e-05, + "loss": 0.6301, + "step": 4467 + }, + { + "epoch": 2.697856927256263, + "grad_norm": 0.150390625, + "learning_rate": 2.0755500207555003e-05, + "loss": 0.7089, + "step": 4468 + }, + { + "epoch": 2.698460609719288, + "grad_norm": 0.138671875, + "learning_rate": 2.0713989207139893e-05, + "loss": 0.519, + "step": 4469 + }, + { + "epoch": 2.699064292182312, + "grad_norm": 0.1416015625, + "learning_rate": 2.0672478206724783e-05, + "loss": 0.6105, + "step": 4470 + }, + { + "epoch": 2.6996679746453367, + "grad_norm": 0.1669921875, + "learning_rate": 2.0630967206309673e-05, + "loss": 0.7206, + "step": 4471 + }, + { + "epoch": 2.700271657108361, + "grad_norm": 0.1708984375, + "learning_rate": 2.0589456205894563e-05, + "loss": 0.6736, + "step": 4472 + }, + { + "epoch": 2.7008753395713856, + "grad_norm": 0.14453125, + "learning_rate": 2.0547945205479453e-05, + "loss": 0.6061, + "step": 4473 + }, + { + "epoch": 2.70147902203441, + "grad_norm": 0.1533203125, + "learning_rate": 2.0506434205064342e-05, + "loss": 0.5616, + "step": 4474 + }, + { + "epoch": 2.7020827044974345, + "grad_norm": 0.1513671875, + "learning_rate": 2.0464923204649232e-05, + "loss": 0.6123, + "step": 4475 + }, + { + "epoch": 2.7026863869604587, + "grad_norm": 0.1474609375, + "learning_rate": 2.0423412204234126e-05, + "loss": 0.8862, + "step": 4476 + }, + { + "epoch": 2.7032900694234834, + "grad_norm": 0.1513671875, + "learning_rate": 2.0381901203819012e-05, + "loss": 0.7349, + "step": 4477 + }, + { + "epoch": 2.7038937518865076, + "grad_norm": 0.1533203125, + "learning_rate": 2.0340390203403902e-05, + "loss": 0.6979, + "step": 4478 + }, + { + "epoch": 2.7044974343495323, + "grad_norm": 0.1728515625, + "learning_rate": 2.0298879202988795e-05, + "loss": 0.6383, + "step": 4479 + }, + { + "epoch": 2.7051011168125565, + "grad_norm": 0.1474609375, + "learning_rate": 2.0257368202573682e-05, + "loss": 0.6267, + "step": 4480 + }, + { + "epoch": 2.705704799275581, + "grad_norm": 0.146484375, + "learning_rate": 2.021585720215857e-05, + "loss": 0.6785, + "step": 4481 + }, + { + "epoch": 2.7063084817386054, + "grad_norm": 0.16015625, + "learning_rate": 2.017434620174346e-05, + "loss": 0.753, + "step": 4482 + }, + { + "epoch": 2.70691216420163, + "grad_norm": 0.1865234375, + "learning_rate": 2.0132835201328355e-05, + "loss": 0.5688, + "step": 4483 + }, + { + "epoch": 2.7075158466646543, + "grad_norm": 0.142578125, + "learning_rate": 2.009132420091324e-05, + "loss": 0.7494, + "step": 4484 + }, + { + "epoch": 2.708119529127679, + "grad_norm": 0.1611328125, + "learning_rate": 2.004981320049813e-05, + "loss": 0.74, + "step": 4485 + }, + { + "epoch": 2.708723211590703, + "grad_norm": 0.142578125, + "learning_rate": 2.0008302200083024e-05, + "loss": 0.5672, + "step": 4486 + }, + { + "epoch": 2.709326894053728, + "grad_norm": 0.15234375, + "learning_rate": 1.9966791199667914e-05, + "loss": 0.8192, + "step": 4487 + }, + { + "epoch": 2.709930576516752, + "grad_norm": 0.1708984375, + "learning_rate": 1.99252801992528e-05, + "loss": 0.6851, + "step": 4488 + }, + { + "epoch": 2.7105342589797767, + "grad_norm": 0.14453125, + "learning_rate": 1.9883769198837694e-05, + "loss": 0.6071, + "step": 4489 + }, + { + "epoch": 2.711137941442801, + "grad_norm": 0.154296875, + "learning_rate": 1.9842258198422584e-05, + "loss": 0.6756, + "step": 4490 + }, + { + "epoch": 2.7117416239058256, + "grad_norm": 0.1552734375, + "learning_rate": 1.980074719800747e-05, + "loss": 0.6658, + "step": 4491 + }, + { + "epoch": 2.71234530636885, + "grad_norm": 0.140625, + "learning_rate": 1.9759236197592364e-05, + "loss": 0.5517, + "step": 4492 + }, + { + "epoch": 2.7129489888318745, + "grad_norm": 0.173828125, + "learning_rate": 1.9717725197177253e-05, + "loss": 0.6189, + "step": 4493 + }, + { + "epoch": 2.7135526712948987, + "grad_norm": 0.1455078125, + "learning_rate": 1.9676214196762143e-05, + "loss": 0.5697, + "step": 4494 + }, + { + "epoch": 2.7141563537579234, + "grad_norm": 0.1416015625, + "learning_rate": 1.9634703196347033e-05, + "loss": 0.6535, + "step": 4495 + }, + { + "epoch": 2.7147600362209476, + "grad_norm": 0.1416015625, + "learning_rate": 1.9593192195931923e-05, + "loss": 0.5543, + "step": 4496 + }, + { + "epoch": 2.7153637186839723, + "grad_norm": 0.1572265625, + "learning_rate": 1.9551681195516813e-05, + "loss": 0.5723, + "step": 4497 + }, + { + "epoch": 2.7159674011469965, + "grad_norm": 0.1474609375, + "learning_rate": 1.9510170195101703e-05, + "loss": 0.5723, + "step": 4498 + }, + { + "epoch": 2.716571083610021, + "grad_norm": 0.1650390625, + "learning_rate": 1.9468659194686593e-05, + "loss": 0.5244, + "step": 4499 + }, + { + "epoch": 2.7171747660730454, + "grad_norm": 0.1640625, + "learning_rate": 1.9427148194271483e-05, + "loss": 0.5962, + "step": 4500 + }, + { + "epoch": 2.7171747660730454, + "eval_loss": 0.6135643124580383, + "eval_runtime": 1059.4817, + "eval_samples_per_second": 2.633, + "eval_steps_per_second": 0.329, + "step": 4500 + }, + { + "epoch": 2.71777844853607, + "grad_norm": 0.158203125, + "learning_rate": 1.9385637193856372e-05, + "loss": 0.5302, + "step": 4501 + }, + { + "epoch": 2.7183821309990943, + "grad_norm": 0.1748046875, + "learning_rate": 1.9344126193441262e-05, + "loss": 0.5348, + "step": 4502 + }, + { + "epoch": 2.718985813462119, + "grad_norm": 0.1923828125, + "learning_rate": 1.9302615193026152e-05, + "loss": 0.5221, + "step": 4503 + }, + { + "epoch": 2.7195894959251437, + "grad_norm": 0.173828125, + "learning_rate": 1.9261104192611042e-05, + "loss": 0.4312, + "step": 4504 + }, + { + "epoch": 2.720193178388168, + "grad_norm": 0.1796875, + "learning_rate": 1.9219593192195935e-05, + "loss": 0.4838, + "step": 4505 + }, + { + "epoch": 2.720796860851192, + "grad_norm": 0.1953125, + "learning_rate": 1.9178082191780822e-05, + "loss": 0.471, + "step": 4506 + }, + { + "epoch": 2.7214005433142168, + "grad_norm": 0.1865234375, + "learning_rate": 1.913657119136571e-05, + "loss": 0.4915, + "step": 4507 + }, + { + "epoch": 2.7220042257772414, + "grad_norm": 0.173828125, + "learning_rate": 1.9095060190950605e-05, + "loss": 0.3882, + "step": 4508 + }, + { + "epoch": 2.7226079082402657, + "grad_norm": 0.1962890625, + "learning_rate": 1.9053549190535495e-05, + "loss": 0.3019, + "step": 4509 + }, + { + "epoch": 2.72321159070329, + "grad_norm": 0.2080078125, + "learning_rate": 1.901203819012038e-05, + "loss": 0.3022, + "step": 4510 + }, + { + "epoch": 2.7238152731663146, + "grad_norm": 0.2158203125, + "learning_rate": 1.897052718970527e-05, + "loss": 0.3075, + "step": 4511 + }, + { + "epoch": 2.7244189556293392, + "grad_norm": 0.205078125, + "learning_rate": 1.8929016189290164e-05, + "loss": 0.2109, + "step": 4512 + }, + { + "epoch": 2.7250226380923634, + "grad_norm": 0.146484375, + "learning_rate": 1.888750518887505e-05, + "loss": 0.6597, + "step": 4513 + }, + { + "epoch": 2.7256263205553877, + "grad_norm": 0.1318359375, + "learning_rate": 1.884599418845994e-05, + "loss": 0.5063, + "step": 4514 + }, + { + "epoch": 2.7262300030184123, + "grad_norm": 0.171875, + "learning_rate": 1.8804483188044834e-05, + "loss": 0.722, + "step": 4515 + }, + { + "epoch": 2.726833685481437, + "grad_norm": 0.14453125, + "learning_rate": 1.8762972187629724e-05, + "loss": 0.733, + "step": 4516 + }, + { + "epoch": 2.7274373679444612, + "grad_norm": 0.1337890625, + "learning_rate": 1.872146118721461e-05, + "loss": 0.5692, + "step": 4517 + }, + { + "epoch": 2.7280410504074855, + "grad_norm": 0.1572265625, + "learning_rate": 1.8679950186799504e-05, + "loss": 0.5854, + "step": 4518 + }, + { + "epoch": 2.72864473287051, + "grad_norm": 0.1904296875, + "learning_rate": 1.8638439186384393e-05, + "loss": 0.4641, + "step": 4519 + }, + { + "epoch": 2.729248415333535, + "grad_norm": 0.1435546875, + "learning_rate": 1.8596928185969283e-05, + "loss": 0.5903, + "step": 4520 + }, + { + "epoch": 2.729852097796559, + "grad_norm": 0.1484375, + "learning_rate": 1.8555417185554173e-05, + "loss": 0.5939, + "step": 4521 + }, + { + "epoch": 2.7304557802595832, + "grad_norm": 0.21484375, + "learning_rate": 1.8513906185139063e-05, + "loss": 0.6747, + "step": 4522 + }, + { + "epoch": 2.731059462722608, + "grad_norm": 0.142578125, + "learning_rate": 1.8472395184723953e-05, + "loss": 0.6233, + "step": 4523 + }, + { + "epoch": 2.7316631451856326, + "grad_norm": 0.142578125, + "learning_rate": 1.8430884184308843e-05, + "loss": 0.5499, + "step": 4524 + }, + { + "epoch": 2.732266827648657, + "grad_norm": 0.15234375, + "learning_rate": 1.8389373183893733e-05, + "loss": 0.5983, + "step": 4525 + }, + { + "epoch": 2.732870510111681, + "grad_norm": 0.154296875, + "learning_rate": 1.8347862183478623e-05, + "loss": 0.7181, + "step": 4526 + }, + { + "epoch": 2.7334741925747057, + "grad_norm": 0.15625, + "learning_rate": 1.8306351183063512e-05, + "loss": 0.5598, + "step": 4527 + }, + { + "epoch": 2.7340778750377304, + "grad_norm": 0.1494140625, + "learning_rate": 1.8264840182648402e-05, + "loss": 0.6007, + "step": 4528 + }, + { + "epoch": 2.7346815575007546, + "grad_norm": 0.1455078125, + "learning_rate": 1.8223329182233292e-05, + "loss": 0.5836, + "step": 4529 + }, + { + "epoch": 2.735285239963779, + "grad_norm": 0.1455078125, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.5936, + "step": 4530 + }, + { + "epoch": 2.7358889224268035, + "grad_norm": 0.1416015625, + "learning_rate": 1.8140307181403075e-05, + "loss": 0.8953, + "step": 4531 + }, + { + "epoch": 2.736492604889828, + "grad_norm": 0.142578125, + "learning_rate": 1.8098796180987962e-05, + "loss": 0.5644, + "step": 4532 + }, + { + "epoch": 2.7370962873528524, + "grad_norm": 0.1611328125, + "learning_rate": 1.805728518057285e-05, + "loss": 0.7689, + "step": 4533 + }, + { + "epoch": 2.7376999698158766, + "grad_norm": 0.15625, + "learning_rate": 1.8015774180157745e-05, + "loss": 0.7984, + "step": 4534 + }, + { + "epoch": 2.7383036522789013, + "grad_norm": 0.17578125, + "learning_rate": 1.797426317974263e-05, + "loss": 0.7707, + "step": 4535 + }, + { + "epoch": 2.738907334741926, + "grad_norm": 0.1513671875, + "learning_rate": 1.793275217932752e-05, + "loss": 0.7229, + "step": 4536 + }, + { + "epoch": 2.73951101720495, + "grad_norm": 0.1474609375, + "learning_rate": 1.7891241178912415e-05, + "loss": 0.9094, + "step": 4537 + }, + { + "epoch": 2.7401146996679744, + "grad_norm": 0.1552734375, + "learning_rate": 1.7849730178497304e-05, + "loss": 0.6551, + "step": 4538 + }, + { + "epoch": 2.740718382130999, + "grad_norm": 0.1611328125, + "learning_rate": 1.780821917808219e-05, + "loss": 0.696, + "step": 4539 + }, + { + "epoch": 2.7413220645940237, + "grad_norm": 0.1552734375, + "learning_rate": 1.776670817766708e-05, + "loss": 0.7026, + "step": 4540 + }, + { + "epoch": 2.741925747057048, + "grad_norm": 0.1552734375, + "learning_rate": 1.7725197177251974e-05, + "loss": 0.6059, + "step": 4541 + }, + { + "epoch": 2.742529429520072, + "grad_norm": 0.1396484375, + "learning_rate": 1.7683686176836864e-05, + "loss": 0.6566, + "step": 4542 + }, + { + "epoch": 2.743133111983097, + "grad_norm": 0.1455078125, + "learning_rate": 1.764217517642175e-05, + "loss": 0.6337, + "step": 4543 + }, + { + "epoch": 2.7437367944461215, + "grad_norm": 0.1396484375, + "learning_rate": 1.7600664176006644e-05, + "loss": 0.512, + "step": 4544 + }, + { + "epoch": 2.7443404769091457, + "grad_norm": 0.1357421875, + "learning_rate": 1.7559153175591534e-05, + "loss": 0.4839, + "step": 4545 + }, + { + "epoch": 2.7449441593721704, + "grad_norm": 0.1787109375, + "learning_rate": 1.751764217517642e-05, + "loss": 0.5621, + "step": 4546 + }, + { + "epoch": 2.7455478418351946, + "grad_norm": 0.1455078125, + "learning_rate": 1.7476131174761313e-05, + "loss": 0.5687, + "step": 4547 + }, + { + "epoch": 2.7461515242982193, + "grad_norm": 0.142578125, + "learning_rate": 1.7434620174346203e-05, + "loss": 0.5606, + "step": 4548 + }, + { + "epoch": 2.7467552067612435, + "grad_norm": 0.1455078125, + "learning_rate": 1.7393109173931093e-05, + "loss": 0.5483, + "step": 4549 + }, + { + "epoch": 2.747358889224268, + "grad_norm": 0.150390625, + "learning_rate": 1.7351598173515983e-05, + "loss": 0.4612, + "step": 4550 + }, + { + "epoch": 2.7479625716872924, + "grad_norm": 0.158203125, + "learning_rate": 1.7310087173100873e-05, + "loss": 0.5156, + "step": 4551 + }, + { + "epoch": 2.748566254150317, + "grad_norm": 0.1630859375, + "learning_rate": 1.7268576172685763e-05, + "loss": 0.5005, + "step": 4552 + }, + { + "epoch": 2.7491699366133413, + "grad_norm": 0.1630859375, + "learning_rate": 1.7227065172270653e-05, + "loss": 0.454, + "step": 4553 + }, + { + "epoch": 2.749773619076366, + "grad_norm": 0.166015625, + "learning_rate": 1.7185554171855542e-05, + "loss": 0.5291, + "step": 4554 + }, + { + "epoch": 2.75037730153939, + "grad_norm": 0.18359375, + "learning_rate": 1.7144043171440432e-05, + "loss": 0.5026, + "step": 4555 + }, + { + "epoch": 2.750980984002415, + "grad_norm": 0.189453125, + "learning_rate": 1.7102532171025322e-05, + "loss": 0.4815, + "step": 4556 + }, + { + "epoch": 2.751584666465439, + "grad_norm": 0.1904296875, + "learning_rate": 1.7061021170610212e-05, + "loss": 0.4753, + "step": 4557 + }, + { + "epoch": 2.7521883489284638, + "grad_norm": 0.1806640625, + "learning_rate": 1.7019510170195102e-05, + "loss": 0.4151, + "step": 4558 + }, + { + "epoch": 2.752792031391488, + "grad_norm": 0.2197265625, + "learning_rate": 1.6977999169779992e-05, + "loss": 0.3104, + "step": 4559 + }, + { + "epoch": 2.7533957138545126, + "grad_norm": 0.203125, + "learning_rate": 1.6936488169364885e-05, + "loss": 0.3303, + "step": 4560 + }, + { + "epoch": 2.753999396317537, + "grad_norm": 0.203125, + "learning_rate": 1.689497716894977e-05, + "loss": 0.308, + "step": 4561 + }, + { + "epoch": 2.7546030787805615, + "grad_norm": 0.2197265625, + "learning_rate": 1.685346616853466e-05, + "loss": 0.2067, + "step": 4562 + }, + { + "epoch": 2.7552067612435858, + "grad_norm": 0.1357421875, + "learning_rate": 1.6811955168119555e-05, + "loss": 1.0449, + "step": 4563 + }, + { + "epoch": 2.7558104437066104, + "grad_norm": 0.1435546875, + "learning_rate": 1.6770444167704444e-05, + "loss": 0.6562, + "step": 4564 + }, + { + "epoch": 2.7564141261696347, + "grad_norm": 0.13671875, + "learning_rate": 1.672893316728933e-05, + "loss": 0.5127, + "step": 4565 + }, + { + "epoch": 2.7570178086326593, + "grad_norm": 0.146484375, + "learning_rate": 1.6687422166874224e-05, + "loss": 0.6374, + "step": 4566 + }, + { + "epoch": 2.7576214910956836, + "grad_norm": 0.1513671875, + "learning_rate": 1.6645911166459114e-05, + "loss": 0.7662, + "step": 4567 + }, + { + "epoch": 2.758225173558708, + "grad_norm": 0.1435546875, + "learning_rate": 1.6604400166044e-05, + "loss": 0.7541, + "step": 4568 + }, + { + "epoch": 2.7588288560217324, + "grad_norm": 0.16796875, + "learning_rate": 1.656288916562889e-05, + "loss": 0.735, + "step": 4569 + }, + { + "epoch": 2.759432538484757, + "grad_norm": 0.1591796875, + "learning_rate": 1.6521378165213784e-05, + "loss": 0.59, + "step": 4570 + }, + { + "epoch": 2.7600362209477813, + "grad_norm": 0.138671875, + "learning_rate": 1.6479867164798674e-05, + "loss": 0.5174, + "step": 4571 + }, + { + "epoch": 2.760639903410806, + "grad_norm": 0.140625, + "learning_rate": 1.643835616438356e-05, + "loss": 0.5364, + "step": 4572 + }, + { + "epoch": 2.7612435858738302, + "grad_norm": 0.150390625, + "learning_rate": 1.6396845163968453e-05, + "loss": 0.6259, + "step": 4573 + }, + { + "epoch": 2.761847268336855, + "grad_norm": 0.1455078125, + "learning_rate": 1.6355334163553343e-05, + "loss": 0.8076, + "step": 4574 + }, + { + "epoch": 2.762450950799879, + "grad_norm": 0.1533203125, + "learning_rate": 1.6313823163138233e-05, + "loss": 0.5766, + "step": 4575 + }, + { + "epoch": 2.763054633262904, + "grad_norm": 0.1357421875, + "learning_rate": 1.6272312162723123e-05, + "loss": 0.6713, + "step": 4576 + }, + { + "epoch": 2.763658315725928, + "grad_norm": 0.1513671875, + "learning_rate": 1.6230801162308013e-05, + "loss": 0.6865, + "step": 4577 + }, + { + "epoch": 2.7642619981889527, + "grad_norm": 0.1513671875, + "learning_rate": 1.6189290161892903e-05, + "loss": 0.8193, + "step": 4578 + }, + { + "epoch": 2.764865680651977, + "grad_norm": 0.146484375, + "learning_rate": 1.6147779161477793e-05, + "loss": 0.5764, + "step": 4579 + }, + { + "epoch": 2.7654693631150016, + "grad_norm": 0.1552734375, + "learning_rate": 1.6106268161062682e-05, + "loss": 0.7132, + "step": 4580 + }, + { + "epoch": 2.766073045578026, + "grad_norm": 0.1416015625, + "learning_rate": 1.6064757160647572e-05, + "loss": 0.7006, + "step": 4581 + }, + { + "epoch": 2.7666767280410505, + "grad_norm": 0.169921875, + "learning_rate": 1.6023246160232462e-05, + "loss": 0.62, + "step": 4582 + }, + { + "epoch": 2.7672804105040747, + "grad_norm": 0.146484375, + "learning_rate": 1.5981735159817352e-05, + "loss": 0.5348, + "step": 4583 + }, + { + "epoch": 2.7678840929670994, + "grad_norm": 0.14453125, + "learning_rate": 1.5940224159402242e-05, + "loss": 0.6188, + "step": 4584 + }, + { + "epoch": 2.768487775430124, + "grad_norm": 0.15625, + "learning_rate": 1.5898713158987132e-05, + "loss": 0.6714, + "step": 4585 + }, + { + "epoch": 2.7690914578931483, + "grad_norm": 0.1474609375, + "learning_rate": 1.585720215857202e-05, + "loss": 0.7338, + "step": 4586 + }, + { + "epoch": 2.7696951403561725, + "grad_norm": 0.15625, + "learning_rate": 1.581569115815691e-05, + "loss": 0.5708, + "step": 4587 + }, + { + "epoch": 2.770298822819197, + "grad_norm": 0.15234375, + "learning_rate": 1.57741801577418e-05, + "loss": 0.6864, + "step": 4588 + }, + { + "epoch": 2.770902505282222, + "grad_norm": 0.1328125, + "learning_rate": 1.5732669157326695e-05, + "loss": 0.57, + "step": 4589 + }, + { + "epoch": 2.771506187745246, + "grad_norm": 0.158203125, + "learning_rate": 1.569115815691158e-05, + "loss": 0.5914, + "step": 4590 + }, + { + "epoch": 2.7721098702082703, + "grad_norm": 0.1591796875, + "learning_rate": 1.564964715649647e-05, + "loss": 0.9134, + "step": 4591 + }, + { + "epoch": 2.772713552671295, + "grad_norm": 0.365234375, + "learning_rate": 1.5608136156081364e-05, + "loss": 0.6086, + "step": 4592 + }, + { + "epoch": 2.7733172351343196, + "grad_norm": 0.154296875, + "learning_rate": 1.5566625155666254e-05, + "loss": 0.7351, + "step": 4593 + }, + { + "epoch": 2.773920917597344, + "grad_norm": 0.1484375, + "learning_rate": 1.552511415525114e-05, + "loss": 0.8879, + "step": 4594 + }, + { + "epoch": 2.774524600060368, + "grad_norm": 0.150390625, + "learning_rate": 1.548360315483603e-05, + "loss": 0.8547, + "step": 4595 + }, + { + "epoch": 2.7751282825233927, + "grad_norm": 0.134765625, + "learning_rate": 1.5442092154420924e-05, + "loss": 0.5326, + "step": 4596 + }, + { + "epoch": 2.7757319649864174, + "grad_norm": 0.1455078125, + "learning_rate": 1.540058115400581e-05, + "loss": 0.4954, + "step": 4597 + }, + { + "epoch": 2.7763356474494416, + "grad_norm": 0.16015625, + "learning_rate": 1.53590701535907e-05, + "loss": 0.6014, + "step": 4598 + }, + { + "epoch": 2.776939329912466, + "grad_norm": 0.158203125, + "learning_rate": 1.5317559153175593e-05, + "loss": 0.5491, + "step": 4599 + }, + { + "epoch": 2.7775430123754905, + "grad_norm": 0.1650390625, + "learning_rate": 1.5276048152760483e-05, + "loss": 0.5466, + "step": 4600 + }, + { + "epoch": 2.778146694838515, + "grad_norm": 0.1630859375, + "learning_rate": 1.5234537152345371e-05, + "loss": 0.508, + "step": 4601 + }, + { + "epoch": 2.7787503773015394, + "grad_norm": 0.1728515625, + "learning_rate": 1.5193026151930261e-05, + "loss": 0.5131, + "step": 4602 + }, + { + "epoch": 2.7793540597645636, + "grad_norm": 0.169921875, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.4846, + "step": 4603 + }, + { + "epoch": 2.7799577422275883, + "grad_norm": 0.1767578125, + "learning_rate": 1.5110004151100043e-05, + "loss": 0.5484, + "step": 4604 + }, + { + "epoch": 2.780561424690613, + "grad_norm": 0.181640625, + "learning_rate": 1.5068493150684931e-05, + "loss": 0.4347, + "step": 4605 + }, + { + "epoch": 2.781165107153637, + "grad_norm": 0.193359375, + "learning_rate": 1.5026982150269822e-05, + "loss": 0.4263, + "step": 4606 + }, + { + "epoch": 2.7817687896166614, + "grad_norm": 0.19140625, + "learning_rate": 1.4985471149854712e-05, + "loss": 0.4077, + "step": 4607 + }, + { + "epoch": 2.782372472079686, + "grad_norm": 0.201171875, + "learning_rate": 1.49439601494396e-05, + "loss": 0.4031, + "step": 4608 + }, + { + "epoch": 2.7829761545427107, + "grad_norm": 0.2060546875, + "learning_rate": 1.4902449149024492e-05, + "loss": 0.3571, + "step": 4609 + }, + { + "epoch": 2.783579837005735, + "grad_norm": 0.20703125, + "learning_rate": 1.4860938148609382e-05, + "loss": 0.2875, + "step": 4610 + }, + { + "epoch": 2.784183519468759, + "grad_norm": 0.197265625, + "learning_rate": 1.4819427148194274e-05, + "loss": 0.257, + "step": 4611 + }, + { + "epoch": 2.784787201931784, + "grad_norm": 0.2041015625, + "learning_rate": 1.4777916147779162e-05, + "loss": 0.2157, + "step": 4612 + }, + { + "epoch": 2.7853908843948085, + "grad_norm": 0.1318359375, + "learning_rate": 1.4736405147364052e-05, + "loss": 0.7389, + "step": 4613 + }, + { + "epoch": 2.7859945668578328, + "grad_norm": 0.1630859375, + "learning_rate": 1.4694894146948943e-05, + "loss": 0.7929, + "step": 4614 + }, + { + "epoch": 2.786598249320857, + "grad_norm": 0.138671875, + "learning_rate": 1.4653383146533833e-05, + "loss": 0.6031, + "step": 4615 + }, + { + "epoch": 2.7872019317838816, + "grad_norm": 0.1396484375, + "learning_rate": 1.4611872146118721e-05, + "loss": 0.5555, + "step": 4616 + }, + { + "epoch": 2.7878056142469063, + "grad_norm": 0.1533203125, + "learning_rate": 1.4570361145703613e-05, + "loss": 0.6512, + "step": 4617 + }, + { + "epoch": 2.7884092967099305, + "grad_norm": 0.1552734375, + "learning_rate": 1.4528850145288503e-05, + "loss": 0.6039, + "step": 4618 + }, + { + "epoch": 2.7890129791729548, + "grad_norm": 0.1494140625, + "learning_rate": 1.448733914487339e-05, + "loss": 0.6583, + "step": 4619 + }, + { + "epoch": 2.7896166616359794, + "grad_norm": 0.1455078125, + "learning_rate": 1.4445828144458282e-05, + "loss": 0.6774, + "step": 4620 + }, + { + "epoch": 2.790220344099004, + "grad_norm": 0.1396484375, + "learning_rate": 1.4404317144043172e-05, + "loss": 0.5934, + "step": 4621 + }, + { + "epoch": 2.7908240265620283, + "grad_norm": 0.1640625, + "learning_rate": 1.4362806143628064e-05, + "loss": 0.6648, + "step": 4622 + }, + { + "epoch": 2.7914277090250526, + "grad_norm": 0.130859375, + "learning_rate": 1.432129514321295e-05, + "loss": 0.5695, + "step": 4623 + }, + { + "epoch": 2.792031391488077, + "grad_norm": 0.1474609375, + "learning_rate": 1.4279784142797842e-05, + "loss": 0.9932, + "step": 4624 + }, + { + "epoch": 2.792635073951102, + "grad_norm": 0.154296875, + "learning_rate": 1.4238273142382733e-05, + "loss": 0.6688, + "step": 4625 + }, + { + "epoch": 2.793238756414126, + "grad_norm": 0.169921875, + "learning_rate": 1.4196762141967623e-05, + "loss": 0.8921, + "step": 4626 + }, + { + "epoch": 2.793842438877151, + "grad_norm": 0.12890625, + "learning_rate": 1.4155251141552511e-05, + "loss": 0.8431, + "step": 4627 + }, + { + "epoch": 2.794446121340175, + "grad_norm": 0.1357421875, + "learning_rate": 1.4113740141137401e-05, + "loss": 0.6138, + "step": 4628 + }, + { + "epoch": 2.7950498038031997, + "grad_norm": 0.171875, + "learning_rate": 1.4072229140722293e-05, + "loss": 0.7836, + "step": 4629 + }, + { + "epoch": 2.795653486266224, + "grad_norm": 0.14453125, + "learning_rate": 1.4030718140307181e-05, + "loss": 0.8925, + "step": 4630 + }, + { + "epoch": 2.7962571687292486, + "grad_norm": 0.15234375, + "learning_rate": 1.3989207139892071e-05, + "loss": 0.6221, + "step": 4631 + }, + { + "epoch": 2.796860851192273, + "grad_norm": 0.1513671875, + "learning_rate": 1.3947696139476963e-05, + "loss": 0.6567, + "step": 4632 + }, + { + "epoch": 2.7974645336552975, + "grad_norm": 0.146484375, + "learning_rate": 1.3906185139061852e-05, + "loss": 0.5648, + "step": 4633 + }, + { + "epoch": 2.7980682161183217, + "grad_norm": 0.158203125, + "learning_rate": 1.386467413864674e-05, + "loss": 0.6673, + "step": 4634 + }, + { + "epoch": 2.7986718985813464, + "grad_norm": 0.1435546875, + "learning_rate": 1.3823163138231632e-05, + "loss": 0.605, + "step": 4635 + }, + { + "epoch": 2.7992755810443706, + "grad_norm": 0.1396484375, + "learning_rate": 1.3781652137816522e-05, + "loss": 0.6171, + "step": 4636 + }, + { + "epoch": 2.7998792635073952, + "grad_norm": 0.1494140625, + "learning_rate": 1.3740141137401414e-05, + "loss": 0.5335, + "step": 4637 + }, + { + "epoch": 2.8004829459704195, + "grad_norm": 0.138671875, + "learning_rate": 1.3698630136986302e-05, + "loss": 0.7067, + "step": 4638 + }, + { + "epoch": 2.801086628433444, + "grad_norm": 0.1484375, + "learning_rate": 1.3657119136571192e-05, + "loss": 0.655, + "step": 4639 + }, + { + "epoch": 2.8016903108964684, + "grad_norm": 0.1435546875, + "learning_rate": 1.3615608136156083e-05, + "loss": 0.5433, + "step": 4640 + }, + { + "epoch": 2.802293993359493, + "grad_norm": 0.1474609375, + "learning_rate": 1.3574097135740971e-05, + "loss": 0.7166, + "step": 4641 + }, + { + "epoch": 2.8028976758225173, + "grad_norm": 0.171875, + "learning_rate": 1.3532586135325861e-05, + "loss": 0.6537, + "step": 4642 + }, + { + "epoch": 2.803501358285542, + "grad_norm": 0.1494140625, + "learning_rate": 1.3491075134910753e-05, + "loss": 0.6526, + "step": 4643 + }, + { + "epoch": 2.804105040748566, + "grad_norm": 0.158203125, + "learning_rate": 1.3449564134495643e-05, + "loss": 0.6646, + "step": 4644 + }, + { + "epoch": 2.804708723211591, + "grad_norm": 0.1328125, + "learning_rate": 1.3408053134080531e-05, + "loss": 0.5491, + "step": 4645 + }, + { + "epoch": 2.805312405674615, + "grad_norm": 0.146484375, + "learning_rate": 1.3366542133665422e-05, + "loss": 0.5573, + "step": 4646 + }, + { + "epoch": 2.8059160881376397, + "grad_norm": 0.154296875, + "learning_rate": 1.3325031133250312e-05, + "loss": 0.5577, + "step": 4647 + }, + { + "epoch": 2.806519770600664, + "grad_norm": 0.1533203125, + "learning_rate": 1.3283520132835204e-05, + "loss": 0.5257, + "step": 4648 + }, + { + "epoch": 2.8071234530636886, + "grad_norm": 0.1630859375, + "learning_rate": 1.3242009132420092e-05, + "loss": 0.6006, + "step": 4649 + }, + { + "epoch": 2.807727135526713, + "grad_norm": 0.1630859375, + "learning_rate": 1.3200498132004982e-05, + "loss": 0.5615, + "step": 4650 + }, + { + "epoch": 2.8083308179897375, + "grad_norm": 0.15625, + "learning_rate": 1.3158987131589873e-05, + "loss": 0.5161, + "step": 4651 + }, + { + "epoch": 2.8089345004527617, + "grad_norm": 0.1630859375, + "learning_rate": 1.311747613117476e-05, + "loss": 0.5035, + "step": 4652 + }, + { + "epoch": 2.8095381829157864, + "grad_norm": 0.1650390625, + "learning_rate": 1.3075965130759652e-05, + "loss": 0.5236, + "step": 4653 + }, + { + "epoch": 2.8101418653788106, + "grad_norm": 0.1787109375, + "learning_rate": 1.3034454130344541e-05, + "loss": 0.5802, + "step": 4654 + }, + { + "epoch": 2.8107455478418353, + "grad_norm": 0.17578125, + "learning_rate": 1.2992943129929433e-05, + "loss": 0.4602, + "step": 4655 + }, + { + "epoch": 2.8113492303048595, + "grad_norm": 0.181640625, + "learning_rate": 1.2951432129514321e-05, + "loss": 0.4859, + "step": 4656 + }, + { + "epoch": 2.811952912767884, + "grad_norm": 0.1884765625, + "learning_rate": 1.2909921129099211e-05, + "loss": 0.4449, + "step": 4657 + }, + { + "epoch": 2.8125565952309084, + "grad_norm": 0.1875, + "learning_rate": 1.2868410128684103e-05, + "loss": 0.3615, + "step": 4658 + }, + { + "epoch": 2.813160277693933, + "grad_norm": 0.19921875, + "learning_rate": 1.2826899128268992e-05, + "loss": 0.325, + "step": 4659 + }, + { + "epoch": 2.8137639601569573, + "grad_norm": 0.21875, + "learning_rate": 1.278538812785388e-05, + "loss": 0.3904, + "step": 4660 + }, + { + "epoch": 2.814367642619982, + "grad_norm": 0.20703125, + "learning_rate": 1.2743877127438772e-05, + "loss": 0.271, + "step": 4661 + }, + { + "epoch": 2.814971325083006, + "grad_norm": 0.1953125, + "learning_rate": 1.2702366127023662e-05, + "loss": 0.2315, + "step": 4662 + }, + { + "epoch": 2.815575007546031, + "grad_norm": 0.13671875, + "learning_rate": 1.266085512660855e-05, + "loss": 0.7124, + "step": 4663 + }, + { + "epoch": 2.816178690009055, + "grad_norm": 0.1533203125, + "learning_rate": 1.2619344126193442e-05, + "loss": 0.6489, + "step": 4664 + }, + { + "epoch": 2.8167823724720797, + "grad_norm": 0.15625, + "learning_rate": 1.2577833125778332e-05, + "loss": 0.966, + "step": 4665 + }, + { + "epoch": 2.8173860549351044, + "grad_norm": 0.1650390625, + "learning_rate": 1.2536322125363223e-05, + "loss": 0.7182, + "step": 4666 + }, + { + "epoch": 2.8179897373981286, + "grad_norm": 0.166015625, + "learning_rate": 1.2494811124948113e-05, + "loss": 0.6668, + "step": 4667 + }, + { + "epoch": 2.818593419861153, + "grad_norm": 0.1572265625, + "learning_rate": 1.2453300124533001e-05, + "loss": 0.6513, + "step": 4668 + }, + { + "epoch": 2.8191971023241775, + "grad_norm": 0.13671875, + "learning_rate": 1.2411789124117893e-05, + "loss": 0.5954, + "step": 4669 + }, + { + "epoch": 2.819800784787202, + "grad_norm": 0.171875, + "learning_rate": 1.2370278123702781e-05, + "loss": 0.7074, + "step": 4670 + }, + { + "epoch": 2.8204044672502264, + "grad_norm": 0.1552734375, + "learning_rate": 1.2328767123287671e-05, + "loss": 0.5661, + "step": 4671 + }, + { + "epoch": 2.8210081497132506, + "grad_norm": 0.1435546875, + "learning_rate": 1.2287256122872562e-05, + "loss": 0.6607, + "step": 4672 + }, + { + "epoch": 2.8216118321762753, + "grad_norm": 0.1630859375, + "learning_rate": 1.224574512245745e-05, + "loss": 0.7987, + "step": 4673 + }, + { + "epoch": 2.8222155146393, + "grad_norm": 0.14453125, + "learning_rate": 1.2204234122042342e-05, + "loss": 0.8645, + "step": 4674 + }, + { + "epoch": 2.822819197102324, + "grad_norm": 0.140625, + "learning_rate": 1.2162723121627232e-05, + "loss": 0.4833, + "step": 4675 + }, + { + "epoch": 2.8234228795653484, + "grad_norm": 0.1513671875, + "learning_rate": 1.2121212121212122e-05, + "loss": 0.6758, + "step": 4676 + }, + { + "epoch": 2.824026562028373, + "grad_norm": 0.1484375, + "learning_rate": 1.2079701120797012e-05, + "loss": 0.6682, + "step": 4677 + }, + { + "epoch": 2.8246302444913978, + "grad_norm": 0.1474609375, + "learning_rate": 1.2038190120381902e-05, + "loss": 0.753, + "step": 4678 + }, + { + "epoch": 2.825233926954422, + "grad_norm": 0.1376953125, + "learning_rate": 1.1996679119966792e-05, + "loss": 0.4793, + "step": 4679 + }, + { + "epoch": 2.825837609417446, + "grad_norm": 0.19140625, + "learning_rate": 1.1955168119551683e-05, + "loss": 0.6239, + "step": 4680 + }, + { + "epoch": 2.826441291880471, + "grad_norm": 0.15625, + "learning_rate": 1.1913657119136571e-05, + "loss": 0.607, + "step": 4681 + }, + { + "epoch": 2.8270449743434956, + "grad_norm": 0.150390625, + "learning_rate": 1.1872146118721461e-05, + "loss": 0.7043, + "step": 4682 + }, + { + "epoch": 2.82764865680652, + "grad_norm": 0.1572265625, + "learning_rate": 1.1830635118306351e-05, + "loss": 0.7399, + "step": 4683 + }, + { + "epoch": 2.828252339269544, + "grad_norm": 0.1513671875, + "learning_rate": 1.1789124117891241e-05, + "loss": 0.5987, + "step": 4684 + }, + { + "epoch": 2.8288560217325687, + "grad_norm": 0.140625, + "learning_rate": 1.1747613117476132e-05, + "loss": 0.6033, + "step": 4685 + }, + { + "epoch": 2.8294597041955933, + "grad_norm": 0.1552734375, + "learning_rate": 1.170610211706102e-05, + "loss": 0.5987, + "step": 4686 + }, + { + "epoch": 2.8300633866586176, + "grad_norm": 0.1884765625, + "learning_rate": 1.1664591116645912e-05, + "loss": 0.6025, + "step": 4687 + }, + { + "epoch": 2.830667069121642, + "grad_norm": 0.142578125, + "learning_rate": 1.1623080116230802e-05, + "loss": 0.588, + "step": 4688 + }, + { + "epoch": 2.8312707515846665, + "grad_norm": 0.1572265625, + "learning_rate": 1.1581569115815692e-05, + "loss": 0.6295, + "step": 4689 + }, + { + "epoch": 2.831874434047691, + "grad_norm": 0.1552734375, + "learning_rate": 1.1540058115400582e-05, + "loss": 0.7745, + "step": 4690 + }, + { + "epoch": 2.8324781165107153, + "grad_norm": 0.1416015625, + "learning_rate": 1.1498547114985472e-05, + "loss": 0.6095, + "step": 4691 + }, + { + "epoch": 2.8330817989737396, + "grad_norm": 0.14453125, + "learning_rate": 1.1457036114570362e-05, + "loss": 0.8522, + "step": 4692 + }, + { + "epoch": 2.8336854814367642, + "grad_norm": 0.1455078125, + "learning_rate": 1.1415525114155251e-05, + "loss": 0.7047, + "step": 4693 + }, + { + "epoch": 2.834289163899789, + "grad_norm": 0.150390625, + "learning_rate": 1.1374014113740141e-05, + "loss": 0.561, + "step": 4694 + }, + { + "epoch": 2.834892846362813, + "grad_norm": 0.1416015625, + "learning_rate": 1.1332503113325031e-05, + "loss": 0.8301, + "step": 4695 + }, + { + "epoch": 2.8354965288258374, + "grad_norm": 0.1474609375, + "learning_rate": 1.1290992112909923e-05, + "loss": 0.5409, + "step": 4696 + }, + { + "epoch": 2.836100211288862, + "grad_norm": 0.1552734375, + "learning_rate": 1.1249481112494811e-05, + "loss": 0.5462, + "step": 4697 + }, + { + "epoch": 2.8367038937518867, + "grad_norm": 0.1435546875, + "learning_rate": 1.1207970112079703e-05, + "loss": 0.5022, + "step": 4698 + }, + { + "epoch": 2.837307576214911, + "grad_norm": 0.1572265625, + "learning_rate": 1.116645911166459e-05, + "loss": 0.6117, + "step": 4699 + }, + { + "epoch": 2.837911258677935, + "grad_norm": 0.1650390625, + "learning_rate": 1.1124948111249482e-05, + "loss": 0.4875, + "step": 4700 + }, + { + "epoch": 2.83851494114096, + "grad_norm": 0.166015625, + "learning_rate": 1.1083437110834372e-05, + "loss": 0.5104, + "step": 4701 + }, + { + "epoch": 2.8391186236039845, + "grad_norm": 0.166015625, + "learning_rate": 1.1041926110419262e-05, + "loss": 0.5757, + "step": 4702 + }, + { + "epoch": 2.8397223060670087, + "grad_norm": 0.1669921875, + "learning_rate": 1.1000415110004152e-05, + "loss": 0.4598, + "step": 4703 + }, + { + "epoch": 2.840325988530033, + "grad_norm": 0.1728515625, + "learning_rate": 1.0958904109589042e-05, + "loss": 0.494, + "step": 4704 + }, + { + "epoch": 2.8409296709930576, + "grad_norm": 0.1806640625, + "learning_rate": 1.0917393109173932e-05, + "loss": 0.4241, + "step": 4705 + }, + { + "epoch": 2.8415333534560823, + "grad_norm": 0.2001953125, + "learning_rate": 1.0875882108758821e-05, + "loss": 0.4557, + "step": 4706 + }, + { + "epoch": 2.8421370359191065, + "grad_norm": 0.19140625, + "learning_rate": 1.0834371108343711e-05, + "loss": 0.4101, + "step": 4707 + }, + { + "epoch": 2.842740718382131, + "grad_norm": 0.1875, + "learning_rate": 1.0792860107928601e-05, + "loss": 0.3463, + "step": 4708 + }, + { + "epoch": 2.8433444008451554, + "grad_norm": 0.2197265625, + "learning_rate": 1.0751349107513493e-05, + "loss": 0.3673, + "step": 4709 + }, + { + "epoch": 2.84394808330818, + "grad_norm": 0.1982421875, + "learning_rate": 1.0709838107098381e-05, + "loss": 0.2754, + "step": 4710 + }, + { + "epoch": 2.8445517657712043, + "grad_norm": 0.193359375, + "learning_rate": 1.0668327106683273e-05, + "loss": 0.2248, + "step": 4711 + }, + { + "epoch": 2.845155448234229, + "grad_norm": 0.2060546875, + "learning_rate": 1.062681610626816e-05, + "loss": 0.2027, + "step": 4712 + }, + { + "epoch": 2.845759130697253, + "grad_norm": 0.1513671875, + "learning_rate": 1.0585305105853052e-05, + "loss": 0.674, + "step": 4713 + }, + { + "epoch": 2.846362813160278, + "grad_norm": 0.15234375, + "learning_rate": 1.0543794105437942e-05, + "loss": 0.8905, + "step": 4714 + }, + { + "epoch": 2.846966495623302, + "grad_norm": 0.1435546875, + "learning_rate": 1.050228310502283e-05, + "loss": 0.7134, + "step": 4715 + }, + { + "epoch": 2.8475701780863267, + "grad_norm": 0.251953125, + "learning_rate": 1.0460772104607722e-05, + "loss": 0.5395, + "step": 4716 + }, + { + "epoch": 2.848173860549351, + "grad_norm": 0.1298828125, + "learning_rate": 1.0419261104192612e-05, + "loss": 0.4918, + "step": 4717 + }, + { + "epoch": 2.8487775430123756, + "grad_norm": 0.1455078125, + "learning_rate": 1.0377750103777502e-05, + "loss": 0.5988, + "step": 4718 + }, + { + "epoch": 2.8493812254754, + "grad_norm": 0.142578125, + "learning_rate": 1.0336239103362392e-05, + "loss": 0.5771, + "step": 4719 + }, + { + "epoch": 2.8499849079384245, + "grad_norm": 0.1396484375, + "learning_rate": 1.0294728102947281e-05, + "loss": 0.6012, + "step": 4720 + }, + { + "epoch": 2.8505885904014487, + "grad_norm": 0.1533203125, + "learning_rate": 1.0253217102532171e-05, + "loss": 0.6447, + "step": 4721 + }, + { + "epoch": 2.8511922728644734, + "grad_norm": 0.15625, + "learning_rate": 1.0211706102117063e-05, + "loss": 0.6304, + "step": 4722 + }, + { + "epoch": 2.8517959553274976, + "grad_norm": 0.1494140625, + "learning_rate": 1.0170195101701951e-05, + "loss": 1.1191, + "step": 4723 + }, + { + "epoch": 2.8523996377905223, + "grad_norm": 0.134765625, + "learning_rate": 1.0128684101286841e-05, + "loss": 0.8335, + "step": 4724 + }, + { + "epoch": 2.8530033202535465, + "grad_norm": 0.14453125, + "learning_rate": 1.008717310087173e-05, + "loss": 0.5978, + "step": 4725 + }, + { + "epoch": 2.853607002716571, + "grad_norm": 0.138671875, + "learning_rate": 1.004566210045662e-05, + "loss": 0.6225, + "step": 4726 + }, + { + "epoch": 2.8542106851795954, + "grad_norm": 0.1728515625, + "learning_rate": 1.0004151100041512e-05, + "loss": 0.7899, + "step": 4727 + }, + { + "epoch": 2.85481436764262, + "grad_norm": 0.1630859375, + "learning_rate": 9.9626400996264e-06, + "loss": 0.6002, + "step": 4728 + }, + { + "epoch": 2.8554180501056443, + "grad_norm": 0.150390625, + "learning_rate": 9.921129099211292e-06, + "loss": 0.6239, + "step": 4729 + }, + { + "epoch": 2.856021732568669, + "grad_norm": 0.138671875, + "learning_rate": 9.879618098796182e-06, + "loss": 0.6067, + "step": 4730 + }, + { + "epoch": 2.856625415031693, + "grad_norm": 0.140625, + "learning_rate": 9.838107098381072e-06, + "loss": 0.5572, + "step": 4731 + }, + { + "epoch": 2.857229097494718, + "grad_norm": 0.154296875, + "learning_rate": 9.796596097965962e-06, + "loss": 0.5798, + "step": 4732 + }, + { + "epoch": 2.857832779957742, + "grad_norm": 0.140625, + "learning_rate": 9.755085097550851e-06, + "loss": 0.6557, + "step": 4733 + }, + { + "epoch": 2.8584364624207668, + "grad_norm": 0.1357421875, + "learning_rate": 9.713574097135741e-06, + "loss": 0.8296, + "step": 4734 + }, + { + "epoch": 2.859040144883791, + "grad_norm": 0.1572265625, + "learning_rate": 9.672063096720631e-06, + "loss": 0.9093, + "step": 4735 + }, + { + "epoch": 2.8596438273468157, + "grad_norm": 0.13671875, + "learning_rate": 9.630552096305521e-06, + "loss": 0.6259, + "step": 4736 + }, + { + "epoch": 2.86024750980984, + "grad_norm": 0.1533203125, + "learning_rate": 9.589041095890411e-06, + "loss": 0.6026, + "step": 4737 + }, + { + "epoch": 2.8608511922728646, + "grad_norm": 0.16015625, + "learning_rate": 9.547530095475302e-06, + "loss": 0.67, + "step": 4738 + }, + { + "epoch": 2.8614548747358888, + "grad_norm": 0.1484375, + "learning_rate": 9.50601909506019e-06, + "loss": 0.5923, + "step": 4739 + }, + { + "epoch": 2.8620585571989134, + "grad_norm": 0.14453125, + "learning_rate": 9.464508094645082e-06, + "loss": 0.7869, + "step": 4740 + }, + { + "epoch": 2.8626622396619377, + "grad_norm": 0.1416015625, + "learning_rate": 9.42299709422997e-06, + "loss": 0.5751, + "step": 4741 + }, + { + "epoch": 2.8632659221249623, + "grad_norm": 0.1669921875, + "learning_rate": 9.381486093814862e-06, + "loss": 0.7413, + "step": 4742 + }, + { + "epoch": 2.8638696045879866, + "grad_norm": 0.14453125, + "learning_rate": 9.339975093399752e-06, + "loss": 0.6217, + "step": 4743 + }, + { + "epoch": 2.8644732870510112, + "grad_norm": 0.142578125, + "learning_rate": 9.298464092984642e-06, + "loss": 0.5413, + "step": 4744 + }, + { + "epoch": 2.8650769695140355, + "grad_norm": 0.1396484375, + "learning_rate": 9.256953092569532e-06, + "loss": 0.5922, + "step": 4745 + }, + { + "epoch": 2.86568065197706, + "grad_norm": 0.1416015625, + "learning_rate": 9.215442092154421e-06, + "loss": 0.5451, + "step": 4746 + }, + { + "epoch": 2.866284334440085, + "grad_norm": 0.154296875, + "learning_rate": 9.173931091739311e-06, + "loss": 0.6009, + "step": 4747 + }, + { + "epoch": 2.866888016903109, + "grad_norm": 0.16015625, + "learning_rate": 9.132420091324201e-06, + "loss": 0.6489, + "step": 4748 + }, + { + "epoch": 2.8674916993661332, + "grad_norm": 0.1591796875, + "learning_rate": 9.090909090909091e-06, + "loss": 0.5733, + "step": 4749 + }, + { + "epoch": 2.868095381829158, + "grad_norm": 0.150390625, + "learning_rate": 9.049398090493981e-06, + "loss": 0.5809, + "step": 4750 + }, + { + "epoch": 2.8686990642921826, + "grad_norm": 0.166015625, + "learning_rate": 9.007887090078872e-06, + "loss": 0.6779, + "step": 4751 + }, + { + "epoch": 2.869302746755207, + "grad_norm": 0.1591796875, + "learning_rate": 8.96637608966376e-06, + "loss": 0.5155, + "step": 4752 + }, + { + "epoch": 2.869906429218231, + "grad_norm": 0.16796875, + "learning_rate": 8.924865089248652e-06, + "loss": 0.5757, + "step": 4753 + }, + { + "epoch": 2.8705101116812557, + "grad_norm": 0.16015625, + "learning_rate": 8.88335408883354e-06, + "loss": 0.4536, + "step": 4754 + }, + { + "epoch": 2.8711137941442804, + "grad_norm": 0.177734375, + "learning_rate": 8.841843088418432e-06, + "loss": 0.4535, + "step": 4755 + }, + { + "epoch": 2.8717174766073046, + "grad_norm": 0.1884765625, + "learning_rate": 8.800332088003322e-06, + "loss": 0.463, + "step": 4756 + }, + { + "epoch": 2.872321159070329, + "grad_norm": 0.203125, + "learning_rate": 8.75882108758821e-06, + "loss": 0.452, + "step": 4757 + }, + { + "epoch": 2.8729248415333535, + "grad_norm": 0.208984375, + "learning_rate": 8.717310087173102e-06, + "loss": 0.387, + "step": 4758 + }, + { + "epoch": 2.873528523996378, + "grad_norm": 0.2080078125, + "learning_rate": 8.675799086757991e-06, + "loss": 0.3369, + "step": 4759 + }, + { + "epoch": 2.8741322064594024, + "grad_norm": 0.1943359375, + "learning_rate": 8.634288086342881e-06, + "loss": 0.2758, + "step": 4760 + }, + { + "epoch": 2.8747358889224266, + "grad_norm": 0.19921875, + "learning_rate": 8.592777085927771e-06, + "loss": 0.2384, + "step": 4761 + }, + { + "epoch": 2.8753395713854513, + "grad_norm": 0.21484375, + "learning_rate": 8.551266085512661e-06, + "loss": 0.2201, + "step": 4762 + }, + { + "epoch": 2.875943253848476, + "grad_norm": 0.1396484375, + "learning_rate": 8.509755085097551e-06, + "loss": 0.5102, + "step": 4763 + }, + { + "epoch": 2.8765469363115, + "grad_norm": 0.16796875, + "learning_rate": 8.468244084682442e-06, + "loss": 0.7652, + "step": 4764 + }, + { + "epoch": 2.8771506187745244, + "grad_norm": 0.142578125, + "learning_rate": 8.42673308426733e-06, + "loss": 0.8522, + "step": 4765 + }, + { + "epoch": 2.877754301237549, + "grad_norm": 0.1435546875, + "learning_rate": 8.385222083852222e-06, + "loss": 1.0595, + "step": 4766 + }, + { + "epoch": 2.8783579837005737, + "grad_norm": 0.1376953125, + "learning_rate": 8.343711083437112e-06, + "loss": 0.5743, + "step": 4767 + }, + { + "epoch": 2.878961666163598, + "grad_norm": 0.154296875, + "learning_rate": 8.302200083022e-06, + "loss": 0.6199, + "step": 4768 + }, + { + "epoch": 2.879565348626622, + "grad_norm": 0.1455078125, + "learning_rate": 8.260689082606892e-06, + "loss": 0.6884, + "step": 4769 + }, + { + "epoch": 2.880169031089647, + "grad_norm": 0.140625, + "learning_rate": 8.21917808219178e-06, + "loss": 0.5411, + "step": 4770 + }, + { + "epoch": 2.8807727135526715, + "grad_norm": 0.1396484375, + "learning_rate": 8.177667081776672e-06, + "loss": 0.5833, + "step": 4771 + }, + { + "epoch": 2.8813763960156957, + "grad_norm": 0.1357421875, + "learning_rate": 8.136156081361561e-06, + "loss": 0.8275, + "step": 4772 + }, + { + "epoch": 2.88198007847872, + "grad_norm": 0.1474609375, + "learning_rate": 8.094645080946451e-06, + "loss": 0.8618, + "step": 4773 + }, + { + "epoch": 2.8825837609417446, + "grad_norm": 0.146484375, + "learning_rate": 8.053134080531341e-06, + "loss": 0.9263, + "step": 4774 + }, + { + "epoch": 2.8831874434047693, + "grad_norm": 0.1484375, + "learning_rate": 8.011623080116231e-06, + "loss": 0.5723, + "step": 4775 + }, + { + "epoch": 2.8837911258677935, + "grad_norm": 0.1513671875, + "learning_rate": 7.970112079701121e-06, + "loss": 0.5347, + "step": 4776 + }, + { + "epoch": 2.8843948083308177, + "grad_norm": 0.1474609375, + "learning_rate": 7.92860107928601e-06, + "loss": 0.8219, + "step": 4777 + }, + { + "epoch": 2.8849984907938424, + "grad_norm": 0.1435546875, + "learning_rate": 7.8870900788709e-06, + "loss": 0.6338, + "step": 4778 + }, + { + "epoch": 2.885602173256867, + "grad_norm": 0.2177734375, + "learning_rate": 7.84557907845579e-06, + "loss": 0.696, + "step": 4779 + }, + { + "epoch": 2.8862058557198913, + "grad_norm": 0.130859375, + "learning_rate": 7.804068078040682e-06, + "loss": 0.7539, + "step": 4780 + }, + { + "epoch": 2.8868095381829155, + "grad_norm": 0.13671875, + "learning_rate": 7.76255707762557e-06, + "loss": 0.5847, + "step": 4781 + }, + { + "epoch": 2.88741322064594, + "grad_norm": 0.1494140625, + "learning_rate": 7.721046077210462e-06, + "loss": 0.6444, + "step": 4782 + }, + { + "epoch": 2.888016903108965, + "grad_norm": 0.154296875, + "learning_rate": 7.67953507679535e-06, + "loss": 0.4944, + "step": 4783 + }, + { + "epoch": 2.888620585571989, + "grad_norm": 0.1494140625, + "learning_rate": 7.638024076380242e-06, + "loss": 0.6233, + "step": 4784 + }, + { + "epoch": 2.8892242680350133, + "grad_norm": 0.150390625, + "learning_rate": 7.596513075965131e-06, + "loss": 0.6144, + "step": 4785 + }, + { + "epoch": 2.889827950498038, + "grad_norm": 0.16015625, + "learning_rate": 7.555002075550021e-06, + "loss": 0.5874, + "step": 4786 + }, + { + "epoch": 2.8904316329610626, + "grad_norm": 0.1630859375, + "learning_rate": 7.513491075134911e-06, + "loss": 0.7338, + "step": 4787 + }, + { + "epoch": 2.891035315424087, + "grad_norm": 0.1455078125, + "learning_rate": 7.4719800747198e-06, + "loss": 0.6417, + "step": 4788 + }, + { + "epoch": 2.8916389978871115, + "grad_norm": 0.15625, + "learning_rate": 7.430469074304691e-06, + "loss": 0.8307, + "step": 4789 + }, + { + "epoch": 2.8922426803501358, + "grad_norm": 0.1640625, + "learning_rate": 7.388958073889581e-06, + "loss": 0.637, + "step": 4790 + }, + { + "epoch": 2.8928463628131604, + "grad_norm": 0.146484375, + "learning_rate": 7.3474470734744716e-06, + "loss": 0.6723, + "step": 4791 + }, + { + "epoch": 2.8934500452761847, + "grad_norm": 0.142578125, + "learning_rate": 7.305936073059361e-06, + "loss": 0.5976, + "step": 4792 + }, + { + "epoch": 2.8940537277392093, + "grad_norm": 0.140625, + "learning_rate": 7.264425072644251e-06, + "loss": 0.5413, + "step": 4793 + }, + { + "epoch": 2.8946574102022335, + "grad_norm": 0.150390625, + "learning_rate": 7.222914072229141e-06, + "loss": 0.6038, + "step": 4794 + }, + { + "epoch": 2.895261092665258, + "grad_norm": 0.1552734375, + "learning_rate": 7.181403071814032e-06, + "loss": 0.5861, + "step": 4795 + }, + { + "epoch": 2.8958647751282824, + "grad_norm": 0.14453125, + "learning_rate": 7.139892071398921e-06, + "loss": 0.5549, + "step": 4796 + }, + { + "epoch": 2.896468457591307, + "grad_norm": 0.146484375, + "learning_rate": 7.098381070983812e-06, + "loss": 0.5177, + "step": 4797 + }, + { + "epoch": 2.8970721400543313, + "grad_norm": 0.1552734375, + "learning_rate": 7.056870070568701e-06, + "loss": 0.5494, + "step": 4798 + }, + { + "epoch": 2.897675822517356, + "grad_norm": 0.177734375, + "learning_rate": 7.0153590701535905e-06, + "loss": 0.5592, + "step": 4799 + }, + { + "epoch": 2.8982795049803802, + "grad_norm": 0.1591796875, + "learning_rate": 6.973848069738481e-06, + "loss": 0.5537, + "step": 4800 + }, + { + "epoch": 2.898883187443405, + "grad_norm": 0.17578125, + "learning_rate": 6.93233706932337e-06, + "loss": 0.4819, + "step": 4801 + }, + { + "epoch": 2.899486869906429, + "grad_norm": 0.15234375, + "learning_rate": 6.890826068908261e-06, + "loss": 0.45, + "step": 4802 + }, + { + "epoch": 2.900090552369454, + "grad_norm": 0.166015625, + "learning_rate": 6.849315068493151e-06, + "loss": 0.4768, + "step": 4803 + }, + { + "epoch": 2.900694234832478, + "grad_norm": 0.1904296875, + "learning_rate": 6.807804068078042e-06, + "loss": 0.5442, + "step": 4804 + }, + { + "epoch": 2.9012979172955027, + "grad_norm": 0.171875, + "learning_rate": 6.766293067662931e-06, + "loss": 0.4122, + "step": 4805 + }, + { + "epoch": 2.901901599758527, + "grad_norm": 0.2021484375, + "learning_rate": 6.724782067247821e-06, + "loss": 0.5047, + "step": 4806 + }, + { + "epoch": 2.9025052822215516, + "grad_norm": 0.197265625, + "learning_rate": 6.683271066832711e-06, + "loss": 0.4761, + "step": 4807 + }, + { + "epoch": 2.903108964684576, + "grad_norm": 0.2197265625, + "learning_rate": 6.641760066417602e-06, + "loss": 0.3971, + "step": 4808 + }, + { + "epoch": 2.9037126471476005, + "grad_norm": 0.2119140625, + "learning_rate": 6.600249066002491e-06, + "loss": 0.3409, + "step": 4809 + }, + { + "epoch": 2.9043163296106247, + "grad_norm": 0.1982421875, + "learning_rate": 6.55873806558738e-06, + "loss": 0.2721, + "step": 4810 + }, + { + "epoch": 2.9049200120736494, + "grad_norm": 0.2060546875, + "learning_rate": 6.517227065172271e-06, + "loss": 0.2655, + "step": 4811 + }, + { + "epoch": 2.9055236945366736, + "grad_norm": 0.208984375, + "learning_rate": 6.4757160647571606e-06, + "loss": 0.2052, + "step": 4812 + }, + { + "epoch": 2.9061273769996983, + "grad_norm": 0.1416015625, + "learning_rate": 6.434205064342051e-06, + "loss": 0.7075, + "step": 4813 + }, + { + "epoch": 2.9067310594627225, + "grad_norm": 0.1640625, + "learning_rate": 6.39269406392694e-06, + "loss": 0.6261, + "step": 4814 + }, + { + "epoch": 2.907334741925747, + "grad_norm": 0.14453125, + "learning_rate": 6.351183063511831e-06, + "loss": 0.5754, + "step": 4815 + }, + { + "epoch": 2.9079384243887714, + "grad_norm": 0.310546875, + "learning_rate": 6.309672063096721e-06, + "loss": 0.6827, + "step": 4816 + }, + { + "epoch": 2.908542106851796, + "grad_norm": 0.1484375, + "learning_rate": 6.268161062681612e-06, + "loss": 0.6161, + "step": 4817 + }, + { + "epoch": 2.9091457893148203, + "grad_norm": 0.16015625, + "learning_rate": 6.226650062266501e-06, + "loss": 0.65, + "step": 4818 + }, + { + "epoch": 2.909749471777845, + "grad_norm": 0.1396484375, + "learning_rate": 6.1851390618513905e-06, + "loss": 0.8536, + "step": 4819 + }, + { + "epoch": 2.910353154240869, + "grad_norm": 0.162109375, + "learning_rate": 6.143628061436281e-06, + "loss": 0.7328, + "step": 4820 + }, + { + "epoch": 2.910956836703894, + "grad_norm": 0.1513671875, + "learning_rate": 6.102117061021171e-06, + "loss": 0.6667, + "step": 4821 + }, + { + "epoch": 2.911560519166918, + "grad_norm": 0.1533203125, + "learning_rate": 6.060606060606061e-06, + "loss": 0.7117, + "step": 4822 + }, + { + "epoch": 2.9121642016299427, + "grad_norm": 0.138671875, + "learning_rate": 6.019095060190951e-06, + "loss": 0.5267, + "step": 4823 + }, + { + "epoch": 2.912767884092967, + "grad_norm": 0.1474609375, + "learning_rate": 5.9775840597758416e-06, + "loss": 0.6946, + "step": 4824 + }, + { + "epoch": 2.9133715665559916, + "grad_norm": 0.146484375, + "learning_rate": 5.936073059360731e-06, + "loss": 0.6895, + "step": 4825 + }, + { + "epoch": 2.913975249019016, + "grad_norm": 0.15234375, + "learning_rate": 5.8945620589456205e-06, + "loss": 0.6591, + "step": 4826 + }, + { + "epoch": 2.9145789314820405, + "grad_norm": 0.1982421875, + "learning_rate": 5.85305105853051e-06, + "loss": 0.599, + "step": 4827 + }, + { + "epoch": 2.915182613945065, + "grad_norm": 0.1552734375, + "learning_rate": 5.811540058115401e-06, + "loss": 0.6378, + "step": 4828 + }, + { + "epoch": 2.9157862964080894, + "grad_norm": 0.1650390625, + "learning_rate": 5.770029057700291e-06, + "loss": 0.7807, + "step": 4829 + }, + { + "epoch": 2.9163899788711136, + "grad_norm": 0.146484375, + "learning_rate": 5.728518057285181e-06, + "loss": 0.6659, + "step": 4830 + }, + { + "epoch": 2.9169936613341383, + "grad_norm": 0.162109375, + "learning_rate": 5.687007056870071e-06, + "loss": 0.7076, + "step": 4831 + }, + { + "epoch": 2.917597343797163, + "grad_norm": 0.1533203125, + "learning_rate": 5.645496056454961e-06, + "loss": 0.6585, + "step": 4832 + }, + { + "epoch": 2.918201026260187, + "grad_norm": 0.1572265625, + "learning_rate": 5.603985056039851e-06, + "loss": 0.7424, + "step": 4833 + }, + { + "epoch": 2.9188047087232114, + "grad_norm": 0.130859375, + "learning_rate": 5.562474055624741e-06, + "loss": 0.5734, + "step": 4834 + }, + { + "epoch": 2.919408391186236, + "grad_norm": 0.154296875, + "learning_rate": 5.520963055209631e-06, + "loss": 0.5937, + "step": 4835 + }, + { + "epoch": 2.9200120736492607, + "grad_norm": 0.1533203125, + "learning_rate": 5.479452054794521e-06, + "loss": 0.5906, + "step": 4836 + }, + { + "epoch": 2.920615756112285, + "grad_norm": 0.1552734375, + "learning_rate": 5.437941054379411e-06, + "loss": 0.5796, + "step": 4837 + }, + { + "epoch": 2.921219438575309, + "grad_norm": 0.1357421875, + "learning_rate": 5.396430053964301e-06, + "loss": 0.4951, + "step": 4838 + }, + { + "epoch": 2.921823121038334, + "grad_norm": 0.1494140625, + "learning_rate": 5.3549190535491905e-06, + "loss": 0.6764, + "step": 4839 + }, + { + "epoch": 2.9224268035013585, + "grad_norm": 0.140625, + "learning_rate": 5.31340805313408e-06, + "loss": 0.7883, + "step": 4840 + }, + { + "epoch": 2.9230304859643828, + "grad_norm": 0.15625, + "learning_rate": 5.271897052718971e-06, + "loss": 0.6394, + "step": 4841 + }, + { + "epoch": 2.923634168427407, + "grad_norm": 0.1455078125, + "learning_rate": 5.230386052303861e-06, + "loss": 0.5879, + "step": 4842 + }, + { + "epoch": 2.9242378508904316, + "grad_norm": 0.134765625, + "learning_rate": 5.188875051888751e-06, + "loss": 0.5605, + "step": 4843 + }, + { + "epoch": 2.9248415333534563, + "grad_norm": 0.1630859375, + "learning_rate": 5.147364051473641e-06, + "loss": 0.6246, + "step": 4844 + }, + { + "epoch": 2.9254452158164805, + "grad_norm": 0.1474609375, + "learning_rate": 5.105853051058531e-06, + "loss": 0.6152, + "step": 4845 + }, + { + "epoch": 2.9260488982795048, + "grad_norm": 0.1435546875, + "learning_rate": 5.0643420506434204e-06, + "loss": 0.5924, + "step": 4846 + }, + { + "epoch": 2.9266525807425294, + "grad_norm": 0.1474609375, + "learning_rate": 5.02283105022831e-06, + "loss": 0.6363, + "step": 4847 + }, + { + "epoch": 2.927256263205554, + "grad_norm": 0.1513671875, + "learning_rate": 4.9813200498132e-06, + "loss": 0.5652, + "step": 4848 + }, + { + "epoch": 2.9278599456685783, + "grad_norm": 0.146484375, + "learning_rate": 4.939809049398091e-06, + "loss": 0.4982, + "step": 4849 + }, + { + "epoch": 2.9284636281316025, + "grad_norm": 0.1513671875, + "learning_rate": 4.898298048982981e-06, + "loss": 0.5055, + "step": 4850 + }, + { + "epoch": 2.929067310594627, + "grad_norm": 0.1572265625, + "learning_rate": 4.856787048567871e-06, + "loss": 0.4912, + "step": 4851 + }, + { + "epoch": 2.929670993057652, + "grad_norm": 0.1806640625, + "learning_rate": 4.8152760481527605e-06, + "loss": 0.5231, + "step": 4852 + }, + { + "epoch": 2.930274675520676, + "grad_norm": 0.162109375, + "learning_rate": 4.773765047737651e-06, + "loss": 0.4411, + "step": 4853 + }, + { + "epoch": 2.9308783579837003, + "grad_norm": 0.1630859375, + "learning_rate": 4.732254047322541e-06, + "loss": 0.5186, + "step": 4854 + }, + { + "epoch": 2.931482040446725, + "grad_norm": 0.1640625, + "learning_rate": 4.690743046907431e-06, + "loss": 0.4872, + "step": 4855 + }, + { + "epoch": 2.9320857229097497, + "grad_norm": 0.1767578125, + "learning_rate": 4.649232046492321e-06, + "loss": 0.4443, + "step": 4856 + }, + { + "epoch": 2.932689405372774, + "grad_norm": 0.193359375, + "learning_rate": 4.607721046077211e-06, + "loss": 0.596, + "step": 4857 + }, + { + "epoch": 2.933293087835798, + "grad_norm": 0.1982421875, + "learning_rate": 4.566210045662101e-06, + "loss": 0.3698, + "step": 4858 + }, + { + "epoch": 2.933896770298823, + "grad_norm": 0.1953125, + "learning_rate": 4.5246990452469905e-06, + "loss": 0.3508, + "step": 4859 + }, + { + "epoch": 2.9345004527618475, + "grad_norm": 0.2099609375, + "learning_rate": 4.48318804483188e-06, + "loss": 0.3613, + "step": 4860 + }, + { + "epoch": 2.9351041352248717, + "grad_norm": 0.2333984375, + "learning_rate": 4.44167704441677e-06, + "loss": 0.3906, + "step": 4861 + }, + { + "epoch": 2.935707817687896, + "grad_norm": 0.2041015625, + "learning_rate": 4.400166044001661e-06, + "loss": 0.2025, + "step": 4862 + }, + { + "epoch": 2.9363115001509206, + "grad_norm": 0.14453125, + "learning_rate": 4.358655043586551e-06, + "loss": 0.7123, + "step": 4863 + }, + { + "epoch": 2.9369151826139452, + "grad_norm": 0.1611328125, + "learning_rate": 4.317144043171441e-06, + "loss": 0.6143, + "step": 4864 + }, + { + "epoch": 2.9375188650769695, + "grad_norm": 0.1533203125, + "learning_rate": 4.2756330427563305e-06, + "loss": 0.6854, + "step": 4865 + }, + { + "epoch": 2.9381225475399937, + "grad_norm": 0.1376953125, + "learning_rate": 4.234122042341221e-06, + "loss": 0.5651, + "step": 4866 + }, + { + "epoch": 2.9387262300030184, + "grad_norm": 0.138671875, + "learning_rate": 4.192611041926111e-06, + "loss": 0.5303, + "step": 4867 + }, + { + "epoch": 2.939329912466043, + "grad_norm": 0.1630859375, + "learning_rate": 4.151100041511e-06, + "loss": 0.6131, + "step": 4868 + }, + { + "epoch": 2.9399335949290673, + "grad_norm": 0.181640625, + "learning_rate": 4.10958904109589e-06, + "loss": 0.6331, + "step": 4869 + }, + { + "epoch": 2.940537277392092, + "grad_norm": 0.142578125, + "learning_rate": 4.068078040680781e-06, + "loss": 0.567, + "step": 4870 + }, + { + "epoch": 2.941140959855116, + "grad_norm": 0.1572265625, + "learning_rate": 4.026567040265671e-06, + "loss": 0.6356, + "step": 4871 + }, + { + "epoch": 2.941744642318141, + "grad_norm": 0.15625, + "learning_rate": 3.9850560398505605e-06, + "loss": 0.5873, + "step": 4872 + }, + { + "epoch": 2.942348324781165, + "grad_norm": 0.13671875, + "learning_rate": 3.94354503943545e-06, + "loss": 0.7104, + "step": 4873 + }, + { + "epoch": 2.9429520072441897, + "grad_norm": 0.1474609375, + "learning_rate": 3.902034039020341e-06, + "loss": 0.6211, + "step": 4874 + }, + { + "epoch": 2.943555689707214, + "grad_norm": 0.169921875, + "learning_rate": 3.860523038605231e-06, + "loss": 0.613, + "step": 4875 + }, + { + "epoch": 2.9441593721702386, + "grad_norm": 0.1474609375, + "learning_rate": 3.819012038190121e-06, + "loss": 0.7952, + "step": 4876 + }, + { + "epoch": 2.944763054633263, + "grad_norm": 0.1484375, + "learning_rate": 3.7775010377750107e-06, + "loss": 0.5848, + "step": 4877 + }, + { + "epoch": 2.9453667370962875, + "grad_norm": 0.146484375, + "learning_rate": 3.7359900373599e-06, + "loss": 0.5493, + "step": 4878 + }, + { + "epoch": 2.9459704195593117, + "grad_norm": 0.1494140625, + "learning_rate": 3.6944790369447904e-06, + "loss": 0.6622, + "step": 4879 + }, + { + "epoch": 2.9465741020223364, + "grad_norm": 0.1474609375, + "learning_rate": 3.6529680365296803e-06, + "loss": 0.6939, + "step": 4880 + }, + { + "epoch": 2.9471777844853606, + "grad_norm": 0.14453125, + "learning_rate": 3.6114570361145706e-06, + "loss": 0.6388, + "step": 4881 + }, + { + "epoch": 2.9477814669483853, + "grad_norm": 0.1533203125, + "learning_rate": 3.5699460356994605e-06, + "loss": 0.5472, + "step": 4882 + }, + { + "epoch": 2.9483851494114095, + "grad_norm": 0.1416015625, + "learning_rate": 3.5284350352843503e-06, + "loss": 0.5682, + "step": 4883 + }, + { + "epoch": 2.948988831874434, + "grad_norm": 0.1416015625, + "learning_rate": 3.4869240348692406e-06, + "loss": 0.5742, + "step": 4884 + }, + { + "epoch": 2.9495925143374584, + "grad_norm": 0.1640625, + "learning_rate": 3.4454130344541305e-06, + "loss": 0.7286, + "step": 4885 + }, + { + "epoch": 2.950196196800483, + "grad_norm": 0.14453125, + "learning_rate": 3.403902034039021e-06, + "loss": 0.7722, + "step": 4886 + }, + { + "epoch": 2.9507998792635073, + "grad_norm": 0.1552734375, + "learning_rate": 3.3623910336239107e-06, + "loss": 0.6654, + "step": 4887 + }, + { + "epoch": 2.951403561726532, + "grad_norm": 0.146484375, + "learning_rate": 3.320880033208801e-06, + "loss": 0.6901, + "step": 4888 + }, + { + "epoch": 2.952007244189556, + "grad_norm": 0.1494140625, + "learning_rate": 3.27936903279369e-06, + "loss": 0.637, + "step": 4889 + }, + { + "epoch": 2.952610926652581, + "grad_norm": 0.1376953125, + "learning_rate": 3.2378580323785803e-06, + "loss": 0.7953, + "step": 4890 + }, + { + "epoch": 2.953214609115605, + "grad_norm": 0.1328125, + "learning_rate": 3.19634703196347e-06, + "loss": 0.5706, + "step": 4891 + }, + { + "epoch": 2.9538182915786297, + "grad_norm": 0.296875, + "learning_rate": 3.1548360315483604e-06, + "loss": 0.6715, + "step": 4892 + }, + { + "epoch": 2.954421974041654, + "grad_norm": 0.1591796875, + "learning_rate": 3.1133250311332503e-06, + "loss": 0.5795, + "step": 4893 + }, + { + "epoch": 2.9550256565046786, + "grad_norm": 0.142578125, + "learning_rate": 3.0718140307181406e-06, + "loss": 0.5772, + "step": 4894 + }, + { + "epoch": 2.955629338967703, + "grad_norm": 0.1513671875, + "learning_rate": 3.0303030303030305e-06, + "loss": 0.7201, + "step": 4895 + }, + { + "epoch": 2.9562330214307275, + "grad_norm": 0.140625, + "learning_rate": 2.9887920298879208e-06, + "loss": 0.6037, + "step": 4896 + }, + { + "epoch": 2.9568367038937517, + "grad_norm": 0.146484375, + "learning_rate": 2.9472810294728102e-06, + "loss": 0.5457, + "step": 4897 + }, + { + "epoch": 2.9574403863567764, + "grad_norm": 0.15234375, + "learning_rate": 2.9057700290577005e-06, + "loss": 0.6012, + "step": 4898 + }, + { + "epoch": 2.9580440688198006, + "grad_norm": 0.1416015625, + "learning_rate": 2.8642590286425904e-06, + "loss": 0.4818, + "step": 4899 + }, + { + "epoch": 2.9586477512828253, + "grad_norm": 0.146484375, + "learning_rate": 2.8227480282274807e-06, + "loss": 0.4889, + "step": 4900 + }, + { + "epoch": 2.9592514337458495, + "grad_norm": 0.1728515625, + "learning_rate": 2.7812370278123706e-06, + "loss": 0.6557, + "step": 4901 + }, + { + "epoch": 2.959855116208874, + "grad_norm": 0.1591796875, + "learning_rate": 2.7397260273972604e-06, + "loss": 0.4602, + "step": 4902 + }, + { + "epoch": 2.9604587986718984, + "grad_norm": 0.1728515625, + "learning_rate": 2.6982150269821503e-06, + "loss": 0.514, + "step": 4903 + }, + { + "epoch": 2.961062481134923, + "grad_norm": 0.1845703125, + "learning_rate": 2.65670402656704e-06, + "loss": 0.4437, + "step": 4904 + }, + { + "epoch": 2.9616661635979473, + "grad_norm": 0.1865234375, + "learning_rate": 2.6151930261519305e-06, + "loss": 0.4955, + "step": 4905 + }, + { + "epoch": 2.962269846060972, + "grad_norm": 0.193359375, + "learning_rate": 2.5736820257368203e-06, + "loss": 0.4478, + "step": 4906 + }, + { + "epoch": 2.962873528523996, + "grad_norm": 0.1884765625, + "learning_rate": 2.5321710253217102e-06, + "loss": 0.3539, + "step": 4907 + }, + { + "epoch": 2.963477210987021, + "grad_norm": 0.232421875, + "learning_rate": 2.4906600249066e-06, + "loss": 0.3473, + "step": 4908 + }, + { + "epoch": 2.9640808934500456, + "grad_norm": 0.2080078125, + "learning_rate": 2.4491490244914904e-06, + "loss": 0.3788, + "step": 4909 + }, + { + "epoch": 2.9646845759130698, + "grad_norm": 0.212890625, + "learning_rate": 2.4076380240763803e-06, + "loss": 0.3048, + "step": 4910 + }, + { + "epoch": 2.965288258376094, + "grad_norm": 0.2177734375, + "learning_rate": 2.3661270236612705e-06, + "loss": 0.2741, + "step": 4911 + }, + { + "epoch": 2.9658919408391187, + "grad_norm": 0.2001953125, + "learning_rate": 2.3246160232461604e-06, + "loss": 0.2138, + "step": 4912 + }, + { + "epoch": 2.9664956233021433, + "grad_norm": 0.1494140625, + "learning_rate": 2.2831050228310503e-06, + "loss": 0.8374, + "step": 4913 + }, + { + "epoch": 2.9670993057651676, + "grad_norm": 0.150390625, + "learning_rate": 2.24159402241594e-06, + "loss": 0.5438, + "step": 4914 + }, + { + "epoch": 2.967702988228192, + "grad_norm": 0.1484375, + "learning_rate": 2.2000830220008305e-06, + "loss": 0.5951, + "step": 4915 + }, + { + "epoch": 2.9683066706912165, + "grad_norm": 0.15234375, + "learning_rate": 2.1585720215857203e-06, + "loss": 0.9442, + "step": 4916 + }, + { + "epoch": 2.968910353154241, + "grad_norm": 0.140625, + "learning_rate": 2.1170610211706106e-06, + "loss": 0.5268, + "step": 4917 + }, + { + "epoch": 2.9695140356172653, + "grad_norm": 0.16015625, + "learning_rate": 2.0755500207555e-06, + "loss": 0.6036, + "step": 4918 + }, + { + "epoch": 2.9701177180802896, + "grad_norm": 0.1396484375, + "learning_rate": 2.0340390203403904e-06, + "loss": 0.7207, + "step": 4919 + }, + { + "epoch": 2.9707214005433142, + "grad_norm": 0.1591796875, + "learning_rate": 1.9925280199252802e-06, + "loss": 0.8401, + "step": 4920 + }, + { + "epoch": 2.971325083006339, + "grad_norm": 0.14453125, + "learning_rate": 1.9510170195101705e-06, + "loss": 0.6123, + "step": 4921 + }, + { + "epoch": 2.971928765469363, + "grad_norm": 0.1416015625, + "learning_rate": 1.9095060190950604e-06, + "loss": 0.6068, + "step": 4922 + }, + { + "epoch": 2.9725324479323874, + "grad_norm": 0.1357421875, + "learning_rate": 1.86799501867995e-06, + "loss": 0.8205, + "step": 4923 + }, + { + "epoch": 2.973136130395412, + "grad_norm": 0.1533203125, + "learning_rate": 1.8264840182648401e-06, + "loss": 0.6167, + "step": 4924 + }, + { + "epoch": 2.9737398128584367, + "grad_norm": 0.1416015625, + "learning_rate": 1.7849730178497302e-06, + "loss": 0.5753, + "step": 4925 + }, + { + "epoch": 2.974343495321461, + "grad_norm": 0.1416015625, + "learning_rate": 1.7434620174346203e-06, + "loss": 0.558, + "step": 4926 + }, + { + "epoch": 2.974947177784485, + "grad_norm": 0.1435546875, + "learning_rate": 1.7019510170195104e-06, + "loss": 0.8501, + "step": 4927 + }, + { + "epoch": 2.97555086024751, + "grad_norm": 0.1552734375, + "learning_rate": 1.6604400166044005e-06, + "loss": 0.7634, + "step": 4928 + }, + { + "epoch": 2.9761545427105345, + "grad_norm": 0.154296875, + "learning_rate": 1.6189290161892901e-06, + "loss": 0.8527, + "step": 4929 + }, + { + "epoch": 2.9767582251735587, + "grad_norm": 0.1650390625, + "learning_rate": 1.5774180157741802e-06, + "loss": 0.7618, + "step": 4930 + }, + { + "epoch": 2.977361907636583, + "grad_norm": 0.142578125, + "learning_rate": 1.5359070153590703e-06, + "loss": 0.5931, + "step": 4931 + }, + { + "epoch": 2.9779655900996076, + "grad_norm": 0.1455078125, + "learning_rate": 1.4943960149439604e-06, + "loss": 0.9053, + "step": 4932 + }, + { + "epoch": 2.9785692725626323, + "grad_norm": 0.171875, + "learning_rate": 1.4528850145288503e-06, + "loss": 0.6993, + "step": 4933 + }, + { + "epoch": 2.9791729550256565, + "grad_norm": 0.1416015625, + "learning_rate": 1.4113740141137403e-06, + "loss": 0.5533, + "step": 4934 + }, + { + "epoch": 2.9797766374886807, + "grad_norm": 0.1611328125, + "learning_rate": 1.3698630136986302e-06, + "loss": 0.8972, + "step": 4935 + }, + { + "epoch": 2.9803803199517054, + "grad_norm": 0.138671875, + "learning_rate": 1.32835201328352e-06, + "loss": 0.5559, + "step": 4936 + }, + { + "epoch": 2.98098400241473, + "grad_norm": 0.1416015625, + "learning_rate": 1.2868410128684102e-06, + "loss": 0.6695, + "step": 4937 + }, + { + "epoch": 2.9815876848777543, + "grad_norm": 0.1494140625, + "learning_rate": 1.2453300124533e-06, + "loss": 0.6848, + "step": 4938 + }, + { + "epoch": 2.9821913673407785, + "grad_norm": 0.13671875, + "learning_rate": 1.2038190120381901e-06, + "loss": 0.6172, + "step": 4939 + }, + { + "epoch": 2.982795049803803, + "grad_norm": 0.1455078125, + "learning_rate": 1.1623080116230802e-06, + "loss": 0.5114, + "step": 4940 + }, + { + "epoch": 2.983398732266828, + "grad_norm": 0.13671875, + "learning_rate": 1.12079701120797e-06, + "loss": 0.6146, + "step": 4941 + }, + { + "epoch": 2.984002414729852, + "grad_norm": 0.1298828125, + "learning_rate": 1.0792860107928602e-06, + "loss": 0.5792, + "step": 4942 + }, + { + "epoch": 2.9846060971928763, + "grad_norm": 0.15625, + "learning_rate": 1.03777501037775e-06, + "loss": 0.7686, + "step": 4943 + }, + { + "epoch": 2.985209779655901, + "grad_norm": 0.1435546875, + "learning_rate": 9.962640099626401e-07, + "loss": 0.6975, + "step": 4944 + }, + { + "epoch": 2.9858134621189256, + "grad_norm": 0.1455078125, + "learning_rate": 9.547530095475302e-07, + "loss": 0.5667, + "step": 4945 + }, + { + "epoch": 2.98641714458195, + "grad_norm": 0.158203125, + "learning_rate": 9.132420091324201e-07, + "loss": 0.5319, + "step": 4946 + }, + { + "epoch": 2.987020827044974, + "grad_norm": 0.1640625, + "learning_rate": 8.717310087173102e-07, + "loss": 0.573, + "step": 4947 + }, + { + "epoch": 2.9876245095079987, + "grad_norm": 0.1474609375, + "learning_rate": 8.302200083022002e-07, + "loss": 0.5194, + "step": 4948 + }, + { + "epoch": 2.9882281919710234, + "grad_norm": 0.1552734375, + "learning_rate": 7.887090078870901e-07, + "loss": 0.541, + "step": 4949 + }, + { + "epoch": 2.9888318744340476, + "grad_norm": 0.1572265625, + "learning_rate": 7.471980074719802e-07, + "loss": 0.5331, + "step": 4950 + }, + { + "epoch": 2.9894355568970723, + "grad_norm": 0.1708984375, + "learning_rate": 7.056870070568702e-07, + "loss": 0.501, + "step": 4951 + }, + { + "epoch": 2.9900392393600965, + "grad_norm": 0.171875, + "learning_rate": 6.6417600664176e-07, + "loss": 0.5099, + "step": 4952 + }, + { + "epoch": 2.990642921823121, + "grad_norm": 0.1611328125, + "learning_rate": 6.2266500622665e-07, + "loss": 0.4793, + "step": 4953 + }, + { + "epoch": 2.9912466042861454, + "grad_norm": 0.18359375, + "learning_rate": 5.811540058115401e-07, + "loss": 0.5053, + "step": 4954 + }, + { + "epoch": 2.99185028674917, + "grad_norm": 0.1923828125, + "learning_rate": 5.396430053964301e-07, + "loss": 0.4264, + "step": 4955 + }, + { + "epoch": 2.9924539692121943, + "grad_norm": 0.19140625, + "learning_rate": 4.981320049813201e-07, + "loss": 0.5337, + "step": 4956 + }, + { + "epoch": 2.993057651675219, + "grad_norm": 0.189453125, + "learning_rate": 4.5662100456621004e-07, + "loss": 0.4105, + "step": 4957 + }, + { + "epoch": 2.993661334138243, + "grad_norm": 0.212890625, + "learning_rate": 4.151100041511001e-07, + "loss": 0.4347, + "step": 4958 + }, + { + "epoch": 2.994265016601268, + "grad_norm": 0.2021484375, + "learning_rate": 3.735990037359901e-07, + "loss": 0.3364, + "step": 4959 + }, + { + "epoch": 2.994868699064292, + "grad_norm": 0.22265625, + "learning_rate": 3.3208800332088e-07, + "loss": 0.3349, + "step": 4960 + }, + { + "epoch": 2.9954723815273168, + "grad_norm": 0.193359375, + "learning_rate": 2.9057700290577005e-07, + "loss": 0.2545, + "step": 4961 + }, + { + "epoch": 2.996076063990341, + "grad_norm": 0.205078125, + "learning_rate": 2.4906600249066003e-07, + "loss": 0.184, + "step": 4962 + }, + { + "epoch": 2.9966797464533657, + "grad_norm": 0.16015625, + "learning_rate": 2.0755500207555006e-07, + "loss": 0.6758, + "step": 4963 + }, + { + "epoch": 2.99728342891639, + "grad_norm": 0.146484375, + "learning_rate": 1.6604400166044e-07, + "loss": 0.5998, + "step": 4964 + }, + { + "epoch": 2.9978871113794145, + "grad_norm": 0.1552734375, + "learning_rate": 1.2453300124533001e-07, + "loss": 0.7291, + "step": 4965 + }, + { + "epoch": 2.9984907938424388, + "grad_norm": 0.1650390625, + "learning_rate": 8.302200083022e-08, + "loss": 0.6966, + "step": 4966 + }, + { + "epoch": 2.9990944763054634, + "grad_norm": 0.1748046875, + "learning_rate": 4.151100041511e-08, + "loss": 0.5435, + "step": 4967 + }, + { + "epoch": 2.9996981587684877, + "grad_norm": 0.169921875, + "learning_rate": 0.0, + "loss": 0.4412, + "step": 4968 + } + ], + "logging_steps": 1, + "max_steps": 4968, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 276, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.988555585450428e+19, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}