{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 40, "global_step": 1950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010256410256410256, "grad_norm": 1.2734375, "learning_rate": 1.25e-05, "loss": 9.3754, "step": 1 }, { "epoch": 0.0020512820512820513, "grad_norm": 1.234375, "learning_rate": 2.5e-05, "loss": 9.5186, "step": 2 }, { "epoch": 0.003076923076923077, "grad_norm": 1.1796875, "learning_rate": 3.75e-05, "loss": 9.3076, "step": 3 }, { "epoch": 0.0041025641025641026, "grad_norm": 0.9140625, "learning_rate": 5e-05, "loss": 9.0736, "step": 4 }, { "epoch": 0.005128205128205128, "grad_norm": 0.92578125, "learning_rate": 6.25e-05, "loss": 8.6805, "step": 5 }, { "epoch": 0.006153846153846154, "grad_norm": 1.234375, "learning_rate": 7.5e-05, "loss": 8.328, "step": 6 }, { "epoch": 0.0071794871794871795, "grad_norm": 1.1328125, "learning_rate": 8.75e-05, "loss": 7.9546, "step": 7 }, { "epoch": 0.008205128205128205, "grad_norm": 0.94140625, "learning_rate": 0.0001, "loss": 7.0715, "step": 8 }, { "epoch": 0.009230769230769232, "grad_norm": 1.96875, "learning_rate": 0.00011250000000000001, "loss": 6.7071, "step": 9 }, { "epoch": 0.010256410256410256, "grad_norm": 2.703125, "learning_rate": 0.000125, "loss": 6.4597, "step": 10 }, { "epoch": 0.011282051282051283, "grad_norm": 2.265625, "learning_rate": 0.0001375, "loss": 6.0395, "step": 11 }, { "epoch": 0.012307692307692308, "grad_norm": 1.6171875, "learning_rate": 0.00015, "loss": 5.784, "step": 12 }, { "epoch": 0.013333333333333334, "grad_norm": 1.2421875, "learning_rate": 0.00016250000000000002, "loss": 5.4673, "step": 13 }, { "epoch": 0.014358974358974359, "grad_norm": 1.1328125, "learning_rate": 0.000175, "loss": 5.258, "step": 14 }, { "epoch": 0.015384615384615385, "grad_norm": 1.0546875, "learning_rate": 0.0001875, "loss": 4.999, "step": 15 }, { "epoch": 0.01641025641025641, "grad_norm": 1.1875, 
"learning_rate": 0.0002, "loss": 4.5738, "step": 16 }, { "epoch": 0.017435897435897435, "grad_norm": 1.546875, "learning_rate": 0.0002125, "loss": 4.6204, "step": 17 }, { "epoch": 0.018461538461538463, "grad_norm": 0.9140625, "learning_rate": 0.00022500000000000002, "loss": 4.3842, "step": 18 }, { "epoch": 0.019487179487179488, "grad_norm": 0.78125, "learning_rate": 0.0002375, "loss": 4.1915, "step": 19 }, { "epoch": 0.020512820512820513, "grad_norm": 0.71484375, "learning_rate": 0.00025, "loss": 3.9875, "step": 20 }, { "epoch": 0.021538461538461538, "grad_norm": 0.7265625, "learning_rate": 0.00026250000000000004, "loss": 3.8977, "step": 21 }, { "epoch": 0.022564102564102566, "grad_norm": 0.6484375, "learning_rate": 0.000275, "loss": 3.8187, "step": 22 }, { "epoch": 0.02358974358974359, "grad_norm": 0.6328125, "learning_rate": 0.0002875, "loss": 3.5442, "step": 23 }, { "epoch": 0.024615384615384615, "grad_norm": 0.5859375, "learning_rate": 0.0003, "loss": 3.5165, "step": 24 }, { "epoch": 0.02564102564102564, "grad_norm": 0.50390625, "learning_rate": 0.0003125, "loss": 3.476, "step": 25 }, { "epoch": 0.02666666666666667, "grad_norm": 0.498046875, "learning_rate": 0.00032500000000000004, "loss": 3.3653, "step": 26 }, { "epoch": 0.027692307692307693, "grad_norm": 0.42578125, "learning_rate": 0.0003375, "loss": 3.4578, "step": 27 }, { "epoch": 0.028717948717948718, "grad_norm": 0.404296875, "learning_rate": 0.00035, "loss": 3.2696, "step": 28 }, { "epoch": 0.029743589743589743, "grad_norm": 0.400390625, "learning_rate": 0.0003625, "loss": 3.3524, "step": 29 }, { "epoch": 0.03076923076923077, "grad_norm": 0.375, "learning_rate": 0.000375, "loss": 3.2464, "step": 30 }, { "epoch": 0.031794871794871796, "grad_norm": 0.380859375, "learning_rate": 0.00038750000000000004, "loss": 3.1908, "step": 31 }, { "epoch": 0.03282051282051282, "grad_norm": 0.341796875, "learning_rate": 0.0004, "loss": 3.1472, "step": 32 }, { "epoch": 0.033846153846153845, "grad_norm": 0.32421875, 
"learning_rate": 0.0004125, "loss": 3.1488, "step": 33 }, { "epoch": 0.03487179487179487, "grad_norm": 0.31640625, "learning_rate": 0.000425, "loss": 3.06, "step": 34 }, { "epoch": 0.035897435897435895, "grad_norm": 0.31640625, "learning_rate": 0.0004375, "loss": 3.0669, "step": 35 }, { "epoch": 0.036923076923076927, "grad_norm": 0.314453125, "learning_rate": 0.00045000000000000004, "loss": 3.0621, "step": 36 }, { "epoch": 0.03794871794871795, "grad_norm": 0.310546875, "learning_rate": 0.0004625, "loss": 3.0407, "step": 37 }, { "epoch": 0.038974358974358976, "grad_norm": 0.287109375, "learning_rate": 0.000475, "loss": 3.0882, "step": 38 }, { "epoch": 0.04, "grad_norm": 0.3359375, "learning_rate": 0.0004875, "loss": 2.8335, "step": 39 }, { "epoch": 0.041025641025641026, "grad_norm": 0.28125, "learning_rate": 0.0005, "loss": 3.0038, "step": 40 }, { "epoch": 0.041025641025641026, "eval_loss": null, "eval_runtime": 73.9444, "eval_samples_per_second": 9.291, "eval_steps_per_second": 1.163, "step": 40 }, { "epoch": 0.04205128205128205, "grad_norm": 0.283203125, "learning_rate": 0.0004999996618238887, "loss": 2.8374, "step": 41 }, { "epoch": 0.043076923076923075, "grad_norm": 0.279296875, "learning_rate": 0.0004999986472964693, "loss": 3.0613, "step": 42 }, { "epoch": 0.0441025641025641, "grad_norm": 0.29296875, "learning_rate": 0.0004999969564204868, "loss": 2.9548, "step": 43 }, { "epoch": 0.04512820512820513, "grad_norm": 0.298828125, "learning_rate": 0.0004999945892005157, "loss": 2.9424, "step": 44 }, { "epoch": 0.046153846153846156, "grad_norm": 0.271484375, "learning_rate": 0.0004999915456429602, "loss": 2.8775, "step": 45 }, { "epoch": 0.04717948717948718, "grad_norm": 0.26171875, "learning_rate": 0.0004999878257560544, "loss": 2.964, "step": 46 }, { "epoch": 0.048205128205128206, "grad_norm": 0.275390625, "learning_rate": 0.000499983429549862, "loss": 2.9445, "step": 47 }, { "epoch": 0.04923076923076923, "grad_norm": 0.2490234375, "learning_rate": 
0.0004999783570362767, "loss": 2.9258, "step": 48 }, { "epoch": 0.050256410256410255, "grad_norm": 0.267578125, "learning_rate": 0.0004999726082290217, "loss": 2.9669, "step": 49 }, { "epoch": 0.05128205128205128, "grad_norm": 0.23828125, "learning_rate": 0.0004999661831436498, "loss": 2.8812, "step": 50 }, { "epoch": 0.052307692307692305, "grad_norm": 0.2451171875, "learning_rate": 0.0004999590817975436, "loss": 2.9764, "step": 51 }, { "epoch": 0.05333333333333334, "grad_norm": 0.244140625, "learning_rate": 0.0004999513042099151, "loss": 2.9131, "step": 52 }, { "epoch": 0.05435897435897436, "grad_norm": 0.24609375, "learning_rate": 0.0004999428504018057, "loss": 2.8926, "step": 53 }, { "epoch": 0.055384615384615386, "grad_norm": 0.255859375, "learning_rate": 0.0004999337203960866, "loss": 2.912, "step": 54 }, { "epoch": 0.05641025641025641, "grad_norm": 0.283203125, "learning_rate": 0.0004999239142174581, "loss": 3.0416, "step": 55 }, { "epoch": 0.057435897435897436, "grad_norm": 0.2392578125, "learning_rate": 0.0004999134318924499, "loss": 2.9356, "step": 56 }, { "epoch": 0.05846153846153846, "grad_norm": 0.318359375, "learning_rate": 0.0004999022734494211, "loss": 2.8295, "step": 57 }, { "epoch": 0.059487179487179485, "grad_norm": 0.2412109375, "learning_rate": 0.0004998904389185598, "loss": 2.8644, "step": 58 }, { "epoch": 0.06051282051282051, "grad_norm": 0.2333984375, "learning_rate": 0.0004998779283318832, "loss": 2.8911, "step": 59 }, { "epoch": 0.06153846153846154, "grad_norm": 0.24609375, "learning_rate": 0.0004998647417232376, "loss": 2.9487, "step": 60 }, { "epoch": 0.06256410256410257, "grad_norm": 0.228515625, "learning_rate": 0.0004998508791282981, "loss": 2.8789, "step": 61 }, { "epoch": 0.06358974358974359, "grad_norm": 0.255859375, "learning_rate": 0.0004998363405845687, "loss": 2.8284, "step": 62 }, { "epoch": 0.06461538461538462, "grad_norm": 0.265625, "learning_rate": 0.0004998211261313822, "loss": 2.7707, "step": 63 }, { "epoch": 
0.06564102564102564, "grad_norm": 0.28125, "learning_rate": 0.0004998052358098998, "loss": 2.8432, "step": 64 }, { "epoch": 0.06666666666666667, "grad_norm": 0.25390625, "learning_rate": 0.0004997886696631115, "loss": 2.8685, "step": 65 }, { "epoch": 0.06769230769230769, "grad_norm": 0.2353515625, "learning_rate": 0.0004997714277358352, "loss": 2.837, "step": 66 }, { "epoch": 0.06871794871794872, "grad_norm": 0.25390625, "learning_rate": 0.0004997535100747176, "loss": 2.8735, "step": 67 }, { "epoch": 0.06974358974358974, "grad_norm": 0.28515625, "learning_rate": 0.0004997349167282333, "loss": 2.7661, "step": 68 }, { "epoch": 0.07076923076923076, "grad_norm": 0.255859375, "learning_rate": 0.0004997156477466848, "loss": 2.9012, "step": 69 }, { "epoch": 0.07179487179487179, "grad_norm": 0.2412109375, "learning_rate": 0.0004996957031822025, "loss": 2.8145, "step": 70 }, { "epoch": 0.07282051282051281, "grad_norm": 0.2470703125, "learning_rate": 0.000499675083088745, "loss": 2.8928, "step": 71 }, { "epoch": 0.07384615384615385, "grad_norm": 0.26171875, "learning_rate": 0.0004996537875220975, "loss": 2.8028, "step": 72 }, { "epoch": 0.07487179487179488, "grad_norm": 0.26171875, "learning_rate": 0.0004996318165398737, "loss": 2.8123, "step": 73 }, { "epoch": 0.0758974358974359, "grad_norm": 0.265625, "learning_rate": 0.0004996091702015138, "loss": 2.9138, "step": 74 }, { "epoch": 0.07692307692307693, "grad_norm": 0.26171875, "learning_rate": 0.0004995858485682856, "loss": 2.8805, "step": 75 }, { "epoch": 0.07794871794871795, "grad_norm": 0.2734375, "learning_rate": 0.0004995618517032836, "loss": 2.8687, "step": 76 }, { "epoch": 0.07897435897435898, "grad_norm": 0.271484375, "learning_rate": 0.000499537179671429, "loss": 2.7618, "step": 77 }, { "epoch": 0.08, "grad_norm": 0.26171875, "learning_rate": 0.0004995118325394699, "loss": 2.8704, "step": 78 }, { "epoch": 0.08102564102564103, "grad_norm": 0.298828125, "learning_rate": 0.0004994858103759805, "loss": 2.922, "step": 
79 }, { "epoch": 0.08205128205128205, "grad_norm": 0.2490234375, "learning_rate": 0.0004994591132513616, "loss": 2.881, "step": 80 }, { "epoch": 0.08205128205128205, "eval_loss": null, "eval_runtime": 73.9172, "eval_samples_per_second": 9.294, "eval_steps_per_second": 1.163, "step": 80 }, { "epoch": 0.08307692307692308, "grad_norm": 0.251953125, "learning_rate": 0.0004994317412378396, "loss": 2.8859, "step": 81 }, { "epoch": 0.0841025641025641, "grad_norm": 0.265625, "learning_rate": 0.0004994036944094673, "loss": 2.7814, "step": 82 }, { "epoch": 0.08512820512820513, "grad_norm": 0.255859375, "learning_rate": 0.0004993749728421224, "loss": 2.8369, "step": 83 }, { "epoch": 0.08615384615384615, "grad_norm": 0.263671875, "learning_rate": 0.0004993455766135088, "loss": 2.6328, "step": 84 }, { "epoch": 0.08717948717948718, "grad_norm": 0.26171875, "learning_rate": 0.0004993155058031553, "loss": 2.9201, "step": 85 }, { "epoch": 0.0882051282051282, "grad_norm": 0.2578125, "learning_rate": 0.0004992847604924157, "loss": 2.7388, "step": 86 }, { "epoch": 0.08923076923076922, "grad_norm": 0.314453125, "learning_rate": 0.0004992533407644686, "loss": 2.8858, "step": 87 }, { "epoch": 0.09025641025641026, "grad_norm": 0.251953125, "learning_rate": 0.0004992212467043172, "loss": 2.8449, "step": 88 }, { "epoch": 0.09128205128205129, "grad_norm": 0.2333984375, "learning_rate": 0.0004991884783987891, "loss": 2.7678, "step": 89 }, { "epoch": 0.09230769230769231, "grad_norm": 0.2421875, "learning_rate": 0.000499155035936536, "loss": 2.8102, "step": 90 }, { "epoch": 0.09333333333333334, "grad_norm": 0.259765625, "learning_rate": 0.0004991209194080334, "loss": 2.8242, "step": 91 }, { "epoch": 0.09435897435897436, "grad_norm": 0.236328125, "learning_rate": 0.0004990861289055804, "loss": 2.7657, "step": 92 }, { "epoch": 0.09538461538461539, "grad_norm": 0.236328125, "learning_rate": 0.0004990506645232996, "loss": 2.8789, "step": 93 }, { "epoch": 0.09641025641025641, "grad_norm": 0.26953125, 
"learning_rate": 0.0004990145263571367, "loss": 2.8767, "step": 94 }, { "epoch": 0.09743589743589744, "grad_norm": 0.2490234375, "learning_rate": 0.0004989777145048601, "loss": 2.941, "step": 95 }, { "epoch": 0.09846153846153846, "grad_norm": 0.263671875, "learning_rate": 0.000498940229066061, "loss": 2.8177, "step": 96 }, { "epoch": 0.09948717948717949, "grad_norm": 0.2275390625, "learning_rate": 0.0004989020701421527, "loss": 2.7936, "step": 97 }, { "epoch": 0.10051282051282051, "grad_norm": 0.2734375, "learning_rate": 0.0004988632378363709, "loss": 2.8293, "step": 98 }, { "epoch": 0.10153846153846154, "grad_norm": 0.287109375, "learning_rate": 0.0004988237322537727, "loss": 2.7191, "step": 99 }, { "epoch": 0.10256410256410256, "grad_norm": 0.2412109375, "learning_rate": 0.000498783553501237, "loss": 2.8819, "step": 100 }, { "epoch": 0.10358974358974359, "grad_norm": 0.259765625, "learning_rate": 0.0004987427016874636, "loss": 2.8212, "step": 101 }, { "epoch": 0.10461538461538461, "grad_norm": 0.2734375, "learning_rate": 0.0004987011769229735, "loss": 2.7309, "step": 102 }, { "epoch": 0.10564102564102563, "grad_norm": 0.25390625, "learning_rate": 0.0004986589793201081, "loss": 2.8263, "step": 103 }, { "epoch": 0.10666666666666667, "grad_norm": 0.255859375, "learning_rate": 0.0004986161089930291, "loss": 2.7407, "step": 104 }, { "epoch": 0.1076923076923077, "grad_norm": 0.26953125, "learning_rate": 0.0004985725660577184, "loss": 2.9486, "step": 105 }, { "epoch": 0.10871794871794872, "grad_norm": 0.2451171875, "learning_rate": 0.0004985283506319774, "loss": 2.7858, "step": 106 }, { "epoch": 0.10974358974358975, "grad_norm": 0.25390625, "learning_rate": 0.0004984834628354268, "loss": 2.845, "step": 107 }, { "epoch": 0.11076923076923077, "grad_norm": 0.2734375, "learning_rate": 0.0004984379027895066, "loss": 2.7756, "step": 108 }, { "epoch": 0.1117948717948718, "grad_norm": 0.283203125, "learning_rate": 0.0004983916706174752, "loss": 2.9129, "step": 109 }, { "epoch": 
0.11282051282051282, "grad_norm": 0.2578125, "learning_rate": 0.0004983447664444096, "loss": 2.8223, "step": 110 }, { "epoch": 0.11384615384615385, "grad_norm": 0.26953125, "learning_rate": 0.0004982971903972049, "loss": 2.804, "step": 111 }, { "epoch": 0.11487179487179487, "grad_norm": 0.2490234375, "learning_rate": 0.0004982489426045737, "loss": 2.8519, "step": 112 }, { "epoch": 0.1158974358974359, "grad_norm": 0.244140625, "learning_rate": 0.0004982000231970457, "loss": 2.8596, "step": 113 }, { "epoch": 0.11692307692307692, "grad_norm": 0.2294921875, "learning_rate": 0.0004981504323069684, "loss": 2.8082, "step": 114 }, { "epoch": 0.11794871794871795, "grad_norm": 0.2578125, "learning_rate": 0.000498100170068505, "loss": 2.7364, "step": 115 }, { "epoch": 0.11897435897435897, "grad_norm": 0.236328125, "learning_rate": 0.0004980492366176356, "loss": 2.8783, "step": 116 }, { "epoch": 0.12, "grad_norm": 0.265625, "learning_rate": 0.0004979976320921561, "loss": 2.7935, "step": 117 }, { "epoch": 0.12102564102564102, "grad_norm": 0.2470703125, "learning_rate": 0.0004979453566316777, "loss": 2.8763, "step": 118 }, { "epoch": 0.12205128205128205, "grad_norm": 0.265625, "learning_rate": 0.0004978924103776268, "loss": 2.6928, "step": 119 }, { "epoch": 0.12307692307692308, "grad_norm": 0.234375, "learning_rate": 0.000497838793473245, "loss": 2.7344, "step": 120 }, { "epoch": 0.12307692307692308, "eval_loss": null, "eval_runtime": 73.9769, "eval_samples_per_second": 9.287, "eval_steps_per_second": 1.163, "step": 120 }, { "epoch": 0.12410256410256411, "grad_norm": 0.26171875, "learning_rate": 0.0004977845060635878, "loss": 2.8733, "step": 121 }, { "epoch": 0.12512820512820513, "grad_norm": 0.26171875, "learning_rate": 0.0004977295482955247, "loss": 2.7865, "step": 122 }, { "epoch": 0.12615384615384614, "grad_norm": 0.2294921875, "learning_rate": 0.0004976739203177389, "loss": 2.75, "step": 123 }, { "epoch": 0.12717948717948718, "grad_norm": 0.25, "learning_rate": 
0.0004976176222807271, "loss": 2.9631, "step": 124 }, { "epoch": 0.1282051282051282, "grad_norm": 0.29296875, "learning_rate": 0.0004975606543367983, "loss": 2.9877, "step": 125 }, { "epoch": 0.12923076923076923, "grad_norm": 0.22265625, "learning_rate": 0.0004975030166400742, "loss": 2.7888, "step": 126 }, { "epoch": 0.13025641025641024, "grad_norm": 0.236328125, "learning_rate": 0.0004974447093464882, "loss": 2.8653, "step": 127 }, { "epoch": 0.13128205128205128, "grad_norm": 0.21484375, "learning_rate": 0.0004973857326137855, "loss": 2.845, "step": 128 }, { "epoch": 0.13230769230769232, "grad_norm": 0.232421875, "learning_rate": 0.0004973260866015224, "loss": 2.9031, "step": 129 }, { "epoch": 0.13333333333333333, "grad_norm": 0.2373046875, "learning_rate": 0.0004972657714710653, "loss": 2.9482, "step": 130 }, { "epoch": 0.13435897435897437, "grad_norm": 0.259765625, "learning_rate": 0.0004972047873855917, "loss": 2.7798, "step": 131 }, { "epoch": 0.13538461538461538, "grad_norm": 0.251953125, "learning_rate": 0.0004971431345100884, "loss": 2.9302, "step": 132 }, { "epoch": 0.13641025641025642, "grad_norm": 0.2734375, "learning_rate": 0.0004970808130113516, "loss": 2.8131, "step": 133 }, { "epoch": 0.13743589743589743, "grad_norm": 0.259765625, "learning_rate": 0.0004970178230579863, "loss": 2.8385, "step": 134 }, { "epoch": 0.13846153846153847, "grad_norm": 0.259765625, "learning_rate": 0.0004969541648204063, "loss": 2.6896, "step": 135 }, { "epoch": 0.13948717948717948, "grad_norm": 0.2275390625, "learning_rate": 0.0004968898384708332, "loss": 2.8197, "step": 136 }, { "epoch": 0.14051282051282052, "grad_norm": 0.25390625, "learning_rate": 0.0004968248441832957, "loss": 2.766, "step": 137 }, { "epoch": 0.14153846153846153, "grad_norm": 0.267578125, "learning_rate": 0.0004967591821336304, "loss": 2.9642, "step": 138 }, { "epoch": 0.14256410256410257, "grad_norm": 0.2470703125, "learning_rate": 0.0004966928524994797, "loss": 3.0062, "step": 139 }, { "epoch": 
0.14358974358974358, "grad_norm": 0.26953125, "learning_rate": 0.0004966258554602924, "loss": 2.7061, "step": 140 }, { "epoch": 0.14461538461538462, "grad_norm": 0.2412109375, "learning_rate": 0.0004965581911973231, "loss": 2.8083, "step": 141 }, { "epoch": 0.14564102564102563, "grad_norm": 0.25, "learning_rate": 0.0004964898598936309, "loss": 2.892, "step": 142 }, { "epoch": 0.14666666666666667, "grad_norm": 0.2421875, "learning_rate": 0.0004964208617340803, "loss": 2.7372, "step": 143 }, { "epoch": 0.1476923076923077, "grad_norm": 0.2294921875, "learning_rate": 0.0004963511969053394, "loss": 2.7976, "step": 144 }, { "epoch": 0.14871794871794872, "grad_norm": 0.2373046875, "learning_rate": 0.0004962808655958799, "loss": 2.8878, "step": 145 }, { "epoch": 0.14974358974358976, "grad_norm": 0.255859375, "learning_rate": 0.000496209867995977, "loss": 2.8502, "step": 146 }, { "epoch": 0.15076923076923077, "grad_norm": 0.2451171875, "learning_rate": 0.0004961382042977081, "loss": 2.7176, "step": 147 }, { "epoch": 0.1517948717948718, "grad_norm": 0.2373046875, "learning_rate": 0.0004960658746949528, "loss": 2.8168, "step": 148 }, { "epoch": 0.15282051282051282, "grad_norm": 0.26171875, "learning_rate": 0.0004959928793833923, "loss": 2.9103, "step": 149 }, { "epoch": 0.15384615384615385, "grad_norm": 0.259765625, "learning_rate": 0.0004959192185605088, "loss": 2.9834, "step": 150 }, { "epoch": 0.15487179487179487, "grad_norm": 0.2578125, "learning_rate": 0.0004958448924255849, "loss": 2.8788, "step": 151 }, { "epoch": 0.1558974358974359, "grad_norm": 0.25390625, "learning_rate": 0.0004957699011797031, "loss": 2.7303, "step": 152 }, { "epoch": 0.15692307692307692, "grad_norm": 0.240234375, "learning_rate": 0.0004956942450257454, "loss": 2.7905, "step": 153 }, { "epoch": 0.15794871794871795, "grad_norm": 0.259765625, "learning_rate": 0.0004956179241683928, "loss": 2.6549, "step": 154 }, { "epoch": 0.15897435897435896, "grad_norm": 0.2578125, "learning_rate": 
0.0004955409388141244, "loss": 3.0035, "step": 155 }, { "epoch": 0.16, "grad_norm": 0.263671875, "learning_rate": 0.0004954632891712168, "loss": 3.0538, "step": 156 }, { "epoch": 0.16102564102564101, "grad_norm": 0.2470703125, "learning_rate": 0.0004953849754497443, "loss": 2.7647, "step": 157 }, { "epoch": 0.16205128205128205, "grad_norm": 0.25, "learning_rate": 0.0004953059978615774, "loss": 2.7751, "step": 158 }, { "epoch": 0.16307692307692306, "grad_norm": 0.24609375, "learning_rate": 0.0004952263566203828, "loss": 2.5988, "step": 159 }, { "epoch": 0.1641025641025641, "grad_norm": 0.2421875, "learning_rate": 0.0004951460519416228, "loss": 2.7523, "step": 160 }, { "epoch": 0.1641025641025641, "eval_loss": null, "eval_runtime": 74.0525, "eval_samples_per_second": 9.277, "eval_steps_per_second": 1.161, "step": 160 }, { "epoch": 0.16512820512820514, "grad_norm": 0.251953125, "learning_rate": 0.0004950650840425541, "loss": 2.9122, "step": 161 }, { "epoch": 0.16615384615384615, "grad_norm": 0.2373046875, "learning_rate": 0.0004949834531422281, "loss": 2.8136, "step": 162 }, { "epoch": 0.1671794871794872, "grad_norm": 0.259765625, "learning_rate": 0.0004949011594614899, "loss": 2.9062, "step": 163 }, { "epoch": 0.1682051282051282, "grad_norm": 0.271484375, "learning_rate": 0.0004948182032229774, "loss": 2.8322, "step": 164 }, { "epoch": 0.16923076923076924, "grad_norm": 0.2451171875, "learning_rate": 0.000494734584651121, "loss": 2.8113, "step": 165 }, { "epoch": 0.17025641025641025, "grad_norm": 0.2373046875, "learning_rate": 0.0004946503039721435, "loss": 2.9439, "step": 166 }, { "epoch": 0.1712820512820513, "grad_norm": 0.26171875, "learning_rate": 0.0004945653614140583, "loss": 2.7564, "step": 167 }, { "epoch": 0.1723076923076923, "grad_norm": 0.251953125, "learning_rate": 0.0004944797572066698, "loss": 2.7837, "step": 168 }, { "epoch": 0.17333333333333334, "grad_norm": 0.2451171875, "learning_rate": 0.0004943934915815725, "loss": 2.8411, "step": 169 }, { "epoch": 
0.17435897435897435, "grad_norm": 0.2294921875, "learning_rate": 0.0004943065647721502, "loss": 2.7355, "step": 170 }, { "epoch": 0.1753846153846154, "grad_norm": 0.25390625, "learning_rate": 0.0004942189770135752, "loss": 2.5963, "step": 171 }, { "epoch": 0.1764102564102564, "grad_norm": 0.224609375, "learning_rate": 0.0004941307285428086, "loss": 2.8289, "step": 172 }, { "epoch": 0.17743589743589744, "grad_norm": 0.259765625, "learning_rate": 0.0004940418195985983, "loss": 2.8069, "step": 173 }, { "epoch": 0.17846153846153845, "grad_norm": 0.2138671875, "learning_rate": 0.0004939522504214794, "loss": 2.6596, "step": 174 }, { "epoch": 0.1794871794871795, "grad_norm": 0.224609375, "learning_rate": 0.0004938620212537732, "loss": 2.7244, "step": 175 }, { "epoch": 0.18051282051282053, "grad_norm": 0.2314453125, "learning_rate": 0.0004937711323395865, "loss": 2.808, "step": 176 }, { "epoch": 0.18153846153846154, "grad_norm": 0.296875, "learning_rate": 0.0004936795839248109, "loss": 2.9269, "step": 177 }, { "epoch": 0.18256410256410258, "grad_norm": 0.263671875, "learning_rate": 0.0004935873762571223, "loss": 2.8407, "step": 178 }, { "epoch": 0.1835897435897436, "grad_norm": 0.2294921875, "learning_rate": 0.0004934945095859803, "loss": 2.8959, "step": 179 }, { "epoch": 0.18461538461538463, "grad_norm": 0.271484375, "learning_rate": 0.0004934009841626271, "loss": 2.968, "step": 180 }, { "epoch": 0.18564102564102564, "grad_norm": 0.224609375, "learning_rate": 0.0004933068002400872, "loss": 2.8695, "step": 181 }, { "epoch": 0.18666666666666668, "grad_norm": 0.2392578125, "learning_rate": 0.0004932119580731666, "loss": 2.9147, "step": 182 }, { "epoch": 0.18769230769230769, "grad_norm": 0.236328125, "learning_rate": 0.0004931164579184523, "loss": 2.8695, "step": 183 }, { "epoch": 0.18871794871794872, "grad_norm": 0.2353515625, "learning_rate": 0.000493020300034311, "loss": 2.8442, "step": 184 }, { "epoch": 0.18974358974358974, "grad_norm": 0.2490234375, "learning_rate": 
0.0004929234846808893, "loss": 2.8292, "step": 185 }, { "epoch": 0.19076923076923077, "grad_norm": 0.283203125, "learning_rate": 0.0004928260121201123, "loss": 2.8075, "step": 186 }, { "epoch": 0.19179487179487179, "grad_norm": 0.2421875, "learning_rate": 0.0004927278826156831, "loss": 2.8878, "step": 187 }, { "epoch": 0.19282051282051282, "grad_norm": 0.2890625, "learning_rate": 0.0004926290964330821, "loss": 2.8227, "step": 188 }, { "epoch": 0.19384615384615383, "grad_norm": 0.26953125, "learning_rate": 0.0004925296538395664, "loss": 2.8928, "step": 189 }, { "epoch": 0.19487179487179487, "grad_norm": 0.23828125, "learning_rate": 0.0004924295551041688, "loss": 2.8819, "step": 190 }, { "epoch": 0.19589743589743588, "grad_norm": 0.271484375, "learning_rate": 0.0004923288004976972, "loss": 2.716, "step": 191 }, { "epoch": 0.19692307692307692, "grad_norm": 0.236328125, "learning_rate": 0.0004922273902927344, "loss": 2.9129, "step": 192 }, { "epoch": 0.19794871794871796, "grad_norm": 0.287109375, "learning_rate": 0.000492125324763636, "loss": 2.9449, "step": 193 }, { "epoch": 0.19897435897435897, "grad_norm": 0.30859375, "learning_rate": 0.0004920226041865313, "loss": 2.9063, "step": 194 }, { "epoch": 0.2, "grad_norm": 0.26953125, "learning_rate": 0.0004919192288393212, "loss": 2.6706, "step": 195 }, { "epoch": 0.20102564102564102, "grad_norm": 0.2412109375, "learning_rate": 0.0004918151990016786, "loss": 2.8931, "step": 196 }, { "epoch": 0.20205128205128206, "grad_norm": 0.2890625, "learning_rate": 0.0004917105149550466, "loss": 2.7143, "step": 197 }, { "epoch": 0.20307692307692307, "grad_norm": 0.259765625, "learning_rate": 0.0004916051769826383, "loss": 2.7915, "step": 198 }, { "epoch": 0.2041025641025641, "grad_norm": 0.267578125, "learning_rate": 0.000491499185369436, "loss": 2.8123, "step": 199 }, { "epoch": 0.20512820512820512, "grad_norm": 0.25, "learning_rate": 0.0004913925404021904, "loss": 2.8974, "step": 200 }, { "epoch": 0.20512820512820512, "eval_loss": 
null, "eval_runtime": 74.0131, "eval_samples_per_second": 9.282, "eval_steps_per_second": 1.162, "step": 200 }, { "epoch": 0.20615384615384616, "grad_norm": 0.240234375, "learning_rate": 0.0004912852423694198, "loss": 2.8748, "step": 201 }, { "epoch": 0.20717948717948717, "grad_norm": 0.32421875, "learning_rate": 0.0004911772915614091, "loss": 2.7596, "step": 202 }, { "epoch": 0.2082051282051282, "grad_norm": 0.287109375, "learning_rate": 0.0004910686882702095, "loss": 2.9041, "step": 203 }, { "epoch": 0.20923076923076922, "grad_norm": 0.2734375, "learning_rate": 0.0004909594327896373, "loss": 2.7508, "step": 204 }, { "epoch": 0.21025641025641026, "grad_norm": 0.279296875, "learning_rate": 0.0004908495254152732, "loss": 2.9025, "step": 205 }, { "epoch": 0.21128205128205127, "grad_norm": 0.267578125, "learning_rate": 0.0004907389664444615, "loss": 2.82, "step": 206 }, { "epoch": 0.2123076923076923, "grad_norm": 0.240234375, "learning_rate": 0.0004906277561763096, "loss": 2.7497, "step": 207 }, { "epoch": 0.21333333333333335, "grad_norm": 0.298828125, "learning_rate": 0.0004905158949116866, "loss": 3.0071, "step": 208 }, { "epoch": 0.21435897435897436, "grad_norm": 0.255859375, "learning_rate": 0.0004904033829532231, "loss": 2.8361, "step": 209 }, { "epoch": 0.2153846153846154, "grad_norm": 0.2470703125, "learning_rate": 0.0004902902206053098, "loss": 2.899, "step": 210 }, { "epoch": 0.2164102564102564, "grad_norm": 0.25390625, "learning_rate": 0.0004901764081740973, "loss": 2.7779, "step": 211 }, { "epoch": 0.21743589743589745, "grad_norm": 0.228515625, "learning_rate": 0.0004900619459674947, "loss": 2.8584, "step": 212 }, { "epoch": 0.21846153846153846, "grad_norm": 0.28125, "learning_rate": 0.000489946834295169, "loss": 2.8865, "step": 213 }, { "epoch": 0.2194871794871795, "grad_norm": 0.2734375, "learning_rate": 0.0004898310734685444, "loss": 2.7206, "step": 214 }, { "epoch": 0.2205128205128205, "grad_norm": 0.2431640625, "learning_rate": 0.0004897146638008013, 
"loss": 2.7576, "step": 215 }, { "epoch": 0.22153846153846155, "grad_norm": 0.255859375, "learning_rate": 0.0004895976056068752, "loss": 2.9659, "step": 216 }, { "epoch": 0.22256410256410256, "grad_norm": 0.26953125, "learning_rate": 0.0004894798992034567, "loss": 2.7687, "step": 217 }, { "epoch": 0.2235897435897436, "grad_norm": 0.2451171875, "learning_rate": 0.0004893615449089897, "loss": 2.7733, "step": 218 }, { "epoch": 0.2246153846153846, "grad_norm": 0.265625, "learning_rate": 0.0004892425430436708, "loss": 2.7778, "step": 219 }, { "epoch": 0.22564102564102564, "grad_norm": 0.248046875, "learning_rate": 0.0004891228939294488, "loss": 2.9468, "step": 220 }, { "epoch": 0.22666666666666666, "grad_norm": 0.267578125, "learning_rate": 0.0004890025978900235, "loss": 2.8252, "step": 221 }, { "epoch": 0.2276923076923077, "grad_norm": 0.248046875, "learning_rate": 0.0004888816552508448, "loss": 2.8578, "step": 222 }, { "epoch": 0.2287179487179487, "grad_norm": 0.265625, "learning_rate": 0.0004887600663391122, "loss": 2.793, "step": 223 }, { "epoch": 0.22974358974358974, "grad_norm": 0.24609375, "learning_rate": 0.0004886378314837732, "loss": 2.9359, "step": 224 }, { "epoch": 0.23076923076923078, "grad_norm": 0.255859375, "learning_rate": 0.0004885149510155231, "loss": 3.0637, "step": 225 }, { "epoch": 0.2317948717948718, "grad_norm": 0.248046875, "learning_rate": 0.0004883914252668038, "loss": 2.8655, "step": 226 }, { "epoch": 0.23282051282051283, "grad_norm": 0.2470703125, "learning_rate": 0.000488267254571803, "loss": 2.7356, "step": 227 }, { "epoch": 0.23384615384615384, "grad_norm": 0.275390625, "learning_rate": 0.0004881424392664532, "loss": 2.8685, "step": 228 }, { "epoch": 0.23487179487179488, "grad_norm": 0.267578125, "learning_rate": 0.00048801697968843083, "loss": 2.8886, "step": 229 }, { "epoch": 0.2358974358974359, "grad_norm": 0.28125, "learning_rate": 0.00048789087617715545, "loss": 2.8211, "step": 230 }, { "epoch": 0.23692307692307693, "grad_norm": 
0.25390625, "learning_rate": 0.0004877641290737884, "loss": 2.9133, "step": 231 }, { "epoch": 0.23794871794871794, "grad_norm": 0.283203125, "learning_rate": 0.0004876367387212326, "loss": 2.8127, "step": 232 }, { "epoch": 0.23897435897435898, "grad_norm": 0.2451171875, "learning_rate": 0.000487508705464131, "loss": 2.7825, "step": 233 }, { "epoch": 0.24, "grad_norm": 0.2451171875, "learning_rate": 0.0004873800296488659, "loss": 2.7171, "step": 234 }, { "epoch": 0.24102564102564103, "grad_norm": 0.349609375, "learning_rate": 0.000487250711623558, "loss": 2.9763, "step": 235 }, { "epoch": 0.24205128205128204, "grad_norm": 0.302734375, "learning_rate": 0.0004871207517380654, "loss": 3.0364, "step": 236 }, { "epoch": 0.24307692307692308, "grad_norm": 0.271484375, "learning_rate": 0.00048699015034398273, "loss": 2.8263, "step": 237 }, { "epoch": 0.2441025641025641, "grad_norm": 0.234375, "learning_rate": 0.00048685890779464025, "loss": 2.8013, "step": 238 }, { "epoch": 0.24512820512820513, "grad_norm": 0.240234375, "learning_rate": 0.0004867270244451027, "loss": 2.7351, "step": 239 }, { "epoch": 0.24615384615384617, "grad_norm": 0.298828125, "learning_rate": 0.00048659450065216835, "loss": 2.8822, "step": 240 }, { "epoch": 0.24615384615384617, "eval_loss": null, "eval_runtime": 74.0907, "eval_samples_per_second": 9.272, "eval_steps_per_second": 1.161, "step": 240 }, { "epoch": 0.24717948717948718, "grad_norm": 0.2294921875, "learning_rate": 0.0004864613367743684, "loss": 2.8338, "step": 241 }, { "epoch": 0.24820512820512822, "grad_norm": 0.25, "learning_rate": 0.0004863275331719655, "loss": 2.8128, "step": 242 }, { "epoch": 0.24923076923076923, "grad_norm": 0.26171875, "learning_rate": 0.00048619309020695307, "loss": 3.0076, "step": 243 }, { "epoch": 0.25025641025641027, "grad_norm": 0.255859375, "learning_rate": 0.00048605800824305434, "loss": 2.8366, "step": 244 }, { "epoch": 0.2512820512820513, "grad_norm": 0.2373046875, "learning_rate": 0.00048592228764572133, 
"loss": 2.926, "step": 245 }, { "epoch": 0.2523076923076923, "grad_norm": 0.2412109375, "learning_rate": 0.0004857859287821337, "loss": 2.8218, "step": 246 }, { "epoch": 0.25333333333333335, "grad_norm": 0.2412109375, "learning_rate": 0.00048564893202119794, "loss": 2.7926, "step": 247 }, { "epoch": 0.25435897435897437, "grad_norm": 0.2373046875, "learning_rate": 0.0004855112977335462, "loss": 2.8308, "step": 248 }, { "epoch": 0.2553846153846154, "grad_norm": 0.2216796875, "learning_rate": 0.00048537302629153567, "loss": 2.7537, "step": 249 }, { "epoch": 0.2564102564102564, "grad_norm": 0.28515625, "learning_rate": 0.000485234118069247, "loss": 3.0161, "step": 250 }, { "epoch": 0.25743589743589745, "grad_norm": 0.25, "learning_rate": 0.00048509457344248386, "loss": 2.8734, "step": 251 }, { "epoch": 0.25846153846153846, "grad_norm": 0.2412109375, "learning_rate": 0.0004849543927887714, "loss": 2.8234, "step": 252 }, { "epoch": 0.2594871794871795, "grad_norm": 0.263671875, "learning_rate": 0.0004848135764873557, "loss": 2.8665, "step": 253 }, { "epoch": 0.2605128205128205, "grad_norm": 0.279296875, "learning_rate": 0.0004846721249192024, "loss": 2.7087, "step": 254 }, { "epoch": 0.26153846153846155, "grad_norm": 0.2353515625, "learning_rate": 0.0004845300384669958, "loss": 2.8978, "step": 255 }, { "epoch": 0.26256410256410256, "grad_norm": 0.2578125, "learning_rate": 0.00048438731751513785, "loss": 3.1368, "step": 256 }, { "epoch": 0.2635897435897436, "grad_norm": 0.25, "learning_rate": 0.0004842439624497472, "loss": 2.6442, "step": 257 }, { "epoch": 0.26461538461538464, "grad_norm": 0.232421875, "learning_rate": 0.00048409997365865776, "loss": 2.8965, "step": 258 }, { "epoch": 0.26564102564102565, "grad_norm": 0.267578125, "learning_rate": 0.0004839553515314182, "loss": 2.8909, "step": 259 }, { "epoch": 0.26666666666666666, "grad_norm": 0.3125, "learning_rate": 0.0004838100964592904, "loss": 2.7598, "step": 260 }, { "epoch": 0.2676923076923077, "grad_norm": 
0.251953125, "learning_rate": 0.00048366420883524887, "loss": 2.76, "step": 261 }, { "epoch": 0.26871794871794874, "grad_norm": 0.265625, "learning_rate": 0.00048351768905397906, "loss": 2.8662, "step": 262 }, { "epoch": 0.26974358974358975, "grad_norm": 0.28125, "learning_rate": 0.0004833705375118772, "loss": 2.6952, "step": 263 }, { "epoch": 0.27076923076923076, "grad_norm": 0.265625, "learning_rate": 0.00048322275460704813, "loss": 2.78, "step": 264 }, { "epoch": 0.2717948717948718, "grad_norm": 0.259765625, "learning_rate": 0.0004830743407393051, "loss": 2.9689, "step": 265 }, { "epoch": 0.27282051282051284, "grad_norm": 0.2578125, "learning_rate": 0.00048292529631016834, "loss": 2.8028, "step": 266 }, { "epoch": 0.27384615384615385, "grad_norm": 0.251953125, "learning_rate": 0.00048277562172286407, "loss": 2.8645, "step": 267 }, { "epoch": 0.27487179487179486, "grad_norm": 0.27734375, "learning_rate": 0.00048262531738232293, "loss": 2.7137, "step": 268 }, { "epoch": 0.27589743589743587, "grad_norm": 0.23828125, "learning_rate": 0.00048247438369517995, "loss": 2.8599, "step": 269 }, { "epoch": 0.27692307692307694, "grad_norm": 0.263671875, "learning_rate": 0.0004823228210697723, "loss": 2.8781, "step": 270 }, { "epoch": 0.27794871794871795, "grad_norm": 0.28125, "learning_rate": 0.00048217062991613897, "loss": 2.7963, "step": 271 }, { "epoch": 0.27897435897435896, "grad_norm": 0.220703125, "learning_rate": 0.0004820178106460191, "loss": 2.8905, "step": 272 }, { "epoch": 0.28, "grad_norm": 0.240234375, "learning_rate": 0.00048186436367285147, "loss": 2.8217, "step": 273 }, { "epoch": 0.28102564102564104, "grad_norm": 0.2236328125, "learning_rate": 0.00048171028941177275, "loss": 2.7953, "step": 274 }, { "epoch": 0.28205128205128205, "grad_norm": 0.234375, "learning_rate": 0.0004815555882796169, "loss": 3.0109, "step": 275 }, { "epoch": 0.28307692307692306, "grad_norm": 0.234375, "learning_rate": 0.00048140026069491373, "loss": 2.8815, "step": 276 }, { "epoch": 
0.2841025641025641, "grad_norm": 0.294921875, "learning_rate": 0.00048124430707788783, "loss": 3.0948, "step": 277 }, { "epoch": 0.28512820512820514, "grad_norm": 0.2177734375, "learning_rate": 0.00048108772785045753, "loss": 2.8212, "step": 278 }, { "epoch": 0.28615384615384615, "grad_norm": 0.2236328125, "learning_rate": 0.00048093052343623367, "loss": 2.7473, "step": 279 }, { "epoch": 0.28717948717948716, "grad_norm": 0.2177734375, "learning_rate": 0.0004807726942605184, "loss": 2.8679, "step": 280 }, { "epoch": 0.28717948717948716, "eval_loss": NaN, "eval_runtime": 74.1849, "eval_samples_per_second": 9.261, "eval_steps_per_second": 1.159, "step": 280 }, { "epoch": 0.2882051282051282, "grad_norm": 0.2314453125, "learning_rate": 0.0004806142407503043, "loss": 2.8487, "step": 281 }, { "epoch": 0.28923076923076924, "grad_norm": 0.205078125, "learning_rate": 0.00048045516333427283, "loss": 2.7397, "step": 282 }, { "epoch": 0.29025641025641025, "grad_norm": 0.2421875, "learning_rate": 0.0004802954624427934, "loss": 2.8362, "step": 283 }, { "epoch": 0.29128205128205126, "grad_norm": 0.25, "learning_rate": 0.0004801351385079223, "loss": 2.7576, "step": 284 }, { "epoch": 0.2923076923076923, "grad_norm": 0.2421875, "learning_rate": 0.0004799741919634013, "loss": 2.7945, "step": 285 }, { "epoch": 0.29333333333333333, "grad_norm": 0.22265625, "learning_rate": 0.00047981262324465665, "loss": 2.8001, "step": 286 }, { "epoch": 0.29435897435897435, "grad_norm": 0.26171875, "learning_rate": 0.0004796504327887977, "loss": 2.8698, "step": 287 }, { "epoch": 0.2953846153846154, "grad_norm": 0.2578125, "learning_rate": 0.00047948762103461605, "loss": 2.9252, "step": 288 }, { "epoch": 0.2964102564102564, "grad_norm": 0.251953125, "learning_rate": 0.00047932418842258406, "loss": 2.9024, "step": 289 }, { "epoch": 0.29743589743589743, "grad_norm": 0.263671875, "learning_rate": 0.0004791601353948537, "loss": 2.8964, "step": 290 }, { "epoch": 0.29846153846153844, "grad_norm": 0.26953125, 
"learning_rate": 0.0004789954623952556, "loss": 2.7973, "step": 291 }, { "epoch": 0.2994871794871795, "grad_norm": 0.2275390625, "learning_rate": 0.0004788301698692974, "loss": 2.9535, "step": 292 }, { "epoch": 0.3005128205128205, "grad_norm": 0.265625, "learning_rate": 0.00047866425826416316, "loss": 2.7872, "step": 293 }, { "epoch": 0.30153846153846153, "grad_norm": 0.251953125, "learning_rate": 0.0004784977280287114, "loss": 2.8398, "step": 294 }, { "epoch": 0.30256410256410254, "grad_norm": 0.2470703125, "learning_rate": 0.0004783305796134747, "loss": 2.8536, "step": 295 }, { "epoch": 0.3035897435897436, "grad_norm": 0.240234375, "learning_rate": 0.0004781628134706577, "loss": 2.6214, "step": 296 }, { "epoch": 0.3046153846153846, "grad_norm": 0.2578125, "learning_rate": 0.00047799443005413654, "loss": 2.8535, "step": 297 }, { "epoch": 0.30564102564102563, "grad_norm": 0.232421875, "learning_rate": 0.0004778254298194572, "loss": 2.802, "step": 298 }, { "epoch": 0.30666666666666664, "grad_norm": 0.287109375, "learning_rate": 0.0004776558132238344, "loss": 2.8695, "step": 299 }, { "epoch": 0.3076923076923077, "grad_norm": 0.236328125, "learning_rate": 0.0004774855807261503, "loss": 2.8588, "step": 300 }, { "epoch": 0.3087179487179487, "grad_norm": 0.2470703125, "learning_rate": 0.00047731473278695357, "loss": 2.8924, "step": 301 }, { "epoch": 0.30974358974358973, "grad_norm": 0.2451171875, "learning_rate": 0.00047714326986845757, "loss": 2.7775, "step": 302 }, { "epoch": 0.31076923076923074, "grad_norm": 0.23828125, "learning_rate": 0.00047697119243453974, "loss": 2.9689, "step": 303 }, { "epoch": 0.3117948717948718, "grad_norm": 0.24609375, "learning_rate": 0.00047679850095073983, "loss": 2.7367, "step": 304 }, { "epoch": 0.3128205128205128, "grad_norm": 0.220703125, "learning_rate": 0.00047662519588425895, "loss": 2.8245, "step": 305 }, { "epoch": 0.31384615384615383, "grad_norm": 0.265625, "learning_rate": 0.00047645127770395816, "loss": 2.9681, "step": 306 }, 
{ "epoch": 0.3148717948717949, "grad_norm": 0.240234375, "learning_rate": 0.0004762767468803573, "loss": 2.9169, "step": 307 }, { "epoch": 0.3158974358974359, "grad_norm": 0.23828125, "learning_rate": 0.00047610160388563344, "loss": 3.0643, "step": 308 }, { "epoch": 0.3169230769230769, "grad_norm": 0.2421875, "learning_rate": 0.0004759258491936202, "loss": 2.7625, "step": 309 }, { "epoch": 0.31794871794871793, "grad_norm": 0.2470703125, "learning_rate": 0.00047574948327980574, "loss": 2.8405, "step": 310 }, { "epoch": 0.318974358974359, "grad_norm": 0.232421875, "learning_rate": 0.00047557250662133213, "loss": 2.8324, "step": 311 }, { "epoch": 0.32, "grad_norm": 0.234375, "learning_rate": 0.0004753949196969934, "loss": 2.9981, "step": 312 }, { "epoch": 0.321025641025641, "grad_norm": 0.23828125, "learning_rate": 0.00047521672298723495, "loss": 2.7169, "step": 313 }, { "epoch": 0.32205128205128203, "grad_norm": 0.2373046875, "learning_rate": 0.0004750379169741516, "loss": 2.7619, "step": 314 }, { "epoch": 0.3230769230769231, "grad_norm": 0.255859375, "learning_rate": 0.00047485850214148685, "loss": 3.0088, "step": 315 }, { "epoch": 0.3241025641025641, "grad_norm": 0.24609375, "learning_rate": 0.00047467847897463125, "loss": 2.7811, "step": 316 }, { "epoch": 0.3251282051282051, "grad_norm": 0.2333984375, "learning_rate": 0.00047449784796062077, "loss": 2.8794, "step": 317 }, { "epoch": 0.3261538461538461, "grad_norm": 0.2578125, "learning_rate": 0.00047431660958813655, "loss": 2.6472, "step": 318 }, { "epoch": 0.3271794871794872, "grad_norm": 0.29296875, "learning_rate": 0.00047413476434750225, "loss": 2.9633, "step": 319 }, { "epoch": 0.3282051282051282, "grad_norm": 0.267578125, "learning_rate": 0.0004739523127306836, "loss": 2.8764, "step": 320 }, { "epoch": 0.3282051282051282, "eval_loss": NaN, "eval_runtime": 74.1625, "eval_samples_per_second": 9.263, "eval_steps_per_second": 1.16, "step": 320 }, { "epoch": 0.3292307692307692, "grad_norm": 0.228515625, 
"learning_rate": 0.0004737692552312869, "loss": 2.7547, "step": 321 }, { "epoch": 0.3302564102564103, "grad_norm": 0.2265625, "learning_rate": 0.00047358559234455757, "loss": 2.776, "step": 322 }, { "epoch": 0.3312820512820513, "grad_norm": 0.240234375, "learning_rate": 0.0004734013245673787, "loss": 2.6469, "step": 323 }, { "epoch": 0.3323076923076923, "grad_norm": 0.2392578125, "learning_rate": 0.00047321645239827015, "loss": 2.9942, "step": 324 }, { "epoch": 0.3333333333333333, "grad_norm": 0.25390625, "learning_rate": 0.0004730309763373866, "loss": 2.8237, "step": 325 }, { "epoch": 0.3343589743589744, "grad_norm": 0.2216796875, "learning_rate": 0.00047284489688651654, "loss": 2.8756, "step": 326 }, { "epoch": 0.3353846153846154, "grad_norm": 0.2421875, "learning_rate": 0.0004726582145490811, "loss": 2.7479, "step": 327 }, { "epoch": 0.3364102564102564, "grad_norm": 0.251953125, "learning_rate": 0.0004724709298301324, "loss": 2.8532, "step": 328 }, { "epoch": 0.3374358974358974, "grad_norm": 0.255859375, "learning_rate": 0.0004722830432363521, "loss": 2.9542, "step": 329 }, { "epoch": 0.3384615384615385, "grad_norm": 0.318359375, "learning_rate": 0.0004720945552760503, "loss": 2.9399, "step": 330 }, { "epoch": 0.3394871794871795, "grad_norm": 0.20703125, "learning_rate": 0.0004719054664591639, "loss": 2.8353, "step": 331 }, { "epoch": 0.3405128205128205, "grad_norm": 0.28125, "learning_rate": 0.0004717157772972555, "loss": 2.8998, "step": 332 }, { "epoch": 0.3415384615384615, "grad_norm": 0.2294921875, "learning_rate": 0.0004715254883035119, "loss": 2.748, "step": 333 }, { "epoch": 0.3425641025641026, "grad_norm": 0.23046875, "learning_rate": 0.0004713345999927426, "loss": 2.7891, "step": 334 }, { "epoch": 0.3435897435897436, "grad_norm": 0.263671875, "learning_rate": 0.00047114311288137865, "loss": 2.6888, "step": 335 }, { "epoch": 0.3446153846153846, "grad_norm": 0.23828125, "learning_rate": 0.00047095102748747085, "loss": 2.7932, "step": 336 }, { "epoch": 
0.34564102564102567, "grad_norm": 0.2197265625, "learning_rate": 0.00047075834433068874, "loss": 2.8044, "step": 337 }, { "epoch": 0.3466666666666667, "grad_norm": 0.240234375, "learning_rate": 0.0004705650639323191, "loss": 2.9569, "step": 338 }, { "epoch": 0.3476923076923077, "grad_norm": 0.240234375, "learning_rate": 0.00047037118681526435, "loss": 2.9779, "step": 339 }, { "epoch": 0.3487179487179487, "grad_norm": 0.236328125, "learning_rate": 0.00047017671350404143, "loss": 2.6389, "step": 340 }, { "epoch": 0.34974358974358977, "grad_norm": 0.326171875, "learning_rate": 0.0004699816445247802, "loss": 2.9757, "step": 341 }, { "epoch": 0.3507692307692308, "grad_norm": 0.2373046875, "learning_rate": 0.00046978598040522203, "loss": 2.6848, "step": 342 }, { "epoch": 0.3517948717948718, "grad_norm": 0.2294921875, "learning_rate": 0.00046958972167471827, "loss": 2.7872, "step": 343 }, { "epoch": 0.3528205128205128, "grad_norm": 0.255859375, "learning_rate": 0.00046939286886422905, "loss": 2.9455, "step": 344 }, { "epoch": 0.35384615384615387, "grad_norm": 0.2236328125, "learning_rate": 0.00046919542250632175, "loss": 2.7433, "step": 345 }, { "epoch": 0.3548717948717949, "grad_norm": 0.259765625, "learning_rate": 0.0004689973831351695, "loss": 2.9037, "step": 346 }, { "epoch": 0.3558974358974359, "grad_norm": 0.294921875, "learning_rate": 0.0004687987512865497, "loss": 2.764, "step": 347 }, { "epoch": 0.3569230769230769, "grad_norm": 0.2333984375, "learning_rate": 0.00046859952749784287, "loss": 2.7412, "step": 348 }, { "epoch": 0.35794871794871796, "grad_norm": 0.240234375, "learning_rate": 0.0004683997123080307, "loss": 2.7448, "step": 349 }, { "epoch": 0.358974358974359, "grad_norm": 0.259765625, "learning_rate": 0.000468199306257695, "loss": 2.7766, "step": 350 }, { "epoch": 0.36, "grad_norm": 0.248046875, "learning_rate": 0.00046799830988901615, "loss": 2.7336, "step": 351 }, { "epoch": 0.36102564102564105, "grad_norm": 0.2392578125, "learning_rate": 
0.00046779672374577135, "loss": 2.781, "step": 352 }, { "epoch": 0.36205128205128206, "grad_norm": 0.251953125, "learning_rate": 0.00046759454837333377, "loss": 2.8251, "step": 353 }, { "epoch": 0.3630769230769231, "grad_norm": 0.27734375, "learning_rate": 0.0004673917843186703, "loss": 2.8933, "step": 354 }, { "epoch": 0.3641025641025641, "grad_norm": 0.25390625, "learning_rate": 0.0004671884321303407, "loss": 2.7874, "step": 355 }, { "epoch": 0.36512820512820515, "grad_norm": 0.23046875, "learning_rate": 0.0004669844923584957, "loss": 2.8201, "step": 356 }, { "epoch": 0.36615384615384616, "grad_norm": 0.2412109375, "learning_rate": 0.0004667799655548758, "loss": 2.97, "step": 357 }, { "epoch": 0.3671794871794872, "grad_norm": 0.298828125, "learning_rate": 0.0004665748522728097, "loss": 3.0374, "step": 358 }, { "epoch": 0.3682051282051282, "grad_norm": 0.224609375, "learning_rate": 0.0004663691530672126, "loss": 2.9043, "step": 359 }, { "epoch": 0.36923076923076925, "grad_norm": 0.2265625, "learning_rate": 0.0004661628684945851, "loss": 2.7328, "step": 360 }, { "epoch": 0.36923076923076925, "eval_loss": NaN, "eval_runtime": 74.0668, "eval_samples_per_second": 9.275, "eval_steps_per_second": 1.161, "step": 360 }, { "epoch": 0.37025641025641026, "grad_norm": 0.251953125, "learning_rate": 0.00046595599911301123, "loss": 2.8602, "step": 361 }, { "epoch": 0.3712820512820513, "grad_norm": 0.23046875, "learning_rate": 0.00046574854548215715, "loss": 2.7526, "step": 362 }, { "epoch": 0.3723076923076923, "grad_norm": 0.2158203125, "learning_rate": 0.00046554050816326995, "loss": 2.7566, "step": 363 }, { "epoch": 0.37333333333333335, "grad_norm": 0.23046875, "learning_rate": 0.0004653318877191755, "loss": 2.9871, "step": 364 }, { "epoch": 0.37435897435897436, "grad_norm": 0.2294921875, "learning_rate": 0.0004651226847142774, "loss": 2.6961, "step": 365 }, { "epoch": 0.37538461538461537, "grad_norm": 0.224609375, "learning_rate": 0.0004649128997145555, "loss": 2.8553, 
"step": 366 }, { "epoch": 0.3764102564102564, "grad_norm": 0.240234375, "learning_rate": 0.00046470253328756385, "loss": 2.6923, "step": 367 }, { "epoch": 0.37743589743589745, "grad_norm": 0.2490234375, "learning_rate": 0.0004644915860024296, "loss": 2.7656, "step": 368 }, { "epoch": 0.37846153846153846, "grad_norm": 0.23046875, "learning_rate": 0.0004642800584298516, "loss": 2.8269, "step": 369 }, { "epoch": 0.37948717948717947, "grad_norm": 0.265625, "learning_rate": 0.00046406795114209827, "loss": 2.8814, "step": 370 }, { "epoch": 0.38051282051282054, "grad_norm": 0.2314453125, "learning_rate": 0.0004638552647130066, "loss": 2.7761, "step": 371 }, { "epoch": 0.38153846153846155, "grad_norm": 0.2353515625, "learning_rate": 0.0004636419997179803, "loss": 2.8827, "step": 372 }, { "epoch": 0.38256410256410256, "grad_norm": 0.2490234375, "learning_rate": 0.0004634281567339885, "loss": 2.6603, "step": 373 }, { "epoch": 0.38358974358974357, "grad_norm": 0.2294921875, "learning_rate": 0.00046321373633956384, "loss": 2.9216, "step": 374 }, { "epoch": 0.38461538461538464, "grad_norm": 0.23828125, "learning_rate": 0.0004629987391148011, "loss": 2.807, "step": 375 }, { "epoch": 0.38564102564102565, "grad_norm": 0.2431640625, "learning_rate": 0.0004627831656413558, "loss": 2.8055, "step": 376 }, { "epoch": 0.38666666666666666, "grad_norm": 0.248046875, "learning_rate": 0.00046256701650244225, "loss": 2.8926, "step": 377 }, { "epoch": 0.38769230769230767, "grad_norm": 0.212890625, "learning_rate": 0.0004623502922828323, "loss": 2.621, "step": 378 }, { "epoch": 0.38871794871794874, "grad_norm": 0.248046875, "learning_rate": 0.0004621329935688535, "loss": 2.7326, "step": 379 }, { "epoch": 0.38974358974358975, "grad_norm": 0.228515625, "learning_rate": 0.00046191512094838783, "loss": 2.6931, "step": 380 }, { "epoch": 0.39076923076923076, "grad_norm": 0.400390625, "learning_rate": 0.0004616966750108698, "loss": 3.0929, "step": 381 }, { "epoch": 0.39179487179487177, "grad_norm": 
0.255859375, "learning_rate": 0.0004614776563472849, "loss": 2.752, "step": 382 }, { "epoch": 0.39282051282051283, "grad_norm": 0.2275390625, "learning_rate": 0.00046125806555016825, "loss": 2.6774, "step": 383 }, { "epoch": 0.39384615384615385, "grad_norm": 0.275390625, "learning_rate": 0.00046103790321360273, "loss": 2.8531, "step": 384 }, { "epoch": 0.39487179487179486, "grad_norm": 0.240234375, "learning_rate": 0.00046081716993321743, "loss": 2.9365, "step": 385 }, { "epoch": 0.3958974358974359, "grad_norm": 0.28125, "learning_rate": 0.00046059586630618624, "loss": 2.7568, "step": 386 }, { "epoch": 0.39692307692307693, "grad_norm": 0.248046875, "learning_rate": 0.0004603739929312258, "loss": 2.7481, "step": 387 }, { "epoch": 0.39794871794871794, "grad_norm": 0.25390625, "learning_rate": 0.0004601515504085946, "loss": 2.8783, "step": 388 }, { "epoch": 0.39897435897435896, "grad_norm": 0.28125, "learning_rate": 0.0004599285393400903, "loss": 2.8745, "step": 389 }, { "epoch": 0.4, "grad_norm": 0.265625, "learning_rate": 0.00045970496032904905, "loss": 2.8752, "step": 390 }, { "epoch": 0.40102564102564103, "grad_norm": 0.2412109375, "learning_rate": 0.00045948081398034375, "loss": 2.675, "step": 391 }, { "epoch": 0.40205128205128204, "grad_norm": 0.3203125, "learning_rate": 0.0004592561009003817, "loss": 2.7065, "step": 392 }, { "epoch": 0.40307692307692305, "grad_norm": 0.23046875, "learning_rate": 0.0004590308216971037, "loss": 3.0077, "step": 393 }, { "epoch": 0.4041025641025641, "grad_norm": 0.2490234375, "learning_rate": 0.0004588049769799823, "loss": 2.7965, "step": 394 }, { "epoch": 0.40512820512820513, "grad_norm": 0.240234375, "learning_rate": 0.00045857856736001955, "loss": 2.6369, "step": 395 }, { "epoch": 0.40615384615384614, "grad_norm": 0.224609375, "learning_rate": 0.0004583515934497462, "loss": 2.7849, "step": 396 }, { "epoch": 0.40717948717948715, "grad_norm": 0.259765625, "learning_rate": 0.00045812405586321934, "loss": 2.5922, "step": 397 }, { 
"epoch": 0.4082051282051282, "grad_norm": 0.236328125, "learning_rate": 0.00045789595521602135, "loss": 2.7375, "step": 398 }, { "epoch": 0.40923076923076923, "grad_norm": 0.2421875, "learning_rate": 0.00045766729212525767, "loss": 2.722, "step": 399 }, { "epoch": 0.41025641025641024, "grad_norm": 0.23828125, "learning_rate": 0.0004574380672095554, "loss": 2.6559, "step": 400 }, { "epoch": 0.41025641025641024, "eval_loss": NaN, "eval_runtime": 74.0424, "eval_samples_per_second": 9.278, "eval_steps_per_second": 1.161, "step": 400 }, { "epoch": 0.4112820512820513, "grad_norm": 0.271484375, "learning_rate": 0.0004572082810890618, "loss": 2.8694, "step": 401 }, { "epoch": 0.4123076923076923, "grad_norm": 0.2578125, "learning_rate": 0.00045697793438544224, "loss": 2.7694, "step": 402 }, { "epoch": 0.41333333333333333, "grad_norm": 0.2216796875, "learning_rate": 0.0004567470277218786, "loss": 2.7643, "step": 403 }, { "epoch": 0.41435897435897434, "grad_norm": 0.2265625, "learning_rate": 0.0004565155617230681, "loss": 2.8339, "step": 404 }, { "epoch": 0.4153846153846154, "grad_norm": 0.240234375, "learning_rate": 0.0004562835370152205, "loss": 2.6953, "step": 405 }, { "epoch": 0.4164102564102564, "grad_norm": 0.2060546875, "learning_rate": 0.000456050954226058, "loss": 2.7239, "step": 406 }, { "epoch": 0.41743589743589743, "grad_norm": 0.298828125, "learning_rate": 0.0004558178139848117, "loss": 2.9534, "step": 407 }, { "epoch": 0.41846153846153844, "grad_norm": 0.2119140625, "learning_rate": 0.00045558411692222156, "loss": 2.7606, "step": 408 }, { "epoch": 0.4194871794871795, "grad_norm": 0.2451171875, "learning_rate": 0.00045534986367053366, "loss": 2.9403, "step": 409 }, { "epoch": 0.4205128205128205, "grad_norm": 0.251953125, "learning_rate": 0.0004551150548634987, "loss": 2.7, "step": 410 }, { "epoch": 0.42153846153846153, "grad_norm": 0.23046875, "learning_rate": 0.0004548796911363706, "loss": 2.8521, "step": 411 }, { "epoch": 0.42256410256410254, "grad_norm": 
0.267578125, "learning_rate": 0.00045464377312590457, "loss": 3.1744, "step": 412 }, { "epoch": 0.4235897435897436, "grad_norm": 0.2314453125, "learning_rate": 0.0004544073014703551, "loss": 2.8482, "step": 413 }, { "epoch": 0.4246153846153846, "grad_norm": 0.203125, "learning_rate": 0.00045417027680947486, "loss": 2.7571, "step": 414 }, { "epoch": 0.4256410256410256, "grad_norm": 0.2412109375, "learning_rate": 0.00045393269978451234, "loss": 2.6415, "step": 415 }, { "epoch": 0.4266666666666667, "grad_norm": 0.201171875, "learning_rate": 0.00045369457103821073, "loss": 2.597, "step": 416 }, { "epoch": 0.4276923076923077, "grad_norm": 0.2109375, "learning_rate": 0.0004534558912148055, "loss": 2.8068, "step": 417 }, { "epoch": 0.4287179487179487, "grad_norm": 0.2265625, "learning_rate": 0.0004532166609600232, "loss": 2.7493, "step": 418 }, { "epoch": 0.4297435897435897, "grad_norm": 0.2421875, "learning_rate": 0.00045297688092107947, "loss": 2.7929, "step": 419 }, { "epoch": 0.4307692307692308, "grad_norm": 0.2294921875, "learning_rate": 0.00045273655174667745, "loss": 2.7424, "step": 420 }, { "epoch": 0.4317948717948718, "grad_norm": 0.2421875, "learning_rate": 0.0004524956740870057, "loss": 2.8742, "step": 421 }, { "epoch": 0.4328205128205128, "grad_norm": 0.2109375, "learning_rate": 0.0004522542485937369, "loss": 2.7054, "step": 422 }, { "epoch": 0.4338461538461538, "grad_norm": 0.2197265625, "learning_rate": 0.0004520122759200256, "loss": 2.639, "step": 423 }, { "epoch": 0.4348717948717949, "grad_norm": 0.2353515625, "learning_rate": 0.000451769756720507, "loss": 2.9032, "step": 424 }, { "epoch": 0.4358974358974359, "grad_norm": 0.2109375, "learning_rate": 0.0004515266916512945, "loss": 2.7476, "step": 425 }, { "epoch": 0.4369230769230769, "grad_norm": 0.236328125, "learning_rate": 0.00045128308136997865, "loss": 2.6335, "step": 426 }, { "epoch": 0.4379487179487179, "grad_norm": 0.244140625, "learning_rate": 0.0004510389265356247, "loss": 2.7871, "step": 427 }, { 
"epoch": 0.438974358974359, "grad_norm": 0.2275390625, "learning_rate": 0.00045079422780877154, "loss": 2.8556, "step": 428 }, { "epoch": 0.44, "grad_norm": 0.24609375, "learning_rate": 0.0004505489858514292, "loss": 2.7058, "step": 429 }, { "epoch": 0.441025641025641, "grad_norm": 0.2353515625, "learning_rate": 0.00045030320132707734, "loss": 2.8928, "step": 430 }, { "epoch": 0.442051282051282, "grad_norm": 0.22265625, "learning_rate": 0.0004500568749006637, "loss": 2.7562, "step": 431 }, { "epoch": 0.4430769230769231, "grad_norm": 0.2333984375, "learning_rate": 0.000449810007238602, "loss": 2.7791, "step": 432 }, { "epoch": 0.4441025641025641, "grad_norm": 0.2314453125, "learning_rate": 0.0004495625990087701, "loss": 2.7026, "step": 433 }, { "epoch": 0.4451282051282051, "grad_norm": 0.22265625, "learning_rate": 0.0004493146508805085, "loss": 2.5481, "step": 434 }, { "epoch": 0.4461538461538462, "grad_norm": 0.2373046875, "learning_rate": 0.00044906616352461824, "loss": 2.9261, "step": 435 }, { "epoch": 0.4471794871794872, "grad_norm": 0.25, "learning_rate": 0.0004488171376133593, "loss": 2.6702, "step": 436 }, { "epoch": 0.4482051282051282, "grad_norm": 0.2275390625, "learning_rate": 0.0004485675738204485, "loss": 2.8407, "step": 437 }, { "epoch": 0.4492307692307692, "grad_norm": 0.224609375, "learning_rate": 0.00044831747282105797, "loss": 2.7928, "step": 438 }, { "epoch": 0.4502564102564103, "grad_norm": 0.2373046875, "learning_rate": 0.00044806683529181326, "loss": 2.7557, "step": 439 }, { "epoch": 0.4512820512820513, "grad_norm": 0.236328125, "learning_rate": 0.0004478156619107912, "loss": 2.9797, "step": 440 }, { "epoch": 0.4512820512820513, "eval_loss": NaN, "eval_runtime": 73.8798, "eval_samples_per_second": 9.299, "eval_steps_per_second": 1.164, "step": 440 }, { "epoch": 0.4523076923076923, "grad_norm": 0.251953125, "learning_rate": 0.00044756395335751863, "loss": 2.66, "step": 441 }, { "epoch": 0.4533333333333333, "grad_norm": 0.21875, "learning_rate": 
0.00044731171031297016, "loss": 2.7394, "step": 442 }, { "epoch": 0.4543589743589744, "grad_norm": 0.2421875, "learning_rate": 0.00044705893345956617, "loss": 2.8122, "step": 443 }, { "epoch": 0.4553846153846154, "grad_norm": 0.220703125, "learning_rate": 0.00044680562348117146, "loss": 2.5933, "step": 444 }, { "epoch": 0.4564102564102564, "grad_norm": 0.2353515625, "learning_rate": 0.0004465517810630932, "loss": 2.8981, "step": 445 }, { "epoch": 0.4574358974358974, "grad_norm": 0.224609375, "learning_rate": 0.0004462974068920789, "loss": 2.7026, "step": 446 }, { "epoch": 0.4584615384615385, "grad_norm": 0.2431640625, "learning_rate": 0.00044604250165631464, "loss": 2.7415, "step": 447 }, { "epoch": 0.4594871794871795, "grad_norm": 0.2333984375, "learning_rate": 0.0004457870660454234, "loss": 2.4892, "step": 448 }, { "epoch": 0.4605128205128205, "grad_norm": 0.21875, "learning_rate": 0.00044553110075046286, "loss": 2.6929, "step": 449 }, { "epoch": 0.46153846153846156, "grad_norm": 0.216796875, "learning_rate": 0.00044527460646392385, "loss": 2.7663, "step": 450 }, { "epoch": 0.4625641025641026, "grad_norm": 0.248046875, "learning_rate": 0.00044501758387972834, "loss": 2.6729, "step": 451 }, { "epoch": 0.4635897435897436, "grad_norm": 0.2353515625, "learning_rate": 0.0004447600336932274, "loss": 2.5403, "step": 452 }, { "epoch": 0.4646153846153846, "grad_norm": 0.2255859375, "learning_rate": 0.00044450195660119965, "loss": 2.6544, "step": 453 }, { "epoch": 0.46564102564102566, "grad_norm": 0.2890625, "learning_rate": 0.0004442433533018492, "loss": 2.819, "step": 454 }, { "epoch": 0.4666666666666667, "grad_norm": 0.2333984375, "learning_rate": 0.0004439842244948036, "loss": 2.827, "step": 455 }, { "epoch": 0.4676923076923077, "grad_norm": 0.294921875, "learning_rate": 0.0004437245708811124, "loss": 2.7845, "step": 456 }, { "epoch": 0.4687179487179487, "grad_norm": 0.2197265625, "learning_rate": 0.0004434643931632446, "loss": 2.8144, "step": 457 }, { "epoch": 
0.46974358974358976, "grad_norm": 0.22265625, "learning_rate": 0.0004432036920450875, "loss": 2.6029, "step": 458 }, { "epoch": 0.4707692307692308, "grad_norm": 0.2412109375, "learning_rate": 0.000442942468231944, "loss": 2.8237, "step": 459 }, { "epoch": 0.4717948717948718, "grad_norm": 0.2216796875, "learning_rate": 0.00044268072243053156, "loss": 2.7062, "step": 460 }, { "epoch": 0.4728205128205128, "grad_norm": 0.2197265625, "learning_rate": 0.0004424184553489795, "loss": 2.9272, "step": 461 }, { "epoch": 0.47384615384615386, "grad_norm": 0.2255859375, "learning_rate": 0.0004421556676968274, "loss": 2.6386, "step": 462 }, { "epoch": 0.4748717948717949, "grad_norm": 0.2373046875, "learning_rate": 0.00044189236018502356, "loss": 2.6118, "step": 463 }, { "epoch": 0.4758974358974359, "grad_norm": 0.2001953125, "learning_rate": 0.0004416285335259222, "loss": 2.6128, "step": 464 }, { "epoch": 0.47692307692307695, "grad_norm": 0.22265625, "learning_rate": 0.00044136418843328246, "loss": 2.8181, "step": 465 }, { "epoch": 0.47794871794871796, "grad_norm": 0.2138671875, "learning_rate": 0.00044109932562226594, "loss": 2.7798, "step": 466 }, { "epoch": 0.47897435897435897, "grad_norm": 0.2275390625, "learning_rate": 0.0004408339458094347, "loss": 2.6435, "step": 467 }, { "epoch": 0.48, "grad_norm": 0.1982421875, "learning_rate": 0.0004405680497127498, "loss": 2.7732, "step": 468 }, { "epoch": 0.48102564102564105, "grad_norm": 0.216796875, "learning_rate": 0.00044030163805156876, "loss": 2.598, "step": 469 }, { "epoch": 0.48205128205128206, "grad_norm": 0.2236328125, "learning_rate": 0.0004400347115466442, "loss": 2.7014, "step": 470 }, { "epoch": 0.48307692307692307, "grad_norm": 0.2265625, "learning_rate": 0.00043976727092012133, "loss": 2.8318, "step": 471 }, { "epoch": 0.4841025641025641, "grad_norm": 0.2333984375, "learning_rate": 0.00043949931689553644, "loss": 2.5939, "step": 472 }, { "epoch": 0.48512820512820515, "grad_norm": 0.220703125, "learning_rate": 
0.0004392308501978148, "loss": 2.7532, "step": 473 }, { "epoch": 0.48615384615384616, "grad_norm": 0.240234375, "learning_rate": 0.0004389618715532685, "loss": 2.9353, "step": 474 }, { "epoch": 0.48717948717948717, "grad_norm": 0.287109375, "learning_rate": 0.0004386923816895948, "loss": 2.9722, "step": 475 }, { "epoch": 0.4882051282051282, "grad_norm": 0.2431640625, "learning_rate": 0.000438422381335874, "loss": 2.6788, "step": 476 }, { "epoch": 0.48923076923076925, "grad_norm": 0.2216796875, "learning_rate": 0.00043815187122256746, "loss": 2.6543, "step": 477 }, { "epoch": 0.49025641025641026, "grad_norm": 0.21484375, "learning_rate": 0.0004378808520815156, "loss": 2.8093, "step": 478 }, { "epoch": 0.49128205128205127, "grad_norm": 0.208984375, "learning_rate": 0.000437609324645936, "loss": 2.8028, "step": 479 }, { "epoch": 0.49230769230769234, "grad_norm": 0.216796875, "learning_rate": 0.0004373372896504215, "loss": 2.6416, "step": 480 }, { "epoch": 0.49230769230769234, "eval_loss": NaN, "eval_runtime": 73.81, "eval_samples_per_second": 9.308, "eval_steps_per_second": 1.165, "step": 480 }, { "epoch": 0.49333333333333335, "grad_norm": 0.2294921875, "learning_rate": 0.0004370647478309379, "loss": 2.6881, "step": 481 }, { "epoch": 0.49435897435897436, "grad_norm": 0.2333984375, "learning_rate": 0.0004367916999248222, "loss": 2.7655, "step": 482 }, { "epoch": 0.49538461538461537, "grad_norm": 0.2119140625, "learning_rate": 0.0004365181466707808, "loss": 2.8272, "step": 483 }, { "epoch": 0.49641025641025643, "grad_norm": 0.22265625, "learning_rate": 0.000436244088808887, "loss": 2.6559, "step": 484 }, { "epoch": 0.49743589743589745, "grad_norm": 0.21484375, "learning_rate": 0.00043596952708057946, "loss": 2.5397, "step": 485 }, { "epoch": 0.49846153846153846, "grad_norm": 0.2080078125, "learning_rate": 0.00043569446222865984, "loss": 2.6262, "step": 486 }, { "epoch": 0.49948717948717947, "grad_norm": 0.2119140625, "learning_rate": 0.00043541889499729104, "loss": 
2.7953, "step": 487 }, { "epoch": 0.5005128205128205, "grad_norm": 0.208984375, "learning_rate": 0.0004351428261319951, "loss": 2.6342, "step": 488 }, { "epoch": 0.5015384615384615, "grad_norm": 0.205078125, "learning_rate": 0.00043486625637965124, "loss": 2.7282, "step": 489 }, { "epoch": 0.5025641025641026, "grad_norm": 0.2138671875, "learning_rate": 0.0004345891864884937, "loss": 2.8391, "step": 490 }, { "epoch": 0.5035897435897436, "grad_norm": 0.2294921875, "learning_rate": 0.00043431161720810976, "loss": 2.7957, "step": 491 }, { "epoch": 0.5046153846153846, "grad_norm": 0.2109375, "learning_rate": 0.00043403354928943784, "loss": 2.7152, "step": 492 }, { "epoch": 0.5056410256410256, "grad_norm": 0.24609375, "learning_rate": 0.0004337549834847655, "loss": 2.7956, "step": 493 }, { "epoch": 0.5066666666666667, "grad_norm": 0.2119140625, "learning_rate": 0.000433475920547727, "loss": 2.4966, "step": 494 }, { "epoch": 0.5076923076923077, "grad_norm": 0.234375, "learning_rate": 0.0004331963612333017, "loss": 2.5738, "step": 495 }, { "epoch": 0.5087179487179487, "grad_norm": 0.22265625, "learning_rate": 0.00043291630629781186, "loss": 2.8515, "step": 496 }, { "epoch": 0.5097435897435898, "grad_norm": 0.2177734375, "learning_rate": 0.00043263575649892075, "loss": 2.7672, "step": 497 }, { "epoch": 0.5107692307692308, "grad_norm": 0.236328125, "learning_rate": 0.00043235471259563005, "loss": 2.9032, "step": 498 }, { "epoch": 0.5117948717948718, "grad_norm": 0.2373046875, "learning_rate": 0.0004320731753482785, "loss": 2.6883, "step": 499 }, { "epoch": 0.5128205128205128, "grad_norm": 0.2021484375, "learning_rate": 0.0004317911455185396, "loss": 2.6788, "step": 500 }, { "epoch": 0.5138461538461538, "grad_norm": 0.224609375, "learning_rate": 0.0004315086238694192, "loss": 2.6786, "step": 501 }, { "epoch": 0.5148717948717949, "grad_norm": 0.2373046875, "learning_rate": 0.00043122561116525407, "loss": 2.7459, "step": 502 }, { "epoch": 0.5158974358974359, "grad_norm": 
0.2314453125, "learning_rate": 0.0004309421081717091, "loss": 2.7177, "step": 503 }, { "epoch": 0.5169230769230769, "grad_norm": 0.2353515625, "learning_rate": 0.00043065811565577597, "loss": 2.7544, "step": 504 }, { "epoch": 0.517948717948718, "grad_norm": 0.2294921875, "learning_rate": 0.0004303736343857704, "loss": 2.6769, "step": 505 }, { "epoch": 0.518974358974359, "grad_norm": 0.26171875, "learning_rate": 0.0004300886651313306, "loss": 2.7597, "step": 506 }, { "epoch": 0.52, "grad_norm": 0.208984375, "learning_rate": 0.000429803208663415, "loss": 2.5511, "step": 507 }, { "epoch": 0.521025641025641, "grad_norm": 0.220703125, "learning_rate": 0.0004295172657543, "loss": 2.694, "step": 508 }, { "epoch": 0.522051282051282, "grad_norm": 0.20703125, "learning_rate": 0.0004292308371775781, "loss": 2.7004, "step": 509 }, { "epoch": 0.5230769230769231, "grad_norm": 0.234375, "learning_rate": 0.0004289439237081557, "loss": 2.6974, "step": 510 }, { "epoch": 0.5241025641025641, "grad_norm": 0.2265625, "learning_rate": 0.00042865652612225115, "loss": 2.7225, "step": 511 }, { "epoch": 0.5251282051282051, "grad_norm": 0.259765625, "learning_rate": 0.00042836864519739225, "loss": 2.8044, "step": 512 }, { "epoch": 0.5261538461538462, "grad_norm": 0.21484375, "learning_rate": 0.00042808028171241485, "loss": 2.6341, "step": 513 }, { "epoch": 0.5271794871794871, "grad_norm": 0.2041015625, "learning_rate": 0.0004277914364474599, "loss": 2.8343, "step": 514 }, { "epoch": 0.5282051282051282, "grad_norm": 0.2197265625, "learning_rate": 0.000427502110183972, "loss": 2.6009, "step": 515 }, { "epoch": 0.5292307692307693, "grad_norm": 0.291015625, "learning_rate": 0.00042721230370469697, "loss": 2.7838, "step": 516 }, { "epoch": 0.5302564102564102, "grad_norm": 0.2197265625, "learning_rate": 0.0004269220177936799, "loss": 2.8187, "step": 517 }, { "epoch": 0.5312820512820513, "grad_norm": 0.2177734375, "learning_rate": 0.0004266312532362628, "loss": 2.7781, "step": 518 }, { "epoch": 
0.5323076923076923, "grad_norm": 0.21484375, "learning_rate": 0.0004263400108190828, "loss": 2.6663, "step": 519 }, { "epoch": 0.5333333333333333, "grad_norm": 0.2265625, "learning_rate": 0.00042604829133006966, "loss": 2.5732, "step": 520 }, { "epoch": 0.5333333333333333, "eval_loss": null, "eval_runtime": 73.9353, "eval_samples_per_second": 9.292, "eval_steps_per_second": 1.163, "step": 520 }, { "epoch": 0.5343589743589744, "grad_norm": 0.2197265625, "learning_rate": 0.00042575609555844387, "loss": 2.5354, "step": 521 }, { "epoch": 0.5353846153846153, "grad_norm": 0.265625, "learning_rate": 0.0004254634242947144, "loss": 2.9681, "step": 522 }, { "epoch": 0.5364102564102564, "grad_norm": 0.22265625, "learning_rate": 0.00042517027833067686, "loss": 2.741, "step": 523 }, { "epoch": 0.5374358974358975, "grad_norm": 0.326171875, "learning_rate": 0.00042487665845941083, "loss": 2.942, "step": 524 }, { "epoch": 0.5384615384615384, "grad_norm": 0.2451171875, "learning_rate": 0.00042458256547527805, "loss": 2.7034, "step": 525 }, { "epoch": 0.5394871794871795, "grad_norm": 0.2236328125, "learning_rate": 0.00042428800017392044, "loss": 2.7113, "step": 526 }, { "epoch": 0.5405128205128205, "grad_norm": 0.24609375, "learning_rate": 0.00042399296335225755, "loss": 2.8319, "step": 527 }, { "epoch": 0.5415384615384615, "grad_norm": 0.2451171875, "learning_rate": 0.0004236974558084846, "loss": 2.6368, "step": 528 }, { "epoch": 0.5425641025641026, "grad_norm": 0.244140625, "learning_rate": 0.00042340147834207033, "loss": 2.5424, "step": 529 }, { "epoch": 0.5435897435897435, "grad_norm": 0.265625, "learning_rate": 0.0004231050317537548, "loss": 2.6148, "step": 530 }, { "epoch": 0.5446153846153846, "grad_norm": 0.255859375, "learning_rate": 0.00042280811684554723, "loss": 2.7953, "step": 531 }, { "epoch": 0.5456410256410257, "grad_norm": 0.2294921875, "learning_rate": 0.00042251073442072385, "loss": 2.6117, "step": 532 }, { "epoch": 0.5466666666666666, "grad_norm": 0.2421875, 
"learning_rate": 0.00042221288528382584, "loss": 2.698, "step": 533 }, { "epoch": 0.5476923076923077, "grad_norm": 0.2255859375, "learning_rate": 0.00042191457024065674, "loss": 2.799, "step": 534 }, { "epoch": 0.5487179487179488, "grad_norm": 0.2294921875, "learning_rate": 0.00042161579009828077, "loss": 2.6709, "step": 535 }, { "epoch": 0.5497435897435897, "grad_norm": 0.212890625, "learning_rate": 0.0004213165456650204, "loss": 2.6391, "step": 536 }, { "epoch": 0.5507692307692308, "grad_norm": 0.2431640625, "learning_rate": 0.0004210168377504542, "loss": 2.6646, "step": 537 }, { "epoch": 0.5517948717948717, "grad_norm": 0.2236328125, "learning_rate": 0.00042071666716541467, "loss": 2.7299, "step": 538 }, { "epoch": 0.5528205128205128, "grad_norm": 0.208984375, "learning_rate": 0.00042041603472198577, "loss": 2.7099, "step": 539 }, { "epoch": 0.5538461538461539, "grad_norm": 0.2109375, "learning_rate": 0.0004201149412335015, "loss": 2.6482, "step": 540 }, { "epoch": 0.5548717948717948, "grad_norm": 0.1982421875, "learning_rate": 0.0004198133875145426, "loss": 2.6779, "step": 541 }, { "epoch": 0.5558974358974359, "grad_norm": 0.2216796875, "learning_rate": 0.00041951137438093523, "loss": 2.7735, "step": 542 }, { "epoch": 0.556923076923077, "grad_norm": 0.2080078125, "learning_rate": 0.0004192089026497484, "loss": 2.607, "step": 543 }, { "epoch": 0.5579487179487179, "grad_norm": 0.19921875, "learning_rate": 0.00041890597313929204, "loss": 2.6168, "step": 544 }, { "epoch": 0.558974358974359, "grad_norm": 0.2001953125, "learning_rate": 0.00041860258666911413, "loss": 2.5972, "step": 545 }, { "epoch": 0.56, "grad_norm": 0.20703125, "learning_rate": 0.0004182987440599991, "loss": 2.6105, "step": 546 }, { "epoch": 0.561025641025641, "grad_norm": 0.2177734375, "learning_rate": 0.00041799444613396553, "loss": 2.699, "step": 547 }, { "epoch": 0.5620512820512821, "grad_norm": 0.228515625, "learning_rate": 0.00041768969371426377, "loss": 2.6869, "step": 548 }, { "epoch": 
0.563076923076923, "grad_norm": 0.2216796875, "learning_rate": 0.00041738448762537365, "loss": 2.5407, "step": 549 }, { "epoch": 0.5641025641025641, "grad_norm": 0.236328125, "learning_rate": 0.0004170788286930024, "loss": 2.7302, "step": 550 }, { "epoch": 0.5651282051282052, "grad_norm": 0.2119140625, "learning_rate": 0.0004167727177440825, "loss": 2.557, "step": 551 }, { "epoch": 0.5661538461538461, "grad_norm": 0.2041015625, "learning_rate": 0.00041646615560676927, "loss": 2.5531, "step": 552 }, { "epoch": 0.5671794871794872, "grad_norm": 0.2021484375, "learning_rate": 0.0004161591431104385, "loss": 2.5916, "step": 553 }, { "epoch": 0.5682051282051283, "grad_norm": 0.2099609375, "learning_rate": 0.00041585168108568466, "loss": 2.5991, "step": 554 }, { "epoch": 0.5692307692307692, "grad_norm": 0.197265625, "learning_rate": 0.00041554377036431817, "loss": 2.6071, "step": 555 }, { "epoch": 0.5702564102564103, "grad_norm": 0.310546875, "learning_rate": 0.0004152354117793634, "loss": 2.9511, "step": 556 }, { "epoch": 0.5712820512820512, "grad_norm": 0.220703125, "learning_rate": 0.0004149266061650565, "loss": 2.5978, "step": 557 }, { "epoch": 0.5723076923076923, "grad_norm": 0.2138671875, "learning_rate": 0.000414617354356843, "loss": 2.7596, "step": 558 }, { "epoch": 0.5733333333333334, "grad_norm": 0.2275390625, "learning_rate": 0.0004143076571913752, "loss": 2.5049, "step": 559 }, { "epoch": 0.5743589743589743, "grad_norm": 0.298828125, "learning_rate": 0.0004139975155065109, "loss": 2.5785, "step": 560 }, { "epoch": 0.5743589743589743, "eval_loss": null, "eval_runtime": 73.9002, "eval_samples_per_second": 9.296, "eval_steps_per_second": 1.164, "step": 560 }, { "epoch": 0.5753846153846154, "grad_norm": 0.228515625, "learning_rate": 0.00041368693014130986, "loss": 2.6694, "step": 561 }, { "epoch": 0.5764102564102564, "grad_norm": 0.23046875, "learning_rate": 0.0004133759019360328, "loss": 2.5953, "step": 562 }, { "epoch": 0.5774358974358974, "grad_norm": 
0.220703125, "learning_rate": 0.0004130644317321379, "loss": 2.5617, "step": 563 }, { "epoch": 0.5784615384615385, "grad_norm": 0.216796875, "learning_rate": 0.0004127525203722796, "loss": 2.611, "step": 564 }, { "epoch": 0.5794871794871795, "grad_norm": 0.322265625, "learning_rate": 0.00041244016870030565, "loss": 2.7919, "step": 565 }, { "epoch": 0.5805128205128205, "grad_norm": 0.2099609375, "learning_rate": 0.00041212737756125497, "loss": 2.7089, "step": 566 }, { "epoch": 0.5815384615384616, "grad_norm": 0.208984375, "learning_rate": 0.0004118141478013555, "loss": 2.5439, "step": 567 }, { "epoch": 0.5825641025641025, "grad_norm": 0.19921875, "learning_rate": 0.00041150048026802194, "loss": 2.7129, "step": 568 }, { "epoch": 0.5835897435897436, "grad_norm": 0.21875, "learning_rate": 0.0004111863758098531, "loss": 2.7272, "step": 569 }, { "epoch": 0.5846153846153846, "grad_norm": 0.2041015625, "learning_rate": 0.00041087183527662997, "loss": 2.5177, "step": 570 }, { "epoch": 0.5856410256410256, "grad_norm": 0.228515625, "learning_rate": 0.0004105568595193133, "loss": 2.5399, "step": 571 }, { "epoch": 0.5866666666666667, "grad_norm": 0.248046875, "learning_rate": 0.0004102414493900415, "loss": 2.5678, "step": 572 }, { "epoch": 0.5876923076923077, "grad_norm": 0.21484375, "learning_rate": 0.00040992560574212763, "loss": 2.6003, "step": 573 }, { "epoch": 0.5887179487179487, "grad_norm": 0.2275390625, "learning_rate": 0.0004096093294300581, "loss": 2.7195, "step": 574 }, { "epoch": 0.5897435897435898, "grad_norm": 0.2216796875, "learning_rate": 0.00040929262130948965, "loss": 2.7055, "step": 575 }, { "epoch": 0.5907692307692308, "grad_norm": 0.2333984375, "learning_rate": 0.00040897548223724716, "loss": 2.8603, "step": 576 }, { "epoch": 0.5917948717948718, "grad_norm": 0.2255859375, "learning_rate": 0.0004086579130713215, "loss": 2.6861, "step": 577 }, { "epoch": 0.5928205128205128, "grad_norm": 0.1953125, "learning_rate": 0.0004083399146708673, "loss": 2.8076, 
"step": 578 }, { "epoch": 0.5938461538461538, "grad_norm": 0.240234375, "learning_rate": 0.00040802148789619995, "loss": 2.6454, "step": 579 }, { "epoch": 0.5948717948717949, "grad_norm": 0.2216796875, "learning_rate": 0.00040770263360879436, "loss": 2.6901, "step": 580 }, { "epoch": 0.5958974358974359, "grad_norm": 0.2197265625, "learning_rate": 0.0004073833526712816, "loss": 2.5395, "step": 581 }, { "epoch": 0.5969230769230769, "grad_norm": 0.236328125, "learning_rate": 0.00040706364594744727, "loss": 2.8199, "step": 582 }, { "epoch": 0.597948717948718, "grad_norm": 0.220703125, "learning_rate": 0.0004067435143022286, "loss": 2.6179, "step": 583 }, { "epoch": 0.598974358974359, "grad_norm": 0.220703125, "learning_rate": 0.00040642295860171274, "loss": 2.8625, "step": 584 }, { "epoch": 0.6, "grad_norm": 0.21875, "learning_rate": 0.0004061019797131339, "loss": 2.6588, "step": 585 }, { "epoch": 0.601025641025641, "grad_norm": 0.2158203125, "learning_rate": 0.0004057805785048712, "loss": 2.753, "step": 586 }, { "epoch": 0.602051282051282, "grad_norm": 0.220703125, "learning_rate": 0.00040545875584644634, "loss": 2.6371, "step": 587 }, { "epoch": 0.6030769230769231, "grad_norm": 0.2236328125, "learning_rate": 0.00040513651260852123, "loss": 2.5686, "step": 588 }, { "epoch": 0.6041025641025641, "grad_norm": 0.2060546875, "learning_rate": 0.00040481384966289545, "loss": 2.6215, "step": 589 }, { "epoch": 0.6051282051282051, "grad_norm": 0.2294921875, "learning_rate": 0.00040449076788250446, "loss": 2.7353, "step": 590 }, { "epoch": 0.6061538461538462, "grad_norm": 0.2119140625, "learning_rate": 0.00040416726814141633, "loss": 2.6479, "step": 591 }, { "epoch": 0.6071794871794872, "grad_norm": 0.2021484375, "learning_rate": 0.0004038433513148303, "loss": 2.5643, "step": 592 }, { "epoch": 0.6082051282051282, "grad_norm": 0.2392578125, "learning_rate": 0.00040351901827907375, "loss": 2.6139, "step": 593 }, { "epoch": 0.6092307692307692, "grad_norm": 0.205078125, 
"learning_rate": 0.0004031942699116001, "loss": 2.7103, "step": 594 }, { "epoch": 0.6102564102564103, "grad_norm": 0.2099609375, "learning_rate": 0.00040286910709098667, "loss": 2.5725, "step": 595 }, { "epoch": 0.6112820512820513, "grad_norm": 0.19140625, "learning_rate": 0.0004025435306969317, "loss": 2.6258, "step": 596 }, { "epoch": 0.6123076923076923, "grad_norm": 0.1982421875, "learning_rate": 0.0004022175416102525, "loss": 2.6917, "step": 597 }, { "epoch": 0.6133333333333333, "grad_norm": 0.2099609375, "learning_rate": 0.0004018911407128828, "loss": 2.6364, "step": 598 }, { "epoch": 0.6143589743589744, "grad_norm": 0.2080078125, "learning_rate": 0.0004015643288878705, "loss": 2.4383, "step": 599 }, { "epoch": 0.6153846153846154, "grad_norm": 0.2041015625, "learning_rate": 0.0004012371070193753, "loss": 2.6892, "step": 600 }, { "epoch": 0.6153846153846154, "eval_loss": null, "eval_runtime": 73.9154, "eval_samples_per_second": 9.294, "eval_steps_per_second": 1.163, "step": 600 }, { "epoch": 0.6164102564102564, "grad_norm": 0.197265625, "learning_rate": 0.00040090947599266604, "loss": 2.6745, "step": 601 }, { "epoch": 0.6174358974358974, "grad_norm": 0.20703125, "learning_rate": 0.00040058143669411864, "loss": 2.6251, "step": 602 }, { "epoch": 0.6184615384615385, "grad_norm": 0.212890625, "learning_rate": 0.0004002529900112136, "loss": 2.6597, "step": 603 }, { "epoch": 0.6194871794871795, "grad_norm": 0.201171875, "learning_rate": 0.00039992413683253344, "loss": 2.7406, "step": 604 }, { "epoch": 0.6205128205128205, "grad_norm": 0.2421875, "learning_rate": 0.0003995948780477605, "loss": 2.6663, "step": 605 }, { "epoch": 0.6215384615384615, "grad_norm": 0.2001953125, "learning_rate": 0.0003992652145476744, "loss": 2.628, "step": 606 }, { "epoch": 0.6225641025641026, "grad_norm": 0.2177734375, "learning_rate": 0.0003989351472241497, "loss": 2.5465, "step": 607 }, { "epoch": 0.6235897435897436, "grad_norm": 0.2158203125, "learning_rate": 0.0003986046769701535, 
"loss": 2.774, "step": 608 }, { "epoch": 0.6246153846153846, "grad_norm": 0.28515625, "learning_rate": 0.00039827380467974296, "loss": 2.5741, "step": 609 }, { "epoch": 0.6256410256410256, "grad_norm": 0.2421875, "learning_rate": 0.00039794253124806293, "loss": 2.7067, "step": 610 }, { "epoch": 0.6266666666666667, "grad_norm": 0.2021484375, "learning_rate": 0.0003976108575713435, "loss": 2.5942, "step": 611 }, { "epoch": 0.6276923076923077, "grad_norm": 0.2236328125, "learning_rate": 0.0003972787845468975, "loss": 2.6547, "step": 612 }, { "epoch": 0.6287179487179487, "grad_norm": 0.2255859375, "learning_rate": 0.0003969463130731183, "loss": 2.6043, "step": 613 }, { "epoch": 0.6297435897435898, "grad_norm": 0.23828125, "learning_rate": 0.0003966134440494772, "loss": 2.6785, "step": 614 }, { "epoch": 0.6307692307692307, "grad_norm": 0.220703125, "learning_rate": 0.00039628017837652096, "loss": 2.7543, "step": 615 }, { "epoch": 0.6317948717948718, "grad_norm": 0.2255859375, "learning_rate": 0.0003959465169558696, "loss": 2.7286, "step": 616 }, { "epoch": 0.6328205128205128, "grad_norm": 0.2490234375, "learning_rate": 0.0003956124606902136, "loss": 2.6903, "step": 617 }, { "epoch": 0.6338461538461538, "grad_norm": 0.236328125, "learning_rate": 0.0003952780104833118, "loss": 2.6149, "step": 618 }, { "epoch": 0.6348717948717949, "grad_norm": 0.224609375, "learning_rate": 0.0003949431672399887, "loss": 2.6791, "step": 619 }, { "epoch": 0.6358974358974359, "grad_norm": 0.2353515625, "learning_rate": 0.0003946079318661323, "loss": 2.7759, "step": 620 }, { "epoch": 0.6369230769230769, "grad_norm": 0.263671875, "learning_rate": 0.00039427230526869117, "loss": 2.5691, "step": 621 }, { "epoch": 0.637948717948718, "grad_norm": 0.22265625, "learning_rate": 0.00039393628835567273, "loss": 2.689, "step": 622 }, { "epoch": 0.638974358974359, "grad_norm": 0.2353515625, "learning_rate": 0.00039359988203613994, "loss": 2.5729, "step": 623 }, { "epoch": 0.64, "grad_norm": 0.220703125, 
"learning_rate": 0.0003932630872202096, "loss": 2.726, "step": 624 }, { "epoch": 0.6410256410256411, "grad_norm": 0.2177734375, "learning_rate": 0.00039292590481904924, "loss": 2.5309, "step": 625 }, { "epoch": 0.642051282051282, "grad_norm": 0.212890625, "learning_rate": 0.0003925883357448752, "loss": 2.7082, "step": 626 }, { "epoch": 0.6430769230769231, "grad_norm": 0.2255859375, "learning_rate": 0.0003922503809109499, "loss": 2.6187, "step": 627 }, { "epoch": 0.6441025641025641, "grad_norm": 0.224609375, "learning_rate": 0.0003919120412315792, "loss": 2.4554, "step": 628 }, { "epoch": 0.6451282051282051, "grad_norm": 0.2333984375, "learning_rate": 0.00039157331762211046, "loss": 2.6213, "step": 629 }, { "epoch": 0.6461538461538462, "grad_norm": 0.2177734375, "learning_rate": 0.00039123421099892955, "loss": 2.5569, "step": 630 }, { "epoch": 0.6471794871794871, "grad_norm": 0.2353515625, "learning_rate": 0.00039089472227945833, "loss": 2.5895, "step": 631 }, { "epoch": 0.6482051282051282, "grad_norm": 0.26171875, "learning_rate": 0.00039055485238215275, "loss": 2.7407, "step": 632 }, { "epoch": 0.6492307692307693, "grad_norm": 0.2099609375, "learning_rate": 0.00039021460222649984, "loss": 2.634, "step": 633 }, { "epoch": 0.6502564102564102, "grad_norm": 0.259765625, "learning_rate": 0.0003898739727330155, "loss": 2.5963, "step": 634 }, { "epoch": 0.6512820512820513, "grad_norm": 0.251953125, "learning_rate": 0.0003895329648232416, "loss": 2.5544, "step": 635 }, { "epoch": 0.6523076923076923, "grad_norm": 0.25, "learning_rate": 0.00038919157941974406, "loss": 2.5499, "step": 636 }, { "epoch": 0.6533333333333333, "grad_norm": 0.2265625, "learning_rate": 0.0003888498174461099, "loss": 2.6823, "step": 637 }, { "epoch": 0.6543589743589744, "grad_norm": 0.2255859375, "learning_rate": 0.00038850767982694524, "loss": 2.4382, "step": 638 }, { "epoch": 0.6553846153846153, "grad_norm": 0.197265625, "learning_rate": 0.0003881651674878719, "loss": 2.43, "step": 639 }, { 
"epoch": 0.6564102564102564, "grad_norm": 0.2421875, "learning_rate": 0.00038782228135552613, "loss": 2.5908, "step": 640 }, { "epoch": 0.6564102564102564, "eval_loss": null, "eval_runtime": 73.841, "eval_samples_per_second": 9.304, "eval_steps_per_second": 1.165, "step": 640 }, { "epoch": 0.6574358974358975, "grad_norm": 0.2373046875, "learning_rate": 0.0003874790223575549, "loss": 2.7731, "step": 641 }, { "epoch": 0.6584615384615384, "grad_norm": 0.2373046875, "learning_rate": 0.00038713539142261425, "loss": 2.5467, "step": 642 }, { "epoch": 0.6594871794871795, "grad_norm": 0.197265625, "learning_rate": 0.0003867913894803663, "loss": 2.5883, "step": 643 }, { "epoch": 0.6605128205128206, "grad_norm": 0.228515625, "learning_rate": 0.0003864470174614771, "loss": 2.4547, "step": 644 }, { "epoch": 0.6615384615384615, "grad_norm": 0.21484375, "learning_rate": 0.0003861022762976136, "loss": 2.5617, "step": 645 }, { "epoch": 0.6625641025641026, "grad_norm": 0.1953125, "learning_rate": 0.0003857571669214417, "loss": 2.5294, "step": 646 }, { "epoch": 0.6635897435897435, "grad_norm": 0.2060546875, "learning_rate": 0.00038541169026662343, "loss": 2.6425, "step": 647 }, { "epoch": 0.6646153846153846, "grad_norm": 0.2216796875, "learning_rate": 0.0003850658472678143, "loss": 2.4164, "step": 648 }, { "epoch": 0.6656410256410257, "grad_norm": 0.2431640625, "learning_rate": 0.0003847196388606611, "loss": 2.7438, "step": 649 }, { "epoch": 0.6666666666666666, "grad_norm": 0.20703125, "learning_rate": 0.00038437306598179905, "loss": 2.6079, "step": 650 }, { "epoch": 0.6676923076923077, "grad_norm": 0.216796875, "learning_rate": 0.0003840261295688496, "loss": 2.7817, "step": 651 }, { "epoch": 0.6687179487179488, "grad_norm": 0.2138671875, "learning_rate": 0.0003836788305604175, "loss": 2.5575, "step": 652 }, { "epoch": 0.6697435897435897, "grad_norm": 0.197265625, "learning_rate": 0.0003833311698960887, "loss": 2.5834, "step": 653 }, { "epoch": 0.6707692307692308, "grad_norm": 
0.212890625, "learning_rate": 0.00038298314851642744, "loss": 2.5504, "step": 654 }, { "epoch": 0.6717948717948717, "grad_norm": 0.2060546875, "learning_rate": 0.00038263476736297375, "loss": 2.736, "step": 655 }, { "epoch": 0.6728205128205128, "grad_norm": 0.2080078125, "learning_rate": 0.00038228602737824113, "loss": 2.6853, "step": 656 }, { "epoch": 0.6738461538461539, "grad_norm": 0.1943359375, "learning_rate": 0.0003819369295057139, "loss": 2.5801, "step": 657 }, { "epoch": 0.6748717948717948, "grad_norm": 0.2080078125, "learning_rate": 0.00038158747468984447, "loss": 2.4488, "step": 658 }, { "epoch": 0.6758974358974359, "grad_norm": 0.2119140625, "learning_rate": 0.00038123766387605107, "loss": 2.5575, "step": 659 }, { "epoch": 0.676923076923077, "grad_norm": 0.2119140625, "learning_rate": 0.00038088749801071497, "loss": 2.6971, "step": 660 }, { "epoch": 0.6779487179487179, "grad_norm": 0.1904296875, "learning_rate": 0.00038053697804117794, "loss": 2.5725, "step": 661 }, { "epoch": 0.678974358974359, "grad_norm": 0.23828125, "learning_rate": 0.00038018610491573996, "loss": 2.6447, "step": 662 }, { "epoch": 0.68, "grad_norm": 0.20703125, "learning_rate": 0.0003798348795836562, "loss": 2.6148, "step": 663 }, { "epoch": 0.681025641025641, "grad_norm": 0.228515625, "learning_rate": 0.0003794833029951348, "loss": 2.5099, "step": 664 }, { "epoch": 0.6820512820512821, "grad_norm": 0.25, "learning_rate": 0.0003791313761013343, "loss": 2.7695, "step": 665 }, { "epoch": 0.683076923076923, "grad_norm": 0.20703125, "learning_rate": 0.00037877909985436066, "loss": 2.6397, "step": 666 }, { "epoch": 0.6841025641025641, "grad_norm": 0.197265625, "learning_rate": 0.00037842647520726537, "loss": 2.6185, "step": 667 }, { "epoch": 0.6851282051282052, "grad_norm": 0.212890625, "learning_rate": 0.0003780735031140421, "loss": 2.4917, "step": 668 }, { "epoch": 0.6861538461538461, "grad_norm": 0.296875, "learning_rate": 0.0003777201845296249, "loss": 2.5357, "step": 669 }, { "epoch": 
0.6871794871794872, "grad_norm": 0.22265625, "learning_rate": 0.00037736652040988475, "loss": 2.6593, "step": 670 }, { "epoch": 0.6882051282051282, "grad_norm": 0.2080078125, "learning_rate": 0.00037701251171162785, "loss": 2.5464, "step": 671 }, { "epoch": 0.6892307692307692, "grad_norm": 0.2099609375, "learning_rate": 0.00037665815939259253, "loss": 2.6009, "step": 672 }, { "epoch": 0.6902564102564103, "grad_norm": 0.25390625, "learning_rate": 0.00037630346441144656, "loss": 2.7386, "step": 673 }, { "epoch": 0.6912820512820513, "grad_norm": 0.2353515625, "learning_rate": 0.00037594842772778513, "loss": 2.5905, "step": 674 }, { "epoch": 0.6923076923076923, "grad_norm": 0.2255859375, "learning_rate": 0.00037559305030212745, "loss": 2.5557, "step": 675 }, { "epoch": 0.6933333333333334, "grad_norm": 0.2470703125, "learning_rate": 0.0003752373330959148, "loss": 2.6288, "step": 676 }, { "epoch": 0.6943589743589743, "grad_norm": 0.20703125, "learning_rate": 0.0003748812770715076, "loss": 2.5807, "step": 677 }, { "epoch": 0.6953846153846154, "grad_norm": 0.2236328125, "learning_rate": 0.0003745248831921831, "loss": 2.5731, "step": 678 }, { "epoch": 0.6964102564102564, "grad_norm": 0.1904296875, "learning_rate": 0.00037416815242213256, "loss": 2.5506, "step": 679 }, { "epoch": 0.6974358974358974, "grad_norm": 0.2333984375, "learning_rate": 0.0003738110857264583, "loss": 2.5158, "step": 680 }, { "epoch": 0.6974358974358974, "eval_loss": null, "eval_runtime": 73.8185, "eval_samples_per_second": 9.307, "eval_steps_per_second": 1.165, "step": 680 }, { "epoch": 0.6984615384615385, "grad_norm": 0.2119140625, "learning_rate": 0.000373453684071172, "loss": 2.5983, "step": 681 }, { "epoch": 0.6994871794871795, "grad_norm": 0.2119140625, "learning_rate": 0.0003730959484231911, "loss": 2.6236, "step": 682 }, { "epoch": 0.7005128205128205, "grad_norm": 0.2265625, "learning_rate": 0.00037273787975033686, "loss": 2.7207, "step": 683 }, { "epoch": 0.7015384615384616, "grad_norm": 
0.203125, "learning_rate": 0.0003723794790213315, "loss": 2.5688, "step": 684 }, { "epoch": 0.7025641025641025, "grad_norm": 0.2177734375, "learning_rate": 0.0003720207472057954, "loss": 2.4878, "step": 685 }, { "epoch": 0.7035897435897436, "grad_norm": 0.2041015625, "learning_rate": 0.00037166168527424503, "loss": 2.4823, "step": 686 }, { "epoch": 0.7046153846153846, "grad_norm": 0.189453125, "learning_rate": 0.0003713022941980895, "loss": 2.4424, "step": 687 }, { "epoch": 0.7056410256410256, "grad_norm": 0.2021484375, "learning_rate": 0.0003709425749496288, "loss": 2.6458, "step": 688 }, { "epoch": 0.7066666666666667, "grad_norm": 0.205078125, "learning_rate": 0.00037058252850205044, "loss": 2.7103, "step": 689 }, { "epoch": 0.7076923076923077, "grad_norm": 0.2294921875, "learning_rate": 0.00037022215582942734, "loss": 2.4544, "step": 690 }, { "epoch": 0.7087179487179487, "grad_norm": 0.1943359375, "learning_rate": 0.00036986145790671507, "loss": 2.4967, "step": 691 }, { "epoch": 0.7097435897435898, "grad_norm": 0.2041015625, "learning_rate": 0.00036950043570974875, "loss": 2.5871, "step": 692 }, { "epoch": 0.7107692307692308, "grad_norm": 0.224609375, "learning_rate": 0.0003691390902152412, "loss": 2.3779, "step": 693 }, { "epoch": 0.7117948717948718, "grad_norm": 0.267578125, "learning_rate": 0.0003687774224007796, "loss": 2.4174, "step": 694 }, { "epoch": 0.7128205128205128, "grad_norm": 0.2041015625, "learning_rate": 0.0003684154332448235, "loss": 2.6812, "step": 695 }, { "epoch": 0.7138461538461538, "grad_norm": 0.2109375, "learning_rate": 0.0003680531237267014, "loss": 2.6705, "step": 696 }, { "epoch": 0.7148717948717949, "grad_norm": 0.212890625, "learning_rate": 0.00036769049482660867, "loss": 2.4993, "step": 697 }, { "epoch": 0.7158974358974359, "grad_norm": 0.2001953125, "learning_rate": 0.00036732754752560485, "loss": 2.5529, "step": 698 }, { "epoch": 0.7169230769230769, "grad_norm": 0.205078125, "learning_rate": 0.00036696428280561086, "loss": 2.7544, 
"step": 699 }, { "epoch": 0.717948717948718, "grad_norm": 0.2099609375, "learning_rate": 0.00036660070164940613, "loss": 2.618, "step": 700 }, { "epoch": 0.718974358974359, "grad_norm": 0.2138671875, "learning_rate": 0.0003662368050406264, "loss": 2.7707, "step": 701 }, { "epoch": 0.72, "grad_norm": 0.1904296875, "learning_rate": 0.00036587259396376093, "loss": 2.5048, "step": 702 }, { "epoch": 0.721025641025641, "grad_norm": 0.212890625, "learning_rate": 0.0003655080694041495, "loss": 2.6354, "step": 703 }, { "epoch": 0.7220512820512821, "grad_norm": 0.1884765625, "learning_rate": 0.00036514323234798, "loss": 2.3791, "step": 704 }, { "epoch": 0.7230769230769231, "grad_norm": 0.2119140625, "learning_rate": 0.000364778083782286, "loss": 2.5579, "step": 705 }, { "epoch": 0.7241025641025641, "grad_norm": 0.21484375, "learning_rate": 0.00036441262469494366, "loss": 2.6219, "step": 706 }, { "epoch": 0.7251282051282051, "grad_norm": 0.1962890625, "learning_rate": 0.0003640468560746692, "loss": 2.6468, "step": 707 }, { "epoch": 0.7261538461538461, "grad_norm": 0.21875, "learning_rate": 0.00036368077891101627, "loss": 2.5197, "step": 708 }, { "epoch": 0.7271794871794872, "grad_norm": 0.21875, "learning_rate": 0.00036331439419437327, "loss": 2.5601, "step": 709 }, { "epoch": 0.7282051282051282, "grad_norm": 0.21484375, "learning_rate": 0.0003629477029159608, "loss": 2.8623, "step": 710 }, { "epoch": 0.7292307692307692, "grad_norm": 0.21875, "learning_rate": 0.00036258070606782854, "loss": 2.4527, "step": 711 }, { "epoch": 0.7302564102564103, "grad_norm": 0.21484375, "learning_rate": 0.00036221340464285325, "loss": 2.3911, "step": 712 }, { "epoch": 0.7312820512820513, "grad_norm": 0.2314453125, "learning_rate": 0.0003618457996347352, "loss": 2.6006, "step": 713 }, { "epoch": 0.7323076923076923, "grad_norm": 0.22265625, "learning_rate": 0.0003614778920379964, "loss": 2.4196, "step": 714 }, { "epoch": 0.7333333333333333, "grad_norm": 0.2041015625, "learning_rate": 
0.0003611096828479773, "loss": 2.516, "step": 715 }, { "epoch": 0.7343589743589743, "grad_norm": 0.2216796875, "learning_rate": 0.00036074117306083436, "loss": 2.5029, "step": 716 }, { "epoch": 0.7353846153846154, "grad_norm": 0.2119140625, "learning_rate": 0.00036037236367353717, "loss": 2.4165, "step": 717 }, { "epoch": 0.7364102564102564, "grad_norm": 0.2060546875, "learning_rate": 0.0003600032556838659, "loss": 2.5183, "step": 718 }, { "epoch": 0.7374358974358974, "grad_norm": 0.259765625, "learning_rate": 0.00035963385009040876, "loss": 2.6984, "step": 719 }, { "epoch": 0.7384615384615385, "grad_norm": 0.201171875, "learning_rate": 0.00035926414789255875, "loss": 2.43, "step": 720 }, { "epoch": 0.7384615384615385, "eval_loss": null, "eval_runtime": 73.8288, "eval_samples_per_second": 9.305, "eval_steps_per_second": 1.165, "step": 720 }, { "epoch": 0.7394871794871795, "grad_norm": 0.1982421875, "learning_rate": 0.00035889415009051154, "loss": 2.4484, "step": 721 }, { "epoch": 0.7405128205128205, "grad_norm": 0.2294921875, "learning_rate": 0.00035852385768526246, "loss": 2.5026, "step": 722 }, { "epoch": 0.7415384615384616, "grad_norm": 0.19140625, "learning_rate": 0.00035815327167860393, "loss": 2.5288, "step": 723 }, { "epoch": 0.7425641025641025, "grad_norm": 0.212890625, "learning_rate": 0.00035778239307312256, "loss": 2.4242, "step": 724 }, { "epoch": 0.7435897435897436, "grad_norm": 0.1923828125, "learning_rate": 0.00035741122287219663, "loss": 2.6553, "step": 725 }, { "epoch": 0.7446153846153846, "grad_norm": 0.2421875, "learning_rate": 0.0003570397620799934, "loss": 2.6681, "step": 726 }, { "epoch": 0.7456410256410256, "grad_norm": 0.20703125, "learning_rate": 0.0003566680117014661, "loss": 2.5054, "step": 727 }, { "epoch": 0.7466666666666667, "grad_norm": 0.2001953125, "learning_rate": 0.00035629597274235153, "loss": 2.6381, "step": 728 }, { "epoch": 0.7476923076923077, "grad_norm": 0.2021484375, "learning_rate": 0.0003559236462091672, "loss": 2.5799, 
"step": 729 }, { "epoch": 0.7487179487179487, "grad_norm": 0.1943359375, "learning_rate": 0.0003555510331092087, "loss": 2.6828, "step": 730 }, { "epoch": 0.7497435897435898, "grad_norm": 0.2099609375, "learning_rate": 0.00035517813445054666, "loss": 2.7495, "step": 731 }, { "epoch": 0.7507692307692307, "grad_norm": 0.2041015625, "learning_rate": 0.0003548049512420245, "loss": 2.551, "step": 732 }, { "epoch": 0.7517948717948718, "grad_norm": 0.2236328125, "learning_rate": 0.00035443148449325546, "loss": 2.444, "step": 733 }, { "epoch": 0.7528205128205128, "grad_norm": 0.1865234375, "learning_rate": 0.0003540577352146198, "loss": 2.4821, "step": 734 }, { "epoch": 0.7538461538461538, "grad_norm": 0.20703125, "learning_rate": 0.0003536837044172619, "loss": 2.45, "step": 735 }, { "epoch": 0.7548717948717949, "grad_norm": 0.21484375, "learning_rate": 0.0003533093931130884, "loss": 2.5726, "step": 736 }, { "epoch": 0.7558974358974359, "grad_norm": 0.2412109375, "learning_rate": 0.00035293480231476416, "loss": 2.6473, "step": 737 }, { "epoch": 0.7569230769230769, "grad_norm": 0.1982421875, "learning_rate": 0.00035255993303571055, "loss": 2.6106, "step": 738 }, { "epoch": 0.757948717948718, "grad_norm": 0.2158203125, "learning_rate": 0.00035218478629010216, "loss": 2.4296, "step": 739 }, { "epoch": 0.7589743589743589, "grad_norm": 0.189453125, "learning_rate": 0.00035180936309286444, "loss": 2.6179, "step": 740 }, { "epoch": 0.76, "grad_norm": 0.2392578125, "learning_rate": 0.00035143366445967056, "loss": 2.6238, "step": 741 }, { "epoch": 0.7610256410256411, "grad_norm": 0.203125, "learning_rate": 0.00035105769140693897, "loss": 2.3882, "step": 742 }, { "epoch": 0.762051282051282, "grad_norm": 0.203125, "learning_rate": 0.00035068144495183063, "loss": 2.3989, "step": 743 }, { "epoch": 0.7630769230769231, "grad_norm": 0.2294921875, "learning_rate": 0.0003503049261122459, "loss": 2.468, "step": 744 }, { "epoch": 0.764102564102564, "grad_norm": 0.2158203125, "learning_rate": 
0.0003499281359068222, "loss": 2.6152, "step": 745 }, { "epoch": 0.7651282051282051, "grad_norm": 0.197265625, "learning_rate": 0.00034955107535493114, "loss": 2.5048, "step": 746 }, { "epoch": 0.7661538461538462, "grad_norm": 0.2109375, "learning_rate": 0.0003491737454766757, "loss": 2.2689, "step": 747 }, { "epoch": 0.7671794871794871, "grad_norm": 0.2734375, "learning_rate": 0.0003487961472928874, "loss": 2.5413, "step": 748 }, { "epoch": 0.7682051282051282, "grad_norm": 0.1982421875, "learning_rate": 0.0003484182818251239, "loss": 2.49, "step": 749 }, { "epoch": 0.7692307692307693, "grad_norm": 0.212890625, "learning_rate": 0.0003480401500956657, "loss": 2.4083, "step": 750 }, { "epoch": 0.7702564102564102, "grad_norm": 0.19921875, "learning_rate": 0.00034766175312751367, "loss": 2.4498, "step": 751 }, { "epoch": 0.7712820512820513, "grad_norm": 0.205078125, "learning_rate": 0.00034728309194438645, "loss": 2.6144, "step": 752 }, { "epoch": 0.7723076923076924, "grad_norm": 0.2099609375, "learning_rate": 0.0003469041675707173, "loss": 2.6345, "step": 753 }, { "epoch": 0.7733333333333333, "grad_norm": 0.21484375, "learning_rate": 0.00034652498103165166, "loss": 2.6246, "step": 754 }, { "epoch": 0.7743589743589744, "grad_norm": 0.2236328125, "learning_rate": 0.00034614553335304404, "loss": 2.626, "step": 755 }, { "epoch": 0.7753846153846153, "grad_norm": 0.1923828125, "learning_rate": 0.0003457658255614556, "loss": 2.5476, "step": 756 }, { "epoch": 0.7764102564102564, "grad_norm": 0.216796875, "learning_rate": 0.00034538585868415126, "loss": 2.5455, "step": 757 }, { "epoch": 0.7774358974358975, "grad_norm": 0.18359375, "learning_rate": 0.0003450056337490968, "loss": 2.4534, "step": 758 }, { "epoch": 0.7784615384615384, "grad_norm": 0.2021484375, "learning_rate": 0.000344625151784956, "loss": 2.4045, "step": 759 }, { "epoch": 0.7794871794871795, "grad_norm": 0.20703125, "learning_rate": 0.0003442444138210883, "loss": 2.4511, "step": 760 }, { "epoch": 
0.7794871794871795, "eval_loss": null, "eval_runtime": 73.8988, "eval_samples_per_second": 9.296, "eval_steps_per_second": 1.164, "step": 760 }, { "epoch": 0.7805128205128206, "grad_norm": 0.220703125, "learning_rate": 0.0003438634208875455, "loss": 2.5253, "step": 761 }, { "epoch": 0.7815384615384615, "grad_norm": 0.2021484375, "learning_rate": 0.0003434821740150692, "loss": 2.4335, "step": 762 }, { "epoch": 0.7825641025641026, "grad_norm": 0.1982421875, "learning_rate": 0.00034310067423508816, "loss": 2.4023, "step": 763 }, { "epoch": 0.7835897435897435, "grad_norm": 0.197265625, "learning_rate": 0.00034271892257971536, "loss": 2.4175, "step": 764 }, { "epoch": 0.7846153846153846, "grad_norm": 0.181640625, "learning_rate": 0.00034233692008174495, "loss": 2.368, "step": 765 }, { "epoch": 0.7856410256410257, "grad_norm": 0.193359375, "learning_rate": 0.00034195466777465, "loss": 2.5828, "step": 766 }, { "epoch": 0.7866666666666666, "grad_norm": 0.1953125, "learning_rate": 0.00034157216669257923, "loss": 2.5544, "step": 767 }, { "epoch": 0.7876923076923077, "grad_norm": 0.1845703125, "learning_rate": 0.00034118941787035457, "loss": 2.5055, "step": 768 }, { "epoch": 0.7887179487179488, "grad_norm": 0.197265625, "learning_rate": 0.0003408064223434679, "loss": 2.3426, "step": 769 }, { "epoch": 0.7897435897435897, "grad_norm": 0.2099609375, "learning_rate": 0.0003404231811480789, "loss": 2.4081, "step": 770 }, { "epoch": 0.7907692307692308, "grad_norm": 0.205078125, "learning_rate": 0.00034003969532101175, "loss": 2.6909, "step": 771 }, { "epoch": 0.7917948717948718, "grad_norm": 0.1953125, "learning_rate": 0.00033965596589975223, "loss": 2.4486, "step": 772 }, { "epoch": 0.7928205128205128, "grad_norm": 0.193359375, "learning_rate": 0.00033927199392244536, "loss": 2.5242, "step": 773 }, { "epoch": 0.7938461538461539, "grad_norm": 0.1884765625, "learning_rate": 0.00033888778042789246, "loss": 2.6317, "step": 774 }, { "epoch": 0.7948717948717948, "grad_norm": 0.2109375, 
"learning_rate": 0.0003385033264555482, "loss": 2.5687, "step": 775 }, { "epoch": 0.7958974358974359, "grad_norm": 0.19921875, "learning_rate": 0.0003381186330455174, "loss": 2.4872, "step": 776 }, { "epoch": 0.796923076923077, "grad_norm": 0.2314453125, "learning_rate": 0.0003377337012385534, "loss": 2.441, "step": 777 }, { "epoch": 0.7979487179487179, "grad_norm": 0.212890625, "learning_rate": 0.000337348532076054, "loss": 2.4263, "step": 778 }, { "epoch": 0.798974358974359, "grad_norm": 0.197265625, "learning_rate": 0.00033696312660005934, "loss": 2.4195, "step": 779 }, { "epoch": 0.8, "grad_norm": 0.1923828125, "learning_rate": 0.0003365774858532487, "loss": 2.4034, "step": 780 }, { "epoch": 0.801025641025641, "grad_norm": 0.24609375, "learning_rate": 0.00033619161087893804, "loss": 2.9613, "step": 781 }, { "epoch": 0.8020512820512821, "grad_norm": 0.1865234375, "learning_rate": 0.00033580550272107706, "loss": 2.5318, "step": 782 }, { "epoch": 0.803076923076923, "grad_norm": 0.2138671875, "learning_rate": 0.00033541916242424603, "loss": 2.3944, "step": 783 }, { "epoch": 0.8041025641025641, "grad_norm": 0.208984375, "learning_rate": 0.0003350325910336536, "loss": 2.5475, "step": 784 }, { "epoch": 0.8051282051282052, "grad_norm": 0.2158203125, "learning_rate": 0.0003346457895951332, "loss": 2.3374, "step": 785 }, { "epoch": 0.8061538461538461, "grad_norm": 0.2080078125, "learning_rate": 0.00033425875915514113, "loss": 2.5155, "step": 786 }, { "epoch": 0.8071794871794872, "grad_norm": 0.1953125, "learning_rate": 0.0003338715007607528, "loss": 2.5609, "step": 787 }, { "epoch": 0.8082051282051282, "grad_norm": 0.21484375, "learning_rate": 0.00033348401545966067, "loss": 2.343, "step": 788 }, { "epoch": 0.8092307692307692, "grad_norm": 0.255859375, "learning_rate": 0.0003330963043001708, "loss": 2.3984, "step": 789 }, { "epoch": 0.8102564102564103, "grad_norm": 0.310546875, "learning_rate": 0.0003327083683312004, "loss": 2.436, "step": 790 }, { "epoch": 
0.8112820512820513, "grad_norm": 0.21484375, "learning_rate": 0.00033232020860227504, "loss": 2.4495, "step": 791 }, { "epoch": 0.8123076923076923, "grad_norm": 0.251953125, "learning_rate": 0.00033193182616352533, "loss": 2.3882, "step": 792 }, { "epoch": 0.8133333333333334, "grad_norm": 0.255859375, "learning_rate": 0.0003315432220656847, "loss": 2.531, "step": 793 }, { "epoch": 0.8143589743589743, "grad_norm": 0.2060546875, "learning_rate": 0.00033115439736008603, "loss": 2.4703, "step": 794 }, { "epoch": 0.8153846153846154, "grad_norm": 0.193359375, "learning_rate": 0.00033076535309865926, "loss": 2.5429, "step": 795 }, { "epoch": 0.8164102564102564, "grad_norm": 0.2275390625, "learning_rate": 0.00033037609033392806, "loss": 2.3957, "step": 796 }, { "epoch": 0.8174358974358974, "grad_norm": 0.25, "learning_rate": 0.00032998661011900734, "loss": 2.4254, "step": 797 }, { "epoch": 0.8184615384615385, "grad_norm": 0.22265625, "learning_rate": 0.00032959691350760054, "loss": 2.5591, "step": 798 }, { "epoch": 0.8194871794871795, "grad_norm": 0.2001953125, "learning_rate": 0.0003292070015539962, "loss": 2.3209, "step": 799 }, { "epoch": 0.8205128205128205, "grad_norm": 0.203125, "learning_rate": 0.0003288168753130657, "loss": 2.3835, "step": 800 }, { "epoch": 0.8205128205128205, "eval_loss": NaN, "eval_runtime": 73.8343, "eval_samples_per_second": 9.305, "eval_steps_per_second": 1.165, "step": 800 }, { "epoch": 0.8215384615384616, "grad_norm": 0.236328125, "learning_rate": 0.0003284265358402598, "loss": 2.6226, "step": 801 }, { "epoch": 0.8225641025641026, "grad_norm": 0.197265625, "learning_rate": 0.00032803598419160664, "loss": 2.5038, "step": 802 }, { "epoch": 0.8235897435897436, "grad_norm": 0.216796875, "learning_rate": 0.0003276452214237079, "loss": 2.5074, "step": 803 }, { "epoch": 0.8246153846153846, "grad_norm": 0.1796875, "learning_rate": 0.00032725424859373687, "loss": 2.526, "step": 804 }, { "epoch": 0.8256410256410256, "grad_norm": 0.1923828125, 
"learning_rate": 0.0003268630667594348, "loss": 2.55, "step": 805 }, { "epoch": 0.8266666666666667, "grad_norm": 0.1884765625, "learning_rate": 0.00032647167697910846, "loss": 2.3791, "step": 806 }, { "epoch": 0.8276923076923077, "grad_norm": 0.23046875, "learning_rate": 0.0003260800803116274, "loss": 2.5704, "step": 807 }, { "epoch": 0.8287179487179487, "grad_norm": 0.1875, "learning_rate": 0.0003256882778164205, "loss": 2.5329, "step": 808 }, { "epoch": 0.8297435897435897, "grad_norm": 0.1953125, "learning_rate": 0.0003252962705534738, "loss": 2.3182, "step": 809 }, { "epoch": 0.8307692307692308, "grad_norm": 0.193359375, "learning_rate": 0.0003249040595833274, "loss": 2.5185, "step": 810 }, { "epoch": 0.8317948717948718, "grad_norm": 0.2041015625, "learning_rate": 0.0003245116459670722, "loss": 2.4739, "step": 811 }, { "epoch": 0.8328205128205128, "grad_norm": 0.1884765625, "learning_rate": 0.00032411903076634747, "loss": 2.4063, "step": 812 }, { "epoch": 0.8338461538461538, "grad_norm": 0.205078125, "learning_rate": 0.0003237262150433379, "loss": 2.4, "step": 813 }, { "epoch": 0.8348717948717949, "grad_norm": 0.193359375, "learning_rate": 0.0003233331998607706, "loss": 2.372, "step": 814 }, { "epoch": 0.8358974358974359, "grad_norm": 0.1943359375, "learning_rate": 0.0003229399862819125, "loss": 2.4962, "step": 815 }, { "epoch": 0.8369230769230769, "grad_norm": 0.19921875, "learning_rate": 0.0003225465753705669, "loss": 2.4999, "step": 816 }, { "epoch": 0.837948717948718, "grad_norm": 0.203125, "learning_rate": 0.0003221529681910712, "loss": 2.4755, "step": 817 }, { "epoch": 0.838974358974359, "grad_norm": 0.205078125, "learning_rate": 0.0003217591658082939, "loss": 2.3165, "step": 818 }, { "epoch": 0.84, "grad_norm": 0.232421875, "learning_rate": 0.0003213651692876314, "loss": 2.5371, "step": 819 }, { "epoch": 0.841025641025641, "grad_norm": 0.21875, "learning_rate": 0.0003209709796950054, "loss": 2.4433, "step": 820 }, { "epoch": 0.8420512820512821, 
"grad_norm": 0.2021484375, "learning_rate": 0.00032057659809685984, "loss": 2.3621, "step": 821 }, { "epoch": 0.8430769230769231, "grad_norm": 0.2021484375, "learning_rate": 0.00032018202556015834, "loss": 2.4259, "step": 822 }, { "epoch": 0.8441025641025641, "grad_norm": 0.1923828125, "learning_rate": 0.00031978726315238094, "loss": 2.4894, "step": 823 }, { "epoch": 0.8451282051282051, "grad_norm": 0.2099609375, "learning_rate": 0.0003193923119415213, "loss": 2.5457, "step": 824 }, { "epoch": 0.8461538461538461, "grad_norm": 0.20703125, "learning_rate": 0.00031899717299608383, "loss": 2.5317, "step": 825 }, { "epoch": 0.8471794871794872, "grad_norm": 0.1943359375, "learning_rate": 0.00031860184738508117, "loss": 2.432, "step": 826 }, { "epoch": 0.8482051282051282, "grad_norm": 0.2001953125, "learning_rate": 0.00031820633617803064, "loss": 2.4432, "step": 827 }, { "epoch": 0.8492307692307692, "grad_norm": 0.189453125, "learning_rate": 0.00031781064044495176, "loss": 2.504, "step": 828 }, { "epoch": 0.8502564102564103, "grad_norm": 0.1884765625, "learning_rate": 0.00031741476125636325, "loss": 2.3986, "step": 829 }, { "epoch": 0.8512820512820513, "grad_norm": 0.2001953125, "learning_rate": 0.00031701869968328036, "loss": 2.3167, "step": 830 }, { "epoch": 0.8523076923076923, "grad_norm": 0.1923828125, "learning_rate": 0.0003166224567972114, "loss": 2.5175, "step": 831 }, { "epoch": 0.8533333333333334, "grad_norm": 0.1962890625, "learning_rate": 0.00031622603367015544, "loss": 2.6008, "step": 832 }, { "epoch": 0.8543589743589743, "grad_norm": 0.2109375, "learning_rate": 0.0003158294313745992, "loss": 2.4091, "step": 833 }, { "epoch": 0.8553846153846154, "grad_norm": 0.2099609375, "learning_rate": 0.00031543265098351404, "loss": 2.5893, "step": 834 }, { "epoch": 0.8564102564102564, "grad_norm": 0.1962890625, "learning_rate": 0.0003150356935703531, "loss": 2.5106, "step": 835 }, { "epoch": 0.8574358974358974, "grad_norm": 0.2021484375, "learning_rate": 
0.00031463856020904853, "loss": 2.3589, "step": 836 }, { "epoch": 0.8584615384615385, "grad_norm": 0.1875, "learning_rate": 0.00031424125197400843, "loss": 2.3726, "step": 837 }, { "epoch": 0.8594871794871795, "grad_norm": 0.18359375, "learning_rate": 0.0003138437699401141, "loss": 2.4017, "step": 838 }, { "epoch": 0.8605128205128205, "grad_norm": 0.2236328125, "learning_rate": 0.00031344611518271693, "loss": 2.3203, "step": 839 }, { "epoch": 0.8615384615384616, "grad_norm": 0.1884765625, "learning_rate": 0.00031304828877763566, "loss": 2.1151, "step": 840 }, { "epoch": 0.8615384615384616, "eval_loss": NaN, "eval_runtime": 73.9396, "eval_samples_per_second": 9.291, "eval_steps_per_second": 1.163, "step": 840 }, { "epoch": 0.8625641025641025, "grad_norm": 0.1826171875, "learning_rate": 0.0003126502918011532, "loss": 2.2937, "step": 841 }, { "epoch": 0.8635897435897436, "grad_norm": 0.1767578125, "learning_rate": 0.0003122521253300144, "loss": 2.5178, "step": 842 }, { "epoch": 0.8646153846153846, "grad_norm": 0.1884765625, "learning_rate": 0.00031185379044142223, "loss": 2.3875, "step": 843 }, { "epoch": 0.8656410256410256, "grad_norm": 0.20703125, "learning_rate": 0.0003114552882130355, "loss": 2.4026, "step": 844 }, { "epoch": 0.8666666666666667, "grad_norm": 0.1875, "learning_rate": 0.00031105661972296553, "loss": 2.453, "step": 845 }, { "epoch": 0.8676923076923077, "grad_norm": 0.205078125, "learning_rate": 0.0003106577860497737, "loss": 2.5126, "step": 846 }, { "epoch": 0.8687179487179487, "grad_norm": 0.1982421875, "learning_rate": 0.0003102587882724682, "loss": 2.2988, "step": 847 }, { "epoch": 0.8697435897435898, "grad_norm": 0.1953125, "learning_rate": 0.0003098596274705011, "loss": 2.4134, "step": 848 }, { "epoch": 0.8707692307692307, "grad_norm": 0.1826171875, "learning_rate": 0.0003094603047237656, "loss": 2.4299, "step": 849 }, { "epoch": 0.8717948717948718, "grad_norm": 0.1943359375, "learning_rate": 0.0003090608211125931, "loss": 2.5191, "step": 850 }, 
{ "epoch": 0.8728205128205129, "grad_norm": 0.228515625, "learning_rate": 0.00030866117771775, "loss": 2.5536, "step": 851 }, { "epoch": 0.8738461538461538, "grad_norm": 0.193359375, "learning_rate": 0.00030826137562043514, "loss": 2.3984, "step": 852 }, { "epoch": 0.8748717948717949, "grad_norm": 0.2158203125, "learning_rate": 0.0003078614159022767, "loss": 2.6074, "step": 853 }, { "epoch": 0.8758974358974358, "grad_norm": 0.1884765625, "learning_rate": 0.00030746129964532914, "loss": 2.4885, "step": 854 }, { "epoch": 0.8769230769230769, "grad_norm": 0.2197265625, "learning_rate": 0.00030706102793207074, "loss": 2.5023, "step": 855 }, { "epoch": 0.877948717948718, "grad_norm": 0.1962890625, "learning_rate": 0.0003066606018453999, "loss": 2.5833, "step": 856 }, { "epoch": 0.8789743589743589, "grad_norm": 0.2490234375, "learning_rate": 0.0003062600224686331, "loss": 2.5119, "step": 857 }, { "epoch": 0.88, "grad_norm": 0.21875, "learning_rate": 0.00030585929088550136, "loss": 2.5439, "step": 858 }, { "epoch": 0.8810256410256411, "grad_norm": 0.1767578125, "learning_rate": 0.00030545840818014733, "loss": 2.3164, "step": 859 }, { "epoch": 0.882051282051282, "grad_norm": 0.193359375, "learning_rate": 0.00030505737543712276, "loss": 2.4732, "step": 860 }, { "epoch": 0.8830769230769231, "grad_norm": 0.1904296875, "learning_rate": 0.00030465619374138507, "loss": 2.4447, "step": 861 }, { "epoch": 0.884102564102564, "grad_norm": 0.1875, "learning_rate": 0.00030425486417829495, "loss": 2.6294, "step": 862 }, { "epoch": 0.8851282051282051, "grad_norm": 0.208984375, "learning_rate": 0.0003038533878336128, "loss": 2.311, "step": 863 }, { "epoch": 0.8861538461538462, "grad_norm": 0.19140625, "learning_rate": 0.00030345176579349653, "loss": 2.4898, "step": 864 }, { "epoch": 0.8871794871794871, "grad_norm": 0.189453125, "learning_rate": 0.0003030499991444977, "loss": 2.4094, "step": 865 }, { "epoch": 0.8882051282051282, "grad_norm": 0.28125, "learning_rate": 0.00030264808897355957, 
"loss": 2.5453, "step": 866 }, { "epoch": 0.8892307692307693, "grad_norm": 0.1884765625, "learning_rate": 0.00030224603636801347, "loss": 2.3397, "step": 867 }, { "epoch": 0.8902564102564102, "grad_norm": 0.1806640625, "learning_rate": 0.000301843842415576, "loss": 2.2851, "step": 868 }, { "epoch": 0.8912820512820513, "grad_norm": 0.1875, "learning_rate": 0.0003014415082043463, "loss": 2.3974, "step": 869 }, { "epoch": 0.8923076923076924, "grad_norm": 0.1953125, "learning_rate": 0.0003010390348228029, "loss": 2.2571, "step": 870 }, { "epoch": 0.8933333333333333, "grad_norm": 0.1962890625, "learning_rate": 0.00030063642335980095, "loss": 2.4686, "step": 871 }, { "epoch": 0.8943589743589744, "grad_norm": 0.2158203125, "learning_rate": 0.00030023367490456904, "loss": 2.6228, "step": 872 }, { "epoch": 0.8953846153846153, "grad_norm": 0.173828125, "learning_rate": 0.00029983079054670623, "loss": 2.2345, "step": 873 }, { "epoch": 0.8964102564102564, "grad_norm": 0.2197265625, "learning_rate": 0.00029942777137617984, "loss": 2.4774, "step": 874 }, { "epoch": 0.8974358974358975, "grad_norm": 0.197265625, "learning_rate": 0.00029902461848332125, "loss": 2.4289, "step": 875 }, { "epoch": 0.8984615384615384, "grad_norm": 0.216796875, "learning_rate": 0.00029862133295882387, "loss": 2.343, "step": 876 }, { "epoch": 0.8994871794871795, "grad_norm": 0.1767578125, "learning_rate": 0.00029821791589373995, "loss": 2.4987, "step": 877 }, { "epoch": 0.9005128205128206, "grad_norm": 0.1943359375, "learning_rate": 0.0002978143683794778, "loss": 2.4377, "step": 878 }, { "epoch": 0.9015384615384615, "grad_norm": 0.232421875, "learning_rate": 0.00029741069150779825, "loss": 2.3818, "step": 879 }, { "epoch": 0.9025641025641026, "grad_norm": 0.205078125, "learning_rate": 0.0002970068863708123, "loss": 2.4363, "step": 880 }, { "epoch": 0.9025641025641026, "eval_loss": NaN, "eval_runtime": 73.8369, "eval_samples_per_second": 9.304, "eval_steps_per_second": 1.165, "step": 880 }, { "epoch": 
0.9035897435897436, "grad_norm": 0.19140625, "learning_rate": 0.00029660295406097804, "loss": 2.2977, "step": 881 }, { "epoch": 0.9046153846153846, "grad_norm": 0.21484375, "learning_rate": 0.0002961988956710976, "loss": 2.5454, "step": 882 }, { "epoch": 0.9056410256410257, "grad_norm": 0.2138671875, "learning_rate": 0.0002957947122943139, "loss": 2.4156, "step": 883 }, { "epoch": 0.9066666666666666, "grad_norm": 0.2294921875, "learning_rate": 0.0002953904050241085, "loss": 2.3201, "step": 884 }, { "epoch": 0.9076923076923077, "grad_norm": 0.201171875, "learning_rate": 0.0002949859749542977, "loss": 2.4136, "step": 885 }, { "epoch": 0.9087179487179488, "grad_norm": 0.2255859375, "learning_rate": 0.00029458142317903037, "loss": 2.4207, "step": 886 }, { "epoch": 0.9097435897435897, "grad_norm": 0.2138671875, "learning_rate": 0.0002941767507927843, "loss": 2.3247, "step": 887 }, { "epoch": 0.9107692307692308, "grad_norm": 0.2294921875, "learning_rate": 0.0002937719588903639, "loss": 2.4688, "step": 888 }, { "epoch": 0.9117948717948718, "grad_norm": 0.18359375, "learning_rate": 0.00029336704856689665, "loss": 2.3908, "step": 889 }, { "epoch": 0.9128205128205128, "grad_norm": 0.20703125, "learning_rate": 0.0002929620209178307, "loss": 2.3451, "step": 890 }, { "epoch": 0.9138461538461539, "grad_norm": 0.1982421875, "learning_rate": 0.0002925568770389314, "loss": 2.2828, "step": 891 }, { "epoch": 0.9148717948717948, "grad_norm": 0.201171875, "learning_rate": 0.0002921516180262785, "loss": 2.497, "step": 892 }, { "epoch": 0.9158974358974359, "grad_norm": 0.22265625, "learning_rate": 0.0002917462449762635, "loss": 2.3791, "step": 893 }, { "epoch": 0.916923076923077, "grad_norm": 0.2001953125, "learning_rate": 0.0002913407589855861, "loss": 2.3309, "step": 894 }, { "epoch": 0.9179487179487179, "grad_norm": 0.19140625, "learning_rate": 0.00029093516115125177, "loss": 2.2947, "step": 895 }, { "epoch": 0.918974358974359, "grad_norm": 0.19140625, "learning_rate": 
0.0002905294525705686, "loss": 2.5789, "step": 896 }, { "epoch": 0.92, "grad_norm": 0.1943359375, "learning_rate": 0.000290123634341144, "loss": 2.3122, "step": 897 }, { "epoch": 0.921025641025641, "grad_norm": 0.19140625, "learning_rate": 0.00028971770756088227, "loss": 2.3093, "step": 898 }, { "epoch": 0.9220512820512821, "grad_norm": 0.1953125, "learning_rate": 0.0002893116733279815, "loss": 2.288, "step": 899 }, { "epoch": 0.9230769230769231, "grad_norm": 0.1953125, "learning_rate": 0.0002889055327409301, "loss": 2.2348, "step": 900 }, { "epoch": 0.9241025641025641, "grad_norm": 0.2041015625, "learning_rate": 0.0002884992868985044, "loss": 2.3852, "step": 901 }, { "epoch": 0.9251282051282051, "grad_norm": 0.1875, "learning_rate": 0.0002880929368997657, "loss": 2.3964, "step": 902 }, { "epoch": 0.9261538461538461, "grad_norm": 0.19921875, "learning_rate": 0.00028768648384405694, "loss": 2.4764, "step": 903 }, { "epoch": 0.9271794871794872, "grad_norm": 0.1845703125, "learning_rate": 0.00028727992883099957, "loss": 2.2759, "step": 904 }, { "epoch": 0.9282051282051282, "grad_norm": 0.19140625, "learning_rate": 0.0002868732729604913, "loss": 2.4534, "step": 905 }, { "epoch": 0.9292307692307692, "grad_norm": 0.1826171875, "learning_rate": 0.0002864665173327026, "loss": 2.3142, "step": 906 }, { "epoch": 0.9302564102564103, "grad_norm": 0.2255859375, "learning_rate": 0.00028605966304807355, "loss": 2.4911, "step": 907 }, { "epoch": 0.9312820512820513, "grad_norm": 0.193359375, "learning_rate": 0.0002856527112073116, "loss": 2.4108, "step": 908 }, { "epoch": 0.9323076923076923, "grad_norm": 0.1865234375, "learning_rate": 0.0002852456629113876, "loss": 2.299, "step": 909 }, { "epoch": 0.9333333333333333, "grad_norm": 0.181640625, "learning_rate": 0.00028483851926153393, "loss": 2.4696, "step": 910 }, { "epoch": 0.9343589743589743, "grad_norm": 0.193359375, "learning_rate": 0.0002844312813592404, "loss": 2.4572, "step": 911 }, { "epoch": 0.9353846153846154, "grad_norm": 
0.181640625, "learning_rate": 0.0002840239503062522, "loss": 2.2182, "step": 912 }, { "epoch": 0.9364102564102564, "grad_norm": 0.22265625, "learning_rate": 0.00028361652720456634, "loss": 2.2569, "step": 913 }, { "epoch": 0.9374358974358974, "grad_norm": 0.19140625, "learning_rate": 0.0002832090131564288, "loss": 2.3693, "step": 914 }, { "epoch": 0.9384615384615385, "grad_norm": 0.220703125, "learning_rate": 0.0002828014092643319, "loss": 2.3154, "step": 915 }, { "epoch": 0.9394871794871795, "grad_norm": 0.197265625, "learning_rate": 0.0002823937166310107, "loss": 2.3093, "step": 916 }, { "epoch": 0.9405128205128205, "grad_norm": 0.189453125, "learning_rate": 0.00028198593635944046, "loss": 2.4648, "step": 917 }, { "epoch": 0.9415384615384615, "grad_norm": 0.201171875, "learning_rate": 0.0002815780695528336, "loss": 2.2622, "step": 918 }, { "epoch": 0.9425641025641026, "grad_norm": 0.2392578125, "learning_rate": 0.0002811701173146367, "loss": 2.4457, "step": 919 }, { "epoch": 0.9435897435897436, "grad_norm": 0.1826171875, "learning_rate": 0.00028076208074852727, "loss": 2.3235, "step": 920 }, { "epoch": 0.9435897435897436, "eval_loss": NaN, "eval_runtime": 73.824, "eval_samples_per_second": 9.306, "eval_steps_per_second": 1.165, "step": 920 }, { "epoch": 0.9446153846153846, "grad_norm": 0.18359375, "learning_rate": 0.000280353960958411, "loss": 2.2633, "step": 921 }, { "epoch": 0.9456410256410256, "grad_norm": 0.2177734375, "learning_rate": 0.0002799457590484188, "loss": 2.5573, "step": 922 }, { "epoch": 0.9466666666666667, "grad_norm": 0.1708984375, "learning_rate": 0.00027953747612290396, "loss": 2.1333, "step": 923 }, { "epoch": 0.9476923076923077, "grad_norm": 0.193359375, "learning_rate": 0.0002791291132864386, "loss": 2.1743, "step": 924 }, { "epoch": 0.9487179487179487, "grad_norm": 0.1806640625, "learning_rate": 0.0002787206716438111, "loss": 2.2939, "step": 925 }, { "epoch": 0.9497435897435897, "grad_norm": 0.193359375, "learning_rate": 
0.00027831215230002314, "loss": 2.3892, "step": 926 }, { "epoch": 0.9507692307692308, "grad_norm": 0.1806640625, "learning_rate": 0.0002779035563602867, "loss": 2.3148, "step": 927 }, { "epoch": 0.9517948717948718, "grad_norm": 0.1826171875, "learning_rate": 0.00027749488493002074, "loss": 2.3834, "step": 928 }, { "epoch": 0.9528205128205128, "grad_norm": 0.384765625, "learning_rate": 0.0002770861391148486, "loss": 2.4278, "step": 929 }, { "epoch": 0.9538461538461539, "grad_norm": 0.1875, "learning_rate": 0.00027667732002059495, "loss": 2.3561, "step": 930 }, { "epoch": 0.9548717948717949, "grad_norm": 0.2255859375, "learning_rate": 0.0002762684287532825, "loss": 2.3556, "step": 931 }, { "epoch": 0.9558974358974359, "grad_norm": 0.1767578125, "learning_rate": 0.0002758594664191292, "loss": 2.4342, "step": 932 }, { "epoch": 0.9569230769230769, "grad_norm": 0.1953125, "learning_rate": 0.00027545043412454567, "loss": 2.3745, "step": 933 }, { "epoch": 0.9579487179487179, "grad_norm": 0.2060546875, "learning_rate": 0.0002750413329761314, "loss": 2.3178, "step": 934 }, { "epoch": 0.958974358974359, "grad_norm": 0.1953125, "learning_rate": 0.00027463216408067214, "loss": 2.3885, "step": 935 }, { "epoch": 0.96, "grad_norm": 0.1904296875, "learning_rate": 0.0002742229285451373, "loss": 2.3986, "step": 936 }, { "epoch": 0.961025641025641, "grad_norm": 0.1796875, "learning_rate": 0.0002738136274766761, "loss": 2.3568, "step": 937 }, { "epoch": 0.9620512820512821, "grad_norm": 0.1748046875, "learning_rate": 0.00027340426198261544, "loss": 2.3505, "step": 938 }, { "epoch": 0.963076923076923, "grad_norm": 0.1982421875, "learning_rate": 0.0002729948331704563, "loss": 2.2724, "step": 939 }, { "epoch": 0.9641025641025641, "grad_norm": 0.181640625, "learning_rate": 0.00027258534214787106, "loss": 2.2841, "step": 940 }, { "epoch": 0.9651282051282051, "grad_norm": 0.220703125, "learning_rate": 0.0002721757900227003, "loss": 2.6108, "step": 941 }, { "epoch": 0.9661538461538461, 
"grad_norm": 0.1748046875, "learning_rate": 0.00027176617790295013, "loss": 2.4668, "step": 942 }, { "epoch": 0.9671794871794872, "grad_norm": 0.19140625, "learning_rate": 0.0002713565068967887, "loss": 2.1891, "step": 943 }, { "epoch": 0.9682051282051282, "grad_norm": 0.2099609375, "learning_rate": 0.0002709467781125436, "loss": 2.5307, "step": 944 }, { "epoch": 0.9692307692307692, "grad_norm": 0.201171875, "learning_rate": 0.0002705369926586988, "loss": 2.2617, "step": 945 }, { "epoch": 0.9702564102564103, "grad_norm": 0.189453125, "learning_rate": 0.00027012715164389145, "loss": 2.4517, "step": 946 }, { "epoch": 0.9712820512820513, "grad_norm": 0.1865234375, "learning_rate": 0.0002697172561769091, "loss": 2.2625, "step": 947 }, { "epoch": 0.9723076923076923, "grad_norm": 0.2041015625, "learning_rate": 0.00026930730736668656, "loss": 2.3476, "step": 948 }, { "epoch": 0.9733333333333334, "grad_norm": 0.1884765625, "learning_rate": 0.000268897306322303, "loss": 2.3752, "step": 949 }, { "epoch": 0.9743589743589743, "grad_norm": 0.1845703125, "learning_rate": 0.00026848725415297886, "loss": 2.3982, "step": 950 }, { "epoch": 0.9753846153846154, "grad_norm": 0.2060546875, "learning_rate": 0.00026807715196807303, "loss": 2.275, "step": 951 }, { "epoch": 0.9764102564102564, "grad_norm": 0.185546875, "learning_rate": 0.0002676670008770795, "loss": 2.3404, "step": 952 }, { "epoch": 0.9774358974358974, "grad_norm": 0.2060546875, "learning_rate": 0.0002672568019896248, "loss": 2.3578, "step": 953 }, { "epoch": 0.9784615384615385, "grad_norm": 0.1826171875, "learning_rate": 0.00026684655641546443, "loss": 2.2652, "step": 954 }, { "epoch": 0.9794871794871794, "grad_norm": 0.17578125, "learning_rate": 0.0002664362652644806, "loss": 2.4094, "step": 955 }, { "epoch": 0.9805128205128205, "grad_norm": 0.1884765625, "learning_rate": 0.0002660259296466786, "loss": 2.4241, "step": 956 }, { "epoch": 0.9815384615384616, "grad_norm": 0.1884765625, "learning_rate": 0.00026561555067218404, 
"loss": 2.4127, "step": 957 }, { "epoch": 0.9825641025641025, "grad_norm": 0.1630859375, "learning_rate": 0.0002652051294512399, "loss": 2.2042, "step": 958 }, { "epoch": 0.9835897435897436, "grad_norm": 0.1787109375, "learning_rate": 0.00026479466709420323, "loss": 2.4081, "step": 959 }, { "epoch": 0.9846153846153847, "grad_norm": 0.1689453125, "learning_rate": 0.00026438416471154274, "loss": 2.2581, "step": 960 }, { "epoch": 0.9846153846153847, "eval_loss": NaN, "eval_runtime": 73.8291, "eval_samples_per_second": 9.305, "eval_steps_per_second": 1.165, "step": 960 }, { "epoch": 0.9856410256410256, "grad_norm": 0.1953125, "learning_rate": 0.0002639736234138351, "loss": 2.3341, "step": 961 }, { "epoch": 0.9866666666666667, "grad_norm": 0.1708984375, "learning_rate": 0.0002635630443117625, "loss": 2.3534, "step": 962 }, { "epoch": 0.9876923076923076, "grad_norm": 0.166015625, "learning_rate": 0.0002631524285161092, "loss": 2.3954, "step": 963 }, { "epoch": 0.9887179487179487, "grad_norm": 0.16796875, "learning_rate": 0.00026274177713775886, "loss": 2.217, "step": 964 }, { "epoch": 0.9897435897435898, "grad_norm": 0.181640625, "learning_rate": 0.00026233109128769136, "loss": 2.4182, "step": 965 }, { "epoch": 0.9907692307692307, "grad_norm": 0.16796875, "learning_rate": 0.0002619203720769798, "loss": 2.211, "step": 966 }, { "epoch": 0.9917948717948718, "grad_norm": 0.1865234375, "learning_rate": 0.0002615096206167877, "loss": 2.2399, "step": 967 }, { "epoch": 0.9928205128205129, "grad_norm": 0.1669921875, "learning_rate": 0.00026109883801836567, "loss": 2.1059, "step": 968 }, { "epoch": 0.9938461538461538, "grad_norm": 0.166015625, "learning_rate": 0.0002606880253930485, "loss": 2.3727, "step": 969 }, { "epoch": 0.9948717948717949, "grad_norm": 0.177734375, "learning_rate": 0.0002602771838522525, "loss": 2.2027, "step": 970 }, { "epoch": 0.9958974358974358, "grad_norm": 0.1787109375, "learning_rate": 0.00025986631450747186, "loss": 2.4085, "step": 971 }, { "epoch": 
0.9969230769230769, "grad_norm": 0.2236328125, "learning_rate": 0.00025945541847027624, "loss": 2.4738, "step": 972 }, { "epoch": 0.997948717948718, "grad_norm": 0.19140625, "learning_rate": 0.0002590444968523074, "loss": 2.2885, "step": 973 }, { "epoch": 0.9989743589743589, "grad_norm": 0.1904296875, "learning_rate": 0.0002586335507652764, "loss": 2.1961, "step": 974 }, { "epoch": 1.0, "grad_norm": 0.17578125, "learning_rate": 0.0002582225813209604, "loss": 2.2778, "step": 975 }, { "epoch": 1.001025641025641, "grad_norm": 0.1728515625, "learning_rate": 0.00025781158963119976, "loss": 2.3649, "step": 976 }, { "epoch": 1.0020512820512821, "grad_norm": 0.17578125, "learning_rate": 0.0002574005768078951, "loss": 2.3353, "step": 977 }, { "epoch": 1.003076923076923, "grad_norm": 0.181640625, "learning_rate": 0.00025698954396300404, "loss": 2.2238, "step": 978 }, { "epoch": 1.004102564102564, "grad_norm": 0.1796875, "learning_rate": 0.0002565784922085387, "loss": 2.289, "step": 979 }, { "epoch": 1.005128205128205, "grad_norm": 0.1767578125, "learning_rate": 0.00025616742265656205, "loss": 2.322, "step": 980 }, { "epoch": 1.0061538461538462, "grad_norm": 0.2109375, "learning_rate": 0.0002557563364191853, "loss": 2.1801, "step": 981 }, { "epoch": 1.0071794871794872, "grad_norm": 0.173828125, "learning_rate": 0.0002553452346085648, "loss": 2.3708, "step": 982 }, { "epoch": 1.0082051282051283, "grad_norm": 0.1689453125, "learning_rate": 0.00025493411833689905, "loss": 2.1867, "step": 983 }, { "epoch": 1.0092307692307692, "grad_norm": 0.162109375, "learning_rate": 0.00025452298871642576, "loss": 2.2181, "step": 984 }, { "epoch": 1.0102564102564102, "grad_norm": 0.19921875, "learning_rate": 0.0002541118468594185, "loss": 2.3591, "step": 985 }, { "epoch": 1.0112820512820513, "grad_norm": 0.18359375, "learning_rate": 0.0002537006938781842, "loss": 2.2131, "step": 986 }, { "epoch": 1.0123076923076924, "grad_norm": 0.19921875, "learning_rate": 0.0002532895308850598, "loss": 
2.2471, "step": 987 }, { "epoch": 1.0133333333333334, "grad_norm": 0.1904296875, "learning_rate": 0.0002528783589924093, "loss": 2.0832, "step": 988 }, { "epoch": 1.0143589743589743, "grad_norm": 0.169921875, "learning_rate": 0.0002524671793126208, "loss": 2.1193, "step": 989 }, { "epoch": 1.0153846153846153, "grad_norm": 0.1767578125, "learning_rate": 0.0002520559929581034, "loss": 2.2661, "step": 990 }, { "epoch": 1.0164102564102564, "grad_norm": 0.166015625, "learning_rate": 0.0002516448010412844, "loss": 2.0526, "step": 991 }, { "epoch": 1.0174358974358975, "grad_norm": 0.177734375, "learning_rate": 0.0002512336046746061, "loss": 2.4149, "step": 992 }, { "epoch": 1.0184615384615385, "grad_norm": 0.1845703125, "learning_rate": 0.00025082240497052266, "loss": 2.2991, "step": 993 }, { "epoch": 1.0194871794871796, "grad_norm": 0.1767578125, "learning_rate": 0.0002504112030414975, "loss": 2.1238, "step": 994 }, { "epoch": 1.0205128205128204, "grad_norm": 0.1865234375, "learning_rate": 0.00025, "loss": 2.2376, "step": 995 }, { "epoch": 1.0215384615384615, "grad_norm": 0.2060546875, "learning_rate": 0.0002495887969585025, "loss": 2.1748, "step": 996 }, { "epoch": 1.0225641025641026, "grad_norm": 0.19140625, "learning_rate": 0.0002491775950294774, "loss": 2.2744, "step": 997 }, { "epoch": 1.0235897435897436, "grad_norm": 0.1796875, "learning_rate": 0.000248766395325394, "loss": 2.0056, "step": 998 }, { "epoch": 1.0246153846153847, "grad_norm": 0.1865234375, "learning_rate": 0.00024835519895871557, "loss": 2.0907, "step": 999 }, { "epoch": 1.0256410256410255, "grad_norm": 0.169921875, "learning_rate": 0.00024794400704189663, "loss": 2.1462, "step": 1000 }, { "epoch": 1.0256410256410255, "eval_loss": NaN, "eval_runtime": 73.8405, "eval_samples_per_second": 9.304, "eval_steps_per_second": 1.165, "step": 1000 }, { "epoch": 1.0266666666666666, "grad_norm": 0.169921875, "learning_rate": 0.00024753282068737923, "loss": 2.0668, "step": 1001 }, { "epoch": 1.0276923076923077, 
"grad_norm": 0.1630859375, "learning_rate": 0.00024712164100759064, "loss": 2.1973, "step": 1002 }, { "epoch": 1.0287179487179487, "grad_norm": 0.1708984375, "learning_rate": 0.0002467104691149402, "loss": 2.0432, "step": 1003 }, { "epoch": 1.0297435897435898, "grad_norm": 0.205078125, "learning_rate": 0.0002462993061218158, "loss": 2.2532, "step": 1004 }, { "epoch": 1.0307692307692307, "grad_norm": 0.1923828125, "learning_rate": 0.00024588815314058153, "loss": 2.2101, "step": 1005 }, { "epoch": 1.0317948717948717, "grad_norm": 0.197265625, "learning_rate": 0.0002454770112835743, "loss": 2.1126, "step": 1006 }, { "epoch": 1.0328205128205128, "grad_norm": 0.1728515625, "learning_rate": 0.0002450658816631009, "loss": 2.1414, "step": 1007 }, { "epoch": 1.0338461538461539, "grad_norm": 0.1689453125, "learning_rate": 0.0002446547653914353, "loss": 2.1555, "step": 1008 }, { "epoch": 1.034871794871795, "grad_norm": 0.1787109375, "learning_rate": 0.00024424366358081475, "loss": 2.0453, "step": 1009 }, { "epoch": 1.035897435897436, "grad_norm": 0.16796875, "learning_rate": 0.00024383257734343796, "loss": 2.0606, "step": 1010 }, { "epoch": 1.0369230769230768, "grad_norm": 0.173828125, "learning_rate": 0.00024342150779146134, "loss": 2.1423, "step": 1011 }, { "epoch": 1.037948717948718, "grad_norm": 0.177734375, "learning_rate": 0.00024301045603699597, "loss": 2.1917, "step": 1012 }, { "epoch": 1.038974358974359, "grad_norm": 0.171875, "learning_rate": 0.00024259942319210497, "loss": 2.2215, "step": 1013 }, { "epoch": 1.04, "grad_norm": 0.189453125, "learning_rate": 0.00024218841036880033, "loss": 1.985, "step": 1014 }, { "epoch": 1.041025641025641, "grad_norm": 0.181640625, "learning_rate": 0.00024177741867903967, "loss": 2.1696, "step": 1015 }, { "epoch": 1.042051282051282, "grad_norm": 0.1650390625, "learning_rate": 0.0002413664492347236, "loss": 1.9982, "step": 1016 }, { "epoch": 1.043076923076923, "grad_norm": 0.1689453125, "learning_rate": 0.00024095550314769262, 
"loss": 2.0957, "step": 1017 }, { "epoch": 1.044102564102564, "grad_norm": 0.1865234375, "learning_rate": 0.00024054458152972377, "loss": 2.1596, "step": 1018 }, { "epoch": 1.0451282051282051, "grad_norm": 0.1845703125, "learning_rate": 0.00024013368549252826, "loss": 2.1243, "step": 1019 }, { "epoch": 1.0461538461538462, "grad_norm": 0.1708984375, "learning_rate": 0.0002397228161477476, "loss": 2.1008, "step": 1020 }, { "epoch": 1.0471794871794873, "grad_norm": 0.1708984375, "learning_rate": 0.00023931197460695154, "loss": 2.1166, "step": 1021 }, { "epoch": 1.0482051282051281, "grad_norm": 0.173828125, "learning_rate": 0.00023890116198163443, "loss": 2.1706, "step": 1022 }, { "epoch": 1.0492307692307692, "grad_norm": 0.1630859375, "learning_rate": 0.00023849037938321232, "loss": 2.1431, "step": 1023 }, { "epoch": 1.0502564102564103, "grad_norm": 0.205078125, "learning_rate": 0.00023807962792302018, "loss": 2.2386, "step": 1024 }, { "epoch": 1.0512820512820513, "grad_norm": 0.1611328125, "learning_rate": 0.0002376689087123087, "loss": 2.0062, "step": 1025 }, { "epoch": 1.0523076923076924, "grad_norm": 0.1748046875, "learning_rate": 0.00023725822286224115, "loss": 2.1716, "step": 1026 }, { "epoch": 1.0533333333333332, "grad_norm": 0.171875, "learning_rate": 0.00023684757148389077, "loss": 2.1463, "step": 1027 }, { "epoch": 1.0543589743589743, "grad_norm": 0.166015625, "learning_rate": 0.00023643695568823756, "loss": 2.1207, "step": 1028 }, { "epoch": 1.0553846153846154, "grad_norm": 0.181640625, "learning_rate": 0.00023602637658616491, "loss": 2.1704, "step": 1029 }, { "epoch": 1.0564102564102564, "grad_norm": 0.19921875, "learning_rate": 0.00023561583528845724, "loss": 2.27, "step": 1030 }, { "epoch": 1.0574358974358975, "grad_norm": 0.1669921875, "learning_rate": 0.0002352053329057968, "loss": 2.1913, "step": 1031 }, { "epoch": 1.0584615384615386, "grad_norm": 0.22265625, "learning_rate": 0.0002347948705487602, "loss": 2.1392, "step": 1032 }, { "epoch": 
1.0594871794871794, "grad_norm": 0.1591796875, "learning_rate": 0.00023438444932781597, "loss": 2.1157, "step": 1033 }, { "epoch": 1.0605128205128205, "grad_norm": 0.1572265625, "learning_rate": 0.00023397407035332144, "loss": 2.1209, "step": 1034 }, { "epoch": 1.0615384615384615, "grad_norm": 0.169921875, "learning_rate": 0.0002335637347355194, "loss": 2.1997, "step": 1035 }, { "epoch": 1.0625641025641026, "grad_norm": 0.162109375, "learning_rate": 0.00023315344358453566, "loss": 2.1186, "step": 1036 }, { "epoch": 1.0635897435897437, "grad_norm": 0.1767578125, "learning_rate": 0.00023274319801037529, "loss": 2.1421, "step": 1037 }, { "epoch": 1.0646153846153845, "grad_norm": 0.16796875, "learning_rate": 0.00023233299912292046, "loss": 2.1085, "step": 1038 }, { "epoch": 1.0656410256410256, "grad_norm": 0.1767578125, "learning_rate": 0.00023192284803192704, "loss": 2.0296, "step": 1039 }, { "epoch": 1.0666666666666667, "grad_norm": 0.16796875, "learning_rate": 0.00023151274584702118, "loss": 2.163, "step": 1040 }, { "epoch": 1.0666666666666667, "eval_loss": NaN, "eval_runtime": 73.8413, "eval_samples_per_second": 9.304, "eval_steps_per_second": 1.165, "step": 1040 }, { "epoch": 1.0676923076923077, "grad_norm": 0.1611328125, "learning_rate": 0.00023110269367769703, "loss": 2.1827, "step": 1041 }, { "epoch": 1.0687179487179488, "grad_norm": 0.16015625, "learning_rate": 0.00023069269263331353, "loss": 2.0883, "step": 1042 }, { "epoch": 1.0697435897435899, "grad_norm": 0.1748046875, "learning_rate": 0.00023028274382309096, "loss": 2.0003, "step": 1043 }, { "epoch": 1.0707692307692307, "grad_norm": 0.1669921875, "learning_rate": 0.00022987284835610856, "loss": 2.0297, "step": 1044 }, { "epoch": 1.0717948717948718, "grad_norm": 0.158203125, "learning_rate": 0.00022946300734130124, "loss": 2.0221, "step": 1045 }, { "epoch": 1.0728205128205128, "grad_norm": 0.16796875, "learning_rate": 0.00022905322188745638, "loss": 2.1578, "step": 1046 }, { "epoch": 1.073846153846154, 
"grad_norm": 0.162109375, "learning_rate": 0.00022864349310321128, "loss": 2.0387, "step": 1047 }, { "epoch": 1.074871794871795, "grad_norm": 0.171875, "learning_rate": 0.00022823382209704994, "loss": 2.0242, "step": 1048 }, { "epoch": 1.0758974358974358, "grad_norm": 0.1748046875, "learning_rate": 0.00022782420997729971, "loss": 2.0597, "step": 1049 }, { "epoch": 1.0769230769230769, "grad_norm": 0.16796875, "learning_rate": 0.00022741465785212903, "loss": 2.0825, "step": 1050 }, { "epoch": 1.077948717948718, "grad_norm": 0.1728515625, "learning_rate": 0.00022700516682954376, "loss": 2.1366, "step": 1051 }, { "epoch": 1.078974358974359, "grad_norm": 0.173828125, "learning_rate": 0.0002265957380173846, "loss": 2.0445, "step": 1052 }, { "epoch": 1.08, "grad_norm": 0.1630859375, "learning_rate": 0.00022618637252332398, "loss": 2.1056, "step": 1053 }, { "epoch": 1.081025641025641, "grad_norm": 0.1845703125, "learning_rate": 0.00022577707145486276, "loss": 2.1977, "step": 1054 }, { "epoch": 1.082051282051282, "grad_norm": 0.1640625, "learning_rate": 0.00022536783591932784, "loss": 2.0733, "step": 1055 }, { "epoch": 1.083076923076923, "grad_norm": 0.1708984375, "learning_rate": 0.0002249586670238687, "loss": 2.069, "step": 1056 }, { "epoch": 1.0841025641025641, "grad_norm": 0.1630859375, "learning_rate": 0.00022454956587545437, "loss": 1.9138, "step": 1057 }, { "epoch": 1.0851282051282052, "grad_norm": 0.169921875, "learning_rate": 0.0002241405335808707, "loss": 2.017, "step": 1058 }, { "epoch": 1.0861538461538462, "grad_norm": 0.1669921875, "learning_rate": 0.00022373157124671764, "loss": 1.8315, "step": 1059 }, { "epoch": 1.087179487179487, "grad_norm": 0.1640625, "learning_rate": 0.00022332267997940514, "loss": 2.0109, "step": 1060 }, { "epoch": 1.0882051282051282, "grad_norm": 0.1669921875, "learning_rate": 0.00022291386088515143, "loss": 2.0118, "step": 1061 }, { "epoch": 1.0892307692307692, "grad_norm": 0.17578125, "learning_rate": 0.00022250511506997933, "loss": 
2.0702, "step": 1062 }, { "epoch": 1.0902564102564103, "grad_norm": 0.162109375, "learning_rate": 0.00022209644363971334, "loss": 2.0094, "step": 1063 }, { "epoch": 1.0912820512820514, "grad_norm": 0.1494140625, "learning_rate": 0.00022168784769997684, "loss": 1.9867, "step": 1064 }, { "epoch": 1.0923076923076924, "grad_norm": 0.17578125, "learning_rate": 0.00022127932835618898, "loss": 2.1259, "step": 1065 }, { "epoch": 1.0933333333333333, "grad_norm": 0.1650390625, "learning_rate": 0.00022087088671356143, "loss": 2.0402, "step": 1066 }, { "epoch": 1.0943589743589743, "grad_norm": 0.1611328125, "learning_rate": 0.0002204625238770961, "loss": 2.016, "step": 1067 }, { "epoch": 1.0953846153846154, "grad_norm": 0.1689453125, "learning_rate": 0.0002200542409515812, "loss": 2.0629, "step": 1068 }, { "epoch": 1.0964102564102565, "grad_norm": 0.1669921875, "learning_rate": 0.00021964603904158904, "loss": 2.0418, "step": 1069 }, { "epoch": 1.0974358974358975, "grad_norm": 0.1650390625, "learning_rate": 0.00021923791925147285, "loss": 2.2228, "step": 1070 }, { "epoch": 1.0984615384615384, "grad_norm": 0.173828125, "learning_rate": 0.00021882988268536333, "loss": 2.0678, "step": 1071 }, { "epoch": 1.0994871794871794, "grad_norm": 0.154296875, "learning_rate": 0.00021842193044716634, "loss": 2.0785, "step": 1072 }, { "epoch": 1.1005128205128205, "grad_norm": 0.2109375, "learning_rate": 0.00021801406364055955, "loss": 2.0718, "step": 1073 }, { "epoch": 1.1015384615384616, "grad_norm": 0.193359375, "learning_rate": 0.00021760628336898931, "loss": 1.9573, "step": 1074 }, { "epoch": 1.1025641025641026, "grad_norm": 0.169921875, "learning_rate": 0.0002171985907356681, "loss": 2.1294, "step": 1075 }, { "epoch": 1.1035897435897435, "grad_norm": 0.185546875, "learning_rate": 0.00021679098684357123, "loss": 1.9516, "step": 1076 }, { "epoch": 1.1046153846153846, "grad_norm": 0.1669921875, "learning_rate": 0.00021638347279543375, "loss": 1.9423, "step": 1077 }, { "epoch": 
1.1056410256410256, "grad_norm": 0.16015625, "learning_rate": 0.00021597604969374784, "loss": 2.0626, "step": 1078 }, { "epoch": 1.1066666666666667, "grad_norm": 0.16015625, "learning_rate": 0.00021556871864075963, "loss": 1.9325, "step": 1079 }, { "epoch": 1.1076923076923078, "grad_norm": 0.1787109375, "learning_rate": 0.0002151614807384661, "loss": 2.2433, "step": 1080 }, { "epoch": 1.1076923076923078, "eval_loss": NaN, "eval_runtime": 73.9226, "eval_samples_per_second": 9.294, "eval_steps_per_second": 1.163, "step": 1080 }, { "epoch": 1.1087179487179488, "grad_norm": 0.16015625, "learning_rate": 0.00021475433708861242, "loss": 2.0999, "step": 1081 }, { "epoch": 1.1097435897435897, "grad_norm": 0.1591796875, "learning_rate": 0.0002143472887926885, "loss": 2.0778, "step": 1082 }, { "epoch": 1.1107692307692307, "grad_norm": 0.1767578125, "learning_rate": 0.00021394033695192643, "loss": 1.9615, "step": 1083 }, { "epoch": 1.1117948717948718, "grad_norm": 0.1982421875, "learning_rate": 0.00021353348266729756, "loss": 2.1739, "step": 1084 }, { "epoch": 1.1128205128205129, "grad_norm": 0.166015625, "learning_rate": 0.00021312672703950876, "loss": 2.0249, "step": 1085 }, { "epoch": 1.113846153846154, "grad_norm": 0.1806640625, "learning_rate": 0.00021272007116900044, "loss": 2.0591, "step": 1086 }, { "epoch": 1.1148717948717948, "grad_norm": 0.1689453125, "learning_rate": 0.00021231351615594318, "loss": 2.0483, "step": 1087 }, { "epoch": 1.1158974358974358, "grad_norm": 0.1630859375, "learning_rate": 0.0002119070631002343, "loss": 2.0964, "step": 1088 }, { "epoch": 1.116923076923077, "grad_norm": 0.1591796875, "learning_rate": 0.0002115007131014956, "loss": 2.0679, "step": 1089 }, { "epoch": 1.117948717948718, "grad_norm": 0.1708984375, "learning_rate": 0.00021109446725907003, "loss": 2.0037, "step": 1090 }, { "epoch": 1.118974358974359, "grad_norm": 0.1650390625, "learning_rate": 0.0002106883266720186, "loss": 2.0765, "step": 1091 }, { "epoch": 1.12, "grad_norm": 
0.17578125, "learning_rate": 0.00021028229243911774, "loss": 2.0573, "step": 1092 }, { "epoch": 1.121025641025641, "grad_norm": 0.16796875, "learning_rate": 0.00020987636565885607, "loss": 2.0802, "step": 1093 }, { "epoch": 1.122051282051282, "grad_norm": 0.173828125, "learning_rate": 0.00020947054742943144, "loss": 1.9484, "step": 1094 }, { "epoch": 1.123076923076923, "grad_norm": 0.1552734375, "learning_rate": 0.00020906483884874816, "loss": 2.0049, "step": 1095 }, { "epoch": 1.1241025641025642, "grad_norm": 0.1748046875, "learning_rate": 0.00020865924101441395, "loss": 2.0264, "step": 1096 }, { "epoch": 1.1251282051282052, "grad_norm": 0.171875, "learning_rate": 0.00020825375502373655, "loss": 2.0084, "step": 1097 }, { "epoch": 1.126153846153846, "grad_norm": 0.16015625, "learning_rate": 0.0002078483819737215, "loss": 2.0465, "step": 1098 }, { "epoch": 1.1271794871794871, "grad_norm": 0.169921875, "learning_rate": 0.00020744312296106865, "loss": 2.113, "step": 1099 }, { "epoch": 1.1282051282051282, "grad_norm": 0.2099609375, "learning_rate": 0.0002070379790821693, "loss": 2.1843, "step": 1100 }, { "epoch": 1.1292307692307693, "grad_norm": 0.1611328125, "learning_rate": 0.00020663295143310333, "loss": 1.9603, "step": 1101 }, { "epoch": 1.1302564102564103, "grad_norm": 0.1611328125, "learning_rate": 0.00020622804110963613, "loss": 2.0683, "step": 1102 }, { "epoch": 1.1312820512820512, "grad_norm": 0.1591796875, "learning_rate": 0.0002058232492072157, "loss": 2.1234, "step": 1103 }, { "epoch": 1.1323076923076922, "grad_norm": 0.158203125, "learning_rate": 0.00020541857682096975, "loss": 2.1001, "step": 1104 }, { "epoch": 1.1333333333333333, "grad_norm": 0.15625, "learning_rate": 0.00020501402504570233, "loss": 2.07, "step": 1105 }, { "epoch": 1.1343589743589744, "grad_norm": 0.1875, "learning_rate": 0.00020460959497589154, "loss": 1.9687, "step": 1106 }, { "epoch": 1.1353846153846154, "grad_norm": 0.1591796875, "learning_rate": 0.00020420528770568613, "loss": 
2.1508, "step": 1107 }, { "epoch": 1.1364102564102565, "grad_norm": 0.17578125, "learning_rate": 0.0002038011043289025, "loss": 2.0047, "step": 1108 }, { "epoch": 1.1374358974358973, "grad_norm": 0.171875, "learning_rate": 0.00020339704593902195, "loss": 2.0281, "step": 1109 }, { "epoch": 1.1384615384615384, "grad_norm": 0.162109375, "learning_rate": 0.00020299311362918774, "loss": 1.9443, "step": 1110 }, { "epoch": 1.1394871794871795, "grad_norm": 0.1630859375, "learning_rate": 0.00020258930849220182, "loss": 2.0942, "step": 1111 }, { "epoch": 1.1405128205128205, "grad_norm": 0.162109375, "learning_rate": 0.0002021856316205223, "loss": 1.9553, "step": 1112 }, { "epoch": 1.1415384615384616, "grad_norm": 0.181640625, "learning_rate": 0.00020178208410626006, "loss": 2.0687, "step": 1113 }, { "epoch": 1.1425641025641027, "grad_norm": 0.1787109375, "learning_rate": 0.00020137866704117614, "loss": 2.1854, "step": 1114 }, { "epoch": 1.1435897435897435, "grad_norm": 0.1875, "learning_rate": 0.00020097538151667886, "loss": 1.9635, "step": 1115 }, { "epoch": 1.1446153846153846, "grad_norm": 0.1669921875, "learning_rate": 0.0002005722286238202, "loss": 2.028, "step": 1116 }, { "epoch": 1.1456410256410257, "grad_norm": 0.1708984375, "learning_rate": 0.0002001692094532937, "loss": 2.0051, "step": 1117 }, { "epoch": 1.1466666666666667, "grad_norm": 0.1513671875, "learning_rate": 0.00019976632509543106, "loss": 1.9573, "step": 1118 }, { "epoch": 1.1476923076923078, "grad_norm": 0.16015625, "learning_rate": 0.0001993635766401991, "loss": 1.9861, "step": 1119 }, { "epoch": 1.1487179487179486, "grad_norm": 0.1552734375, "learning_rate": 0.0001989609651771971, "loss": 2.0609, "step": 1120 }, { "epoch": 1.1487179487179486, "eval_loss": NaN, "eval_runtime": 73.9126, "eval_samples_per_second": 9.295, "eval_steps_per_second": 1.164, "step": 1120 }, { "epoch": 1.1497435897435897, "grad_norm": 0.16796875, "learning_rate": 0.00019855849179565375, "loss": 2.0815, "step": 1121 }, { "epoch": 
1.1507692307692308, "grad_norm": 0.1689453125, "learning_rate": 0.000198156157584424, "loss": 1.9093, "step": 1122 }, { "epoch": 1.1517948717948718, "grad_norm": 0.1572265625, "learning_rate": 0.00019775396363198652, "loss": 2.0461, "step": 1123 }, { "epoch": 1.152820512820513, "grad_norm": 0.1611328125, "learning_rate": 0.00019735191102644045, "loss": 1.9636, "step": 1124 }, { "epoch": 1.1538461538461537, "grad_norm": 0.1708984375, "learning_rate": 0.0001969500008555023, "loss": 2.1116, "step": 1125 }, { "epoch": 1.1548717948717948, "grad_norm": 0.18359375, "learning_rate": 0.00019654823420650356, "loss": 2.1453, "step": 1126 }, { "epoch": 1.1558974358974359, "grad_norm": 0.1630859375, "learning_rate": 0.00019614661216638718, "loss": 1.9406, "step": 1127 }, { "epoch": 1.156923076923077, "grad_norm": 0.1513671875, "learning_rate": 0.00019574513582170508, "loss": 2.0201, "step": 1128 }, { "epoch": 1.157948717948718, "grad_norm": 0.158203125, "learning_rate": 0.00019534380625861497, "loss": 1.8443, "step": 1129 }, { "epoch": 1.1589743589743589, "grad_norm": 0.1826171875, "learning_rate": 0.0001949426245628773, "loss": 2.2307, "step": 1130 }, { "epoch": 1.16, "grad_norm": 0.1689453125, "learning_rate": 0.0001945415918198527, "loss": 2.183, "step": 1131 }, { "epoch": 1.161025641025641, "grad_norm": 0.162109375, "learning_rate": 0.00019414070911449876, "loss": 1.9941, "step": 1132 }, { "epoch": 1.162051282051282, "grad_norm": 0.1552734375, "learning_rate": 0.00019373997753136692, "loss": 1.8648, "step": 1133 }, { "epoch": 1.1630769230769231, "grad_norm": 0.1552734375, "learning_rate": 0.0001933393981546001, "loss": 1.8184, "step": 1134 }, { "epoch": 1.1641025641025642, "grad_norm": 0.15625, "learning_rate": 0.00019293897206792938, "loss": 1.9956, "step": 1135 }, { "epoch": 1.1651282051282053, "grad_norm": 0.16796875, "learning_rate": 0.00019253870035467087, "loss": 2.1083, "step": 1136 }, { "epoch": 1.166153846153846, "grad_norm": 0.1591796875, "learning_rate": 
0.00019213858409772332, "loss": 1.9896, "step": 1137 }, { "epoch": 1.1671794871794872, "grad_norm": 0.1640625, "learning_rate": 0.00019173862437956487, "loss": 2.0541, "step": 1138 }, { "epoch": 1.1682051282051282, "grad_norm": 0.16796875, "learning_rate": 0.00019133882228224998, "loss": 1.9949, "step": 1139 }, { "epoch": 1.1692307692307693, "grad_norm": 0.146484375, "learning_rate": 0.0001909391788874069, "loss": 1.9119, "step": 1140 }, { "epoch": 1.1702564102564104, "grad_norm": 0.1767578125, "learning_rate": 0.00019053969527623444, "loss": 2.1745, "step": 1141 }, { "epoch": 1.1712820512820512, "grad_norm": 0.1591796875, "learning_rate": 0.00019014037252949889, "loss": 1.894, "step": 1142 }, { "epoch": 1.1723076923076923, "grad_norm": 0.154296875, "learning_rate": 0.0001897412117275319, "loss": 1.9579, "step": 1143 }, { "epoch": 1.1733333333333333, "grad_norm": 0.171875, "learning_rate": 0.00018934221395022638, "loss": 1.9892, "step": 1144 }, { "epoch": 1.1743589743589744, "grad_norm": 0.1474609375, "learning_rate": 0.00018894338027703456, "loss": 1.9539, "step": 1145 }, { "epoch": 1.1753846153846155, "grad_norm": 0.15625, "learning_rate": 0.00018854471178696464, "loss": 1.806, "step": 1146 }, { "epoch": 1.1764102564102563, "grad_norm": 0.1513671875, "learning_rate": 0.0001881462095585778, "loss": 2.0541, "step": 1147 }, { "epoch": 1.1774358974358974, "grad_norm": 0.173828125, "learning_rate": 0.0001877478746699856, "loss": 1.9443, "step": 1148 }, { "epoch": 1.1784615384615384, "grad_norm": 0.150390625, "learning_rate": 0.00018734970819884677, "loss": 1.9068, "step": 1149 }, { "epoch": 1.1794871794871795, "grad_norm": 0.1474609375, "learning_rate": 0.00018695171122236443, "loss": 1.9427, "step": 1150 }, { "epoch": 1.1805128205128206, "grad_norm": 0.1611328125, "learning_rate": 0.00018655388481728308, "loss": 1.9763, "step": 1151 }, { "epoch": 1.1815384615384614, "grad_norm": 0.1728515625, "learning_rate": 0.00018615623005988598, "loss": 2.0342, "step": 1152 }, { 
"epoch": 1.1825641025641025, "grad_norm": 0.16015625, "learning_rate": 0.0001857587480259916, "loss": 1.9657, "step": 1153 }, { "epoch": 1.1835897435897436, "grad_norm": 0.1416015625, "learning_rate": 0.0001853614397909515, "loss": 2.0545, "step": 1154 }, { "epoch": 1.1846153846153846, "grad_norm": 0.1533203125, "learning_rate": 0.00018496430642964696, "loss": 2.0401, "step": 1155 }, { "epoch": 1.1856410256410257, "grad_norm": 0.142578125, "learning_rate": 0.000184567349016486, "loss": 1.9797, "step": 1156 }, { "epoch": 1.1866666666666668, "grad_norm": 0.150390625, "learning_rate": 0.00018417056862540083, "loss": 2.037, "step": 1157 }, { "epoch": 1.1876923076923076, "grad_norm": 0.166015625, "learning_rate": 0.00018377396632984454, "loss": 2.0317, "step": 1158 }, { "epoch": 1.1887179487179487, "grad_norm": 0.1494140625, "learning_rate": 0.0001833775432027886, "loss": 2.0295, "step": 1159 }, { "epoch": 1.1897435897435897, "grad_norm": 0.1669921875, "learning_rate": 0.00018298130031671974, "loss": 1.9598, "step": 1160 }, { "epoch": 1.1897435897435897, "eval_loss": NaN, "eval_runtime": 73.9129, "eval_samples_per_second": 9.295, "eval_steps_per_second": 1.164, "step": 1160 }, { "epoch": 1.1907692307692308, "grad_norm": 0.18359375, "learning_rate": 0.00018258523874363676, "loss": 1.904, "step": 1161 }, { "epoch": 1.1917948717948719, "grad_norm": 0.1572265625, "learning_rate": 0.00018218935955504828, "loss": 2.018, "step": 1162 }, { "epoch": 1.192820512820513, "grad_norm": 0.1787109375, "learning_rate": 0.00018179366382196942, "loss": 1.8727, "step": 1163 }, { "epoch": 1.1938461538461538, "grad_norm": 0.162109375, "learning_rate": 0.00018139815261491887, "loss": 1.9307, "step": 1164 }, { "epoch": 1.1948717948717948, "grad_norm": 0.162109375, "learning_rate": 0.00018100282700391615, "loss": 2.0405, "step": 1165 }, { "epoch": 1.195897435897436, "grad_norm": 0.1640625, "learning_rate": 0.0001806076880584788, "loss": 1.8421, "step": 1166 }, { "epoch": 1.196923076923077, 
"grad_norm": 0.1591796875, "learning_rate": 0.0001802127368476191, "loss": 2.034, "step": 1167 }, { "epoch": 1.197948717948718, "grad_norm": 0.1953125, "learning_rate": 0.0001798179744398416, "loss": 2.0767, "step": 1168 }, { "epoch": 1.198974358974359, "grad_norm": 0.1708984375, "learning_rate": 0.00017942340190314022, "loss": 2.0403, "step": 1169 }, { "epoch": 1.2, "grad_norm": 0.162109375, "learning_rate": 0.00017902902030499462, "loss": 1.8991, "step": 1170 }, { "epoch": 1.201025641025641, "grad_norm": 0.142578125, "learning_rate": 0.00017863483071236857, "loss": 1.9595, "step": 1171 }, { "epoch": 1.202051282051282, "grad_norm": 0.154296875, "learning_rate": 0.00017824083419170616, "loss": 1.9511, "step": 1172 }, { "epoch": 1.2030769230769232, "grad_norm": 0.158203125, "learning_rate": 0.00017784703180892883, "loss": 1.9113, "step": 1173 }, { "epoch": 1.204102564102564, "grad_norm": 0.1640625, "learning_rate": 0.00017745342462943323, "loss": 1.9432, "step": 1174 }, { "epoch": 1.205128205128205, "grad_norm": 0.154296875, "learning_rate": 0.0001770600137180876, "loss": 1.9731, "step": 1175 }, { "epoch": 1.2061538461538461, "grad_norm": 0.150390625, "learning_rate": 0.0001766668001392294, "loss": 2.0279, "step": 1176 }, { "epoch": 1.2071794871794872, "grad_norm": 0.1826171875, "learning_rate": 0.00017627378495666215, "loss": 1.8641, "step": 1177 }, { "epoch": 1.2082051282051283, "grad_norm": 0.1884765625, "learning_rate": 0.00017588096923365257, "loss": 1.9996, "step": 1178 }, { "epoch": 1.209230769230769, "grad_norm": 0.1591796875, "learning_rate": 0.00017548835403292782, "loss": 1.9621, "step": 1179 }, { "epoch": 1.2102564102564102, "grad_norm": 0.1640625, "learning_rate": 0.00017509594041667265, "loss": 2.0169, "step": 1180 }, { "epoch": 1.2112820512820512, "grad_norm": 0.1630859375, "learning_rate": 0.00017470372944652617, "loss": 2.005, "step": 1181 }, { "epoch": 1.2123076923076923, "grad_norm": 0.1640625, "learning_rate": 0.00017431172218357956, "loss": 
1.9248, "step": 1182 }, { "epoch": 1.2133333333333334, "grad_norm": 0.2080078125, "learning_rate": 0.00017391991968837274, "loss": 2.0575, "step": 1183 }, { "epoch": 1.2143589743589744, "grad_norm": 0.16015625, "learning_rate": 0.00017352832302089155, "loss": 1.9113, "step": 1184 }, { "epoch": 1.2153846153846155, "grad_norm": 0.162109375, "learning_rate": 0.00017313693324056523, "loss": 2.0322, "step": 1185 }, { "epoch": 1.2164102564102564, "grad_norm": 0.14453125, "learning_rate": 0.00017274575140626317, "loss": 1.9651, "step": 1186 }, { "epoch": 1.2174358974358974, "grad_norm": 0.15625, "learning_rate": 0.00017235477857629206, "loss": 2.0174, "step": 1187 }, { "epoch": 1.2184615384615385, "grad_norm": 0.1708984375, "learning_rate": 0.0001719640158083935, "loss": 1.9034, "step": 1188 }, { "epoch": 1.2194871794871796, "grad_norm": 0.1611328125, "learning_rate": 0.00017157346415974027, "loss": 1.8911, "step": 1189 }, { "epoch": 1.2205128205128206, "grad_norm": 0.16796875, "learning_rate": 0.00017118312468693438, "loss": 1.943, "step": 1190 }, { "epoch": 1.2215384615384615, "grad_norm": 0.1875, "learning_rate": 0.0001707929984460038, "loss": 2.1602, "step": 1191 }, { "epoch": 1.2225641025641025, "grad_norm": 0.1591796875, "learning_rate": 0.0001704030864923995, "loss": 1.8399, "step": 1192 }, { "epoch": 1.2235897435897436, "grad_norm": 0.158203125, "learning_rate": 0.00017001338988099264, "loss": 1.8968, "step": 1193 }, { "epoch": 1.2246153846153847, "grad_norm": 0.173828125, "learning_rate": 0.00016962390966607203, "loss": 1.942, "step": 1194 }, { "epoch": 1.2256410256410257, "grad_norm": 0.1513671875, "learning_rate": 0.0001692346469013408, "loss": 2.0704, "step": 1195 }, { "epoch": 1.2266666666666666, "grad_norm": 0.1611328125, "learning_rate": 0.00016884560263991395, "loss": 1.9776, "step": 1196 }, { "epoch": 1.2276923076923076, "grad_norm": 0.1689453125, "learning_rate": 0.00016845677793431537, "loss": 1.9522, "step": 1197 }, { "epoch": 1.2287179487179487, 
"grad_norm": 0.140625, "learning_rate": 0.00016806817383647465, "loss": 2.0156, "step": 1198 }, { "epoch": 1.2297435897435898, "grad_norm": 0.1630859375, "learning_rate": 0.00016767979139772495, "loss": 1.9739, "step": 1199 }, { "epoch": 1.2307692307692308, "grad_norm": 0.1796875, "learning_rate": 0.00016729163166879962, "loss": 2.1146, "step": 1200 }, { "epoch": 1.2307692307692308, "eval_loss": NaN, "eval_runtime": 73.852, "eval_samples_per_second": 9.302, "eval_steps_per_second": 1.164, "step": 1200 }, { "epoch": 1.2317948717948717, "grad_norm": 0.15234375, "learning_rate": 0.00016690369569982927, "loss": 1.9492, "step": 1201 }, { "epoch": 1.2328205128205127, "grad_norm": 0.16015625, "learning_rate": 0.0001665159845403394, "loss": 1.863, "step": 1202 }, { "epoch": 1.2338461538461538, "grad_norm": 0.1650390625, "learning_rate": 0.00016612849923924722, "loss": 1.9962, "step": 1203 }, { "epoch": 1.2348717948717949, "grad_norm": 0.162109375, "learning_rate": 0.00016574124084485893, "loss": 1.9589, "step": 1204 }, { "epoch": 1.235897435897436, "grad_norm": 0.1611328125, "learning_rate": 0.00016535421040486683, "loss": 1.882, "step": 1205 }, { "epoch": 1.236923076923077, "grad_norm": 0.1630859375, "learning_rate": 0.0001649674089663465, "loss": 1.9821, "step": 1206 }, { "epoch": 1.2379487179487179, "grad_norm": 0.16015625, "learning_rate": 0.00016458083757575392, "loss": 1.8811, "step": 1207 }, { "epoch": 1.238974358974359, "grad_norm": 0.1572265625, "learning_rate": 0.000164194497278923, "loss": 1.8946, "step": 1208 }, { "epoch": 1.24, "grad_norm": 0.1650390625, "learning_rate": 0.00016380838912106194, "loss": 1.8844, "step": 1209 }, { "epoch": 1.241025641025641, "grad_norm": 0.20703125, "learning_rate": 0.0001634225141467513, "loss": 2.1033, "step": 1210 }, { "epoch": 1.2420512820512821, "grad_norm": 0.220703125, "learning_rate": 0.00016303687339994072, "loss": 1.9691, "step": 1211 }, { "epoch": 1.2430769230769232, "grad_norm": 0.1708984375, "learning_rate": 
0.000162651467923946, "loss": 1.8857, "step": 1212 }, { "epoch": 1.244102564102564, "grad_norm": 0.15234375, "learning_rate": 0.00016226629876144657, "loss": 1.9431, "step": 1213 }, { "epoch": 1.245128205128205, "grad_norm": 0.1435546875, "learning_rate": 0.00016188136695448262, "loss": 1.864, "step": 1214 }, { "epoch": 1.2461538461538462, "grad_norm": 0.2080078125, "learning_rate": 0.00016149667354445192, "loss": 1.9667, "step": 1215 }, { "epoch": 1.2471794871794872, "grad_norm": 0.1552734375, "learning_rate": 0.00016111221957210747, "loss": 1.9695, "step": 1216 }, { "epoch": 1.2482051282051283, "grad_norm": 0.158203125, "learning_rate": 0.00016072800607755468, "loss": 1.953, "step": 1217 }, { "epoch": 1.2492307692307691, "grad_norm": 0.158203125, "learning_rate": 0.00016034403410024778, "loss": 2.0025, "step": 1218 }, { "epoch": 1.2502564102564102, "grad_norm": 0.1630859375, "learning_rate": 0.00015996030467898837, "loss": 1.9243, "step": 1219 }, { "epoch": 1.2512820512820513, "grad_norm": 0.1533203125, "learning_rate": 0.0001595768188519211, "loss": 2.0543, "step": 1220 }, { "epoch": 1.2523076923076923, "grad_norm": 0.1455078125, "learning_rate": 0.00015919357765653208, "loss": 2.0142, "step": 1221 }, { "epoch": 1.2533333333333334, "grad_norm": 0.1513671875, "learning_rate": 0.00015881058212964555, "loss": 1.9241, "step": 1222 }, { "epoch": 1.2543589743589743, "grad_norm": 0.15234375, "learning_rate": 0.0001584278333074208, "loss": 2.0027, "step": 1223 }, { "epoch": 1.2553846153846153, "grad_norm": 0.1435546875, "learning_rate": 0.00015804533222535004, "loss": 1.9516, "step": 1224 }, { "epoch": 1.2564102564102564, "grad_norm": 0.16796875, "learning_rate": 0.00015766307991825512, "loss": 2.0212, "step": 1225 }, { "epoch": 1.2574358974358975, "grad_norm": 0.15625, "learning_rate": 0.0001572810774202847, "loss": 2.012, "step": 1226 }, { "epoch": 1.2584615384615385, "grad_norm": 0.1689453125, "learning_rate": 0.0001568993257649118, "loss": 1.9318, "step": 1227 }, { 
"epoch": 1.2594871794871794, "grad_norm": 0.1650390625, "learning_rate": 0.00015651782598493085, "loss": 1.9956, "step": 1228 }, { "epoch": 1.2605128205128204, "grad_norm": 0.1650390625, "learning_rate": 0.0001561365791124546, "loss": 1.8017, "step": 1229 }, { "epoch": 1.2615384615384615, "grad_norm": 0.1494140625, "learning_rate": 0.00015575558617891171, "loss": 1.9987, "step": 1230 }, { "epoch": 1.2625641025641026, "grad_norm": 0.1552734375, "learning_rate": 0.00015537484821504403, "loss": 2.0807, "step": 1231 }, { "epoch": 1.2635897435897436, "grad_norm": 0.1474609375, "learning_rate": 0.00015499436625090324, "loss": 1.8128, "step": 1232 }, { "epoch": 1.2646153846153847, "grad_norm": 0.1416015625, "learning_rate": 0.00015461414131584872, "loss": 2.0025, "step": 1233 }, { "epoch": 1.2656410256410258, "grad_norm": 0.1552734375, "learning_rate": 0.0001542341744385444, "loss": 2.0073, "step": 1234 }, { "epoch": 1.2666666666666666, "grad_norm": 0.1669921875, "learning_rate": 0.000153854466646956, "loss": 1.8978, "step": 1235 }, { "epoch": 1.2676923076923077, "grad_norm": 0.1484375, "learning_rate": 0.0001534750189683485, "loss": 1.8505, "step": 1236 }, { "epoch": 1.2687179487179487, "grad_norm": 0.1650390625, "learning_rate": 0.00015309583242928277, "loss": 1.899, "step": 1237 }, { "epoch": 1.2697435897435898, "grad_norm": 0.1494140625, "learning_rate": 0.0001527169080556136, "loss": 1.8416, "step": 1238 }, { "epoch": 1.2707692307692309, "grad_norm": 0.15625, "learning_rate": 0.00015233824687248637, "loss": 1.9072, "step": 1239 }, { "epoch": 1.2717948717948717, "grad_norm": 0.1767578125, "learning_rate": 0.00015195984990433437, "loss": 2.0555, "step": 1240 }, { "epoch": 1.2717948717948717, "eval_loss": NaN, "eval_runtime": 73.8365, "eval_samples_per_second": 9.304, "eval_steps_per_second": 1.165, "step": 1240 }, { "epoch": 1.2728205128205128, "grad_norm": 0.1572265625, "learning_rate": 0.0001515817181748761, "loss": 1.9676, "step": 1241 }, { "epoch": 
1.2738461538461539, "grad_norm": 0.15625, "learning_rate": 0.00015120385270711256, "loss": 1.9067, "step": 1242 }, { "epoch": 1.274871794871795, "grad_norm": 0.1669921875, "learning_rate": 0.00015082625452332433, "loss": 1.8324, "step": 1243 }, { "epoch": 1.275897435897436, "grad_norm": 0.154296875, "learning_rate": 0.00015044892464506887, "loss": 1.9543, "step": 1244 }, { "epoch": 1.2769230769230768, "grad_norm": 0.1669921875, "learning_rate": 0.0001500718640931779, "loss": 2.0031, "step": 1245 }, { "epoch": 1.277948717948718, "grad_norm": 0.1572265625, "learning_rate": 0.0001496950738877541, "loss": 1.9124, "step": 1246 }, { "epoch": 1.278974358974359, "grad_norm": 0.1474609375, "learning_rate": 0.00014931855504816935, "loss": 2.0017, "step": 1247 }, { "epoch": 1.28, "grad_norm": 0.14453125, "learning_rate": 0.00014894230859306101, "loss": 1.9212, "step": 1248 }, { "epoch": 1.281025641025641, "grad_norm": 0.1630859375, "learning_rate": 0.00014856633554032945, "loss": 1.9432, "step": 1249 }, { "epoch": 1.282051282051282, "grad_norm": 0.1494140625, "learning_rate": 0.00014819063690713562, "loss": 2.0368, "step": 1250 }, { "epoch": 1.283076923076923, "grad_norm": 0.1513671875, "learning_rate": 0.00014781521370989788, "loss": 1.8654, "step": 1251 }, { "epoch": 1.284102564102564, "grad_norm": 0.16796875, "learning_rate": 0.0001474400669642895, "loss": 2.0461, "step": 1252 }, { "epoch": 1.2851282051282051, "grad_norm": 0.1435546875, "learning_rate": 0.00014706519768523595, "loss": 1.9218, "step": 1253 }, { "epoch": 1.2861538461538462, "grad_norm": 0.15625, "learning_rate": 0.00014669060688691162, "loss": 1.8957, "step": 1254 }, { "epoch": 1.287179487179487, "grad_norm": 0.1474609375, "learning_rate": 0.00014631629558273801, "loss": 1.9394, "step": 1255 }, { "epoch": 1.2882051282051283, "grad_norm": 0.1484375, "learning_rate": 0.00014594226478538031, "loss": 1.9183, "step": 1256 }, { "epoch": 1.2892307692307692, "grad_norm": 0.142578125, "learning_rate": 
0.00014556851550674455, "loss": 1.8752, "step": 1257 }, { "epoch": 1.2902564102564102, "grad_norm": 0.154296875, "learning_rate": 0.00014519504875797545, "loss": 1.9414, "step": 1258 }, { "epoch": 1.2912820512820513, "grad_norm": 0.16015625, "learning_rate": 0.00014482186554945343, "loss": 1.8621, "step": 1259 }, { "epoch": 1.2923076923076924, "grad_norm": 0.1533203125, "learning_rate": 0.0001444489668907914, "loss": 1.9322, "step": 1260 }, { "epoch": 1.2933333333333334, "grad_norm": 0.1494140625, "learning_rate": 0.0001440763537908328, "loss": 1.9273, "step": 1261 }, { "epoch": 1.2943589743589743, "grad_norm": 0.16015625, "learning_rate": 0.00014370402725764854, "loss": 1.9092, "step": 1262 }, { "epoch": 1.2953846153846154, "grad_norm": 0.1640625, "learning_rate": 0.00014333198829853395, "loss": 1.9795, "step": 1263 }, { "epoch": 1.2964102564102564, "grad_norm": 0.1572265625, "learning_rate": 0.0001429602379200066, "loss": 1.9758, "step": 1264 }, { "epoch": 1.2974358974358975, "grad_norm": 0.1552734375, "learning_rate": 0.00014258877712780332, "loss": 1.9149, "step": 1265 }, { "epoch": 1.2984615384615386, "grad_norm": 0.1669921875, "learning_rate": 0.00014221760692687742, "loss": 1.9409, "step": 1266 }, { "epoch": 1.2994871794871794, "grad_norm": 0.1533203125, "learning_rate": 0.00014184672832139614, "loss": 2.027, "step": 1267 }, { "epoch": 1.3005128205128205, "grad_norm": 0.1455078125, "learning_rate": 0.00014147614231473758, "loss": 1.8172, "step": 1268 }, { "epoch": 1.3015384615384615, "grad_norm": 0.162109375, "learning_rate": 0.0001411058499094885, "loss": 1.9529, "step": 1269 }, { "epoch": 1.3025641025641026, "grad_norm": 0.146484375, "learning_rate": 0.00014073585210744137, "loss": 1.9135, "step": 1270 }, { "epoch": 1.3035897435897437, "grad_norm": 0.150390625, "learning_rate": 0.00014036614990959134, "loss": 1.7519, "step": 1271 }, { "epoch": 1.3046153846153845, "grad_norm": 0.1640625, "learning_rate": 0.00013999674431613412, "loss": 1.9113, "step": 1272 
}, { "epoch": 1.3056410256410256, "grad_norm": 0.1533203125, "learning_rate": 0.0001396276363264629, "loss": 1.9661, "step": 1273 }, { "epoch": 1.3066666666666666, "grad_norm": 0.16015625, "learning_rate": 0.0001392588269391657, "loss": 1.8355, "step": 1274 }, { "epoch": 1.3076923076923077, "grad_norm": 0.1513671875, "learning_rate": 0.0001388903171520227, "loss": 1.8856, "step": 1275 }, { "epoch": 1.3087179487179488, "grad_norm": 0.15234375, "learning_rate": 0.0001385221079620037, "loss": 1.9393, "step": 1276 }, { "epoch": 1.3097435897435896, "grad_norm": 0.1513671875, "learning_rate": 0.00013815420036526489, "loss": 1.8188, "step": 1277 }, { "epoch": 1.3107692307692307, "grad_norm": 0.166015625, "learning_rate": 0.0001377865953571468, "loss": 1.9644, "step": 1278 }, { "epoch": 1.3117948717948718, "grad_norm": 0.146484375, "learning_rate": 0.0001374192939321715, "loss": 1.8523, "step": 1279 }, { "epoch": 1.3128205128205128, "grad_norm": 0.1484375, "learning_rate": 0.00013705229708403926, "loss": 1.8906, "step": 1280 }, { "epoch": 1.3128205128205128, "eval_loss": NaN, "eval_runtime": 73.8544, "eval_samples_per_second": 9.302, "eval_steps_per_second": 1.164, "step": 1280 }, { "epoch": 1.3138461538461539, "grad_norm": 0.16796875, "learning_rate": 0.00013668560580562674, "loss": 2.0264, "step": 1281 }, { "epoch": 1.314871794871795, "grad_norm": 0.158203125, "learning_rate": 0.0001363192210889838, "loss": 1.9783, "step": 1282 }, { "epoch": 1.315897435897436, "grad_norm": 0.1494140625, "learning_rate": 0.00013595314392533082, "loss": 2.0562, "step": 1283 }, { "epoch": 1.3169230769230769, "grad_norm": 0.15625, "learning_rate": 0.00013558737530505643, "loss": 1.93, "step": 1284 }, { "epoch": 1.317948717948718, "grad_norm": 0.146484375, "learning_rate": 0.00013522191621771401, "loss": 1.9414, "step": 1285 }, { "epoch": 1.318974358974359, "grad_norm": 0.1357421875, "learning_rate": 0.00013485676765201998, "loss": 1.8833, "step": 1286 }, { "epoch": 1.32, "grad_norm": 
0.15625, "learning_rate": 0.00013449193059585063, "loss": 2.0894, "step": 1287 }, { "epoch": 1.3210256410256411, "grad_norm": 0.13671875, "learning_rate": 0.00013412740603623905, "loss": 1.8414, "step": 1288 }, { "epoch": 1.322051282051282, "grad_norm": 0.14453125, "learning_rate": 0.0001337631949593735, "loss": 1.8207, "step": 1289 }, { "epoch": 1.323076923076923, "grad_norm": 0.1552734375, "learning_rate": 0.0001333992983505939, "loss": 2.0065, "step": 1290 }, { "epoch": 1.324102564102564, "grad_norm": 0.154296875, "learning_rate": 0.00013303571719438918, "loss": 1.8415, "step": 1291 }, { "epoch": 1.3251282051282052, "grad_norm": 0.146484375, "learning_rate": 0.00013267245247439513, "loss": 1.9804, "step": 1292 }, { "epoch": 1.3261538461538462, "grad_norm": 0.1494140625, "learning_rate": 0.0001323095051733914, "loss": 1.7408, "step": 1293 }, { "epoch": 1.327179487179487, "grad_norm": 0.177734375, "learning_rate": 0.00013194687627329871, "loss": 1.9806, "step": 1294 }, { "epoch": 1.3282051282051281, "grad_norm": 0.150390625, "learning_rate": 0.00013158456675517656, "loss": 1.8978, "step": 1295 }, { "epoch": 1.3292307692307692, "grad_norm": 0.14453125, "learning_rate": 0.00013122257759922034, "loss": 1.8101, "step": 1296 }, { "epoch": 1.3302564102564103, "grad_norm": 0.1552734375, "learning_rate": 0.00013086090978475882, "loss": 1.8758, "step": 1297 }, { "epoch": 1.3312820512820513, "grad_norm": 0.1396484375, "learning_rate": 0.00013049956429025134, "loss": 1.7596, "step": 1298 }, { "epoch": 1.3323076923076922, "grad_norm": 0.154296875, "learning_rate": 0.00013013854209328502, "loss": 2.0049, "step": 1299 }, { "epoch": 1.3333333333333333, "grad_norm": 0.15234375, "learning_rate": 0.0001297778441705726, "loss": 1.8401, "step": 1300 }, { "epoch": 1.3343589743589743, "grad_norm": 0.154296875, "learning_rate": 0.00012941747149794962, "loss": 1.9876, "step": 1301 }, { "epoch": 1.3353846153846154, "grad_norm": 0.1494140625, "learning_rate": 0.00012905742505037128, 
"loss": 1.8446, "step": 1302 }, { "epoch": 1.3364102564102565, "grad_norm": 0.1494140625, "learning_rate": 0.0001286977058019105, "loss": 1.9178, "step": 1303 }, { "epoch": 1.3374358974358973, "grad_norm": 0.150390625, "learning_rate": 0.00012833831472575498, "loss": 1.9479, "step": 1304 }, { "epoch": 1.3384615384615386, "grad_norm": 0.2001953125, "learning_rate": 0.00012797925279420453, "loss": 1.9639, "step": 1305 }, { "epoch": 1.3394871794871794, "grad_norm": 0.13671875, "learning_rate": 0.0001276205209786685, "loss": 1.9279, "step": 1306 }, { "epoch": 1.3405128205128205, "grad_norm": 0.1591796875, "learning_rate": 0.00012726212024966315, "loss": 1.8569, "step": 1307 }, { "epoch": 1.3415384615384616, "grad_norm": 0.1416015625, "learning_rate": 0.00012690405157680893, "loss": 1.8391, "step": 1308 }, { "epoch": 1.3425641025641026, "grad_norm": 0.1474609375, "learning_rate": 0.00012654631592882802, "loss": 1.8801, "step": 1309 }, { "epoch": 1.3435897435897437, "grad_norm": 0.154296875, "learning_rate": 0.00012618891427354173, "loss": 1.792, "step": 1310 }, { "epoch": 1.3446153846153845, "grad_norm": 0.15234375, "learning_rate": 0.00012583184757786753, "loss": 1.8849, "step": 1311 }, { "epoch": 1.3456410256410256, "grad_norm": 0.138671875, "learning_rate": 0.00012547511680781686, "loss": 1.8969, "step": 1312 }, { "epoch": 1.3466666666666667, "grad_norm": 0.150390625, "learning_rate": 0.00012511872292849237, "loss": 2.0278, "step": 1313 }, { "epoch": 1.3476923076923077, "grad_norm": 0.1494140625, "learning_rate": 0.00012476266690408523, "loss": 2.0041, "step": 1314 }, { "epoch": 1.3487179487179488, "grad_norm": 0.1396484375, "learning_rate": 0.00012440694969787262, "loss": 1.8243, "step": 1315 }, { "epoch": 1.3497435897435897, "grad_norm": 0.19140625, "learning_rate": 0.00012405157227221488, "loss": 1.9061, "step": 1316 }, { "epoch": 1.3507692307692307, "grad_norm": 0.1337890625, "learning_rate": 0.00012369653558855337, "loss": 1.825, "step": 1317 }, { "epoch": 
1.3517948717948718, "grad_norm": 0.140625, "learning_rate": 0.00012334184060740756, "loss": 1.8558, "step": 1318 }, { "epoch": 1.3528205128205129, "grad_norm": 0.1494140625, "learning_rate": 0.00012298748828837219, "loss": 1.9522, "step": 1319 }, { "epoch": 1.353846153846154, "grad_norm": 0.1435546875, "learning_rate": 0.00012263347959011535, "loss": 1.8369, "step": 1320 }, { "epoch": 1.353846153846154, "eval_loss": NaN, "eval_runtime": 73.7662, "eval_samples_per_second": 9.313, "eval_steps_per_second": 1.166, "step": 1320 }, { "epoch": 1.3548717948717948, "grad_norm": 0.150390625, "learning_rate": 0.0001222798154703752, "loss": 1.9656, "step": 1321 }, { "epoch": 1.3558974358974358, "grad_norm": 0.1533203125, "learning_rate": 0.00012192649688595791, "loss": 1.8501, "step": 1322 }, { "epoch": 1.356923076923077, "grad_norm": 0.142578125, "learning_rate": 0.00012157352479273464, "loss": 1.8131, "step": 1323 }, { "epoch": 1.357948717948718, "grad_norm": 0.146484375, "learning_rate": 0.00012122090014563939, "loss": 1.864, "step": 1324 }, { "epoch": 1.358974358974359, "grad_norm": 0.15234375, "learning_rate": 0.00012086862389866576, "loss": 1.8466, "step": 1325 }, { "epoch": 1.3599999999999999, "grad_norm": 0.1552734375, "learning_rate": 0.0001205166970048652, "loss": 1.8518, "step": 1326 }, { "epoch": 1.3610256410256412, "grad_norm": 0.1455078125, "learning_rate": 0.00012016512041634389, "loss": 1.8871, "step": 1327 }, { "epoch": 1.362051282051282, "grad_norm": 0.1416015625, "learning_rate": 0.0001198138950842601, "loss": 1.9436, "step": 1328 }, { "epoch": 1.363076923076923, "grad_norm": 0.1708984375, "learning_rate": 0.00011946302195882208, "loss": 1.8984, "step": 1329 }, { "epoch": 1.3641025641025641, "grad_norm": 0.142578125, "learning_rate": 0.00011911250198928508, "loss": 1.8264, "step": 1330 }, { "epoch": 1.3651282051282052, "grad_norm": 0.13671875, "learning_rate": 0.00011876233612394893, "loss": 1.8589, "step": 1331 }, { "epoch": 1.3661538461538463, "grad_norm": 
0.162109375, "learning_rate": 0.0001184125253101556, "loss": 2.0307, "step": 1332 }, { "epoch": 1.3671794871794871, "grad_norm": 0.162109375, "learning_rate": 0.00011806307049428616, "loss": 1.9761, "step": 1333 }, { "epoch": 1.3682051282051282, "grad_norm": 0.146484375, "learning_rate": 0.00011771397262175889, "loss": 1.971, "step": 1334 }, { "epoch": 1.3692307692307693, "grad_norm": 0.1435546875, "learning_rate": 0.00011736523263702637, "loss": 1.8021, "step": 1335 }, { "epoch": 1.3702564102564103, "grad_norm": 0.1630859375, "learning_rate": 0.00011701685148357263, "loss": 1.8955, "step": 1336 }, { "epoch": 1.3712820512820514, "grad_norm": 0.138671875, "learning_rate": 0.00011666883010391122, "loss": 1.8228, "step": 1337 }, { "epoch": 1.3723076923076922, "grad_norm": 0.14453125, "learning_rate": 0.00011632116943958245, "loss": 1.8247, "step": 1338 }, { "epoch": 1.3733333333333333, "grad_norm": 0.1474609375, "learning_rate": 0.0001159738704311504, "loss": 1.9709, "step": 1339 }, { "epoch": 1.3743589743589744, "grad_norm": 0.1474609375, "learning_rate": 0.00011562693401820092, "loss": 1.7846, "step": 1340 }, { "epoch": 1.3753846153846154, "grad_norm": 0.140625, "learning_rate": 0.00011528036113933898, "loss": 1.9136, "step": 1341 }, { "epoch": 1.3764102564102565, "grad_norm": 0.1630859375, "learning_rate": 0.00011493415273218574, "loss": 1.7796, "step": 1342 }, { "epoch": 1.3774358974358973, "grad_norm": 0.15625, "learning_rate": 0.00011458830973337669, "loss": 1.8562, "step": 1343 }, { "epoch": 1.3784615384615384, "grad_norm": 0.1376953125, "learning_rate": 0.00011424283307855827, "loss": 1.8933, "step": 1344 }, { "epoch": 1.3794871794871795, "grad_norm": 0.14453125, "learning_rate": 0.00011389772370238639, "loss": 1.8537, "step": 1345 }, { "epoch": 1.3805128205128205, "grad_norm": 0.1533203125, "learning_rate": 0.00011355298253852298, "loss": 1.9007, "step": 1346 }, { "epoch": 1.3815384615384616, "grad_norm": 0.1455078125, "learning_rate": 0.00011320861051963371, 
"loss": 1.9062, "step": 1347 }, { "epoch": 1.3825641025641024, "grad_norm": 0.1474609375, "learning_rate": 0.00011286460857738579, "loss": 1.7311, "step": 1348 }, { "epoch": 1.3835897435897435, "grad_norm": 0.140625, "learning_rate": 0.0001125209776424452, "loss": 1.8787, "step": 1349 }, { "epoch": 1.3846153846153846, "grad_norm": 0.140625, "learning_rate": 0.00011217771864447396, "loss": 1.8356, "step": 1350 }, { "epoch": 1.3856410256410256, "grad_norm": 0.1494140625, "learning_rate": 0.00011183483251212811, "loss": 1.8793, "step": 1351 }, { "epoch": 1.3866666666666667, "grad_norm": 0.1484375, "learning_rate": 0.00011149232017305483, "loss": 1.8356, "step": 1352 }, { "epoch": 1.3876923076923076, "grad_norm": 0.130859375, "learning_rate": 0.00011115018255389006, "loss": 1.721, "step": 1353 }, { "epoch": 1.3887179487179488, "grad_norm": 0.1357421875, "learning_rate": 0.00011080842058025595, "loss": 1.8374, "step": 1354 }, { "epoch": 1.3897435897435897, "grad_norm": 0.1396484375, "learning_rate": 0.00011046703517675846, "loss": 1.7247, "step": 1355 }, { "epoch": 1.3907692307692308, "grad_norm": 0.1650390625, "learning_rate": 0.00011012602726698454, "loss": 1.9581, "step": 1356 }, { "epoch": 1.3917948717948718, "grad_norm": 0.1708984375, "learning_rate": 0.0001097853977735002, "loss": 1.8267, "step": 1357 }, { "epoch": 1.392820512820513, "grad_norm": 0.1357421875, "learning_rate": 0.0001094451476178473, "loss": 1.8031, "step": 1358 }, { "epoch": 1.393846153846154, "grad_norm": 0.154296875, "learning_rate": 0.0001091052777205417, "loss": 1.8652, "step": 1359 }, { "epoch": 1.3948717948717948, "grad_norm": 0.1494140625, "learning_rate": 0.00010876578900107054, "loss": 1.9503, "step": 1360 }, { "epoch": 1.3948717948717948, "eval_loss": NaN, "eval_runtime": 73.8306, "eval_samples_per_second": 9.305, "eval_steps_per_second": 1.165, "step": 1360 }, { "epoch": 1.3958974358974359, "grad_norm": 0.1455078125, "learning_rate": 0.00010842668237788953, "loss": 1.7046, "step": 1361 
}, { "epoch": 1.396923076923077, "grad_norm": 0.14453125, "learning_rate": 0.00010808795876842076, "loss": 1.8255, "step": 1362 }, { "epoch": 1.397948717948718, "grad_norm": 0.1435546875, "learning_rate": 0.00010774961908905021, "loss": 1.8786, "step": 1363 }, { "epoch": 1.398974358974359, "grad_norm": 0.146484375, "learning_rate": 0.00010741166425512486, "loss": 1.7557, "step": 1364 }, { "epoch": 1.4, "grad_norm": 0.1455078125, "learning_rate": 0.00010707409518095079, "loss": 1.8946, "step": 1365 }, { "epoch": 1.401025641025641, "grad_norm": 0.15625, "learning_rate": 0.00010673691277979053, "loss": 1.7613, "step": 1366 }, { "epoch": 1.402051282051282, "grad_norm": 0.138671875, "learning_rate": 0.0001064001179638601, "loss": 1.7312, "step": 1367 }, { "epoch": 1.403076923076923, "grad_norm": 0.162109375, "learning_rate": 0.00010606371164432732, "loss": 1.9659, "step": 1368 }, { "epoch": 1.4041025641025642, "grad_norm": 0.134765625, "learning_rate": 0.0001057276947313088, "loss": 1.8888, "step": 1369 }, { "epoch": 1.405128205128205, "grad_norm": 0.1416015625, "learning_rate": 0.00010539206813386773, "loss": 1.7309, "step": 1370 }, { "epoch": 1.406153846153846, "grad_norm": 0.2890625, "learning_rate": 0.00010505683276001126, "loss": 1.8601, "step": 1371 }, { "epoch": 1.4071794871794872, "grad_norm": 0.1298828125, "learning_rate": 0.00010472198951668826, "loss": 1.7275, "step": 1372 }, { "epoch": 1.4082051282051282, "grad_norm": 0.1396484375, "learning_rate": 0.00010438753930978642, "loss": 1.7797, "step": 1373 }, { "epoch": 1.4092307692307693, "grad_norm": 0.1357421875, "learning_rate": 0.00010405348304413048, "loss": 1.7966, "step": 1374 }, { "epoch": 1.4102564102564101, "grad_norm": 0.134765625, "learning_rate": 0.00010371982162347909, "loss": 1.7723, "step": 1375 }, { "epoch": 1.4112820512820514, "grad_norm": 0.134765625, "learning_rate": 0.00010338655595052284, "loss": 1.9023, "step": 1376 }, { "epoch": 1.4123076923076923, "grad_norm": 0.13671875, "learning_rate": 
0.00010305368692688174, "loss": 1.7689, "step": 1377 }, { "epoch": 1.4133333333333333, "grad_norm": 0.1396484375, "learning_rate": 0.00010272121545310254, "loss": 1.8339, "step": 1378 }, { "epoch": 1.4143589743589744, "grad_norm": 0.1259765625, "learning_rate": 0.00010238914242865651, "loss": 1.9408, "step": 1379 }, { "epoch": 1.4153846153846155, "grad_norm": 0.1474609375, "learning_rate": 0.00010205746875193712, "loss": 1.7951, "step": 1380 }, { "epoch": 1.4164102564102565, "grad_norm": 0.1298828125, "learning_rate": 0.00010172619532025703, "loss": 1.8492, "step": 1381 }, { "epoch": 1.4174358974358974, "grad_norm": 0.1708984375, "learning_rate": 0.0001013953230298465, "loss": 1.9838, "step": 1382 }, { "epoch": 1.4184615384615384, "grad_norm": 0.1318359375, "learning_rate": 0.00010106485277585036, "loss": 1.8444, "step": 1383 }, { "epoch": 1.4194871794871795, "grad_norm": 0.1494140625, "learning_rate": 0.00010073478545232567, "loss": 1.9476, "step": 1384 }, { "epoch": 1.4205128205128206, "grad_norm": 0.1552734375, "learning_rate": 0.00010040512195223947, "loss": 1.7923, "step": 1385 }, { "epoch": 1.4215384615384616, "grad_norm": 0.1416015625, "learning_rate": 0.00010007586316746659, "loss": 1.9064, "step": 1386 }, { "epoch": 1.4225641025641025, "grad_norm": 0.1513671875, "learning_rate": 9.974700998878642e-05, "loss": 1.9876, "step": 1387 }, { "epoch": 1.4235897435897436, "grad_norm": 0.1435546875, "learning_rate": 9.941856330588145e-05, "loss": 1.8984, "step": 1388 }, { "epoch": 1.4246153846153846, "grad_norm": 0.1318359375, "learning_rate": 9.909052400733407e-05, "loss": 1.8338, "step": 1389 }, { "epoch": 1.4256410256410257, "grad_norm": 0.1474609375, "learning_rate": 9.876289298062478e-05, "loss": 1.7642, "step": 1390 }, { "epoch": 1.4266666666666667, "grad_norm": 0.1328125, "learning_rate": 9.843567111212962e-05, "loss": 1.7469, "step": 1391 }, { "epoch": 1.4276923076923076, "grad_norm": 0.1396484375, "learning_rate": 9.81088592871173e-05, "loss": 1.9112, 
"step": 1392 }, { "epoch": 1.4287179487179487, "grad_norm": 0.1455078125, "learning_rate": 9.77824583897475e-05, "loss": 1.8563, "step": 1393 }, { "epoch": 1.4297435897435897, "grad_norm": 0.146484375, "learning_rate": 9.745646930306834e-05, "loss": 1.8367, "step": 1394 }, { "epoch": 1.4307692307692308, "grad_norm": 0.1416015625, "learning_rate": 9.713089290901333e-05, "loss": 1.8713, "step": 1395 }, { "epoch": 1.4317948717948719, "grad_norm": 0.134765625, "learning_rate": 9.680573008839983e-05, "loss": 1.8876, "step": 1396 }, { "epoch": 1.4328205128205127, "grad_norm": 0.13671875, "learning_rate": 9.648098172092631e-05, "loss": 1.7624, "step": 1397 }, { "epoch": 1.4338461538461538, "grad_norm": 0.1435546875, "learning_rate": 9.615664868516974e-05, "loss": 1.7254, "step": 1398 }, { "epoch": 1.4348717948717948, "grad_norm": 0.154296875, "learning_rate": 9.583273185858368e-05, "loss": 1.8907, "step": 1399 }, { "epoch": 1.435897435897436, "grad_norm": 0.1376953125, "learning_rate": 9.550923211749558e-05, "loss": 1.8217, "step": 1400 }, { "epoch": 1.435897435897436, "eval_loss": NaN, "eval_runtime": 73.8286, "eval_samples_per_second": 9.305, "eval_steps_per_second": 1.165, "step": 1400 }, { "epoch": 1.436923076923077, "grad_norm": 0.14453125, "learning_rate": 9.518615033710448e-05, "loss": 1.7356, "step": 1401 }, { "epoch": 1.4379487179487178, "grad_norm": 0.1396484375, "learning_rate": 9.486348739147877e-05, "loss": 1.7465, "step": 1402 }, { "epoch": 1.438974358974359, "grad_norm": 0.16015625, "learning_rate": 9.45412441535537e-05, "loss": 1.8975, "step": 1403 }, { "epoch": 1.44, "grad_norm": 0.1396484375, "learning_rate": 9.42194214951288e-05, "loss": 1.7474, "step": 1404 }, { "epoch": 1.441025641025641, "grad_norm": 0.13671875, "learning_rate": 9.389802028686617e-05, "loss": 1.9179, "step": 1405 }, { "epoch": 1.442051282051282, "grad_norm": 0.1494140625, "learning_rate": 9.357704139828732e-05, "loss": 1.8938, "step": 1406 }, { "epoch": 1.4430769230769231, 
"grad_norm": 0.1484375, "learning_rate": 9.325648569777145e-05, "loss": 1.835, "step": 1407 }, { "epoch": 1.4441025641025642, "grad_norm": 0.1416015625, "learning_rate": 9.29363540525528e-05, "loss": 1.8336, "step": 1408 }, { "epoch": 1.445128205128205, "grad_norm": 0.1416015625, "learning_rate": 9.261664732871839e-05, "loss": 1.7085, "step": 1409 }, { "epoch": 1.4461538461538461, "grad_norm": 0.1396484375, "learning_rate": 9.229736639120561e-05, "loss": 1.8068, "step": 1410 }, { "epoch": 1.4471794871794872, "grad_norm": 0.1474609375, "learning_rate": 9.197851210380007e-05, "loss": 1.7732, "step": 1411 }, { "epoch": 1.4482051282051283, "grad_norm": 0.1318359375, "learning_rate": 9.166008532913276e-05, "loss": 1.834, "step": 1412 }, { "epoch": 1.4492307692307693, "grad_norm": 0.1416015625, "learning_rate": 9.134208692867846e-05, "loss": 1.8291, "step": 1413 }, { "epoch": 1.4502564102564102, "grad_norm": 0.1357421875, "learning_rate": 9.102451776275292e-05, "loss": 1.7779, "step": 1414 }, { "epoch": 1.4512820512820512, "grad_norm": 0.13671875, "learning_rate": 9.070737869051044e-05, "loss": 1.9437, "step": 1415 }, { "epoch": 1.4523076923076923, "grad_norm": 0.1337890625, "learning_rate": 9.039067056994193e-05, "loss": 1.7708, "step": 1416 }, { "epoch": 1.4533333333333334, "grad_norm": 0.1416015625, "learning_rate": 9.00743942578724e-05, "loss": 1.822, "step": 1417 }, { "epoch": 1.4543589743589744, "grad_norm": 0.1572265625, "learning_rate": 8.975855060995855e-05, "loss": 1.8709, "step": 1418 }, { "epoch": 1.4553846153846153, "grad_norm": 0.1328125, "learning_rate": 8.944314048068673e-05, "loss": 1.7577, "step": 1419 }, { "epoch": 1.4564102564102563, "grad_norm": 0.1435546875, "learning_rate": 8.912816472337007e-05, "loss": 1.8738, "step": 1420 }, { "epoch": 1.4574358974358974, "grad_norm": 0.1416015625, "learning_rate": 8.881362419014694e-05, "loss": 1.7773, "step": 1421 }, { "epoch": 1.4584615384615385, "grad_norm": 0.142578125, "learning_rate": 
8.849951973197815e-05, "loss": 1.7839, "step": 1422 }, { "epoch": 1.4594871794871795, "grad_norm": 0.14453125, "learning_rate": 8.818585219864453e-05, "loss": 1.6831, "step": 1423 }, { "epoch": 1.4605128205128204, "grad_norm": 0.1318359375, "learning_rate": 8.787262243874508e-05, "loss": 1.7888, "step": 1424 }, { "epoch": 1.4615384615384617, "grad_norm": 0.1259765625, "learning_rate": 8.755983129969438e-05, "loss": 1.8116, "step": 1425 }, { "epoch": 1.4625641025641025, "grad_norm": 0.1396484375, "learning_rate": 8.724747962772039e-05, "loss": 1.7609, "step": 1426 }, { "epoch": 1.4635897435897436, "grad_norm": 0.1318359375, "learning_rate": 8.693556826786208e-05, "loss": 1.6577, "step": 1427 }, { "epoch": 1.4646153846153847, "grad_norm": 0.1328125, "learning_rate": 8.662409806396732e-05, "loss": 1.7436, "step": 1428 }, { "epoch": 1.4656410256410257, "grad_norm": 0.13671875, "learning_rate": 8.631306985869015e-05, "loss": 1.7757, "step": 1429 }, { "epoch": 1.4666666666666668, "grad_norm": 0.15234375, "learning_rate": 8.600248449348916e-05, "loss": 1.8813, "step": 1430 }, { "epoch": 1.4676923076923076, "grad_norm": 0.16015625, "learning_rate": 8.569234280862484e-05, "loss": 1.8234, "step": 1431 }, { "epoch": 1.4687179487179487, "grad_norm": 0.1279296875, "learning_rate": 8.538264564315709e-05, "loss": 1.8688, "step": 1432 }, { "epoch": 1.4697435897435898, "grad_norm": 0.12158203125, "learning_rate": 8.507339383494347e-05, "loss": 1.7631, "step": 1433 }, { "epoch": 1.4707692307692308, "grad_norm": 0.150390625, "learning_rate": 8.476458822063656e-05, "loss": 1.9413, "step": 1434 }, { "epoch": 1.471794871794872, "grad_norm": 0.1279296875, "learning_rate": 8.445622963568184e-05, "loss": 1.7468, "step": 1435 }, { "epoch": 1.4728205128205127, "grad_norm": 0.1474609375, "learning_rate": 8.41483189143154e-05, "loss": 1.9903, "step": 1436 }, { "epoch": 1.4738461538461538, "grad_norm": 0.134765625, "learning_rate": 8.384085688956153e-05, "loss": 1.7485, "step": 1437 }, { 
"epoch": 1.4748717948717949, "grad_norm": 0.1396484375, "learning_rate": 8.353384439323078e-05, "loss": 1.7307, "step": 1438 }, { "epoch": 1.475897435897436, "grad_norm": 0.1259765625, "learning_rate": 8.322728225591758e-05, "loss": 1.7711, "step": 1439 }, { "epoch": 1.476923076923077, "grad_norm": 0.1376953125, "learning_rate": 8.292117130699767e-05, "loss": 1.9437, "step": 1440 }, { "epoch": 1.476923076923077, "eval_loss": NaN, "eval_runtime": 73.7872, "eval_samples_per_second": 9.311, "eval_steps_per_second": 1.166, "step": 1440 }, { "epoch": 1.4779487179487178, "grad_norm": 0.134765625, "learning_rate": 8.261551237462636e-05, "loss": 1.8539, "step": 1441 }, { "epoch": 1.478974358974359, "grad_norm": 0.1337890625, "learning_rate": 8.231030628573629e-05, "loss": 1.7884, "step": 1442 }, { "epoch": 1.48, "grad_norm": 0.12890625, "learning_rate": 8.200555386603448e-05, "loss": 1.838, "step": 1443 }, { "epoch": 1.481025641025641, "grad_norm": 0.1328125, "learning_rate": 8.170125594000092e-05, "loss": 1.7037, "step": 1444 }, { "epoch": 1.4820512820512821, "grad_norm": 0.1357421875, "learning_rate": 8.139741333088596e-05, "loss": 1.7532, "step": 1445 }, { "epoch": 1.483076923076923, "grad_norm": 0.1435546875, "learning_rate": 8.109402686070801e-05, "loss": 1.9154, "step": 1446 }, { "epoch": 1.484102564102564, "grad_norm": 0.1357421875, "learning_rate": 8.079109735025156e-05, "loss": 1.6874, "step": 1447 }, { "epoch": 1.485128205128205, "grad_norm": 0.1376953125, "learning_rate": 8.048862561906489e-05, "loss": 1.8137, "step": 1448 }, { "epoch": 1.4861538461538462, "grad_norm": 0.1572265625, "learning_rate": 8.018661248545745e-05, "loss": 1.9888, "step": 1449 }, { "epoch": 1.4871794871794872, "grad_norm": 0.1640625, "learning_rate": 7.988505876649862e-05, "loss": 1.91, "step": 1450 }, { "epoch": 1.488205128205128, "grad_norm": 0.1337890625, "learning_rate": 7.958396527801423e-05, "loss": 1.7527, "step": 1451 }, { "epoch": 1.4892307692307694, "grad_norm": 0.138671875, 
"learning_rate": 7.928333283458539e-05, "loss": 1.7769, "step": 1452 }, { "epoch": 1.4902564102564102, "grad_norm": 0.1328125, "learning_rate": 7.898316224954586e-05, "loss": 1.8725, "step": 1453 }, { "epoch": 1.4912820512820513, "grad_norm": 0.1396484375, "learning_rate": 7.868345433497964e-05, "loss": 1.8686, "step": 1454 }, { "epoch": 1.4923076923076923, "grad_norm": 0.1328125, "learning_rate": 7.838420990171927e-05, "loss": 1.7683, "step": 1455 }, { "epoch": 1.4933333333333334, "grad_norm": 0.134765625, "learning_rate": 7.80854297593433e-05, "loss": 1.7411, "step": 1456 }, { "epoch": 1.4943589743589745, "grad_norm": 0.146484375, "learning_rate": 7.77871147161742e-05, "loss": 1.8171, "step": 1457 }, { "epoch": 1.4953846153846153, "grad_norm": 0.1396484375, "learning_rate": 7.74892655792761e-05, "loss": 1.86, "step": 1458 }, { "epoch": 1.4964102564102564, "grad_norm": 0.1474609375, "learning_rate": 7.719188315445283e-05, "loss": 1.8345, "step": 1459 }, { "epoch": 1.4974358974358974, "grad_norm": 0.1298828125, "learning_rate": 7.689496824624525e-05, "loss": 1.6653, "step": 1460 }, { "epoch": 1.4984615384615385, "grad_norm": 0.1279296875, "learning_rate": 7.659852165792969e-05, "loss": 1.724, "step": 1461 }, { "epoch": 1.4994871794871796, "grad_norm": 0.123046875, "learning_rate": 7.630254419151547e-05, "loss": 1.9167, "step": 1462 }, { "epoch": 1.5005128205128204, "grad_norm": 0.1298828125, "learning_rate": 7.60070366477425e-05, "loss": 1.758, "step": 1463 }, { "epoch": 1.5015384615384615, "grad_norm": 0.126953125, "learning_rate": 7.571199982607957e-05, "loss": 1.8288, "step": 1464 }, { "epoch": 1.5025641025641026, "grad_norm": 0.140625, "learning_rate": 7.541743452472194e-05, "loss": 1.899, "step": 1465 }, { "epoch": 1.5035897435897436, "grad_norm": 0.140625, "learning_rate": 7.512334154058922e-05, "loss": 1.8585, "step": 1466 }, { "epoch": 1.5046153846153847, "grad_norm": 0.1318359375, "learning_rate": 7.482972166932323e-05, "loss": 1.7837, "step": 1467 }, { 
"epoch": 1.5056410256410255, "grad_norm": 0.146484375, "learning_rate": 7.45365757052856e-05, "loss": 1.8599, "step": 1468 }, { "epoch": 1.5066666666666668, "grad_norm": 0.125, "learning_rate": 7.424390444155618e-05, "loss": 1.6003, "step": 1469 }, { "epoch": 1.5076923076923077, "grad_norm": 0.126953125, "learning_rate": 7.395170866993042e-05, "loss": 1.7188, "step": 1470 }, { "epoch": 1.5087179487179487, "grad_norm": 0.13671875, "learning_rate": 7.365998918091724e-05, "loss": 1.8786, "step": 1471 }, { "epoch": 1.5097435897435898, "grad_norm": 0.130859375, "learning_rate": 7.336874676373717e-05, "loss": 1.8281, "step": 1472 }, { "epoch": 1.5107692307692306, "grad_norm": 0.1298828125, "learning_rate": 7.307798220632012e-05, "loss": 1.7949, "step": 1473 }, { "epoch": 1.511794871794872, "grad_norm": 0.125, "learning_rate": 7.278769629530302e-05, "loss": 1.7451, "step": 1474 }, { "epoch": 1.5128205128205128, "grad_norm": 0.12255859375, "learning_rate": 7.249788981602801e-05, "loss": 1.7551, "step": 1475 }, { "epoch": 1.5138461538461538, "grad_norm": 0.1396484375, "learning_rate": 7.220856355254016e-05, "loss": 1.765, "step": 1476 }, { "epoch": 1.514871794871795, "grad_norm": 0.138671875, "learning_rate": 7.19197182875852e-05, "loss": 1.7863, "step": 1477 }, { "epoch": 1.5158974358974358, "grad_norm": 0.140625, "learning_rate": 7.163135480260771e-05, "loss": 1.8163, "step": 1478 }, { "epoch": 1.516923076923077, "grad_norm": 0.14453125, "learning_rate": 7.134347387774891e-05, "loss": 1.8454, "step": 1479 }, { "epoch": 1.5179487179487179, "grad_norm": 0.1376953125, "learning_rate": 7.105607629184432e-05, "loss": 1.7392, "step": 1480 }, { "epoch": 1.5179487179487179, "eval_loss": NaN, "eval_runtime": 73.8272, "eval_samples_per_second": 9.306, "eval_steps_per_second": 1.165, "step": 1480 }, { "epoch": 1.518974358974359, "grad_norm": 0.138671875, "learning_rate": 7.076916282242194e-05, "loss": 1.8277, "step": 1481 }, { "epoch": 1.52, "grad_norm": 0.1171875, "learning_rate": 
7.048273424570004e-05, "loss": 1.6414, "step": 1482 }, { "epoch": 1.5210256410256409, "grad_norm": 0.1279296875, "learning_rate": 7.019679133658502e-05, "loss": 1.8274, "step": 1483 }, { "epoch": 1.5220512820512822, "grad_norm": 0.1337890625, "learning_rate": 6.991133486866946e-05, "loss": 1.822, "step": 1484 }, { "epoch": 1.523076923076923, "grad_norm": 0.1357421875, "learning_rate": 6.962636561422966e-05, "loss": 1.7587, "step": 1485 }, { "epoch": 1.524102564102564, "grad_norm": 0.1376953125, "learning_rate": 6.934188434422411e-05, "loss": 1.7967, "step": 1486 }, { "epoch": 1.5251282051282051, "grad_norm": 0.154296875, "learning_rate": 6.905789182829097e-05, "loss": 1.8337, "step": 1487 }, { "epoch": 1.5261538461538462, "grad_norm": 0.142578125, "learning_rate": 6.877438883474599e-05, "loss": 1.7273, "step": 1488 }, { "epoch": 1.5271794871794873, "grad_norm": 0.130859375, "learning_rate": 6.849137613058079e-05, "loss": 1.8898, "step": 1489 }, { "epoch": 1.528205128205128, "grad_norm": 0.130859375, "learning_rate": 6.820885448146041e-05, "loss": 1.6691, "step": 1490 }, { "epoch": 1.5292307692307694, "grad_norm": 0.33984375, "learning_rate": 6.792682465172148e-05, "loss": 1.8274, "step": 1491 }, { "epoch": 1.5302564102564102, "grad_norm": 0.1376953125, "learning_rate": 6.764528740436995e-05, "loss": 1.8323, "step": 1492 }, { "epoch": 1.5312820512820513, "grad_norm": 0.140625, "learning_rate": 6.736424350107934e-05, "loss": 1.8097, "step": 1493 }, { "epoch": 1.5323076923076924, "grad_norm": 0.1376953125, "learning_rate": 6.708369370218815e-05, "loss": 1.7134, "step": 1494 }, { "epoch": 1.5333333333333332, "grad_norm": 0.1474609375, "learning_rate": 6.680363876669831e-05, "loss": 1.6579, "step": 1495 }, { "epoch": 1.5343589743589745, "grad_norm": 0.130859375, "learning_rate": 6.652407945227309e-05, "loss": 1.6851, "step": 1496 }, { "epoch": 1.5353846153846153, "grad_norm": 0.1494140625, "learning_rate": 6.624501651523452e-05, "loss": 1.855, "step": 1497 }, { "epoch": 
1.5364102564102564, "grad_norm": 0.1279296875, "learning_rate": 6.596645071056217e-05, "loss": 1.8147, "step": 1498 }, { "epoch": 1.5374358974358975, "grad_norm": 0.140625, "learning_rate": 6.568838279189029e-05, "loss": 1.8444, "step": 1499 }, { "epoch": 1.5384615384615383, "grad_norm": 0.12890625, "learning_rate": 6.541081351150638e-05, "loss": 1.7621, "step": 1500 }, { "epoch": 1.5394871794871796, "grad_norm": 0.1298828125, "learning_rate": 6.513374362034885e-05, "loss": 1.7876, "step": 1501 }, { "epoch": 1.5405128205128205, "grad_norm": 0.130859375, "learning_rate": 6.485717386800496e-05, "loss": 1.8478, "step": 1502 }, { "epoch": 1.5415384615384615, "grad_norm": 0.1318359375, "learning_rate": 6.458110500270903e-05, "loss": 1.7949, "step": 1503 }, { "epoch": 1.5425641025641026, "grad_norm": 0.12255859375, "learning_rate": 6.430553777134029e-05, "loss": 1.6719, "step": 1504 }, { "epoch": 1.5435897435897434, "grad_norm": 0.1474609375, "learning_rate": 6.403047291942058e-05, "loss": 1.6965, "step": 1505 }, { "epoch": 1.5446153846153847, "grad_norm": 0.1640625, "learning_rate": 6.375591119111296e-05, "loss": 1.8638, "step": 1506 }, { "epoch": 1.5456410256410256, "grad_norm": 0.12890625, "learning_rate": 6.348185332921924e-05, "loss": 1.7039, "step": 1507 }, { "epoch": 1.5466666666666666, "grad_norm": 0.134765625, "learning_rate": 6.320830007517781e-05, "loss": 1.786, "step": 1508 }, { "epoch": 1.5476923076923077, "grad_norm": 0.1376953125, "learning_rate": 6.293525216906215e-05, "loss": 1.8752, "step": 1509 }, { "epoch": 1.5487179487179488, "grad_norm": 0.1318359375, "learning_rate": 6.26627103495786e-05, "loss": 1.74, "step": 1510 }, { "epoch": 1.5497435897435898, "grad_norm": 0.12890625, "learning_rate": 6.239067535406403e-05, "loss": 1.7707, "step": 1511 }, { "epoch": 1.5507692307692307, "grad_norm": 0.1337890625, "learning_rate": 6.211914791848445e-05, "loss": 1.7755, "step": 1512 }, { "epoch": 1.5517948717948717, "grad_norm": 0.130859375, "learning_rate": 
6.184812877743256e-05, "loss": 1.8339, "step": 1513 }, { "epoch": 1.5528205128205128, "grad_norm": 0.12890625, "learning_rate": 6.157761866412597e-05, "loss": 1.7888, "step": 1514 }, { "epoch": 1.5538461538461539, "grad_norm": 0.12158203125, "learning_rate": 6.130761831040521e-05, "loss": 1.7313, "step": 1515 }, { "epoch": 1.554871794871795, "grad_norm": 0.125, "learning_rate": 6.103812844673152e-05, "loss": 1.8052, "step": 1516 }, { "epoch": 1.5558974358974358, "grad_norm": 0.13671875, "learning_rate": 6.076914980218523e-05, "loss": 1.82, "step": 1517 }, { "epoch": 1.556923076923077, "grad_norm": 0.12890625, "learning_rate": 6.0500683104463606e-05, "loss": 1.7237, "step": 1518 }, { "epoch": 1.557948717948718, "grad_norm": 0.12158203125, "learning_rate": 6.023272907987873e-05, "loss": 1.7228, "step": 1519 }, { "epoch": 1.558974358974359, "grad_norm": 0.1328125, "learning_rate": 5.996528845335586e-05, "loss": 1.7494, "step": 1520 }, { "epoch": 1.558974358974359, "eval_loss": NaN, "eval_runtime": 73.8201, "eval_samples_per_second": 9.306, "eval_steps_per_second": 1.165, "step": 1520 }, { "epoch": 1.56, "grad_norm": 0.1318359375, "learning_rate": 5.969836194843123e-05, "loss": 1.764, "step": 1521 }, { "epoch": 1.561025641025641, "grad_norm": 0.1357421875, "learning_rate": 5.943195028725021e-05, "loss": 1.8018, "step": 1522 }, { "epoch": 1.5620512820512822, "grad_norm": 0.1357421875, "learning_rate": 5.916605419056525e-05, "loss": 1.7891, "step": 1523 }, { "epoch": 1.563076923076923, "grad_norm": 0.11865234375, "learning_rate": 5.890067437773411e-05, "loss": 1.6627, "step": 1524 }, { "epoch": 1.564102564102564, "grad_norm": 0.1474609375, "learning_rate": 5.8635811566717546e-05, "loss": 1.8212, "step": 1525 }, { "epoch": 1.5651282051282052, "grad_norm": 0.1259765625, "learning_rate": 5.837146647407782e-05, "loss": 1.661, "step": 1526 }, { "epoch": 1.566153846153846, "grad_norm": 0.1298828125, "learning_rate": 5.810763981497655e-05, "loss": 1.7579, "step": 1527 }, { 
"epoch": 1.5671794871794873, "grad_norm": 0.1220703125, "learning_rate": 5.7844332303172626e-05, "loss": 1.7304, "step": 1528 }, { "epoch": 1.5682051282051281, "grad_norm": 0.1220703125, "learning_rate": 5.758154465102058e-05, "loss": 1.6784, "step": 1529 }, { "epoch": 1.5692307692307692, "grad_norm": 0.1298828125, "learning_rate": 5.731927756946847e-05, "loss": 1.74, "step": 1530 }, { "epoch": 1.5702564102564103, "grad_norm": 0.134765625, "learning_rate": 5.705753176805597e-05, "loss": 1.8512, "step": 1531 }, { "epoch": 1.5712820512820511, "grad_norm": 0.1357421875, "learning_rate": 5.679630795491261e-05, "loss": 1.7263, "step": 1532 }, { "epoch": 1.5723076923076924, "grad_norm": 0.1513671875, "learning_rate": 5.653560683675543e-05, "loss": 1.8436, "step": 1533 }, { "epoch": 1.5733333333333333, "grad_norm": 0.14453125, "learning_rate": 5.627542911888767e-05, "loss": 1.642, "step": 1534 }, { "epoch": 1.5743589743589743, "grad_norm": 0.2158203125, "learning_rate": 5.601577550519646e-05, "loss": 1.6861, "step": 1535 }, { "epoch": 1.5753846153846154, "grad_norm": 0.1357421875, "learning_rate": 5.575664669815087e-05, "loss": 1.8116, "step": 1536 }, { "epoch": 1.5764102564102564, "grad_norm": 0.1279296875, "learning_rate": 5.549804339880038e-05, "loss": 1.6665, "step": 1537 }, { "epoch": 1.5774358974358975, "grad_norm": 0.12451171875, "learning_rate": 5.523996630677264e-05, "loss": 1.6516, "step": 1538 }, { "epoch": 1.5784615384615384, "grad_norm": 0.12158203125, "learning_rate": 5.4982416120271686e-05, "loss": 1.7641, "step": 1539 }, { "epoch": 1.5794871794871796, "grad_norm": 0.1787109375, "learning_rate": 5.472539353607611e-05, "loss": 1.8371, "step": 1540 }, { "epoch": 1.5805128205128205, "grad_norm": 0.1279296875, "learning_rate": 5.446889924953716e-05, "loss": 1.7815, "step": 1541 }, { "epoch": 1.5815384615384616, "grad_norm": 0.1279296875, "learning_rate": 5.42129339545766e-05, "loss": 1.6801, "step": 1542 }, { "epoch": 1.5825641025641026, "grad_norm": 
0.1220703125, "learning_rate": 5.395749834368538e-05, "loss": 1.8695, "step": 1543 }, { "epoch": 1.5835897435897435, "grad_norm": 0.14453125, "learning_rate": 5.3702593107921136e-05, "loss": 1.7928, "step": 1544 }, { "epoch": 1.5846153846153848, "grad_norm": 0.11962890625, "learning_rate": 5.344821893690679e-05, "loss": 1.6894, "step": 1545 }, { "epoch": 1.5856410256410256, "grad_norm": 0.134765625, "learning_rate": 5.319437651882855e-05, "loss": 1.715, "step": 1546 }, { "epoch": 1.5866666666666667, "grad_norm": 0.1357421875, "learning_rate": 5.2941066540433875e-05, "loss": 1.6298, "step": 1547 }, { "epoch": 1.5876923076923077, "grad_norm": 0.1376953125, "learning_rate": 5.2688289687029906e-05, "loss": 1.7405, "step": 1548 }, { "epoch": 1.5887179487179486, "grad_norm": 0.12890625, "learning_rate": 5.243604664248139e-05, "loss": 1.7998, "step": 1549 }, { "epoch": 1.5897435897435899, "grad_norm": 0.130859375, "learning_rate": 5.218433808920883e-05, "loss": 1.792, "step": 1550 }, { "epoch": 1.5907692307692307, "grad_norm": 0.1396484375, "learning_rate": 5.193316470818682e-05, "loss": 1.8486, "step": 1551 }, { "epoch": 1.5917948717948718, "grad_norm": 0.1298828125, "learning_rate": 5.168252717894209e-05, "loss": 1.7727, "step": 1552 }, { "epoch": 1.5928205128205128, "grad_norm": 0.11865234375, "learning_rate": 5.143242617955152e-05, "loss": 1.8608, "step": 1553 }, { "epoch": 1.5938461538461537, "grad_norm": 0.1376953125, "learning_rate": 5.1182862386640715e-05, "loss": 1.7574, "step": 1554 }, { "epoch": 1.594871794871795, "grad_norm": 0.13671875, "learning_rate": 5.09338364753818e-05, "loss": 1.7988, "step": 1555 }, { "epoch": 1.5958974358974358, "grad_norm": 0.1318359375, "learning_rate": 5.068534911949155e-05, "loss": 1.6359, "step": 1556 }, { "epoch": 1.596923076923077, "grad_norm": 0.130859375, "learning_rate": 5.043740099122995e-05, "loss": 1.8613, "step": 1557 }, { "epoch": 1.597948717948718, "grad_norm": 0.13671875, "learning_rate": 5.01899927613981e-05, "loss": 
1.7956, "step": 1558 }, { "epoch": 1.598974358974359, "grad_norm": 0.1396484375, "learning_rate": 4.994312509933635e-05, "loss": 1.9141, "step": 1559 }, { "epoch": 1.6, "grad_norm": 0.130859375, "learning_rate": 4.969679867292276e-05, "loss": 1.7624, "step": 1560 }, { "epoch": 1.6, "eval_loss": NaN, "eval_runtime": 73.7765, "eval_samples_per_second": 9.312, "eval_steps_per_second": 1.166, "step": 1560 }, { "epoch": 1.601025641025641, "grad_norm": 0.12451171875, "learning_rate": 4.945101414857081e-05, "loss": 1.8553, "step": 1561 }, { "epoch": 1.602051282051282, "grad_norm": 0.1279296875, "learning_rate": 4.920577219122841e-05, "loss": 1.7644, "step": 1562 }, { "epoch": 1.603076923076923, "grad_norm": 0.12890625, "learning_rate": 4.8961073464375284e-05, "loss": 1.6805, "step": 1563 }, { "epoch": 1.6041025641025641, "grad_norm": 0.1279296875, "learning_rate": 4.8716918630021395e-05, "loss": 1.7394, "step": 1564 }, { "epoch": 1.6051282051282052, "grad_norm": 0.15234375, "learning_rate": 4.8473308348705505e-05, "loss": 1.8036, "step": 1565 }, { "epoch": 1.606153846153846, "grad_norm": 0.12158203125, "learning_rate": 4.8230243279493106e-05, "loss": 1.7709, "step": 1566 }, { "epoch": 1.6071794871794873, "grad_norm": 0.126953125, "learning_rate": 4.7987724079974425e-05, "loss": 1.725, "step": 1567 }, { "epoch": 1.6082051282051282, "grad_norm": 0.15234375, "learning_rate": 4.7745751406263163e-05, "loss": 1.7179, "step": 1568 }, { "epoch": 1.6092307692307692, "grad_norm": 0.12060546875, "learning_rate": 4.75043259129943e-05, "loss": 1.7902, "step": 1569 }, { "epoch": 1.6102564102564103, "grad_norm": 0.12890625, "learning_rate": 4.7263448253322574e-05, "loss": 1.7585, "step": 1570 }, { "epoch": 1.6112820512820512, "grad_norm": 0.12060546875, "learning_rate": 4.702311907892051e-05, "loss": 1.757, "step": 1571 }, { "epoch": 1.6123076923076924, "grad_norm": 0.1259765625, "learning_rate": 4.678333903997686e-05, "loss": 1.8059, "step": 1572 }, { "epoch": 1.6133333333333333, 
"grad_norm": 0.1328125, "learning_rate": 4.654410878519455e-05, "loss": 1.8234, "step": 1573 }, { "epoch": 1.6143589743589744, "grad_norm": 0.126953125, "learning_rate": 4.630542896178935e-05, "loss": 1.6629, "step": 1574 }, { "epoch": 1.6153846153846154, "grad_norm": 0.125, "learning_rate": 4.606730021548766e-05, "loss": 1.8093, "step": 1575 }, { "epoch": 1.6164102564102563, "grad_norm": 0.12890625, "learning_rate": 4.58297231905252e-05, "loss": 1.7403, "step": 1576 }, { "epoch": 1.6174358974358976, "grad_norm": 0.1328125, "learning_rate": 4.55926985296449e-05, "loss": 1.7884, "step": 1577 }, { "epoch": 1.6184615384615384, "grad_norm": 0.126953125, "learning_rate": 4.535622687409546e-05, "loss": 1.688, "step": 1578 }, { "epoch": 1.6194871794871795, "grad_norm": 0.12890625, "learning_rate": 4.5120308863629336e-05, "loss": 1.8698, "step": 1579 }, { "epoch": 1.6205128205128205, "grad_norm": 0.1328125, "learning_rate": 4.488494513650132e-05, "loss": 1.7081, "step": 1580 }, { "epoch": 1.6215384615384614, "grad_norm": 0.123046875, "learning_rate": 4.465013632946638e-05, "loss": 1.7422, "step": 1581 }, { "epoch": 1.6225641025641027, "grad_norm": 0.11962890625, "learning_rate": 4.441588307777841e-05, "loss": 1.7498, "step": 1582 }, { "epoch": 1.6235897435897435, "grad_norm": 0.126953125, "learning_rate": 4.418218601518834e-05, "loss": 1.8138, "step": 1583 }, { "epoch": 1.6246153846153846, "grad_norm": 0.146484375, "learning_rate": 4.39490457739421e-05, "loss": 1.6232, "step": 1584 }, { "epoch": 1.6256410256410256, "grad_norm": 0.1318359375, "learning_rate": 4.371646298477947e-05, "loss": 1.8174, "step": 1585 }, { "epoch": 1.6266666666666667, "grad_norm": 0.119140625, "learning_rate": 4.348443827693199e-05, "loss": 1.7139, "step": 1586 }, { "epoch": 1.6276923076923078, "grad_norm": 0.125, "learning_rate": 4.325297227812136e-05, "loss": 1.7542, "step": 1587 }, { "epoch": 1.6287179487179486, "grad_norm": 0.12158203125, "learning_rate": 4.3022065614557796e-05, "loss": 1.819, 
"step": 1588 }, { "epoch": 1.62974358974359, "grad_norm": 0.13671875, "learning_rate": 4.2791718910938224e-05, "loss": 1.7527, "step": 1589 }, { "epoch": 1.6307692307692307, "grad_norm": 0.126953125, "learning_rate": 4.25619327904446e-05, "loss": 1.8911, "step": 1590 }, { "epoch": 1.6317948717948718, "grad_norm": 0.1318359375, "learning_rate": 4.233270787474242e-05, "loss": 1.8052, "step": 1591 }, { "epoch": 1.6328205128205129, "grad_norm": 0.1298828125, "learning_rate": 4.210404478397869e-05, "loss": 1.8341, "step": 1592 }, { "epoch": 1.6338461538461537, "grad_norm": 0.1279296875, "learning_rate": 4.1875944136780666e-05, "loss": 1.7351, "step": 1593 }, { "epoch": 1.634871794871795, "grad_norm": 0.1259765625, "learning_rate": 4.164840655025387e-05, "loss": 1.7447, "step": 1594 }, { "epoch": 1.6358974358974359, "grad_norm": 0.1298828125, "learning_rate": 4.142143263998046e-05, "loss": 1.8164, "step": 1595 }, { "epoch": 1.636923076923077, "grad_norm": 0.1357421875, "learning_rate": 4.119502302001771e-05, "loss": 1.6697, "step": 1596 }, { "epoch": 1.637948717948718, "grad_norm": 0.1298828125, "learning_rate": 4.096917830289626e-05, "loss": 1.7785, "step": 1597 }, { "epoch": 1.6389743589743588, "grad_norm": 0.1318359375, "learning_rate": 4.074389909961832e-05, "loss": 1.6874, "step": 1598 }, { "epoch": 1.6400000000000001, "grad_norm": 0.1279296875, "learning_rate": 4.0519186019656265e-05, "loss": 1.781, "step": 1599 }, { "epoch": 1.641025641025641, "grad_norm": 0.126953125, "learning_rate": 4.0295039670950965e-05, "loss": 1.7191, "step": 1600 }, { "epoch": 1.641025641025641, "eval_loss": NaN, "eval_runtime": 73.7951, "eval_samples_per_second": 9.31, "eval_steps_per_second": 1.165, "step": 1600 }, { "epoch": 1.642051282051282, "grad_norm": 0.126953125, "learning_rate": 4.00714606599098e-05, "loss": 1.8033, "step": 1601 }, { "epoch": 1.643076923076923, "grad_norm": 0.1201171875, "learning_rate": 3.9848449591405434e-05, "loss": 1.7697, "step": 1602 }, { "epoch": 
1.644102564102564, "grad_norm": 0.126953125, "learning_rate": 3.962600706877417e-05, "loss": 1.6492, "step": 1603 }, { "epoch": 1.6451282051282052, "grad_norm": 0.1298828125, "learning_rate": 3.940413369381376e-05, "loss": 1.7482, "step": 1604 }, { "epoch": 1.646153846153846, "grad_norm": 0.12255859375, "learning_rate": 3.918283006678261e-05, "loss": 1.7177, "step": 1605 }, { "epoch": 1.6471794871794871, "grad_norm": 0.1376953125, "learning_rate": 3.896209678639734e-05, "loss": 1.7289, "step": 1606 }, { "epoch": 1.6482051282051282, "grad_norm": 0.126953125, "learning_rate": 3.874193444983179e-05, "loss": 1.8059, "step": 1607 }, { "epoch": 1.6492307692307693, "grad_norm": 0.1357421875, "learning_rate": 3.852234365271517e-05, "loss": 1.7503, "step": 1608 }, { "epoch": 1.6502564102564103, "grad_norm": 0.1318359375, "learning_rate": 3.830332498913022e-05, "loss": 1.6486, "step": 1609 }, { "epoch": 1.6512820512820512, "grad_norm": 0.1318359375, "learning_rate": 3.808487905161215e-05, "loss": 1.6613, "step": 1610 }, { "epoch": 1.6523076923076923, "grad_norm": 0.1396484375, "learning_rate": 3.786700643114652e-05, "loss": 1.7284, "step": 1611 }, { "epoch": 1.6533333333333333, "grad_norm": 0.126953125, "learning_rate": 3.7649707717167745e-05, "loss": 1.7907, "step": 1612 }, { "epoch": 1.6543589743589744, "grad_norm": 0.1279296875, "learning_rate": 3.743298349755778e-05, "loss": 1.6316, "step": 1613 }, { "epoch": 1.6553846153846155, "grad_norm": 0.11865234375, "learning_rate": 3.721683435864426e-05, "loss": 1.6262, "step": 1614 }, { "epoch": 1.6564102564102563, "grad_norm": 0.1328125, "learning_rate": 3.700126088519892e-05, "loss": 1.7533, "step": 1615 }, { "epoch": 1.6574358974358976, "grad_norm": 0.134765625, "learning_rate": 3.678626366043622e-05, "loss": 1.8059, "step": 1616 }, { "epoch": 1.6584615384615384, "grad_norm": 0.1298828125, "learning_rate": 3.657184326601151e-05, "loss": 1.6812, "step": 1617 }, { "epoch": 1.6594871794871795, "grad_norm": 0.1123046875, 
"learning_rate": 3.635800028201966e-05, "loss": 1.7077, "step": 1618 }, { "epoch": 1.6605128205128206, "grad_norm": 0.126953125, "learning_rate": 3.614473528699344e-05, "loss": 1.5736, "step": 1619 }, { "epoch": 1.6615384615384614, "grad_norm": 0.12158203125, "learning_rate": 3.5932048857901775e-05, "loss": 1.7093, "step": 1620 }, { "epoch": 1.6625641025641027, "grad_norm": 0.11865234375, "learning_rate": 3.571994157014841e-05, "loss": 1.7022, "step": 1621 }, { "epoch": 1.6635897435897435, "grad_norm": 0.12451171875, "learning_rate": 3.55084139975704e-05, "loss": 1.7907, "step": 1622 }, { "epoch": 1.6646153846153846, "grad_norm": 0.1220703125, "learning_rate": 3.529746671243619e-05, "loss": 1.5908, "step": 1623 }, { "epoch": 1.6656410256410257, "grad_norm": 0.142578125, "learning_rate": 3.508710028544451e-05, "loss": 1.7729, "step": 1624 }, { "epoch": 1.6666666666666665, "grad_norm": 0.12353515625, "learning_rate": 3.487731528572255e-05, "loss": 1.734, "step": 1625 }, { "epoch": 1.6676923076923078, "grad_norm": 0.12451171875, "learning_rate": 3.4668112280824504e-05, "loss": 1.9489, "step": 1626 }, { "epoch": 1.6687179487179487, "grad_norm": 0.1240234375, "learning_rate": 3.445949183673006e-05, "loss": 1.7088, "step": 1627 }, { "epoch": 1.6697435897435897, "grad_norm": 0.11865234375, "learning_rate": 3.4251454517842865e-05, "loss": 1.7381, "step": 1628 }, { "epoch": 1.6707692307692308, "grad_norm": 0.1240234375, "learning_rate": 3.4044000886988834e-05, "loss": 1.6992, "step": 1629 }, { "epoch": 1.6717948717948716, "grad_norm": 0.1259765625, "learning_rate": 3.3837131505414896e-05, "loss": 1.8127, "step": 1630 }, { "epoch": 1.672820512820513, "grad_norm": 0.1318359375, "learning_rate": 3.36308469327874e-05, "loss": 1.8135, "step": 1631 }, { "epoch": 1.6738461538461538, "grad_norm": 0.12158203125, "learning_rate": 3.342514772719032e-05, "loss": 1.7855, "step": 1632 }, { "epoch": 1.6748717948717948, "grad_norm": 0.130859375, "learning_rate": 3.32200344451242e-05, 
"loss": 1.6262, "step": 1633 }, { "epoch": 1.675897435897436, "grad_norm": 0.125, "learning_rate": 3.301550764150432e-05, "loss": 1.6866, "step": 1634 }, { "epoch": 1.676923076923077, "grad_norm": 0.12353515625, "learning_rate": 3.281156786965933e-05, "loss": 1.7546, "step": 1635 }, { "epoch": 1.677948717948718, "grad_norm": 0.1337890625, "learning_rate": 3.260821568132974e-05, "loss": 1.7911, "step": 1636 }, { "epoch": 1.6789743589743589, "grad_norm": 0.1240234375, "learning_rate": 3.240545162666628e-05, "loss": 1.7151, "step": 1637 }, { "epoch": 1.6800000000000002, "grad_norm": 0.12158203125, "learning_rate": 3.220327625422864e-05, "loss": 1.7974, "step": 1638 }, { "epoch": 1.681025641025641, "grad_norm": 0.1279296875, "learning_rate": 3.2001690110983944e-05, "loss": 1.6457, "step": 1639 }, { "epoch": 1.682051282051282, "grad_norm": 0.14453125, "learning_rate": 3.180069374230507e-05, "loss": 1.8216, "step": 1640 }, { "epoch": 1.682051282051282, "eval_loss": NaN, "eval_runtime": 73.7726, "eval_samples_per_second": 9.312, "eval_steps_per_second": 1.166, "step": 1640 }, { "epoch": 1.6830769230769231, "grad_norm": 0.12109375, "learning_rate": 3.160028769196938e-05, "loss": 1.7754, "step": 1641 }, { "epoch": 1.684102564102564, "grad_norm": 0.1171875, "learning_rate": 3.140047250215719e-05, "loss": 1.7807, "step": 1642 }, { "epoch": 1.6851282051282053, "grad_norm": 0.12255859375, "learning_rate": 3.1201248713450315e-05, "loss": 1.6822, "step": 1643 }, { "epoch": 1.6861538461538461, "grad_norm": 0.1337890625, "learning_rate": 3.100261686483055e-05, "loss": 1.6568, "step": 1644 }, { "epoch": 1.6871794871794872, "grad_norm": 0.130859375, "learning_rate": 3.0804577493678324e-05, "loss": 1.7509, "step": 1645 }, { "epoch": 1.6882051282051282, "grad_norm": 0.1279296875, "learning_rate": 3.060713113577102e-05, "loss": 1.745, "step": 1646 }, { "epoch": 1.689230769230769, "grad_norm": 0.1279296875, "learning_rate": 3.0410278325281786e-05, "loss": 1.8198, "step": 1647 }, { 
"epoch": 1.6902564102564104, "grad_norm": 0.1259765625, "learning_rate": 3.0214019594778043e-05, "loss": 1.8156, "step": 1648 }, { "epoch": 1.6912820512820512, "grad_norm": 0.1396484375, "learning_rate": 3.0018355475219806e-05, "loss": 1.7045, "step": 1649 }, { "epoch": 1.6923076923076923, "grad_norm": 0.1279296875, "learning_rate": 2.9823286495958558e-05, "loss": 1.6632, "step": 1650 }, { "epoch": 1.6933333333333334, "grad_norm": 0.1259765625, "learning_rate": 2.962881318473565e-05, "loss": 1.81, "step": 1651 }, { "epoch": 1.6943589743589742, "grad_norm": 0.1240234375, "learning_rate": 2.943493606768091e-05, "loss": 1.7129, "step": 1652 }, { "epoch": 1.6953846153846155, "grad_norm": 0.126953125, "learning_rate": 2.9241655669311308e-05, "loss": 1.6683, "step": 1653 }, { "epoch": 1.6964102564102563, "grad_norm": 0.123046875, "learning_rate": 2.9048972512529193e-05, "loss": 1.742, "step": 1654 }, { "epoch": 1.6974358974358974, "grad_norm": 0.130859375, "learning_rate": 2.885688711862136e-05, "loss": 1.6841, "step": 1655 }, { "epoch": 1.6984615384615385, "grad_norm": 0.1298828125, "learning_rate": 2.8665400007257397e-05, "loss": 1.7339, "step": 1656 }, { "epoch": 1.6994871794871795, "grad_norm": 0.123046875, "learning_rate": 2.8474511696488125e-05, "loss": 1.7707, "step": 1657 }, { "epoch": 1.7005128205128206, "grad_norm": 0.1279296875, "learning_rate": 2.8284222702744472e-05, "loss": 1.8163, "step": 1658 }, { "epoch": 1.7015384615384614, "grad_norm": 0.12451171875, "learning_rate": 2.8094533540836158e-05, "loss": 1.7427, "step": 1659 }, { "epoch": 1.7025641025641025, "grad_norm": 0.1240234375, "learning_rate": 2.7905444723949762e-05, "loss": 1.7031, "step": 1660 }, { "epoch": 1.7035897435897436, "grad_norm": 0.1279296875, "learning_rate": 2.771695676364791e-05, "loss": 1.6863, "step": 1661 }, { "epoch": 1.7046153846153846, "grad_norm": 0.11572265625, "learning_rate": 2.752907016986761e-05, "loss": 1.6518, "step": 1662 }, { "epoch": 1.7056410256410257, "grad_norm": 
0.12353515625, "learning_rate": 2.7341785450918866e-05, "loss": 1.8014, "step": 1663 }, { "epoch": 1.7066666666666666, "grad_norm": 0.130859375, "learning_rate": 2.7155103113483487e-05, "loss": 1.8273, "step": 1664 }, { "epoch": 1.7076923076923078, "grad_norm": 0.12353515625, "learning_rate": 2.6969023662613473e-05, "loss": 1.6311, "step": 1665 }, { "epoch": 1.7087179487179487, "grad_norm": 0.125, "learning_rate": 2.678354760172985e-05, "loss": 1.7055, "step": 1666 }, { "epoch": 1.7097435897435898, "grad_norm": 0.1220703125, "learning_rate": 2.659867543262129e-05, "loss": 1.7549, "step": 1667 }, { "epoch": 1.7107692307692308, "grad_norm": 0.126953125, "learning_rate": 2.641440765544245e-05, "loss": 1.5497, "step": 1668 }, { "epoch": 1.7117948717948717, "grad_norm": 0.126953125, "learning_rate": 2.6230744768713093e-05, "loss": 1.5925, "step": 1669 }, { "epoch": 1.712820512820513, "grad_norm": 0.126953125, "learning_rate": 2.6047687269316446e-05, "loss": 1.807, "step": 1670 }, { "epoch": 1.7138461538461538, "grad_norm": 0.138671875, "learning_rate": 2.586523565249782e-05, "loss": 1.7864, "step": 1671 }, { "epoch": 1.7148717948717949, "grad_norm": 0.1298828125, "learning_rate": 2.5683390411863487e-05, "loss": 1.6704, "step": 1672 }, { "epoch": 1.715897435897436, "grad_norm": 0.1259765625, "learning_rate": 2.5502152039379195e-05, "loss": 1.7206, "step": 1673 }, { "epoch": 1.7169230769230768, "grad_norm": 0.1318359375, "learning_rate": 2.5321521025368805e-05, "loss": 1.8312, "step": 1674 }, { "epoch": 1.717948717948718, "grad_norm": 0.130859375, "learning_rate": 2.5141497858513108e-05, "loss": 1.7415, "step": 1675 }, { "epoch": 1.718974358974359, "grad_norm": 0.302734375, "learning_rate": 2.4962083025848403e-05, "loss": 1.9341, "step": 1676 }, { "epoch": 1.72, "grad_norm": 0.12109375, "learning_rate": 2.4783277012765116e-05, "loss": 1.6949, "step": 1677 }, { "epoch": 1.721025641025641, "grad_norm": 0.1318359375, "learning_rate": 2.460508030300662e-05, "loss": 1.7605, 
"step": 1678 }, { "epoch": 1.722051282051282, "grad_norm": 0.1171875, "learning_rate": 2.4427493378667953e-05, "loss": 1.6489, "step": 1679 }, { "epoch": 1.7230769230769232, "grad_norm": 0.1240234375, "learning_rate": 2.4250516720194264e-05, "loss": 1.7724, "step": 1680 }, { "epoch": 1.7230769230769232, "eval_loss": NaN, "eval_runtime": 73.7453, "eval_samples_per_second": 9.316, "eval_steps_per_second": 1.166, "step": 1680 }, { "epoch": 1.724102564102564, "grad_norm": 0.11962890625, "learning_rate": 2.4074150806379842e-05, "loss": 1.7111, "step": 1681 }, { "epoch": 1.725128205128205, "grad_norm": 0.12158203125, "learning_rate": 2.3898396114366562e-05, "loss": 1.83, "step": 1682 }, { "epoch": 1.7261538461538461, "grad_norm": 0.125, "learning_rate": 2.372325311964274e-05, "loss": 1.6846, "step": 1683 }, { "epoch": 1.7271794871794872, "grad_norm": 0.130859375, "learning_rate": 2.354872229604185e-05, "loss": 1.6881, "step": 1684 }, { "epoch": 1.7282051282051283, "grad_norm": 0.1259765625, "learning_rate": 2.3374804115741055e-05, "loss": 1.8581, "step": 1685 }, { "epoch": 1.7292307692307691, "grad_norm": 0.1376953125, "learning_rate": 2.3201499049260165e-05, "loss": 1.6534, "step": 1686 }, { "epoch": 1.7302564102564104, "grad_norm": 0.11767578125, "learning_rate": 2.30288075654603e-05, "loss": 1.6285, "step": 1687 }, { "epoch": 1.7312820512820513, "grad_norm": 0.1337890625, "learning_rate": 2.2856730131542438e-05, "loss": 1.6946, "step": 1688 }, { "epoch": 1.7323076923076923, "grad_norm": 0.12060546875, "learning_rate": 2.268526721304648e-05, "loss": 1.6609, "step": 1689 }, { "epoch": 1.7333333333333334, "grad_norm": 0.126953125, "learning_rate": 2.2514419273849673e-05, "loss": 1.6845, "step": 1690 }, { "epoch": 1.7343589743589742, "grad_norm": 0.1357421875, "learning_rate": 2.234418677616562e-05, "loss": 1.6549, "step": 1691 }, { "epoch": 1.7353846153846155, "grad_norm": 0.11767578125, "learning_rate": 2.2174570180542753e-05, "loss": 1.6415, "step": 1692 }, { "epoch": 
1.7364102564102564, "grad_norm": 0.1279296875, "learning_rate": 2.2005569945863447e-05, "loss": 1.7273, "step": 1693 }, { "epoch": 1.7374358974358974, "grad_norm": 0.126953125, "learning_rate": 2.1837186529342273e-05, "loss": 1.7858, "step": 1694 }, { "epoch": 1.7384615384615385, "grad_norm": 0.1279296875, "learning_rate": 2.1669420386525306e-05, "loss": 1.7048, "step": 1695 }, { "epoch": 1.7394871794871793, "grad_norm": 0.115234375, "learning_rate": 2.150227197128862e-05, "loss": 1.6724, "step": 1696 }, { "epoch": 1.7405128205128206, "grad_norm": 0.130859375, "learning_rate": 2.133574173583691e-05, "loss": 1.6037, "step": 1697 }, { "epoch": 1.7415384615384615, "grad_norm": 0.1142578125, "learning_rate": 2.1169830130702616e-05, "loss": 1.7072, "step": 1698 }, { "epoch": 1.7425641025641025, "grad_norm": 0.1318359375, "learning_rate": 2.1004537604744468e-05, "loss": 1.7215, "step": 1699 }, { "epoch": 1.7435897435897436, "grad_norm": 0.123046875, "learning_rate": 2.083986460514631e-05, "loss": 1.7754, "step": 1700 }, { "epoch": 1.7446153846153845, "grad_norm": 0.14453125, "learning_rate": 2.0675811577415993e-05, "loss": 1.7715, "step": 1701 }, { "epoch": 1.7456410256410257, "grad_norm": 0.1328125, "learning_rate": 2.0512378965383965e-05, "loss": 1.7154, "step": 1702 }, { "epoch": 1.7466666666666666, "grad_norm": 0.126953125, "learning_rate": 2.0349567211202318e-05, "loss": 1.7352, "step": 1703 }, { "epoch": 1.7476923076923077, "grad_norm": 0.12890625, "learning_rate": 2.0187376755343433e-05, "loss": 1.7568, "step": 1704 }, { "epoch": 1.7487179487179487, "grad_norm": 0.1416015625, "learning_rate": 2.002580803659873e-05, "loss": 1.8168, "step": 1705 }, { "epoch": 1.7497435897435898, "grad_norm": 0.138671875, "learning_rate": 1.9864861492077696e-05, "loss": 1.8686, "step": 1706 }, { "epoch": 1.7507692307692309, "grad_norm": 0.1240234375, "learning_rate": 1.9704537557206637e-05, "loss": 1.7392, "step": 1707 }, { "epoch": 1.7517948717948717, "grad_norm": 0.134765625, 
"learning_rate": 1.9544836665727207e-05, "loss": 1.66, "step": 1708 }, { "epoch": 1.7528205128205128, "grad_norm": 0.12158203125, "learning_rate": 1.9385759249695712e-05, "loss": 1.6941, "step": 1709 }, { "epoch": 1.7538461538461538, "grad_norm": 0.125, "learning_rate": 1.9227305739481615e-05, "loss": 1.6808, "step": 1710 }, { "epoch": 1.754871794871795, "grad_norm": 0.123046875, "learning_rate": 1.9069476563766368e-05, "loss": 1.7001, "step": 1711 }, { "epoch": 1.755897435897436, "grad_norm": 0.15625, "learning_rate": 1.8912272149542527e-05, "loss": 1.8213, "step": 1712 }, { "epoch": 1.7569230769230768, "grad_norm": 0.130859375, "learning_rate": 1.8755692922112212e-05, "loss": 1.7344, "step": 1713 }, { "epoch": 1.757948717948718, "grad_norm": 0.125, "learning_rate": 1.8599739305086266e-05, "loss": 1.6224, "step": 1714 }, { "epoch": 1.758974358974359, "grad_norm": 0.11474609375, "learning_rate": 1.844441172038311e-05, "loss": 1.7253, "step": 1715 }, { "epoch": 1.76, "grad_norm": 0.11865234375, "learning_rate": 1.828971058822723e-05, "loss": 1.6996, "step": 1716 }, { "epoch": 1.761025641025641, "grad_norm": 0.11474609375, "learning_rate": 1.813563632714854e-05, "loss": 1.6424, "step": 1717 }, { "epoch": 1.762051282051282, "grad_norm": 0.1318359375, "learning_rate": 1.798218935398091e-05, "loss": 1.6045, "step": 1718 }, { "epoch": 1.7630769230769232, "grad_norm": 0.1171875, "learning_rate": 1.782937008386107e-05, "loss": 1.6762, "step": 1719 }, { "epoch": 1.764102564102564, "grad_norm": 0.1328125, "learning_rate": 1.7677178930227687e-05, "loss": 1.78, "step": 1720 }, { "epoch": 1.764102564102564, "eval_loss": NaN, "eval_runtime": 73.784, "eval_samples_per_second": 9.311, "eval_steps_per_second": 1.166, "step": 1720 }, { "epoch": 1.7651282051282051, "grad_norm": 0.119140625, "learning_rate": 1.7525616304820053e-05, "loss": 1.7535, "step": 1721 }, { "epoch": 1.7661538461538462, "grad_norm": 0.1181640625, "learning_rate": 1.7374682617677024e-05, "loss": 1.5441, "step": 
1722 }, { "epoch": 1.767179487179487, "grad_norm": 0.267578125, "learning_rate": 1.7224378277135994e-05, "loss": 1.8379, "step": 1723 }, { "epoch": 1.7682051282051283, "grad_norm": 0.1142578125, "learning_rate": 1.707470368983166e-05, "loss": 1.6683, "step": 1724 }, { "epoch": 1.7692307692307692, "grad_norm": 0.1220703125, "learning_rate": 1.6925659260694892e-05, "loss": 1.6244, "step": 1725 }, { "epoch": 1.7702564102564102, "grad_norm": 0.1123046875, "learning_rate": 1.677724539295189e-05, "loss": 1.7015, "step": 1726 }, { "epoch": 1.7712820512820513, "grad_norm": 0.1337890625, "learning_rate": 1.6629462488122858e-05, "loss": 1.7783, "step": 1727 }, { "epoch": 1.7723076923076924, "grad_norm": 0.1376953125, "learning_rate": 1.64823109460209e-05, "loss": 1.81, "step": 1728 }, { "epoch": 1.7733333333333334, "grad_norm": 0.12353515625, "learning_rate": 1.6335791164751173e-05, "loss": 1.6987, "step": 1729 }, { "epoch": 1.7743589743589743, "grad_norm": 0.1416015625, "learning_rate": 1.6189903540709595e-05, "loss": 1.8246, "step": 1730 }, { "epoch": 1.7753846153846153, "grad_norm": 0.1259765625, "learning_rate": 1.6044648468581835e-05, "loss": 1.7304, "step": 1731 }, { "epoch": 1.7764102564102564, "grad_norm": 0.12109375, "learning_rate": 1.590002634134227e-05, "loss": 1.7715, "step": 1732 }, { "epoch": 1.7774358974358975, "grad_norm": 0.1162109375, "learning_rate": 1.5756037550252844e-05, "loss": 1.7204, "step": 1733 }, { "epoch": 1.7784615384615385, "grad_norm": 0.1259765625, "learning_rate": 1.5612682484862138e-05, "loss": 1.6286, "step": 1734 }, { "epoch": 1.7794871794871794, "grad_norm": 0.12353515625, "learning_rate": 1.5469961533004257e-05, "loss": 1.6703, "step": 1735 }, { "epoch": 1.7805128205128207, "grad_norm": 0.1240234375, "learning_rate": 1.5327875080797664e-05, "loss": 1.7685, "step": 1736 }, { "epoch": 1.7815384615384615, "grad_norm": 0.12890625, "learning_rate": 1.518642351264432e-05, "loss": 1.6764, "step": 1737 }, { "epoch": 1.7825641025641026, 
"grad_norm": 0.11669921875, "learning_rate": 1.5045607211228607e-05, "loss": 1.6775, "step": 1738 }, { "epoch": 1.7835897435897436, "grad_norm": 0.125, "learning_rate": 1.4905426557516172e-05, "loss": 1.6736, "step": 1739 }, { "epoch": 1.7846153846153845, "grad_norm": 0.1201171875, "learning_rate": 1.4765881930752984e-05, "loss": 1.5936, "step": 1740 }, { "epoch": 1.7856410256410258, "grad_norm": 0.11767578125, "learning_rate": 1.4626973708464386e-05, "loss": 1.7812, "step": 1741 }, { "epoch": 1.7866666666666666, "grad_norm": 0.125, "learning_rate": 1.4488702266453796e-05, "loss": 1.7407, "step": 1742 }, { "epoch": 1.7876923076923077, "grad_norm": 0.11962890625, "learning_rate": 1.4351067978802146e-05, "loss": 1.7557, "step": 1743 }, { "epoch": 1.7887179487179488, "grad_norm": 0.12255859375, "learning_rate": 1.4214071217866331e-05, "loss": 1.6225, "step": 1744 }, { "epoch": 1.7897435897435896, "grad_norm": 0.1396484375, "learning_rate": 1.4077712354278683e-05, "loss": 1.6411, "step": 1745 }, { "epoch": 1.790769230769231, "grad_norm": 0.1240234375, "learning_rate": 1.3941991756945632e-05, "loss": 1.8594, "step": 1746 }, { "epoch": 1.7917948717948717, "grad_norm": 0.11474609375, "learning_rate": 1.380690979304694e-05, "loss": 1.6741, "step": 1747 }, { "epoch": 1.7928205128205128, "grad_norm": 0.125, "learning_rate": 1.3672466828034524e-05, "loss": 1.7501, "step": 1748 }, { "epoch": 1.7938461538461539, "grad_norm": 0.12109375, "learning_rate": 1.353866322563163e-05, "loss": 1.789, "step": 1749 }, { "epoch": 1.7948717948717947, "grad_norm": 0.130859375, "learning_rate": 1.340549934783164e-05, "loss": 1.7333, "step": 1750 }, { "epoch": 1.795897435897436, "grad_norm": 0.11865234375, "learning_rate": 1.3272975554897316e-05, "loss": 1.7083, "step": 1751 }, { "epoch": 1.7969230769230768, "grad_norm": 0.130859375, "learning_rate": 1.314109220535975e-05, "loss": 1.6572, "step": 1752 }, { "epoch": 1.797948717948718, "grad_norm": 0.1318359375, "learning_rate": 
1.300984965601726e-05, "loss": 1.6704, "step": 1753 }, { "epoch": 1.798974358974359, "grad_norm": 0.119140625, "learning_rate": 1.2879248261934624e-05, "loss": 1.6955, "step": 1754 }, { "epoch": 1.8, "grad_norm": 0.11865234375, "learning_rate": 1.2749288376442042e-05, "loss": 1.68, "step": 1755 }, { "epoch": 1.801025641025641, "grad_norm": 0.1396484375, "learning_rate": 1.2619970351134102e-05, "loss": 2.0084, "step": 1756 }, { "epoch": 1.802051282051282, "grad_norm": 0.119140625, "learning_rate": 1.2491294535869003e-05, "loss": 1.7288, "step": 1757 }, { "epoch": 1.803076923076923, "grad_norm": 0.1240234375, "learning_rate": 1.2363261278767412e-05, "loss": 1.6289, "step": 1758 }, { "epoch": 1.804102564102564, "grad_norm": 0.11962890625, "learning_rate": 1.2235870926211617e-05, "loss": 1.7402, "step": 1759 }, { "epoch": 1.8051282051282052, "grad_norm": 0.12890625, "learning_rate": 1.2109123822844653e-05, "loss": 1.5865, "step": 1760 }, { "epoch": 1.8051282051282052, "eval_loss": NaN, "eval_runtime": 73.758, "eval_samples_per_second": 9.314, "eval_steps_per_second": 1.166, "step": 1760 }, { "epoch": 1.8061538461538462, "grad_norm": 0.130859375, "learning_rate": 1.1983020311569198e-05, "loss": 1.7037, "step": 1761 }, { "epoch": 1.807179487179487, "grad_norm": 0.1201171875, "learning_rate": 1.1857560733546825e-05, "loss": 1.7816, "step": 1762 }, { "epoch": 1.8082051282051284, "grad_norm": 0.123046875, "learning_rate": 1.173274542819705e-05, "loss": 1.646, "step": 1763 }, { "epoch": 1.8092307692307692, "grad_norm": 0.150390625, "learning_rate": 1.1608574733196264e-05, "loss": 1.6019, "step": 1764 }, { "epoch": 1.8102564102564103, "grad_norm": 0.267578125, "learning_rate": 1.1485048984476998e-05, "loss": 1.6711, "step": 1765 }, { "epoch": 1.8112820512820513, "grad_norm": 0.11572265625, "learning_rate": 1.1362168516226901e-05, "loss": 1.7137, "step": 1766 }, { "epoch": 1.8123076923076922, "grad_norm": 0.142578125, "learning_rate": 1.1239933660887853e-05, "loss": 1.5951, 
"step": 1767 }, { "epoch": 1.8133333333333335, "grad_norm": 0.169921875, "learning_rate": 1.111834474915513e-05, "loss": 1.7474, "step": 1768 }, { "epoch": 1.8143589743589743, "grad_norm": 0.1259765625, "learning_rate": 1.0997402109976518e-05, "loss": 1.6902, "step": 1769 }, { "epoch": 1.8153846153846154, "grad_norm": 0.1240234375, "learning_rate": 1.0877106070551174e-05, "loss": 1.7972, "step": 1770 }, { "epoch": 1.8164102564102564, "grad_norm": 0.1279296875, "learning_rate": 1.075745695632921e-05, "loss": 1.638, "step": 1771 }, { "epoch": 1.8174358974358973, "grad_norm": 0.1435546875, "learning_rate": 1.0638455091010358e-05, "loss": 1.7079, "step": 1772 }, { "epoch": 1.8184615384615386, "grad_norm": 0.1357421875, "learning_rate": 1.0520100796543281e-05, "loss": 1.7849, "step": 1773 }, { "epoch": 1.8194871794871794, "grad_norm": 0.1298828125, "learning_rate": 1.040239439312482e-05, "loss": 1.574, "step": 1774 }, { "epoch": 1.8205128205128205, "grad_norm": 0.1279296875, "learning_rate": 1.0285336199198859e-05, "loss": 1.6561, "step": 1775 }, { "epoch": 1.8215384615384616, "grad_norm": 0.1376953125, "learning_rate": 1.0168926531455652e-05, "loss": 1.7878, "step": 1776 }, { "epoch": 1.8225641025641026, "grad_norm": 0.1220703125, "learning_rate": 1.005316570483103e-05, "loss": 1.7037, "step": 1777 }, { "epoch": 1.8235897435897437, "grad_norm": 0.1279296875, "learning_rate": 9.938054032505306e-06, "loss": 1.7395, "step": 1778 }, { "epoch": 1.8246153846153845, "grad_norm": 0.11328125, "learning_rate": 9.823591825902645e-06, "loss": 1.7483, "step": 1779 }, { "epoch": 1.8256410256410256, "grad_norm": 0.11669921875, "learning_rate": 9.709779394690144e-06, "loss": 1.8379, "step": 1780 }, { "epoch": 1.8266666666666667, "grad_norm": 0.11279296875, "learning_rate": 9.596617046776917e-06, "loss": 1.6601, "step": 1781 }, { "epoch": 1.8276923076923077, "grad_norm": 0.1240234375, "learning_rate": 9.484105088313405e-06, "loss": 1.7486, "step": 1782 }, { "epoch": 1.8287179487179488, 
"grad_norm": 0.11572265625, "learning_rate": 9.37224382369048e-06, "loss": 1.7339, "step": 1783 }, { "epoch": 1.8297435897435896, "grad_norm": 0.1220703125, "learning_rate": 9.261033555538561e-06, "loss": 1.5874, "step": 1784 }, { "epoch": 1.830769230769231, "grad_norm": 0.12109375, "learning_rate": 9.150474584726925e-06, "loss": 1.6966, "step": 1785 }, { "epoch": 1.8317948717948718, "grad_norm": 0.1201171875, "learning_rate": 9.040567210362754e-06, "loss": 1.6595, "step": 1786 }, { "epoch": 1.8328205128205128, "grad_norm": 0.1162109375, "learning_rate": 8.931311729790503e-06, "loss": 1.6749, "step": 1787 }, { "epoch": 1.833846153846154, "grad_norm": 0.1318359375, "learning_rate": 8.822708438590871e-06, "loss": 1.6861, "step": 1788 }, { "epoch": 1.8348717948717947, "grad_norm": 0.1220703125, "learning_rate": 8.714757630580244e-06, "loss": 1.6407, "step": 1789 }, { "epoch": 1.835897435897436, "grad_norm": 0.11962890625, "learning_rate": 8.607459597809564e-06, "loss": 1.7336, "step": 1790 }, { "epoch": 1.8369230769230769, "grad_norm": 0.12060546875, "learning_rate": 8.500814630564013e-06, "loss": 1.7416, "step": 1791 }, { "epoch": 1.837948717948718, "grad_norm": 0.12060546875, "learning_rate": 8.394823017361747e-06, "loss": 1.6996, "step": 1792 }, { "epoch": 1.838974358974359, "grad_norm": 0.12158203125, "learning_rate": 8.289485044953443e-06, "loss": 1.6035, "step": 1793 }, { "epoch": 1.8399999999999999, "grad_norm": 0.1259765625, "learning_rate": 8.184800998321417e-06, "loss": 1.7121, "step": 1794 }, { "epoch": 1.8410256410256411, "grad_norm": 0.1259765625, "learning_rate": 8.080771160678763e-06, "loss": 1.6542, "step": 1795 }, { "epoch": 1.842051282051282, "grad_norm": 0.123046875, "learning_rate": 7.977395813468791e-06, "loss": 1.5937, "step": 1796 }, { "epoch": 1.843076923076923, "grad_norm": 0.11669921875, "learning_rate": 7.874675236364065e-06, "loss": 1.7373, "step": 1797 }, { "epoch": 1.8441025641025641, "grad_norm": 0.11669921875, "learning_rate": 
7.772609707265732e-06, "loss": 1.7249, "step": 1798 }, { "epoch": 1.845128205128205, "grad_norm": 0.1201171875, "learning_rate": 7.671199502302773e-06, "loss": 1.7792, "step": 1799 }, { "epoch": 1.8461538461538463, "grad_norm": 0.1201171875, "learning_rate": 7.57044489583128e-06, "loss": 1.7514, "step": 1800 }, { "epoch": 1.8461538461538463, "eval_loss": null, "eval_runtime": 73.7535, "eval_samples_per_second": 9.315, "eval_steps_per_second": 1.166, "step": 1800 }, { "epoch": 1.847179487179487, "grad_norm": 0.1171875, "learning_rate": 7.4703461604336555e-06, "loss": 1.7339, "step": 1801 }, { "epoch": 1.8482051282051282, "grad_norm": 0.12109375, "learning_rate": 7.370903566917914e-06, "loss": 1.7341, "step": 1802 }, { "epoch": 1.8492307692307692, "grad_norm": 0.12255859375, "learning_rate": 7.272117384316906e-06, "loss": 1.69, "step": 1803 }, { "epoch": 1.8502564102564103, "grad_norm": 0.123046875, "learning_rate": 7.173987879887684e-06, "loss": 1.6862, "step": 1804 }, { "epoch": 1.8512820512820514, "grad_norm": 0.1171875, "learning_rate": 7.076515319110688e-06, "loss": 1.6234, "step": 1805 }, { "epoch": 1.8523076923076922, "grad_norm": 0.1162109375, "learning_rate": 6.979699965688979e-06, "loss": 1.7593, "step": 1806 }, { "epoch": 1.8533333333333335, "grad_norm": 0.126953125, "learning_rate": 6.883542081547761e-06, "loss": 1.8088, "step": 1807 }, { "epoch": 1.8543589743589743, "grad_norm": 0.11865234375, "learning_rate": 6.788041926833383e-06, "loss": 1.6435, "step": 1808 }, { "epoch": 1.8553846153846154, "grad_norm": 0.1279296875, "learning_rate": 6.69319975991281e-06, "loss": 1.7918, "step": 1809 }, { "epoch": 1.8564102564102565, "grad_norm": 0.123046875, "learning_rate": 6.599015837372907e-06, "loss": 1.7355, "step": 1810 }, { "epoch": 1.8574358974358973, "grad_norm": 0.12060546875, "learning_rate": 6.5054904140196825e-06, "loss": 1.6086, "step": 1811 }, { "epoch": 1.8584615384615386, "grad_norm": 0.125, "learning_rate": 6.412623742877654e-06, "loss": 1.7231, 
"step": 1812 }, { "epoch": 1.8594871794871795, "grad_norm": 0.11962890625, "learning_rate": 6.320416075189101e-06, "loss": 1.7257, "step": 1813 }, { "epoch": 1.8605128205128205, "grad_norm": 0.1513671875, "learning_rate": 6.228867660413556e-06, "loss": 1.5682, "step": 1814 }, { "epoch": 1.8615384615384616, "grad_norm": 0.11962890625, "learning_rate": 6.1379787462268466e-06, "loss": 1.5165, "step": 1815 }, { "epoch": 1.8625641025641024, "grad_norm": 0.11572265625, "learning_rate": 6.04774957852064e-06, "loss": 1.6271, "step": 1816 }, { "epoch": 1.8635897435897437, "grad_norm": 0.11865234375, "learning_rate": 5.95818040140178e-06, "loss": 1.711, "step": 1817 }, { "epoch": 1.8646153846153846, "grad_norm": 0.11279296875, "learning_rate": 5.869271457191433e-06, "loss": 1.6795, "step": 1818 }, { "epoch": 1.8656410256410256, "grad_norm": 0.11767578125, "learning_rate": 5.781022986424744e-06, "loss": 1.6764, "step": 1819 }, { "epoch": 1.8666666666666667, "grad_norm": 0.119140625, "learning_rate": 5.693435227849874e-06, "loss": 1.7394, "step": 1820 }, { "epoch": 1.8676923076923075, "grad_norm": 0.12353515625, "learning_rate": 5.606508418427497e-06, "loss": 1.7931, "step": 1821 }, { "epoch": 1.8687179487179488, "grad_norm": 0.1279296875, "learning_rate": 5.520242793330216e-06, "loss": 1.6363, "step": 1822 }, { "epoch": 1.8697435897435897, "grad_norm": 0.1259765625, "learning_rate": 5.434638585941787e-06, "loss": 1.7022, "step": 1823 }, { "epoch": 1.8707692307692307, "grad_norm": 0.11767578125, "learning_rate": 5.349696027856593e-06, "loss": 1.731, "step": 1824 }, { "epoch": 1.8717948717948718, "grad_norm": 0.125, "learning_rate": 5.265415348879005e-06, "loss": 1.8044, "step": 1825 }, { "epoch": 1.8728205128205129, "grad_norm": 0.1416015625, "learning_rate": 5.181796777022713e-06, "loss": 1.6771, "step": 1826 }, { "epoch": 1.873846153846154, "grad_norm": 0.119140625, "learning_rate": 5.0988405385101464e-06, "loss": 1.6933, "step": 1827 }, { "epoch": 1.8748717948717948, 
"grad_norm": 0.1484375, "learning_rate": 5.016546857771892e-06, "loss": 1.796, "step": 1828 }, { "epoch": 1.8758974358974358, "grad_norm": 0.12158203125, "learning_rate": 4.934915957445912e-06, "loss": 1.7127, "step": 1829 }, { "epoch": 1.876923076923077, "grad_norm": 0.138671875, "learning_rate": 4.853948058377245e-06, "loss": 1.7744, "step": 1830 }, { "epoch": 1.877948717948718, "grad_norm": 0.126953125, "learning_rate": 4.773643379617165e-06, "loss": 1.8426, "step": 1831 }, { "epoch": 1.878974358974359, "grad_norm": 0.126953125, "learning_rate": 4.6940021384226095e-06, "loss": 1.7697, "step": 1832 }, { "epoch": 1.88, "grad_norm": 0.1279296875, "learning_rate": 4.615024550255725e-06, "loss": 1.7837, "step": 1833 }, { "epoch": 1.8810256410256412, "grad_norm": 0.11474609375, "learning_rate": 4.536710828783208e-06, "loss": 1.668, "step": 1834 }, { "epoch": 1.882051282051282, "grad_norm": 0.1220703125, "learning_rate": 4.45906118587569e-06, "loss": 1.7878, "step": 1835 }, { "epoch": 1.883076923076923, "grad_norm": 0.1435546875, "learning_rate": 4.3820758316071854e-06, "loss": 1.7563, "step": 1836 }, { "epoch": 1.8841025641025642, "grad_norm": 0.1259765625, "learning_rate": 4.305754974254561e-06, "loss": 1.8492, "step": 1837 }, { "epoch": 1.885128205128205, "grad_norm": 0.138671875, "learning_rate": 4.230098820296929e-06, "loss": 1.6562, "step": 1838 }, { "epoch": 1.8861538461538463, "grad_norm": 0.125, "learning_rate": 4.155107574415173e-06, "loss": 1.7502, "step": 1839 }, { "epoch": 1.8871794871794871, "grad_norm": 0.111328125, "learning_rate": 4.080781439491199e-06, "loss": 1.7128, "step": 1840 }, { "epoch": 1.8871794871794871, "eval_loss": null, "eval_runtime": 73.7728, "eval_samples_per_second": 9.312, "eval_steps_per_second": 1.166, "step": 1840 }, { "epoch": 1.8882051282051282, "grad_norm": 0.1494140625, "learning_rate": 4.007120616607657e-06, "loss": 1.7177, "step": 1841 }, { "epoch": 1.8892307692307693, "grad_norm": 0.11669921875, "learning_rate": 
3.9341253050471644e-06, "loss": 1.6549, "step": 1842 }, { "epoch": 1.8902564102564101, "grad_norm": 0.11279296875, "learning_rate": 3.8617957022918925e-06, "loss": 1.6022, "step": 1843 }, { "epoch": 1.8912820512820514, "grad_norm": 0.1171875, "learning_rate": 3.7901320040229783e-06, "loss": 1.7136, "step": 1844 }, { "epoch": 1.8923076923076922, "grad_norm": 0.11181640625, "learning_rate": 3.719134404120084e-06, "loss": 1.5839, "step": 1845 }, { "epoch": 1.8933333333333333, "grad_norm": 0.1220703125, "learning_rate": 3.6488030946606744e-06, "loss": 1.7486, "step": 1846 }, { "epoch": 1.8943589743589744, "grad_norm": 0.1328125, "learning_rate": 3.5791382659197126e-06, "loss": 1.8109, "step": 1847 }, { "epoch": 1.8953846153846152, "grad_norm": 0.11669921875, "learning_rate": 3.510140106369103e-06, "loss": 1.5618, "step": 1848 }, { "epoch": 1.8964102564102565, "grad_norm": 0.1298828125, "learning_rate": 3.441808802677027e-06, "loss": 1.7669, "step": 1849 }, { "epoch": 1.8974358974358974, "grad_norm": 0.14453125, "learning_rate": 3.37414453970758e-06, "loss": 1.77, "step": 1850 }, { "epoch": 1.8984615384615384, "grad_norm": 0.13671875, "learning_rate": 3.30714750052033e-06, "loss": 1.6975, "step": 1851 }, { "epoch": 1.8994871794871795, "grad_norm": 0.11669921875, "learning_rate": 3.240817866369622e-06, "loss": 1.7503, "step": 1852 }, { "epoch": 1.9005128205128206, "grad_norm": 0.12109375, "learning_rate": 3.1751558167042447e-06, "loss": 1.6731, "step": 1853 }, { "epoch": 1.9015384615384616, "grad_norm": 0.1484375, "learning_rate": 3.1101615291668773e-06, "loss": 1.6861, "step": 1854 }, { "epoch": 1.9025641025641025, "grad_norm": 0.1181640625, "learning_rate": 3.04583517959367e-06, "loss": 1.7205, "step": 1855 }, { "epoch": 1.9035897435897438, "grad_norm": 0.11865234375, "learning_rate": 2.9821769420136648e-06, "loss": 1.6448, "step": 1856 }, { "epoch": 1.9046153846153846, "grad_norm": 0.12158203125, "learning_rate": 2.919186988648459e-06, "loss": 1.8004, "step": 1857 }, 
{ "epoch": 1.9056410256410257, "grad_norm": 0.12255859375, "learning_rate": 2.8568654899116253e-06, "loss": 1.6852, "step": 1858 }, { "epoch": 1.9066666666666667, "grad_norm": 0.12890625, "learning_rate": 2.7952126144082933e-06, "loss": 1.5976, "step": 1859 }, { "epoch": 1.9076923076923076, "grad_norm": 0.1279296875, "learning_rate": 2.734228528934679e-06, "loss": 1.7504, "step": 1860 }, { "epoch": 1.9087179487179489, "grad_norm": 0.134765625, "learning_rate": 2.6739133984776953e-06, "loss": 1.715, "step": 1861 }, { "epoch": 1.9097435897435897, "grad_norm": 0.125, "learning_rate": 2.614267386214453e-06, "loss": 1.6602, "step": 1862 }, { "epoch": 1.9107692307692308, "grad_norm": 0.1279296875, "learning_rate": 2.5552906535117606e-06, "loss": 1.7012, "step": 1863 }, { "epoch": 1.9117948717948718, "grad_norm": 0.10693359375, "learning_rate": 2.4969833599258196e-06, "loss": 1.7204, "step": 1864 }, { "epoch": 1.9128205128205127, "grad_norm": 0.1396484375, "learning_rate": 2.4393456632016974e-06, "loss": 1.7067, "step": 1865 }, { "epoch": 1.913846153846154, "grad_norm": 0.12255859375, "learning_rate": 2.3823777192729377e-06, "loss": 1.6314, "step": 1866 }, { "epoch": 1.9148717948717948, "grad_norm": 0.12890625, "learning_rate": 2.3260796822610897e-06, "loss": 1.7604, "step": 1867 }, { "epoch": 1.9158974358974359, "grad_norm": 0.1240234375, "learning_rate": 2.2704517044754013e-06, "loss": 1.7093, "step": 1868 }, { "epoch": 1.916923076923077, "grad_norm": 0.1181640625, "learning_rate": 2.215493936412294e-06, "loss": 1.7074, "step": 1869 }, { "epoch": 1.9179487179487178, "grad_norm": 0.1259765625, "learning_rate": 2.161206526754972e-06, "loss": 1.6329, "step": 1870 }, { "epoch": 1.918974358974359, "grad_norm": 0.1259765625, "learning_rate": 2.1075896223731182e-06, "loss": 1.8274, "step": 1871 }, { "epoch": 1.92, "grad_norm": 0.1162109375, "learning_rate": 2.0546433683223396e-06, "loss": 1.6682, "step": 1872 }, { "epoch": 1.921025641025641, "grad_norm": 0.12451171875, 
"learning_rate": 2.0023679078439436e-06, "loss": 1.6497, "step": 1873 }, { "epoch": 1.922051282051282, "grad_norm": 0.1162109375, "learning_rate": 1.9507633823643846e-06, "loss": 1.6734, "step": 1874 }, { "epoch": 1.9230769230769231, "grad_norm": 0.1220703125, "learning_rate": 1.8998299314950118e-06, "loss": 1.5948, "step": 1875 }, { "epoch": 1.9241025641025642, "grad_norm": 0.1171875, "learning_rate": 1.849567693031684e-06, "loss": 1.7254, "step": 1876 }, { "epoch": 1.925128205128205, "grad_norm": 0.1162109375, "learning_rate": 1.7999768029542674e-06, "loss": 1.7084, "step": 1877 }, { "epoch": 1.926153846153846, "grad_norm": 0.1259765625, "learning_rate": 1.7510573954263864e-06, "loss": 1.7591, "step": 1878 }, { "epoch": 1.9271794871794872, "grad_norm": 0.10888671875, "learning_rate": 1.7028096027950913e-06, "loss": 1.6337, "step": 1879 }, { "epoch": 1.9282051282051282, "grad_norm": 0.1181640625, "learning_rate": 1.6552335555903298e-06, "loss": 1.8041, "step": 1880 }, { "epoch": 1.9282051282051282, "eval_loss": null, "eval_runtime": 73.79, "eval_samples_per_second": 9.31, "eval_steps_per_second": 1.165, "step": 1880 }, { "epoch": 1.9292307692307693, "grad_norm": 0.11083984375, "learning_rate": 1.608329382524809e-06, "loss": 1.6822, "step": 1881 }, { "epoch": 1.9302564102564101, "grad_norm": 0.12890625, "learning_rate": 1.5620972104934406e-06, "loss": 1.7225, "step": 1882 }, { "epoch": 1.9312820512820514, "grad_norm": 0.12890625, "learning_rate": 1.5165371645732284e-06, "loss": 1.6835, "step": 1883 }, { "epoch": 1.9323076923076923, "grad_norm": 0.11474609375, "learning_rate": 1.4716493680226596e-06, "loss": 1.6397, "step": 1884 }, { "epoch": 1.9333333333333333, "grad_norm": 0.119140625, "learning_rate": 1.4274339422816196e-06, "loss": 1.8027, "step": 1885 }, { "epoch": 1.9343589743589744, "grad_norm": 0.1298828125, "learning_rate": 1.3838910069708943e-06, "loss": 1.7934, "step": 1886 }, { "epoch": 1.9353846153846153, "grad_norm": 0.115234375, "learning_rate": 
1.3410206798919466e-06, "loss": 1.607, "step": 1887 }, { "epoch": 1.9364102564102565, "grad_norm": 0.1552734375, "learning_rate": 1.2988230770265286e-06, "loss": 1.6264, "step": 1888 }, { "epoch": 1.9374358974358974, "grad_norm": 0.130859375, "learning_rate": 1.2572983125363758e-06, "loss": 1.6873, "step": 1889 }, { "epoch": 1.9384615384615385, "grad_norm": 0.1259765625, "learning_rate": 1.216446498763013e-06, "loss": 1.5994, "step": 1890 }, { "epoch": 1.9394871794871795, "grad_norm": 0.1279296875, "learning_rate": 1.176267746227283e-06, "loss": 1.6298, "step": 1891 }, { "epoch": 1.9405128205128204, "grad_norm": 0.1201171875, "learning_rate": 1.1367621636291237e-06, "loss": 1.767, "step": 1892 }, { "epoch": 1.9415384615384617, "grad_norm": 0.12109375, "learning_rate": 1.0979298578472908e-06, "loss": 1.6063, "step": 1893 }, { "epoch": 1.9425641025641025, "grad_norm": 0.1328125, "learning_rate": 1.0597709339390804e-06, "loss": 1.7012, "step": 1894 }, { "epoch": 1.9435897435897436, "grad_norm": 0.119140625, "learning_rate": 1.0222854951399408e-06, "loss": 1.6754, "step": 1895 }, { "epoch": 1.9446153846153846, "grad_norm": 0.10888671875, "learning_rate": 9.854736428633604e-07, "loss": 1.6286, "step": 1896 }, { "epoch": 1.9456410256410255, "grad_norm": 0.12109375, "learning_rate": 9.49335476700397e-07, "loss": 1.8, "step": 1897 }, { "epoch": 1.9466666666666668, "grad_norm": 0.11181640625, "learning_rate": 9.138710944195938e-07, "loss": 1.5514, "step": 1898 }, { "epoch": 1.9476923076923076, "grad_norm": 0.12109375, "learning_rate": 8.790805919666189e-07, "loss": 1.5519, "step": 1899 }, { "epoch": 1.9487179487179487, "grad_norm": 0.119140625, "learning_rate": 8.449640634639876e-07, "loss": 1.68, "step": 1900 }, { "epoch": 1.9497435897435897, "grad_norm": 0.12890625, "learning_rate": 8.115216012108684e-07, "loss": 1.758, "step": 1901 }, { "epoch": 1.9507692307692308, "grad_norm": 0.11669921875, "learning_rate": 7.787532956828047e-07, "loss": 1.6502, "step": 1902 }, { 
"epoch": 1.9517948717948719, "grad_norm": 0.11669921875, "learning_rate": 7.466592355314383e-07, "loss": 1.768, "step": 1903 }, { "epoch": 1.9528205128205127, "grad_norm": 0.2294921875, "learning_rate": 7.152395075843421e-07, "loss": 1.7177, "step": 1904 }, { "epoch": 1.953846153846154, "grad_norm": 0.125, "learning_rate": 6.844941968447149e-07, "loss": 1.6972, "step": 1905 }, { "epoch": 1.9548717948717949, "grad_norm": 0.123046875, "learning_rate": 6.544233864911875e-07, "loss": 1.6343, "step": 1906 }, { "epoch": 1.955897435897436, "grad_norm": 0.11181640625, "learning_rate": 6.250271578776279e-07, "loss": 1.7698, "step": 1907 }, { "epoch": 1.956923076923077, "grad_norm": 0.12353515625, "learning_rate": 5.963055905328363e-07, "loss": 1.7028, "step": 1908 }, { "epoch": 1.9579487179487178, "grad_norm": 0.126953125, "learning_rate": 5.682587621604063e-07, "loss": 1.6817, "step": 1909 }, { "epoch": 1.9589743589743591, "grad_norm": 0.12158203125, "learning_rate": 5.408867486384472e-07, "loss": 1.7335, "step": 1910 }, { "epoch": 1.96, "grad_norm": 0.12158203125, "learning_rate": 5.141896240194732e-07, "loss": 1.743, "step": 1911 }, { "epoch": 1.961025641025641, "grad_norm": 0.119140625, "learning_rate": 4.881674605301533e-07, "loss": 1.7584, "step": 1912 }, { "epoch": 1.962051282051282, "grad_norm": 0.1142578125, "learning_rate": 4.6282032857100644e-07, "loss": 1.6998, "step": 1913 }, { "epoch": 1.963076923076923, "grad_norm": 0.1484375, "learning_rate": 4.381482967164285e-07, "loss": 1.6334, "step": 1914 }, { "epoch": 1.9641025641025642, "grad_norm": 0.11962890625, "learning_rate": 4.141514317143602e-07, "loss": 1.6697, "step": 1915 }, { "epoch": 1.965128205128205, "grad_norm": 0.1298828125, "learning_rate": 3.9082979848611975e-07, "loss": 1.8322, "step": 1916 }, { "epoch": 1.9661538461538461, "grad_norm": 0.1181640625, "learning_rate": 3.6818346012629236e-07, "loss": 1.7852, "step": 1917 }, { "epoch": 1.9671794871794872, "grad_norm": 0.11328125, "learning_rate": 
3.4621247790245226e-07, "loss": 1.5905, "step": 1918 }, { "epoch": 1.968205128205128, "grad_norm": 0.1328125, "learning_rate": 3.2491691125507983e-07, "loss": 1.8232, "step": 1919 }, { "epoch": 1.9692307692307693, "grad_norm": 0.12890625, "learning_rate": 3.0429681779739484e-07, "loss": 1.6393, "step": 1920 }, { "epoch": 1.9692307692307693, "eval_loss": null, "eval_runtime": 73.8001, "eval_samples_per_second": 9.309, "eval_steps_per_second": 1.165, "step": 1920 }, { "epoch": 1.9702564102564102, "grad_norm": 0.12060546875, "learning_rate": 2.8435225331521764e-07, "loss": 1.7637, "step": 1921 }, { "epoch": 1.9712820512820513, "grad_norm": 0.12890625, "learning_rate": 2.6508327176671956e-07, "loss": 1.5817, "step": 1922 }, { "epoch": 1.9723076923076923, "grad_norm": 0.12109375, "learning_rate": 2.4648992528236714e-07, "loss": 1.6483, "step": 1923 }, { "epoch": 1.9733333333333334, "grad_norm": 0.125, "learning_rate": 2.285722641647836e-07, "loss": 1.7738, "step": 1924 }, { "epoch": 1.9743589743589745, "grad_norm": 0.11083984375, "learning_rate": 2.1133033688858216e-07, "loss": 1.7075, "step": 1925 }, { "epoch": 1.9753846153846153, "grad_norm": 0.138671875, "learning_rate": 1.9476419010019953e-07, "loss": 1.5833, "step": 1926 }, { "epoch": 1.9764102564102564, "grad_norm": 0.12890625, "learning_rate": 1.7887386861784038e-07, "loss": 1.7602, "step": 1927 }, { "epoch": 1.9774358974358974, "grad_norm": 0.1298828125, "learning_rate": 1.6365941543131092e-07, "loss": 1.7269, "step": 1928 }, { "epoch": 1.9784615384615385, "grad_norm": 0.11474609375, "learning_rate": 1.4912087170196321e-07, "loss": 1.663, "step": 1929 }, { "epoch": 1.9794871794871796, "grad_norm": 0.123046875, "learning_rate": 1.3525827676247325e-07, "loss": 1.7939, "step": 1930 }, { "epoch": 1.9805128205128204, "grad_norm": 0.1259765625, "learning_rate": 1.2207166811684102e-07, "loss": 1.7722, "step": 1931 }, { "epoch": 1.9815384615384617, "grad_norm": 0.1357421875, "learning_rate": 1.0956108144025145e-07, 
"loss": 1.7227, "step": 1932 }, { "epoch": 1.9825641025641025, "grad_norm": 0.1123046875, "learning_rate": 9.772655057890822e-08, "loss": 1.6389, "step": 1933 }, { "epoch": 1.9835897435897436, "grad_norm": 0.12060546875, "learning_rate": 8.656810755008903e-08, "loss": 1.7773, "step": 1934 }, { "epoch": 1.9846153846153847, "grad_norm": 0.1162109375, "learning_rate": 7.608578254195143e-08, "loss": 1.6307, "step": 1935 }, { "epoch": 1.9856410256410255, "grad_norm": 0.123046875, "learning_rate": 6.62796039134772e-08, "loss": 1.6655, "step": 1936 }, { "epoch": 1.9866666666666668, "grad_norm": 0.1201171875, "learning_rate": 5.714959819433374e-08, "loss": 1.715, "step": 1937 }, { "epoch": 1.9876923076923076, "grad_norm": 0.1162109375, "learning_rate": 4.869579008498493e-08, "loss": 1.7424, "step": 1938 }, { "epoch": 1.9887179487179487, "grad_norm": 0.11767578125, "learning_rate": 4.0918202456413646e-08, "loss": 1.6549, "step": 1939 }, { "epoch": 1.9897435897435898, "grad_norm": 0.12109375, "learning_rate": 3.381685635017728e-08, "loss": 1.7639, "step": 1940 }, { "epoch": 1.9907692307692306, "grad_norm": 0.11474609375, "learning_rate": 2.7391770978296704e-08, "loss": 1.6407, "step": 1941 }, { "epoch": 1.991794871794872, "grad_norm": 0.11572265625, "learning_rate": 2.1642963723284003e-08, "loss": 1.656, "step": 1942 }, { "epoch": 1.9928205128205128, "grad_norm": 0.115234375, "learning_rate": 1.6570450138003733e-08, "loss": 1.5477, "step": 1943 }, { "epoch": 1.9938461538461538, "grad_norm": 0.12109375, "learning_rate": 1.2174243945672902e-08, "loss": 1.7781, "step": 1944 }, { "epoch": 1.994871794871795, "grad_norm": 0.1162109375, "learning_rate": 8.454357039860972e-09, "loss": 1.5747, "step": 1945 }, { "epoch": 1.9958974358974357, "grad_norm": 0.12255859375, "learning_rate": 5.410799484323325e-09, "loss": 1.8125, "step": 1946 }, { "epoch": 1.996923076923077, "grad_norm": 0.1630859375, "learning_rate": 3.0435795131678044e-09, "loss": 1.7589, "step": 1947 }, { "epoch": 
1.9979487179487179, "grad_norm": 0.11962890625, "learning_rate": 1.3527035306881707e-09, "loss": 1.66, "step": 1948 }, { "epoch": 1.998974358974359, "grad_norm": 0.12255859375, "learning_rate": 3.3817611139186353e-10, "loss": 1.6364, "step": 1949 }, { "epoch": 2.0, "grad_norm": 0.1123046875, "learning_rate": 0.0, "loss": 1.6677, "step": 1950 } ], "logging_steps": 1, "max_steps": 1950, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1266338201167462e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }