diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1 +1,22486 @@ -{"best_metric": null, "best_model_checkpoint": null, "epoch": 38.48745788094347, "eval_steps": 1000, "global_step": 514000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [{"epoch": 0.008985533291400845, "grad_norm": 0.0016701683634892106, "learning_rate": 4.9999996212343494e-05, "loss": 1.1088, "step": 200}, {"epoch": 0.01797106658280169, "grad_norm": 0.011517546139657497, "learning_rate": 4.999998445840733e-05, "loss": 1.0986, "step": 400}, {"epoch": 0.026956599874202535, "grad_norm": 0.008341927081346512, "learning_rate": 4.999996473570505e-05, "loss": 1.0995, "step": 600}, {"epoch": 0.03594213316560338, "grad_norm": 0.00020909779414068907, "learning_rate": 4.999993704424292e-05, "loss": 1.1061, "step": 800}, {"epoch": 0.044927666457004224, "grad_norm": 0.0004861349880229682, "learning_rate": 4.999990138402976e-05, "loss": 1.0986, "step": 1000}, {"epoch": 0.044927666457004224, "eval_loss": 2.7725830078125, "eval_runtime": 1124.9446, "eval_samples_per_second": 8.804, "eval_steps_per_second": 0.276, "step": 1000}, {"epoch": 0.05391319974840507, "grad_norm": 0.0002919256512541324, "learning_rate": 4.999985775507695e-05, "loss": 1.0986, "step": 1200}, {"epoch": 0.0628987330398059, "grad_norm": 0.0018022151198238134, "learning_rate": 4.99998061573984e-05, "loss": 1.0986, "step": 1400}, {"epoch": 0.07188426633120676, "grad_norm": 1.6316088438034058, "learning_rate": 4.9999746591010545e-05, "loss": 1.0974, "step": 1600}, {"epoch": 0.0808697996226076, "grad_norm": 3.418214797973633, "learning_rate": 4.999967905593237e-05, "loss": 1.104, "step": 1800}, {"epoch": 0.08985533291400845, "grad_norm": 0.019139207899570465, "learning_rate": 4.9999603552185416e-05, "loss": 1.1005, "step": 2000}, {"epoch": 0.08985533291400845, "eval_loss": 3.4651877880096436, "eval_runtime": 1072.8319, "eval_samples_per_second": 9.232, "eval_steps_per_second": 0.144, "step": 2000}, {"epoch": 0.09884086620540929, "grad_norm": 0.47359538078308105, "learning_rate": 4.999952007979374e-05, "loss": 1.1032, "step": 2200}, {"epoch": 0.10782639949681014, "grad_norm": 1.0411008596420288, "learning_rate": 4.999942863878394e-05, "loss": 1.0966, "step": 2400}, {"epoch": 0.11681193278821098, "grad_norm": 2.402155876159668, "learning_rate": 4.999932922918519e-05, "loss": 1.0791, "step": 2600}, {"epoch": 0.1257974660796118, "grad_norm": 1.485827088356018, "learning_rate": 4.999922185102915e-05, "loss": 1.0514, "step": 2800}, {"epoch": 0.13478299937101268, "grad_norm": 2.352109432220459, "learning_rate": 4.9999106504350065e-05, "loss": 1.0327, "step": 3000}, {"epoch": 0.13478299937101268, "eval_loss": 3.369852066040039, "eval_runtime": 1064.8233, "eval_samples_per_second": 9.301, "eval_steps_per_second": 0.146, "step": 3000}, {"epoch": 0.14376853266241352, "grad_norm": 0.7272612452507019, "learning_rate": 4.999898318918469e-05, "loss": 1.0379, "step": 3200}, {"epoch": 0.15275406595381436, "grad_norm": 1.021616816520691, "learning_rate": 4.999885190557234e-05, "loss": 1.0416, "step": 3400}, {"epoch": 0.1617395992452152, "grad_norm": 2.4565377235412598, "learning_rate": 4.999871265355485e-05, "loss": 1.0212, "step": 3600}, {"epoch": 0.17072513253661606, "grad_norm": 20.56285858154297, "learning_rate": 4.9998565433176624e-05, "loss": 1.0219, "step": 3800}, {"epoch": 0.1797106658280169, 
"grad_norm": 0.7909038662910461, "learning_rate": 4.9998410244484574e-05, "loss": 1.0075, "step": 4000}, {"epoch": 0.1797106658280169, "eval_loss": 3.339078903198242, "eval_runtime": 1066.4833, "eval_samples_per_second": 9.287, "eval_steps_per_second": 0.145, "step": 4000}, {"epoch": 0.18869619911941773, "grad_norm": 2.09454607963562, "learning_rate": 4.999824708752817e-05, "loss": 0.9825, "step": 4200}, {"epoch": 0.19768173241081857, "grad_norm": 2.223658323287964, "learning_rate": 4.999807596235943e-05, "loss": 0.9851, "step": 4400}, {"epoch": 0.20666726570221944, "grad_norm": 1.121969223022461, "learning_rate": 4.999789686903289e-05, "loss": 1.0041, "step": 4600}, {"epoch": 0.21565279899362028, "grad_norm": 4.0251312255859375, "learning_rate": 4.9997709807605626e-05, "loss": 0.9841, "step": 4800}, {"epoch": 0.22463833228502111, "grad_norm": 1.6437472105026245, "learning_rate": 4.9997514778137275e-05, "loss": 0.9483, "step": 5000}, {"epoch": 0.22463833228502111, "eval_loss": 3.2980644702911377, "eval_runtime": 1067.9785, "eval_samples_per_second": 9.274, "eval_steps_per_second": 0.145, "step": 5000}, {"epoch": 0.23362386557642195, "grad_norm": 0.8991021513938904, "learning_rate": 4.999731178069001e-05, "loss": 0.9541, "step": 5200}, {"epoch": 0.24260939886782282, "grad_norm": 3.1451597213745117, "learning_rate": 4.999710081532853e-05, "loss": 0.9589, "step": 5400}, {"epoch": 0.2515949321592236, "grad_norm": 2.142390489578247, "learning_rate": 4.999688188212007e-05, "loss": 0.9677, "step": 5600}, {"epoch": 0.2605804654506245, "grad_norm": 2.2872331142425537, "learning_rate": 4.999665498113444e-05, "loss": 0.962, "step": 5800}, {"epoch": 0.26956599874202536, "grad_norm": 2.730259418487549, "learning_rate": 4.999642011244394e-05, "loss": 0.9581, "step": 6000}, {"epoch": 0.26956599874202536, "eval_loss": 3.3341598510742188, "eval_runtime": 1066.5406, "eval_samples_per_second": 9.286, "eval_steps_per_second": 0.145, "step": 6000}, {"epoch": 0.2785515320334262, "grad_norm": 2.8416945934295654, "learning_rate": 4.999617727612344e-05, "loss": 0.9675, "step": 6200}, {"epoch": 0.28753706532482703, "grad_norm": 2.8148677349090576, "learning_rate": 4.9995926472250356e-05, "loss": 0.9411, "step": 6400}, {"epoch": 0.2965225986162279, "grad_norm": 1.3317234516143799, "learning_rate": 4.999566770090462e-05, "loss": 0.9279, "step": 6600}, {"epoch": 0.3055081319076287, "grad_norm": 3.403902053833008, "learning_rate": 4.999540096216872e-05, "loss": 0.9293, "step": 6800}, {"epoch": 0.31449366519902955, "grad_norm": 1.70892333984375, "learning_rate": 4.9995126256127675e-05, "loss": 0.9475, "step": 7000}, {"epoch": 0.31449366519902955, "eval_loss": 3.238970994949341, "eval_runtime": 1068.527, "eval_samples_per_second": 9.269, "eval_steps_per_second": 0.145, "step": 7000}, {"epoch": 0.3234791984904304, "grad_norm": 3.11971378326416, "learning_rate": 4.999484358286907e-05, "loss": 0.9465, "step": 7200}, {"epoch": 0.3324647317818312, "grad_norm": 1.395370364189148, "learning_rate": 4.9994552942482975e-05, "loss": 0.9445, "step": 7400}, {"epoch": 0.3414502650732321, "grad_norm": 6.5639424324035645, "learning_rate": 4.999425433506204e-05, "loss": 0.9263, "step": 7600}, {"epoch": 0.35043579836463296, "grad_norm": 2.2011075019836426, "learning_rate": 4.999394776070146e-05, "loss": 0.9193, "step": 7800}, {"epoch": 0.3594213316560338, "grad_norm": 2.9525458812713623, "learning_rate": 4.999363321949895e-05, "loss": 0.9405, "step": 8000}, {"epoch": 0.3594213316560338, "eval_loss": 3.2370519638061523, "eval_runtime": 
1068.6545, "eval_samples_per_second": 9.268, "eval_steps_per_second": 0.145, "step": 8000}, {"epoch": 0.36840686494743463, "grad_norm": 4.726866245269775, "learning_rate": 4.999331071155477e-05, "loss": 0.9391, "step": 8200}, {"epoch": 0.37739239823883547, "grad_norm": 2.23179292678833, "learning_rate": 4.9992980236971723e-05, "loss": 0.9352, "step": 8400}, {"epoch": 0.3863779315302363, "grad_norm": 2.175626516342163, "learning_rate": 4.9992641795855134e-05, "loss": 0.9359, "step": 8600}, {"epoch": 0.39536346482163714, "grad_norm": 5.489994525909424, "learning_rate": 4.9992295388312895e-05, "loss": 0.918, "step": 8800}, {"epoch": 0.404348998113038, "grad_norm": 1.484823226928711, "learning_rate": 4.9991941014455414e-05, "loss": 0.9075, "step": 9000}, {"epoch": 0.404348998113038, "eval_loss": 3.1722910404205322, "eval_runtime": 1070.0307, "eval_samples_per_second": 9.256, "eval_steps_per_second": 0.145, "step": 9000}, {"epoch": 0.4133345314044389, "grad_norm": 1.1743195056915283, "learning_rate": 4.9991578674395656e-05, "loss": 0.9116, "step": 9200}, {"epoch": 0.4223200646958397, "grad_norm": 4.027889728546143, "learning_rate": 4.999120836824912e-05, "loss": 0.9023, "step": 9400}, {"epoch": 0.43130559798724055, "grad_norm": 3.1647088527679443, "learning_rate": 4.9990830096133826e-05, "loss": 0.8992, "step": 9600}, {"epoch": 0.4402911312786414, "grad_norm": 1.6494026184082031, "learning_rate": 4.9990443858170366e-05, "loss": 0.8881, "step": 9800}, {"epoch": 0.44927666457004223, "grad_norm": 2.5967679023742676, "learning_rate": 4.999004965448184e-05, "loss": 0.8889, "step": 10000}, {"epoch": 0.44927666457004223, "eval_loss": 3.1767914295196533, "eval_runtime": 1067.4091, "eval_samples_per_second": 9.279, "eval_steps_per_second": 0.145, "step": 10000}, {"epoch": 0.45826219786144307, "grad_norm": 2.703774929046631, "learning_rate": 4.998964748519391e-05, "loss": 0.8845, "step": 10200}, {"epoch": 0.4672477311528439, "grad_norm": 5.934618949890137, "learning_rate": 4.998923735043477e-05, "loss": 0.899, "step": 10400}, {"epoch": 0.47623326444424474, "grad_norm": 7.952963352203369, "learning_rate": 4.9988819250335136e-05, "loss": 0.8968, "step": 10600}, {"epoch": 0.48521879773564563, "grad_norm": 3.2846908569335938, "learning_rate": 4.99883931850283e-05, "loss": 0.8687, "step": 10800}, {"epoch": 0.4942043310270465, "grad_norm": 1.9633086919784546, "learning_rate": 4.998795915465005e-05, "loss": 0.8537, "step": 11000}, {"epoch": 0.4942043310270465, "eval_loss": 3.1828198432922363, "eval_runtime": 1068.8128, "eval_samples_per_second": 9.266, "eval_steps_per_second": 0.145, "step": 11000}, {"epoch": 0.5031898643184473, "grad_norm": 6.807458400726318, "learning_rate": 4.9987517159338744e-05, "loss": 0.8482, "step": 11200}, {"epoch": 0.5121753976098481, "grad_norm": 2.9921388626098633, "learning_rate": 4.998706719923526e-05, "loss": 0.8662, "step": 11400}, {"epoch": 0.521160930901249, "grad_norm": 0.7828212380409241, "learning_rate": 4.998660927448304e-05, "loss": 0.88, "step": 11600}, {"epoch": 0.5301464641926499, "grad_norm": 3.1086294651031494, "learning_rate": 4.9986143385228026e-05, "loss": 0.8536, "step": 11800}, {"epoch": 0.5391319974840507, "grad_norm": 3.759007453918457, "learning_rate": 4.998566953161874e-05, "loss": 0.8321, "step": 12000}, {"epoch": 0.5391319974840507, "eval_loss": 3.1765565872192383, "eval_runtime": 1069.9445, "eval_samples_per_second": 9.257, "eval_steps_per_second": 0.145, "step": 12000}, {"epoch": 0.5481175307754516, "grad_norm": 4.347619533538818, "learning_rate": 
4.9985187713806206e-05, "loss": 0.8713, "step": 12200}, {"epoch": 0.5571030640668524, "grad_norm": 2.748655080795288, "learning_rate": 4.9984697931944024e-05, "loss": 0.8457, "step": 12400}, {"epoch": 0.5660885973582532, "grad_norm": 2.891540288925171, "learning_rate": 4.998420018618829e-05, "loss": 0.8212, "step": 12600}, {"epoch": 0.5750741306496541, "grad_norm": 4.089766025543213, "learning_rate": 4.998369447669768e-05, "loss": 0.8288, "step": 12800}, {"epoch": 0.5840596639410549, "grad_norm": 4.722995758056641, "learning_rate": 4.9983180803633376e-05, "loss": 0.8757, "step": 13000}, {"epoch": 0.5840596639410549, "eval_loss": 3.168459892272949, "eval_runtime": 1070.7464, "eval_samples_per_second": 9.25, "eval_steps_per_second": 0.145, "step": 13000}, {"epoch": 0.5930451972324557, "grad_norm": 7.390491008758545, "learning_rate": 4.998265916715912e-05, "loss": 0.8477, "step": 13200}, {"epoch": 0.6020307305238566, "grad_norm": 2.4633262157440186, "learning_rate": 4.9982129567441185e-05, "loss": 0.8415, "step": 13400}, {"epoch": 0.6110162638152574, "grad_norm": 5.4892473220825195, "learning_rate": 4.998159200464837e-05, "loss": 0.8176, "step": 13600}, {"epoch": 0.6200017971066583, "grad_norm": 4.862381458282471, "learning_rate": 4.998104647895203e-05, "loss": 0.8336, "step": 13800}, {"epoch": 0.6289873303980591, "grad_norm": 8.079172134399414, "learning_rate": 4.998049299052606e-05, "loss": 0.8147, "step": 14000}, {"epoch": 0.6289873303980591, "eval_loss": 3.1354148387908936, "eval_runtime": 1070.1274, "eval_samples_per_second": 9.255, "eval_steps_per_second": 0.145, "step": 14000}, {"epoch": 0.6379728636894599, "grad_norm": 2.196859359741211, "learning_rate": 4.997993153954688e-05, "loss": 0.8196, "step": 14200}, {"epoch": 0.6469583969808608, "grad_norm": 2.802729606628418, "learning_rate": 4.997936212619344e-05, "loss": 0.8218, "step": 14400}, {"epoch": 0.6559439302722616, "grad_norm": 5.947813510894775, "learning_rate": 4.997878475064726e-05, "loss": 0.8178, "step": 14600}, {"epoch": 0.6649294635636624, "grad_norm": 4.929244041442871, "learning_rate": 4.9978199413092364e-05, "loss": 0.849, "step": 14800}, {"epoch": 0.6739149968550634, "grad_norm": 3.7185091972351074, "learning_rate": 4.9977606113715336e-05, "loss": 0.8132, "step": 15000}, {"epoch": 0.6739149968550634, "eval_loss": 3.086395263671875, "eval_runtime": 1123.3847, "eval_samples_per_second": 8.816, "eval_steps_per_second": 0.138, "step": 15000}, {"epoch": 0.6829005301464642, "grad_norm": 3.6919984817504883, "learning_rate": 4.9977004852705293e-05, "loss": 0.8171, "step": 15200}, {"epoch": 0.6918860634378651, "grad_norm": 3.0211970806121826, "learning_rate": 4.997639563025388e-05, "loss": 0.8394, "step": 15400}, {"epoch": 0.7008715967292659, "grad_norm": 3.166466236114502, "learning_rate": 4.99757784465553e-05, "loss": 0.7978, "step": 15600}, {"epoch": 0.7098571300206667, "grad_norm": 3.316209554672241, "learning_rate": 4.997515330180627e-05, "loss": 0.8196, "step": 15800}, {"epoch": 0.7188426633120676, "grad_norm": 3.4489612579345703, "learning_rate": 4.997452019620606e-05, "loss": 0.8218, "step": 16000}, {"epoch": 0.7188426633120676, "eval_loss": 3.1093759536743164, "eval_runtime": 1119.6409, "eval_samples_per_second": 8.846, "eval_steps_per_second": 0.138, "step": 16000}, {"epoch": 0.7278281966034684, "grad_norm": 7.543302059173584, "learning_rate": 4.997387912995647e-05, "loss": 0.7442, "step": 16200}, {"epoch": 0.7368137298948693, "grad_norm": 5.488494873046875, "learning_rate": 4.9973230103261834e-05, "loss": 0.8101, 
"step": 16400}, {"epoch": 0.7457992631862701, "grad_norm": 6.828782081604004, "learning_rate": 4.997257311632905e-05, "loss": 0.796, "step": 16600}, {"epoch": 0.7547847964776709, "grad_norm": 3.4980998039245605, "learning_rate": 4.997190816936751e-05, "loss": 0.8147, "step": 16800}, {"epoch": 0.7637703297690718, "grad_norm": 4.646483421325684, "learning_rate": 4.9971235262589175e-05, "loss": 0.8082, "step": 17000}, {"epoch": 0.7637703297690718, "eval_loss": 3.0615007877349854, "eval_runtime": 1118.9871, "eval_samples_per_second": 8.851, "eval_steps_per_second": 0.139, "step": 17000}, {"epoch": 0.7727558630604726, "grad_norm": 4.960477828979492, "learning_rate": 4.997055439620854e-05, "loss": 0.7868, "step": 17200}, {"epoch": 0.7817413963518735, "grad_norm": 5.231990337371826, "learning_rate": 4.9969865570442634e-05, "loss": 0.7698, "step": 17400}, {"epoch": 0.7907269296432743, "grad_norm": 6.0175065994262695, "learning_rate": 4.9969168785511e-05, "loss": 0.7753, "step": 17600}, {"epoch": 0.7997124629346751, "grad_norm": 1.7933512926101685, "learning_rate": 4.9968464041635765e-05, "loss": 0.8048, "step": 17800}, {"epoch": 0.808697996226076, "grad_norm": 2.3188130855560303, "learning_rate": 4.996775133904156e-05, "loss": 0.8065, "step": 18000}, {"epoch": 0.808697996226076, "eval_loss": 2.9708292484283447, "eval_runtime": 1121.2171, "eval_samples_per_second": 8.833, "eval_steps_per_second": 0.138, "step": 18000}, {"epoch": 0.8176835295174769, "grad_norm": 6.4882049560546875, "learning_rate": 4.996703067795554e-05, "loss": 0.7768, "step": 18200}, {"epoch": 0.8266690628088778, "grad_norm": 6.340662956237793, "learning_rate": 4.996630205860744e-05, "loss": 0.7618, "step": 18400}, {"epoch": 0.8356545961002786, "grad_norm": 2.5629725456237793, "learning_rate": 4.99655654812295e-05, "loss": 0.7907, "step": 18600}, {"epoch": 0.8446401293916794, "grad_norm": 2.3929648399353027, "learning_rate": 4.99648209460565e-05, "loss": 0.7728, "step": 18800}, {"epoch": 0.8536256626830803, "grad_norm": 8.27813720703125, "learning_rate": 4.9964068453325776e-05, "loss": 0.7344, "step": 19000}, {"epoch": 0.8536256626830803, "eval_loss": 2.9753618240356445, "eval_runtime": 1119.6944, "eval_samples_per_second": 8.845, "eval_steps_per_second": 0.138, "step": 19000}, {"epoch": 0.8626111959744811, "grad_norm": 3.184513568878174, "learning_rate": 4.996330800327716e-05, "loss": 0.7734, "step": 19200}, {"epoch": 0.8715967292658819, "grad_norm": 6.273008823394775, "learning_rate": 4.9962539596153065e-05, "loss": 0.7692, "step": 19400}, {"epoch": 0.8805822625572828, "grad_norm": 5.725162506103516, "learning_rate": 4.996176323219842e-05, "loss": 0.7814, "step": 19600}, {"epoch": 0.8895677958486836, "grad_norm": 5.493536949157715, "learning_rate": 4.996097891166069e-05, "loss": 0.7704, "step": 19800}, {"epoch": 0.8985533291400845, "grad_norm": 5.661196708679199, "learning_rate": 4.9960186634789874e-05, "loss": 0.8059, "step": 20000}, {"epoch": 0.8985533291400845, "eval_loss": 2.985053062438965, "eval_runtime": 1118.2825, "eval_samples_per_second": 8.856, "eval_steps_per_second": 0.139, "step": 20000}, {"epoch": 0.9075388624314853, "grad_norm": 6.618274211883545, "learning_rate": 4.995938640183851e-05, "loss": 0.7728, "step": 20200}, {"epoch": 0.9165243957228861, "grad_norm": 17.2467041015625, "learning_rate": 4.995857821306169e-05, "loss": 0.7402, "step": 20400}, {"epoch": 0.925509929014287, "grad_norm": 4.441402912139893, "learning_rate": 4.9957762068717e-05, "loss": 0.7789, "step": 20600}, {"epoch": 0.9344954623056878, 
"grad_norm": 2.338825225830078, "learning_rate": 4.99569379690646e-05, "loss": 0.7656, "step": 20800}, {"epoch": 0.9434809955970886, "grad_norm": 3.987342357635498, "learning_rate": 4.9956105914367175e-05, "loss": 0.7412, "step": 21000}, {"epoch": 0.9434809955970886, "eval_loss": 2.933100700378418, "eval_runtime": 1131.2007, "eval_samples_per_second": 8.755, "eval_steps_per_second": 0.137, "step": 21000}, {"epoch": 0.9524665288884895, "grad_norm": 9.93287467956543, "learning_rate": 4.9955265904889936e-05, "loss": 0.7687, "step": 21200}, {"epoch": 0.9614520621798903, "grad_norm": 3.2046945095062256, "learning_rate": 4.995441794090064e-05, "loss": 0.7305, "step": 21400}, {"epoch": 0.9704375954712913, "grad_norm": 2.932640790939331, "learning_rate": 4.9953562022669575e-05, "loss": 0.7675, "step": 21600}, {"epoch": 0.9794231287626921, "grad_norm": 1.4578217267990112, "learning_rate": 4.995269815046957e-05, "loss": 0.7412, "step": 21800}, {"epoch": 0.988408662054093, "grad_norm": 3.856112480163574, "learning_rate": 4.9951826324575974e-05, "loss": 0.7751, "step": 22000}, {"epoch": 0.988408662054093, "eval_loss": 3.065196990966797, "eval_runtime": 1131.3352, "eval_samples_per_second": 8.754, "eval_steps_per_second": 0.137, "step": 22000}, {"epoch": 0.9973941953454938, "grad_norm": 5.718069076538086, "learning_rate": 4.9950946545266695e-05, "loss": 0.7576, "step": 22200}, {"epoch": 1.0063797286368945, "grad_norm": 7.1981401443481445, "learning_rate": 4.9950058812822154e-05, "loss": 0.7669, "step": 22400}, {"epoch": 1.0153652619282953, "grad_norm": 3.5773613452911377, "learning_rate": 4.994916312752532e-05, "loss": 0.7544, "step": 22600}, {"epoch": 1.0243507952196962, "grad_norm": 4.548768043518066, "learning_rate": 4.9948259489661695e-05, "loss": 0.7895, "step": 22800}, {"epoch": 1.0333363285110972, "grad_norm": 3.69889497756958, "learning_rate": 4.994734789951932e-05, "loss": 0.7491, "step": 23000}, {"epoch": 1.0333363285110972, "eval_loss": 3.0196194648742676, "eval_runtime": 1131.3469, "eval_samples_per_second": 8.754, "eval_steps_per_second": 0.137, "step": 23000}, {"epoch": 1.042321861802498, "grad_norm": 3.7836413383483887, "learning_rate": 4.994642835738875e-05, "loss": 0.7269, "step": 23200}, {"epoch": 1.051307395093899, "grad_norm": 6.627780914306641, "learning_rate": 4.9945500863563105e-05, "loss": 0.6858, "step": 23400}, {"epoch": 1.0602929283852998, "grad_norm": 4.019529819488525, "learning_rate": 4.994456541833802e-05, "loss": 0.742, "step": 23600}, {"epoch": 1.0692784616767006, "grad_norm": 5.022628307342529, "learning_rate": 4.994362202201166e-05, "loss": 0.7332, "step": 23800}, {"epoch": 1.0782639949681014, "grad_norm": 12.518102645874023, "learning_rate": 4.994267067488474e-05, "loss": 0.7081, "step": 24000}, {"epoch": 1.0782639949681014, "eval_loss": 3.018568992614746, "eval_runtime": 1130.4061, "eval_samples_per_second": 8.761, "eval_steps_per_second": 0.137, "step": 24000}, {"epoch": 1.0872495282595023, "grad_norm": 2.7211592197418213, "learning_rate": 4.9941711377260506e-05, "loss": 0.7172, "step": 24200}, {"epoch": 1.0962350615509031, "grad_norm": 3.2140583992004395, "learning_rate": 4.994074412944473e-05, "loss": 0.7231, "step": 24400}, {"epoch": 1.105220594842304, "grad_norm": 0.7109707593917847, "learning_rate": 4.993976893174572e-05, "loss": 0.7293, "step": 24600}, {"epoch": 1.1142061281337048, "grad_norm": 9.078465461730957, "learning_rate": 4.993878578447433e-05, "loss": 0.7207, "step": 24800}, {"epoch": 1.1231916614251056, "grad_norm": 5.582509994506836, 
"learning_rate": 4.993779468794394e-05, "loss": 0.7292, "step": 25000}, {"epoch": 1.1231916614251056, "eval_loss": 2.892444133758545, "eval_runtime": 1130.6944, "eval_samples_per_second": 8.759, "eval_steps_per_second": 0.137, "step": 25000}, {"epoch": 1.1321771947165065, "grad_norm": 3.1292569637298584, "learning_rate": 4.9936795642470444e-05, "loss": 0.7389, "step": 25200}, {"epoch": 1.1411627280079073, "grad_norm": 2.5674803256988525, "learning_rate": 4.993578864837232e-05, "loss": 0.7215, "step": 25400}, {"epoch": 1.1501482612993081, "grad_norm": 2.9022293090820312, "learning_rate": 4.9934773705970514e-05, "loss": 0.7025, "step": 25600}, {"epoch": 1.159133794590709, "grad_norm": 10.041083335876465, "learning_rate": 4.9933750815588566e-05, "loss": 0.7249, "step": 25800}, {"epoch": 1.1681193278821098, "grad_norm": 5.979797840118408, "learning_rate": 4.9932719977552514e-05, "loss": 0.7304, "step": 26000}, {"epoch": 1.1681193278821098, "eval_loss": 2.932370185852051, "eval_runtime": 1084.371, "eval_samples_per_second": 9.133, "eval_steps_per_second": 0.143, "step": 26000}, {"epoch": 1.1771048611735107, "grad_norm": 2.0028152465820312, "learning_rate": 4.993168119219093e-05, "loss": 0.7482, "step": 26200}, {"epoch": 1.1860903944649115, "grad_norm": 2.630038022994995, "learning_rate": 4.993063445983495e-05, "loss": 0.7324, "step": 26400}, {"epoch": 1.1950759277563123, "grad_norm": 6.610321044921875, "learning_rate": 4.992957978081819e-05, "loss": 0.7263, "step": 26600}, {"epoch": 1.2040614610477132, "grad_norm": 3.0929627418518066, "learning_rate": 4.992851715547685e-05, "loss": 0.7191, "step": 26800}, {"epoch": 1.213046994339114, "grad_norm": 5.623810768127441, "learning_rate": 4.992744658414964e-05, "loss": 0.7092, "step": 27000}, {"epoch": 1.213046994339114, "eval_loss": 2.992058038711548, "eval_runtime": 1088.476, "eval_samples_per_second": 9.099, "eval_steps_per_second": 0.142, "step": 27000}, {"epoch": 1.2220325276305148, "grad_norm": 4.626497745513916, "learning_rate": 4.9926368067177806e-05, "loss": 0.7309, "step": 27200}, {"epoch": 1.2310180609219157, "grad_norm": 2.491546630859375, "learning_rate": 4.9925281604905126e-05, "loss": 0.7215, "step": 27400}, {"epoch": 1.2400035942133165, "grad_norm": 5.404864311218262, "learning_rate": 4.992418719767791e-05, "loss": 0.6825, "step": 27600}, {"epoch": 1.2489891275047174, "grad_norm": 3.231696605682373, "learning_rate": 4.9923084845845e-05, "loss": 0.7371, "step": 27800}, {"epoch": 1.2579746607961182, "grad_norm": 3.4389524459838867, "learning_rate": 4.992197454975778e-05, "loss": 0.7055, "step": 28000}, {"epoch": 1.2579746607961182, "eval_loss": 2.9535281658172607, "eval_runtime": 1087.0884, "eval_samples_per_second": 9.111, "eval_steps_per_second": 0.143, "step": 28000}, {"epoch": 1.266960194087519, "grad_norm": 2.275574207305908, "learning_rate": 4.992085630977014e-05, "loss": 0.722, "step": 28200}, {"epoch": 1.2759457273789199, "grad_norm": 3.3943276405334473, "learning_rate": 4.991973012623853e-05, "loss": 0.7129, "step": 28400}, {"epoch": 1.2849312606703207, "grad_norm": 3.186497688293457, "learning_rate": 4.9918595999521924e-05, "loss": 0.7351, "step": 28600}, {"epoch": 1.2939167939617215, "grad_norm": 10.006003379821777, "learning_rate": 4.991745392998182e-05, "loss": 0.7021, "step": 28800}, {"epoch": 1.3029023272531224, "grad_norm": 4.930509567260742, "learning_rate": 4.991630391798227e-05, "loss": 0.7292, "step": 29000}, {"epoch": 1.3029023272531224, "eval_loss": 2.9845774173736572, "eval_runtime": 1084.0245, 
"eval_samples_per_second": 9.136, "eval_steps_per_second": 0.143, "step": 29000}, {"epoch": 1.3118878605445232, "grad_norm": 1.6518604755401611, "learning_rate": 4.991514596388981e-05, "loss": 0.7086, "step": 29200}, {"epoch": 1.320873393835924, "grad_norm": 4.181282043457031, "learning_rate": 4.991398006807357e-05, "loss": 0.7083, "step": 29400}, {"epoch": 1.329858927127325, "grad_norm": 10.062579154968262, "learning_rate": 4.991280623090516e-05, "loss": 0.753, "step": 29600}, {"epoch": 1.3388444604187257, "grad_norm": 6.119633197784424, "learning_rate": 4.991162445275876e-05, "loss": 0.6906, "step": 29800}, {"epoch": 1.3478299937101266, "grad_norm": 7.6824822425842285, "learning_rate": 4.9910434734011046e-05, "loss": 0.7234, "step": 30000}, {"epoch": 1.3478299937101266, "eval_loss": 2.945618152618408, "eval_runtime": 1085.7029, "eval_samples_per_second": 9.122, "eval_steps_per_second": 0.143, "step": 30000}, {"epoch": 1.3568155270015274, "grad_norm": 4.914371490478516, "learning_rate": 4.990923707504125e-05, "loss": 0.6996, "step": 30200}, {"epoch": 1.3658010602929282, "grad_norm": 4.89448881149292, "learning_rate": 4.9908031476231124e-05, "loss": 0.7198, "step": 30400}, {"epoch": 1.3747865935843293, "grad_norm": 1.3539308309555054, "learning_rate": 4.990681793796495e-05, "loss": 0.698, "step": 30600}, {"epoch": 1.3837721268757301, "grad_norm": 3.3933920860290527, "learning_rate": 4.9905596460629555e-05, "loss": 0.7112, "step": 30800}, {"epoch": 1.392757660167131, "grad_norm": 3.926790952682495, "learning_rate": 4.9904367044614275e-05, "loss": 0.7554, "step": 31000}, {"epoch": 1.392757660167131, "eval_loss": 2.94183611869812, "eval_runtime": 1086.8024, "eval_samples_per_second": 9.113, "eval_steps_per_second": 0.143, "step": 31000}, {"epoch": 1.4017431934585318, "grad_norm": 2.5616230964660645, "learning_rate": 4.9903129690311e-05, "loss": 0.7149, "step": 31200}, {"epoch": 1.4107287267499327, "grad_norm": 2.269793748855591, "learning_rate": 4.990188439811412e-05, "loss": 0.7309, "step": 31400}, {"epoch": 1.4197142600413335, "grad_norm": 4.201299667358398, "learning_rate": 4.990063116842059e-05, "loss": 0.7157, "step": 31600}, {"epoch": 1.4286997933327343, "grad_norm": 3.891510009765625, "learning_rate": 4.989937000162987e-05, "loss": 0.7113, "step": 31800}, {"epoch": 1.4376853266241352, "grad_norm": 8.882272720336914, "learning_rate": 4.9898100898143955e-05, "loss": 0.6696, "step": 32000}, {"epoch": 1.4376853266241352, "eval_loss": 2.988067626953125, "eval_runtime": 1086.6628, "eval_samples_per_second": 9.114, "eval_steps_per_second": 0.143, "step": 32000}, {"epoch": 1.446670859915536, "grad_norm": 5.083052158355713, "learning_rate": 4.989682385836738e-05, "loss": 0.7092, "step": 32200}, {"epoch": 1.4556563932069368, "grad_norm": 7.371493339538574, "learning_rate": 4.989553888270719e-05, "loss": 0.7188, "step": 32400}, {"epoch": 1.4646419264983377, "grad_norm": 2.6267755031585693, "learning_rate": 4.989424597157299e-05, "loss": 0.6744, "step": 32600}, {"epoch": 1.4736274597897385, "grad_norm": 5.069836616516113, "learning_rate": 4.9892945125376896e-05, "loss": 0.7124, "step": 32800}, {"epoch": 1.4826129930811394, "grad_norm": 18.678049087524414, "learning_rate": 4.989163634453353e-05, "loss": 0.6928, "step": 33000}, {"epoch": 1.4826129930811394, "eval_loss": 2.9007580280303955, "eval_runtime": 1085.795, "eval_samples_per_second": 9.121, "eval_steps_per_second": 0.143, "step": 33000}, {"epoch": 1.4915985263725402, "grad_norm": 7.033535957336426, "learning_rate": 4.989031962946009e-05, 
"loss": 0.7045, "step": 33200}, {"epoch": 1.500584059663941, "grad_norm": 2.6740469932556152, "learning_rate": 4.988899498057628e-05, "loss": 0.7225, "step": 33400}, {"epoch": 1.5095695929553419, "grad_norm": 5.661626815795898, "learning_rate": 4.988766239830431e-05, "loss": 0.7058, "step": 33600}, {"epoch": 1.5185551262467427, "grad_norm": 10.127273559570312, "learning_rate": 4.988632188306896e-05, "loss": 0.7044, "step": 33800}, {"epoch": 1.5275406595381436, "grad_norm": 9.424492835998535, "learning_rate": 4.988497343529753e-05, "loss": 0.6702, "step": 34000}, {"epoch": 1.5275406595381436, "eval_loss": 2.8689780235290527, "eval_runtime": 1086.8402, "eval_samples_per_second": 9.113, "eval_steps_per_second": 0.143, "step": 34000}, {"epoch": 1.5365261928295444, "grad_norm": 4.340188503265381, "learning_rate": 4.988361705541982e-05, "loss": 0.663, "step": 34200}, {"epoch": 1.5455117261209452, "grad_norm": 5.512271881103516, "learning_rate": 4.988225274386819e-05, "loss": 0.7331, "step": 34400}, {"epoch": 1.5544972594123463, "grad_norm": 5.91928243637085, "learning_rate": 4.9880880501077496e-05, "loss": 0.7175, "step": 34600}, {"epoch": 1.5634827927037471, "grad_norm": 2.7053489685058594, "learning_rate": 4.987950032748516e-05, "loss": 0.6993, "step": 34800}, {"epoch": 1.572468325995148, "grad_norm": 6.583710670471191, "learning_rate": 4.9878112223531106e-05, "loss": 0.6826, "step": 35000}, {"epoch": 1.572468325995148, "eval_loss": 2.9143316745758057, "eval_runtime": 1083.335, "eval_samples_per_second": 9.142, "eval_steps_per_second": 0.143, "step": 35000}, {"epoch": 1.5814538592865488, "grad_norm": 3.8892221450805664, "learning_rate": 4.98767161896578e-05, "loss": 0.7215, "step": 35200}, {"epoch": 1.5904393925779496, "grad_norm": 5.868275165557861, "learning_rate": 4.987531222631022e-05, "loss": 0.6736, "step": 35400}, {"epoch": 1.5994249258693505, "grad_norm": 4.020185947418213, "learning_rate": 4.9873900333935886e-05, "loss": 0.7027, "step": 35600}, {"epoch": 1.6084104591607513, "grad_norm": 6.451934814453125, "learning_rate": 4.987248051298484e-05, "loss": 0.7045, "step": 35800}, {"epoch": 1.6173959924521522, "grad_norm": 8.390814781188965, "learning_rate": 4.987105276390965e-05, "loss": 0.6964, "step": 36000}, {"epoch": 1.6173959924521522, "eval_loss": 2.856686592102051, "eval_runtime": 1080.9016, "eval_samples_per_second": 9.163, "eval_steps_per_second": 0.143, "step": 36000}, {"epoch": 1.626381525743553, "grad_norm": 8.42429256439209, "learning_rate": 4.9869617087165424e-05, "loss": 0.6867, "step": 36200}, {"epoch": 1.6353670590349538, "grad_norm": 3.3174638748168945, "learning_rate": 4.9868173483209756e-05, "loss": 0.6841, "step": 36400}, {"epoch": 1.6443525923263547, "grad_norm": 5.016312122344971, "learning_rate": 4.986672195250282e-05, "loss": 0.6902, "step": 36600}, {"epoch": 1.6533381256177555, "grad_norm": 2.4442625045776367, "learning_rate": 4.986526249550729e-05, "loss": 0.7003, "step": 36800}, {"epoch": 1.6623236589091563, "grad_norm": 7.444258213043213, "learning_rate": 4.9863795112688364e-05, "loss": 0.6872, "step": 37000}, {"epoch": 1.6623236589091563, "eval_loss": 2.9427731037139893, "eval_runtime": 1046.5686, "eval_samples_per_second": 9.463, "eval_steps_per_second": 0.148, "step": 37000}, {"epoch": 1.6713091922005572, "grad_norm": 5.738009452819824, "learning_rate": 4.986231980451376e-05, "loss": 0.7106, "step": 37200}, {"epoch": 1.680294725491958, "grad_norm": 4.871852397918701, "learning_rate": 4.986083657145376e-05, "loss": 0.6893, "step": 37400}, {"epoch": 
1.6892802587833589, "grad_norm": 4.325986862182617, "learning_rate": 4.985934541398113e-05, "loss": 0.6657, "step": 37600}, {"epoch": 1.6982657920747597, "grad_norm": 3.812180757522583, "learning_rate": 4.985784633257118e-05, "loss": 0.6489, "step": 37800}, {"epoch": 1.7072513253661605, "grad_norm": 3.503493309020996, "learning_rate": 4.985633932770174e-05, "loss": 0.7538, "step": 38000}, {"epoch": 1.7072513253661605, "eval_loss": 2.824307441711426, "eval_runtime": 1047.4182, "eval_samples_per_second": 9.456, "eval_steps_per_second": 0.148, "step": 38000}, {"epoch": 1.7162368586575614, "grad_norm": 3.583653450012207, "learning_rate": 4.985482439985317e-05, "loss": 0.6612, "step": 38200}, {"epoch": 1.7252223919489622, "grad_norm": 3.160301446914673, "learning_rate": 4.9853301549508364e-05, "loss": 0.6933, "step": 38400}, {"epoch": 1.734207925240363, "grad_norm": 4.189894199371338, "learning_rate": 4.9851770777152716e-05, "loss": 0.6824, "step": 38600}, {"epoch": 1.7431934585317639, "grad_norm": 0.5203965902328491, "learning_rate": 4.985023208327419e-05, "loss": 0.674, "step": 38800}, {"epoch": 1.7521789918231647, "grad_norm": 4.871167182922363, "learning_rate": 4.98486854683632e-05, "loss": 0.6908, "step": 39000}, {"epoch": 1.7521789918231647, "eval_loss": 2.880004405975342, "eval_runtime": 1044.7953, "eval_samples_per_second": 9.479, "eval_steps_per_second": 0.148, "step": 39000}, {"epoch": 1.7611645251145656, "grad_norm": 3.4473588466644287, "learning_rate": 4.9847130932912765e-05, "loss": 0.652, "step": 39200}, {"epoch": 1.7701500584059664, "grad_norm": 12.704270362854004, "learning_rate": 4.984556847741839e-05, "loss": 0.674, "step": 39400}, {"epoch": 1.7791355916973672, "grad_norm": 9.541321754455566, "learning_rate": 4.984399810237811e-05, "loss": 0.7046, "step": 39600}, {"epoch": 1.788121124988768, "grad_norm": 5.383360385894775, "learning_rate": 4.9842419808292473e-05, "loss": 0.6338, "step": 39800}, {"epoch": 1.797106658280169, "grad_norm": 7.993824005126953, "learning_rate": 4.9840833595664566e-05, "loss": 0.6627, "step": 40000}, {"epoch": 1.797106658280169, "eval_loss": 2.934129238128662, "eval_runtime": 1044.8474, "eval_samples_per_second": 9.479, "eval_steps_per_second": 0.148, "step": 40000}, {"epoch": 1.8060921915715697, "grad_norm": 2.7325427532196045, "learning_rate": 4.9839239464999996e-05, "loss": 0.6752, "step": 40200}, {"epoch": 1.8150777248629706, "grad_norm": 6.341977119445801, "learning_rate": 4.9837637416806895e-05, "loss": 0.671, "step": 40400}, {"epoch": 1.8240632581543714, "grad_norm": 10.8590726852417, "learning_rate": 4.9836027451595916e-05, "loss": 0.6901, "step": 40600}, {"epoch": 1.8330487914457723, "grad_norm": 10.971672058105469, "learning_rate": 4.983440956988023e-05, "loss": 0.6905, "step": 40800}, {"epoch": 1.842034324737173, "grad_norm": 8.158576011657715, "learning_rate": 4.983278377217556e-05, "loss": 0.698, "step": 41000}, {"epoch": 1.842034324737173, "eval_loss": 2.8494818210601807, "eval_runtime": 1044.9004, "eval_samples_per_second": 9.478, "eval_steps_per_second": 0.148, "step": 41000}, {"epoch": 1.851019858028574, "grad_norm": 7.720126628875732, "learning_rate": 4.983115005900011e-05, "loss": 0.6763, "step": 41200}, {"epoch": 1.8600053913199748, "grad_norm": 2.961477279663086, "learning_rate": 4.982950843087463e-05, "loss": 0.6895, "step": 41400}, {"epoch": 1.8689909246113756, "grad_norm": 2.009765148162842, "learning_rate": 4.98278588883224e-05, "loss": 0.7122, "step": 41600}, {"epoch": 1.8779764579027765, "grad_norm": 12.237375259399414, 
"learning_rate": 4.9826201431869205e-05, "loss": 0.6626, "step": 41800}, {"epoch": 1.8869619911941773, "grad_norm": 5.94899845123291, "learning_rate": 4.9824536062043356e-05, "loss": 0.6641, "step": 42000}, {"epoch": 1.8869619911941773, "eval_loss": 2.8374111652374268, "eval_runtime": 1044.7426, "eval_samples_per_second": 9.48, "eval_steps_per_second": 0.148, "step": 42000}, {"epoch": 1.8959475244855781, "grad_norm": 5.839437961578369, "learning_rate": 4.98228627793757e-05, "loss": 0.6554, "step": 42200}, {"epoch": 1.904933057776979, "grad_norm": 1.118190050125122, "learning_rate": 4.982118158439959e-05, "loss": 0.7005, "step": 42400}, {"epoch": 1.9139185910683798, "grad_norm": 3.554232358932495, "learning_rate": 4.981949247765092e-05, "loss": 0.7039, "step": 42600}, {"epoch": 1.9229041243597806, "grad_norm": 4.364952087402344, "learning_rate": 4.981779545966808e-05, "loss": 0.6665, "step": 42800}, {"epoch": 1.9318896576511815, "grad_norm": 5.755943775177002, "learning_rate": 4.981609053099201e-05, "loss": 0.6746, "step": 43000}, {"epoch": 1.9318896576511815, "eval_loss": 2.8288111686706543, "eval_runtime": 1043.7899, "eval_samples_per_second": 9.488, "eval_steps_per_second": 0.148, "step": 43000}, {"epoch": 1.9408751909425823, "grad_norm": 4.873472213745117, "learning_rate": 4.9814377692166145e-05, "loss": 0.691, "step": 43200}, {"epoch": 1.9498607242339832, "grad_norm": 3.6146950721740723, "learning_rate": 4.981265694373647e-05, "loss": 0.6707, "step": 43400}, {"epoch": 1.958846257525384, "grad_norm": 6.156956195831299, "learning_rate": 4.981092828625145e-05, "loss": 0.6618, "step": 43600}, {"epoch": 1.9678317908167848, "grad_norm": 4.361949920654297, "learning_rate": 4.980919172026211e-05, "loss": 0.6791, "step": 43800}, {"epoch": 1.9768173241081857, "grad_norm": 3.5817549228668213, "learning_rate": 4.9807447246321994e-05, "loss": 0.7073, "step": 44000}, {"epoch": 1.9768173241081857, "eval_loss": 2.869600296020508, "eval_runtime": 1043.5043, "eval_samples_per_second": 9.491, "eval_steps_per_second": 0.149, "step": 44000}, {"epoch": 1.9858028573995865, "grad_norm": 4.531149387359619, "learning_rate": 4.980569486498714e-05, "loss": 0.7056, "step": 44200}, {"epoch": 1.9947883906909873, "grad_norm": 4.764667987823486, "learning_rate": 4.980393457681612e-05, "loss": 0.678, "step": 44400}, {"epoch": 2.003773923982388, "grad_norm": 4.271178722381592, "learning_rate": 4.980216638237003e-05, "loss": 0.6399, "step": 44600}, {"epoch": 2.012759457273789, "grad_norm": 10.754460334777832, "learning_rate": 4.9800390282212484e-05, "loss": 0.6687, "step": 44800}, {"epoch": 2.02174499056519, "grad_norm": 2.3163371086120605, "learning_rate": 4.9798606276909623e-05, "loss": 0.6427, "step": 45000}, {"epoch": 2.02174499056519, "eval_loss": 2.8302671909332275, "eval_runtime": 1044.3702, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.148, "step": 45000}, {"epoch": 2.0307305238565907, "grad_norm": 6.137772083282471, "learning_rate": 4.9796814367030085e-05, "loss": 0.6573, "step": 45200}, {"epoch": 2.0397160571479915, "grad_norm": 9.637032508850098, "learning_rate": 4.979501455314506e-05, "loss": 0.6663, "step": 45400}, {"epoch": 2.0487015904393924, "grad_norm": 9.139311790466309, "learning_rate": 4.979320683582822e-05, "loss": 0.651, "step": 45600}, {"epoch": 2.057687123730793, "grad_norm": 5.3387017250061035, "learning_rate": 4.979139121565579e-05, "loss": 0.6698, "step": 45800}, {"epoch": 2.0666726570221945, "grad_norm": 3.5355489253997803, "learning_rate": 4.9789567693206504e-05, "loss": 
0.6951, "step": 46000}, {"epoch": 2.0666726570221945, "eval_loss": 2.905496835708618, "eval_runtime": 1044.3998, "eval_samples_per_second": 9.483, "eval_steps_per_second": 0.148, "step": 46000}, {"epoch": 2.075658190313595, "grad_norm": 5.952988147735596, "learning_rate": 4.9787736269061604e-05, "loss": 0.6716, "step": 46200}, {"epoch": 2.084643723604996, "grad_norm": 3.8913867473602295, "learning_rate": 4.978589694380485e-05, "loss": 0.6543, "step": 46400}, {"epoch": 2.093629256896397, "grad_norm": 9.004631996154785, "learning_rate": 4.978404971802255e-05, "loss": 0.6471, "step": 46600}, {"epoch": 2.102614790187798, "grad_norm": 5.533471584320068, "learning_rate": 4.9782194592303485e-05, "loss": 0.6461, "step": 46800}, {"epoch": 2.1116003234791987, "grad_norm": 3.112337589263916, "learning_rate": 4.9780331567239005e-05, "loss": 0.6432, "step": 47000}, {"epoch": 2.1116003234791987, "eval_loss": 2.845529556274414, "eval_runtime": 1043.8826, "eval_samples_per_second": 9.488, "eval_steps_per_second": 0.148, "step": 47000}, {"epoch": 2.1205858567705995, "grad_norm": 8.843466758728027, "learning_rate": 4.977846064342292e-05, "loss": 0.6744, "step": 47200}, {"epoch": 2.1295713900620004, "grad_norm": 5.125086307525635, "learning_rate": 4.977658182145161e-05, "loss": 0.6604, "step": 47400}, {"epoch": 2.138556923353401, "grad_norm": 2.8930840492248535, "learning_rate": 4.9774695101923945e-05, "loss": 0.6688, "step": 47600}, {"epoch": 2.147542456644802, "grad_norm": 2.3682479858398438, "learning_rate": 4.9772800485441317e-05, "loss": 0.6755, "step": 47800}, {"epoch": 2.156527989936203, "grad_norm": 3.7809925079345703, "learning_rate": 4.977089797260764e-05, "loss": 0.6596, "step": 48000}, {"epoch": 2.156527989936203, "eval_loss": 2.806736946105957, "eval_runtime": 1045.1893, "eval_samples_per_second": 9.476, "eval_steps_per_second": 0.148, "step": 48000}, {"epoch": 2.1655135232276037, "grad_norm": 9.784541130065918, "learning_rate": 4.976898756402934e-05, "loss": 0.6993, "step": 48200}, {"epoch": 2.1744990565190045, "grad_norm": 3.151435136795044, "learning_rate": 4.976706926031536e-05, "loss": 0.657, "step": 48400}, {"epoch": 2.1834845898104054, "grad_norm": 4.002162456512451, "learning_rate": 4.976514306207716e-05, "loss": 0.6691, "step": 48600}, {"epoch": 2.1924701231018062, "grad_norm": 3.7456023693084717, "learning_rate": 4.976320896992872e-05, "loss": 0.6524, "step": 48800}, {"epoch": 2.201455656393207, "grad_norm": 7.874242782592773, "learning_rate": 4.9761266984486534e-05, "loss": 0.6828, "step": 49000}, {"epoch": 2.201455656393207, "eval_loss": 2.799010992050171, "eval_runtime": 1112.993, "eval_samples_per_second": 8.899, "eval_steps_per_second": 0.139, "step": 49000}, {"epoch": 2.210441189684608, "grad_norm": 2.5422885417938232, "learning_rate": 4.975931710636961e-05, "loss": 0.6353, "step": 49200}, {"epoch": 2.2194267229760087, "grad_norm": 7.764764308929443, "learning_rate": 4.9757359336199466e-05, "loss": 0.6586, "step": 49400}, {"epoch": 2.2284122562674096, "grad_norm": 3.0725579261779785, "learning_rate": 4.975539367460016e-05, "loss": 0.6556, "step": 49600}, {"epoch": 2.2373977895588104, "grad_norm": 3.268784523010254, "learning_rate": 4.9753420122198237e-05, "loss": 0.6571, "step": 49800}, {"epoch": 2.2463833228502113, "grad_norm": 7.206459045410156, "learning_rate": 4.9751438679622764e-05, "loss": 0.6115, "step": 50000}, {"epoch": 2.2463833228502113, "eval_loss": 2.912787675857544, "eval_runtime": 1110.2376, "eval_samples_per_second": 8.921, "eval_steps_per_second": 0.14, "step": 
50000}, {"epoch": 2.255368856141612, "grad_norm": 1.150863766670227, "learning_rate": 4.974944934750534e-05, "loss": 0.6575, "step": 50200}, {"epoch": 2.264354389433013, "grad_norm": 4.235318183898926, "learning_rate": 4.974745212648006e-05, "loss": 0.649, "step": 50400}, {"epoch": 2.2733399227244138, "grad_norm": 3.499100923538208, "learning_rate": 4.974544701718353e-05, "loss": 0.6316, "step": 50600}, {"epoch": 2.2823254560158146, "grad_norm": 5.036466121673584, "learning_rate": 4.97434340202549e-05, "loss": 0.649, "step": 50800}, {"epoch": 2.2913109893072154, "grad_norm": 5.665818214416504, "learning_rate": 4.9741413136335794e-05, "loss": 0.6628, "step": 51000}, {"epoch": 2.2913109893072154, "eval_loss": 2.809664726257324, "eval_runtime": 1108.6765, "eval_samples_per_second": 8.933, "eval_steps_per_second": 0.14, "step": 51000}, {"epoch": 2.3002965225986163, "grad_norm": 6.9531779289245605, "learning_rate": 4.973938436607039e-05, "loss": 0.6451, "step": 51200}, {"epoch": 2.309282055890017, "grad_norm": 8.631576538085938, "learning_rate": 4.9737347710105346e-05, "loss": 0.648, "step": 51400}, {"epoch": 2.318267589181418, "grad_norm": 7.7942376136779785, "learning_rate": 4.973530316908986e-05, "loss": 0.6289, "step": 51600}, {"epoch": 2.327253122472819, "grad_norm": 4.3523688316345215, "learning_rate": 4.973325074367562e-05, "loss": 0.6838, "step": 51800}, {"epoch": 2.3362386557642196, "grad_norm": 4.113776206970215, "learning_rate": 4.973119043451684e-05, "loss": 0.6776, "step": 52000}, {"epoch": 2.3362386557642196, "eval_loss": 2.8563921451568604, "eval_runtime": 1110.1423, "eval_samples_per_second": 8.921, "eval_steps_per_second": 0.14, "step": 52000}, {"epoch": 2.3452241890556205, "grad_norm": 2.6197564601898193, "learning_rate": 4.972912224227025e-05, "loss": 0.6495, "step": 52200}, {"epoch": 2.3542097223470213, "grad_norm": 4.007927417755127, "learning_rate": 4.972704616759509e-05, "loss": 0.6299, "step": 52400}, {"epoch": 2.363195255638422, "grad_norm": 6.33441686630249, "learning_rate": 4.97249622111531e-05, "loss": 0.6444, "step": 52600}, {"epoch": 2.372180788929823, "grad_norm": 6.773642539978027, "learning_rate": 4.9722870373608556e-05, "loss": 0.658, "step": 52800}, {"epoch": 2.381166322221224, "grad_norm": 2.790375232696533, "learning_rate": 4.972077065562821e-05, "loss": 0.6435, "step": 53000}, {"epoch": 2.381166322221224, "eval_loss": 2.807753562927246, "eval_runtime": 1109.9528, "eval_samples_per_second": 8.923, "eval_steps_per_second": 0.14, "step": 53000}, {"epoch": 2.3901518555126247, "grad_norm": 4.388117790222168, "learning_rate": 4.971866305788138e-05, "loss": 0.6147, "step": 53200}, {"epoch": 2.3991373888040255, "grad_norm": 4.960672378540039, "learning_rate": 4.9716547581039854e-05, "loss": 0.6465, "step": 53400}, {"epoch": 2.4081229220954263, "grad_norm": 3.5351078510284424, "learning_rate": 4.9714424225777925e-05, "loss": 0.6336, "step": 53600}, {"epoch": 2.417108455386827, "grad_norm": 6.359066009521484, "learning_rate": 4.971229299277243e-05, "loss": 0.6607, "step": 53800}, {"epoch": 2.426093988678228, "grad_norm": 7.120554447174072, "learning_rate": 4.9710153882702706e-05, "loss": 0.6299, "step": 54000}, {"epoch": 2.426093988678228, "eval_loss": 2.8412070274353027, "eval_runtime": 1110.5443, "eval_samples_per_second": 8.918, "eval_steps_per_second": 0.14, "step": 54000}, {"epoch": 2.435079521969629, "grad_norm": 2.599130630493164, "learning_rate": 4.970800689625058e-05, "loss": 0.6324, "step": 54200}, {"epoch": 2.4440650552610297, "grad_norm": 
12.322335243225098, "learning_rate": 4.970585203410041e-05, "loss": 0.6611, "step": 54400}, {"epoch": 2.4530505885524305, "grad_norm": 8.429553031921387, "learning_rate": 4.970368929693907e-05, "loss": 0.6683, "step": 54600}, {"epoch": 2.4620361218438314, "grad_norm": 5.938534259796143, "learning_rate": 4.970151868545593e-05, "loss": 0.615, "step": 54800}, {"epoch": 2.471021655135232, "grad_norm": 5.379678249359131, "learning_rate": 4.969934020034288e-05, "loss": 0.6439, "step": 55000}, {"epoch": 2.471021655135232, "eval_loss": 2.902723789215088, "eval_runtime": 1111.1081, "eval_samples_per_second": 8.914, "eval_steps_per_second": 0.14, "step": 55000}, {"epoch": 2.480007188426633, "grad_norm": 2.5961101055145264, "learning_rate": 4.96971538422943e-05, "loss": 0.6392, "step": 55200}, {"epoch": 2.488992721718034, "grad_norm": 2.440741777420044, "learning_rate": 4.9694959612007094e-05, "loss": 0.6433, "step": 55400}, {"epoch": 2.4979782550094347, "grad_norm": 2.6657445430755615, "learning_rate": 4.9692757510180686e-05, "loss": 0.6544, "step": 55600}, {"epoch": 2.5069637883008355, "grad_norm": 3.9788851737976074, "learning_rate": 4.969054753751699e-05, "loss": 0.6231, "step": 55800}, {"epoch": 2.5159493215922364, "grad_norm": 2.831127643585205, "learning_rate": 4.968832969472044e-05, "loss": 0.6441, "step": 56000}, {"epoch": 2.5159493215922364, "eval_loss": 2.836225986480713, "eval_runtime": 1110.9174, "eval_samples_per_second": 8.915, "eval_steps_per_second": 0.14, "step": 56000}, {"epoch": 2.5249348548836372, "grad_norm": 2.4856066703796387, "learning_rate": 4.968610398249798e-05, "loss": 0.6819, "step": 56200}, {"epoch": 2.533920388175038, "grad_norm": 6.462665557861328, "learning_rate": 4.9683870401559054e-05, "loss": 0.5954, "step": 56400}, {"epoch": 2.542905921466439, "grad_norm": 8.044194221496582, "learning_rate": 4.96816289526156e-05, "loss": 0.6849, "step": 56600}, {"epoch": 2.5518914547578397, "grad_norm": 1.6285322904586792, "learning_rate": 4.9679379636382115e-05, "loss": 0.6492, "step": 56800}, {"epoch": 2.5608769880492406, "grad_norm": 1.74399733543396, "learning_rate": 4.9677122453575544e-05, "loss": 0.6574, "step": 57000}, {"epoch": 2.5608769880492406, "eval_loss": 2.7768375873565674, "eval_runtime": 1110.2066, "eval_samples_per_second": 8.921, "eval_steps_per_second": 0.14, "step": 57000}, {"epoch": 2.5698625213406414, "grad_norm": 4.567875385284424, "learning_rate": 4.967485740491538e-05, "loss": 0.6247, "step": 57200}, {"epoch": 2.5788480546320423, "grad_norm": 2.1420087814331055, "learning_rate": 4.967258449112361e-05, "loss": 0.6101, "step": 57400}, {"epoch": 2.587833587923443, "grad_norm": 4.842061519622803, "learning_rate": 4.967030371292471e-05, "loss": 0.6361, "step": 57600}, {"epoch": 2.5968191212148444, "grad_norm": 7.400786876678467, "learning_rate": 4.9668015071045695e-05, "loss": 0.6456, "step": 57800}, {"epoch": 2.6058046545062448, "grad_norm": 8.932103157043457, "learning_rate": 4.966571856621607e-05, "loss": 0.6232, "step": 58000}, {"epoch": 2.6058046545062448, "eval_loss": 2.8550527095794678, "eval_runtime": 1110.8669, "eval_samples_per_second": 8.916, "eval_steps_per_second": 0.14, "step": 58000}, {"epoch": 2.614790187797646, "grad_norm": 2.9970428943634033, "learning_rate": 4.9663414199167845e-05, "loss": 0.6917, "step": 58200}, {"epoch": 2.6237757210890464, "grad_norm": 4.401594638824463, "learning_rate": 4.966110197063554e-05, "loss": 0.6321, "step": 58400}, {"epoch": 2.6327612543804477, "grad_norm": 8.229362487792969, "learning_rate": 
4.965878188135618e-05, "loss": 0.6288, "step": 58600}, {"epoch": 2.641746787671848, "grad_norm": 1.6570228338241577, "learning_rate": 4.965645393206929e-05, "loss": 0.5909, "step": 58800}, {"epoch": 2.6507323209632494, "grad_norm": 8.355649948120117, "learning_rate": 4.9654118123516925e-05, "loss": 0.6708, "step": 59000}, {"epoch": 2.6507323209632494, "eval_loss": 2.7752935886383057, "eval_runtime": 1109.8773, "eval_samples_per_second": 8.924, "eval_steps_per_second": 0.14, "step": 59000}, {"epoch": 2.65971785425465, "grad_norm": 3.5462231636047363, "learning_rate": 4.96517744564436e-05, "loss": 0.6037, "step": 59200}, {"epoch": 2.668703387546051, "grad_norm": 4.182783603668213, "learning_rate": 4.964942293159637e-05, "loss": 0.6271, "step": 59400}, {"epoch": 2.6776889208374515, "grad_norm": 17.542783737182617, "learning_rate": 4.9647063549724796e-05, "loss": 0.6915, "step": 59600}, {"epoch": 2.6866744541288528, "grad_norm": 2.8875606060028076, "learning_rate": 4.9644696311580926e-05, "loss": 0.6154, "step": 59800}, {"epoch": 2.695659987420253, "grad_norm": 3.598609209060669, "learning_rate": 4.964232121791932e-05, "loss": 0.6308, "step": 60000}, {"epoch": 2.695659987420253, "eval_loss": 2.770158529281616, "eval_runtime": 1103.6022, "eval_samples_per_second": 8.974, "eval_steps_per_second": 0.14, "step": 60000}, {"epoch": 2.7046455207116544, "grad_norm": 4.902860164642334, "learning_rate": 4.963993826949703e-05, "loss": 0.6449, "step": 60200}, {"epoch": 2.713631054003055, "grad_norm": 1.6854755878448486, "learning_rate": 4.9637547467073634e-05, "loss": 0.6189, "step": 60400}, {"epoch": 2.722616587294456, "grad_norm": 3.137181520462036, "learning_rate": 4.96351488114112e-05, "loss": 0.6118, "step": 60600}, {"epoch": 2.7316021205858565, "grad_norm": 12.390292167663574, "learning_rate": 4.963274230327432e-05, "loss": 0.6407, "step": 60800}, {"epoch": 2.740587653877258, "grad_norm": 5.263106822967529, "learning_rate": 4.963032794343003e-05, "loss": 0.6426, "step": 61000}, {"epoch": 2.740587653877258, "eval_loss": 2.787389039993286, "eval_runtime": 1105.6052, "eval_samples_per_second": 8.958, "eval_steps_per_second": 0.14, "step": 61000}, {"epoch": 2.7495731871686586, "grad_norm": 5.193811416625977, "learning_rate": 4.962790573264794e-05, "loss": 0.6199, "step": 61200}, {"epoch": 2.7585587204600595, "grad_norm": 2.3068435192108154, "learning_rate": 4.962547567170013e-05, "loss": 0.6299, "step": 61400}, {"epoch": 2.7675442537514603, "grad_norm": 7.189493656158447, "learning_rate": 4.9623037761361166e-05, "loss": 0.6591, "step": 61600}, {"epoch": 2.776529787042861, "grad_norm": 3.9445478916168213, "learning_rate": 4.962059200240815e-05, "loss": 0.6282, "step": 61800}, {"epoch": 2.785515320334262, "grad_norm": 8.275954246520996, "learning_rate": 4.9618138395620666e-05, "loss": 0.6209, "step": 62000}, {"epoch": 2.785515320334262, "eval_loss": 2.711536407470703, "eval_runtime": 1103.3019, "eval_samples_per_second": 8.977, "eval_steps_per_second": 0.14, "step": 62000}, {"epoch": 2.794500853625663, "grad_norm": 6.457345008850098, "learning_rate": 4.96156769417808e-05, "loss": 0.6178, "step": 62200}, {"epoch": 2.8034863869170636, "grad_norm": 6.9077253341674805, "learning_rate": 4.961320764167316e-05, "loss": 0.62, "step": 62400}, {"epoch": 2.8124719202084645, "grad_norm": 1.4460822343826294, "learning_rate": 4.96107304960848e-05, "loss": 0.6681, "step": 62600}, {"epoch": 2.8214574534998653, "grad_norm": 5.170135021209717, "learning_rate": 4.9608245505805345e-05, "loss": 0.6137, "step": 62800}, 
{"epoch": 2.830442986791266, "grad_norm": 7.249731540679932, "learning_rate": 4.960575267162688e-05, "loss": 0.6175, "step": 63000}, {"epoch": 2.830442986791266, "eval_loss": 2.7555394172668457, "eval_runtime": 1103.5103, "eval_samples_per_second": 8.975, "eval_steps_per_second": 0.14, "step": 63000}, {"epoch": 2.839428520082667, "grad_norm": 8.970303535461426, "learning_rate": 4.960325199434399e-05, "loss": 0.5958, "step": 63200}, {"epoch": 2.848414053374068, "grad_norm": 9.521201133728027, "learning_rate": 4.960074347475377e-05, "loss": 0.6608, "step": 63400}, {"epoch": 2.8573995866654687, "grad_norm": 1.2697712182998657, "learning_rate": 4.9598227113655826e-05, "loss": 0.6367, "step": 63600}, {"epoch": 2.8663851199568695, "grad_norm": 6.463663578033447, "learning_rate": 4.959570291185224e-05, "loss": 0.6198, "step": 63800}, {"epoch": 2.8753706532482703, "grad_norm": 2.3747761249542236, "learning_rate": 4.95931708701476e-05, "loss": 0.656, "step": 64000}, {"epoch": 2.8753706532482703, "eval_loss": 2.7699778079986572, "eval_runtime": 1103.4164, "eval_samples_per_second": 8.976, "eval_steps_per_second": 0.14, "step": 64000}, {"epoch": 2.884356186539671, "grad_norm": 2.689181089401245, "learning_rate": 4.9590630989349e-05, "loss": 0.6433, "step": 64200}, {"epoch": 2.893341719831072, "grad_norm": 2.685288429260254, "learning_rate": 4.958808327026603e-05, "loss": 0.6643, "step": 64400}, {"epoch": 2.902327253122473, "grad_norm": 3.243163824081421, "learning_rate": 4.9585527713710777e-05, "loss": 0.6203, "step": 64600}, {"epoch": 2.9113127864138737, "grad_norm": 4.437738418579102, "learning_rate": 4.9582964320497824e-05, "loss": 0.6351, "step": 64800}, {"epoch": 2.9202983197052745, "grad_norm": 5.811532497406006, "learning_rate": 4.9580393091444266e-05, "loss": 0.6257, "step": 65000}, {"epoch": 2.9202983197052745, "eval_loss": 2.783703327178955, "eval_runtime": 1103.9347, "eval_samples_per_second": 8.972, "eval_steps_per_second": 0.14, "step": 65000}, {"epoch": 2.9292838529966754, "grad_norm": 3.7145042419433594, "learning_rate": 4.957781402736967e-05, "loss": 0.6402, "step": 65200}, {"epoch": 2.938269386288076, "grad_norm": 8.268646240234375, "learning_rate": 4.957522712909612e-05, "loss": 0.5925, "step": 65400}, {"epoch": 2.947254919579477, "grad_norm": 4.354446887969971, "learning_rate": 4.9572632397448196e-05, "loss": 0.6588, "step": 65600}, {"epoch": 2.956240452870878, "grad_norm": 4.316616058349609, "learning_rate": 4.957002983325297e-05, "loss": 0.6173, "step": 65800}, {"epoch": 2.9652259861622787, "grad_norm": 7.808084011077881, "learning_rate": 4.956741943734e-05, "loss": 0.6157, "step": 66000}, {"epoch": 2.9652259861622787, "eval_loss": 2.8421056270599365, "eval_runtime": 1104.1736, "eval_samples_per_second": 8.97, "eval_steps_per_second": 0.14, "step": 66000}, {"epoch": 2.9742115194536796, "grad_norm": 26.778465270996094, "learning_rate": 4.956480121054137e-05, "loss": 0.6378, "step": 66200}, {"epoch": 2.9831970527450804, "grad_norm": 5.89031457901001, "learning_rate": 4.956217515369163e-05, "loss": 0.5759, "step": 66400}, {"epoch": 2.9921825860364812, "grad_norm": 3.110283613204956, "learning_rate": 4.955954126762784e-05, "loss": 0.6221, "step": 66600}, {"epoch": 3.001168119327882, "grad_norm": 6.0229668617248535, "learning_rate": 4.955689955318956e-05, "loss": 0.6276, "step": 66800}, {"epoch": 3.010153652619283, "grad_norm": 4.137844562530518, "learning_rate": 4.955425001121883e-05, "loss": 0.5943, "step": 67000}, {"epoch": 3.010153652619283, "eval_loss": 2.781846523284912, 
"eval_runtime": 1104.5447, "eval_samples_per_second": 8.967, "eval_steps_per_second": 0.14, "step": 67000}, {"epoch": 3.0191391859106838, "grad_norm": 4.880155563354492, "learning_rate": 4.955159264256019e-05, "loss": 0.6199, "step": 67200}, {"epoch": 3.0281247192020846, "grad_norm": 4.160552024841309, "learning_rate": 4.9548927448060686e-05, "loss": 0.6228, "step": 67400}, {"epoch": 3.0371102524934854, "grad_norm": 4.420809745788574, "learning_rate": 4.954625442856986e-05, "loss": 0.5729, "step": 67600}, {"epoch": 3.0460957857848863, "grad_norm": 2.833252429962158, "learning_rate": 4.954357358493973e-05, "loss": 0.6168, "step": 67800}, {"epoch": 3.055081319076287, "grad_norm": 4.240931034088135, "learning_rate": 4.954088491802481e-05, "loss": 0.6033, "step": 68000}, {"epoch": 3.055081319076287, "eval_loss": 2.8714144229888916, "eval_runtime": 1105.2254, "eval_samples_per_second": 8.961, "eval_steps_per_second": 0.14, "step": 68000}, {"epoch": 3.064066852367688, "grad_norm": 9.208168983459473, "learning_rate": 4.953818842868212e-05, "loss": 0.5893, "step": 68200}, {"epoch": 3.073052385659089, "grad_norm": 3.6979544162750244, "learning_rate": 4.953548411777117e-05, "loss": 0.6, "step": 68400}, {"epoch": 3.0820379189504896, "grad_norm": 5.291320323944092, "learning_rate": 4.953277198615397e-05, "loss": 0.5899, "step": 68600}, {"epoch": 3.0910234522418905, "grad_norm": 3.7340753078460693, "learning_rate": 4.9530052034695e-05, "loss": 0.6183, "step": 68800}, {"epoch": 3.1000089855332913, "grad_norm": 2.6057052612304688, "learning_rate": 4.952732426426126e-05, "loss": 0.6176, "step": 69000}, {"epoch": 3.1000089855332913, "eval_loss": 2.7742364406585693, "eval_runtime": 1104.5457, "eval_samples_per_second": 8.967, "eval_steps_per_second": 0.14, "step": 69000}, {"epoch": 3.108994518824692, "grad_norm": 11.468999862670898, "learning_rate": 4.9524588675722205e-05, "loss": 0.5958, "step": 69200}, {"epoch": 3.117980052116093, "grad_norm": 4.5051374435424805, "learning_rate": 4.952184526994983e-05, "loss": 0.6213, "step": 69400}, {"epoch": 3.126965585407494, "grad_norm": 4.247747421264648, "learning_rate": 4.951909404781859e-05, "loss": 0.6011, "step": 69600}, {"epoch": 3.1359511186988946, "grad_norm": 6.309694290161133, "learning_rate": 4.951633501020545e-05, "loss": 0.6028, "step": 69800}, {"epoch": 3.1449366519902955, "grad_norm": 1.6225708723068237, "learning_rate": 4.951356815798983e-05, "loss": 0.6235, "step": 70000}, {"epoch": 3.1449366519902955, "eval_loss": 2.717803478240967, "eval_runtime": 1104.1485, "eval_samples_per_second": 8.97, "eval_steps_per_second": 0.14, "step": 70000}, {"epoch": 3.1539221852816963, "grad_norm": 4.1915106773376465, "learning_rate": 4.95107934920537e-05, "loss": 0.5785, "step": 70200}, {"epoch": 3.162907718573097, "grad_norm": 3.8733890056610107, "learning_rate": 4.9508011013281454e-05, "loss": 0.6236, "step": 70400}, {"epoch": 3.171893251864498, "grad_norm": 8.979776382446289, "learning_rate": 4.950522072256003e-05, "loss": 0.6158, "step": 70600}, {"epoch": 3.180878785155899, "grad_norm": 4.072059154510498, "learning_rate": 4.950242262077883e-05, "loss": 0.627, "step": 70800}, {"epoch": 3.1898643184472997, "grad_norm": 5.936033248901367, "learning_rate": 4.9499616708829744e-05, "loss": 0.5612, "step": 71000}, {"epoch": 3.1898643184472997, "eval_loss": 2.694528579711914, "eval_runtime": 1096.847, "eval_samples_per_second": 9.03, "eval_steps_per_second": 0.141, "step": 71000}, {"epoch": 3.1988498517387005, "grad_norm": 7.062220573425293, "learning_rate": 
4.9496802987607174e-05, "loss": 0.5959, "step": 71200}, {"epoch": 3.2078353850301013, "grad_norm": 4.436807155609131, "learning_rate": 4.9493981458007986e-05, "loss": 0.6131, "step": 71400}, {"epoch": 3.216820918321502, "grad_norm": 4.5539021492004395, "learning_rate": 4.949115212093155e-05, "loss": 0.5965, "step": 71600}, {"epoch": 3.225806451612903, "grad_norm": 13.243054389953613, "learning_rate": 4.9488314977279716e-05, "loss": 0.5439, "step": 71800}, {"epoch": 3.234791984904304, "grad_norm": 11.988075256347656, "learning_rate": 4.948547002795682e-05, "loss": 0.6139, "step": 72000}, {"epoch": 3.234791984904304, "eval_loss": 2.7093992233276367, "eval_runtime": 1096.9087, "eval_samples_per_second": 9.029, "eval_steps_per_second": 0.141, "step": 72000}, {"epoch": 3.2437775181957047, "grad_norm": 2.3277647495269775, "learning_rate": 4.9482617273869705e-05, "loss": 0.618, "step": 72200}, {"epoch": 3.252763051487106, "grad_norm": 6.193905830383301, "learning_rate": 4.947975671592768e-05, "loss": 0.5845, "step": 72400}, {"epoch": 3.2617485847785064, "grad_norm": 3.807849884033203, "learning_rate": 4.9476888355042555e-05, "loss": 0.6207, "step": 72600}, {"epoch": 3.2707341180699077, "grad_norm": 13.691109657287598, "learning_rate": 4.9474012192128615e-05, "loss": 0.5921, "step": 72800}, {"epoch": 3.279719651361308, "grad_norm": 8.186936378479004, "learning_rate": 4.947112822810265e-05, "loss": 0.6381, "step": 73000}, {"epoch": 3.279719651361308, "eval_loss": 2.7966694831848145, "eval_runtime": 1103.5256, "eval_samples_per_second": 8.975, "eval_steps_per_second": 0.14, "step": 73000}, {"epoch": 3.2887051846527093, "grad_norm": 2.7031075954437256, "learning_rate": 4.946823646388392e-05, "loss": 0.6346, "step": 73200}, {"epoch": 3.29769071794411, "grad_norm": 1.7532190084457397, "learning_rate": 4.9465336900394174e-05, "loss": 0.5815, "step": 73400}, {"epoch": 3.306676251235511, "grad_norm": 5.828246116638184, "learning_rate": 4.946242953855765e-05, "loss": 0.6277, "step": 73600}, {"epoch": 3.315661784526912, "grad_norm": 3.648778200149536, "learning_rate": 4.9459514379301084e-05, "loss": 0.5939, "step": 73800}, {"epoch": 3.3246473178183127, "grad_norm": 4.8969597816467285, "learning_rate": 4.945659142355368e-05, "loss": 0.6147, "step": 74000}, {"epoch": 3.3246473178183127, "eval_loss": 2.834960460662842, "eval_runtime": 1095.2072, "eval_samples_per_second": 9.043, "eval_steps_per_second": 0.142, "step": 74000}, {"epoch": 3.3336328511097135, "grad_norm": 12.062762260437012, "learning_rate": 4.9453660672247124e-05, "loss": 0.6336, "step": 74200}, {"epoch": 3.3426183844011144, "grad_norm": 10.92843246459961, "learning_rate": 4.945072212631561e-05, "loss": 0.638, "step": 74400}, {"epoch": 3.351603917692515, "grad_norm": 7.536855220794678, "learning_rate": 4.9447775786695785e-05, "loss": 0.6045, "step": 74600}, {"epoch": 3.360589450983916, "grad_norm": 3.968078136444092, "learning_rate": 4.94448216543268e-05, "loss": 0.5983, "step": 74800}, {"epoch": 3.369574984275317, "grad_norm": 2.125988006591797, "learning_rate": 4.94418597301503e-05, "loss": 0.6118, "step": 75000}, {"epoch": 3.369574984275317, "eval_loss": 2.783966064453125, "eval_runtime": 1095.5505, "eval_samples_per_second": 9.04, "eval_steps_per_second": 0.141, "step": 75000}, {"epoch": 3.3785605175667177, "grad_norm": 5.085707187652588, "learning_rate": 4.9438890015110395e-05, "loss": 0.5765, "step": 75200}, {"epoch": 3.3875460508581186, "grad_norm": 4.397859573364258, "learning_rate": 4.943591251015368e-05, "loss": 0.6046, "step": 75400}, 
{"epoch": 3.3965315841495194, "grad_norm": 2.367764711380005, "learning_rate": 4.943292721622925e-05, "loss": 0.6331, "step": 75600}, {"epoch": 3.4055171174409202, "grad_norm": 7.137909889221191, "learning_rate": 4.942993413428865e-05, "loss": 0.5902, "step": 75800}, {"epoch": 3.414502650732321, "grad_norm": 4.154844760894775, "learning_rate": 4.942693326528594e-05, "loss": 0.5684, "step": 76000}, {"epoch": 3.414502650732321, "eval_loss": 2.7368874549865723, "eval_runtime": 1095.0529, "eval_samples_per_second": 9.044, "eval_steps_per_second": 0.142, "step": 76000}, {"epoch": 3.423488184023722, "grad_norm": 2.66355299949646, "learning_rate": 4.9423924610177645e-05, "loss": 0.6279, "step": 76200}, {"epoch": 3.4324737173151227, "grad_norm": 4.36577033996582, "learning_rate": 4.942090816992278e-05, "loss": 0.6016, "step": 76400}, {"epoch": 3.4414592506065236, "grad_norm": 5.2936625480651855, "learning_rate": 4.9417883945482835e-05, "loss": 0.6143, "step": 76600}, {"epoch": 3.4504447838979244, "grad_norm": 7.122065544128418, "learning_rate": 4.9414851937821794e-05, "loss": 0.6202, "step": 76800}, {"epoch": 3.4594303171893253, "grad_norm": 6.634164333343506, "learning_rate": 4.941181214790609e-05, "loss": 0.582, "step": 77000}, {"epoch": 3.4594303171893253, "eval_loss": 2.721560478210449, "eval_runtime": 1095.5312, "eval_samples_per_second": 9.04, "eval_steps_per_second": 0.141, "step": 77000}, {"epoch": 3.468415850480726, "grad_norm": 7.679781436920166, "learning_rate": 4.940876457670468e-05, "loss": 0.6062, "step": 77200}, {"epoch": 3.477401383772127, "grad_norm": 4.641097068786621, "learning_rate": 4.9405709225188966e-05, "loss": 0.5853, "step": 77400}, {"epoch": 3.4863869170635278, "grad_norm": 4.262377738952637, "learning_rate": 4.940264609433286e-05, "loss": 0.6164, "step": 77600}, {"epoch": 3.4953724503549286, "grad_norm": 2.9696292877197266, "learning_rate": 4.939957518511272e-05, "loss": 0.6181, "step": 77800}, {"epoch": 3.5043579836463294, "grad_norm": 2.491093158721924, "learning_rate": 4.9396496498507414e-05, "loss": 0.6236, "step": 78000}, {"epoch": 3.5043579836463294, "eval_loss": 2.689380407333374, "eval_runtime": 1095.9701, "eval_samples_per_second": 9.037, "eval_steps_per_second": 0.141, "step": 78000}, {"epoch": 3.5133435169377303, "grad_norm": 3.549752950668335, "learning_rate": 4.9393410035498264e-05, "loss": 0.6144, "step": 78200}, {"epoch": 3.522329050229131, "grad_norm": 33.26611328125, "learning_rate": 4.9390315797069084e-05, "loss": 0.6332, "step": 78400}, {"epoch": 3.531314583520532, "grad_norm": 4.73014497756958, "learning_rate": 4.9387213784206185e-05, "loss": 0.6195, "step": 78600}, {"epoch": 3.540300116811933, "grad_norm": 11.499771118164062, "learning_rate": 4.938410399789831e-05, "loss": 0.6105, "step": 78800}, {"epoch": 3.5492856501033336, "grad_norm": 9.83093547821045, "learning_rate": 4.9380986439136725e-05, "loss": 0.6256, "step": 79000}, {"epoch": 3.5492856501033336, "eval_loss": 2.74749493598938, "eval_runtime": 1097.8988, "eval_samples_per_second": 9.021, "eval_steps_per_second": 0.141, "step": 79000}, {"epoch": 3.5582711833947345, "grad_norm": 5.551429271697998, "learning_rate": 4.9377861108915136e-05, "loss": 0.6412, "step": 79200}, {"epoch": 3.5672567166861353, "grad_norm": 5.982589244842529, "learning_rate": 4.937472800822976e-05, "loss": 0.5878, "step": 79400}, {"epoch": 3.576242249977536, "grad_norm": 5.788779258728027, "learning_rate": 4.937158713807927e-05, "loss": 0.6077, "step": 79600}, {"epoch": 3.585227783268937, "grad_norm": 5.566563129425049, 
"learning_rate": 4.9368438499464826e-05, "loss": 0.6108, "step": 79800}, {"epoch": 3.594213316560338, "grad_norm": 1.8803223371505737, "learning_rate": 4.9365282093390055e-05, "loss": 0.5926, "step": 80000}, {"epoch": 3.594213316560338, "eval_loss": 2.700577974319458, "eval_runtime": 1096.7835, "eval_samples_per_second": 9.03, "eval_steps_per_second": 0.141, "step": 80000}, {"epoch": 3.6031988498517387, "grad_norm": 5.282078742980957, "learning_rate": 4.9362117920861063e-05, "loss": 0.5906, "step": 80200}, {"epoch": 3.6121843831431395, "grad_norm": 3.943328380584717, "learning_rate": 4.935894598288643e-05, "loss": 0.6109, "step": 80400}, {"epoch": 3.6211699164345403, "grad_norm": 19.697898864746094, "learning_rate": 4.935576628047722e-05, "loss": 0.5673, "step": 80600}, {"epoch": 3.630155449725941, "grad_norm": 7.314117908477783, "learning_rate": 4.935257881464696e-05, "loss": 0.6112, "step": 80800}, {"epoch": 3.639140983017342, "grad_norm": 8.926667213439941, "learning_rate": 4.934938358641167e-05, "loss": 0.5875, "step": 81000}, {"epoch": 3.639140983017342, "eval_loss": 2.7504782676696777, "eval_runtime": 1097.743, "eval_samples_per_second": 9.022, "eval_steps_per_second": 0.141, "step": 81000}, {"epoch": 3.648126516308743, "grad_norm": 1.6228649616241455, "learning_rate": 4.934618059678981e-05, "loss": 0.5964, "step": 81200}, {"epoch": 3.6571120496001437, "grad_norm": 7.490013599395752, "learning_rate": 4.934296984680236e-05, "loss": 0.605, "step": 81400}, {"epoch": 3.6660975828915445, "grad_norm": 5.786327362060547, "learning_rate": 4.933975133747273e-05, "loss": 0.5523, "step": 81600}, {"epoch": 3.6750831161829454, "grad_norm": 6.276517868041992, "learning_rate": 4.9336525069826834e-05, "loss": 0.6328, "step": 81800}, {"epoch": 3.684068649474346, "grad_norm": 4.784965515136719, "learning_rate": 4.933329104489304e-05, "loss": 0.6267, "step": 82000}, {"epoch": 3.684068649474346, "eval_loss": 2.812925338745117, "eval_runtime": 1084.0469, "eval_samples_per_second": 9.136, "eval_steps_per_second": 0.143, "step": 82000}, {"epoch": 3.693054182765747, "grad_norm": 1.2591400146484375, "learning_rate": 4.9330049263702205e-05, "loss": 0.6042, "step": 82200}, {"epoch": 3.702039716057148, "grad_norm": 2.7729320526123047, "learning_rate": 4.932679972728764e-05, "loss": 0.591, "step": 82400}, {"epoch": 3.7110252493485487, "grad_norm": 2.3185465335845947, "learning_rate": 4.9323542436685144e-05, "loss": 0.5797, "step": 82600}, {"epoch": 3.7200107826399496, "grad_norm": 7.948742389678955, "learning_rate": 4.932027739293298e-05, "loss": 0.6366, "step": 82800}, {"epoch": 3.7289963159313504, "grad_norm": 7.0373992919921875, "learning_rate": 4.931700459707188e-05, "loss": 0.6231, "step": 83000}, {"epoch": 3.7289963159313504, "eval_loss": 2.6898717880249023, "eval_runtime": 1082.2616, "eval_samples_per_second": 9.151, "eval_steps_per_second": 0.143, "step": 83000}, {"epoch": 3.7379818492227512, "grad_norm": 2.6516005992889404, "learning_rate": 4.931372405014505e-05, "loss": 0.5767, "step": 83200}, {"epoch": 3.746967382514152, "grad_norm": 3.6714022159576416, "learning_rate": 4.9310435753198174e-05, "loss": 0.6415, "step": 83400}, {"epoch": 3.755952915805553, "grad_norm": 2.8350040912628174, "learning_rate": 4.930713970727939e-05, "loss": 0.6196, "step": 83600}, {"epoch": 3.7649384490969537, "grad_norm": 6.588120937347412, "learning_rate": 4.930383591343933e-05, "loss": 0.6076, "step": 83800}, {"epoch": 3.7739239823883546, "grad_norm": 10.156900405883789, "learning_rate": 4.930052437273107e-05, "loss": 
0.5944, "step": 84000}, {"epoch": 3.7739239823883546, "eval_loss": 2.7181143760681152, "eval_runtime": 1080.4885, "eval_samples_per_second": 9.166, "eval_steps_per_second": 0.143, "step": 84000}, {"epoch": 3.782909515679756, "grad_norm": 7.760807037353516, "learning_rate": 4.9297205086210166e-05, "loss": 0.6227, "step": 84200}, {"epoch": 3.7918950489711563, "grad_norm": 4.258764266967773, "learning_rate": 4.929387805493464e-05, "loss": 0.5706, "step": 84400}, {"epoch": 3.8008805822625575, "grad_norm": 1.825241208076477, "learning_rate": 4.9290543279965e-05, "loss": 0.6034, "step": 84600}, {"epoch": 3.809866115553958, "grad_norm": 6.256824493408203, "learning_rate": 4.9287200762364196e-05, "loss": 0.5564, "step": 84800}, {"epoch": 3.818851648845359, "grad_norm": 3.7286887168884277, "learning_rate": 4.9283850503197657e-05, "loss": 0.5849, "step": 85000}, {"epoch": 3.818851648845359, "eval_loss": 2.7389979362487793, "eval_runtime": 1084.0935, "eval_samples_per_second": 9.136, "eval_steps_per_second": 0.143, "step": 85000}, {"epoch": 3.8278371821367596, "grad_norm": 7.849632740020752, "learning_rate": 4.928049250353329e-05, "loss": 0.6199, "step": 85200}, {"epoch": 3.836822715428161, "grad_norm": 6.8108439445495605, "learning_rate": 4.927712676444146e-05, "loss": 0.5899, "step": 85400}, {"epoch": 3.8458082487195613, "grad_norm": 10.76682186126709, "learning_rate": 4.9273753286995e-05, "loss": 0.5788, "step": 85600}, {"epoch": 3.8547937820109626, "grad_norm": 3.199047088623047, "learning_rate": 4.9270372072269195e-05, "loss": 0.5883, "step": 85800}, {"epoch": 3.863779315302363, "grad_norm": 9.04162883758545, "learning_rate": 4.926698312134183e-05, "loss": 0.5848, "step": 86000}, {"epoch": 3.863779315302363, "eval_loss": 2.729203939437866, "eval_runtime": 1081.4692, "eval_samples_per_second": 9.158, "eval_steps_per_second": 0.143, "step": 86000}, {"epoch": 3.8727648485937642, "grad_norm": 4.6888909339904785, "learning_rate": 4.926358643529311e-05, "loss": 0.6202, "step": 86200}, {"epoch": 3.8817503818851646, "grad_norm": 4.689401149749756, "learning_rate": 4.9260182015205756e-05, "loss": 0.5842, "step": 86400}, {"epoch": 3.890735915176566, "grad_norm": 5.316648483276367, "learning_rate": 4.925676986216492e-05, "loss": 0.639, "step": 86600}, {"epoch": 3.8997214484679663, "grad_norm": 8.970780372619629, "learning_rate": 4.9253349977258224e-05, "loss": 0.5849, "step": 86800}, {"epoch": 3.9087069817593676, "grad_norm": 6.301709175109863, "learning_rate": 4.924992236157577e-05, "loss": 0.6302, "step": 87000}, {"epoch": 3.9087069817593676, "eval_loss": 2.6868460178375244, "eval_runtime": 1082.2018, "eval_samples_per_second": 9.152, "eval_steps_per_second": 0.143, "step": 87000}, {"epoch": 3.917692515050768, "grad_norm": 7.46571159362793, "learning_rate": 4.9246487016210105e-05, "loss": 0.6067, "step": 87200}, {"epoch": 3.9266780483421693, "grad_norm": 2.6615748405456543, "learning_rate": 4.924304394225626e-05, "loss": 0.5964, "step": 87400}, {"epoch": 3.93566358163357, "grad_norm": 1.640554666519165, "learning_rate": 4.92395931408117e-05, "loss": 0.594, "step": 87600}, {"epoch": 3.944649114924971, "grad_norm": 6.6660919189453125, "learning_rate": 4.923613461297638e-05, "loss": 0.5728, "step": 87800}, {"epoch": 3.953634648216372, "grad_norm": 8.77531909942627, "learning_rate": 4.923266835985271e-05, "loss": 0.5873, "step": 88000}, {"epoch": 3.953634648216372, "eval_loss": 2.6699206829071045, "eval_runtime": 1089.8325, "eval_samples_per_second": 9.088, "eval_steps_per_second": 0.142, "step": 88000}, 
{"epoch": 3.9626201815077726, "grad_norm": 9.528241157531738, "learning_rate": 4.922919438254556e-05, "loss": 0.5803, "step": 88200}, {"epoch": 3.9716057147991735, "grad_norm": 1.9404816627502441, "learning_rate": 4.9225712682162265e-05, "loss": 0.5529, "step": 88400}, {"epoch": 3.9805912480905743, "grad_norm": 10.01131820678711, "learning_rate": 4.922222325981262e-05, "loss": 0.6296, "step": 88600}, {"epoch": 3.989576781381975, "grad_norm": 12.538310050964355, "learning_rate": 4.921872611660887e-05, "loss": 0.5903, "step": 88800}, {"epoch": 3.998562314673376, "grad_norm": 1.599368691444397, "learning_rate": 4.921522125366574e-05, "loss": 0.6081, "step": 89000}, {"epoch": 3.998562314673376, "eval_loss": 2.7178070545196533, "eval_runtime": 1080.1856, "eval_samples_per_second": 9.169, "eval_steps_per_second": 0.143, "step": 89000}, {"epoch": 4.007547847964776, "grad_norm": 11.243287086486816, "learning_rate": 4.921170867210042e-05, "loss": 0.5604, "step": 89200}, {"epoch": 4.016533381256178, "grad_norm": 4.789255619049072, "learning_rate": 4.920818837303253e-05, "loss": 0.5699, "step": 89400}, {"epoch": 4.025518914547578, "grad_norm": 14.564445495605469, "learning_rate": 4.920466035758418e-05, "loss": 0.5595, "step": 89600}, {"epoch": 4.034504447838979, "grad_norm": 8.886981010437012, "learning_rate": 4.920112462687993e-05, "loss": 0.5749, "step": 89800}, {"epoch": 4.04348998113038, "grad_norm": 8.778055191040039, "learning_rate": 4.919758118204678e-05, "loss": 0.5711, "step": 90000}, {"epoch": 4.04348998113038, "eval_loss": 2.7640573978424072, "eval_runtime": 1082.5818, "eval_samples_per_second": 9.148, "eval_steps_per_second": 0.143, "step": 90000}, {"epoch": 4.052475514421781, "grad_norm": 3.818753242492676, "learning_rate": 4.9194030024214225e-05, "loss": 0.5166, "step": 90200}, {"epoch": 4.061461047713181, "grad_norm": 6.440443992614746, "learning_rate": 4.919047115451418e-05, "loss": 0.5528, "step": 90400}, {"epoch": 4.070446581004583, "grad_norm": 6.763418197631836, "learning_rate": 4.918690457408106e-05, "loss": 0.5533, "step": 90600}, {"epoch": 4.079432114295983, "grad_norm": 4.209813117980957, "learning_rate": 4.9183330284051695e-05, "loss": 0.5437, "step": 90800}, {"epoch": 4.088417647587384, "grad_norm": 10.399232864379883, "learning_rate": 4.917974828556541e-05, "loss": 0.5665, "step": 91000}, {"epoch": 4.088417647587384, "eval_loss": 2.688040256500244, "eval_runtime": 1080.6131, "eval_samples_per_second": 9.165, "eval_steps_per_second": 0.143, "step": 91000}, {"epoch": 4.097403180878785, "grad_norm": 2.827580213546753, "learning_rate": 4.917615857976396e-05, "loss": 0.5812, "step": 91200}, {"epoch": 4.106388714170186, "grad_norm": 3.4965403079986572, "learning_rate": 4.917256116779157e-05, "loss": 0.6076, "step": 91400}, {"epoch": 4.115374247461586, "grad_norm": 4.934850692749023, "learning_rate": 4.916895605079492e-05, "loss": 0.5613, "step": 91600}, {"epoch": 4.124359780752988, "grad_norm": 6.726780891418457, "learning_rate": 4.916534322992314e-05, "loss": 0.6017, "step": 91800}, {"epoch": 4.133345314044389, "grad_norm": 2.464892625808716, "learning_rate": 4.9161722706327826e-05, "loss": 0.5902, "step": 92000}, {"epoch": 4.133345314044389, "eval_loss": 2.6801517009735107, "eval_runtime": 1082.5084, "eval_samples_per_second": 9.149, "eval_steps_per_second": 0.143, "step": 92000}, {"epoch": 4.142330847335789, "grad_norm": 4.2705254554748535, "learning_rate": 4.915809448116302e-05, "loss": 0.558, "step": 92200}, {"epoch": 4.15131638062719, "grad_norm": 11.47816276550293, 
"learning_rate": 4.915445855558522e-05, "loss": 0.5689, "step": 92400}, {"epoch": 4.160301913918591, "grad_norm": 8.396933555603027, "learning_rate": 4.9150814930753374e-05, "loss": 0.5982, "step": 92600}, {"epoch": 4.169287447209992, "grad_norm": 5.501452922821045, "learning_rate": 4.914716360782889e-05, "loss": 0.5738, "step": 92800}, {"epoch": 4.178272980501393, "grad_norm": 8.553749084472656, "learning_rate": 4.914350458797565e-05, "loss": 0.5496, "step": 93000}, {"epoch": 4.178272980501393, "eval_loss": 2.7101192474365234, "eval_runtime": 1082.8384, "eval_samples_per_second": 9.146, "eval_steps_per_second": 0.143, "step": 93000}, {"epoch": 4.187258513792794, "grad_norm": 18.494911193847656, "learning_rate": 4.913983787235996e-05, "loss": 0.5905, "step": 93200}, {"epoch": 4.196244047084194, "grad_norm": 4.566243648529053, "learning_rate": 4.913616346215057e-05, "loss": 0.5712, "step": 93400}, {"epoch": 4.205229580375596, "grad_norm": 5.748531818389893, "learning_rate": 4.9132481358518735e-05, "loss": 0.558, "step": 93600}, {"epoch": 4.214215113666996, "grad_norm": 3.77885365486145, "learning_rate": 4.9128791562638096e-05, "loss": 0.5927, "step": 93800}, {"epoch": 4.223200646958397, "grad_norm": 2.6284022331237793, "learning_rate": 4.9125094075684805e-05, "loss": 0.5953, "step": 94000}, {"epoch": 4.223200646958397, "eval_loss": 2.712245225906372, "eval_runtime": 1088.8302, "eval_samples_per_second": 9.096, "eval_steps_per_second": 0.142, "step": 94000}, {"epoch": 4.232186180249798, "grad_norm": 5.8867645263671875, "learning_rate": 4.9121388898837415e-05, "loss": 0.5895, "step": 94200}, {"epoch": 4.241171713541199, "grad_norm": 6.118598937988281, "learning_rate": 4.911767603327698e-05, "loss": 0.6138, "step": 94400}, {"epoch": 4.250157246832599, "grad_norm": 7.058086395263672, "learning_rate": 4.911395548018696e-05, "loss": 0.5921, "step": 94600}, {"epoch": 4.259142780124001, "grad_norm": 6.587648391723633, "learning_rate": 4.911022724075329e-05, "loss": 0.5778, "step": 94800}, {"epoch": 4.268128313415401, "grad_norm": 1.6069397926330566, "learning_rate": 4.910649131616435e-05, "loss": 0.6262, "step": 95000}, {"epoch": 4.268128313415401, "eval_loss": 2.6547911167144775, "eval_runtime": 1085.8261, "eval_samples_per_second": 9.121, "eval_steps_per_second": 0.143, "step": 95000}, {"epoch": 4.277113846706802, "grad_norm": 6.686661243438721, "learning_rate": 4.910274770761096e-05, "loss": 0.5864, "step": 95200}, {"epoch": 4.286099379998203, "grad_norm": 7.897719860076904, "learning_rate": 4.909899641628641e-05, "loss": 0.5884, "step": 95400}, {"epoch": 4.295084913289604, "grad_norm": 7.400073528289795, "learning_rate": 4.9095237443386435e-05, "loss": 0.6021, "step": 95600}, {"epoch": 4.3040704465810045, "grad_norm": 4.220474720001221, "learning_rate": 4.9091470790109196e-05, "loss": 0.5518, "step": 95800}, {"epoch": 4.313055979872406, "grad_norm": 1.6574774980545044, "learning_rate": 4.908769645765532e-05, "loss": 0.5867, "step": 96000}, {"epoch": 4.313055979872406, "eval_loss": 2.691925525665283, "eval_runtime": 1089.0317, "eval_samples_per_second": 9.094, "eval_steps_per_second": 0.142, "step": 96000}, {"epoch": 4.322041513163806, "grad_norm": 3.5609164237976074, "learning_rate": 4.908391444722787e-05, "loss": 0.5803, "step": 96200}, {"epoch": 4.331027046455207, "grad_norm": 3.427290201187134, "learning_rate": 4.908012476003239e-05, "loss": 0.554, "step": 96400}, {"epoch": 4.340012579746608, "grad_norm": 52.728878021240234, "learning_rate": 4.907632739727682e-05, "loss": 0.5962, "step": 
96600}, {"epoch": 4.348998113038009, "grad_norm": 12.754006385803223, "learning_rate": 4.907252236017159e-05, "loss": 0.5742, "step": 96800}, {"epoch": 4.3579836463294095, "grad_norm": 8.12136173248291, "learning_rate": 4.9068709649929544e-05, "loss": 0.6085, "step": 97000}, {"epoch": 4.3579836463294095, "eval_loss": 2.6768929958343506, "eval_runtime": 1090.8411, "eval_samples_per_second": 9.079, "eval_steps_per_second": 0.142, "step": 97000}, {"epoch": 4.366969179620811, "grad_norm": 5.45872688293457, "learning_rate": 4.9064889267766e-05, "loss": 0.5137, "step": 97200}, {"epoch": 4.375954712912211, "grad_norm": 3.9804370403289795, "learning_rate": 4.9061061214898707e-05, "loss": 0.5567, "step": 97400}, {"epoch": 4.3849402462036124, "grad_norm": 29.226791381835938, "learning_rate": 4.9057225492547846e-05, "loss": 0.5694, "step": 97600}, {"epoch": 4.393925779495013, "grad_norm": 6.9307169914245605, "learning_rate": 4.9053382101936076e-05, "loss": 0.5909, "step": 97800}, {"epoch": 4.402911312786414, "grad_norm": 5.833766937255859, "learning_rate": 4.904953104428846e-05, "loss": 0.5692, "step": 98000}, {"epoch": 4.402911312786414, "eval_loss": 2.714953660964966, "eval_runtime": 1094.2189, "eval_samples_per_second": 9.051, "eval_steps_per_second": 0.142, "step": 98000}, {"epoch": 4.4118968460778145, "grad_norm": 9.674918174743652, "learning_rate": 4.904567232083255e-05, "loss": 0.5795, "step": 98200}, {"epoch": 4.420882379369216, "grad_norm": 17.37355613708496, "learning_rate": 4.9041805932798295e-05, "loss": 0.581, "step": 98400}, {"epoch": 4.429867912660616, "grad_norm": 2.3987767696380615, "learning_rate": 4.9037931881418126e-05, "loss": 0.5911, "step": 98600}, {"epoch": 4.4388534459520175, "grad_norm": 6.0703558921813965, "learning_rate": 4.903405016792689e-05, "loss": 0.6068, "step": 98800}, {"epoch": 4.447838979243418, "grad_norm": 3.4397573471069336, "learning_rate": 4.9030160793561886e-05, "loss": 0.5542, "step": 99000}, {"epoch": 4.447838979243418, "eval_loss": 2.6832633018493652, "eval_runtime": 1085.7638, "eval_samples_per_second": 9.122, "eval_steps_per_second": 0.143, "step": 99000}, {"epoch": 4.456824512534819, "grad_norm": 1.5094788074493408, "learning_rate": 4.902626375956287e-05, "loss": 0.575, "step": 99200}, {"epoch": 4.4658100458262195, "grad_norm": 1.8952089548110962, "learning_rate": 4.902235906717201e-05, "loss": 0.5773, "step": 99400}, {"epoch": 4.474795579117621, "grad_norm": 6.439733505249023, "learning_rate": 4.9018446717633923e-05, "loss": 0.5653, "step": 99600}, {"epoch": 4.483781112409021, "grad_norm": 6.996722221374512, "learning_rate": 4.90145267121957e-05, "loss": 0.5823, "step": 99800}, {"epoch": 4.4927666457004225, "grad_norm": 8.791942596435547, "learning_rate": 4.901059905210682e-05, "loss": 0.5978, "step": 100000}, {"epoch": 4.4927666457004225, "eval_loss": 2.696164608001709, "eval_runtime": 1086.8043, "eval_samples_per_second": 9.113, "eval_steps_per_second": 0.143, "step": 100000}, {"epoch": 4.501752178991823, "grad_norm": 1.378144383430481, "learning_rate": 4.900666373861924e-05, "loss": 0.5769, "step": 100200}, {"epoch": 4.510737712283224, "grad_norm": 11.897534370422363, "learning_rate": 4.9002720772987345e-05, "loss": 0.6066, "step": 100400}, {"epoch": 4.519723245574625, "grad_norm": 5.889138698577881, "learning_rate": 4.899877015646795e-05, "loss": 0.5708, "step": 100600}, {"epoch": 4.528708778866026, "grad_norm": 8.439177513122559, "learning_rate": 4.899481189032034e-05, "loss": 0.5529, "step": 100800}, {"epoch": 4.537694312157426, "grad_norm": 
5.41510534286499, "learning_rate": 4.899084597580619e-05, "loss": 0.5933, "step": 101000}, {"epoch": 4.537694312157426, "eval_loss": 2.7135655879974365, "eval_runtime": 1086.9924, "eval_samples_per_second": 9.111, "eval_steps_per_second": 0.143, "step": 101000}, {"epoch": 4.5466798454488275, "grad_norm": 6.926478385925293, "learning_rate": 4.898687241418965e-05, "loss": 0.5591, "step": 101200}, {"epoch": 4.555665378740228, "grad_norm": 4.796566963195801, "learning_rate": 4.89828912067373e-05, "loss": 0.5589, "step": 101400}, {"epoch": 4.564650912031629, "grad_norm": 12.869160652160645, "learning_rate": 4.897890235471814e-05, "loss": 0.5826, "step": 101600}, {"epoch": 4.57363644532303, "grad_norm": 9.72813892364502, "learning_rate": 4.897490585940363e-05, "loss": 0.5718, "step": 101800}, {"epoch": 4.582621978614431, "grad_norm": 5.5949201583862305, "learning_rate": 4.8970901722067654e-05, "loss": 0.5363, "step": 102000}, {"epoch": 4.582621978614431, "eval_loss": 2.71557879447937, "eval_runtime": 1083.3139, "eval_samples_per_second": 9.142, "eval_steps_per_second": 0.143, "step": 102000}, {"epoch": 4.591607511905831, "grad_norm": 4.014338970184326, "learning_rate": 4.8966889943986524e-05, "loss": 0.5851, "step": 102200}, {"epoch": 4.6005930451972326, "grad_norm": 8.909133911132812, "learning_rate": 4.896287052643902e-05, "loss": 0.5962, "step": 102400}, {"epoch": 4.609578578488633, "grad_norm": 8.902458190917969, "learning_rate": 4.8958843470706326e-05, "loss": 0.5596, "step": 102600}, {"epoch": 4.618564111780034, "grad_norm": 8.509809494018555, "learning_rate": 4.895480877807206e-05, "loss": 0.6035, "step": 102800}, {"epoch": 4.627549645071435, "grad_norm": 5.119136333465576, "learning_rate": 4.895076644982229e-05, "loss": 0.6273, "step": 103000}, {"epoch": 4.627549645071435, "eval_loss": 2.675107002258301, "eval_runtime": 1083.9625, "eval_samples_per_second": 9.137, "eval_steps_per_second": 0.143, "step": 103000}, {"epoch": 4.636535178362836, "grad_norm": 2.670029640197754, "learning_rate": 4.894671648724551e-05, "loss": 0.554, "step": 103200}, {"epoch": 4.645520711654236, "grad_norm": 1.9858131408691406, "learning_rate": 4.8942658891632654e-05, "loss": 0.5506, "step": 103400}, {"epoch": 4.654506244945638, "grad_norm": 4.778411388397217, "learning_rate": 4.893859366427708e-05, "loss": 0.5714, "step": 103600}, {"epoch": 4.663491778237038, "grad_norm": 13.496174812316895, "learning_rate": 4.893452080647457e-05, "loss": 0.5609, "step": 103800}, {"epoch": 4.672477311528439, "grad_norm": 3.933356285095215, "learning_rate": 4.893044031952338e-05, "loss": 0.5461, "step": 104000}, {"epoch": 4.672477311528439, "eval_loss": 2.6608850955963135, "eval_runtime": 1085.6954, "eval_samples_per_second": 9.122, "eval_steps_per_second": 0.143, "step": 104000}, {"epoch": 4.6814628448198405, "grad_norm": 6.484622001647949, "learning_rate": 4.8926352204724145e-05, "loss": 0.5888, "step": 104200}, {"epoch": 4.690448378111241, "grad_norm": 13.072513580322266, "learning_rate": 4.892225646337996e-05, "loss": 0.6129, "step": 104400}, {"epoch": 4.699433911402641, "grad_norm": 9.19959545135498, "learning_rate": 4.891815309679636e-05, "loss": 0.5822, "step": 104600}, {"epoch": 4.708419444694043, "grad_norm": 2.801856517791748, "learning_rate": 4.8914042106281264e-05, "loss": 0.6029, "step": 104800}, {"epoch": 4.717404977985444, "grad_norm": 10.685206413269043, "learning_rate": 4.8909923493145096e-05, "loss": 0.5901, "step": 105000}, {"epoch": 4.717404977985444, "eval_loss": 2.635706901550293, "eval_runtime": 1084.0059, 
"eval_samples_per_second": 9.136, "eval_steps_per_second": 0.143, "step": 105000}, {"epoch": 4.726390511276844, "grad_norm": 3.1026599407196045, "learning_rate": 4.8905797258700634e-05, "loss": 0.5829, "step": 105200}, {"epoch": 4.735376044568245, "grad_norm": 11.270343780517578, "learning_rate": 4.890166340426313e-05, "loss": 0.5699, "step": 105400}, {"epoch": 4.744361577859646, "grad_norm": 7.997730731964111, "learning_rate": 4.8897521931150266e-05, "loss": 0.5969, "step": 105600}, {"epoch": 4.753347111151047, "grad_norm": 9.27990436553955, "learning_rate": 4.8893372840682116e-05, "loss": 0.5781, "step": 105800}, {"epoch": 4.762332644442448, "grad_norm": 6.486850261688232, "learning_rate": 4.888921613418122e-05, "loss": 0.5926, "step": 106000}, {"epoch": 4.762332644442448, "eval_loss": 2.67816424369812, "eval_runtime": 1076.6519, "eval_samples_per_second": 9.199, "eval_steps_per_second": 0.144, "step": 106000}, {"epoch": 4.771318177733848, "grad_norm": 7.903515338897705, "learning_rate": 4.8885051812972536e-05, "loss": 0.5706, "step": 106200}, {"epoch": 4.780303711025249, "grad_norm": 4.940199375152588, "learning_rate": 4.8880879878383436e-05, "loss": 0.5647, "step": 106400}, {"epoch": 4.789289244316651, "grad_norm": 9.641985893249512, "learning_rate": 4.887670033174373e-05, "loss": 0.5661, "step": 106600}, {"epoch": 4.798274777608051, "grad_norm": 6.985136985778809, "learning_rate": 4.887251317438566e-05, "loss": 0.5938, "step": 106800}, {"epoch": 4.807260310899451, "grad_norm": 3.396899700164795, "learning_rate": 4.886831840764387e-05, "loss": 0.572, "step": 107000}, {"epoch": 4.807260310899451, "eval_loss": 2.6387288570404053, "eval_runtime": 1076.2791, "eval_samples_per_second": 9.202, "eval_steps_per_second": 0.144, "step": 107000}, {"epoch": 4.816245844190853, "grad_norm": 12.026623725891113, "learning_rate": 4.8864116032855455e-05, "loss": 0.5438, "step": 107200}, {"epoch": 4.825231377482254, "grad_norm": 5.219661712646484, "learning_rate": 4.885990605135993e-05, "loss": 0.558, "step": 107400}, {"epoch": 4.834216910773654, "grad_norm": 10.39129638671875, "learning_rate": 4.8855688464499215e-05, "loss": 0.5929, "step": 107600}, {"epoch": 4.843202444065056, "grad_norm": 2.12060546875, "learning_rate": 4.8851463273617694e-05, "loss": 0.5864, "step": 107800}, {"epoch": 4.852187977356456, "grad_norm": 15.424951553344727, "learning_rate": 4.884723048006212e-05, "loss": 0.585, "step": 108000}, {"epoch": 4.852187977356456, "eval_loss": 2.6704163551330566, "eval_runtime": 1076.6628, "eval_samples_per_second": 9.199, "eval_steps_per_second": 0.144, "step": 108000}, {"epoch": 4.861173510647857, "grad_norm": 4.717384338378906, "learning_rate": 4.8842990085181725e-05, "loss": 0.5606, "step": 108200}, {"epoch": 4.870159043939258, "grad_norm": 8.064077377319336, "learning_rate": 4.883874209032813e-05, "loss": 0.5986, "step": 108400}, {"epoch": 4.879144577230659, "grad_norm": 3.4180448055267334, "learning_rate": 4.8834486496855374e-05, "loss": 0.5765, "step": 108600}, {"epoch": 4.888130110522059, "grad_norm": 6.318375110626221, "learning_rate": 4.883022330611995e-05, "loss": 0.5866, "step": 108800}, {"epoch": 4.897115643813461, "grad_norm": 8.343177795410156, "learning_rate": 4.8825952519480745e-05, "loss": 0.5684, "step": 109000}, {"epoch": 4.897115643813461, "eval_loss": 2.612858533859253, "eval_runtime": 1076.4447, "eval_samples_per_second": 9.201, "eval_steps_per_second": 0.144, "step": 109000}, {"epoch": 4.906101177104861, "grad_norm": 13.54843807220459, "learning_rate": 
4.882167413829908e-05, "loss": 0.5689, "step": 109200}, {"epoch": 4.915086710396262, "grad_norm": 1.2996422052383423, "learning_rate": 4.8817388163938685e-05, "loss": 0.5665, "step": 109400}, {"epoch": 4.924072243687663, "grad_norm": 1.4910564422607422, "learning_rate": 4.881309459776572e-05, "loss": 0.5883, "step": 109600}, {"epoch": 4.933057776979064, "grad_norm": 4.319411754608154, "learning_rate": 4.880879344114877e-05, "loss": 0.5886, "step": 109800}, {"epoch": 4.942043310270464, "grad_norm": 9.951111793518066, "learning_rate": 4.880448469545882e-05, "loss": 0.5587, "step": 110000}, {"epoch": 4.942043310270464, "eval_loss": 2.679171323776245, "eval_runtime": 1075.904, "eval_samples_per_second": 9.205, "eval_steps_per_second": 0.144, "step": 110000}, {"epoch": 4.951028843561866, "grad_norm": 5.12622594833374, "learning_rate": 4.8800168362069295e-05, "loss": 0.6082, "step": 110200}, {"epoch": 4.960014376853266, "grad_norm": 9.128108978271484, "learning_rate": 4.8795844442356036e-05, "loss": 0.5774, "step": 110400}, {"epoch": 4.968999910144667, "grad_norm": 13.645403861999512, "learning_rate": 4.879151293769729e-05, "loss": 0.6136, "step": 110600}, {"epoch": 4.977985443436068, "grad_norm": 4.305540084838867, "learning_rate": 4.878717384947372e-05, "loss": 0.6004, "step": 110800}, {"epoch": 4.986970976727469, "grad_norm": 2.3471438884735107, "learning_rate": 4.878282717906843e-05, "loss": 0.5718, "step": 111000}, {"epoch": 4.986970976727469, "eval_loss": 2.6824982166290283, "eval_runtime": 1076.2318, "eval_samples_per_second": 9.202, "eval_steps_per_second": 0.144, "step": 111000}, {"epoch": 4.995956510018869, "grad_norm": 3.578322172164917, "learning_rate": 4.8778472927866905e-05, "loss": 0.5599, "step": 111200}, {"epoch": 5.004942043310271, "grad_norm": 8.115492820739746, "learning_rate": 4.877411109725707e-05, "loss": 0.5391, "step": 111400}, {"epoch": 5.013927576601671, "grad_norm": 5.805984020233154, "learning_rate": 4.8769741688629276e-05, "loss": 0.5613, "step": 111600}, {"epoch": 5.022913109893072, "grad_norm": 15.611380577087402, "learning_rate": 4.8765364703376275e-05, "loss": 0.57, "step": 111800}, {"epoch": 5.031898643184473, "grad_norm": 14.959733009338379, "learning_rate": 4.876098014289322e-05, "loss": 0.5168, "step": 112000}, {"epoch": 5.031898643184473, "eval_loss": 2.672183036804199, "eval_runtime": 1076.4621, "eval_samples_per_second": 9.201, "eval_steps_per_second": 0.144, "step": 112000}, {"epoch": 5.040884176475874, "grad_norm": 6.3477864265441895, "learning_rate": 4.875658800857771e-05, "loss": 0.5427, "step": 112200}, {"epoch": 5.0498697097672745, "grad_norm": 5.391243934631348, "learning_rate": 4.8752188301829726e-05, "loss": 0.5698, "step": 112400}, {"epoch": 5.058855243058676, "grad_norm": 6.428415298461914, "learning_rate": 4.8747781024051686e-05, "loss": 0.551, "step": 112600}, {"epoch": 5.067840776350076, "grad_norm": 6.255007266998291, "learning_rate": 4.874336617664842e-05, "loss": 0.5098, "step": 112800}, {"epoch": 5.076826309641477, "grad_norm": 4.247288703918457, "learning_rate": 4.873894376102715e-05, "loss": 0.5399, "step": 113000}, {"epoch": 5.076826309641477, "eval_loss": 2.692117214202881, "eval_runtime": 1077.848, "eval_samples_per_second": 9.189, "eval_steps_per_second": 0.144, "step": 113000}, {"epoch": 5.085811842932878, "grad_norm": 4.478646755218506, "learning_rate": 4.873451377859753e-05, "loss": 0.5266, "step": 113200}, {"epoch": 5.094797376224279, "grad_norm": 4.759102821350098, "learning_rate": 4.873007623077162e-05, "loss": 0.5708, 
"step": 113400}, {"epoch": 5.1037829095156795, "grad_norm": 6.76074743270874, "learning_rate": 4.872563111896391e-05, "loss": 0.5347, "step": 113600}, {"epoch": 5.112768442807081, "grad_norm": 13.389432907104492, "learning_rate": 4.872117844459126e-05, "loss": 0.5058, "step": 113800}, {"epoch": 5.121753976098481, "grad_norm": 7.0974297523498535, "learning_rate": 4.871671820907296e-05, "loss": 0.549, "step": 114000}, {"epoch": 5.121753976098481, "eval_loss": 2.6620500087738037, "eval_runtime": 1077.5471, "eval_samples_per_second": 9.191, "eval_steps_per_second": 0.144, "step": 114000}, {"epoch": 5.130739509389882, "grad_norm": 3.2014670372009277, "learning_rate": 4.871225041383074e-05, "loss": 0.5409, "step": 114200}, {"epoch": 5.139725042681283, "grad_norm": 6.361083984375, "learning_rate": 4.8707775060288695e-05, "loss": 0.5407, "step": 114400}, {"epoch": 5.148710575972684, "grad_norm": 12.352490425109863, "learning_rate": 4.8703292149873356e-05, "loss": 0.5898, "step": 114600}, {"epoch": 5.1576961092640845, "grad_norm": 6.829831123352051, "learning_rate": 4.869880168401364e-05, "loss": 0.5598, "step": 114800}, {"epoch": 5.166681642555486, "grad_norm": 9.012941360473633, "learning_rate": 4.86943036641409e-05, "loss": 0.5792, "step": 115000}, {"epoch": 5.166681642555486, "eval_loss": 2.6695964336395264, "eval_runtime": 1076.6032, "eval_samples_per_second": 9.199, "eval_steps_per_second": 0.144, "step": 115000}, {"epoch": 5.175667175846886, "grad_norm": 5.5551838874816895, "learning_rate": 4.868979809168889e-05, "loss": 0.5334, "step": 115200}, {"epoch": 5.1846527091382875, "grad_norm": 5.080362796783447, "learning_rate": 4.8685284968093745e-05, "loss": 0.5476, "step": 115400}, {"epoch": 5.193638242429688, "grad_norm": 3.391294479370117, "learning_rate": 4.868076429479403e-05, "loss": 0.541, "step": 115600}, {"epoch": 5.202623775721089, "grad_norm": 5.813953399658203, "learning_rate": 4.867623607323074e-05, "loss": 0.5506, "step": 115800}, {"epoch": 5.2116093090124895, "grad_norm": 3.1033880710601807, "learning_rate": 4.8671700304847216e-05, "loss": 0.5843, "step": 116000}, {"epoch": 5.2116093090124895, "eval_loss": 2.706368923187256, "eval_runtime": 1124.9655, "eval_samples_per_second": 8.804, "eval_steps_per_second": 0.138, "step": 116000}, {"epoch": 5.220594842303891, "grad_norm": 2.261789321899414, "learning_rate": 4.866715699108926e-05, "loss": 0.5736, "step": 116200}, {"epoch": 5.229580375595291, "grad_norm": 6.052493095397949, "learning_rate": 4.866260613340504e-05, "loss": 0.5848, "step": 116400}, {"epoch": 5.2385659088866925, "grad_norm": 12.537518501281738, "learning_rate": 4.8658047733245166e-05, "loss": 0.5431, "step": 116600}, {"epoch": 5.247551442178093, "grad_norm": 4.784250736236572, "learning_rate": 4.8653481792062615e-05, "loss": 0.5338, "step": 116800}, {"epoch": 5.256536975469494, "grad_norm": 5.308268070220947, "learning_rate": 4.8648908311312794e-05, "loss": 0.607, "step": 117000}, {"epoch": 5.256536975469494, "eval_loss": 2.680147647857666, "eval_runtime": 1125.8958, "eval_samples_per_second": 8.797, "eval_steps_per_second": 0.138, "step": 117000}, {"epoch": 5.265522508760895, "grad_norm": 2.42497181892395, "learning_rate": 4.86443272924535e-05, "loss": 0.5626, "step": 117200}, {"epoch": 5.274508042052296, "grad_norm": 4.430539131164551, "learning_rate": 4.8639738736944934e-05, "loss": 0.5452, "step": 117400}, {"epoch": 5.283493575343696, "grad_norm": 2.8931050300598145, "learning_rate": 4.863514264624971e-05, "loss": 0.5511, "step": 117600}, {"epoch": 
5.2924791086350975, "grad_norm": 4.152849197387695, "learning_rate": 4.8630539021832824e-05, "loss": 0.5992, "step": 117800}, {"epoch": 5.301464641926499, "grad_norm": 4.759932518005371, "learning_rate": 4.8625927865161694e-05, "loss": 0.562, "step": 118000}, {"epoch": 5.301464641926499, "eval_loss": 2.679501533508301, "eval_runtime": 1123.4329, "eval_samples_per_second": 8.816, "eval_steps_per_second": 0.138, "step": 118000}, {"epoch": 5.310450175217899, "grad_norm": 3.476011037826538, "learning_rate": 4.862130917770613e-05, "loss": 0.5785, "step": 118200}, {"epoch": 5.3194357085093, "grad_norm": 5.236737251281738, "learning_rate": 4.861668296093834e-05, "loss": 0.567, "step": 118400}, {"epoch": 5.328421241800701, "grad_norm": 4.2177348136901855, "learning_rate": 4.8612049216332935e-05, "loss": 0.5841, "step": 118600}, {"epoch": 5.337406775092102, "grad_norm": 11.418831825256348, "learning_rate": 4.8607407945366924e-05, "loss": 0.5766, "step": 118800}, {"epoch": 5.3463923083835025, "grad_norm": 3.5538837909698486, "learning_rate": 4.8602759149519716e-05, "loss": 0.564, "step": 119000}, {"epoch": 5.3463923083835025, "eval_loss": 2.6711316108703613, "eval_runtime": 1126.4665, "eval_samples_per_second": 8.792, "eval_steps_per_second": 0.138, "step": 119000}, {"epoch": 5.355377841674903, "grad_norm": 4.001996994018555, "learning_rate": 4.859810283027312e-05, "loss": 0.5761, "step": 119200}, {"epoch": 5.364363374966304, "grad_norm": 3.8045248985290527, "learning_rate": 4.8593438989111345e-05, "loss": 0.556, "step": 119400}, {"epoch": 5.3733489082577055, "grad_norm": 4.172726154327393, "learning_rate": 4.858876762752099e-05, "loss": 0.532, "step": 119600}, {"epoch": 5.382334441549106, "grad_norm": 3.246440887451172, "learning_rate": 4.858408874699105e-05, "loss": 0.5384, "step": 119800}, {"epoch": 5.391319974840507, "grad_norm": 4.557338714599609, "learning_rate": 4.8579402349012936e-05, "loss": 0.5814, "step": 120000}, {"epoch": 5.391319974840507, "eval_loss": 2.5864908695220947, "eval_runtime": 1127.6464, "eval_samples_per_second": 8.783, "eval_steps_per_second": 0.137, "step": 120000}, {"epoch": 5.400305508131908, "grad_norm": 4.541125297546387, "learning_rate": 4.857470843508043e-05, "loss": 0.5676, "step": 120200}, {"epoch": 5.409291041423309, "grad_norm": 5.430272579193115, "learning_rate": 4.857000700668973e-05, "loss": 0.5563, "step": 120400}, {"epoch": 5.418276574714709, "grad_norm": 6.92936372756958, "learning_rate": 4.8565298065339405e-05, "loss": 0.549, "step": 120600}, {"epoch": 5.4272621080061105, "grad_norm": 7.017961025238037, "learning_rate": 4.856058161253045e-05, "loss": 0.5848, "step": 120800}, {"epoch": 5.436247641297511, "grad_norm": 9.248579978942871, "learning_rate": 4.855585764976623e-05, "loss": 0.5389, "step": 121000}, {"epoch": 5.436247641297511, "eval_loss": 2.6353914737701416, "eval_runtime": 1126.4128, "eval_samples_per_second": 8.793, "eval_steps_per_second": 0.138, "step": 121000}, {"epoch": 5.445233174588912, "grad_norm": 4.005666255950928, "learning_rate": 4.8551126178552514e-05, "loss": 0.5066, "step": 121200}, {"epoch": 5.454218707880313, "grad_norm": 8.623493194580078, "learning_rate": 4.854638720039746e-05, "loss": 0.6034, "step": 121400}, {"epoch": 5.463204241171714, "grad_norm": 2.6416425704956055, "learning_rate": 4.854164071681163e-05, "loss": 0.6142, "step": 121600}, {"epoch": 5.472189774463114, "grad_norm": 10.089157104492188, "learning_rate": 4.853688672930796e-05, "loss": 0.5622, "step": 121800}, {"epoch": 5.481175307754516, "grad_norm": 
4.700775146484375, "learning_rate": 4.853212523940179e-05, "loss": 0.5023, "step": 122000}, {"epoch": 5.481175307754516, "eval_loss": 2.6258456707000732, "eval_runtime": 1126.3011, "eval_samples_per_second": 8.793, "eval_steps_per_second": 0.138, "step": 122000}, {"epoch": 5.490160841045916, "grad_norm": 3.110429048538208, "learning_rate": 4.852735624861086e-05, "loss": 0.5401, "step": 122200}, {"epoch": 5.499146374337317, "grad_norm": 3.0017948150634766, "learning_rate": 4.8522579758455274e-05, "loss": 0.5053, "step": 122400}, {"epoch": 5.508131907628718, "grad_norm": 32.01022720336914, "learning_rate": 4.851779577045754e-05, "loss": 0.5696, "step": 122600}, {"epoch": 5.517117440920119, "grad_norm": 3.6444568634033203, "learning_rate": 4.8513004286142575e-05, "loss": 0.5667, "step": 122800}, {"epoch": 5.526102974211519, "grad_norm": 3.843571424484253, "learning_rate": 4.850820530703766e-05, "loss": 0.5343, "step": 123000}, {"epoch": 5.526102974211519, "eval_loss": 2.6320242881774902, "eval_runtime": 1124.7644, "eval_samples_per_second": 8.805, "eval_steps_per_second": 0.138, "step": 123000}, {"epoch": 5.535088507502921, "grad_norm": 8.31619930267334, "learning_rate": 4.8503398834672475e-05, "loss": 0.5359, "step": 123200}, {"epoch": 5.544074040794321, "grad_norm": 7.517163276672363, "learning_rate": 4.849858487057908e-05, "loss": 0.5299, "step": 123400}, {"epoch": 5.553059574085722, "grad_norm": 8.95091724395752, "learning_rate": 4.849376341629194e-05, "loss": 0.5113, "step": 123600}, {"epoch": 5.562045107377123, "grad_norm": 4.462621212005615, "learning_rate": 4.848893447334789e-05, "loss": 0.5366, "step": 123800}, {"epoch": 5.571030640668524, "grad_norm": 10.940470695495605, "learning_rate": 4.848409804328617e-05, "loss": 0.5379, "step": 124000}, {"epoch": 5.571030640668524, "eval_loss": 2.6875741481781006, "eval_runtime": 1125.7965, "eval_samples_per_second": 8.797, "eval_steps_per_second": 0.138, "step": 124000}, {"epoch": 5.580016173959924, "grad_norm": 6.110741138458252, "learning_rate": 4.847925412764838e-05, "loss": 0.5844, "step": 124200}, {"epoch": 5.589001707251326, "grad_norm": 8.463932037353516, "learning_rate": 4.847440272797854e-05, "loss": 0.5432, "step": 124400}, {"epoch": 5.597987240542726, "grad_norm": 5.193777561187744, "learning_rate": 4.846954384582303e-05, "loss": 0.5529, "step": 124600}, {"epoch": 5.606972773834127, "grad_norm": 20.273698806762695, "learning_rate": 4.8464677482730616e-05, "loss": 0.5491, "step": 124800}, {"epoch": 5.615958307125528, "grad_norm": 13.971944808959961, "learning_rate": 4.845980364025246e-05, "loss": 0.521, "step": 125000}, {"epoch": 5.615958307125528, "eval_loss": 2.638272523880005, "eval_runtime": 1125.3953, "eval_samples_per_second": 8.8, "eval_steps_per_second": 0.138, "step": 125000}, {"epoch": 5.624943840416929, "grad_norm": 9.242423057556152, "learning_rate": 4.845492231994211e-05, "loss": 0.5348, "step": 125200}, {"epoch": 5.633929373708329, "grad_norm": 11.727241516113281, "learning_rate": 4.8450033523355484e-05, "loss": 0.5712, "step": 125400}, {"epoch": 5.642914906999731, "grad_norm": 6.178032875061035, "learning_rate": 4.8445137252050885e-05, "loss": 0.5304, "step": 125600}, {"epoch": 5.651900440291131, "grad_norm": 2.3145875930786133, "learning_rate": 4.844023350758902e-05, "loss": 0.5708, "step": 125800}, {"epoch": 5.660885973582532, "grad_norm": 10.514315605163574, "learning_rate": 4.843532229153295e-05, "loss": 0.5351, "step": 126000}, {"epoch": 5.660885973582532, "eval_loss": 2.6288137435913086, "eval_runtime": 
1125.1485, "eval_samples_per_second": 8.802, "eval_steps_per_second": 0.138, "step": 126000}, {"epoch": 5.669871506873933, "grad_norm": 4.7612762451171875, "learning_rate": 4.843040360544813e-05, "loss": 0.5437, "step": 126200}, {"epoch": 5.678857040165334, "grad_norm": 10.429271697998047, "learning_rate": 4.84254774509024e-05, "loss": 0.5677, "step": 126400}, {"epoch": 5.687842573456734, "grad_norm": 9.046426773071289, "learning_rate": 4.842054382946597e-05, "loss": 0.5346, "step": 126600}, {"epoch": 5.696828106748136, "grad_norm": 6.291619777679443, "learning_rate": 4.8415602742711444e-05, "loss": 0.5429, "step": 126800}, {"epoch": 5.705813640039536, "grad_norm": 4.383120059967041, "learning_rate": 4.8410654192213786e-05, "loss": 0.5791, "step": 127000}, {"epoch": 5.705813640039536, "eval_loss": 2.6114344596862793, "eval_runtime": 1111.2202, "eval_samples_per_second": 8.913, "eval_steps_per_second": 0.139, "step": 127000}, {"epoch": 5.714799173330937, "grad_norm": 7.231574058532715, "learning_rate": 4.840569817955035e-05, "loss": 0.549, "step": 127200}, {"epoch": 5.723784706622338, "grad_norm": 6.7952752113342285, "learning_rate": 4.840073470630089e-05, "loss": 0.5701, "step": 127400}, {"epoch": 5.732770239913739, "grad_norm": 13.880270957946777, "learning_rate": 4.83957637740475e-05, "loss": 0.5792, "step": 127600}, {"epoch": 5.741755773205139, "grad_norm": 3.9061381816864014, "learning_rate": 4.8390785384374664e-05, "loss": 0.5452, "step": 127800}, {"epoch": 5.750741306496541, "grad_norm": 5.482219696044922, "learning_rate": 4.838579953886927e-05, "loss": 0.5535, "step": 128000}, {"epoch": 5.750741306496541, "eval_loss": 2.6782829761505127, "eval_runtime": 1109.7824, "eval_samples_per_second": 8.924, "eval_steps_per_second": 0.14, "step": 128000}, {"epoch": 5.759726839787941, "grad_norm": 10.9642972946167, "learning_rate": 4.838080623912054e-05, "loss": 0.5603, "step": 128200}, {"epoch": 5.768712373079342, "grad_norm": 8.078912734985352, "learning_rate": 4.8375805486720086e-05, "loss": 0.5436, "step": 128400}, {"epoch": 5.777697906370743, "grad_norm": 4.08800745010376, "learning_rate": 4.8370797283261925e-05, "loss": 0.5288, "step": 128600}, {"epoch": 5.786683439662144, "grad_norm": 3.705470561981201, "learning_rate": 4.836578163034242e-05, "loss": 0.5173, "step": 128800}, {"epoch": 5.795668972953544, "grad_norm": 5.712687015533447, "learning_rate": 4.8360758529560314e-05, "loss": 0.5144, "step": 129000}, {"epoch": 5.795668972953544, "eval_loss": 2.654538631439209, "eval_runtime": 1110.9444, "eval_samples_per_second": 8.915, "eval_steps_per_second": 0.14, "step": 129000}, {"epoch": 5.804654506244946, "grad_norm": 4.038150310516357, "learning_rate": 4.835572798251671e-05, "loss": 0.5622, "step": 129200}, {"epoch": 5.813640039536346, "grad_norm": 8.389162063598633, "learning_rate": 4.8350689990815124e-05, "loss": 0.5431, "step": 129400}, {"epoch": 5.822625572827747, "grad_norm": 9.799603462219238, "learning_rate": 4.8345644556061396e-05, "loss": 0.5496, "step": 129600}, {"epoch": 5.831611106119148, "grad_norm": 44.71828842163086, "learning_rate": 4.8340591679863776e-05, "loss": 0.5837, "step": 129800}, {"epoch": 5.840596639410549, "grad_norm": 5.973487854003906, "learning_rate": 4.833553136383287e-05, "loss": 0.5761, "step": 130000}, {"epoch": 5.840596639410549, "eval_loss": 2.5852513313293457, "eval_runtime": 1110.4328, "eval_samples_per_second": 8.919, "eval_steps_per_second": 0.14, "step": 130000}, {"epoch": 5.84958217270195, "grad_norm": 2.016286611557007, "learning_rate": 
4.833046360958165e-05, "loss": 0.5219, "step": 130200}, {"epoch": 5.858567705993351, "grad_norm": 2.8672537803649902, "learning_rate": 4.832538841872549e-05, "loss": 0.5476, "step": 130400}, {"epoch": 5.867553239284751, "grad_norm": 17.733501434326172, "learning_rate": 4.832030579288209e-05, "loss": 0.5759, "step": 130600}, {"epoch": 5.876538772576152, "grad_norm": 3.3349339962005615, "learning_rate": 4.831521573367154e-05, "loss": 0.5417, "step": 130800}, {"epoch": 5.885524305867554, "grad_norm": 8.842341423034668, "learning_rate": 4.8310118242716315e-05, "loss": 0.5808, "step": 131000}, {"epoch": 5.885524305867554, "eval_loss": 2.6102592945098877, "eval_runtime": 1109.8113, "eval_samples_per_second": 8.924, "eval_steps_per_second": 0.14, "step": 131000}, {"epoch": 5.894509839158954, "grad_norm": 17.3737850189209, "learning_rate": 4.830501332164124e-05, "loss": 0.5337, "step": 131200}, {"epoch": 5.9034953724503545, "grad_norm": 2.934797525405884, "learning_rate": 4.829990097207351e-05, "loss": 0.557, "step": 131400}, {"epoch": 5.912480905741756, "grad_norm": 3.8777339458465576, "learning_rate": 4.829478119564269e-05, "loss": 0.551, "step": 131600}, {"epoch": 5.921466439033157, "grad_norm": 4.155474662780762, "learning_rate": 4.828965399398071e-05, "loss": 0.5124, "step": 131800}, {"epoch": 5.9304519723245575, "grad_norm": 129.3715057373047, "learning_rate": 4.828451936872187e-05, "loss": 0.5903, "step": 132000}, {"epoch": 5.9304519723245575, "eval_loss": 2.62882924079895, "eval_runtime": 1109.5966, "eval_samples_per_second": 8.926, "eval_steps_per_second": 0.14, "step": 132000}, {"epoch": 5.939437505615958, "grad_norm": 15.213759422302246, "learning_rate": 4.827937732150285e-05, "loss": 0.5439, "step": 132200}, {"epoch": 5.948423038907359, "grad_norm": 5.646575450897217, "learning_rate": 4.827422785396267e-05, "loss": 0.5778, "step": 132400}, {"epoch": 5.95740857219876, "grad_norm": 14.637299537658691, "learning_rate": 4.8269070967742725e-05, "loss": 0.5321, "step": 132600}, {"epoch": 5.966394105490161, "grad_norm": 5.925998687744141, "learning_rate": 4.826390666448679e-05, "loss": 0.5413, "step": 132800}, {"epoch": 5.975379638781561, "grad_norm": 15.88015079498291, "learning_rate": 4.825873494584097e-05, "loss": 0.5342, "step": 133000}, {"epoch": 5.975379638781561, "eval_loss": 2.6159465312957764, "eval_runtime": 1111.9916, "eval_samples_per_second": 8.907, "eval_steps_per_second": 0.139, "step": 133000}, {"epoch": 5.9843651720729625, "grad_norm": 5.7126359939575195, "learning_rate": 4.8253555813453775e-05, "loss": 0.5362, "step": 133200}, {"epoch": 5.993350705364364, "grad_norm": 6.177489757537842, "learning_rate": 4.824836926897604e-05, "loss": 0.5586, "step": 133400}, {"epoch": 6.002336238655764, "grad_norm": 4.75473165512085, "learning_rate": 4.8243175314061e-05, "loss": 0.5288, "step": 133600}, {"epoch": 6.011321771947165, "grad_norm": 2.6426875591278076, "learning_rate": 4.8237973950364225e-05, "loss": 0.5172, "step": 133800}, {"epoch": 6.020307305238566, "grad_norm": 4.771461009979248, "learning_rate": 4.823276517954365e-05, "loss": 0.553, "step": 134000}, {"epoch": 6.020307305238566, "eval_loss": 2.6342790126800537, "eval_runtime": 1109.0332, "eval_samples_per_second": 8.93, "eval_steps_per_second": 0.14, "step": 134000}, {"epoch": 6.029292838529967, "grad_norm": 6.850405216217041, "learning_rate": 4.822754900325958e-05, "loss": 0.5677, "step": 134200}, {"epoch": 6.0382783718213675, "grad_norm": 6.183258533477783, "learning_rate": 4.822232542317466e-05, "loss": 0.5072, "step": 
134400}, {"epoch": 6.047263905112769, "grad_norm": 8.269383430480957, "learning_rate": 4.821709444095393e-05, "loss": 0.5206, "step": 134600}, {"epoch": 6.056249438404169, "grad_norm": 1.2506552934646606, "learning_rate": 4.821185605826476e-05, "loss": 0.4931, "step": 134800}, {"epoch": 6.0652349716955705, "grad_norm": 5.354737281799316, "learning_rate": 4.820661027677689e-05, "loss": 0.5413, "step": 135000}, {"epoch": 6.0652349716955705, "eval_loss": 2.612915515899658, "eval_runtime": 1109.5309, "eval_samples_per_second": 8.926, "eval_steps_per_second": 0.14, "step": 135000}, {"epoch": 6.074220504986971, "grad_norm": 3.7436015605926514, "learning_rate": 4.820135709816242e-05, "loss": 0.5262, "step": 135200}, {"epoch": 6.083206038278372, "grad_norm": 2.3418149948120117, "learning_rate": 4.8196096524095815e-05, "loss": 0.4969, "step": 135400}, {"epoch": 6.0921915715697725, "grad_norm": 3.5079879760742188, "learning_rate": 4.8190828556253864e-05, "loss": 0.5307, "step": 135600}, {"epoch": 6.101177104861174, "grad_norm": 5.637112140655518, "learning_rate": 4.8185553196315755e-05, "loss": 0.4973, "step": 135800}, {"epoch": 6.110162638152574, "grad_norm": 9.889835357666016, "learning_rate": 4.8180270445963004e-05, "loss": 0.5798, "step": 136000}, {"epoch": 6.110162638152574, "eval_loss": 2.644315481185913, "eval_runtime": 1108.8674, "eval_samples_per_second": 8.932, "eval_steps_per_second": 0.14, "step": 136000}, {"epoch": 6.1191481714439755, "grad_norm": 5.801605701446533, "learning_rate": 4.817498030687949e-05, "loss": 0.5192, "step": 136200}, {"epoch": 6.128133704735376, "grad_norm": 7.900972843170166, "learning_rate": 4.8169682780751465e-05, "loss": 0.4924, "step": 136400}, {"epoch": 6.137119238026777, "grad_norm": 4.622593879699707, "learning_rate": 4.816437786926751e-05, "loss": 0.5523, "step": 136600}, {"epoch": 6.146104771318178, "grad_norm": 5.807979106903076, "learning_rate": 4.815906557411856e-05, "loss": 0.5208, "step": 136800}, {"epoch": 6.155090304609579, "grad_norm": 42.20900344848633, "learning_rate": 4.8153745896997926e-05, "loss": 0.5296, "step": 137000}, {"epoch": 6.155090304609579, "eval_loss": 2.6667978763580322, "eval_runtime": 1109.2515, "eval_samples_per_second": 8.929, "eval_steps_per_second": 0.14, "step": 137000}, {"epoch": 6.164075837900979, "grad_norm": 7.494675636291504, "learning_rate": 4.814841883960126e-05, "loss": 0.5432, "step": 137200}, {"epoch": 6.1730613711923805, "grad_norm": 24.198781967163086, "learning_rate": 4.814308440362656e-05, "loss": 0.5392, "step": 137400}, {"epoch": 6.182046904483781, "grad_norm": 4.07385778427124, "learning_rate": 4.8137742590774195e-05, "loss": 0.5453, "step": 137600}, {"epoch": 6.191032437775182, "grad_norm": 3.366076707839966, "learning_rate": 4.813239340274685e-05, "loss": 0.5586, "step": 137800}, {"epoch": 6.200017971066583, "grad_norm": 2.3177366256713867, "learning_rate": 4.8127036841249596e-05, "loss": 0.516, "step": 138000}, {"epoch": 6.200017971066583, "eval_loss": 2.58992862701416, "eval_runtime": 1042.972, "eval_samples_per_second": 9.496, "eval_steps_per_second": 0.149, "step": 138000}, {"epoch": 6.209003504357984, "grad_norm": 7.948215007781982, "learning_rate": 4.812167290798984e-05, "loss": 0.5612, "step": 138200}, {"epoch": 6.217989037649384, "grad_norm": 4.769832611083984, "learning_rate": 4.811630160467735e-05, "loss": 0.5632, "step": 138400}, {"epoch": 6.2269745709407855, "grad_norm": 3.1266725063323975, "learning_rate": 4.8110922933024214e-05, "loss": 0.5323, "step": 138600}, {"epoch": 6.235960104232186, 
"grad_norm": 3.03983211517334, "learning_rate": 4.8105536894744904e-05, "loss": 0.5069, "step": 138800}, {"epoch": 6.244945637523587, "grad_norm": 13.369333267211914, "learning_rate": 4.810014349155621e-05, "loss": 0.5327, "step": 139000}, {"epoch": 6.244945637523587, "eval_loss": 2.632561683654785, "eval_runtime": 1042.6567, "eval_samples_per_second": 9.499, "eval_steps_per_second": 0.149, "step": 139000}, {"epoch": 6.253931170814988, "grad_norm": 4.6813836097717285, "learning_rate": 4.809474272517731e-05, "loss": 0.5188, "step": 139200}, {"epoch": 6.262916704106389, "grad_norm": 8.677014350891113, "learning_rate": 4.8089334597329674e-05, "loss": 0.5233, "step": 139400}, {"epoch": 6.271902237397789, "grad_norm": 10.864197731018066, "learning_rate": 4.8083919109737165e-05, "loss": 0.5193, "step": 139600}, {"epoch": 6.280887770689191, "grad_norm": 5.195317268371582, "learning_rate": 4.807849626412596e-05, "loss": 0.5343, "step": 139800}, {"epoch": 6.289873303980591, "grad_norm": 2.9889798164367676, "learning_rate": 4.8073066062224605e-05, "loss": 0.5322, "step": 140000}, {"epoch": 6.289873303980591, "eval_loss": 2.6202876567840576, "eval_runtime": 1042.8692, "eval_samples_per_second": 9.497, "eval_steps_per_second": 0.149, "step": 140000}, {"epoch": 6.298858837271992, "grad_norm": 2.6103203296661377, "learning_rate": 4.8067628505763986e-05, "loss": 0.5202, "step": 140200}, {"epoch": 6.307844370563393, "grad_norm": 4.392446517944336, "learning_rate": 4.806218359647732e-05, "loss": 0.5528, "step": 140400}, {"epoch": 6.316829903854794, "grad_norm": 12.344572067260742, "learning_rate": 4.8056731336100175e-05, "loss": 0.5158, "step": 140600}, {"epoch": 6.325815437146194, "grad_norm": 4.688963413238525, "learning_rate": 4.8051271726370474e-05, "loss": 0.5684, "step": 140800}, {"epoch": 6.334800970437596, "grad_norm": 5.1644134521484375, "learning_rate": 4.8045804769028454e-05, "loss": 0.5473, "step": 141000}, {"epoch": 6.334800970437596, "eval_loss": 2.647378921508789, "eval_runtime": 1042.5176, "eval_samples_per_second": 9.5, "eval_steps_per_second": 0.149, "step": 141000}, {"epoch": 6.343786503728996, "grad_norm": 4.703906059265137, "learning_rate": 4.804033046581674e-05, "loss": 0.5046, "step": 141200}, {"epoch": 6.352772037020397, "grad_norm": 5.541541576385498, "learning_rate": 4.803484881848025e-05, "loss": 0.5424, "step": 141400}, {"epoch": 6.361757570311798, "grad_norm": 8.089109420776367, "learning_rate": 4.802935982876626e-05, "loss": 0.5066, "step": 141600}, {"epoch": 6.370743103603199, "grad_norm": 7.817598819732666, "learning_rate": 4.802386349842441e-05, "loss": 0.4951, "step": 141800}, {"epoch": 6.379728636894599, "grad_norm": 14.34579086303711, "learning_rate": 4.8018359829206646e-05, "loss": 0.5504, "step": 142000}, {"epoch": 6.379728636894599, "eval_loss": 2.6440494060516357, "eval_runtime": 1042.2395, "eval_samples_per_second": 9.503, "eval_steps_per_second": 0.149, "step": 142000}, {"epoch": 6.388714170186001, "grad_norm": 1.8953040838241577, "learning_rate": 4.801284882286727e-05, "loss": 0.5236, "step": 142200}, {"epoch": 6.397699703477401, "grad_norm": 7.690189838409424, "learning_rate": 4.800733048116291e-05, "loss": 0.5286, "step": 142400}, {"epoch": 6.406685236768802, "grad_norm": 4.344729423522949, "learning_rate": 4.8001804805852566e-05, "loss": 0.5673, "step": 142600}, {"epoch": 6.415670770060203, "grad_norm": 4.415552139282227, "learning_rate": 4.7996271798697534e-05, "loss": 0.5343, "step": 142800}, {"epoch": 6.424656303351604, "grad_norm": 8.222256660461426, 
"learning_rate": 4.799073146146147e-05, "loss": 0.5271, "step": 143000}, {"epoch": 6.424656303351604, "eval_loss": 2.661680221557617, "eval_runtime": 1042.5056, "eval_samples_per_second": 9.5, "eval_steps_per_second": 0.149, "step": 143000}, {"epoch": 6.433641836643004, "grad_norm": 10.482327461242676, "learning_rate": 4.798518379591035e-05, "loss": 0.5422, "step": 143200}, {"epoch": 6.442627369934406, "grad_norm": 5.589601516723633, "learning_rate": 4.7979628803812516e-05, "loss": 0.4927, "step": 143400}, {"epoch": 6.451612903225806, "grad_norm": 5.369229793548584, "learning_rate": 4.7974066486938613e-05, "loss": 0.5206, "step": 143600}, {"epoch": 6.460598436517207, "grad_norm": 10.578944206237793, "learning_rate": 4.796849684706164e-05, "loss": 0.5118, "step": 143800}, {"epoch": 6.469583969808608, "grad_norm": 5.688765525817871, "learning_rate": 4.7962919885956916e-05, "loss": 0.5278, "step": 144000}, {"epoch": 6.469583969808608, "eval_loss": 2.5855579376220703, "eval_runtime": 1042.8155, "eval_samples_per_second": 9.497, "eval_steps_per_second": 0.149, "step": 144000}, {"epoch": 6.478569503100009, "grad_norm": 13.294556617736816, "learning_rate": 4.795733560540211e-05, "loss": 0.5206, "step": 144200}, {"epoch": 6.487555036391409, "grad_norm": 23.359086990356445, "learning_rate": 4.7951744007177226e-05, "loss": 0.5141, "step": 144400}, {"epoch": 6.496540569682811, "grad_norm": 7.575876712799072, "learning_rate": 4.794614509306457e-05, "loss": 0.5391, "step": 144600}, {"epoch": 6.505526102974212, "grad_norm": 11.292476654052734, "learning_rate": 4.794053886484882e-05, "loss": 0.5605, "step": 144800}, {"epoch": 6.514511636265612, "grad_norm": 3.0334506034851074, "learning_rate": 4.7934925324316944e-05, "loss": 0.5455, "step": 145000}, {"epoch": 6.514511636265612, "eval_loss": 2.6387248039245605, "eval_runtime": 1043.1059, "eval_samples_per_second": 9.495, "eval_steps_per_second": 0.149, "step": 145000}, {"epoch": 6.523497169557013, "grad_norm": 7.96580171585083, "learning_rate": 4.792930447325827e-05, "loss": 0.5582, "step": 145200}, {"epoch": 6.532482702848414, "grad_norm": 9.228450775146484, "learning_rate": 4.792367631346447e-05, "loss": 0.5611, "step": 145400}, {"epoch": 6.541468236139815, "grad_norm": 7.638996124267578, "learning_rate": 4.79180408467295e-05, "loss": 0.4968, "step": 145600}, {"epoch": 6.550453769431216, "grad_norm": 3.997795343399048, "learning_rate": 4.791239807484968e-05, "loss": 0.5158, "step": 145800}, {"epoch": 6.559439302722616, "grad_norm": 6.292296886444092, "learning_rate": 4.7906747999623644e-05, "loss": 0.4836, "step": 146000}, {"epoch": 6.559439302722616, "eval_loss": 2.7034900188446045, "eval_runtime": 1041.7965, "eval_samples_per_second": 9.507, "eval_steps_per_second": 0.149, "step": 146000}, {"epoch": 6.568424836014017, "grad_norm": 4.545322418212891, "learning_rate": 4.790109062285236e-05, "loss": 0.513, "step": 146200}, {"epoch": 6.577410369305419, "grad_norm": 7.309622287750244, "learning_rate": 4.789542594633913e-05, "loss": 0.5276, "step": 146400}, {"epoch": 6.586395902596819, "grad_norm": 6.452086925506592, "learning_rate": 4.788975397188956e-05, "loss": 0.5494, "step": 146600}, {"epoch": 6.59538143588822, "grad_norm": 11.666097640991211, "learning_rate": 4.788407470131161e-05, "loss": 0.5539, "step": 146800}, {"epoch": 6.604366969179621, "grad_norm": 2.6482343673706055, "learning_rate": 4.787838813641554e-05, "loss": 0.5519, "step": 147000}, {"epoch": 6.604366969179621, "eval_loss": 2.6106491088867188, "eval_runtime": 1043.6396, 
"eval_samples_per_second": 9.49, "eval_steps_per_second": 0.149, "step": 147000}, {"epoch": 6.613352502471022, "grad_norm": 3.5646355152130127, "learning_rate": 4.787269427901395e-05, "loss": 0.5185, "step": 147200}, {"epoch": 6.622338035762422, "grad_norm": 4.31544303894043, "learning_rate": 4.786699313092177e-05, "loss": 0.5319, "step": 147400}, {"epoch": 6.631323569053824, "grad_norm": 9.14370346069336, "learning_rate": 4.786128469395624e-05, "loss": 0.5371, "step": 147600}, {"epoch": 6.640309102345224, "grad_norm": 8.601165771484375, "learning_rate": 4.785556896993693e-05, "loss": 0.5623, "step": 147800}, {"epoch": 6.649294635636625, "grad_norm": 0.5740114450454712, "learning_rate": 4.7849845960685735e-05, "loss": 0.5514, "step": 148000}, {"epoch": 6.649294635636625, "eval_loss": 2.6822104454040527, "eval_runtime": 1041.3572, "eval_samples_per_second": 9.511, "eval_steps_per_second": 0.149, "step": 148000}, {"epoch": 6.658280168928026, "grad_norm": 4.371459007263184, "learning_rate": 4.7844115668026865e-05, "loss": 0.5426, "step": 148200}, {"epoch": 6.667265702219427, "grad_norm": 8.560872077941895, "learning_rate": 4.783837809378685e-05, "loss": 0.5398, "step": 148400}, {"epoch": 6.676251235510827, "grad_norm": 17.999832153320312, "learning_rate": 4.783263323979456e-05, "loss": 0.5235, "step": 148600}, {"epoch": 6.685236768802229, "grad_norm": 5.890347003936768, "learning_rate": 4.782688110788116e-05, "loss": 0.5353, "step": 148800}, {"epoch": 6.694222302093629, "grad_norm": 11.35936450958252, "learning_rate": 4.782112169988015e-05, "loss": 0.5331, "step": 149000}, {"epoch": 6.694222302093629, "eval_loss": 2.594395637512207, "eval_runtime": 1042.7844, "eval_samples_per_second": 9.498, "eval_steps_per_second": 0.149, "step": 149000}, {"epoch": 6.70320783538503, "grad_norm": 8.832243919372559, "learning_rate": 4.781535501762735e-05, "loss": 0.5508, "step": 149200}, {"epoch": 6.712193368676431, "grad_norm": 5.891073226928711, "learning_rate": 4.780958106296089e-05, "loss": 0.5123, "step": 149400}, {"epoch": 6.721178901967832, "grad_norm": 4.517889976501465, "learning_rate": 4.780379983772124e-05, "loss": 0.5073, "step": 149600}, {"epoch": 6.7301644352592325, "grad_norm": 10.936097145080566, "learning_rate": 4.7798011343751146e-05, "loss": 0.5241, "step": 149800}, {"epoch": 6.739149968550634, "grad_norm": 11.331624031066895, "learning_rate": 4.7792215582895705e-05, "loss": 0.5371, "step": 150000}, {"epoch": 6.739149968550634, "eval_loss": 2.5754590034484863, "eval_runtime": 1074.2776, "eval_samples_per_second": 9.219, "eval_steps_per_second": 0.144, "step": 150000}, {"epoch": 6.748135501842034, "grad_norm": 1.8488596677780151, "learning_rate": 4.778641255700233e-05, "loss": 0.5524, "step": 150200}, {"epoch": 6.757121035133435, "grad_norm": 14.553401947021484, "learning_rate": 4.7780602267920716e-05, "loss": 0.5227, "step": 150400}, {"epoch": 6.766106568424836, "grad_norm": 8.445063591003418, "learning_rate": 4.777478471750292e-05, "loss": 0.5523, "step": 150600}, {"epoch": 6.775092101716237, "grad_norm": 4.426443576812744, "learning_rate": 4.776895990760328e-05, "loss": 0.5313, "step": 150800}, {"epoch": 6.7840776350076375, "grad_norm": 4.786408424377441, "learning_rate": 4.776312784007848e-05, "loss": 0.544, "step": 151000}, {"epoch": 6.7840776350076375, "eval_loss": 2.580105781555176, "eval_runtime": 1072.5697, "eval_samples_per_second": 9.234, "eval_steps_per_second": 0.145, "step": 151000}, {"epoch": 6.793063168299039, "grad_norm": 8.09899616241455, "learning_rate": 
4.775728851678747e-05, "loss": 0.5373, "step": 151200}, {"epoch": 6.802048701590439, "grad_norm": 8.726985931396484, "learning_rate": 4.775144193959155e-05, "loss": 0.5123, "step": 151400}, {"epoch": 6.8110342348818405, "grad_norm": 5.333522319793701, "learning_rate": 4.774558811035431e-05, "loss": 0.5382, "step": 151600}, {"epoch": 6.820019768173241, "grad_norm": 2.5918726921081543, "learning_rate": 4.773972703094168e-05, "loss": 0.5008, "step": 151800}, {"epoch": 6.829005301464642, "grad_norm": 13.181851387023926, "learning_rate": 4.7733858703221876e-05, "loss": 0.535, "step": 152000}, {"epoch": 6.829005301464642, "eval_loss": 2.6217567920684814, "eval_runtime": 1073.9356, "eval_samples_per_second": 9.222, "eval_steps_per_second": 0.144, "step": 152000}, {"epoch": 6.8379908347560425, "grad_norm": 3.6828906536102295, "learning_rate": 4.772798312906545e-05, "loss": 0.5334, "step": 152200}, {"epoch": 6.846976368047444, "grad_norm": 11.301506042480469, "learning_rate": 4.772210031034521e-05, "loss": 0.5278, "step": 152400}, {"epoch": 6.855961901338844, "grad_norm": 2.866434097290039, "learning_rate": 4.771621024893633e-05, "loss": 0.5196, "step": 152600}, {"epoch": 6.8649474346302455, "grad_norm": 2.977900266647339, "learning_rate": 4.7710312946716286e-05, "loss": 0.5131, "step": 152800}, {"epoch": 6.873932967921646, "grad_norm": 4.671950340270996, "learning_rate": 4.770440840556483e-05, "loss": 0.5423, "step": 153000}, {"epoch": 6.873932967921646, "eval_loss": 2.61964750289917, "eval_runtime": 1072.5606, "eval_samples_per_second": 9.234, "eval_steps_per_second": 0.145, "step": 153000}, {"epoch": 6.882918501213047, "grad_norm": 9.421769142150879, "learning_rate": 4.769849662736403e-05, "loss": 0.5413, "step": 153200}, {"epoch": 6.8919040345044476, "grad_norm": 4.872519493103027, "learning_rate": 4.7692577613998295e-05, "loss": 0.5212, "step": 153400}, {"epoch": 6.900889567795849, "grad_norm": 4.424411296844482, "learning_rate": 4.7686651367354304e-05, "loss": 0.5071, "step": 153600}, {"epoch": 6.909875101087249, "grad_norm": 12.917271614074707, "learning_rate": 4.7680717889321046e-05, "loss": 0.5451, "step": 153800}, {"epoch": 6.9188606343786505, "grad_norm": 5.820809841156006, "learning_rate": 4.767477718178983e-05, "loss": 0.5204, "step": 154000}, {"epoch": 6.9188606343786505, "eval_loss": 2.657820463180542, "eval_runtime": 1071.7187, "eval_samples_per_second": 9.241, "eval_steps_per_second": 0.145, "step": 154000}, {"epoch": 6.927846167670051, "grad_norm": 6.326610088348389, "learning_rate": 4.7668829246654266e-05, "loss": 0.5737, "step": 154200}, {"epoch": 6.936831700961452, "grad_norm": 6.599421977996826, "learning_rate": 4.766287408581026e-05, "loss": 0.5191, "step": 154400}, {"epoch": 6.945817234252853, "grad_norm": 1.006998062133789, "learning_rate": 4.7656911701156016e-05, "loss": 0.5727, "step": 154600}, {"epoch": 6.954802767544254, "grad_norm": 10.324342727661133, "learning_rate": 4.7650942094592055e-05, "loss": 0.5666, "step": 154800}, {"epoch": 6.963788300835654, "grad_norm": 4.480410099029541, "learning_rate": 4.76449652680212e-05, "loss": 0.5732, "step": 155000}, {"epoch": 6.963788300835654, "eval_loss": 2.6091678142547607, "eval_runtime": 1071.6772, "eval_samples_per_second": 9.242, "eval_steps_per_second": 0.145, "step": 155000}, {"epoch": 6.9727738341270555, "grad_norm": 6.651985168457031, "learning_rate": 4.7638981223348565e-05, "loss": 0.5241, "step": 155200}, {"epoch": 6.981759367418456, "grad_norm": 5.644140720367432, "learning_rate": 4.7632989962481565e-05, "loss": 
0.5446, "step": 155400}, {"epoch": 6.990744900709857, "grad_norm": 13.221419334411621, "learning_rate": 4.762699148732992e-05, "loss": 0.5281, "step": 155600}, {"epoch": 6.999730434001258, "grad_norm": 9.8410005569458, "learning_rate": 4.762098579980566e-05, "loss": 0.5165, "step": 155800}, {"epoch": 7.008715967292659, "grad_norm": 7.277264595031738, "learning_rate": 4.761497290182309e-05, "loss": 0.4809, "step": 156000}, {"epoch": 7.008715967292659, "eval_loss": 2.6050195693969727, "eval_runtime": 1071.5521, "eval_samples_per_second": 9.243, "eval_steps_per_second": 0.145, "step": 156000}, {"epoch": 7.017701500584059, "grad_norm": 4.4227776527404785, "learning_rate": 4.760895279529883e-05, "loss": 0.5146, "step": 156200}, {"epoch": 7.026687033875461, "grad_norm": 4.779057502746582, "learning_rate": 4.76029254821518e-05, "loss": 0.526, "step": 156400}, {"epoch": 7.035672567166861, "grad_norm": 3.8437089920043945, "learning_rate": 4.7596890964303206e-05, "loss": 0.4857, "step": 156600}, {"epoch": 7.044658100458262, "grad_norm": 5.413717746734619, "learning_rate": 4.759084924367655e-05, "loss": 0.5221, "step": 156800}, {"epoch": 7.053643633749663, "grad_norm": 13.871551513671875, "learning_rate": 4.758480032219765e-05, "loss": 0.511, "step": 157000}, {"epoch": 7.053643633749663, "eval_loss": 2.6103718280792236, "eval_runtime": 1071.8769, "eval_samples_per_second": 9.24, "eval_steps_per_second": 0.145, "step": 157000}, {"epoch": 7.062629167041064, "grad_norm": 10.212960243225098, "learning_rate": 4.757874420179459e-05, "loss": 0.476, "step": 157200}, {"epoch": 7.071614700332464, "grad_norm": 6.196323871612549, "learning_rate": 4.757268088439777e-05, "loss": 0.5268, "step": 157400}, {"epoch": 7.080600233623866, "grad_norm": 7.164575576782227, "learning_rate": 4.756661037193988e-05, "loss": 0.5259, "step": 157600}, {"epoch": 7.089585766915266, "grad_norm": 8.652503967285156, "learning_rate": 4.756053266635591e-05, "loss": 0.4922, "step": 157800}, {"epoch": 7.098571300206667, "grad_norm": 4.017261028289795, "learning_rate": 4.75544477695831e-05, "loss": 0.5387, "step": 158000}, {"epoch": 7.098571300206667, "eval_loss": 2.6241016387939453, "eval_runtime": 1072.8511, "eval_samples_per_second": 9.231, "eval_steps_per_second": 0.144, "step": 158000}, {"epoch": 7.107556833498068, "grad_norm": 4.347532272338867, "learning_rate": 4.7548355683561054e-05, "loss": 0.5527, "step": 158200}, {"epoch": 7.116542366789469, "grad_norm": 1.523980736732483, "learning_rate": 4.754225641023161e-05, "loss": 0.5129, "step": 158400}, {"epoch": 7.12552790008087, "grad_norm": 12.395309448242188, "learning_rate": 4.753614995153892e-05, "loss": 0.5365, "step": 158600}, {"epoch": 7.134513433372271, "grad_norm": 13.86411190032959, "learning_rate": 4.753003630942942e-05, "loss": 0.5008, "step": 158800}, {"epoch": 7.143498966663672, "grad_norm": 2.280458450317383, "learning_rate": 4.7523915485851846e-05, "loss": 0.4832, "step": 159000}, {"epoch": 7.143498966663672, "eval_loss": 2.6097371578216553, "eval_runtime": 1072.2002, "eval_samples_per_second": 9.237, "eval_steps_per_second": 0.145, "step": 159000}, {"epoch": 7.152484499955072, "grad_norm": 4.316972255706787, "learning_rate": 4.751778748275721e-05, "loss": 0.5307, "step": 159200}, {"epoch": 7.161470033246474, "grad_norm": 5.86752462387085, "learning_rate": 4.751165230209882e-05, "loss": 0.5302, "step": 159400}, {"epoch": 7.170455566537874, "grad_norm": 14.792780876159668, "learning_rate": 4.750550994583227e-05, "loss": 0.5341, "step": 159600}, {"epoch": 
7.179441099829275, "grad_norm": 9.056463241577148, "learning_rate": 4.749936041591544e-05, "loss": 0.5453, "step": 159800}, {"epoch": 7.188426633120676, "grad_norm": 6.764106750488281, "learning_rate": 4.74932037143085e-05, "loss": 0.4882, "step": 160000}, {"epoch": 7.188426633120676, "eval_loss": 2.592075824737549, "eval_runtime": 1072.2539, "eval_samples_per_second": 9.237, "eval_steps_per_second": 0.145, "step": 160000}, {"epoch": 7.197412166412077, "grad_norm": 10.36343765258789, "learning_rate": 4.74870398429739e-05, "loss": 0.5078, "step": 160200}, {"epoch": 7.206397699703477, "grad_norm": 3.3423054218292236, "learning_rate": 4.748086880387638e-05, "loss": 0.5265, "step": 160400}, {"epoch": 7.215383232994879, "grad_norm": 7.084263801574707, "learning_rate": 4.7474690598982975e-05, "loss": 0.5367, "step": 160600}, {"epoch": 7.224368766286279, "grad_norm": 7.648595333099365, "learning_rate": 4.7468505230262974e-05, "loss": 0.5392, "step": 160800}, {"epoch": 7.23335429957768, "grad_norm": 1.4495679140090942, "learning_rate": 4.746231269968798e-05, "loss": 0.5099, "step": 161000}, {"epoch": 7.23335429957768, "eval_loss": 2.630073070526123, "eval_runtime": 1049.8697, "eval_samples_per_second": 9.434, "eval_steps_per_second": 0.148, "step": 161000}, {"epoch": 7.242339832869081, "grad_norm": 2.1218910217285156, "learning_rate": 4.745611300923187e-05, "loss": 0.5101, "step": 161200}, {"epoch": 7.251325366160482, "grad_norm": 27.048370361328125, "learning_rate": 4.744990616087079e-05, "loss": 0.5328, "step": 161400}, {"epoch": 7.260310899451882, "grad_norm": 9.959211349487305, "learning_rate": 4.7443692156583194e-05, "loss": 0.5176, "step": 161600}, {"epoch": 7.269296432743284, "grad_norm": 8.372459411621094, "learning_rate": 4.7437470998349785e-05, "loss": 0.5379, "step": 161800}, {"epoch": 7.278281966034684, "grad_norm": 12.155389785766602, "learning_rate": 4.7431242688153564e-05, "loss": 0.5518, "step": 162000}, {"epoch": 7.278281966034684, "eval_loss": 2.5808417797088623, "eval_runtime": 1051.3983, "eval_samples_per_second": 9.42, "eval_steps_per_second": 0.147, "step": 162000}, {"epoch": 7.287267499326085, "grad_norm": 12.06241226196289, "learning_rate": 4.7425007227979826e-05, "loss": 0.5364, "step": 162200}, {"epoch": 7.296253032617486, "grad_norm": 7.406551837921143, "learning_rate": 4.741876461981611e-05, "loss": 0.4916, "step": 162400}, {"epoch": 7.305238565908887, "grad_norm": 4.847611904144287, "learning_rate": 4.741251486565226e-05, "loss": 0.4856, "step": 162600}, {"epoch": 7.314224099200287, "grad_norm": 4.857258319854736, "learning_rate": 4.740625796748039e-05, "loss": 0.5113, "step": 162800}, {"epoch": 7.323209632491689, "grad_norm": 3.5690536499023438, "learning_rate": 4.7399993927294904e-05, "loss": 0.5447, "step": 163000}, {"epoch": 7.323209632491689, "eval_loss": 2.5550215244293213, "eval_runtime": 1050.4921, "eval_samples_per_second": 9.428, "eval_steps_per_second": 0.148, "step": 163000}, {"epoch": 7.332195165783089, "grad_norm": 2.832630157470703, "learning_rate": 4.739372274709245e-05, "loss": 0.5102, "step": 163200}, {"epoch": 7.34118069907449, "grad_norm": 6.479580879211426, "learning_rate": 4.7387444428871985e-05, "loss": 0.49, "step": 163400}, {"epoch": 7.350166232365891, "grad_norm": 5.155001640319824, "learning_rate": 4.738115897463472e-05, "loss": 0.5256, "step": 163600}, {"epoch": 7.359151765657292, "grad_norm": 10.935525894165039, "learning_rate": 4.7374866386384155e-05, "loss": 0.5168, "step": 163800}, {"epoch": 7.368137298948692, "grad_norm": 
3.9100871086120605, "learning_rate": 4.736856666612605e-05, "loss": 0.5287, "step": 164000}, {"epoch": 7.368137298948692, "eval_loss": 2.5780515670776367, "eval_runtime": 1051.1987, "eval_samples_per_second": 9.422, "eval_steps_per_second": 0.147, "step": 164000}, {"epoch": 7.377122832240094, "grad_norm": 16.054746627807617, "learning_rate": 4.736225981586846e-05, "loss": 0.5182, "step": 164200}, {"epoch": 7.386108365531494, "grad_norm": 8.413787841796875, "learning_rate": 4.735594583762169e-05, "loss": 0.5142, "step": 164400}, {"epoch": 7.395093898822895, "grad_norm": 10.230764389038086, "learning_rate": 4.7349624733398324e-05, "loss": 0.532, "step": 164600}, {"epoch": 7.404079432114296, "grad_norm": 6.237130641937256, "learning_rate": 4.734329650521322e-05, "loss": 0.5217, "step": 164800}, {"epoch": 7.413064965405697, "grad_norm": 12.266544342041016, "learning_rate": 4.733696115508351e-05, "loss": 0.5514, "step": 165000}, {"epoch": 7.413064965405697, "eval_loss": 2.5827889442443848, "eval_runtime": 1050.6343, "eval_samples_per_second": 9.427, "eval_steps_per_second": 0.148, "step": 165000}, {"epoch": 7.422050498697097, "grad_norm": 8.876433372497559, "learning_rate": 4.7330618685028585e-05, "loss": 0.5055, "step": 165200}, {"epoch": 7.431036031988499, "grad_norm": 4.292701244354248, "learning_rate": 4.732426909707013e-05, "loss": 0.5443, "step": 165400}, {"epoch": 7.440021565279899, "grad_norm": 11.186918258666992, "learning_rate": 4.731791239323205e-05, "loss": 0.5327, "step": 165600}, {"epoch": 7.4490070985713, "grad_norm": 2.4021294116973877, "learning_rate": 4.7311548575540586e-05, "loss": 0.5159, "step": 165800}, {"epoch": 7.457992631862701, "grad_norm": 13.129263877868652, "learning_rate": 4.730517764602419e-05, "loss": 0.5135, "step": 166000}, {"epoch": 7.457992631862701, "eval_loss": 2.5977518558502197, "eval_runtime": 1050.7073, "eval_samples_per_second": 9.426, "eval_steps_per_second": 0.148, "step": 166000}, {"epoch": 7.466978165154102, "grad_norm": 1.4429153203964233, "learning_rate": 4.7298799606713606e-05, "loss": 0.5522, "step": 166200}, {"epoch": 7.4759636984455025, "grad_norm": 8.0523042678833, "learning_rate": 4.729241445964183e-05, "loss": 0.5187, "step": 166400}, {"epoch": 7.484949231736904, "grad_norm": 8.555193901062012, "learning_rate": 4.728602220684415e-05, "loss": 0.5157, "step": 166600}, {"epoch": 7.493934765028304, "grad_norm": 4.992981910705566, "learning_rate": 4.727962285035809e-05, "loss": 0.5323, "step": 166800}, {"epoch": 7.502920298319705, "grad_norm": 8.440316200256348, "learning_rate": 4.727321639222345e-05, "loss": 0.5371, "step": 167000}, {"epoch": 7.502920298319705, "eval_loss": 2.536879062652588, "eval_runtime": 1050.1243, "eval_samples_per_second": 9.431, "eval_steps_per_second": 0.148, "step": 167000}, {"epoch": 7.511905831611106, "grad_norm": 14.163543701171875, "learning_rate": 4.7266802834482296e-05, "loss": 0.5096, "step": 167200}, {"epoch": 7.520891364902507, "grad_norm": 2.259485960006714, "learning_rate": 4.726038217917896e-05, "loss": 0.5099, "step": 167400}, {"epoch": 7.5298768981939075, "grad_norm": 10.735986709594727, "learning_rate": 4.7253954428360024e-05, "loss": 0.5192, "step": 167600}, {"epoch": 7.538862431485309, "grad_norm": 3.719405174255371, "learning_rate": 4.7247519584074343e-05, "loss": 0.5043, "step": 167800}, {"epoch": 7.547847964776709, "grad_norm": 2.679960012435913, "learning_rate": 4.724107764837303e-05, "loss": 0.5153, "step": 168000}, {"epoch": 7.547847964776709, "eval_loss": 2.623818874359131, "eval_runtime": 
1050.9471, "eval_samples_per_second": 9.424, "eval_steps_per_second": 0.147, "step": 168000}, {"epoch": 7.55683349806811, "grad_norm": 18.183778762817383, "learning_rate": 4.723462862330945e-05, "loss": 0.5054, "step": 168200}, {"epoch": 7.565819031359511, "grad_norm": 1.4932595491409302, "learning_rate": 4.722817251093925e-05, "loss": 0.5461, "step": 168400}, {"epoch": 7.574804564650912, "grad_norm": 10.546357154846191, "learning_rate": 4.722170931332031e-05, "loss": 0.544, "step": 168600}, {"epoch": 7.5837900979423125, "grad_norm": 1.394518256187439, "learning_rate": 4.721523903251278e-05, "loss": 0.4983, "step": 168800}, {"epoch": 7.592775631233714, "grad_norm": 6.905360698699951, "learning_rate": 4.720876167057907e-05, "loss": 0.5109, "step": 169000}, {"epoch": 7.592775631233714, "eval_loss": 2.588412284851074, "eval_runtime": 1050.4908, "eval_samples_per_second": 9.428, "eval_steps_per_second": 0.148, "step": 169000}, {"epoch": 7.601761164525114, "grad_norm": 19.295528411865234, "learning_rate": 4.7202277229583846e-05, "loss": 0.5174, "step": 169200}, {"epoch": 7.6107466978165155, "grad_norm": 22.249040603637695, "learning_rate": 4.719578571159402e-05, "loss": 0.5101, "step": 169400}, {"epoch": 7.619732231107916, "grad_norm": 7.415430068969727, "learning_rate": 4.718928711867878e-05, "loss": 0.4998, "step": 169600}, {"epoch": 7.628717764399317, "grad_norm": 2.853653907775879, "learning_rate": 4.718278145290955e-05, "loss": 0.5099, "step": 169800}, {"epoch": 7.637703297690718, "grad_norm": 4.130895137786865, "learning_rate": 4.7176268716360026e-05, "loss": 0.4822, "step": 170000}, {"epoch": 7.637703297690718, "eval_loss": 2.6600334644317627, "eval_runtime": 1049.8197, "eval_samples_per_second": 9.434, "eval_steps_per_second": 0.148, "step": 170000}, {"epoch": 7.646688830982119, "grad_norm": 2.998149871826172, "learning_rate": 4.7169748911106146e-05, "loss": 0.514, "step": 170200}, {"epoch": 7.655674364273519, "grad_norm": 2.742155075073242, "learning_rate": 4.71632220392261e-05, "loss": 0.5168, "step": 170400}, {"epoch": 7.6646598975649205, "grad_norm": 1.7436096668243408, "learning_rate": 4.7156688102800326e-05, "loss": 0.5029, "step": 170600}, {"epoch": 7.673645430856322, "grad_norm": 4.7532806396484375, "learning_rate": 4.715014710391153e-05, "loss": 0.5279, "step": 170800}, {"epoch": 7.682630964147722, "grad_norm": 8.532057762145996, "learning_rate": 4.714359904464466e-05, "loss": 0.5241, "step": 171000}, {"epoch": 7.682630964147722, "eval_loss": 2.546463966369629, "eval_runtime": 1051.0534, "eval_samples_per_second": 9.423, "eval_steps_per_second": 0.147, "step": 171000}, {"epoch": 7.691616497439123, "grad_norm": 5.461520195007324, "learning_rate": 4.713704392708692e-05, "loss": 0.5415, "step": 171200}, {"epoch": 7.700602030730524, "grad_norm": 5.665705680847168, "learning_rate": 4.713048175332775e-05, "loss": 0.5263, "step": 171400}, {"epoch": 7.709587564021925, "grad_norm": 8.942784309387207, "learning_rate": 4.7123912525458865e-05, "loss": 0.5518, "step": 171600}, {"epoch": 7.7185730973133255, "grad_norm": 9.14636516571045, "learning_rate": 4.7117336245574186e-05, "loss": 0.5277, "step": 171800}, {"epoch": 7.727558630604726, "grad_norm": 4.771318435668945, "learning_rate": 4.7110752915769934e-05, "loss": 0.4941, "step": 172000}, {"epoch": 7.727558630604726, "eval_loss": 2.600043296813965, "eval_runtime": 1049.7614, "eval_samples_per_second": 9.435, "eval_steps_per_second": 0.148, "step": 172000}, {"epoch": 7.736544163896127, "grad_norm": 4.336336135864258, "learning_rate": 
4.710416253814454e-05, "loss": 0.5547, "step": 172200}, {"epoch": 7.7455296971875285, "grad_norm": 13.351747512817383, "learning_rate": 4.709756511479868e-05, "loss": 0.4655, "step": 172400}, {"epoch": 7.754515230478929, "grad_norm": 14.320053100585938, "learning_rate": 4.7090960647835305e-05, "loss": 0.5079, "step": 172600}, {"epoch": 7.763500763770329, "grad_norm": 9.463343620300293, "learning_rate": 4.708434913935959e-05, "loss": 0.5139, "step": 172800}, {"epoch": 7.7724862970617306, "grad_norm": 6.440632343292236, "learning_rate": 4.707773059147896e-05, "loss": 0.5042, "step": 173000}, {"epoch": 7.7724862970617306, "eval_loss": 2.626408576965332, "eval_runtime": 1128.6913, "eval_samples_per_second": 8.775, "eval_steps_per_second": 0.137, "step": 173000}, {"epoch": 7.781471830353132, "grad_norm": 7.2138261795043945, "learning_rate": 4.707110500630308e-05, "loss": 0.5522, "step": 173200}, {"epoch": 7.790457363644532, "grad_norm": 7.865017890930176, "learning_rate": 4.706447238594386e-05, "loss": 0.5161, "step": 173400}, {"epoch": 7.7994428969359335, "grad_norm": 18.77448844909668, "learning_rate": 4.7057832732515464e-05, "loss": 0.5437, "step": 173600}, {"epoch": 7.808428430227334, "grad_norm": 2.390789031982422, "learning_rate": 4.705118604813426e-05, "loss": 0.5101, "step": 173800}, {"epoch": 7.817413963518735, "grad_norm": 9.706137657165527, "learning_rate": 4.7044532334918915e-05, "loss": 0.5106, "step": 174000}, {"epoch": 7.817413963518735, "eval_loss": 2.6232926845550537, "eval_runtime": 1128.6235, "eval_samples_per_second": 8.775, "eval_steps_per_second": 0.137, "step": 174000}, {"epoch": 7.826399496810136, "grad_norm": 1.1721101999282837, "learning_rate": 4.70378715949903e-05, "loss": 0.5015, "step": 174200}, {"epoch": 7.835385030101537, "grad_norm": 15.840973854064941, "learning_rate": 4.703120383047151e-05, "loss": 0.4983, "step": 174400}, {"epoch": 7.844370563392937, "grad_norm": 11.476134300231934, "learning_rate": 4.702452904348792e-05, "loss": 0.5375, "step": 174600}, {"epoch": 7.8533560966843385, "grad_norm": 1.3802037239074707, "learning_rate": 4.701784723616712e-05, "loss": 0.5123, "step": 174800}, {"epoch": 7.862341629975739, "grad_norm": 8.808523178100586, "learning_rate": 4.7011158410638944e-05, "loss": 0.5052, "step": 175000}, {"epoch": 7.862341629975739, "eval_loss": 2.5762908458709717, "eval_runtime": 1129.5094, "eval_samples_per_second": 8.768, "eval_steps_per_second": 0.137, "step": 175000}, {"epoch": 7.87132716326714, "grad_norm": 3.9836955070495605, "learning_rate": 4.7004462569035456e-05, "loss": 0.521, "step": 175200}, {"epoch": 7.880312696558541, "grad_norm": 3.1506991386413574, "learning_rate": 4.6997759713490966e-05, "loss": 0.5264, "step": 175400}, {"epoch": 7.889298229849942, "grad_norm": 6.831039905548096, "learning_rate": 4.6991049846142e-05, "loss": 0.5244, "step": 175600}, {"epoch": 7.898283763141342, "grad_norm": 3.348510503768921, "learning_rate": 4.698433296912736e-05, "loss": 0.4787, "step": 175800}, {"epoch": 7.907269296432744, "grad_norm": 3.6049258708953857, "learning_rate": 4.697760908458804e-05, "loss": 0.5266, "step": 176000}, {"epoch": 7.907269296432744, "eval_loss": 2.573176622390747, "eval_runtime": 1129.0656, "eval_samples_per_second": 8.772, "eval_steps_per_second": 0.137, "step": 176000}, {"epoch": 7.916254829724144, "grad_norm": 13.29443073272705, "learning_rate": 4.697087819466728e-05, "loss": 0.4962, "step": 176200}, {"epoch": 7.925240363015545, "grad_norm": 7.278706073760986, "learning_rate": 4.696414030151056e-05, "loss": 0.5111, 
"step": 176400}, {"epoch": 7.934225896306946, "grad_norm": 5.561307907104492, "learning_rate": 4.695739540726559e-05, "loss": 0.5019, "step": 176600}, {"epoch": 7.943211429598347, "grad_norm": 7.39556884765625, "learning_rate": 4.695064351408232e-05, "loss": 0.5252, "step": 176800}, {"epoch": 7.952196962889747, "grad_norm": 8.245197296142578, "learning_rate": 4.694388462411291e-05, "loss": 0.5361, "step": 177000}, {"epoch": 7.952196962889747, "eval_loss": 2.5876715183258057, "eval_runtime": 1129.3784, "eval_samples_per_second": 8.769, "eval_steps_per_second": 0.137, "step": 177000}, {"epoch": 7.961182496181149, "grad_norm": 4.86469841003418, "learning_rate": 4.693711873951177e-05, "loss": 0.4771, "step": 177200}, {"epoch": 7.970168029472549, "grad_norm": 13.049267768859863, "learning_rate": 4.6930345862435527e-05, "loss": 0.5369, "step": 177400}, {"epoch": 7.97915356276395, "grad_norm": 6.7220258712768555, "learning_rate": 4.692356599504304e-05, "loss": 0.529, "step": 177600}, {"epoch": 7.988139096055351, "grad_norm": 10.31705379486084, "learning_rate": 4.69167791394954e-05, "loss": 0.5603, "step": 177800}, {"epoch": 7.997124629346752, "grad_norm": 6.541712760925293, "learning_rate": 4.690998529795592e-05, "loss": 0.5193, "step": 178000}, {"epoch": 7.997124629346752, "eval_loss": 2.6211884021759033, "eval_runtime": 1127.8197, "eval_samples_per_second": 8.782, "eval_steps_per_second": 0.137, "step": 178000}, {"epoch": 8.006110162638153, "grad_norm": 7.912782192230225, "learning_rate": 4.6903184472590145e-05, "loss": 0.5203, "step": 178200}, {"epoch": 8.015095695929553, "grad_norm": 4.079019546508789, "learning_rate": 4.6896376665565843e-05, "loss": 0.4817, "step": 178400}, {"epoch": 8.024081229220954, "grad_norm": 3.5934817790985107, "learning_rate": 4.6889561879053014e-05, "loss": 0.4757, "step": 178600}, {"epoch": 8.033066762512355, "grad_norm": 5.87857723236084, "learning_rate": 4.6882740115223864e-05, "loss": 0.5184, "step": 178800}, {"epoch": 8.042052295803757, "grad_norm": 10.092915534973145, "learning_rate": 4.687591137625285e-05, "loss": 0.475, "step": 179000}, {"epoch": 8.042052295803757, "eval_loss": 2.614030599594116, "eval_runtime": 1129.8602, "eval_samples_per_second": 8.766, "eval_steps_per_second": 0.137, "step": 179000}, {"epoch": 8.051037829095156, "grad_norm": 5.135852813720703, "learning_rate": 4.686907566431663e-05, "loss": 0.5036, "step": 179200}, {"epoch": 8.060023362386557, "grad_norm": 8.39755630493164, "learning_rate": 4.686223298159409e-05, "loss": 0.4812, "step": 179400}, {"epoch": 8.069008895677959, "grad_norm": 9.086663246154785, "learning_rate": 4.685538333026636e-05, "loss": 0.494, "step": 179600}, {"epoch": 8.07799442896936, "grad_norm": 4.75005578994751, "learning_rate": 4.6848526712516744e-05, "loss": 0.514, "step": 179800}, {"epoch": 8.08697996226076, "grad_norm": 5.841987133026123, "learning_rate": 4.684166313053081e-05, "loss": 0.5183, "step": 180000}, {"epoch": 8.08697996226076, "eval_loss": 2.6352553367614746, "eval_runtime": 1129.1046, "eval_samples_per_second": 8.772, "eval_steps_per_second": 0.137, "step": 180000}, {"epoch": 8.09596549555216, "grad_norm": 6.5779852867126465, "learning_rate": 4.683479258649633e-05, "loss": 0.515, "step": 180200}, {"epoch": 8.104951028843562, "grad_norm": 10.88022232055664, "learning_rate": 4.6827915082603304e-05, "loss": 0.4703, "step": 180400}, {"epoch": 8.113936562134963, "grad_norm": 4.6330366134643555, "learning_rate": 4.6821030621043927e-05, "loss": 0.5193, "step": 180600}, {"epoch": 8.122922095426363, 
"grad_norm": 6.782657146453857, "learning_rate": 4.681413920401263e-05, "loss": 0.4852, "step": 180800}, {"epoch": 8.131907628717764, "grad_norm": 15.633230209350586, "learning_rate": 4.680724083370606e-05, "loss": 0.5076, "step": 181000}, {"epoch": 8.131907628717764, "eval_loss": 2.5747714042663574, "eval_runtime": 1129.3837, "eval_samples_per_second": 8.769, "eval_steps_per_second": 0.137, "step": 181000}, {"epoch": 8.140893162009165, "grad_norm": 13.606180191040039, "learning_rate": 4.680033551232308e-05, "loss": 0.4894, "step": 181200}, {"epoch": 8.149878695300567, "grad_norm": 6.643714904785156, "learning_rate": 4.679342324206478e-05, "loss": 0.5166, "step": 181400}, {"epoch": 8.158864228591966, "grad_norm": 30.02402687072754, "learning_rate": 4.678650402513442e-05, "loss": 0.5312, "step": 181600}, {"epoch": 8.167849761883367, "grad_norm": 3.5424320697784424, "learning_rate": 4.6779577863737534e-05, "loss": 0.485, "step": 181800}, {"epoch": 8.176835295174769, "grad_norm": 3.954418897628784, "learning_rate": 4.677264476008183e-05, "loss": 0.4791, "step": 182000}, {"epoch": 8.176835295174769, "eval_loss": 2.621889114379883, "eval_runtime": 1127.9131, "eval_samples_per_second": 8.781, "eval_steps_per_second": 0.137, "step": 182000}, {"epoch": 8.18582082846617, "grad_norm": 8.198515892028809, "learning_rate": 4.6765704716377244e-05, "loss": 0.5274, "step": 182200}, {"epoch": 8.19480636175757, "grad_norm": 7.865370750427246, "learning_rate": 4.6758757734835925e-05, "loss": 0.478, "step": 182400}, {"epoch": 8.20379189504897, "grad_norm": 22.58502769470215, "learning_rate": 4.6751803817672214e-05, "loss": 0.4986, "step": 182600}, {"epoch": 8.212777428340372, "grad_norm": 1.826743245124817, "learning_rate": 4.6744842967102695e-05, "loss": 0.526, "step": 182800}, {"epoch": 8.221762961631773, "grad_norm": 1.7866239547729492, "learning_rate": 4.6737875185346134e-05, "loss": 0.4812, "step": 183000}, {"epoch": 8.221762961631773, "eval_loss": 2.6146905422210693, "eval_runtime": 1120.9053, "eval_samples_per_second": 8.836, "eval_steps_per_second": 0.138, "step": 183000}, {"epoch": 8.230748494923173, "grad_norm": 17.78580093383789, "learning_rate": 4.6730900474623525e-05, "loss": 0.4622, "step": 183200}, {"epoch": 8.239734028214574, "grad_norm": 2.1143832206726074, "learning_rate": 4.672391883715805e-05, "loss": 0.5061, "step": 183400}, {"epoch": 8.248719561505975, "grad_norm": 5.723171710968018, "learning_rate": 4.671693027517513e-05, "loss": 0.4791, "step": 183600}, {"epoch": 8.257705094797377, "grad_norm": 8.541521072387695, "learning_rate": 4.670993479090237e-05, "loss": 0.4839, "step": 183800}, {"epoch": 8.266690628088778, "grad_norm": 4.935067653656006, "learning_rate": 4.670293238656958e-05, "loss": 0.4801, "step": 184000}, {"epoch": 8.266690628088778, "eval_loss": 2.671586751937866, "eval_runtime": 1093.7184, "eval_samples_per_second": 9.055, "eval_steps_per_second": 0.142, "step": 184000}, {"epoch": 8.275676161380177, "grad_norm": 10.030083656311035, "learning_rate": 4.6695923064408776e-05, "loss": 0.5172, "step": 184200}, {"epoch": 8.284661694671579, "grad_norm": 5.141510486602783, "learning_rate": 4.66889068266542e-05, "loss": 0.5185, "step": 184400}, {"epoch": 8.29364722796298, "grad_norm": 1.1735432147979736, "learning_rate": 4.668188367554228e-05, "loss": 0.463, "step": 184600}, {"epoch": 8.30263276125438, "grad_norm": 12.648009300231934, "learning_rate": 4.667485361331165e-05, "loss": 0.5135, "step": 184800}, {"epoch": 8.31161829454578, "grad_norm": 10.014856338500977, "learning_rate": 
4.6667816642203146e-05, "loss": 0.4898, "step": 185000}, {"epoch": 8.31161829454578, "eval_loss": 2.5692856311798096, "eval_runtime": 1092.5868, "eval_samples_per_second": 9.065, "eval_steps_per_second": 0.142, "step": 185000}, {"epoch": 8.320603827837182, "grad_norm": 0.6926993131637573, "learning_rate": 4.66607727644598e-05, "loss": 0.5116, "step": 185200}, {"epoch": 8.329589361128583, "grad_norm": 8.623538970947266, "learning_rate": 4.665372198232688e-05, "loss": 0.5403, "step": 185400}, {"epoch": 8.338574894419985, "grad_norm": 10.916993141174316, "learning_rate": 4.664666429805181e-05, "loss": 0.4905, "step": 185600}, {"epoch": 8.347560427711384, "grad_norm": 13.056023597717285, "learning_rate": 4.663959971388423e-05, "loss": 0.523, "step": 185800}, {"epoch": 8.356545961002785, "grad_norm": 9.11626148223877, "learning_rate": 4.663252823207599e-05, "loss": 0.5183, "step": 186000}, {"epoch": 8.356545961002785, "eval_loss": 2.5466091632843018, "eval_runtime": 1090.823, "eval_samples_per_second": 9.079, "eval_steps_per_second": 0.142, "step": 186000}, {"epoch": 8.365531494294187, "grad_norm": 4.152465343475342, "learning_rate": 4.6625449854881124e-05, "loss": 0.4888, "step": 186200}, {"epoch": 8.374517027585588, "grad_norm": 3.7355167865753174, "learning_rate": 4.661836458455588e-05, "loss": 0.5065, "step": 186400}, {"epoch": 8.383502560876988, "grad_norm": 4.155386447906494, "learning_rate": 4.661127242335869e-05, "loss": 0.5209, "step": 186600}, {"epoch": 8.392488094168389, "grad_norm": 16.843454360961914, "learning_rate": 4.660417337355018e-05, "loss": 0.4961, "step": 186800}, {"epoch": 8.40147362745979, "grad_norm": 8.681642532348633, "learning_rate": 4.659706743739319e-05, "loss": 0.5324, "step": 187000}, {"epoch": 8.40147362745979, "eval_loss": 2.5965471267700195, "eval_runtime": 1091.868, "eval_samples_per_second": 9.071, "eval_steps_per_second": 0.142, "step": 187000}, {"epoch": 8.410459160751191, "grad_norm": 16.07400131225586, "learning_rate": 4.658995461715273e-05, "loss": 0.4946, "step": 187200}, {"epoch": 8.41944469404259, "grad_norm": 3.314675807952881, "learning_rate": 4.658283491509603e-05, "loss": 0.4955, "step": 187400}, {"epoch": 8.428430227333992, "grad_norm": 8.137290000915527, "learning_rate": 4.6575708333492495e-05, "loss": 0.5202, "step": 187600}, {"epoch": 8.437415760625393, "grad_norm": 3.797729730606079, "learning_rate": 4.6568574874613725e-05, "loss": 0.542, "step": 187800}, {"epoch": 8.446401293916795, "grad_norm": 10.251813888549805, "learning_rate": 4.6561434540733525e-05, "loss": 0.4847, "step": 188000}, {"epoch": 8.446401293916795, "eval_loss": 2.5656449794769287, "eval_runtime": 1090.1823, "eval_samples_per_second": 9.085, "eval_steps_per_second": 0.142, "step": 188000}, {"epoch": 8.455386827208194, "grad_norm": 8.841021537780762, "learning_rate": 4.6554287334127874e-05, "loss": 0.4929, "step": 188200}, {"epoch": 8.464372360499596, "grad_norm": 3.129969596862793, "learning_rate": 4.654713325707496e-05, "loss": 0.5191, "step": 188400}, {"epoch": 8.473357893790997, "grad_norm": 4.764856815338135, "learning_rate": 4.653997231185514e-05, "loss": 0.4668, "step": 188600}, {"epoch": 8.482343427082398, "grad_norm": 2.219456195831299, "learning_rate": 4.653280450075097e-05, "loss": 0.4939, "step": 188800}, {"epoch": 8.491328960373798, "grad_norm": 15.745511054992676, "learning_rate": 4.652562982604721e-05, "loss": 0.5246, "step": 189000}, {"epoch": 8.491328960373798, "eval_loss": 2.595158576965332, "eval_runtime": 1091.4106, "eval_samples_per_second": 9.074, 
"eval_steps_per_second": 0.142, "step": 189000}, {"epoch": 8.500314493665199, "grad_norm": 28.447345733642578, "learning_rate": 4.651844829003078e-05, "loss": 0.5212, "step": 189200}, {"epoch": 8.5093000269566, "grad_norm": 5.278013229370117, "learning_rate": 4.651125989499081e-05, "loss": 0.5092, "step": 189400}, {"epoch": 8.518285560248001, "grad_norm": 7.048742294311523, "learning_rate": 4.65040646432186e-05, "loss": 0.484, "step": 189600}, {"epoch": 8.527271093539401, "grad_norm": 1.3166794776916504, "learning_rate": 4.6496862537007655e-05, "loss": 0.4682, "step": 189800}, {"epoch": 8.536256626830802, "grad_norm": 2.944568634033203, "learning_rate": 4.6489653578653636e-05, "loss": 0.4905, "step": 190000}, {"epoch": 8.536256626830802, "eval_loss": 2.6485064029693604, "eval_runtime": 1090.2995, "eval_samples_per_second": 9.084, "eval_steps_per_second": 0.142, "step": 190000}, {"epoch": 8.545242160122204, "grad_norm": 12.636077880859375, "learning_rate": 4.6482437770454415e-05, "loss": 0.4857, "step": 190200}, {"epoch": 8.554227693413605, "grad_norm": 8.520101547241211, "learning_rate": 4.647521511471003e-05, "loss": 0.529, "step": 190400}, {"epoch": 8.563213226705004, "grad_norm": 3.0266263484954834, "learning_rate": 4.646798561372272e-05, "loss": 0.5178, "step": 190600}, {"epoch": 8.572198759996406, "grad_norm": 6.245327949523926, "learning_rate": 4.6460749269796875e-05, "loss": 0.49, "step": 190800}, {"epoch": 8.581184293287807, "grad_norm": 11.986411094665527, "learning_rate": 4.645350608523911e-05, "loss": 0.4862, "step": 191000}, {"epoch": 8.581184293287807, "eval_loss": 2.6468417644500732, "eval_runtime": 1089.985, "eval_samples_per_second": 9.086, "eval_steps_per_second": 0.142, "step": 191000}, {"epoch": 8.590169826579208, "grad_norm": 33.56387710571289, "learning_rate": 4.6446256062358175e-05, "loss": 0.477, "step": 191200}, {"epoch": 8.599155359870608, "grad_norm": 6.720004558563232, "learning_rate": 4.6438999203465036e-05, "loss": 0.5533, "step": 191400}, {"epoch": 8.608140893162009, "grad_norm": 5.972818374633789, "learning_rate": 4.643173551087281e-05, "loss": 0.4685, "step": 191600}, {"epoch": 8.61712642645341, "grad_norm": 4.098087787628174, "learning_rate": 4.6424464986896814e-05, "loss": 0.5085, "step": 191800}, {"epoch": 8.626111959744811, "grad_norm": 9.735739707946777, "learning_rate": 4.641718763385454e-05, "loss": 0.5209, "step": 192000}, {"epoch": 8.626111959744811, "eval_loss": 2.538106679916382, "eval_runtime": 1089.8225, "eval_samples_per_second": 9.088, "eval_steps_per_second": 0.142, "step": 192000}, {"epoch": 8.635097493036211, "grad_norm": 17.28936004638672, "learning_rate": 4.640990345406563e-05, "loss": 0.4939, "step": 192200}, {"epoch": 8.644083026327612, "grad_norm": 5.040442943572998, "learning_rate": 4.640261244985194e-05, "loss": 0.5788, "step": 192400}, {"epoch": 8.653068559619014, "grad_norm": 5.635134220123291, "learning_rate": 4.639531462353748e-05, "loss": 0.5067, "step": 192600}, {"epoch": 8.662054092910415, "grad_norm": 9.026660919189453, "learning_rate": 4.638800997744843e-05, "loss": 0.5487, "step": 192800}, {"epoch": 8.671039626201814, "grad_norm": 14.188516616821289, "learning_rate": 4.6380698513913154e-05, "loss": 0.5135, "step": 193000}, {"epoch": 8.671039626201814, "eval_loss": 2.6619675159454346, "eval_runtime": 1089.9555, "eval_samples_per_second": 9.087, "eval_steps_per_second": 0.142, "step": 193000}, {"epoch": 8.680025159493216, "grad_norm": 3.390214204788208, "learning_rate": 4.6373380235262206e-05, "loss": 0.494, "step": 193200}, 
{"epoch": 8.689010692784617, "grad_norm": 6.442393779754639, "learning_rate": 4.636605514382827e-05, "loss": 0.476, "step": 193400}, {"epoch": 8.697996226076018, "grad_norm": 2.047686815261841, "learning_rate": 4.635872324194624e-05, "loss": 0.4956, "step": 193600}, {"epoch": 8.706981759367418, "grad_norm": 14.76450252532959, "learning_rate": 4.635138453195316e-05, "loss": 0.508, "step": 193800}, {"epoch": 8.715967292658819, "grad_norm": 12.547980308532715, "learning_rate": 4.634403901618824e-05, "loss": 0.493, "step": 194000}, {"epoch": 8.715967292658819, "eval_loss": 2.619582414627075, "eval_runtime": 1090.1869, "eval_samples_per_second": 9.085, "eval_steps_per_second": 0.142, "step": 194000}, {"epoch": 8.72495282595022, "grad_norm": 7.085901260375977, "learning_rate": 4.633668669699289e-05, "loss": 0.5181, "step": 194200}, {"epoch": 8.733938359241622, "grad_norm": 2.719491958618164, "learning_rate": 4.6329327576710654e-05, "loss": 0.4997, "step": 194400}, {"epoch": 8.742923892533021, "grad_norm": 1.1107314825057983, "learning_rate": 4.632196165768726e-05, "loss": 0.5234, "step": 194600}, {"epoch": 8.751909425824422, "grad_norm": 8.07888126373291, "learning_rate": 4.63145889422706e-05, "loss": 0.5515, "step": 194800}, {"epoch": 8.760894959115824, "grad_norm": 8.861418724060059, "learning_rate": 4.6307209432810736e-05, "loss": 0.491, "step": 195000}, {"epoch": 8.760894959115824, "eval_loss": 2.562807559967041, "eval_runtime": 1047.6466, "eval_samples_per_second": 9.454, "eval_steps_per_second": 0.148, "step": 195000}, {"epoch": 8.769880492407225, "grad_norm": 15.92845344543457, "learning_rate": 4.62998231316599e-05, "loss": 0.4595, "step": 195200}, {"epoch": 8.778866025698624, "grad_norm": 13.050873756408691, "learning_rate": 4.629243004117246e-05, "loss": 0.486, "step": 195400}, {"epoch": 8.787851558990026, "grad_norm": 2.353410005569458, "learning_rate": 4.6285030163705004e-05, "loss": 0.5059, "step": 195600}, {"epoch": 8.796837092281427, "grad_norm": 6.4239501953125, "learning_rate": 4.6277623501616206e-05, "loss": 0.5145, "step": 195800}, {"epoch": 8.805822625572828, "grad_norm": 10.336437225341797, "learning_rate": 4.627021005726698e-05, "loss": 0.4984, "step": 196000}, {"epoch": 8.805822625572828, "eval_loss": 2.643347978591919, "eval_runtime": 1054.0102, "eval_samples_per_second": 9.396, "eval_steps_per_second": 0.147, "step": 196000}, {"epoch": 8.814808158864228, "grad_norm": 1.9258716106414795, "learning_rate": 4.6262789833020356e-05, "loss": 0.503, "step": 196200}, {"epoch": 8.823793692155629, "grad_norm": 1.0549428462982178, "learning_rate": 4.625536283124154e-05, "loss": 0.5193, "step": 196400}, {"epoch": 8.83277922544703, "grad_norm": 8.691810607910156, "learning_rate": 4.624792905429789e-05, "loss": 0.4829, "step": 196600}, {"epoch": 8.841764758738432, "grad_norm": 2.745849370956421, "learning_rate": 4.624048850455893e-05, "loss": 0.5121, "step": 196800}, {"epoch": 8.850750292029833, "grad_norm": 4.562199115753174, "learning_rate": 4.623304118439635e-05, "loss": 0.4943, "step": 197000}, {"epoch": 8.850750292029833, "eval_loss": 2.5749173164367676, "eval_runtime": 1045.0959, "eval_samples_per_second": 9.477, "eval_steps_per_second": 0.148, "step": 197000}, {"epoch": 8.859735825321232, "grad_norm": 9.411834716796875, "learning_rate": 4.622558709618397e-05, "loss": 0.5262, "step": 197200}, {"epoch": 8.868721358612634, "grad_norm": 35.47937774658203, "learning_rate": 4.62181262422978e-05, "loss": 0.529, "step": 197400}, {"epoch": 8.877706891904035, "grad_norm": 
3.0108392238616943, "learning_rate": 4.6210658625116e-05, "loss": 0.4835, "step": 197600}, {"epoch": 8.886692425195434, "grad_norm": 9.288346290588379, "learning_rate": 4.620318424701887e-05, "loss": 0.5115, "step": 197800}, {"epoch": 8.895677958486836, "grad_norm": 4.2439045906066895, "learning_rate": 4.6195703110388875e-05, "loss": 0.5205, "step": 198000}, {"epoch": 8.895677958486836, "eval_loss": 2.5893914699554443, "eval_runtime": 1047.7304, "eval_samples_per_second": 9.453, "eval_steps_per_second": 0.148, "step": 198000}, {"epoch": 8.904663491778237, "grad_norm": 15.511228561401367, "learning_rate": 4.618821521761063e-05, "loss": 0.501, "step": 198200}, {"epoch": 8.913649025069638, "grad_norm": 27.06317710876465, "learning_rate": 4.618072057107091e-05, "loss": 0.4678, "step": 198400}, {"epoch": 8.92263455836104, "grad_norm": 9.34231185913086, "learning_rate": 4.6173219173158646e-05, "loss": 0.5284, "step": 198600}, {"epoch": 8.931620091652439, "grad_norm": 3.9095022678375244, "learning_rate": 4.6165711026264914e-05, "loss": 0.5517, "step": 198800}, {"epoch": 8.94060562494384, "grad_norm": 16.16065788269043, "learning_rate": 4.6158196132782935e-05, "loss": 0.459, "step": 199000}, {"epoch": 8.94060562494384, "eval_loss": 2.5856435298919678, "eval_runtime": 1051.5622, "eval_samples_per_second": 9.418, "eval_steps_per_second": 0.147, "step": 199000}, {"epoch": 8.949591158235242, "grad_norm": 7.442063331604004, "learning_rate": 4.615067449510809e-05, "loss": 0.5037, "step": 199200}, {"epoch": 8.958576691526643, "grad_norm": 8.311750411987305, "learning_rate": 4.6143146115637915e-05, "loss": 0.5125, "step": 199400}, {"epoch": 8.967562224818042, "grad_norm": 12.351191520690918, "learning_rate": 4.613561099677207e-05, "loss": 0.5011, "step": 199600}, {"epoch": 8.976547758109444, "grad_norm": 4.787649154663086, "learning_rate": 4.61280691409124e-05, "loss": 0.502, "step": 199800}, {"epoch": 8.985533291400845, "grad_norm": 2.0292952060699463, "learning_rate": 4.612052055046287e-05, "loss": 0.51, "step": 200000}, {"epoch": 8.985533291400845, "eval_loss": 2.5565273761749268, "eval_runtime": 1056.4909, "eval_samples_per_second": 9.374, "eval_steps_per_second": 0.147, "step": 200000}, {"epoch": 8.994518824692246, "grad_norm": 17.331127166748047, "learning_rate": 4.61129652278296e-05, "loss": 0.4937, "step": 200200}, {"epoch": 9.003504357983646, "grad_norm": 1.4020780324935913, "learning_rate": 4.6105403175420844e-05, "loss": 0.5383, "step": 200400}, {"epoch": 9.012489891275047, "grad_norm": 8.02592658996582, "learning_rate": 4.6097834395647034e-05, "loss": 0.5085, "step": 200600}, {"epoch": 9.021475424566448, "grad_norm": 4.4860358238220215, "learning_rate": 4.6090258890920706e-05, "loss": 0.4802, "step": 200800}, {"epoch": 9.03046095785785, "grad_norm": 38.50815963745117, "learning_rate": 4.6082676663656575e-05, "loss": 0.4924, "step": 201000}, {"epoch": 9.03046095785785, "eval_loss": 2.609539031982422, "eval_runtime": 1047.6211, "eval_samples_per_second": 9.454, "eval_steps_per_second": 0.148, "step": 201000}, {"epoch": 9.03944649114925, "grad_norm": 6.612710952758789, "learning_rate": 4.607508771627146e-05, "loss": 0.4848, "step": 201200}, {"epoch": 9.04843202444065, "grad_norm": 6.748866558074951, "learning_rate": 4.606749205118437e-05, "loss": 0.4901, "step": 201400}, {"epoch": 9.057417557732052, "grad_norm": 8.580459594726562, "learning_rate": 4.6059889670816415e-05, "loss": 0.4836, "step": 201600}, {"epoch": 9.066403091023453, "grad_norm": 12.98373794555664, "learning_rate": 
4.605228057759087e-05, "loss": 0.5037, "step": 201800}, {"epoch": 9.075388624314852, "grad_norm": 12.246403694152832, "learning_rate": 4.604466477393312e-05, "loss": 0.5253, "step": 202000}, {"epoch": 9.075388624314852, "eval_loss": 2.579723358154297, "eval_runtime": 1049.0403, "eval_samples_per_second": 9.441, "eval_steps_per_second": 0.148, "step": 202000}, {"epoch": 9.084374157606254, "grad_norm": 4.6200995445251465, "learning_rate": 4.603704226227072e-05, "loss": 0.5103, "step": 202200}, {"epoch": 9.093359690897655, "grad_norm": 2.7461910247802734, "learning_rate": 4.6029413045033366e-05, "loss": 0.5191, "step": 202400}, {"epoch": 9.102345224189056, "grad_norm": 9.832839965820312, "learning_rate": 4.602177712465286e-05, "loss": 0.441, "step": 202600}, {"epoch": 9.111330757480456, "grad_norm": 38.25431823730469, "learning_rate": 4.6014134503563164e-05, "loss": 0.4912, "step": 202800}, {"epoch": 9.120316290771857, "grad_norm": 4.103306293487549, "learning_rate": 4.6006485184200365e-05, "loss": 0.5063, "step": 203000}, {"epoch": 9.120316290771857, "eval_loss": 2.5657711029052734, "eval_runtime": 1049.2539, "eval_samples_per_second": 9.439, "eval_steps_per_second": 0.148, "step": 203000}, {"epoch": 9.129301824063258, "grad_norm": 4.588971138000488, "learning_rate": 4.59988291690027e-05, "loss": 0.4868, "step": 203200}, {"epoch": 9.13828735735466, "grad_norm": 4.60148811340332, "learning_rate": 4.599116646041052e-05, "loss": 0.4724, "step": 203400}, {"epoch": 9.14727289064606, "grad_norm": 9.302680969238281, "learning_rate": 4.5983497060866334e-05, "loss": 0.4685, "step": 203600}, {"epoch": 9.15625842393746, "grad_norm": 15.227461814880371, "learning_rate": 4.597582097281475e-05, "loss": 0.4643, "step": 203800}, {"epoch": 9.165243957228862, "grad_norm": 3.3283636569976807, "learning_rate": 4.596813819870254e-05, "loss": 0.4851, "step": 204000}, {"epoch": 9.165243957228862, "eval_loss": 2.586775779724121, "eval_runtime": 1044.1753, "eval_samples_per_second": 9.485, "eval_steps_per_second": 0.148, "step": 204000}, {"epoch": 9.174229490520263, "grad_norm": 13.116498947143555, "learning_rate": 4.596044874097859e-05, "loss": 0.4914, "step": 204200}, {"epoch": 9.183215023811663, "grad_norm": 4.156534194946289, "learning_rate": 4.595275260209392e-05, "loss": 0.4347, "step": 204400}, {"epoch": 9.192200557103064, "grad_norm": 13.453794479370117, "learning_rate": 4.594504978450169e-05, "loss": 0.5118, "step": 204600}, {"epoch": 9.201186090394465, "grad_norm": 7.623902320861816, "learning_rate": 4.5937340290657175e-05, "loss": 0.4727, "step": 204800}, {"epoch": 9.210171623685866, "grad_norm": 1.6703872680664062, "learning_rate": 4.592962412301778e-05, "loss": 0.4967, "step": 205000}, {"epoch": 9.210171623685866, "eval_loss": 2.5800576210021973, "eval_runtime": 1046.6856, "eval_samples_per_second": 9.462, "eval_steps_per_second": 0.148, "step": 205000}, {"epoch": 9.219157156977266, "grad_norm": 5.957919120788574, "learning_rate": 4.5921901284043033e-05, "loss": 0.5113, "step": 205200}, {"epoch": 9.228142690268667, "grad_norm": 1.301614761352539, "learning_rate": 4.5914171776194615e-05, "loss": 0.4691, "step": 205400}, {"epoch": 9.237128223560068, "grad_norm": 10.48454475402832, "learning_rate": 4.59064356019363e-05, "loss": 0.4726, "step": 205600}, {"epoch": 9.24611375685147, "grad_norm": 6.0278825759887695, "learning_rate": 4.5898692763734e-05, "loss": 0.558, "step": 205800}, {"epoch": 9.25509929014287, "grad_norm": 5.763274192810059, "learning_rate": 4.5890943264055754e-05, "loss": 0.5259, "step": 
206000}, {"epoch": 9.25509929014287, "eval_loss": 2.604665756225586, "eval_runtime": 1046.3791, "eval_samples_per_second": 9.465, "eval_steps_per_second": 0.148, "step": 206000}, {"epoch": 9.26408482343427, "grad_norm": 10.876523971557617, "learning_rate": 4.588318710537172e-05, "loss": 0.4809, "step": 206200}, {"epoch": 9.273070356725672, "grad_norm": 0.9701793789863586, "learning_rate": 4.5875424290154175e-05, "loss": 0.4769, "step": 206400}, {"epoch": 9.282055890017073, "grad_norm": 1.0843396186828613, "learning_rate": 4.5867654820877534e-05, "loss": 0.463, "step": 206600}, {"epoch": 9.291041423308473, "grad_norm": 5.901642799377441, "learning_rate": 4.585987870001831e-05, "loss": 0.4497, "step": 206800}, {"epoch": 9.300026956599874, "grad_norm": 3.498466968536377, "learning_rate": 4.585209593005516e-05, "loss": 0.503, "step": 207000}, {"epoch": 9.300026956599874, "eval_loss": 2.567307472229004, "eval_runtime": 1105.7578, "eval_samples_per_second": 8.957, "eval_steps_per_second": 0.14, "step": 207000}, {"epoch": 9.309012489891275, "grad_norm": 6.869686603546143, "learning_rate": 4.5844306513468846e-05, "loss": 0.5243, "step": 207200}, {"epoch": 9.317998023182676, "grad_norm": 6.0725579261779785, "learning_rate": 4.583651045274225e-05, "loss": 0.4945, "step": 207400}, {"epoch": 9.326983556474076, "grad_norm": 7.266490936279297, "learning_rate": 4.582870775036037e-05, "loss": 0.5574, "step": 207600}, {"epoch": 9.335969089765477, "grad_norm": 9.448139190673828, "learning_rate": 4.582089840881032e-05, "loss": 0.4698, "step": 207800}, {"epoch": 9.344954623056879, "grad_norm": 22.13079071044922, "learning_rate": 4.581308243058134e-05, "loss": 0.4998, "step": 208000}, {"epoch": 9.344954623056879, "eval_loss": 2.5886385440826416, "eval_runtime": 1087.2015, "eval_samples_per_second": 9.11, "eval_steps_per_second": 0.143, "step": 208000}, {"epoch": 9.35394015634828, "grad_norm": 5.0014214515686035, "learning_rate": 4.580525981816478e-05, "loss": 0.4776, "step": 208200}, {"epoch": 9.36292568963968, "grad_norm": 14.449097633361816, "learning_rate": 4.57974305740541e-05, "loss": 0.496, "step": 208400}, {"epoch": 9.37191122293108, "grad_norm": 9.349239349365234, "learning_rate": 4.5789594700744885e-05, "loss": 0.4866, "step": 208600}, {"epoch": 9.380896756222482, "grad_norm": 11.318212509155273, "learning_rate": 4.5781752200734826e-05, "loss": 0.5278, "step": 208800}, {"epoch": 9.389882289513883, "grad_norm": 5.554197311401367, "learning_rate": 4.5773903076523715e-05, "loss": 0.5253, "step": 209000}, {"epoch": 9.389882289513883, "eval_loss": 2.599957227706909, "eval_runtime": 1086.4312, "eval_samples_per_second": 9.116, "eval_steps_per_second": 0.143, "step": 209000}, {"epoch": 9.398867822805283, "grad_norm": 6.716334342956543, "learning_rate": 4.5766047330613484e-05, "loss": 0.5018, "step": 209200}, {"epoch": 9.407853356096684, "grad_norm": 6.921196937561035, "learning_rate": 4.5758184965508145e-05, "loss": 0.492, "step": 209400}, {"epoch": 9.416838889388085, "grad_norm": 9.290867805480957, "learning_rate": 4.5750315983713845e-05, "loss": 0.4961, "step": 209600}, {"epoch": 9.425824422679487, "grad_norm": 4.696343898773193, "learning_rate": 4.574244038773881e-05, "loss": 0.5124, "step": 209800}, {"epoch": 9.434809955970888, "grad_norm": 7.172698020935059, "learning_rate": 4.5734558180093414e-05, "loss": 0.5043, "step": 210000}, {"epoch": 9.434809955970888, "eval_loss": 2.5734574794769287, "eval_runtime": 1084.6494, "eval_samples_per_second": 9.131, "eval_steps_per_second": 0.143, "step": 210000}, 
{"epoch": 9.443795489262287, "grad_norm": 11.656425476074219, "learning_rate": 4.5726669363290106e-05, "loss": 0.4677, "step": 210200}, {"epoch": 9.452781022553689, "grad_norm": 5.8166327476501465, "learning_rate": 4.571877393984345e-05, "loss": 0.5262, "step": 210400}, {"epoch": 9.46176655584509, "grad_norm": 9.039112091064453, "learning_rate": 4.571087191227013e-05, "loss": 0.4918, "step": 210600}, {"epoch": 9.47075208913649, "grad_norm": 5.360496520996094, "learning_rate": 4.570296328308892e-05, "loss": 0.4785, "step": 210800}, {"epoch": 9.47973762242789, "grad_norm": 3.4371631145477295, "learning_rate": 4.569504805482069e-05, "loss": 0.5008, "step": 211000}, {"epoch": 9.47973762242789, "eval_loss": 2.543621778488159, "eval_runtime": 1079.1033, "eval_samples_per_second": 9.178, "eval_steps_per_second": 0.144, "step": 211000}, {"epoch": 9.488723155719292, "grad_norm": 34.157020568847656, "learning_rate": 4.568712622998844e-05, "loss": 0.4958, "step": 211200}, {"epoch": 9.497708689010693, "grad_norm": 15.599651336669922, "learning_rate": 4.567919781111726e-05, "loss": 0.4775, "step": 211400}, {"epoch": 9.506694222302094, "grad_norm": 7.594967842102051, "learning_rate": 4.567126280073433e-05, "loss": 0.4781, "step": 211600}, {"epoch": 9.515679755593494, "grad_norm": 6.20011043548584, "learning_rate": 4.566332120136895e-05, "loss": 0.5039, "step": 211800}, {"epoch": 9.524665288884895, "grad_norm": 3.579672336578369, "learning_rate": 4.56553730155525e-05, "loss": 0.5192, "step": 212000}, {"epoch": 9.524665288884895, "eval_loss": 2.5792055130004883, "eval_runtime": 1068.6939, "eval_samples_per_second": 9.267, "eval_steps_per_second": 0.145, "step": 212000}, {"epoch": 9.533650822176297, "grad_norm": 16.665241241455078, "learning_rate": 4.564741824581848e-05, "loss": 0.4815, "step": 212200}, {"epoch": 9.542636355467698, "grad_norm": 2.774914503097534, "learning_rate": 4.563945689470247e-05, "loss": 0.5013, "step": 212400}, {"epoch": 9.551621888759097, "grad_norm": 5.757125377655029, "learning_rate": 4.563148896474218e-05, "loss": 0.4649, "step": 212600}, {"epoch": 9.560607422050499, "grad_norm": 6.996931552886963, "learning_rate": 4.562351445847737e-05, "loss": 0.4774, "step": 212800}, {"epoch": 9.5695929553419, "grad_norm": 8.286883354187012, "learning_rate": 4.561553337844994e-05, "loss": 0.4759, "step": 213000}, {"epoch": 9.5695929553419, "eval_loss": 2.6342129707336426, "eval_runtime": 1062.477, "eval_samples_per_second": 9.322, "eval_steps_per_second": 0.146, "step": 213000}, {"epoch": 9.578578488633301, "grad_norm": 16.222797393798828, "learning_rate": 4.560754572720385e-05, "loss": 0.4855, "step": 213200}, {"epoch": 9.5875640219247, "grad_norm": 3.249690532684326, "learning_rate": 4.559955150728517e-05, "loss": 0.4865, "step": 213400}, {"epoch": 9.596549555216102, "grad_norm": 1.507887601852417, "learning_rate": 4.559155072124208e-05, "loss": 0.4639, "step": 213600}, {"epoch": 9.605535088507503, "grad_norm": 5.217645645141602, "learning_rate": 4.558354337162482e-05, "loss": 0.4814, "step": 213800}, {"epoch": 9.614520621798905, "grad_norm": 8.47757339477539, "learning_rate": 4.557552946098575e-05, "loss": 0.4777, "step": 214000}, {"epoch": 9.614520621798905, "eval_loss": 2.528547525405884, "eval_runtime": 1060.9351, "eval_samples_per_second": 9.335, "eval_steps_per_second": 0.146, "step": 214000}, {"epoch": 9.623506155090304, "grad_norm": 5.725880146026611, "learning_rate": 4.556750899187932e-05, "loss": 0.4685, "step": 214200}, {"epoch": 9.632491688381705, "grad_norm": 2.501408100128174, 
"learning_rate": 4.555948196686204e-05, "loss": 0.4731, "step": 214400}, {"epoch": 9.641477221673107, "grad_norm": 5.393123626708984, "learning_rate": 4.555144838849253e-05, "loss": 0.4806, "step": 214600}, {"epoch": 9.650462754964508, "grad_norm": 9.90498161315918, "learning_rate": 4.5543408259331534e-05, "loss": 0.5061, "step": 214800}, {"epoch": 9.659448288255907, "grad_norm": 9.07691764831543, "learning_rate": 4.553536158194181e-05, "loss": 0.5264, "step": 215000}, {"epoch": 9.659448288255907, "eval_loss": 2.618248462677002, "eval_runtime": 1061.3051, "eval_samples_per_second": 9.332, "eval_steps_per_second": 0.146, "step": 215000}, {"epoch": 9.668433821547309, "grad_norm": 12.091426849365234, "learning_rate": 4.552730835888827e-05, "loss": 0.4808, "step": 215200}, {"epoch": 9.67741935483871, "grad_norm": 10.131613731384277, "learning_rate": 4.551924859273786e-05, "loss": 0.4742, "step": 215400}, {"epoch": 9.686404888130111, "grad_norm": 7.796463966369629, "learning_rate": 4.551118228605966e-05, "loss": 0.4831, "step": 215600}, {"epoch": 9.69539042142151, "grad_norm": 9.690413475036621, "learning_rate": 4.550310944142481e-05, "loss": 0.4876, "step": 215800}, {"epoch": 9.704375954712912, "grad_norm": 23.55455207824707, "learning_rate": 4.549503006140653e-05, "loss": 0.5262, "step": 216000}, {"epoch": 9.704375954712912, "eval_loss": 2.5615086555480957, "eval_runtime": 1066.0893, "eval_samples_per_second": 9.29, "eval_steps_per_second": 0.145, "step": 216000}, {"epoch": 9.713361488004313, "grad_norm": 4.3534674644470215, "learning_rate": 4.548694414858012e-05, "loss": 0.4968, "step": 216200}, {"epoch": 9.722347021295715, "grad_norm": 2.0972509384155273, "learning_rate": 4.5478851705523e-05, "loss": 0.4623, "step": 216400}, {"epoch": 9.731332554587114, "grad_norm": 7.557238578796387, "learning_rate": 4.547075273481461e-05, "loss": 0.4959, "step": 216600}, {"epoch": 9.740318087878515, "grad_norm": 4.63540506362915, "learning_rate": 4.546264723903652e-05, "loss": 0.4961, "step": 216800}, {"epoch": 9.749303621169917, "grad_norm": 6.184654712677002, "learning_rate": 4.545453522077237e-05, "loss": 0.4631, "step": 217000}, {"epoch": 9.749303621169917, "eval_loss": 2.5767123699188232, "eval_runtime": 1069.0714, "eval_samples_per_second": 9.264, "eval_steps_per_second": 0.145, "step": 217000}, {"epoch": 9.758289154461318, "grad_norm": 1.6774091720581055, "learning_rate": 4.544641668260785e-05, "loss": 0.4835, "step": 217200}, {"epoch": 9.767274687752717, "grad_norm": 13.404745101928711, "learning_rate": 4.543829162713078e-05, "loss": 0.4959, "step": 217400}, {"epoch": 9.776260221044119, "grad_norm": 6.530130386352539, "learning_rate": 4.5430160056931004e-05, "loss": 0.5029, "step": 217600}, {"epoch": 9.78524575433552, "grad_norm": 9.423506736755371, "learning_rate": 4.5422021974600484e-05, "loss": 0.4966, "step": 217800}, {"epoch": 9.794231287626921, "grad_norm": 12.464203834533691, "learning_rate": 4.5413877382733226e-05, "loss": 0.447, "step": 218000}, {"epoch": 9.794231287626921, "eval_loss": 2.601382255554199, "eval_runtime": 1079.2743, "eval_samples_per_second": 9.177, "eval_steps_per_second": 0.144, "step": 218000}, {"epoch": 9.80321682091832, "grad_norm": 3.708329439163208, "learning_rate": 4.540572628392534e-05, "loss": 0.4721, "step": 218200}, {"epoch": 9.812202354209722, "grad_norm": 3.581702947616577, "learning_rate": 4.539756868077498e-05, "loss": 0.5079, "step": 218400}, {"epoch": 9.821187887501123, "grad_norm": 2.959970235824585, "learning_rate": 4.53894045758824e-05, "loss": 0.5195, 
"step": 218600}, {"epoch": 9.830173420792525, "grad_norm": 3.9296224117279053, "learning_rate": 4.5381233971849915e-05, "loss": 0.4751, "step": 218800}, {"epoch": 9.839158954083924, "grad_norm": 5.21635103225708, "learning_rate": 4.53730568712819e-05, "loss": 0.4505, "step": 219000}, {"epoch": 9.839158954083924, "eval_loss": 2.5183651447296143, "eval_runtime": 1079.5489, "eval_samples_per_second": 9.174, "eval_steps_per_second": 0.144, "step": 219000}, {"epoch": 9.848144487375325, "grad_norm": 10.114027976989746, "learning_rate": 4.536487327678484e-05, "loss": 0.4909, "step": 219200}, {"epoch": 9.857130020666727, "grad_norm": 4.078984260559082, "learning_rate": 4.535668319096723e-05, "loss": 0.5135, "step": 219400}, {"epoch": 9.866115553958128, "grad_norm": 9.926795959472656, "learning_rate": 4.534848661643969e-05, "loss": 0.5231, "step": 219600}, {"epoch": 9.875101087249528, "grad_norm": 6.326144218444824, "learning_rate": 4.534028355581488e-05, "loss": 0.5147, "step": 219800}, {"epoch": 9.884086620540929, "grad_norm": 7.665927410125732, "learning_rate": 4.5332074011707515e-05, "loss": 0.4863, "step": 220000}, {"epoch": 9.884086620540929, "eval_loss": 2.528228998184204, "eval_runtime": 1079.0365, "eval_samples_per_second": 9.179, "eval_steps_per_second": 0.144, "step": 220000}, {"epoch": 9.89307215383233, "grad_norm": 13.316097259521484, "learning_rate": 4.532385798673442e-05, "loss": 0.517, "step": 220200}, {"epoch": 9.902057687123731, "grad_norm": 6.809960842132568, "learning_rate": 4.531563548351444e-05, "loss": 0.5025, "step": 220400}, {"epoch": 9.91104322041513, "grad_norm": 130.9669189453125, "learning_rate": 4.530740650466852e-05, "loss": 0.4974, "step": 220600}, {"epoch": 9.920028753706532, "grad_norm": 8.149009704589844, "learning_rate": 4.529917105281964e-05, "loss": 0.475, "step": 220800}, {"epoch": 9.929014286997933, "grad_norm": 9.56112289428711, "learning_rate": 4.529092913059287e-05, "loss": 0.5231, "step": 221000}, {"epoch": 9.929014286997933, "eval_loss": 2.5265750885009766, "eval_runtime": 1080.8883, "eval_samples_per_second": 9.163, "eval_steps_per_second": 0.143, "step": 221000}, {"epoch": 9.937999820289335, "grad_norm": 2.8517773151397705, "learning_rate": 4.5282680740615324e-05, "loss": 0.447, "step": 221200}, {"epoch": 9.946985353580734, "grad_norm": 9.419743537902832, "learning_rate": 4.527442588551618e-05, "loss": 0.5271, "step": 221400}, {"epoch": 9.955970886872135, "grad_norm": 5.280923366546631, "learning_rate": 4.5266164567926686e-05, "loss": 0.4949, "step": 221600}, {"epoch": 9.964956420163537, "grad_norm": 2.162322521209717, "learning_rate": 4.525789679048014e-05, "loss": 0.5058, "step": 221800}, {"epoch": 9.973941953454938, "grad_norm": 12.884297370910645, "learning_rate": 4.52496225558119e-05, "loss": 0.4859, "step": 222000}, {"epoch": 9.973941953454938, "eval_loss": 2.5312891006469727, "eval_runtime": 1083.1979, "eval_samples_per_second": 9.143, "eval_steps_per_second": 0.143, "step": 222000}, {"epoch": 9.982927486746338, "grad_norm": 12.709576606750488, "learning_rate": 4.52413418665594e-05, "loss": 0.504, "step": 222200}, {"epoch": 9.991913020037739, "grad_norm": 3.7961857318878174, "learning_rate": 4.523305472536209e-05, "loss": 0.4957, "step": 222400}, {"epoch": 10.00089855332914, "grad_norm": 9.928500175476074, "learning_rate": 4.522476113486153e-05, "loss": 0.497, "step": 222600}, {"epoch": 10.009884086620541, "grad_norm": 2.6933352947235107, "learning_rate": 4.52164610977013e-05, "loss": 0.4644, "step": 222800}, {"epoch": 10.018869619911941, 
"grad_norm": 2.5882034301757812, "learning_rate": 4.520815461652704e-05, "loss": 0.4717, "step": 223000}, {"epoch": 10.018869619911941, "eval_loss": 2.542062997817993, "eval_runtime": 1081.4133, "eval_samples_per_second": 9.158, "eval_steps_per_second": 0.143, "step": 223000}, {"epoch": 10.027855153203342, "grad_norm": 1.036136269569397, "learning_rate": 4.5199841693986446e-05, "loss": 0.4663, "step": 223200}, {"epoch": 10.036840686494743, "grad_norm": 3.3049538135528564, "learning_rate": 4.5191522332729276e-05, "loss": 0.4899, "step": 223400}, {"epoch": 10.045826219786145, "grad_norm": 3.9398066997528076, "learning_rate": 4.518319653540733e-05, "loss": 0.4902, "step": 223600}, {"epoch": 10.054811753077544, "grad_norm": 7.958073139190674, "learning_rate": 4.517486430467446e-05, "loss": 0.4853, "step": 223800}, {"epoch": 10.063797286368946, "grad_norm": 6.440467357635498, "learning_rate": 4.516652564318658e-05, "loss": 0.4674, "step": 224000}, {"epoch": 10.063797286368946, "eval_loss": 2.563239097595215, "eval_runtime": 1080.8507, "eval_samples_per_second": 9.163, "eval_steps_per_second": 0.143, "step": 224000}, {"epoch": 10.072782819660347, "grad_norm": 3.7625374794006348, "learning_rate": 4.5158180553601635e-05, "loss": 0.4607, "step": 224200}, {"epoch": 10.081768352951748, "grad_norm": 2.02681303024292, "learning_rate": 4.514982903857964e-05, "loss": 0.4737, "step": 224400}, {"epoch": 10.09075388624315, "grad_norm": 15.780081748962402, "learning_rate": 4.514147110078264e-05, "loss": 0.4451, "step": 224600}, {"epoch": 10.099739419534549, "grad_norm": 9.11990737915039, "learning_rate": 4.513310674287474e-05, "loss": 0.4585, "step": 224800}, {"epoch": 10.10872495282595, "grad_norm": 19.485971450805664, "learning_rate": 4.512473596752208e-05, "loss": 0.4777, "step": 225000}, {"epoch": 10.10872495282595, "eval_loss": 2.589509963989258, "eval_runtime": 1080.2805, "eval_samples_per_second": 9.168, "eval_steps_per_second": 0.143, "step": 225000}, {"epoch": 10.117710486117351, "grad_norm": 7.728920936584473, "learning_rate": 4.511635877739285e-05, "loss": 0.452, "step": 225200}, {"epoch": 10.126696019408753, "grad_norm": 6.3267412185668945, "learning_rate": 4.51079751751573e-05, "loss": 0.4296, "step": 225400}, {"epoch": 10.135681552700152, "grad_norm": 7.468375205993652, "learning_rate": 4.50995851634877e-05, "loss": 0.4678, "step": 225600}, {"epoch": 10.144667085991554, "grad_norm": 5.496447563171387, "learning_rate": 4.509118874505837e-05, "loss": 0.4364, "step": 225800}, {"epoch": 10.153652619282955, "grad_norm": 1.2194163799285889, "learning_rate": 4.508278592254568e-05, "loss": 0.4963, "step": 226000}, {"epoch": 10.153652619282955, "eval_loss": 2.564985513687134, "eval_runtime": 1079.4368, "eval_samples_per_second": 9.175, "eval_steps_per_second": 0.144, "step": 226000}, {"epoch": 10.162638152574356, "grad_norm": 4.605660438537598, "learning_rate": 4.507437669862804e-05, "loss": 0.5033, "step": 226200}, {"epoch": 10.171623685865756, "grad_norm": 7.148728370666504, "learning_rate": 4.5065961075985894e-05, "loss": 0.46, "step": 226400}, {"epoch": 10.180609219157157, "grad_norm": 6.414613246917725, "learning_rate": 4.505753905730173e-05, "loss": 0.4905, "step": 226600}, {"epoch": 10.189594752448558, "grad_norm": 17.29862403869629, "learning_rate": 4.504911064526007e-05, "loss": 0.4554, "step": 226800}, {"epoch": 10.19858028573996, "grad_norm": 26.544200897216797, "learning_rate": 4.504067584254748e-05, "loss": 0.446, "step": 227000}, {"epoch": 10.19858028573996, "eval_loss": 
2.5394065380096436, "eval_runtime": 1081.1162, "eval_samples_per_second": 9.161, "eval_steps_per_second": 0.143, "step": 227000}, {"epoch": 10.207565819031359, "grad_norm": 2.5992953777313232, "learning_rate": 4.503223465185257e-05, "loss": 0.4749, "step": 227200}, {"epoch": 10.21655135232276, "grad_norm": 5.341890811920166, "learning_rate": 4.5023787075865955e-05, "loss": 0.4482, "step": 227400}, {"epoch": 10.225536885614162, "grad_norm": 1.8888834714889526, "learning_rate": 4.5015333117280324e-05, "loss": 0.465, "step": 227600}, {"epoch": 10.234522418905563, "grad_norm": 7.757589817047119, "learning_rate": 4.500687277879038e-05, "loss": 0.4819, "step": 227800}, {"epoch": 10.243507952196962, "grad_norm": 8.244403839111328, "learning_rate": 4.499840606309285e-05, "loss": 0.4512, "step": 228000}, {"epoch": 10.243507952196962, "eval_loss": 2.5606801509857178, "eval_runtime": 1079.9496, "eval_samples_per_second": 9.171, "eval_steps_per_second": 0.144, "step": 228000}, {"epoch": 10.252493485488364, "grad_norm": 9.635261535644531, "learning_rate": 4.498993297288653e-05, "loss": 0.4661, "step": 228200}, {"epoch": 10.261479018779765, "grad_norm": 0.8005920648574829, "learning_rate": 4.498145351087221e-05, "loss": 0.4503, "step": 228400}, {"epoch": 10.270464552071166, "grad_norm": 13.759466171264648, "learning_rate": 4.497296767975273e-05, "loss": 0.4807, "step": 228600}, {"epoch": 10.279450085362566, "grad_norm": 8.74666976928711, "learning_rate": 4.496447548223295e-05, "loss": 0.4259, "step": 228800}, {"epoch": 10.288435618653967, "grad_norm": 2.4805383682250977, "learning_rate": 4.495597692101977e-05, "loss": 0.4893, "step": 229000}, {"epoch": 10.288435618653967, "eval_loss": 2.536832809448242, "eval_runtime": 1080.0948, "eval_samples_per_second": 9.17, "eval_steps_per_second": 0.144, "step": 229000}, {"epoch": 10.297421151945368, "grad_norm": 16.94227409362793, "learning_rate": 4.494747199882212e-05, "loss": 0.5009, "step": 229200}, {"epoch": 10.30640668523677, "grad_norm": 28.570947647094727, "learning_rate": 4.4938960718350945e-05, "loss": 0.4331, "step": 229400}, {"epoch": 10.315392218528169, "grad_norm": 9.431313514709473, "learning_rate": 4.493044308231921e-05, "loss": 0.4823, "step": 229600}, {"epoch": 10.32437775181957, "grad_norm": 6.612549304962158, "learning_rate": 4.4921919093441944e-05, "loss": 0.4985, "step": 229800}, {"epoch": 10.333363285110972, "grad_norm": 4.512430667877197, "learning_rate": 4.4913388754436156e-05, "loss": 0.4586, "step": 230000}, {"epoch": 10.333363285110972, "eval_loss": 2.5845720767974854, "eval_runtime": 1086.1502, "eval_samples_per_second": 9.118, "eval_steps_per_second": 0.143, "step": 230000}, {"epoch": 10.342348818402373, "grad_norm": 8.223472595214844, "learning_rate": 4.4904852068020906e-05, "loss": 0.4548, "step": 230200}, {"epoch": 10.351334351693772, "grad_norm": 4.4741530418396, "learning_rate": 4.4896309036917264e-05, "loss": 0.4753, "step": 230400}, {"epoch": 10.360319884985174, "grad_norm": 8.382828712463379, "learning_rate": 4.488775966384834e-05, "loss": 0.4858, "step": 230600}, {"epoch": 10.369305418276575, "grad_norm": 5.764524459838867, "learning_rate": 4.4879203951539246e-05, "loss": 0.462, "step": 230800}, {"epoch": 10.378290951567976, "grad_norm": 9.164348602294922, "learning_rate": 4.4870641902717126e-05, "loss": 0.4565, "step": 231000}, {"epoch": 10.378290951567976, "eval_loss": 2.533195972442627, "eval_runtime": 1076.7261, "eval_samples_per_second": 9.198, "eval_steps_per_second": 0.144, "step": 231000}, {"epoch": 
10.387276484859376, "grad_norm": 7.0318732261657715, "learning_rate": 4.486207352011113e-05, "loss": 0.4456, "step": 231200}, {"epoch": 10.396262018150777, "grad_norm": 8.506872177124023, "learning_rate": 4.4853498806452454e-05, "loss": 0.4627, "step": 231400}, {"epoch": 10.405247551442178, "grad_norm": 8.952465057373047, "learning_rate": 4.484491776447428e-05, "loss": 0.4674, "step": 231600}, {"epoch": 10.41423308473358, "grad_norm": 56.0440559387207, "learning_rate": 4.483633039691184e-05, "loss": 0.4451, "step": 231800}, {"epoch": 10.423218618024979, "grad_norm": 2.9122977256774902, "learning_rate": 4.4827736706502344e-05, "loss": 0.4789, "step": 232000}, {"epoch": 10.423218618024979, "eval_loss": 2.555021286010742, "eval_runtime": 1072.7806, "eval_samples_per_second": 9.232, "eval_steps_per_second": 0.144, "step": 232000}, {"epoch": 10.43220415131638, "grad_norm": 11.758764266967773, "learning_rate": 4.481913669598505e-05, "loss": 0.5142, "step": 232200}, {"epoch": 10.441189684607782, "grad_norm": 4.137763023376465, "learning_rate": 4.481053036810121e-05, "loss": 0.4642, "step": 232400}, {"epoch": 10.450175217899183, "grad_norm": 4.821073055267334, "learning_rate": 4.4801917725594113e-05, "loss": 0.4967, "step": 232600}, {"epoch": 10.459160751190582, "grad_norm": 3.3275232315063477, "learning_rate": 4.4793298771209036e-05, "loss": 0.4814, "step": 232800}, {"epoch": 10.468146284481984, "grad_norm": 10.877018928527832, "learning_rate": 4.4784673507693284e-05, "loss": 0.4652, "step": 233000}, {"epoch": 10.468146284481984, "eval_loss": 2.536766529083252, "eval_runtime": 1073.3016, "eval_samples_per_second": 9.228, "eval_steps_per_second": 0.144, "step": 233000}, {"epoch": 10.477131817773385, "grad_norm": 10.973562240600586, "learning_rate": 4.477604193779615e-05, "loss": 0.4667, "step": 233200}, {"epoch": 10.486117351064786, "grad_norm": 6.547046661376953, "learning_rate": 4.476740406426898e-05, "loss": 0.4834, "step": 233400}, {"epoch": 10.495102884356186, "grad_norm": 11.464012145996094, "learning_rate": 4.475875988986509e-05, "loss": 0.4755, "step": 233600}, {"epoch": 10.504088417647587, "grad_norm": 4.013788223266602, "learning_rate": 4.475010941733981e-05, "loss": 0.4742, "step": 233800}, {"epoch": 10.513073950938988, "grad_norm": 0.9032938480377197, "learning_rate": 4.474145264945049e-05, "loss": 0.5054, "step": 234000}, {"epoch": 10.513073950938988, "eval_loss": 2.5643973350524902, "eval_runtime": 1071.8884, "eval_samples_per_second": 9.24, "eval_steps_per_second": 0.145, "step": 234000}, {"epoch": 10.52205948423039, "grad_norm": 12.91777229309082, "learning_rate": 4.47327895889565e-05, "loss": 0.4666, "step": 234200}, {"epoch": 10.53104501752179, "grad_norm": 15.215625762939453, "learning_rate": 4.472412023861917e-05, "loss": 0.4704, "step": 234400}, {"epoch": 10.54003055081319, "grad_norm": 8.357992172241211, "learning_rate": 4.4715444601201884e-05, "loss": 0.4887, "step": 234600}, {"epoch": 10.549016084104592, "grad_norm": 10.161919593811035, "learning_rate": 4.470676267947e-05, "loss": 0.4796, "step": 234800}, {"epoch": 10.558001617395993, "grad_norm": 14.575705528259277, "learning_rate": 4.4698074476190885e-05, "loss": 0.4384, "step": 235000}, {"epoch": 10.558001617395993, "eval_loss": 2.5507290363311768, "eval_runtime": 1070.9659, "eval_samples_per_second": 9.248, "eval_steps_per_second": 0.145, "step": 235000}, {"epoch": 10.566987150687392, "grad_norm": 4.9642109870910645, "learning_rate": 4.4689379994133915e-05, "loss": 0.4849, "step": 235200}, {"epoch": 10.575972683978794, 
"grad_norm": 6.950181007385254, "learning_rate": 4.468067923607047e-05, "loss": 0.4751, "step": 235400}, {"epoch": 10.584958217270195, "grad_norm": 9.092172622680664, "learning_rate": 4.4671972204773913e-05, "loss": 0.4987, "step": 235600}, {"epoch": 10.593943750561596, "grad_norm": 2.7059104442596436, "learning_rate": 4.466325890301963e-05, "loss": 0.5025, "step": 235800}, {"epoch": 10.602929283852998, "grad_norm": 0.9468827247619629, "learning_rate": 4.465453933358498e-05, "loss": 0.449, "step": 236000}, {"epoch": 10.602929283852998, "eval_loss": 2.53763747215271, "eval_runtime": 1070.9813, "eval_samples_per_second": 9.248, "eval_steps_per_second": 0.145, "step": 236000}, {"epoch": 10.611914817144397, "grad_norm": 6.531583309173584, "learning_rate": 4.464581349924933e-05, "loss": 0.513, "step": 236200}, {"epoch": 10.620900350435798, "grad_norm": 10.116623878479004, "learning_rate": 4.4637081402794065e-05, "loss": 0.4852, "step": 236400}, {"epoch": 10.6298858837272, "grad_norm": 6.903548240661621, "learning_rate": 4.462834304700253e-05, "loss": 0.4906, "step": 236600}, {"epoch": 10.6388714170186, "grad_norm": 14.256983757019043, "learning_rate": 4.4619598434660103e-05, "loss": 0.4823, "step": 236800}, {"epoch": 10.64785695031, "grad_norm": 4.879205703735352, "learning_rate": 4.461084756855411e-05, "loss": 0.4704, "step": 237000}, {"epoch": 10.64785695031, "eval_loss": 2.573296546936035, "eval_runtime": 1070.7212, "eval_samples_per_second": 9.25, "eval_steps_per_second": 0.145, "step": 237000}, {"epoch": 10.656842483601402, "grad_norm": 7.068393230438232, "learning_rate": 4.460209045147393e-05, "loss": 0.4907, "step": 237200}, {"epoch": 10.665828016892803, "grad_norm": 9.679513931274414, "learning_rate": 4.459332708621088e-05, "loss": 0.458, "step": 237400}, {"epoch": 10.674813550184204, "grad_norm": 3.086480140686035, "learning_rate": 4.458455747555829e-05, "loss": 0.4512, "step": 237600}, {"epoch": 10.683799083475604, "grad_norm": 7.147046089172363, "learning_rate": 4.4575781622311483e-05, "loss": 0.4981, "step": 237800}, {"epoch": 10.692784616767005, "grad_norm": 7.950299263000488, "learning_rate": 4.456699952926777e-05, "loss": 0.5095, "step": 238000}, {"epoch": 10.692784616767005, "eval_loss": 2.5305910110473633, "eval_runtime": 1069.7405, "eval_samples_per_second": 9.258, "eval_steps_per_second": 0.145, "step": 238000}, {"epoch": 10.701770150058406, "grad_norm": 7.476064205169678, "learning_rate": 4.455821119922646e-05, "loss": 0.4871, "step": 238200}, {"epoch": 10.710755683349806, "grad_norm": 0.6263104677200317, "learning_rate": 4.454941663498882e-05, "loss": 0.487, "step": 238400}, {"epoch": 10.719741216641207, "grad_norm": 12.403650283813477, "learning_rate": 4.4540615839358144e-05, "loss": 0.4504, "step": 238600}, {"epoch": 10.728726749932608, "grad_norm": 4.677651882171631, "learning_rate": 4.4531808815139685e-05, "loss": 0.4703, "step": 238800}, {"epoch": 10.73771228322401, "grad_norm": 3.9398200511932373, "learning_rate": 4.45229955651407e-05, "loss": 0.4882, "step": 239000}, {"epoch": 10.73771228322401, "eval_loss": 2.5735087394714355, "eval_runtime": 1071.2709, "eval_samples_per_second": 9.245, "eval_steps_per_second": 0.145, "step": 239000}, {"epoch": 10.746697816515411, "grad_norm": 7.807620525360107, "learning_rate": 4.45141760921704e-05, "loss": 0.4666, "step": 239200}, {"epoch": 10.75568334980681, "grad_norm": 3.5220091342926025, "learning_rate": 4.450535039904001e-05, "loss": 0.4507, "step": 239400}, {"epoch": 10.764668883098212, "grad_norm": 5.474115371704102, 
"learning_rate": 4.4496518488562735e-05, "loss": 0.5232, "step": 239600}, {"epoch": 10.773654416389613, "grad_norm": 3.3102242946624756, "learning_rate": 4.448768036355374e-05, "loss": 0.4838, "step": 239800}, {"epoch": 10.782639949681014, "grad_norm": 6.073796272277832, "learning_rate": 4.447883602683019e-05, "loss": 0.5051, "step": 240000}, {"epoch": 10.782639949681014, "eval_loss": 2.6252071857452393, "eval_runtime": 1070.75, "eval_samples_per_second": 9.25, "eval_steps_per_second": 0.145, "step": 240000}, {"epoch": 10.791625482972414, "grad_norm": 11.76477336883545, "learning_rate": 4.446998548121123e-05, "loss": 0.4978, "step": 240200}, {"epoch": 10.800611016263815, "grad_norm": 9.04162311553955, "learning_rate": 4.446112872951798e-05, "loss": 0.4882, "step": 240400}, {"epoch": 10.809596549555216, "grad_norm": 7.809966564178467, "learning_rate": 4.445226577457351e-05, "loss": 0.4747, "step": 240600}, {"epoch": 10.818582082846618, "grad_norm": 10.286615371704102, "learning_rate": 4.4443396619202936e-05, "loss": 0.4706, "step": 240800}, {"epoch": 10.827567616138017, "grad_norm": 4.194571018218994, "learning_rate": 4.4434521266233284e-05, "loss": 0.4912, "step": 241000}, {"epoch": 10.827567616138017, "eval_loss": 2.5471911430358887, "eval_runtime": 1122.4761, "eval_samples_per_second": 8.823, "eval_steps_per_second": 0.138, "step": 241000}, {"epoch": 10.836553149429418, "grad_norm": 8.166125297546387, "learning_rate": 4.442563971849358e-05, "loss": 0.4689, "step": 241200}, {"epoch": 10.84553868272082, "grad_norm": 0.8636496663093567, "learning_rate": 4.441675197881483e-05, "loss": 0.5064, "step": 241400}, {"epoch": 10.854524216012221, "grad_norm": 7.717101573944092, "learning_rate": 4.440785805003002e-05, "loss": 0.4968, "step": 241600}, {"epoch": 10.86350974930362, "grad_norm": 6.4478440284729, "learning_rate": 4.439895793497407e-05, "loss": 0.4771, "step": 241800}, {"epoch": 10.872495282595022, "grad_norm": 6.758020877838135, "learning_rate": 4.439005163648393e-05, "loss": 0.464, "step": 242000}, {"epoch": 10.872495282595022, "eval_loss": 2.5376241207122803, "eval_runtime": 1093.847, "eval_samples_per_second": 9.054, "eval_steps_per_second": 0.142, "step": 242000}, {"epoch": 10.881480815886423, "grad_norm": 3.514791488647461, "learning_rate": 4.438113915739847e-05, "loss": 0.4488, "step": 242200}, {"epoch": 10.890466349177824, "grad_norm": 5.87647008895874, "learning_rate": 4.437222050055855e-05, "loss": 0.4547, "step": 242400}, {"epoch": 10.899451882469224, "grad_norm": 7.898502826690674, "learning_rate": 4.4363295668807006e-05, "loss": 0.5082, "step": 242600}, {"epoch": 10.908437415760625, "grad_norm": 23.251298904418945, "learning_rate": 4.435436466498863e-05, "loss": 0.5251, "step": 242800}, {"epoch": 10.917422949052026, "grad_norm": 12.48715877532959, "learning_rate": 4.4345427491950194e-05, "loss": 0.5158, "step": 243000}, {"epoch": 10.917422949052026, "eval_loss": 2.5292649269104004, "eval_runtime": 1091.8273, "eval_samples_per_second": 9.071, "eval_steps_per_second": 0.142, "step": 243000}, {"epoch": 10.926408482343428, "grad_norm": 4.933159351348877, "learning_rate": 4.433648415254043e-05, "loss": 0.4988, "step": 243200}, {"epoch": 10.935394015634827, "grad_norm": 8.043121337890625, "learning_rate": 4.432753464961003e-05, "loss": 0.4807, "step": 243400}, {"epoch": 10.944379548926229, "grad_norm": 5.658725738525391, "learning_rate": 4.431857898601166e-05, "loss": 0.5186, "step": 243600}, {"epoch": 10.95336508221763, "grad_norm": 4.071963787078857, "learning_rate": 
4.4309617164599935e-05, "loss": 0.4554, "step": 243800}, {"epoch": 10.962350615509031, "grad_norm": 11.117284774780273, "learning_rate": 4.430064918823146e-05, "loss": 0.4819, "step": 244000}, {"epoch": 10.962350615509031, "eval_loss": 2.524524211883545, "eval_runtime": 1093.0541, "eval_samples_per_second": 9.061, "eval_steps_per_second": 0.142, "step": 244000}, {"epoch": 10.97133614880043, "grad_norm": 2.5072007179260254, "learning_rate": 4.429167505976477e-05, "loss": 0.462, "step": 244200}, {"epoch": 10.980321682091832, "grad_norm": 0.8460531830787659, "learning_rate": 4.428269478206038e-05, "loss": 0.4288, "step": 244400}, {"epoch": 10.989307215383233, "grad_norm": 14.47143840789795, "learning_rate": 4.4273708357980767e-05, "loss": 0.5106, "step": 244600}, {"epoch": 10.998292748674634, "grad_norm": 7.705573558807373, "learning_rate": 4.426471579039037e-05, "loss": 0.4879, "step": 244800}, {"epoch": 11.007278281966034, "grad_norm": 2.811030626296997, "learning_rate": 4.4255717082155545e-05, "loss": 0.4478, "step": 245000}, {"epoch": 11.007278281966034, "eval_loss": 2.5267140865325928, "eval_runtime": 1093.249, "eval_samples_per_second": 9.059, "eval_steps_per_second": 0.142, "step": 245000}, {"epoch": 11.016263815257435, "grad_norm": 2.7444190979003906, "learning_rate": 4.424671223614466e-05, "loss": 0.4124, "step": 245200}, {"epoch": 11.025249348548837, "grad_norm": 4.81060266494751, "learning_rate": 4.423770125522802e-05, "loss": 0.4267, "step": 245400}, {"epoch": 11.034234881840238, "grad_norm": 8.938187599182129, "learning_rate": 4.4228684142277874e-05, "loss": 0.4374, "step": 245600}, {"epoch": 11.043220415131637, "grad_norm": 2.805171012878418, "learning_rate": 4.421966090016844e-05, "loss": 0.4774, "step": 245800}, {"epoch": 11.052205948423039, "grad_norm": 0.964135468006134, "learning_rate": 4.421063153177588e-05, "loss": 0.4706, "step": 246000}, {"epoch": 11.052205948423039, "eval_loss": 2.5728235244750977, "eval_runtime": 1091.2334, "eval_samples_per_second": 9.076, "eval_steps_per_second": 0.142, "step": 246000}, {"epoch": 11.06119148171444, "grad_norm": 14.399362564086914, "learning_rate": 4.420159603997832e-05, "loss": 0.4882, "step": 246200}, {"epoch": 11.070177015005841, "grad_norm": 10.316938400268555, "learning_rate": 4.4192554427655824e-05, "loss": 0.4716, "step": 246400}, {"epoch": 11.07916254829724, "grad_norm": 6.025542259216309, "learning_rate": 4.418350669769041e-05, "loss": 0.4675, "step": 246600}, {"epoch": 11.088148081588642, "grad_norm": 4.75909948348999, "learning_rate": 4.417445285296606e-05, "loss": 0.4213, "step": 246800}, {"epoch": 11.097133614880043, "grad_norm": 1.9783635139465332, "learning_rate": 4.416539289636869e-05, "loss": 0.4627, "step": 247000}, {"epoch": 11.097133614880043, "eval_loss": 2.543732166290283, "eval_runtime": 1092.6379, "eval_samples_per_second": 9.064, "eval_steps_per_second": 0.142, "step": 247000}, {"epoch": 11.106119148171445, "grad_norm": 15.855208396911621, "learning_rate": 4.415632683078615e-05, "loss": 0.4413, "step": 247200}, {"epoch": 11.115104681462844, "grad_norm": 10.875030517578125, "learning_rate": 4.41472546591083e-05, "loss": 0.462, "step": 247400}, {"epoch": 11.124090214754245, "grad_norm": 12.176704406738281, "learning_rate": 4.413817638422686e-05, "loss": 0.4606, "step": 247600}, {"epoch": 11.133075748045647, "grad_norm": 9.033163070678711, "learning_rate": 4.412909200903555e-05, "loss": 0.4772, "step": 247800}, {"epoch": 11.142061281337048, "grad_norm": 3.4691646099090576, "learning_rate": 
4.4120001536430045e-05, "loss": 0.4675, "step": 248000}, {"epoch": 11.142061281337048, "eval_loss": 2.5572187900543213, "eval_runtime": 1093.238, "eval_samples_per_second": 9.059, "eval_steps_per_second": 0.142, "step": 248000}, {"epoch": 11.151046814628447, "grad_norm": 5.028947830200195, "learning_rate": 4.411090496930791e-05, "loss": 0.4654, "step": 248200}, {"epoch": 11.160032347919849, "grad_norm": 13.782191276550293, "learning_rate": 4.410180231056869e-05, "loss": 0.4893, "step": 248400}, {"epoch": 11.16901788121125, "grad_norm": 18.2941837310791, "learning_rate": 4.4092693563113886e-05, "loss": 0.4495, "step": 248600}, {"epoch": 11.178003414502651, "grad_norm": 3.19677734375, "learning_rate": 4.40835787298469e-05, "loss": 0.4599, "step": 248800}, {"epoch": 11.18698894779405, "grad_norm": 5.5048956871032715, "learning_rate": 4.4074457813673085e-05, "loss": 0.4923, "step": 249000}, {"epoch": 11.18698894779405, "eval_loss": 2.5093724727630615, "eval_runtime": 1090.7596, "eval_samples_per_second": 9.08, "eval_steps_per_second": 0.142, "step": 249000}, {"epoch": 11.195974481085452, "grad_norm": 6.13324499130249, "learning_rate": 4.406533081749976e-05, "loss": 0.4531, "step": 249200}, {"epoch": 11.204960014376853, "grad_norm": 7.9370012283325195, "learning_rate": 4.4056197744236146e-05, "loss": 0.471, "step": 249400}, {"epoch": 11.213945547668255, "grad_norm": 8.390715599060059, "learning_rate": 4.404705859679345e-05, "loss": 0.4765, "step": 249600}, {"epoch": 11.222931080959654, "grad_norm": 5.003363609313965, "learning_rate": 4.403791337808474e-05, "loss": 0.4939, "step": 249800}, {"epoch": 11.231916614251055, "grad_norm": 27.854265213012695, "learning_rate": 4.4028762091025085e-05, "loss": 0.4676, "step": 250000}, {"epoch": 11.231916614251055, "eval_loss": 2.5488498210906982, "eval_runtime": 1093.4053, "eval_samples_per_second": 9.058, "eval_steps_per_second": 0.142, "step": 250000}, {"epoch": 11.240902147542457, "grad_norm": 20.608421325683594, "learning_rate": 4.401960473853146e-05, "loss": 0.4464, "step": 250200}, {"epoch": 11.249887680833858, "grad_norm": 2.9301233291625977, "learning_rate": 4.401044132352279e-05, "loss": 0.4746, "step": 250400}, {"epoch": 11.25887321412526, "grad_norm": 13.66663646697998, "learning_rate": 4.400127184891991e-05, "loss": 0.474, "step": 250600}, {"epoch": 11.267858747416659, "grad_norm": 19.16084098815918, "learning_rate": 4.399209631764559e-05, "loss": 0.4846, "step": 250800}, {"epoch": 11.27684428070806, "grad_norm": 5.497101306915283, "learning_rate": 4.398291473262456e-05, "loss": 0.4921, "step": 251000}, {"epoch": 11.27684428070806, "eval_loss": 2.606623411178589, "eval_runtime": 1091.4454, "eval_samples_per_second": 9.074, "eval_steps_per_second": 0.142, "step": 251000}, {"epoch": 11.285829813999461, "grad_norm": 16.50528335571289, "learning_rate": 4.397372709678344e-05, "loss": 0.4951, "step": 251200}, {"epoch": 11.294815347290863, "grad_norm": 3.4211204051971436, "learning_rate": 4.3964533413050805e-05, "loss": 0.4456, "step": 251400}, {"epoch": 11.303800880582262, "grad_norm": 4.113375186920166, "learning_rate": 4.3955333684357145e-05, "loss": 0.4471, "step": 251600}, {"epoch": 11.312786413873663, "grad_norm": 6.673891067504883, "learning_rate": 4.3946127913634894e-05, "loss": 0.5014, "step": 251800}, {"epoch": 11.321771947165065, "grad_norm": 16.668277740478516, "learning_rate": 4.393691610381838e-05, "loss": 0.4654, "step": 252000}, {"epoch": 11.321771947165065, "eval_loss": 2.590348243713379, "eval_runtime": 1090.7216, 
"eval_samples_per_second": 9.08, "eval_steps_per_second": 0.142, "step": 252000}, {"epoch": 11.330757480456466, "grad_norm": 8.572153091430664, "learning_rate": 4.392769825784389e-05, "loss": 0.4574, "step": 252200}, {"epoch": 11.339743013747865, "grad_norm": 14.801168441772461, "learning_rate": 4.391847437864961e-05, "loss": 0.4844, "step": 252400}, {"epoch": 11.348728547039267, "grad_norm": 10.526625633239746, "learning_rate": 4.390924446917566e-05, "loss": 0.4687, "step": 252600}, {"epoch": 11.357714080330668, "grad_norm": 4.2288126945495605, "learning_rate": 4.390000853236409e-05, "loss": 0.4693, "step": 252800}, {"epoch": 11.36669961362207, "grad_norm": 4.500141143798828, "learning_rate": 4.389076657115886e-05, "loss": 0.4602, "step": 253000}, {"epoch": 11.36669961362207, "eval_loss": 2.5286338329315186, "eval_runtime": 1088.5161, "eval_samples_per_second": 9.099, "eval_steps_per_second": 0.142, "step": 253000}, {"epoch": 11.375685146913469, "grad_norm": 4.990228176116943, "learning_rate": 4.3881518588505846e-05, "loss": 0.4347, "step": 253200}, {"epoch": 11.38467068020487, "grad_norm": 2.7549238204956055, "learning_rate": 4.3872264587352864e-05, "loss": 0.445, "step": 253400}, {"epoch": 11.393656213496271, "grad_norm": 4.3550519943237305, "learning_rate": 4.3863004570649614e-05, "loss": 0.4574, "step": 253600}, {"epoch": 11.402641746787673, "grad_norm": 2.8987128734588623, "learning_rate": 4.385373854134775e-05, "loss": 0.4668, "step": 253800}, {"epoch": 11.411627280079072, "grad_norm": 11.990416526794434, "learning_rate": 4.384446650240082e-05, "loss": 0.4634, "step": 254000}, {"epoch": 11.411627280079072, "eval_loss": 2.5327000617980957, "eval_runtime": 1087.7639, "eval_samples_per_second": 9.105, "eval_steps_per_second": 0.142, "step": 254000}, {"epoch": 11.420612813370473, "grad_norm": 11.864954948425293, "learning_rate": 4.38351884567643e-05, "loss": 0.4627, "step": 254200}, {"epoch": 11.429598346661875, "grad_norm": 8.507243156433105, "learning_rate": 4.3825904407395574e-05, "loss": 0.4492, "step": 254400}, {"epoch": 11.438583879953276, "grad_norm": 3.335512399673462, "learning_rate": 4.3816614357253935e-05, "loss": 0.5134, "step": 254600}, {"epoch": 11.447569413244675, "grad_norm": 9.387479782104492, "learning_rate": 4.38073183093006e-05, "loss": 0.4559, "step": 254800}, {"epoch": 11.456554946536077, "grad_norm": 8.435622215270996, "learning_rate": 4.379801626649869e-05, "loss": 0.4588, "step": 255000}, {"epoch": 11.456554946536077, "eval_loss": 2.593653917312622, "eval_runtime": 1084.7817, "eval_samples_per_second": 9.13, "eval_steps_per_second": 0.143, "step": 255000}, {"epoch": 11.465540479827478, "grad_norm": 1.6870744228363037, "learning_rate": 4.378870823181323e-05, "loss": 0.4554, "step": 255200}, {"epoch": 11.47452601311888, "grad_norm": 6.257181644439697, "learning_rate": 4.3779394208211174e-05, "loss": 0.4805, "step": 255400}, {"epoch": 11.483511546410279, "grad_norm": 2.434807062149048, "learning_rate": 4.3770074198661385e-05, "loss": 0.4651, "step": 255600}, {"epoch": 11.49249707970168, "grad_norm": 3.8635079860687256, "learning_rate": 4.37607482061346e-05, "loss": 0.4393, "step": 255800}, {"epoch": 11.501482612993081, "grad_norm": 16.132322311401367, "learning_rate": 4.37514162336035e-05, "loss": 0.483, "step": 256000}, {"epoch": 11.501482612993081, "eval_loss": 2.567880153656006, "eval_runtime": 1085.3827, "eval_samples_per_second": 9.125, "eval_steps_per_second": 0.143, "step": 256000}, {"epoch": 11.510468146284483, "grad_norm": 18.950214385986328, 
"learning_rate": 4.374207828404267e-05, "loss": 0.4645, "step": 256200}, {"epoch": 11.519453679575882, "grad_norm": 30.078716278076172, "learning_rate": 4.373273436042857e-05, "loss": 0.4436, "step": 256400}, {"epoch": 11.528439212867283, "grad_norm": 11.811574935913086, "learning_rate": 4.3723384465739594e-05, "loss": 0.4611, "step": 256600}, {"epoch": 11.537424746158685, "grad_norm": 7.034965515136719, "learning_rate": 4.371402860295601e-05, "loss": 0.4889, "step": 256800}, {"epoch": 11.546410279450086, "grad_norm": 12.620630264282227, "learning_rate": 4.3704666775060045e-05, "loss": 0.4649, "step": 257000}, {"epoch": 11.546410279450086, "eval_loss": 2.515794038772583, "eval_runtime": 1084.1853, "eval_samples_per_second": 9.135, "eval_steps_per_second": 0.143, "step": 257000}, {"epoch": 11.555395812741486, "grad_norm": 2.5326550006866455, "learning_rate": 4.369529898503576e-05, "loss": 0.4934, "step": 257200}, {"epoch": 11.564381346032887, "grad_norm": 8.968504905700684, "learning_rate": 4.3685925235869155e-05, "loss": 0.4643, "step": 257400}, {"epoch": 11.573366879324288, "grad_norm": 3.6532328128814697, "learning_rate": 4.367654553054811e-05, "loss": 0.4552, "step": 257600}, {"epoch": 11.58235241261569, "grad_norm": 14.925705909729004, "learning_rate": 4.3667159872062434e-05, "loss": 0.4879, "step": 257800}, {"epoch": 11.591337945907089, "grad_norm": 4.690251350402832, "learning_rate": 4.36577682634038e-05, "loss": 0.4709, "step": 258000}, {"epoch": 11.591337945907089, "eval_loss": 2.600820541381836, "eval_runtime": 1083.5624, "eval_samples_per_second": 9.14, "eval_steps_per_second": 0.143, "step": 258000}, {"epoch": 11.60032347919849, "grad_norm": 14.12942123413086, "learning_rate": 4.3648370707565786e-05, "loss": 0.4925, "step": 258200}, {"epoch": 11.609309012489891, "grad_norm": 10.568379402160645, "learning_rate": 4.363896720754389e-05, "loss": 0.4636, "step": 258400}, {"epoch": 11.618294545781293, "grad_norm": 6.521212100982666, "learning_rate": 4.362955776633546e-05, "loss": 0.5114, "step": 258600}, {"epoch": 11.627280079072692, "grad_norm": 5.636810302734375, "learning_rate": 4.362014238693979e-05, "loss": 0.4439, "step": 258800}, {"epoch": 11.636265612364094, "grad_norm": 9.390134811401367, "learning_rate": 4.361072107235803e-05, "loss": 0.4771, "step": 259000}, {"epoch": 11.636265612364094, "eval_loss": 2.567819118499756, "eval_runtime": 1083.8444, "eval_samples_per_second": 9.138, "eval_steps_per_second": 0.143, "step": 259000}, {"epoch": 11.645251145655495, "grad_norm": 6.163935661315918, "learning_rate": 4.360129382559323e-05, "loss": 0.4715, "step": 259200}, {"epoch": 11.654236678946896, "grad_norm": 8.139466285705566, "learning_rate": 4.359186064965032e-05, "loss": 0.4934, "step": 259400}, {"epoch": 11.663222212238296, "grad_norm": 19.77556610107422, "learning_rate": 4.358242154753615e-05, "loss": 0.4945, "step": 259600}, {"epoch": 11.672207745529697, "grad_norm": 1.9366395473480225, "learning_rate": 4.357297652225943e-05, "loss": 0.4604, "step": 259800}, {"epoch": 11.681193278821098, "grad_norm": 5.113880157470703, "learning_rate": 4.356352557683079e-05, "loss": 0.4671, "step": 260000}, {"epoch": 11.681193278821098, "eval_loss": 2.564166307449341, "eval_runtime": 1084.7483, "eval_samples_per_second": 9.13, "eval_steps_per_second": 0.143, "step": 260000}, {"epoch": 11.6901788121125, "grad_norm": 1.103203535079956, "learning_rate": 4.355406871426271e-05, "loss": 0.4809, "step": 260200}, {"epoch": 11.699164345403899, "grad_norm": 3.9322304725646973, "learning_rate": 
4.3544605937569585e-05, "loss": 0.5147, "step": 260400}, {"epoch": 11.7081498786953, "grad_norm": 14.528691291809082, "learning_rate": 4.353513724976765e-05, "loss": 0.46, "step": 260600}, {"epoch": 11.717135411986701, "grad_norm": 4.72658634185791, "learning_rate": 4.3525662653875105e-05, "loss": 0.5064, "step": 260800}, {"epoch": 11.726120945278103, "grad_norm": 1.3560961484909058, "learning_rate": 4.351618215291196e-05, "loss": 0.4535, "step": 261000}, {"epoch": 11.726120945278103, "eval_loss": 2.5357089042663574, "eval_runtime": 1084.1462, "eval_samples_per_second": 9.135, "eval_steps_per_second": 0.143, "step": 261000}, {"epoch": 11.735106478569502, "grad_norm": 14.868110656738281, "learning_rate": 4.350669574990013e-05, "loss": 0.4626, "step": 261200}, {"epoch": 11.744092011860904, "grad_norm": 5.739045143127441, "learning_rate": 4.3497203447863415e-05, "loss": 0.5111, "step": 261400}, {"epoch": 11.753077545152305, "grad_norm": 7.391199111938477, "learning_rate": 4.34877052498275e-05, "loss": 0.485, "step": 261600}, {"epoch": 11.762063078443706, "grad_norm": 7.108745098114014, "learning_rate": 4.347820115881994e-05, "loss": 0.4663, "step": 261800}, {"epoch": 11.771048611735107, "grad_norm": 15.372479438781738, "learning_rate": 4.346869117787018e-05, "loss": 0.4235, "step": 262000}, {"epoch": 11.771048611735107, "eval_loss": 2.5822150707244873, "eval_runtime": 1083.6043, "eval_samples_per_second": 9.14, "eval_steps_per_second": 0.143, "step": 262000}, {"epoch": 11.780034145026507, "grad_norm": 4.675400257110596, "learning_rate": 4.345917531000952e-05, "loss": 0.5049, "step": 262200}, {"epoch": 11.789019678317908, "grad_norm": 7.368799209594727, "learning_rate": 4.344965355827117e-05, "loss": 0.4666, "step": 262400}, {"epoch": 11.79800521160931, "grad_norm": 24.108701705932617, "learning_rate": 4.344012592569018e-05, "loss": 0.4994, "step": 262600}, {"epoch": 11.806990744900709, "grad_norm": 3.419159412384033, "learning_rate": 4.34305924153035e-05, "loss": 0.473, "step": 262800}, {"epoch": 11.81597627819211, "grad_norm": 29.086864471435547, "learning_rate": 4.3421053030149936e-05, "loss": 0.4757, "step": 263000}, {"epoch": 11.81597627819211, "eval_loss": 2.5641908645629883, "eval_runtime": 1084.8454, "eval_samples_per_second": 9.129, "eval_steps_per_second": 0.143, "step": 263000}, {"epoch": 11.824961811483512, "grad_norm": 11.448222160339355, "learning_rate": 4.341150777327019e-05, "loss": 0.4729, "step": 263200}, {"epoch": 11.833947344774913, "grad_norm": 4.488698482513428, "learning_rate": 4.34019566477068e-05, "loss": 0.4513, "step": 263400}, {"epoch": 11.842932878066314, "grad_norm": 2.3001222610473633, "learning_rate": 4.3392399656504214e-05, "loss": 0.4475, "step": 263600}, {"epoch": 11.851918411357714, "grad_norm": 6.0910844802856445, "learning_rate": 4.3382836802708715e-05, "loss": 0.5439, "step": 263800}, {"epoch": 11.860903944649115, "grad_norm": 4.601564407348633, "learning_rate": 4.337326808936848e-05, "loss": 0.4688, "step": 264000}, {"epoch": 11.860903944649115, "eval_loss": 2.945237874984741, "eval_runtime": 1100.7652, "eval_samples_per_second": 8.997, "eval_steps_per_second": 0.091, "step": 264000}, {"epoch": 11.869889477940516, "grad_norm": 5.200575828552246, "learning_rate": 4.336369351953354e-05, "loss": 0.4502, "step": 264200}, {"epoch": 11.878875011231916, "grad_norm": 0.4828265905380249, "learning_rate": 4.335411309625581e-05, "loss": 0.4914, "step": 264400}, {"epoch": 11.887860544523317, "grad_norm": 6.368671894073486, "learning_rate": 4.334452682258905e-05, 
"loss": 0.47, "step": 264600}, {"epoch": 11.896846077814718, "grad_norm": 11.522847175598145, "learning_rate": 4.333493470158888e-05, "loss": 0.4316, "step": 264800}, {"epoch": 11.90583161110612, "grad_norm": 5.565563678741455, "learning_rate": 4.3325336736312814e-05, "loss": 0.5091, "step": 265000}, {"epoch": 11.90583161110612, "eval_loss": 2.9430134296417236, "eval_runtime": 1099.498, "eval_samples_per_second": 9.008, "eval_steps_per_second": 0.091, "step": 265000}, {"epoch": 11.91481714439752, "grad_norm": 2.104519844055176, "learning_rate": 4.331573292982021e-05, "loss": 0.4338, "step": 265200}, {"epoch": 11.92380267768892, "grad_norm": 5.740574836730957, "learning_rate": 4.3306123285172275e-05, "loss": 0.4399, "step": 265400}, {"epoch": 11.932788210980322, "grad_norm": 5.429746150970459, "learning_rate": 4.329650780543211e-05, "loss": 0.479, "step": 265600}, {"epoch": 11.941773744271723, "grad_norm": 1.9795042276382446, "learning_rate": 4.328688649366465e-05, "loss": 0.4407, "step": 265800}, {"epoch": 11.950759277563124, "grad_norm": 7.313149452209473, "learning_rate": 4.327725935293668e-05, "loss": 0.4642, "step": 266000}, {"epoch": 11.950759277563124, "eval_loss": 3.0007801055908203, "eval_runtime": 1098.5023, "eval_samples_per_second": 9.016, "eval_steps_per_second": 0.091, "step": 266000}, {"epoch": 11.959744810854524, "grad_norm": 3.4922845363616943, "learning_rate": 4.3267626386316884e-05, "loss": 0.4454, "step": 266200}, {"epoch": 11.968730344145925, "grad_norm": 20.564990997314453, "learning_rate": 4.325798759687577e-05, "loss": 0.4763, "step": 266400}, {"epoch": 11.977715877437326, "grad_norm": 15.71061897277832, "learning_rate": 4.324834298768571e-05, "loss": 0.4989, "step": 266600}, {"epoch": 11.986701410728728, "grad_norm": 5.444253921508789, "learning_rate": 4.323869256182092e-05, "loss": 0.4474, "step": 266800}, {"epoch": 11.995686944020127, "grad_norm": 7.9454216957092285, "learning_rate": 4.3229036322357505e-05, "loss": 0.4415, "step": 267000}, {"epoch": 11.995686944020127, "eval_loss": 2.9907069206237793, "eval_runtime": 1098.2527, "eval_samples_per_second": 9.018, "eval_steps_per_second": 0.091, "step": 267000}, {"epoch": 12.004672477311528, "grad_norm": 10.628538131713867, "learning_rate": 4.3219374272373375e-05, "loss": 0.4892, "step": 267200}, {"epoch": 12.01365801060293, "grad_norm": 11.927538871765137, "learning_rate": 4.3209706414948326e-05, "loss": 0.4157, "step": 267400}, {"epoch": 12.02264354389433, "grad_norm": 4.5106682777404785, "learning_rate": 4.3200032753164004e-05, "loss": 0.4235, "step": 267600}, {"epoch": 12.03162907718573, "grad_norm": 9.342924118041992, "learning_rate": 4.319035329010389e-05, "loss": 0.4333, "step": 267800}, {"epoch": 12.040614610477132, "grad_norm": 5.0819244384765625, "learning_rate": 4.3180668028853314e-05, "loss": 0.4374, "step": 268000}, {"epoch": 12.040614610477132, "eval_loss": 2.9819138050079346, "eval_runtime": 1099.2643, "eval_samples_per_second": 9.01, "eval_steps_per_second": 0.091, "step": 268000}, {"epoch": 12.049600143768533, "grad_norm": 11.678213119506836, "learning_rate": 4.317097697249948e-05, "loss": 0.4525, "step": 268200}, {"epoch": 12.058585677059934, "grad_norm": 5.52247428894043, "learning_rate": 4.31612801241314e-05, "loss": 0.4444, "step": 268400}, {"epoch": 12.067571210351334, "grad_norm": 6.6727190017700195, "learning_rate": 4.315157748683996e-05, "loss": 0.4566, "step": 268600}, {"epoch": 12.076556743642735, "grad_norm": 5.082212448120117, "learning_rate": 4.314186906371788e-05, "loss": 0.4681, 
"step": 268800}, {"epoch": 12.085542276934136, "grad_norm": 12.604265213012695, "learning_rate": 4.3132154857859744e-05, "loss": 0.4056, "step": 269000}, {"epoch": 12.085542276934136, "eval_loss": 2.960404634475708, "eval_runtime": 1098.0453, "eval_samples_per_second": 9.02, "eval_steps_per_second": 0.091, "step": 269000}, {"epoch": 12.094527810225538, "grad_norm": 10.235774993896484, "learning_rate": 4.312243487236194e-05, "loss": 0.4455, "step": 269200}, {"epoch": 12.103513343516937, "grad_norm": 7.912709712982178, "learning_rate": 4.3112709110322744e-05, "loss": 0.4643, "step": 269400}, {"epoch": 12.112498876808338, "grad_norm": 4.5928473472595215, "learning_rate": 4.310297757484224e-05, "loss": 0.4281, "step": 269600}, {"epoch": 12.12148441009974, "grad_norm": 1.3474705219268799, "learning_rate": 4.309324026902236e-05, "loss": 0.4354, "step": 269800}, {"epoch": 12.130469943391141, "grad_norm": 7.204748153686523, "learning_rate": 4.3083497195966887e-05, "loss": 0.42, "step": 270000}, {"epoch": 12.130469943391141, "eval_loss": 3.0123867988586426, "eval_runtime": 1098.9017, "eval_samples_per_second": 9.013, "eval_steps_per_second": 0.091, "step": 270000}, {"epoch": 12.13945547668254, "grad_norm": 3.3051373958587646, "learning_rate": 4.3073748358781424e-05, "loss": 0.4633, "step": 270200}, {"epoch": 12.148441009973942, "grad_norm": 3.480196952819824, "learning_rate": 4.306399376057343e-05, "loss": 0.4057, "step": 270400}, {"epoch": 12.157426543265343, "grad_norm": 14.72482681274414, "learning_rate": 4.305423340445218e-05, "loss": 0.4233, "step": 270600}, {"epoch": 12.166412076556744, "grad_norm": 8.279642105102539, "learning_rate": 4.304446729352881e-05, "loss": 0.4694, "step": 270800}, {"epoch": 12.175397609848144, "grad_norm": 4.855335712432861, "learning_rate": 4.303469543091627e-05, "loss": 0.4497, "step": 271000}, {"epoch": 12.175397609848144, "eval_loss": 2.980236291885376, "eval_runtime": 1098.5437, "eval_samples_per_second": 9.016, "eval_steps_per_second": 0.091, "step": 271000}, {"epoch": 12.184383143139545, "grad_norm": 9.080001831054688, "learning_rate": 4.302491781972935e-05, "loss": 0.4435, "step": 271200}, {"epoch": 12.193368676430946, "grad_norm": 2.5085525512695312, "learning_rate": 4.301513446308466e-05, "loss": 0.4243, "step": 271400}, {"epoch": 12.202354209722348, "grad_norm": 10.801093101501465, "learning_rate": 4.300534536410068e-05, "loss": 0.4641, "step": 271600}, {"epoch": 12.211339743013747, "grad_norm": 2.8049042224884033, "learning_rate": 4.2995550525897667e-05, "loss": 0.4632, "step": 271800}, {"epoch": 12.220325276305148, "grad_norm": 4.995143413543701, "learning_rate": 4.298574995159774e-05, "loss": 0.4471, "step": 272000}, {"epoch": 12.220325276305148, "eval_loss": 2.955246686935425, "eval_runtime": 1098.9794, "eval_samples_per_second": 9.012, "eval_steps_per_second": 0.091, "step": 272000}, {"epoch": 12.22931080959655, "grad_norm": 2.9934492111206055, "learning_rate": 4.297594364432486e-05, "loss": 0.4534, "step": 272200}, {"epoch": 12.238296342887951, "grad_norm": 6.686132907867432, "learning_rate": 4.2966131607204764e-05, "loss": 0.4186, "step": 272400}, {"epoch": 12.24728187617935, "grad_norm": 7.996724605560303, "learning_rate": 4.295631384336507e-05, "loss": 0.4452, "step": 272600}, {"epoch": 12.256267409470752, "grad_norm": 3.5460829734802246, "learning_rate": 4.294649035593519e-05, "loss": 0.4479, "step": 272800}, {"epoch": 12.265252942762153, "grad_norm": 6.196242809295654, "learning_rate": 4.2936661148046375e-05, "loss": 0.5112, "step": 273000}, 
{"epoch": 12.265252942762153, "eval_loss": 2.9934980869293213, "eval_runtime": 1098.838, "eval_samples_per_second": 9.013, "eval_steps_per_second": 0.091, "step": 273000}, {"epoch": 12.274238476053554, "grad_norm": 3.0045993328094482, "learning_rate": 4.292682622283168e-05, "loss": 0.4462, "step": 273200}, {"epoch": 12.283224009344954, "grad_norm": 5.161373138427734, "learning_rate": 4.2916985583426016e-05, "loss": 0.459, "step": 273400}, {"epoch": 12.292209542636355, "grad_norm": 2.4376187324523926, "learning_rate": 4.290713923296607e-05, "loss": 0.4572, "step": 273600}, {"epoch": 12.301195075927756, "grad_norm": 1.416688323020935, "learning_rate": 4.289728717459041e-05, "loss": 0.4842, "step": 273800}, {"epoch": 12.310180609219158, "grad_norm": 7.329530715942383, "learning_rate": 4.288742941143935e-05, "loss": 0.4582, "step": 274000}, {"epoch": 12.310180609219158, "eval_loss": 3.067824125289917, "eval_runtime": 1099.4168, "eval_samples_per_second": 9.008, "eval_steps_per_second": 0.091, "step": 274000}, {"epoch": 12.319166142510557, "grad_norm": 12.674388885498047, "learning_rate": 4.287756594665508e-05, "loss": 0.4969, "step": 274200}, {"epoch": 12.328151675801958, "grad_norm": 12.752253532409668, "learning_rate": 4.286769678338159e-05, "loss": 0.4488, "step": 274400}, {"epoch": 12.33713720909336, "grad_norm": 22.549896240234375, "learning_rate": 4.285782192476467e-05, "loss": 0.4084, "step": 274600}, {"epoch": 12.346122742384761, "grad_norm": 18.12051010131836, "learning_rate": 4.284794137395195e-05, "loss": 0.4575, "step": 274800}, {"epoch": 12.35510827567616, "grad_norm": 0.43731093406677246, "learning_rate": 4.283805513409287e-05, "loss": 0.4361, "step": 275000}, {"epoch": 12.35510827567616, "eval_loss": 2.9659314155578613, "eval_runtime": 1099.8228, "eval_samples_per_second": 9.005, "eval_steps_per_second": 0.091, "step": 275000}, {"epoch": 12.364093808967562, "grad_norm": 19.862689971923828, "learning_rate": 4.282816320833866e-05, "loss": 0.4251, "step": 275200}, {"epoch": 12.373079342258963, "grad_norm": 10.183892250061035, "learning_rate": 4.281826559984239e-05, "loss": 0.4746, "step": 275400}, {"epoch": 12.382064875550364, "grad_norm": 5.8187642097473145, "learning_rate": 4.280836231175893e-05, "loss": 0.4471, "step": 275600}, {"epoch": 12.391050408841764, "grad_norm": 15.410677909851074, "learning_rate": 4.279845334724496e-05, "loss": 0.4219, "step": 275800}, {"epoch": 12.400035942133165, "grad_norm": 3.4729344844818115, "learning_rate": 4.2788538709458984e-05, "loss": 0.4493, "step": 276000}, {"epoch": 12.400035942133165, "eval_loss": 3.924736499786377, "eval_runtime": 1200.9974, "eval_samples_per_second": 8.246, "eval_steps_per_second": 0.032, "step": 276000}, {"epoch": 12.409021475424566, "grad_norm": 3.802396059036255, "learning_rate": 4.277861840156128e-05, "loss": 0.4697, "step": 276200}, {"epoch": 12.418007008715968, "grad_norm": 3.487226963043213, "learning_rate": 4.276869242671396e-05, "loss": 0.4842, "step": 276400}, {"epoch": 12.426992542007369, "grad_norm": 15.522408485412598, "learning_rate": 4.275876078808095e-05, "loss": 0.4582, "step": 276600}, {"epoch": 12.435978075298769, "grad_norm": 4.422022819519043, "learning_rate": 4.274882348882795e-05, "loss": 0.4654, "step": 276800}, {"epoch": 12.44496360859017, "grad_norm": 7.4790520668029785, "learning_rate": 4.27388805321225e-05, "loss": 0.4306, "step": 277000}, {"epoch": 12.44496360859017, "eval_loss": 3.912114143371582, "eval_runtime": 1203.2852, "eval_samples_per_second": 8.231, "eval_steps_per_second": 0.032, 
"step": 277000}, {"epoch": 12.453949141881571, "grad_norm": 23.840351104736328, "learning_rate": 4.272893192113391e-05, "loss": 0.4198, "step": 277200}, {"epoch": 12.46293467517297, "grad_norm": 2.9730992317199707, "learning_rate": 4.271897765903332e-05, "loss": 0.4503, "step": 277400}, {"epoch": 12.471920208464372, "grad_norm": 5.045375823974609, "learning_rate": 4.2709017748993654e-05, "loss": 0.4917, "step": 277600}, {"epoch": 12.480905741755773, "grad_norm": 5.691855430603027, "learning_rate": 4.269905219418964e-05, "loss": 0.4699, "step": 277800}, {"epoch": 12.489891275047174, "grad_norm": 3.8715128898620605, "learning_rate": 4.2689080997797815e-05, "loss": 0.4549, "step": 278000}, {"epoch": 12.489891275047174, "eval_loss": 3.87607741355896, "eval_runtime": 1200.8926, "eval_samples_per_second": 8.247, "eval_steps_per_second": 0.032, "step": 278000}, {"epoch": 12.498876808338576, "grad_norm": 6.021407604217529, "learning_rate": 4.2679104162996495e-05, "loss": 0.4249, "step": 278200}, {"epoch": 12.507862341629975, "grad_norm": 7.7932538986206055, "learning_rate": 4.266912169296581e-05, "loss": 0.4297, "step": 278400}, {"epoch": 12.516847874921377, "grad_norm": 20.896095275878906, "learning_rate": 4.265913359088769e-05, "loss": 0.4688, "step": 278600}, {"epoch": 12.525833408212778, "grad_norm": 17.99188804626465, "learning_rate": 4.264913985994583e-05, "loss": 0.4563, "step": 278800}, {"epoch": 12.534818941504179, "grad_norm": 1.572239875793457, "learning_rate": 4.263914050332576e-05, "loss": 0.4485, "step": 279000}, {"epoch": 12.534818941504179, "eval_loss": 3.8942999839782715, "eval_runtime": 1200.7162, "eval_samples_per_second": 8.248, "eval_steps_per_second": 0.032, "step": 279000}, {"epoch": 12.543804474795579, "grad_norm": 1.1558527946472168, "learning_rate": 4.2629135524214777e-05, "loss": 0.4433, "step": 279200}, {"epoch": 12.55279000808698, "grad_norm": 10.34830379486084, "learning_rate": 4.261912492580197e-05, "loss": 0.4556, "step": 279400}, {"epoch": 12.561775541378381, "grad_norm": 8.091256141662598, "learning_rate": 4.260910871127823e-05, "loss": 0.4459, "step": 279600}, {"epoch": 12.570761074669782, "grad_norm": 5.710160732269287, "learning_rate": 4.2599086883836236e-05, "loss": 0.4667, "step": 279800}, {"epoch": 12.579746607961182, "grad_norm": 11.081522941589355, "learning_rate": 4.2589059446670454e-05, "loss": 0.4969, "step": 280000}, {"epoch": 12.579746607961182, "eval_loss": 3.8589940071105957, "eval_runtime": 1206.2897, "eval_samples_per_second": 8.21, "eval_steps_per_second": 0.032, "step": 280000}, {"epoch": 12.588732141252583, "grad_norm": 17.533634185791016, "learning_rate": 4.257902640297714e-05, "loss": 0.4725, "step": 280200}, {"epoch": 12.597717674543985, "grad_norm": 2.660717487335205, "learning_rate": 4.256898775595432e-05, "loss": 0.4301, "step": 280400}, {"epoch": 12.606703207835386, "grad_norm": 22.708642959594727, "learning_rate": 4.255894350880185e-05, "loss": 0.4595, "step": 280600}, {"epoch": 12.615688741126785, "grad_norm": 8.68639087677002, "learning_rate": 4.254889366472131e-05, "loss": 0.512, "step": 280800}, {"epoch": 12.624674274418187, "grad_norm": 9.3152494430542, "learning_rate": 4.253883822691612e-05, "loss": 0.4898, "step": 281000}, {"epoch": 12.624674274418187, "eval_loss": 3.836467981338501, "eval_runtime": 1202.5642, "eval_samples_per_second": 8.236, "eval_steps_per_second": 0.032, "step": 281000}, {"epoch": 12.633659807709588, "grad_norm": 1.8501704931259155, "learning_rate": 4.252877719859145e-05, "loss": 0.4381, "step": 281200}, 
{"epoch": 12.64264534100099, "grad_norm": 9.407011032104492, "learning_rate": 4.2518710582954255e-05, "loss": 0.4878, "step": 281400}, {"epoch": 12.651630874292389, "grad_norm": 18.41656494140625, "learning_rate": 4.2508638383213296e-05, "loss": 0.4736, "step": 281600}, {"epoch": 12.66061640758379, "grad_norm": 8.159863471984863, "learning_rate": 4.249856060257908e-05, "loss": 0.4956, "step": 281800}, {"epoch": 12.669601940875191, "grad_norm": 2.042884588241577, "learning_rate": 4.248847724426391e-05, "loss": 0.4835, "step": 282000}, {"epoch": 12.669601940875191, "eval_loss": 3.9126241207122803, "eval_runtime": 1205.3139, "eval_samples_per_second": 8.217, "eval_steps_per_second": 0.032, "step": 282000}, {"epoch": 12.678587474166592, "grad_norm": 6.690242767333984, "learning_rate": 4.247838831148186e-05, "loss": 0.4672, "step": 282200}, {"epoch": 12.687573007457992, "grad_norm": 1.212893009185791, "learning_rate": 4.24682938074488e-05, "loss": 0.4522, "step": 282400}, {"epoch": 12.696558540749393, "grad_norm": 6.8718581199646, "learning_rate": 4.245819373538235e-05, "loss": 0.4921, "step": 282600}, {"epoch": 12.705544074040795, "grad_norm": 5.218339920043945, "learning_rate": 4.244808809850193e-05, "loss": 0.4412, "step": 282800}, {"epoch": 12.714529607332196, "grad_norm": 3.228175401687622, "learning_rate": 4.24379769000287e-05, "loss": 0.4452, "step": 283000}, {"epoch": 12.714529607332196, "eval_loss": 3.84609055519104, "eval_runtime": 1201.2158, "eval_samples_per_second": 8.245, "eval_steps_per_second": 0.032, "step": 283000}, {"epoch": 12.723515140623595, "grad_norm": 10.501503944396973, "learning_rate": 4.2427860143185625e-05, "loss": 0.4471, "step": 283200}, {"epoch": 12.732500673914997, "grad_norm": 10.110664367675781, "learning_rate": 4.241773783119742e-05, "loss": 0.4441, "step": 283400}, {"epoch": 12.741486207206398, "grad_norm": 5.942151069641113, "learning_rate": 4.240760996729061e-05, "loss": 0.4631, "step": 283600}, {"epoch": 12.7504717404978, "grad_norm": 17.07978057861328, "learning_rate": 4.2397476554693427e-05, "loss": 0.4466, "step": 283800}, {"epoch": 12.759457273789199, "grad_norm": 6.301132678985596, "learning_rate": 4.238733759663592e-05, "loss": 0.4957, "step": 284000}, {"epoch": 12.759457273789199, "eval_loss": 3.8400514125823975, "eval_runtime": 1202.3941, "eval_samples_per_second": 8.237, "eval_steps_per_second": 0.032, "step": 284000}, {"epoch": 12.7684428070806, "grad_norm": 4.1205573081970215, "learning_rate": 4.237719309634989e-05, "loss": 0.4325, "step": 284200}, {"epoch": 12.777428340372001, "grad_norm": 2.6801910400390625, "learning_rate": 4.236704305706889e-05, "loss": 0.478, "step": 284400}, {"epoch": 12.786413873663403, "grad_norm": 5.553824424743652, "learning_rate": 4.235688748202828e-05, "loss": 0.4462, "step": 284600}, {"epoch": 12.795399406954802, "grad_norm": 4.970882415771484, "learning_rate": 4.234672637446514e-05, "loss": 0.4544, "step": 284800}, {"epoch": 12.804384940246203, "grad_norm": 7.782638072967529, "learning_rate": 4.233655973761833e-05, "loss": 0.4713, "step": 285000}, {"epoch": 12.804384940246203, "eval_loss": 3.8344786167144775, "eval_runtime": 1202.9038, "eval_samples_per_second": 8.233, "eval_steps_per_second": 0.032, "step": 285000}, {"epoch": 12.813370473537605, "grad_norm": 4.948213577270508, "learning_rate": 4.232638757472849e-05, "loss": 0.452, "step": 285200}, {"epoch": 12.822356006829006, "grad_norm": 16.379188537597656, "learning_rate": 4.2316209889037986e-05, "loss": 0.4633, "step": 285400}, {"epoch": 12.831341540120405, 
"grad_norm": 3.503868341445923, "learning_rate": 4.230602668379098e-05, "loss": 0.467, "step": 285600}, {"epoch": 12.840327073411807, "grad_norm": 1.0399272441864014, "learning_rate": 4.229583796223337e-05, "loss": 0.43, "step": 285800}, {"epoch": 12.849312606703208, "grad_norm": 1.698477029800415, "learning_rate": 4.228564372761281e-05, "loss": 0.4586, "step": 286000}, {"epoch": 12.849312606703208, "eval_loss": 3.8653202056884766, "eval_runtime": 1185.291, "eval_samples_per_second": 8.356, "eval_steps_per_second": 0.033, "step": 286000}, {"epoch": 12.85829813999461, "grad_norm": 10.822354316711426, "learning_rate": 4.2275443983178744e-05, "loss": 0.4417, "step": 286200}, {"epoch": 12.867283673286009, "grad_norm": 8.866846084594727, "learning_rate": 4.2265238732182334e-05, "loss": 0.4166, "step": 286400}, {"epoch": 12.87626920657741, "grad_norm": 4.1137261390686035, "learning_rate": 4.225502797787651e-05, "loss": 0.4994, "step": 286600}, {"epoch": 12.885254739868811, "grad_norm": 3.115154266357422, "learning_rate": 4.224481172351596e-05, "loss": 0.4336, "step": 286800}, {"epoch": 12.894240273160213, "grad_norm": 7.953911304473877, "learning_rate": 4.2234589972357144e-05, "loss": 0.4433, "step": 287000}, {"epoch": 12.894240273160213, "eval_loss": 3.8534297943115234, "eval_runtime": 1184.3457, "eval_samples_per_second": 8.362, "eval_steps_per_second": 0.033, "step": 287000}, {"epoch": 12.903225806451612, "grad_norm": 3.455723524093628, "learning_rate": 4.222436272765822e-05, "loss": 0.4541, "step": 287200}, {"epoch": 12.912211339743013, "grad_norm": 9.256354331970215, "learning_rate": 4.221412999267915e-05, "loss": 0.4282, "step": 287400}, {"epoch": 12.921196873034415, "grad_norm": 5.0986409187316895, "learning_rate": 4.220389177068163e-05, "loss": 0.4577, "step": 287600}, {"epoch": 12.930182406325816, "grad_norm": 10.405719757080078, "learning_rate": 4.2193648064929094e-05, "loss": 0.4245, "step": 287800}, {"epoch": 12.939167939617215, "grad_norm": 6.69377326965332, "learning_rate": 4.218339887868673e-05, "loss": 0.4955, "step": 288000}, {"epoch": 12.939167939617215, "eval_loss": 3.7864327430725098, "eval_runtime": 1165.7975, "eval_samples_per_second": 8.495, "eval_steps_per_second": 0.033, "step": 288000}, {"epoch": 12.948153472908617, "grad_norm": 4.542316436767578, "learning_rate": 4.2173144215221475e-05, "loss": 0.4509, "step": 288200}, {"epoch": 12.957139006200018, "grad_norm": 9.559526443481445, "learning_rate": 4.216288407780202e-05, "loss": 0.426, "step": 288400}, {"epoch": 12.96612453949142, "grad_norm": 7.886917591094971, "learning_rate": 4.21526184696988e-05, "loss": 0.4613, "step": 288600}, {"epoch": 12.975110072782819, "grad_norm": 4.012725353240967, "learning_rate": 4.214234739418396e-05, "loss": 0.4668, "step": 288800}, {"epoch": 12.98409560607422, "grad_norm": 10.49506664276123, "learning_rate": 4.213207085453143e-05, "loss": 0.4632, "step": 289000}, {"epoch": 12.98409560607422, "eval_loss": 3.8832597732543945, "eval_runtime": 1163.551, "eval_samples_per_second": 8.512, "eval_steps_per_second": 0.034, "step": 289000}, {"epoch": 12.993081139365621, "grad_norm": 14.843647956848145, "learning_rate": 4.2121788854016864e-05, "loss": 0.487, "step": 289200}, {"epoch": 13.002066672657023, "grad_norm": 12.702319145202637, "learning_rate": 4.211150139591766e-05, "loss": 0.4755, "step": 289400}, {"epoch": 13.011052205948422, "grad_norm": 12.583155632019043, "learning_rate": 4.2101208483512954e-05, "loss": 0.4325, "step": 289600}, {"epoch": 13.020037739239823, "grad_norm": 
1.6690092086791992, "learning_rate": 4.209091012008362e-05, "loss": 0.4279, "step": 289800}, {"epoch": 13.029023272531225, "grad_norm": 13.319869995117188, "learning_rate": 4.208060630891226e-05, "loss": 0.459, "step": 290000}, {"epoch": 13.029023272531225, "eval_loss": 3.850545883178711, "eval_runtime": 1164.1167, "eval_samples_per_second": 8.508, "eval_steps_per_second": 0.034, "step": 290000}, {"epoch": 13.038008805822626, "grad_norm": 11.082257270812988, "learning_rate": 4.207029705328324e-05, "loss": 0.4205, "step": 290200}, {"epoch": 13.046994339114027, "grad_norm": 3.647700309753418, "learning_rate": 4.2059982356482636e-05, "loss": 0.4541, "step": 290400}, {"epoch": 13.055979872405427, "grad_norm": 6.96566104888916, "learning_rate": 4.204966222179826e-05, "loss": 0.448, "step": 290600}, {"epoch": 13.064965405696828, "grad_norm": 4.0198235511779785, "learning_rate": 4.2039336652519665e-05, "loss": 0.4345, "step": 290800}, {"epoch": 13.07395093898823, "grad_norm": 5.543626308441162, "learning_rate": 4.2029005651938146e-05, "loss": 0.4483, "step": 291000}, {"epoch": 13.07395093898823, "eval_loss": 3.8965601921081543, "eval_runtime": 1165.1251, "eval_samples_per_second": 8.5, "eval_steps_per_second": 0.033, "step": 291000}, {"epoch": 13.08293647227963, "grad_norm": 13.703949928283691, "learning_rate": 4.201866922334672e-05, "loss": 0.4145, "step": 291200}, {"epoch": 13.09192200557103, "grad_norm": 28.786453247070312, "learning_rate": 4.20083273700401e-05, "loss": 0.4455, "step": 291400}, {"epoch": 13.100907538862431, "grad_norm": 9.806286811828613, "learning_rate": 4.199798009531481e-05, "loss": 0.4122, "step": 291600}, {"epoch": 13.109893072153833, "grad_norm": 6.537720203399658, "learning_rate": 4.198762740246901e-05, "loss": 0.4223, "step": 291800}, {"epoch": 13.118878605445234, "grad_norm": 8.785443305969238, "learning_rate": 4.1977269294802645e-05, "loss": 0.4664, "step": 292000}, {"epoch": 13.118878605445234, "eval_loss": 3.8596513271331787, "eval_runtime": 1165.6454, "eval_samples_per_second": 8.497, "eval_steps_per_second": 0.033, "step": 292000}, {"epoch": 13.127864138736633, "grad_norm": 6.35100793838501, "learning_rate": 4.196690577561738e-05, "loss": 0.4475, "step": 292200}, {"epoch": 13.136849672028035, "grad_norm": 6.956860065460205, "learning_rate": 4.195653684821658e-05, "loss": 0.4396, "step": 292400}, {"epoch": 13.145835205319436, "grad_norm": 5.264865875244141, "learning_rate": 4.1946162515905364e-05, "loss": 0.4265, "step": 292600}, {"epoch": 13.154820738610837, "grad_norm": 12.176240921020508, "learning_rate": 4.193578278199054e-05, "loss": 0.4379, "step": 292800}, {"epoch": 13.163806271902237, "grad_norm": 6.024650573730469, "learning_rate": 4.192539764978068e-05, "loss": 0.4243, "step": 293000}, {"epoch": 13.163806271902237, "eval_loss": 3.8728034496307373, "eval_runtime": 1170.4759, "eval_samples_per_second": 8.462, "eval_steps_per_second": 0.033, "step": 293000}, {"epoch": 13.172791805193638, "grad_norm": 1.1849206686019897, "learning_rate": 4.191500712258604e-05, "loss": 0.4381, "step": 293200}, {"epoch": 13.18177733848504, "grad_norm": 3.522000789642334, "learning_rate": 4.190461120371861e-05, "loss": 0.472, "step": 293400}, {"epoch": 13.19076287177644, "grad_norm": 2.328458309173584, "learning_rate": 4.1894209896492096e-05, "loss": 0.4262, "step": 293600}, {"epoch": 13.19974840506784, "grad_norm": 9.86052131652832, "learning_rate": 4.188380320422193e-05, "loss": 0.442, "step": 293800}, {"epoch": 13.208733938359241, "grad_norm": 4.702374458312988, 
"learning_rate": 4.187339113022525e-05, "loss": 0.3967, "step": 294000}, {"epoch": 13.208733938359241, "eval_loss": 3.881704568862915, "eval_runtime": 1178.457, "eval_samples_per_second": 8.404, "eval_steps_per_second": 0.033, "step": 294000}, {"epoch": 13.217719471650643, "grad_norm": 7.168625354766846, "learning_rate": 4.186297367782091e-05, "loss": 0.4736, "step": 294200}, {"epoch": 13.226705004942044, "grad_norm": 9.348653793334961, "learning_rate": 4.1852550850329494e-05, "loss": 0.4496, "step": 294400}, {"epoch": 13.235690538233444, "grad_norm": 6.130259990692139, "learning_rate": 4.184212265107328e-05, "loss": 0.4574, "step": 294600}, {"epoch": 13.244676071524845, "grad_norm": 8.369153022766113, "learning_rate": 4.1831689083376256e-05, "loss": 0.4083, "step": 294800}, {"epoch": 13.253661604816246, "grad_norm": 7.550708770751953, "learning_rate": 4.182125015056415e-05, "loss": 0.4462, "step": 295000}, {"epoch": 13.253661604816246, "eval_loss": 3.848435163497925, "eval_runtime": 1171.7179, "eval_samples_per_second": 8.453, "eval_steps_per_second": 0.033, "step": 295000}, {"epoch": 13.262647138107647, "grad_norm": 4.578621864318848, "learning_rate": 4.181080585596436e-05, "loss": 0.4379, "step": 295200}, {"epoch": 13.271632671399047, "grad_norm": 5.007719039916992, "learning_rate": 4.1800356202906024e-05, "loss": 0.4498, "step": 295400}, {"epoch": 13.280618204690448, "grad_norm": 20.014347076416016, "learning_rate": 4.178990119471998e-05, "loss": 0.454, "step": 295600}, {"epoch": 13.28960373798185, "grad_norm": 7.8681254386901855, "learning_rate": 4.1779440834738757e-05, "loss": 0.451, "step": 295800}, {"epoch": 13.29858927127325, "grad_norm": 6.996041774749756, "learning_rate": 4.176897512629663e-05, "loss": 0.4109, "step": 296000}, {"epoch": 13.29858927127325, "eval_loss": 3.9298160076141357, "eval_runtime": 1180.5598, "eval_samples_per_second": 8.389, "eval_steps_per_second": 0.033, "step": 296000}, {"epoch": 13.30757480456465, "grad_norm": 3.667933464050293, "learning_rate": 4.175850407272953e-05, "loss": 0.417, "step": 296200}, {"epoch": 13.316560337856052, "grad_norm": 4.346782684326172, "learning_rate": 4.1748027677375116e-05, "loss": 0.4439, "step": 296400}, {"epoch": 13.325545871147453, "grad_norm": 7.255468368530273, "learning_rate": 4.1737545943572756e-05, "loss": 0.4517, "step": 296600}, {"epoch": 13.334531404438854, "grad_norm": 1.1761934757232666, "learning_rate": 4.172705887466351e-05, "loss": 0.4611, "step": 296800}, {"epoch": 13.343516937730254, "grad_norm": 2.3793375492095947, "learning_rate": 4.171656647399014e-05, "loss": 0.4535, "step": 297000}, {"epoch": 13.343516937730254, "eval_loss": 3.8182103633880615, "eval_runtime": 1137.4266, "eval_samples_per_second": 8.707, "eval_steps_per_second": 0.034, "step": 297000}, {"epoch": 13.352502471021655, "grad_norm": 8.53345775604248, "learning_rate": 4.17060687448971e-05, "loss": 0.416, "step": 297200}, {"epoch": 13.361488004313056, "grad_norm": 4.831078052520752, "learning_rate": 4.169556569073056e-05, "loss": 0.4341, "step": 297400}, {"epoch": 13.370473537604457, "grad_norm": 9.299762725830078, "learning_rate": 4.168505731483837e-05, "loss": 0.3995, "step": 297600}, {"epoch": 13.379459070895857, "grad_norm": 11.03166389465332, "learning_rate": 4.167454362057008e-05, "loss": 0.4338, "step": 297800}, {"epoch": 13.388444604187258, "grad_norm": 6.606450080871582, "learning_rate": 4.166402461127696e-05, "loss": 0.4563, "step": 298000}, {"epoch": 13.388444604187258, "eval_loss": 3.860046863555908, "eval_runtime": 1114.1874, 
"eval_samples_per_second": 8.889, "eval_steps_per_second": 0.035, "step": 298000}, {"epoch": 13.39743013747866, "grad_norm": 9.79546070098877, "learning_rate": 4.1653500290311934e-05, "loss": 0.4505, "step": 298200}, {"epoch": 13.40641567077006, "grad_norm": 5.0448832511901855, "learning_rate": 4.1642970661029634e-05, "loss": 0.4342, "step": 298400}, {"epoch": 13.41540120406146, "grad_norm": 15.43664836883545, "learning_rate": 4.163243572678641e-05, "loss": 0.4311, "step": 298600}, {"epoch": 13.424386737352862, "grad_norm": 5.8657612800598145, "learning_rate": 4.162189549094026e-05, "loss": 0.4572, "step": 298800}, {"epoch": 13.433372270644263, "grad_norm": 8.958415031433105, "learning_rate": 4.161134995685091e-05, "loss": 0.4754, "step": 299000}, {"epoch": 13.433372270644263, "eval_loss": 3.8714182376861572, "eval_runtime": 1117.5357, "eval_samples_per_second": 8.862, "eval_steps_per_second": 0.035, "step": 299000}, {"epoch": 13.442357803935664, "grad_norm": 12.89301586151123, "learning_rate": 4.160079912787974e-05, "loss": 0.4224, "step": 299200}, {"epoch": 13.451343337227064, "grad_norm": 30.66848373413086, "learning_rate": 4.1590243007389845e-05, "loss": 0.4751, "step": 299400}, {"epoch": 13.460328870518465, "grad_norm": 9.195915222167969, "learning_rate": 4.1579681598746e-05, "loss": 0.4678, "step": 299600}, {"epoch": 13.469314403809866, "grad_norm": 9.206331253051758, "learning_rate": 4.156911490531466e-05, "loss": 0.4399, "step": 299800}, {"epoch": 13.478299937101268, "grad_norm": 4.251493453979492, "learning_rate": 4.1558542930463965e-05, "loss": 0.4103, "step": 300000}, {"epoch": 13.478299937101268, "eval_loss": 3.946397542953491, "eval_runtime": 1115.2299, "eval_samples_per_second": 8.881, "eval_steps_per_second": 0.035, "step": 300000}, {"epoch": 13.487285470392667, "grad_norm": 12.777297973632812, "learning_rate": 4.154796567756375e-05, "loss": 0.5246, "step": 300200}, {"epoch": 13.496271003684068, "grad_norm": 2.6797468662261963, "learning_rate": 4.1537383149985506e-05, "loss": 0.4457, "step": 300400}, {"epoch": 13.50525653697547, "grad_norm": 5.52931547164917, "learning_rate": 4.1526795351102444e-05, "loss": 0.4505, "step": 300600}, {"epoch": 13.51424207026687, "grad_norm": 12.613361358642578, "learning_rate": 4.151620228428942e-05, "loss": 0.4745, "step": 300800}, {"epoch": 13.52322760355827, "grad_norm": 7.806926727294922, "learning_rate": 4.150560395292298e-05, "loss": 0.4347, "step": 301000}, {"epoch": 13.52322760355827, "eval_loss": 3.85687255859375, "eval_runtime": 1114.6959, "eval_samples_per_second": 8.885, "eval_steps_per_second": 0.035, "step": 301000}, {"epoch": 13.532213136849672, "grad_norm": 4.979412078857422, "learning_rate": 4.1495000360381363e-05, "loss": 0.4813, "step": 301200}, {"epoch": 13.541198670141073, "grad_norm": 13.663886070251465, "learning_rate": 4.1484391510044475e-05, "loss": 0.4744, "step": 301400}, {"epoch": 13.550184203432474, "grad_norm": 6.1580681800842285, "learning_rate": 4.147377740529388e-05, "loss": 0.4415, "step": 301600}, {"epoch": 13.559169736723874, "grad_norm": 13.568781852722168, "learning_rate": 4.146315804951284e-05, "loss": 0.4407, "step": 301800}, {"epoch": 13.568155270015275, "grad_norm": 1.211671233177185, "learning_rate": 4.145253344608628e-05, "loss": 0.4566, "step": 302000}, {"epoch": 13.568155270015275, "eval_loss": 3.837907552719116, "eval_runtime": 1113.6432, "eval_samples_per_second": 8.893, "eval_steps_per_second": 0.035, "step": 302000}, {"epoch": 13.577140803306676, "grad_norm": 1.426780343055725, "learning_rate": 
4.1441903598400814e-05, "loss": 0.4497, "step": 302200}, {"epoch": 13.586126336598078, "grad_norm": 7.560256004333496, "learning_rate": 4.1431268509844706e-05, "loss": 0.4683, "step": 302400}, {"epoch": 13.595111869889479, "grad_norm": 20.501848220825195, "learning_rate": 4.1420628183807896e-05, "loss": 0.4646, "step": 302600}, {"epoch": 13.604097403180878, "grad_norm": 3.325043201446533, "learning_rate": 4.140998262368201e-05, "loss": 0.443, "step": 302800}, {"epoch": 13.61308293647228, "grad_norm": 2.9573566913604736, "learning_rate": 4.139933183286031e-05, "loss": 0.4471, "step": 303000}, {"epoch": 13.61308293647228, "eval_loss": 3.8605709075927734, "eval_runtime": 1118.1313, "eval_samples_per_second": 8.858, "eval_steps_per_second": 0.035, "step": 303000}, {"epoch": 13.622068469763681, "grad_norm": 4.5685319900512695, "learning_rate": 4.138867581473776e-05, "loss": 0.4583, "step": 303200}, {"epoch": 13.63105400305508, "grad_norm": 0.45331665873527527, "learning_rate": 4.1378014572710974e-05, "loss": 0.4281, "step": 303400}, {"epoch": 13.640039536346482, "grad_norm": 8.040594100952148, "learning_rate": 4.136734811017822e-05, "loss": 0.4353, "step": 303600}, {"epoch": 13.649025069637883, "grad_norm": 7.731649398803711, "learning_rate": 4.135667643053945e-05, "loss": 0.4867, "step": 303800}, {"epoch": 13.658010602929284, "grad_norm": 13.919236183166504, "learning_rate": 4.1345999537196275e-05, "loss": 0.4752, "step": 304000}, {"epoch": 13.658010602929284, "eval_loss": 3.850292444229126, "eval_runtime": 1113.3609, "eval_samples_per_second": 8.896, "eval_steps_per_second": 0.035, "step": 304000}, {"epoch": 13.666996136220686, "grad_norm": 7.589078426361084, "learning_rate": 4.1335317433551954e-05, "loss": 0.4251, "step": 304200}, {"epoch": 13.675981669512085, "grad_norm": 10.349044799804688, "learning_rate": 4.132463012301143e-05, "loss": 0.4303, "step": 304400}, {"epoch": 13.684967202803486, "grad_norm": 1.0288686752319336, "learning_rate": 4.131393760898128e-05, "loss": 0.4318, "step": 304600}, {"epoch": 13.693952736094888, "grad_norm": 13.238295555114746, "learning_rate": 4.130323989486976e-05, "loss": 0.4539, "step": 304800}, {"epoch": 13.702938269386289, "grad_norm": 17.6412410736084, "learning_rate": 4.1292536984086764e-05, "loss": 0.4484, "step": 305000}, {"epoch": 13.702938269386289, "eval_loss": 3.859189033508301, "eval_runtime": 1112.8183, "eval_samples_per_second": 8.9, "eval_steps_per_second": 0.035, "step": 305000}, {"epoch": 13.711923802677688, "grad_norm": 2.382539749145508, "learning_rate": 4.128182888004387e-05, "loss": 0.4026, "step": 305200}, {"epoch": 13.72090933596909, "grad_norm": 7.253118515014648, "learning_rate": 4.127111558615427e-05, "loss": 0.4531, "step": 305400}, {"epoch": 13.729894869260491, "grad_norm": 8.220928192138672, "learning_rate": 4.126039710583287e-05, "loss": 0.4339, "step": 305600}, {"epoch": 13.738880402551892, "grad_norm": 4.559962749481201, "learning_rate": 4.124967344249617e-05, "loss": 0.4274, "step": 305800}, {"epoch": 13.747865935843292, "grad_norm": 25.09603500366211, "learning_rate": 4.1238944599562354e-05, "loss": 0.451, "step": 306000}, {"epoch": 13.747865935843292, "eval_loss": 3.9123668670654297, "eval_runtime": 1113.8568, "eval_samples_per_second": 8.892, "eval_steps_per_second": 0.035, "step": 306000}, {"epoch": 13.756851469134693, "grad_norm": 7.623703479766846, "learning_rate": 4.122821058045125e-05, "loss": 0.4204, "step": 306200}, {"epoch": 13.765837002426094, "grad_norm": 16.578161239624023, "learning_rate": 
4.121747138858433e-05, "loss": 0.4556, "step": 306400}, {"epoch": 13.774822535717496, "grad_norm": 39.884002685546875, "learning_rate": 4.120672702738473e-05, "loss": 0.4342, "step": 306600}, {"epoch": 13.783808069008895, "grad_norm": 6.272052764892578, "learning_rate": 4.1195977500277215e-05, "loss": 0.4377, "step": 306800}, {"epoch": 13.792793602300296, "grad_norm": 4.232491970062256, "learning_rate": 4.1185222810688214e-05, "loss": 0.4948, "step": 307000}, {"epoch": 13.792793602300296, "eval_loss": 3.866061210632324, "eval_runtime": 1113.1102, "eval_samples_per_second": 8.898, "eval_steps_per_second": 0.035, "step": 307000}, {"epoch": 13.801779135591698, "grad_norm": 7.848074913024902, "learning_rate": 4.1174462962045784e-05, "loss": 0.4657, "step": 307200}, {"epoch": 13.810764668883099, "grad_norm": 11.766325950622559, "learning_rate": 4.1163697957779644e-05, "loss": 0.4369, "step": 307400}, {"epoch": 13.819750202174498, "grad_norm": 4.907791614532471, "learning_rate": 4.115292780132115e-05, "loss": 0.4427, "step": 307600}, {"epoch": 13.8287357354659, "grad_norm": 2.2997195720672607, "learning_rate": 4.114215249610329e-05, "loss": 0.4261, "step": 307800}, {"epoch": 13.837721268757301, "grad_norm": 4.029343605041504, "learning_rate": 4.1131372045560704e-05, "loss": 0.4393, "step": 308000}, {"epoch": 13.837721268757301, "eval_loss": 3.869534969329834, "eval_runtime": 1145.7345, "eval_samples_per_second": 8.644, "eval_steps_per_second": 0.034, "step": 308000}, {"epoch": 13.846706802048702, "grad_norm": 3.6049351692199707, "learning_rate": 4.112058645312967e-05, "loss": 0.4413, "step": 308200}, {"epoch": 13.855692335340102, "grad_norm": 0.6825031638145447, "learning_rate": 4.110979572224811e-05, "loss": 0.4046, "step": 308400}, {"epoch": 13.864677868631503, "grad_norm": 11.253166198730469, "learning_rate": 4.109899985635558e-05, "loss": 0.4877, "step": 308600}, {"epoch": 13.873663401922904, "grad_norm": 3.120997428894043, "learning_rate": 4.108819885889326e-05, "loss": 0.4409, "step": 308800}, {"epoch": 13.882648935214306, "grad_norm": 18.108745574951172, "learning_rate": 4.107739273330398e-05, "loss": 0.4455, "step": 309000}, {"epoch": 13.882648935214306, "eval_loss": 3.858532667160034, "eval_runtime": 1133.8734, "eval_samples_per_second": 8.735, "eval_steps_per_second": 0.034, "step": 309000}, {"epoch": 13.891634468505705, "grad_norm": 4.392665863037109, "learning_rate": 4.1066581483032206e-05, "loss": 0.4946, "step": 309200}, {"epoch": 13.900620001797106, "grad_norm": 0.8881078958511353, "learning_rate": 4.1055765111524036e-05, "loss": 0.4265, "step": 309400}, {"epoch": 13.909605535088508, "grad_norm": 1.4993141889572144, "learning_rate": 4.104494362222719e-05, "loss": 0.4309, "step": 309600}, {"epoch": 13.918591068379909, "grad_norm": 5.614892959594727, "learning_rate": 4.103411701859103e-05, "loss": 0.4848, "step": 309800}, {"epoch": 13.927576601671309, "grad_norm": 6.294254779815674, "learning_rate": 4.102328530406655e-05, "loss": 0.4334, "step": 310000}, {"epoch": 13.927576601671309, "eval_loss": 3.8455817699432373, "eval_runtime": 1137.7256, "eval_samples_per_second": 8.705, "eval_steps_per_second": 0.034, "step": 310000}, {"epoch": 13.93656213496271, "grad_norm": 2.6192963123321533, "learning_rate": 4.101244848210636e-05, "loss": 0.4564, "step": 310200}, {"epoch": 13.945547668254111, "grad_norm": 17.42061424255371, "learning_rate": 4.100160655616471e-05, "loss": 0.4186, "step": 310400}, {"epoch": 13.954533201545512, "grad_norm": 13.576807022094727, "learning_rate": 
4.099075952969747e-05, "loss": 0.4534, "step": 310600}, {"epoch": 13.963518734836912, "grad_norm": 7.059383392333984, "learning_rate": 4.097990740616214e-05, "loss": 0.4483, "step": 310800}, {"epoch": 13.972504268128313, "grad_norm": 6.2722978591918945, "learning_rate": 4.096905018901785e-05, "loss": 0.448, "step": 311000}, {"epoch": 13.972504268128313, "eval_loss": 3.86065673828125, "eval_runtime": 1127.0444, "eval_samples_per_second": 8.788, "eval_steps_per_second": 0.035, "step": 311000}, {"epoch": 13.981489801419714, "grad_norm": 0.11190976202487946, "learning_rate": 4.095818788172534e-05, "loss": 0.4484, "step": 311200}, {"epoch": 13.990475334711116, "grad_norm": 11.270726203918457, "learning_rate": 4.094732048774698e-05, "loss": 0.4496, "step": 311400}, {"epoch": 13.999460868002515, "grad_norm": 25.78597640991211, "learning_rate": 4.093644801054676e-05, "loss": 0.4627, "step": 311600}, {"epoch": 14.008446401293916, "grad_norm": 7.157655239105225, "learning_rate": 4.09255704535903e-05, "loss": 0.4073, "step": 311800}, {"epoch": 14.017431934585318, "grad_norm": 6.422256946563721, "learning_rate": 4.0914687820344824e-05, "loss": 0.3854, "step": 312000}, {"epoch": 14.017431934585318, "eval_loss": 3.9006946086883545, "eval_runtime": 1133.3391, "eval_samples_per_second": 8.739, "eval_steps_per_second": 0.034, "step": 312000}, {"epoch": 14.026417467876719, "grad_norm": 2.7464749813079834, "learning_rate": 4.090380011427918e-05, "loss": 0.435, "step": 312200}, {"epoch": 14.035403001168119, "grad_norm": 9.64920425415039, "learning_rate": 4.0892907338863833e-05, "loss": 0.4341, "step": 312400}, {"epoch": 14.04438853445952, "grad_norm": 28.953222274780273, "learning_rate": 4.088200949757087e-05, "loss": 0.4119, "step": 312600}, {"epoch": 14.053374067750921, "grad_norm": 11.050024032592773, "learning_rate": 4.0871106593873975e-05, "loss": 0.4425, "step": 312800}, {"epoch": 14.062359601042322, "grad_norm": 7.281927585601807, "learning_rate": 4.086019863124847e-05, "loss": 0.4323, "step": 313000}, {"epoch": 14.062359601042322, "eval_loss": 3.8579936027526855, "eval_runtime": 1129.0178, "eval_samples_per_second": 8.772, "eval_steps_per_second": 0.035, "step": 313000}, {"epoch": 14.071345134333722, "grad_norm": 9.319841384887695, "learning_rate": 4.084928561317127e-05, "loss": 0.4312, "step": 313200}, {"epoch": 14.080330667625123, "grad_norm": 4.579616069793701, "learning_rate": 4.0838367543120916e-05, "loss": 0.4136, "step": 313400}, {"epoch": 14.089316200916524, "grad_norm": 10.863465309143066, "learning_rate": 4.0827444424577543e-05, "loss": 0.4331, "step": 313600}, {"epoch": 14.098301734207926, "grad_norm": 6.145780086517334, "learning_rate": 4.0816516261022915e-05, "loss": 0.425, "step": 313800}, {"epoch": 14.107287267499325, "grad_norm": 6.644456386566162, "learning_rate": 4.080558305594039e-05, "loss": 0.4153, "step": 314000}, {"epoch": 14.107287267499325, "eval_loss": 3.8607418537139893, "eval_runtime": 1121.8494, "eval_samples_per_second": 8.828, "eval_steps_per_second": 0.035, "step": 314000}, {"epoch": 14.116272800790727, "grad_norm": 20.19847297668457, "learning_rate": 4.079464481281493e-05, "loss": 0.3909, "step": 314200}, {"epoch": 14.125258334082128, "grad_norm": 11.029516220092773, "learning_rate": 4.07837015351331e-05, "loss": 0.4105, "step": 314400}, {"epoch": 14.13424386737353, "grad_norm": 9.190872192382812, "learning_rate": 4.077275322638311e-05, "loss": 0.4244, "step": 314600}, {"epoch": 14.143229400664929, "grad_norm": 15.798444747924805, "learning_rate": 
4.076179989005471e-05, "loss": 0.4464, "step": 314800}, {"epoch": 14.15221493395633, "grad_norm": 7.170180797576904, "learning_rate": 4.07508415296393e-05, "loss": 0.4383, "step": 315000}, {"epoch": 14.15221493395633, "eval_loss": 3.8738784790039062, "eval_runtime": 1126.1206, "eval_samples_per_second": 8.795, "eval_steps_per_second": 0.035, "step": 315000}, {"epoch": 14.161200467247731, "grad_norm": 3.4297237396240234, "learning_rate": 4.073987814862988e-05, "loss": 0.4147, "step": 315200}, {"epoch": 14.170186000539132, "grad_norm": 17.3597469329834, "learning_rate": 4.072890975052102e-05, "loss": 0.4264, "step": 315400}, {"epoch": 14.179171533830532, "grad_norm": 3.725116014480591, "learning_rate": 4.071793633880891e-05, "loss": 0.3873, "step": 315600}, {"epoch": 14.188157067121933, "grad_norm": 8.087611198425293, "learning_rate": 4.070695791699132e-05, "loss": 0.4188, "step": 315800}, {"epoch": 14.197142600413335, "grad_norm": 2.207904577255249, "learning_rate": 4.069597448856765e-05, "loss": 0.4476, "step": 316000}, {"epoch": 14.197142600413335, "eval_loss": 3.8536148071289062, "eval_runtime": 1123.8487, "eval_samples_per_second": 8.813, "eval_steps_per_second": 0.035, "step": 316000}, {"epoch": 14.206128133704736, "grad_norm": 4.730515956878662, "learning_rate": 4.0684986057038876e-05, "loss": 0.4299, "step": 316200}, {"epoch": 14.215113666996135, "grad_norm": 17.80805015563965, "learning_rate": 4.067399262590757e-05, "loss": 0.452, "step": 316400}, {"epoch": 14.224099200287537, "grad_norm": 5.914919853210449, "learning_rate": 4.0662994198677883e-05, "loss": 0.4265, "step": 316600}, {"epoch": 14.233084733578938, "grad_norm": 7.017390251159668, "learning_rate": 4.065199077885559e-05, "loss": 0.4424, "step": 316800}, {"epoch": 14.24207026687034, "grad_norm": 2.4039924144744873, "learning_rate": 4.064098236994803e-05, "loss": 0.3815, "step": 317000}, {"epoch": 14.24207026687034, "eval_loss": 3.8721015453338623, "eval_runtime": 1123.2832, "eval_samples_per_second": 8.817, "eval_steps_per_second": 0.035, "step": 317000}, {"epoch": 14.25105580016174, "grad_norm": 25.048295974731445, "learning_rate": 4.062996897546415e-05, "loss": 0.4516, "step": 317200}, {"epoch": 14.26004133345314, "grad_norm": 10.468742370605469, "learning_rate": 4.0618950598914475e-05, "loss": 0.3964, "step": 317400}, {"epoch": 14.269026866744541, "grad_norm": 5.206949710845947, "learning_rate": 4.060792724381112e-05, "loss": 0.405, "step": 317600}, {"epoch": 14.278012400035943, "grad_norm": 6.171004772186279, "learning_rate": 4.0596898913667795e-05, "loss": 0.4015, "step": 317800}, {"epoch": 14.286997933327344, "grad_norm": 7.8683905601501465, "learning_rate": 4.0585865611999775e-05, "loss": 0.4184, "step": 318000}, {"epoch": 14.286997933327344, "eval_loss": 3.863692045211792, "eval_runtime": 1121.258, "eval_samples_per_second": 8.833, "eval_steps_per_second": 0.035, "step": 318000}, {"epoch": 14.295983466618743, "grad_norm": 17.344314575195312, "learning_rate": 4.0574827342323945e-05, "loss": 0.4423, "step": 318200}, {"epoch": 14.304968999910145, "grad_norm": 7.545623302459717, "learning_rate": 4.056378410815877e-05, "loss": 0.4582, "step": 318400}, {"epoch": 14.313954533201546, "grad_norm": 4.13499641418457, "learning_rate": 4.055273591302427e-05, "loss": 0.4233, "step": 318600}, {"epoch": 14.322940066492947, "grad_norm": 1.984163761138916, "learning_rate": 4.054168276044209e-05, "loss": 0.4549, "step": 318800}, {"epoch": 14.331925599784347, "grad_norm": 8.898198127746582, "learning_rate": 4.053062465393542e-05, 
"loss": 0.4277, "step": 319000}, {"epoch": 14.331925599784347, "eval_loss": 3.831319808959961, "eval_runtime": 1136.9161, "eval_samples_per_second": 8.711, "eval_steps_per_second": 0.034, "step": 319000}, {"epoch": 14.340911133075748, "grad_norm": 4.621338367462158, "learning_rate": 4.0519561597029036e-05, "loss": 0.4108, "step": 319200}, {"epoch": 14.34989666636715, "grad_norm": 6.966736793518066, "learning_rate": 4.050849359324931e-05, "loss": 0.4347, "step": 319400}, {"epoch": 14.35888219965855, "grad_norm": 2.585519313812256, "learning_rate": 4.0497420646124157e-05, "loss": 0.4252, "step": 319600}, {"epoch": 14.36786773294995, "grad_norm": 10.04625415802002, "learning_rate": 4.0486342759183115e-05, "loss": 0.4074, "step": 319800}, {"epoch": 14.376853266241351, "grad_norm": 6.281806945800781, "learning_rate": 4.047525993595724e-05, "loss": 0.4581, "step": 320000}, {"epoch": 14.376853266241351, "eval_loss": 3.7998464107513428, "eval_runtime": 1123.5798, "eval_samples_per_second": 8.815, "eval_steps_per_second": 0.035, "step": 320000}, {"epoch": 14.385838799532753, "grad_norm": 16.557212829589844, "learning_rate": 4.046417217997922e-05, "loss": 0.4741, "step": 320200}, {"epoch": 14.394824332824154, "grad_norm": 7.429055213928223, "learning_rate": 4.045307949478326e-05, "loss": 0.4885, "step": 320400}, {"epoch": 14.403809866115553, "grad_norm": 13.883950233459473, "learning_rate": 4.044198188390519e-05, "loss": 0.3895, "step": 320600}, {"epoch": 14.412795399406955, "grad_norm": 7.166148662567139, "learning_rate": 4.0430879350882364e-05, "loss": 0.4325, "step": 320800}, {"epoch": 14.421780932698356, "grad_norm": 24.932443618774414, "learning_rate": 4.0419771899253724e-05, "loss": 0.4677, "step": 321000}, {"epoch": 14.421780932698356, "eval_loss": 3.8351047039031982, "eval_runtime": 1104.1188, "eval_samples_per_second": 8.97, "eval_steps_per_second": 0.035, "step": 321000}, {"epoch": 14.430766465989757, "grad_norm": 1.9560954570770264, "learning_rate": 4.040865953255979e-05, "loss": 0.421, "step": 321200}, {"epoch": 14.439751999281157, "grad_norm": 14.022553443908691, "learning_rate": 4.0397542254342624e-05, "loss": 0.447, "step": 321400}, {"epoch": 14.448737532572558, "grad_norm": 7.733597755432129, "learning_rate": 4.0386420068145886e-05, "loss": 0.4134, "step": 321600}, {"epoch": 14.45772306586396, "grad_norm": 9.011775016784668, "learning_rate": 4.0375292977514765e-05, "loss": 0.4656, "step": 321800}, {"epoch": 14.46670859915536, "grad_norm": 3.5252091884613037, "learning_rate": 4.036416098599605e-05, "loss": 0.4171, "step": 322000}, {"epoch": 14.46670859915536, "eval_loss": 3.8441038131713867, "eval_runtime": 1104.159, "eval_samples_per_second": 8.97, "eval_steps_per_second": 0.035, "step": 322000}, {"epoch": 14.47569413244676, "grad_norm": 1.1404999494552612, "learning_rate": 4.035302409713805e-05, "loss": 0.3627, "step": 322200}, {"epoch": 14.484679665738161, "grad_norm": 5.832608699798584, "learning_rate": 4.034188231449067e-05, "loss": 0.4487, "step": 322400}, {"epoch": 14.493665199029563, "grad_norm": 8.705142974853516, "learning_rate": 4.033073564160535e-05, "loss": 0.4353, "step": 322600}, {"epoch": 14.502650732320964, "grad_norm": 14.9191312789917, "learning_rate": 4.0319584082035136e-05, "loss": 0.4538, "step": 322800}, {"epoch": 14.511636265612363, "grad_norm": 6.388049602508545, "learning_rate": 4.030842763933456e-05, "loss": 0.4367, "step": 323000}, {"epoch": 14.511636265612363, "eval_loss": 3.840134382247925, "eval_runtime": 1105.0884, "eval_samples_per_second": 8.962, 
"eval_steps_per_second": 0.035, "step": 323000}, {"epoch": 14.520621798903765, "grad_norm": 5.0418524742126465, "learning_rate": 4.0297266317059765e-05, "loss": 0.4324, "step": 323200}, {"epoch": 14.529607332195166, "grad_norm": 9.340652465820312, "learning_rate": 4.0286100118768426e-05, "loss": 0.427, "step": 323400}, {"epoch": 14.538592865486567, "grad_norm": 25.69853973388672, "learning_rate": 4.027492904801978e-05, "loss": 0.4492, "step": 323600}, {"epoch": 14.547578398777967, "grad_norm": 1.1400892734527588, "learning_rate": 4.026375310837461e-05, "loss": 0.4793, "step": 323800}, {"epoch": 14.556563932069368, "grad_norm": 4.694724082946777, "learning_rate": 4.025257230339527e-05, "loss": 0.4572, "step": 324000}, {"epoch": 14.556563932069368, "eval_loss": 3.8130171298980713, "eval_runtime": 1105.0408, "eval_samples_per_second": 8.963, "eval_steps_per_second": 0.035, "step": 324000}, {"epoch": 14.56554946536077, "grad_norm": 8.171147346496582, "learning_rate": 4.024138663664564e-05, "loss": 0.4274, "step": 324200}, {"epoch": 14.57453499865217, "grad_norm": 6.94440221786499, "learning_rate": 4.023019611169116e-05, "loss": 0.4361, "step": 324400}, {"epoch": 14.58352053194357, "grad_norm": 5.78433084487915, "learning_rate": 4.021900073209882e-05, "loss": 0.431, "step": 324600}, {"epoch": 14.592506065234971, "grad_norm": 10.060790061950684, "learning_rate": 4.020780050143717e-05, "loss": 0.4193, "step": 324800}, {"epoch": 14.601491598526373, "grad_norm": 2.9336678981781006, "learning_rate": 4.0196595423276276e-05, "loss": 0.4811, "step": 325000}, {"epoch": 14.601491598526373, "eval_loss": 3.8441808223724365, "eval_runtime": 1105.4679, "eval_samples_per_second": 8.959, "eval_steps_per_second": 0.035, "step": 325000}, {"epoch": 14.610477131817774, "grad_norm": 11.331477165222168, "learning_rate": 4.018538550118777e-05, "loss": 0.4118, "step": 325200}, {"epoch": 14.619462665109173, "grad_norm": 4.01665735244751, "learning_rate": 4.017417073874482e-05, "loss": 0.43, "step": 325400}, {"epoch": 14.628448198400575, "grad_norm": 3.0681374073028564, "learning_rate": 4.016295113952216e-05, "loss": 0.411, "step": 325600}, {"epoch": 14.637433731691976, "grad_norm": 0.3734178841114044, "learning_rate": 4.015172670709603e-05, "loss": 0.4073, "step": 325800}, {"epoch": 14.646419264983377, "grad_norm": 14.095786094665527, "learning_rate": 4.0140497445044234e-05, "loss": 0.4476, "step": 326000}, {"epoch": 14.646419264983377, "eval_loss": 3.848971366882324, "eval_runtime": 1104.6646, "eval_samples_per_second": 8.966, "eval_steps_per_second": 0.035, "step": 326000}, {"epoch": 14.655404798274777, "grad_norm": 19.044757843017578, "learning_rate": 4.01292633569461e-05, "loss": 0.4564, "step": 326200}, {"epoch": 14.664390331566178, "grad_norm": 6.487691402435303, "learning_rate": 4.011802444638251e-05, "loss": 0.4744, "step": 326400}, {"epoch": 14.67337586485758, "grad_norm": 5.221654891967773, "learning_rate": 4.0106780716935875e-05, "loss": 0.4423, "step": 326600}, {"epoch": 14.68236139814898, "grad_norm": 17.094696044921875, "learning_rate": 4.009553217219015e-05, "loss": 0.4425, "step": 326800}, {"epoch": 14.69134693144038, "grad_norm": 3.616652488708496, "learning_rate": 4.008427881573081e-05, "loss": 0.5084, "step": 327000}, {"epoch": 14.69134693144038, "eval_loss": 3.8496687412261963, "eval_runtime": 1107.6478, "eval_samples_per_second": 8.941, "eval_steps_per_second": 0.035, "step": 327000}, {"epoch": 14.700332464731781, "grad_norm": 5.430749893188477, "learning_rate": 4.0073020651144864e-05, "loss": 
0.4159, "step": 327200}, {"epoch": 14.709317998023183, "grad_norm": 5.325740814208984, "learning_rate": 4.0061757682020886e-05, "loss": 0.4361, "step": 327400}, {"epoch": 14.718303531314584, "grad_norm": 10.217351913452148, "learning_rate": 4.005048991194893e-05, "loss": 0.4284, "step": 327600}, {"epoch": 14.727289064605984, "grad_norm": 18.080963134765625, "learning_rate": 4.003921734452063e-05, "loss": 0.4282, "step": 327800}, {"epoch": 14.736274597897385, "grad_norm": 14.644773483276367, "learning_rate": 4.00279399833291e-05, "loss": 0.4241, "step": 328000}, {"epoch": 14.736274597897385, "eval_loss": 3.9514822959899902, "eval_runtime": 1105.1163, "eval_samples_per_second": 8.962, "eval_steps_per_second": 0.035, "step": 328000}, {"epoch": 14.745260131188786, "grad_norm": 6.811315536499023, "learning_rate": 4.001665783196904e-05, "loss": 0.4371, "step": 328200}, {"epoch": 14.754245664480187, "grad_norm": 2.8421096801757812, "learning_rate": 4.000537089403662e-05, "loss": 0.386, "step": 328400}, {"epoch": 14.763231197771589, "grad_norm": 9.394848823547363, "learning_rate": 3.999407917312957e-05, "loss": 0.4609, "step": 328600}, {"epoch": 14.772216731062988, "grad_norm": 4.573288440704346, "learning_rate": 3.998278267284714e-05, "loss": 0.4733, "step": 328800}, {"epoch": 14.78120226435439, "grad_norm": 7.103633880615234, "learning_rate": 3.997148139679009e-05, "loss": 0.4596, "step": 329000}, {"epoch": 14.78120226435439, "eval_loss": 3.844900131225586, "eval_runtime": 1104.3562, "eval_samples_per_second": 8.968, "eval_steps_per_second": 0.035, "step": 329000}, {"epoch": 14.79018779764579, "grad_norm": 21.354633331298828, "learning_rate": 3.996017534856072e-05, "loss": 0.4149, "step": 329200}, {"epoch": 14.79917333093719, "grad_norm": 3.860731363296509, "learning_rate": 3.9948864531762833e-05, "loss": 0.43, "step": 329400}, {"epoch": 14.808158864228592, "grad_norm": 9.424334526062012, "learning_rate": 3.9937548950001775e-05, "loss": 0.4443, "step": 329600}, {"epoch": 14.817144397519993, "grad_norm": 4.933842658996582, "learning_rate": 3.992622860688439e-05, "loss": 0.4222, "step": 329800}, {"epoch": 14.826129930811394, "grad_norm": 5.060630798339844, "learning_rate": 3.9914903506019036e-05, "loss": 0.4871, "step": 330000}, {"epoch": 14.826129930811394, "eval_loss": 3.873565673828125, "eval_runtime": 1110.331, "eval_samples_per_second": 8.92, "eval_steps_per_second": 0.035, "step": 330000}, {"epoch": 14.835115464102795, "grad_norm": 14.746922492980957, "learning_rate": 3.990357365101561e-05, "loss": 0.4373, "step": 330200}, {"epoch": 14.844100997394195, "grad_norm": 15.675421714782715, "learning_rate": 3.989223904548551e-05, "loss": 0.4631, "step": 330400}, {"epoch": 14.853086530685596, "grad_norm": 9.67367935180664, "learning_rate": 3.988089969304166e-05, "loss": 0.4458, "step": 330600}, {"epoch": 14.862072063976997, "grad_norm": 3.0517771244049072, "learning_rate": 3.986955559729848e-05, "loss": 0.4513, "step": 330800}, {"epoch": 14.871057597268399, "grad_norm": 1.9877949953079224, "learning_rate": 3.985820676187191e-05, "loss": 0.4313, "step": 331000}, {"epoch": 14.871057597268399, "eval_loss": 3.8447208404541016, "eval_runtime": 1163.0107, "eval_samples_per_second": 8.516, "eval_steps_per_second": 0.034, "step": 331000}, {"epoch": 14.880043130559798, "grad_norm": 7.18410587310791, "learning_rate": 3.9846853190379394e-05, "loss": 0.4369, "step": 331200}, {"epoch": 14.8890286638512, "grad_norm": 10.671833992004395, "learning_rate": 3.9835494886439914e-05, "loss": 0.3974, "step": 331400}, 
{"epoch": 14.8980141971426, "grad_norm": 4.593978404998779, "learning_rate": 3.9824131853673904e-05, "loss": 0.4512, "step": 331600}, {"epoch": 14.906999730434002, "grad_norm": 9.309211730957031, "learning_rate": 3.981276409570338e-05, "loss": 0.4041, "step": 331800}, {"epoch": 14.915985263725402, "grad_norm": 5.8800435066223145, "learning_rate": 3.980139161615179e-05, "loss": 0.4698, "step": 332000}, {"epoch": 14.915985263725402, "eval_loss": 3.8392350673675537, "eval_runtime": 1142.4653, "eval_samples_per_second": 8.669, "eval_steps_per_second": 0.034, "step": 332000}, {"epoch": 14.924970797016803, "grad_norm": 4.226430892944336, "learning_rate": 3.979001441864416e-05, "loss": 0.4409, "step": 332200}, {"epoch": 14.933956330308204, "grad_norm": 3.3841519355773926, "learning_rate": 3.977863250680694e-05, "loss": 0.4371, "step": 332400}, {"epoch": 14.942941863599605, "grad_norm": 7.70395040512085, "learning_rate": 3.976724588426815e-05, "loss": 0.4421, "step": 332600}, {"epoch": 14.951927396891005, "grad_norm": 10.1765718460083, "learning_rate": 3.975585455465727e-05, "loss": 0.4105, "step": 332800}, {"epoch": 14.960912930182406, "grad_norm": 6.869187355041504, "learning_rate": 3.974445852160531e-05, "loss": 0.4158, "step": 333000}, {"epoch": 14.960912930182406, "eval_loss": 3.8126509189605713, "eval_runtime": 1144.9743, "eval_samples_per_second": 8.65, "eval_steps_per_second": 0.034, "step": 333000}, {"epoch": 14.969898463473807, "grad_norm": 5.523416042327881, "learning_rate": 3.973305778874475e-05, "loss": 0.4251, "step": 333200}, {"epoch": 14.978883996765209, "grad_norm": 5.1718950271606445, "learning_rate": 3.97216523597096e-05, "loss": 0.4309, "step": 333400}, {"epoch": 14.987869530056608, "grad_norm": 5.314184188842773, "learning_rate": 3.971024223813535e-05, "loss": 0.4442, "step": 333600}, {"epoch": 14.99685506334801, "grad_norm": 5.813663482666016, "learning_rate": 3.969882742765897e-05, "loss": 0.4774, "step": 333800}, {"epoch": 15.00584059663941, "grad_norm": 4.15483283996582, "learning_rate": 3.968740793191895e-05, "loss": 0.386, "step": 334000}, {"epoch": 15.00584059663941, "eval_loss": 3.831601619720459, "eval_runtime": 1157.4903, "eval_samples_per_second": 8.556, "eval_steps_per_second": 0.034, "step": 334000}, {"epoch": 15.014826129930812, "grad_norm": 4.984675407409668, "learning_rate": 3.9675983754555257e-05, "loss": 0.3864, "step": 334200}, {"epoch": 15.023811663222212, "grad_norm": 8.731829643249512, "learning_rate": 3.966455489920937e-05, "loss": 0.3777, "step": 334400}, {"epoch": 15.032797196513613, "grad_norm": 9.469175338745117, "learning_rate": 3.9653121369524234e-05, "loss": 0.4377, "step": 334600}, {"epoch": 15.041782729805014, "grad_norm": 16.434850692749023, "learning_rate": 3.9641683169144304e-05, "loss": 0.4178, "step": 334800}, {"epoch": 15.050768263096415, "grad_norm": 2.574371099472046, "learning_rate": 3.9630240301715516e-05, "loss": 0.4114, "step": 335000}, {"epoch": 15.050768263096415, "eval_loss": 3.860501289367676, "eval_runtime": 1146.1338, "eval_samples_per_second": 8.641, "eval_steps_per_second": 0.034, "step": 335000}, {"epoch": 15.059753796387815, "grad_norm": 5.90514612197876, "learning_rate": 3.961879277088529e-05, "loss": 0.4158, "step": 335200}, {"epoch": 15.068739329679216, "grad_norm": 4.330122470855713, "learning_rate": 3.9607340580302535e-05, "loss": 0.398, "step": 335400}, {"epoch": 15.077724862970618, "grad_norm": 0.6313864588737488, "learning_rate": 3.9595883733617646e-05, "loss": 0.4184, "step": 335600}, {"epoch": 15.086710396262019, 
"grad_norm": 1.5892980098724365, "learning_rate": 3.9584422234482505e-05, "loss": 0.3704, "step": 335800}, {"epoch": 15.095695929553418, "grad_norm": 13.559605598449707, "learning_rate": 3.957295608655047e-05, "loss": 0.4061, "step": 336000}, {"epoch": 15.095695929553418, "eval_loss": 3.878929853439331, "eval_runtime": 1159.8964, "eval_samples_per_second": 8.539, "eval_steps_per_second": 0.034, "step": 336000}, {"epoch": 15.10468146284482, "grad_norm": 4.454782009124756, "learning_rate": 3.95614852934764e-05, "loss": 0.4292, "step": 336200}, {"epoch": 15.11366699613622, "grad_norm": 12.67405891418457, "learning_rate": 3.9550009858916606e-05, "loss": 0.4449, "step": 336400}, {"epoch": 15.122652529427622, "grad_norm": 7.279116153717041, "learning_rate": 3.9538529786528896e-05, "loss": 0.4239, "step": 336600}, {"epoch": 15.131638062719022, "grad_norm": 8.419065475463867, "learning_rate": 3.952704507997256e-05, "loss": 0.3916, "step": 336800}, {"epoch": 15.140623596010423, "grad_norm": 7.502383232116699, "learning_rate": 3.951555574290834e-05, "loss": 0.4076, "step": 337000}, {"epoch": 15.140623596010423, "eval_loss": 3.861605167388916, "eval_runtime": 1176.4609, "eval_samples_per_second": 8.418, "eval_steps_per_second": 0.033, "step": 337000}, {"epoch": 15.149609129301824, "grad_norm": 5.945129871368408, "learning_rate": 3.950406177899849e-05, "loss": 0.416, "step": 337200}, {"epoch": 15.158594662593226, "grad_norm": 14.246264457702637, "learning_rate": 3.9492563191906706e-05, "loss": 0.3824, "step": 337400}, {"epoch": 15.167580195884625, "grad_norm": 2.2644824981689453, "learning_rate": 3.9481059985298186e-05, "loss": 0.4079, "step": 337600}, {"epoch": 15.176565729176026, "grad_norm": 6.7229204177856445, "learning_rate": 3.946955216283958e-05, "loss": 0.4154, "step": 337800}, {"epoch": 15.185551262467428, "grad_norm": 5.469477653503418, "learning_rate": 3.9458039728199016e-05, "loss": 0.3919, "step": 338000}, {"epoch": 15.185551262467428, "eval_loss": 3.9068820476531982, "eval_runtime": 1146.6357, "eval_samples_per_second": 8.637, "eval_steps_per_second": 0.034, "step": 338000}, {"epoch": 15.194536795758829, "grad_norm": 0.9827006459236145, "learning_rate": 3.944652268504609e-05, "loss": 0.3947, "step": 338200}, {"epoch": 15.203522329050228, "grad_norm": 8.862197875976562, "learning_rate": 3.943500103705188e-05, "loss": 0.4456, "step": 338400}, {"epoch": 15.21250786234163, "grad_norm": 9.226635932922363, "learning_rate": 3.94234747878889e-05, "loss": 0.4429, "step": 338600}, {"epoch": 15.221493395633031, "grad_norm": 9.727663040161133, "learning_rate": 3.9411943941231175e-05, "loss": 0.4261, "step": 338800}, {"epoch": 15.230478928924432, "grad_norm": 6.154589653015137, "learning_rate": 3.940040850075416e-05, "loss": 0.4575, "step": 339000}, {"epoch": 15.230478928924432, "eval_loss": 3.8878021240234375, "eval_runtime": 1146.7256, "eval_samples_per_second": 8.637, "eval_steps_per_second": 0.034, "step": 339000}, {"epoch": 15.239464462215832, "grad_norm": 5.461616039276123, "learning_rate": 3.938886847013479e-05, "loss": 0.413, "step": 339200}, {"epoch": 15.248449995507233, "grad_norm": 12.906144142150879, "learning_rate": 3.937732385305145e-05, "loss": 0.4228, "step": 339400}, {"epoch": 15.257435528798634, "grad_norm": 21.305442810058594, "learning_rate": 3.936577465318402e-05, "loss": 0.4037, "step": 339600}, {"epoch": 15.266421062090036, "grad_norm": 7.382744789123535, "learning_rate": 3.9354220874213785e-05, "loss": 0.3948, "step": 339800}, {"epoch": 15.275406595381435, "grad_norm": 
5.708733558654785, "learning_rate": 3.9342662519823545e-05, "loss": 0.4167, "step": 340000}, {"epoch": 15.275406595381435, "eval_loss": 3.8730831146240234, "eval_runtime": 1143.9137, "eval_samples_per_second": 8.658, "eval_steps_per_second": 0.034, "step": 340000}, {"epoch": 15.284392128672836, "grad_norm": 4.250601768493652, "learning_rate": 3.933109959369753e-05, "loss": 0.3798, "step": 340200}, {"epoch": 15.293377661964238, "grad_norm": 8.226158142089844, "learning_rate": 3.9319532099521434e-05, "loss": 0.3839, "step": 340400}, {"epoch": 15.302363195255639, "grad_norm": 30.672576904296875, "learning_rate": 3.9307960040982396e-05, "loss": 0.4016, "step": 340600}, {"epoch": 15.311348728547038, "grad_norm": 12.382901191711426, "learning_rate": 3.929638342176902e-05, "loss": 0.411, "step": 340800}, {"epoch": 15.32033426183844, "grad_norm": 5.150439262390137, "learning_rate": 3.9284802245571385e-05, "loss": 0.4006, "step": 341000}, {"epoch": 15.32033426183844, "eval_loss": 3.9192259311676025, "eval_runtime": 1145.0085, "eval_samples_per_second": 8.65, "eval_steps_per_second": 0.034, "step": 341000}, {"epoch": 15.329319795129841, "grad_norm": 6.119823932647705, "learning_rate": 3.927321651608097e-05, "loss": 0.4234, "step": 341200}, {"epoch": 15.338305328421242, "grad_norm": 2.2303431034088135, "learning_rate": 3.926162623699077e-05, "loss": 0.393, "step": 341400}, {"epoch": 15.347290861712642, "grad_norm": 19.413272857666016, "learning_rate": 3.9250031411995155e-05, "loss": 0.4275, "step": 341600}, {"epoch": 15.356276395004043, "grad_norm": 2.270556688308716, "learning_rate": 3.923843204479002e-05, "loss": 0.4144, "step": 341800}, {"epoch": 15.365261928295444, "grad_norm": 10.509578704833984, "learning_rate": 3.922682813907265e-05, "loss": 0.4045, "step": 342000}, {"epoch": 15.365261928295444, "eval_loss": 3.8500490188598633, "eval_runtime": 1170.295, "eval_samples_per_second": 8.463, "eval_steps_per_second": 0.033, "step": 342000}, {"epoch": 15.374247461586846, "grad_norm": 9.872151374816895, "learning_rate": 3.921521969854182e-05, "loss": 0.4156, "step": 342200}, {"epoch": 15.383232994878245, "grad_norm": 7.011927604675293, "learning_rate": 3.9203606726897724e-05, "loss": 0.4073, "step": 342400}, {"epoch": 15.392218528169646, "grad_norm": 8.124802589416504, "learning_rate": 3.919198922784199e-05, "loss": 0.4099, "step": 342600}, {"epoch": 15.401204061461048, "grad_norm": 9.334155082702637, "learning_rate": 3.918036720507773e-05, "loss": 0.423, "step": 342800}, {"epoch": 15.410189594752449, "grad_norm": 3.0574357509613037, "learning_rate": 3.916874066230945e-05, "loss": 0.4416, "step": 343000}, {"epoch": 15.410189594752449, "eval_loss": 3.8163387775421143, "eval_runtime": 1150.3405, "eval_samples_per_second": 8.61, "eval_steps_per_second": 0.034, "step": 343000}, {"epoch": 15.41917512804385, "grad_norm": 4.572579383850098, "learning_rate": 3.915710960324314e-05, "loss": 0.4077, "step": 343200}, {"epoch": 15.42816066133525, "grad_norm": 60.36442184448242, "learning_rate": 3.91454740315862e-05, "loss": 0.4761, "step": 343400}, {"epoch": 15.437146194626651, "grad_norm": 7.321791172027588, "learning_rate": 3.913383395104748e-05, "loss": 0.393, "step": 343600}, {"epoch": 15.446131727918052, "grad_norm": 8.782684326171875, "learning_rate": 3.912218936533727e-05, "loss": 0.4361, "step": 343800}, {"epoch": 15.455117261209454, "grad_norm": 17.37846565246582, "learning_rate": 3.911054027816729e-05, "loss": 0.4088, "step": 344000}, {"epoch": 15.455117261209454, "eval_loss": 3.8347713947296143, 
"eval_runtime": 1150.0338, "eval_samples_per_second": 8.612, "eval_steps_per_second": 0.034, "step": 344000}, {"epoch": 15.464102794500853, "grad_norm": 4.234193325042725, "learning_rate": 3.909888669325068e-05, "loss": 0.4399, "step": 344200}, {"epoch": 15.473088327792254, "grad_norm": 6.374758720397949, "learning_rate": 3.908722861430205e-05, "loss": 0.4039, "step": 344400}, {"epoch": 15.482073861083656, "grad_norm": 34.553226470947266, "learning_rate": 3.907556604503743e-05, "loss": 0.4337, "step": 344600}, {"epoch": 15.491059394375057, "grad_norm": 10.942513465881348, "learning_rate": 3.906389898917424e-05, "loss": 0.4693, "step": 344800}, {"epoch": 15.500044927666456, "grad_norm": 8.577802658081055, "learning_rate": 3.905222745043139e-05, "loss": 0.3982, "step": 345000}, {"epoch": 15.500044927666456, "eval_loss": 3.816509962081909, "eval_runtime": 1149.9103, "eval_samples_per_second": 8.613, "eval_steps_per_second": 0.034, "step": 345000}, {"epoch": 15.509030460957858, "grad_norm": 6.402909278869629, "learning_rate": 3.9040551432529195e-05, "loss": 0.4115, "step": 345200}, {"epoch": 15.518015994249259, "grad_norm": 6.276604175567627, "learning_rate": 3.902887093918938e-05, "loss": 0.4154, "step": 345400}, {"epoch": 15.52700152754066, "grad_norm": 7.94034481048584, "learning_rate": 3.9017185974135115e-05, "loss": 0.3947, "step": 345600}, {"epoch": 15.53598706083206, "grad_norm": 1.8332997560501099, "learning_rate": 3.900549654109101e-05, "loss": 0.41, "step": 345800}, {"epoch": 15.544972594123461, "grad_norm": 19.339252471923828, "learning_rate": 3.899380264378305e-05, "loss": 0.4381, "step": 346000}, {"epoch": 15.544972594123461, "eval_loss": 3.820833206176758, "eval_runtime": 1150.5308, "eval_samples_per_second": 8.608, "eval_steps_per_second": 0.034, "step": 346000}, {"epoch": 15.553958127414862, "grad_norm": 23.56734275817871, "learning_rate": 3.898210428593872e-05, "loss": 0.411, "step": 346200}, {"epoch": 15.562943660706264, "grad_norm": 6.649259567260742, "learning_rate": 3.897040147128683e-05, "loss": 0.424, "step": 346400}, {"epoch": 15.571929193997663, "grad_norm": 5.427579879760742, "learning_rate": 3.89586942035577e-05, "loss": 0.4441, "step": 346600}, {"epoch": 15.580914727289064, "grad_norm": 5.252974510192871, "learning_rate": 3.8946982486483015e-05, "loss": 0.4452, "step": 346800}, {"epoch": 15.589900260580466, "grad_norm": 3.2411303520202637, "learning_rate": 3.8935266323795895e-05, "loss": 0.3956, "step": 347000}, {"epoch": 15.589900260580466, "eval_loss": 3.8776004314422607, "eval_runtime": 1148.9182, "eval_samples_per_second": 8.62, "eval_steps_per_second": 0.034, "step": 347000}, {"epoch": 15.598885793871867, "grad_norm": 9.3895902633667, "learning_rate": 3.892354571923088e-05, "loss": 0.4057, "step": 347200}, {"epoch": 15.607871327163267, "grad_norm": 3.1582448482513428, "learning_rate": 3.8911820676523925e-05, "loss": 0.4189, "step": 347400}, {"epoch": 15.616856860454668, "grad_norm": 9.8271484375, "learning_rate": 3.890009119941239e-05, "loss": 0.4239, "step": 347600}, {"epoch": 15.625842393746069, "grad_norm": 2.3805694580078125, "learning_rate": 3.888835729163507e-05, "loss": 0.4121, "step": 347800}, {"epoch": 15.63482792703747, "grad_norm": 12.050047874450684, "learning_rate": 3.887661895693214e-05, "loss": 0.4411, "step": 348000}, {"epoch": 15.63482792703747, "eval_loss": 3.842379570007324, "eval_runtime": 1150.1946, "eval_samples_per_second": 8.611, "eval_steps_per_second": 0.034, "step": 348000}, {"epoch": 15.64381346032887, "grad_norm": 
12.517159461975098, "learning_rate": 3.886487619904521e-05, "loss": 0.4285, "step": 348200}, {"epoch": 15.652798993620271, "grad_norm": 8.59961223602295, "learning_rate": 3.88531290217173e-05, "loss": 0.4315, "step": 348400}, {"epoch": 15.661784526911672, "grad_norm": 9.657811164855957, "learning_rate": 3.8841377428692835e-05, "loss": 0.4277, "step": 348600}, {"epoch": 15.670770060203074, "grad_norm": 4.169412136077881, "learning_rate": 3.882962142371763e-05, "loss": 0.4158, "step": 348800}, {"epoch": 15.679755593494473, "grad_norm": 5.746458530426025, "learning_rate": 3.881786101053894e-05, "loss": 0.4112, "step": 349000}, {"epoch": 15.679755593494473, "eval_loss": 3.84271240234375, "eval_runtime": 1152.7298, "eval_samples_per_second": 8.592, "eval_steps_per_second": 0.034, "step": 349000}, {"epoch": 15.688741126785875, "grad_norm": 5.669808387756348, "learning_rate": 3.880609619290538e-05, "loss": 0.4544, "step": 349200}, {"epoch": 15.697726660077276, "grad_norm": 2.429694652557373, "learning_rate": 3.879432697456703e-05, "loss": 0.4341, "step": 349400}, {"epoch": 15.706712193368677, "grad_norm": 2.860553026199341, "learning_rate": 3.8782553359275315e-05, "loss": 0.4342, "step": 349600}, {"epoch": 15.715697726660077, "grad_norm": 11.57726001739502, "learning_rate": 3.877077535078309e-05, "loss": 0.4178, "step": 349800}, {"epoch": 15.724683259951478, "grad_norm": 2.3827250003814697, "learning_rate": 3.8758992952844605e-05, "loss": 0.4078, "step": 350000}, {"epoch": 15.724683259951478, "eval_loss": 3.8592307567596436, "eval_runtime": 1149.9252, "eval_samples_per_second": 8.613, "eval_steps_per_second": 0.034, "step": 350000}, {"epoch": 15.73366879324288, "grad_norm": 28.76621437072754, "learning_rate": 3.8747206169215516e-05, "loss": 0.4289, "step": 350200}, {"epoch": 15.74265432653428, "grad_norm": 1.1635797023773193, "learning_rate": 3.873541500365286e-05, "loss": 0.4409, "step": 350400}, {"epoch": 15.75163985982568, "grad_norm": 9.564525604248047, "learning_rate": 3.872361945991509e-05, "loss": 0.4339, "step": 350600}, {"epoch": 15.760625393117081, "grad_norm": 3.1764824390411377, "learning_rate": 3.871181954176204e-05, "loss": 0.4069, "step": 350800}, {"epoch": 15.769610926408482, "grad_norm": 5.794785499572754, "learning_rate": 3.870001525295494e-05, "loss": 0.4446, "step": 351000}, {"epoch": 15.769610926408482, "eval_loss": 3.835042953491211, "eval_runtime": 1150.8003, "eval_samples_per_second": 8.606, "eval_steps_per_second": 0.034, "step": 351000}, {"epoch": 15.778596459699884, "grad_norm": 3.9470226764678955, "learning_rate": 3.868820659725642e-05, "loss": 0.4118, "step": 351200}, {"epoch": 15.787581992991283, "grad_norm": 25.599266052246094, "learning_rate": 3.86763935784305e-05, "loss": 0.3989, "step": 351400}, {"epoch": 15.796567526282685, "grad_norm": 11.884906768798828, "learning_rate": 3.8664576200242604e-05, "loss": 0.4074, "step": 351600}, {"epoch": 15.805553059574086, "grad_norm": 4.182280540466309, "learning_rate": 3.8652754466459504e-05, "loss": 0.4018, "step": 351800}, {"epoch": 15.814538592865487, "grad_norm": 2.89786696434021, "learning_rate": 3.8640928380849406e-05, "loss": 0.4295, "step": 352000}, {"epoch": 15.814538592865487, "eval_loss": 3.835994005203247, "eval_runtime": 1149.5102, "eval_samples_per_second": 8.616, "eval_steps_per_second": 0.034, "step": 352000}, {"epoch": 15.823524126156887, "grad_norm": 2.728250741958618, "learning_rate": 3.862909794718188e-05, "loss": 0.4141, "step": 352200}, {"epoch": 15.832509659448288, "grad_norm": 5.0473456382751465, 
"learning_rate": 3.861726316922789e-05, "loss": 0.4068, "step": 352400}, {"epoch": 15.84149519273969, "grad_norm": 4.916729927062988, "learning_rate": 3.860542405075978e-05, "loss": 0.4048, "step": 352600}, {"epoch": 15.85048072603109, "grad_norm": 5.58930778503418, "learning_rate": 3.859358059555127e-05, "loss": 0.431, "step": 352800}, {"epoch": 15.85946625932249, "grad_norm": 2.4550957679748535, "learning_rate": 3.858173280737748e-05, "loss": 0.434, "step": 353000}, {"epoch": 15.85946625932249, "eval_loss": 3.8414108753204346, "eval_runtime": 1140.739, "eval_samples_per_second": 8.682, "eval_steps_per_second": 0.034, "step": 353000}, {"epoch": 15.868451792613891, "grad_norm": 1.504676342010498, "learning_rate": 3.85698806900149e-05, "loss": 0.4354, "step": 353200}, {"epoch": 15.877437325905293, "grad_norm": 5.374175071716309, "learning_rate": 3.8558024247241414e-05, "loss": 0.458, "step": 353400}, {"epoch": 15.886422859196694, "grad_norm": 14.35389518737793, "learning_rate": 3.854616348283625e-05, "loss": 0.4403, "step": 353600}, {"epoch": 15.895408392488093, "grad_norm": 4.4372148513793945, "learning_rate": 3.853429840058006e-05, "loss": 0.4214, "step": 353800}, {"epoch": 15.904393925779495, "grad_norm": 10.166844367980957, "learning_rate": 3.852242900425483e-05, "loss": 0.43, "step": 354000}, {"epoch": 15.904393925779495, "eval_loss": 3.879225492477417, "eval_runtime": 1145.2973, "eval_samples_per_second": 8.648, "eval_steps_per_second": 0.034, "step": 354000}, {"epoch": 15.913379459070896, "grad_norm": 3.3060805797576904, "learning_rate": 3.8510555297643956e-05, "loss": 0.4449, "step": 354200}, {"epoch": 15.922364992362297, "grad_norm": 17.104143142700195, "learning_rate": 3.849867728453218e-05, "loss": 0.4431, "step": 354400}, {"epoch": 15.931350525653698, "grad_norm": 5.082907676696777, "learning_rate": 3.848679496870563e-05, "loss": 0.4273, "step": 354600}, {"epoch": 15.940336058945098, "grad_norm": 9.734619140625, "learning_rate": 3.847490835395181e-05, "loss": 0.4214, "step": 354800}, {"epoch": 15.9493215922365, "grad_norm": 10.629302024841309, "learning_rate": 3.846301744405959e-05, "loss": 0.4601, "step": 355000}, {"epoch": 15.9493215922365, "eval_loss": 3.8631420135498047, "eval_runtime": 1142.5819, "eval_samples_per_second": 8.668, "eval_steps_per_second": 0.034, "step": 355000}, {"epoch": 15.9583071255279, "grad_norm": 15.07685375213623, "learning_rate": 3.84511222428192e-05, "loss": 0.4517, "step": 355200}, {"epoch": 15.9672926588193, "grad_norm": 2.141556978225708, "learning_rate": 3.843922275402225e-05, "loss": 0.4253, "step": 355400}, {"epoch": 15.976278192110701, "grad_norm": 9.05489444732666, "learning_rate": 3.842731898146171e-05, "loss": 0.4403, "step": 355600}, {"epoch": 15.985263725402103, "grad_norm": 7.7289557456970215, "learning_rate": 3.841541092893191e-05, "loss": 0.4053, "step": 355800}, {"epoch": 15.994249258693504, "grad_norm": 16.47095489501953, "learning_rate": 3.8403498600228574e-05, "loss": 0.4137, "step": 356000}, {"epoch": 15.994249258693504, "eval_loss": 3.8049228191375732, "eval_runtime": 1141.3474, "eval_samples_per_second": 8.677, "eval_steps_per_second": 0.034, "step": 356000}, {"epoch": 16.003234791984905, "grad_norm": 7.816695213317871, "learning_rate": 3.839158199914874e-05, "loss": 0.4137, "step": 356200}, {"epoch": 16.012220325276306, "grad_norm": 2.7365758419036865, "learning_rate": 3.837966112949086e-05, "loss": 0.4017, "step": 356400}, {"epoch": 16.021205858567708, "grad_norm": 8.747932434082031, "learning_rate": 3.8367735995054704e-05, 
"loss": 0.3901, "step": 356600}, {"epoch": 16.030191391859105, "grad_norm": 4.3832106590271, "learning_rate": 3.835580659964142e-05, "loss": 0.3867, "step": 356800}, {"epoch": 16.039176925150507, "grad_norm": 12.593661308288574, "learning_rate": 3.834387294705352e-05, "loss": 0.4276, "step": 357000}, {"epoch": 16.039176925150507, "eval_loss": 3.8479878902435303, "eval_runtime": 1145.2444, "eval_samples_per_second": 8.648, "eval_steps_per_second": 0.034, "step": 357000}, {"epoch": 16.048162458441908, "grad_norm": 4.510431289672852, "learning_rate": 3.833193504109487e-05, "loss": 0.4091, "step": 357200}, {"epoch": 16.05714799173331, "grad_norm": 14.032699584960938, "learning_rate": 3.831999288557067e-05, "loss": 0.382, "step": 357400}, {"epoch": 16.06613352502471, "grad_norm": 8.67285442352295, "learning_rate": 3.83080464842875e-05, "loss": 0.4095, "step": 357600}, {"epoch": 16.075119058316112, "grad_norm": 11.347421646118164, "learning_rate": 3.8296095841053295e-05, "loss": 0.4026, "step": 357800}, {"epoch": 16.084104591607513, "grad_norm": 2.454707622528076, "learning_rate": 3.8284140959677315e-05, "loss": 0.3763, "step": 358000}, {"epoch": 16.084104591607513, "eval_loss": 3.891216993331909, "eval_runtime": 1143.6428, "eval_samples_per_second": 8.66, "eval_steps_per_second": 0.034, "step": 358000}, {"epoch": 16.093090124898914, "grad_norm": 6.182559490203857, "learning_rate": 3.827218184397021e-05, "loss": 0.3719, "step": 358200}, {"epoch": 16.102075658190312, "grad_norm": 8.535185813903809, "learning_rate": 3.826021849774394e-05, "loss": 0.3971, "step": 358400}, {"epoch": 16.111061191481713, "grad_norm": 4.548397064208984, "learning_rate": 3.8248250924811843e-05, "loss": 0.371, "step": 358600}, {"epoch": 16.120046724773115, "grad_norm": 10.030683517456055, "learning_rate": 3.8236279128988584e-05, "loss": 0.4092, "step": 358800}, {"epoch": 16.129032258064516, "grad_norm": 5.520787239074707, "learning_rate": 3.8224303114090196e-05, "loss": 0.436, "step": 359000}, {"epoch": 16.129032258064516, "eval_loss": 3.845858573913574, "eval_runtime": 1151.3773, "eval_samples_per_second": 8.602, "eval_steps_per_second": 0.034, "step": 359000}, {"epoch": 16.138017791355917, "grad_norm": 0.6454381346702576, "learning_rate": 3.8212322883934026e-05, "loss": 0.4252, "step": 359200}, {"epoch": 16.14700332464732, "grad_norm": 10.40180492401123, "learning_rate": 3.82003384423388e-05, "loss": 0.3774, "step": 359400}, {"epoch": 16.15598885793872, "grad_norm": 1.8541001081466675, "learning_rate": 3.8188349793124554e-05, "loss": 0.3787, "step": 359600}, {"epoch": 16.16497439123012, "grad_norm": 9.01765251159668, "learning_rate": 3.817635694011268e-05, "loss": 0.4182, "step": 359800}, {"epoch": 16.17395992452152, "grad_norm": 1.7692986726760864, "learning_rate": 3.8164359887125935e-05, "loss": 0.4164, "step": 360000}, {"epoch": 16.17395992452152, "eval_loss": 3.8807284832000732, "eval_runtime": 1141.9331, "eval_samples_per_second": 8.673, "eval_steps_per_second": 0.034, "step": 360000}, {"epoch": 16.18294545781292, "grad_norm": 13.624265670776367, "learning_rate": 3.815235863798836e-05, "loss": 0.3842, "step": 360200}, {"epoch": 16.19193099110432, "grad_norm": 4.887984275817871, "learning_rate": 3.814035319652538e-05, "loss": 0.3879, "step": 360400}, {"epoch": 16.200916524395723, "grad_norm": 0.7442801594734192, "learning_rate": 3.8128343566563726e-05, "loss": 0.3995, "step": 360600}, {"epoch": 16.209902057687124, "grad_norm": 10.681866645812988, "learning_rate": 3.811632975193149e-05, "loss": 0.4225, "step": 
360800}, {"epoch": 16.218887590978525, "grad_norm": 0.09919462352991104, "learning_rate": 3.8104311756458085e-05, "loss": 0.4133, "step": 361000}, {"epoch": 16.218887590978525, "eval_loss": 3.8468129634857178, "eval_runtime": 1141.1126, "eval_samples_per_second": 8.679, "eval_steps_per_second": 0.034, "step": 361000}, {"epoch": 16.227873124269927, "grad_norm": 2.938690185546875, "learning_rate": 3.809228958397425e-05, "loss": 0.4147, "step": 361200}, {"epoch": 16.236858657561328, "grad_norm": 5.6593828201293945, "learning_rate": 3.808026323831208e-05, "loss": 0.3787, "step": 361400}, {"epoch": 16.245844190852726, "grad_norm": 4.981930255889893, "learning_rate": 3.806823272330495e-05, "loss": 0.3999, "step": 361600}, {"epoch": 16.254829724144127, "grad_norm": 5.699765205383301, "learning_rate": 3.805619804278763e-05, "loss": 0.4093, "step": 361800}, {"epoch": 16.263815257435528, "grad_norm": 1.215476155281067, "learning_rate": 3.804415920059616e-05, "loss": 0.4021, "step": 362000}, {"epoch": 16.263815257435528, "eval_loss": 3.8529727458953857, "eval_runtime": 1150.9758, "eval_samples_per_second": 8.605, "eval_steps_per_second": 0.034, "step": 362000}, {"epoch": 16.27280079072693, "grad_norm": 15.102256774902344, "learning_rate": 3.8032116200567944e-05, "loss": 0.4041, "step": 362200}, {"epoch": 16.28178632401833, "grad_norm": 8.938138008117676, "learning_rate": 3.80200690465417e-05, "loss": 0.4056, "step": 362400}, {"epoch": 16.290771857309732, "grad_norm": 0.7558520436286926, "learning_rate": 3.800801774235746e-05, "loss": 0.3967, "step": 362600}, {"epoch": 16.299757390601133, "grad_norm": 3.1432087421417236, "learning_rate": 3.79959622918566e-05, "loss": 0.4021, "step": 362800}, {"epoch": 16.308742923892535, "grad_norm": 11.30734920501709, "learning_rate": 3.798390269888179e-05, "loss": 0.39, "step": 363000}, {"epoch": 16.308742923892535, "eval_loss": 3.8927652835845947, "eval_runtime": 1141.2518, "eval_samples_per_second": 8.678, "eval_steps_per_second": 0.034, "step": 363000}, {"epoch": 16.317728457183932, "grad_norm": 11.273520469665527, "learning_rate": 3.797183896727704e-05, "loss": 0.4538, "step": 363200}, {"epoch": 16.326713990475334, "grad_norm": 17.33855438232422, "learning_rate": 3.7959771100887685e-05, "loss": 0.4019, "step": 363400}, {"epoch": 16.335699523766735, "grad_norm": 9.408929824829102, "learning_rate": 3.794769910356036e-05, "loss": 0.4173, "step": 363600}, {"epoch": 16.344685057058136, "grad_norm": 5.125523567199707, "learning_rate": 3.793562297914302e-05, "loss": 0.4259, "step": 363800}, {"epoch": 16.353670590349537, "grad_norm": 17.848237991333008, "learning_rate": 3.792354273148495e-05, "loss": 0.4109, "step": 364000}, {"epoch": 16.353670590349537, "eval_loss": 3.8154456615448, "eval_runtime": 1133.9853, "eval_samples_per_second": 8.734, "eval_steps_per_second": 0.034, "step": 364000}, {"epoch": 16.36265612364094, "grad_norm": 7.285728931427002, "learning_rate": 3.791145836443673e-05, "loss": 0.4203, "step": 364200}, {"epoch": 16.37164165693234, "grad_norm": 0.5706067681312561, "learning_rate": 3.7899369881850264e-05, "loss": 0.4326, "step": 364400}, {"epoch": 16.38062719022374, "grad_norm": 6.83461856842041, "learning_rate": 3.788727728757876e-05, "loss": 0.415, "step": 364600}, {"epoch": 16.38961272351514, "grad_norm": 3.2358269691467285, "learning_rate": 3.7875180585476754e-05, "loss": 0.4249, "step": 364800}, {"epoch": 16.39859825680654, "grad_norm": 4.388341903686523, "learning_rate": 3.786307977940008e-05, "loss": 0.4001, "step": 365000}, {"epoch": 
16.39859825680654, "eval_loss": 3.87809681892395, "eval_runtime": 1106.541, "eval_samples_per_second": 8.95, "eval_steps_per_second": 0.035, "step": 365000}, {"epoch": 16.40758379009794, "grad_norm": 10.232439994812012, "learning_rate": 3.785097487320588e-05, "loss": 0.4246, "step": 365200}, {"epoch": 16.416569323389343, "grad_norm": 21.1503849029541, "learning_rate": 3.783886587075259e-05, "loss": 0.4109, "step": 365400}, {"epoch": 16.425554856680744, "grad_norm": 15.055440902709961, "learning_rate": 3.782675277589998e-05, "loss": 0.4047, "step": 365600}, {"epoch": 16.434540389972145, "grad_norm": 5.9024128913879395, "learning_rate": 3.78146355925091e-05, "loss": 0.4365, "step": 365800}, {"epoch": 16.443525923263547, "grad_norm": 3.827387571334839, "learning_rate": 3.780251432444232e-05, "loss": 0.3897, "step": 366000}, {"epoch": 16.443525923263547, "eval_loss": 3.8388655185699463, "eval_runtime": 1105.7998, "eval_samples_per_second": 8.956, "eval_steps_per_second": 0.035, "step": 366000}, {"epoch": 16.452511456554948, "grad_norm": 5.388125419616699, "learning_rate": 3.7790388975563296e-05, "loss": 0.4402, "step": 366200}, {"epoch": 16.461496989846346, "grad_norm": 1.5944033861160278, "learning_rate": 3.777825954973699e-05, "loss": 0.4247, "step": 366400}, {"epoch": 16.470482523137747, "grad_norm": 3.2299532890319824, "learning_rate": 3.7766126050829683e-05, "loss": 0.4161, "step": 366600}, {"epoch": 16.47946805642915, "grad_norm": 4.81660270690918, "learning_rate": 3.7753988482708923e-05, "loss": 0.4256, "step": 366800}, {"epoch": 16.48845358972055, "grad_norm": 12.131381034851074, "learning_rate": 3.774184684924359e-05, "loss": 0.4218, "step": 367000}, {"epoch": 16.48845358972055, "eval_loss": 3.8612823486328125, "eval_runtime": 1100.2738, "eval_samples_per_second": 9.001, "eval_steps_per_second": 0.035, "step": 367000}, {"epoch": 16.49743912301195, "grad_norm": 2.8556697368621826, "learning_rate": 3.772970115430381e-05, "loss": 0.4187, "step": 367200}, {"epoch": 16.506424656303352, "grad_norm": 8.463600158691406, "learning_rate": 3.7717551401761055e-05, "loss": 0.3736, "step": 367400}, {"epoch": 16.515410189594753, "grad_norm": 0.5444090962409973, "learning_rate": 3.770539759548806e-05, "loss": 0.4075, "step": 367600}, {"epoch": 16.524395722886155, "grad_norm": 16.545907974243164, "learning_rate": 3.7693239739358865e-05, "loss": 0.4065, "step": 367800}, {"epoch": 16.533381256177556, "grad_norm": 17.78046989440918, "learning_rate": 3.76810778372488e-05, "loss": 0.4137, "step": 368000}, {"epoch": 16.533381256177556, "eval_loss": 3.8438374996185303, "eval_runtime": 1102.6952, "eval_samples_per_second": 8.982, "eval_steps_per_second": 0.035, "step": 368000}, {"epoch": 16.542366789468954, "grad_norm": 5.933611869812012, "learning_rate": 3.766891189303448e-05, "loss": 0.4089, "step": 368200}, {"epoch": 16.551352322760355, "grad_norm": 2.965001106262207, "learning_rate": 3.76567419105938e-05, "loss": 0.3756, "step": 368400}, {"epoch": 16.560337856051756, "grad_norm": 12.640633583068848, "learning_rate": 3.764456789380596e-05, "loss": 0.4273, "step": 368600}, {"epoch": 16.569323389343158, "grad_norm": 7.198838233947754, "learning_rate": 3.763238984655144e-05, "loss": 0.4022, "step": 368800}, {"epoch": 16.57830892263456, "grad_norm": 3.5390090942382812, "learning_rate": 3.7620207772712e-05, "loss": 0.4116, "step": 369000}, {"epoch": 16.57830892263456, "eval_loss": 3.8293216228485107, "eval_runtime": 1099.8945, "eval_samples_per_second": 9.005, "eval_steps_per_second": 0.035, "step": 369000}, 
{"epoch": 16.58729445592596, "grad_norm": 5.592366695404053, "learning_rate": 3.7608021676170695e-05, "loss": 0.4036, "step": 369200}, {"epoch": 16.59627998921736, "grad_norm": 12.47636890411377, "learning_rate": 3.759583156081184e-05, "loss": 0.3893, "step": 369400}, {"epoch": 16.60526552250876, "grad_norm": 3.6026880741119385, "learning_rate": 3.758363743052105e-05, "loss": 0.4395, "step": 369600}, {"epoch": 16.61425105580016, "grad_norm": 8.781318664550781, "learning_rate": 3.7571439289185204e-05, "loss": 0.3842, "step": 369800}, {"epoch": 16.62323658909156, "grad_norm": 1.9131399393081665, "learning_rate": 3.75592371406925e-05, "loss": 0.4082, "step": 370000}, {"epoch": 16.62323658909156, "eval_loss": 3.8365583419799805, "eval_runtime": 1106.4819, "eval_samples_per_second": 8.951, "eval_steps_per_second": 0.035, "step": 370000}, {"epoch": 16.632222122382963, "grad_norm": 9.32291030883789, "learning_rate": 3.754703098893235e-05, "loss": 0.4044, "step": 370200}, {"epoch": 16.641207655674364, "grad_norm": 7.453135013580322, "learning_rate": 3.753482083779549e-05, "loss": 0.4132, "step": 370400}, {"epoch": 16.650193188965766, "grad_norm": 13.478267669677734, "learning_rate": 3.752260669117392e-05, "loss": 0.4149, "step": 370600}, {"epoch": 16.659178722257167, "grad_norm": 4.782924652099609, "learning_rate": 3.7510388552960895e-05, "loss": 0.4303, "step": 370800}, {"epoch": 16.668164255548568, "grad_norm": 6.732643127441406, "learning_rate": 3.749816642705098e-05, "loss": 0.4386, "step": 371000}, {"epoch": 16.668164255548568, "eval_loss": 3.8590922355651855, "eval_runtime": 1101.0023, "eval_samples_per_second": 8.995, "eval_steps_per_second": 0.035, "step": 371000}, {"epoch": 16.67714978883997, "grad_norm": 11.248590469360352, "learning_rate": 3.748594031733996e-05, "loss": 0.4137, "step": 371200}, {"epoch": 16.686135322131367, "grad_norm": 7.598705768585205, "learning_rate": 3.747371022772494e-05, "loss": 0.415, "step": 371400}, {"epoch": 16.69512085542277, "grad_norm": 2.1938705444335938, "learning_rate": 3.746147616210426e-05, "loss": 0.4304, "step": 371600}, {"epoch": 16.70410638871417, "grad_norm": 4.91569185256958, "learning_rate": 3.7449238124377536e-05, "loss": 0.4076, "step": 371800}, {"epoch": 16.71309192200557, "grad_norm": 20.976909637451172, "learning_rate": 3.743699611844567e-05, "loss": 0.405, "step": 372000}, {"epoch": 16.71309192200557, "eval_loss": 3.873788595199585, "eval_runtime": 1101.0887, "eval_samples_per_second": 8.995, "eval_steps_per_second": 0.035, "step": 372000}, {"epoch": 16.722077455296972, "grad_norm": 8.065682411193848, "learning_rate": 3.7424750148210794e-05, "loss": 0.4384, "step": 372200}, {"epoch": 16.731062988588373, "grad_norm": 13.42385482788086, "learning_rate": 3.741250021757633e-05, "loss": 0.4002, "step": 372400}, {"epoch": 16.740048521879775, "grad_norm": 14.792691230773926, "learning_rate": 3.7400246330446954e-05, "loss": 0.3998, "step": 372600}, {"epoch": 16.749034055171176, "grad_norm": 28.727434158325195, "learning_rate": 3.7387988490728595e-05, "loss": 0.4238, "step": 372800}, {"epoch": 16.758019588462574, "grad_norm": 10.067317008972168, "learning_rate": 3.7375726702328454e-05, "loss": 0.4134, "step": 373000}, {"epoch": 16.758019588462574, "eval_loss": 3.951530933380127, "eval_runtime": 1102.4686, "eval_samples_per_second": 8.983, "eval_steps_per_second": 0.035, "step": 373000}, {"epoch": 16.767005121753975, "grad_norm": 9.972529411315918, "learning_rate": 3.736346096915499e-05, "loss": 0.4335, "step": 373200}, {"epoch": 
16.775990655045376, "grad_norm": 2.3625543117523193, "learning_rate": 3.735119129511792e-05, "loss": 0.4357, "step": 373400}, {"epoch": 16.784976188336778, "grad_norm": 5.44252347946167, "learning_rate": 3.733891768412819e-05, "loss": 0.4042, "step": 373600}, {"epoch": 16.79396172162818, "grad_norm": 14.719382286071777, "learning_rate": 3.7326640140098056e-05, "loss": 0.379, "step": 373800}, {"epoch": 16.80294725491958, "grad_norm": 12.511571884155273, "learning_rate": 3.731435866694097e-05, "loss": 0.4258, "step": 374000}, {"epoch": 16.80294725491958, "eval_loss": 3.8407986164093018, "eval_runtime": 1100.7682, "eval_samples_per_second": 8.997, "eval_steps_per_second": 0.035, "step": 374000}, {"epoch": 16.81193278821098, "grad_norm": 2.9213812351226807, "learning_rate": 3.7302073268571673e-05, "loss": 0.4111, "step": 374200}, {"epoch": 16.820918321502383, "grad_norm": 40.420196533203125, "learning_rate": 3.728978394890615e-05, "loss": 0.4209, "step": 374400}, {"epoch": 16.82990385479378, "grad_norm": 1.4034184217453003, "learning_rate": 3.727749071186162e-05, "loss": 0.4118, "step": 374600}, {"epoch": 16.83888938808518, "grad_norm": 10.61877727508545, "learning_rate": 3.7265193561356576e-05, "loss": 0.3717, "step": 374800}, {"epoch": 16.847874921376583, "grad_norm": 15.831500053405762, "learning_rate": 3.725289250131074e-05, "loss": 0.4242, "step": 375000}, {"epoch": 16.847874921376583, "eval_loss": 3.901285171508789, "eval_runtime": 1085.5255, "eval_samples_per_second": 9.124, "eval_steps_per_second": 0.036, "step": 375000}, {"epoch": 16.856860454667984, "grad_norm": 19.590776443481445, "learning_rate": 3.724058753564507e-05, "loss": 0.4149, "step": 375200}, {"epoch": 16.865845987959386, "grad_norm": 12.736054420471191, "learning_rate": 3.722827866828181e-05, "loss": 0.4186, "step": 375400}, {"epoch": 16.874831521250787, "grad_norm": 18.651493072509766, "learning_rate": 3.721596590314441e-05, "loss": 0.4529, "step": 375600}, {"epoch": 16.883817054542188, "grad_norm": 9.52115535736084, "learning_rate": 3.720364924415757e-05, "loss": 0.4294, "step": 375800}, {"epoch": 16.89280258783359, "grad_norm": 11.281582832336426, "learning_rate": 3.719132869524723e-05, "loss": 0.4451, "step": 376000}, {"epoch": 16.89280258783359, "eval_loss": 3.8090622425079346, "eval_runtime": 1084.0102, "eval_samples_per_second": 9.136, "eval_steps_per_second": 0.036, "step": 376000}, {"epoch": 16.901788121124987, "grad_norm": 17.860044479370117, "learning_rate": 3.71790042603406e-05, "loss": 0.4197, "step": 376200}, {"epoch": 16.91077365441639, "grad_norm": 2.703660488128662, "learning_rate": 3.716667594336608e-05, "loss": 0.4291, "step": 376400}, {"epoch": 16.91975918770779, "grad_norm": 6.559628486633301, "learning_rate": 3.715434374825334e-05, "loss": 0.4271, "step": 376600}, {"epoch": 16.92874472099919, "grad_norm": 17.741317749023438, "learning_rate": 3.7142007678933286e-05, "loss": 0.4216, "step": 376800}, {"epoch": 16.937730254290592, "grad_norm": 14.408329963684082, "learning_rate": 3.7129667739338035e-05, "loss": 0.3846, "step": 377000}, {"epoch": 16.937730254290592, "eval_loss": 3.846365213394165, "eval_runtime": 1084.0168, "eval_samples_per_second": 9.136, "eval_steps_per_second": 0.036, "step": 377000}, {"epoch": 16.946715787581994, "grad_norm": 6.594641208648682, "learning_rate": 3.711732393340097e-05, "loss": 0.4175, "step": 377200}, {"epoch": 16.955701320873395, "grad_norm": 22.12388801574707, "learning_rate": 3.710497626505666e-05, "loss": 0.4371, "step": 377400}, {"epoch": 16.964686854164796, 
"grad_norm": 18.402645111083984, "learning_rate": 3.7092624738240974e-05, "loss": 0.3814, "step": 377600}, {"epoch": 16.973672387456194, "grad_norm": 0.5258151888847351, "learning_rate": 3.708026935689094e-05, "loss": 0.3426, "step": 377800}, {"epoch": 16.982657920747595, "grad_norm": 13.795966148376465, "learning_rate": 3.7067910124944866e-05, "loss": 0.3805, "step": 378000}, {"epoch": 16.982657920747595, "eval_loss": 3.942888021469116, "eval_runtime": 1083.5357, "eval_samples_per_second": 9.14, "eval_steps_per_second": 0.036, "step": 378000}, {"epoch": 16.991643454038996, "grad_norm": 15.092402458190918, "learning_rate": 3.7055547046342257e-05, "loss": 0.4181, "step": 378200}, {"epoch": 17.000628987330398, "grad_norm": 8.252157211303711, "learning_rate": 3.704318012502386e-05, "loss": 0.4221, "step": 378400}, {"epoch": 17.0096145206218, "grad_norm": 7.719264030456543, "learning_rate": 3.703080936493163e-05, "loss": 0.3772, "step": 378600}, {"epoch": 17.0186000539132, "grad_norm": 9.026861190795898, "learning_rate": 3.701843477000879e-05, "loss": 0.3988, "step": 378800}, {"epoch": 17.0275855872046, "grad_norm": 6.281711101531982, "learning_rate": 3.7006056344199716e-05, "loss": 0.3912, "step": 379000}, {"epoch": 17.0275855872046, "eval_loss": 3.819859504699707, "eval_runtime": 1085.6011, "eval_samples_per_second": 9.123, "eval_steps_per_second": 0.036, "step": 379000}, {"epoch": 17.036571120496003, "grad_norm": 2.070225954055786, "learning_rate": 3.699367409145005e-05, "loss": 0.4107, "step": 379200}, {"epoch": 17.0455566537874, "grad_norm": 8.535941123962402, "learning_rate": 3.698128801570665e-05, "loss": 0.3904, "step": 379400}, {"epoch": 17.054542187078802, "grad_norm": 6.998322486877441, "learning_rate": 3.69688981209176e-05, "loss": 0.4092, "step": 379600}, {"epoch": 17.063527720370203, "grad_norm": 1.5596981048583984, "learning_rate": 3.6956504411032165e-05, "loss": 0.4072, "step": 379800}, {"epoch": 17.072513253661604, "grad_norm": 11.192583084106445, "learning_rate": 3.694410689000087e-05, "loss": 0.3701, "step": 380000}, {"epoch": 17.072513253661604, "eval_loss": 3.847810745239258, "eval_runtime": 1083.6619, "eval_samples_per_second": 9.139, "eval_steps_per_second": 0.036, "step": 380000}, {"epoch": 17.081498786953006, "grad_norm": 21.050588607788086, "learning_rate": 3.693170556177542e-05, "loss": 0.3933, "step": 380200}, {"epoch": 17.090484320244407, "grad_norm": 6.3362016677856445, "learning_rate": 3.691930043030877e-05, "loss": 0.3821, "step": 380400}, {"epoch": 17.09946985353581, "grad_norm": 7.509994029998779, "learning_rate": 3.6906891499555054e-05, "loss": 0.3792, "step": 380600}, {"epoch": 17.10845538682721, "grad_norm": 13.802506446838379, "learning_rate": 3.6894478773469624e-05, "loss": 0.3725, "step": 380800}, {"epoch": 17.117440920118607, "grad_norm": 9.925665855407715, "learning_rate": 3.688206225600904e-05, "loss": 0.3727, "step": 381000}, {"epoch": 17.117440920118607, "eval_loss": 3.851689100265503, "eval_runtime": 1083.8981, "eval_samples_per_second": 9.137, "eval_steps_per_second": 0.036, "step": 381000}, {"epoch": 17.12642645341001, "grad_norm": 0.7609677910804749, "learning_rate": 3.68696419511311e-05, "loss": 0.3871, "step": 381200}, {"epoch": 17.13541198670141, "grad_norm": 11.126961708068848, "learning_rate": 3.685721786279478e-05, "loss": 0.4077, "step": 381400}, {"epoch": 17.14439751999281, "grad_norm": 5.107800006866455, "learning_rate": 3.684478999496026e-05, "loss": 0.4096, "step": 381600}, {"epoch": 17.153383053284212, "grad_norm": 4.639297008514404, 
"learning_rate": 3.6832358351588945e-05, "loss": 0.3921, "step": 381800}, {"epoch": 17.162368586575614, "grad_norm": 5.009506702423096, "learning_rate": 3.681992293664341e-05, "loss": 0.3988, "step": 382000}, {"epoch": 17.162368586575614, "eval_loss": 3.8172054290771484, "eval_runtime": 1088.2423, "eval_samples_per_second": 9.101, "eval_steps_per_second": 0.036, "step": 382000}, {"epoch": 17.171354119867015, "grad_norm": 2.0426735877990723, "learning_rate": 3.6807483754087476e-05, "loss": 0.3995, "step": 382200}, {"epoch": 17.180339653158416, "grad_norm": 0.8747676014900208, "learning_rate": 3.679504080788614e-05, "loss": 0.3465, "step": 382400}, {"epoch": 17.189325186449818, "grad_norm": 9.304901123046875, "learning_rate": 3.678259410200558e-05, "loss": 0.3792, "step": 382600}, {"epoch": 17.198310719741215, "grad_norm": 5.541252136230469, "learning_rate": 3.677014364041323e-05, "loss": 0.3944, "step": 382800}, {"epoch": 17.207296253032617, "grad_norm": 7.812130451202393, "learning_rate": 3.675768942707767e-05, "loss": 0.4363, "step": 383000}, {"epoch": 17.207296253032617, "eval_loss": 3.8186628818511963, "eval_runtime": 1085.5035, "eval_samples_per_second": 9.124, "eval_steps_per_second": 0.036, "step": 383000}, {"epoch": 17.216281786324018, "grad_norm": 8.80836296081543, "learning_rate": 3.6745231465968674e-05, "loss": 0.3704, "step": 383200}, {"epoch": 17.22526731961542, "grad_norm": 2.294656276702881, "learning_rate": 3.673276976105724e-05, "loss": 0.3851, "step": 383400}, {"epoch": 17.23425285290682, "grad_norm": 0.8409772515296936, "learning_rate": 3.6720304316315556e-05, "loss": 0.365, "step": 383600}, {"epoch": 17.24323838619822, "grad_norm": 7.286799430847168, "learning_rate": 3.670783513571698e-05, "loss": 0.3604, "step": 383800}, {"epoch": 17.252223919489623, "grad_norm": 11.555950164794922, "learning_rate": 3.6695362223236086e-05, "loss": 0.3812, "step": 384000}, {"epoch": 17.252223919489623, "eval_loss": 3.913374185562134, "eval_runtime": 1084.6125, "eval_samples_per_second": 9.131, "eval_steps_per_second": 0.036, "step": 384000}, {"epoch": 17.261209452781024, "grad_norm": 2.9781994819641113, "learning_rate": 3.668288558284861e-05, "loss": 0.3923, "step": 384200}, {"epoch": 17.270194986072422, "grad_norm": 7.835712432861328, "learning_rate": 3.66704052185315e-05, "loss": 0.4073, "step": 384400}, {"epoch": 17.279180519363823, "grad_norm": 9.055235862731934, "learning_rate": 3.6657921134262885e-05, "loss": 0.382, "step": 384600}, {"epoch": 17.288166052655225, "grad_norm": 27.968557357788086, "learning_rate": 3.664543333402207e-05, "loss": 0.4148, "step": 384800}, {"epoch": 17.297151585946626, "grad_norm": 12.404014587402344, "learning_rate": 3.663294182178956e-05, "loss": 0.3557, "step": 385000}, {"epoch": 17.297151585946626, "eval_loss": 3.8852949142456055, "eval_runtime": 1086.2089, "eval_samples_per_second": 9.118, "eval_steps_per_second": 0.036, "step": 385000}, {"epoch": 17.306137119238027, "grad_norm": 10.516440391540527, "learning_rate": 3.662044660154703e-05, "loss": 0.4145, "step": 385200}, {"epoch": 17.31512265252943, "grad_norm": 2.42533278465271, "learning_rate": 3.660794767727735e-05, "loss": 0.3952, "step": 385400}, {"epoch": 17.32410818582083, "grad_norm": 1.5313594341278076, "learning_rate": 3.659544505296456e-05, "loss": 0.3634, "step": 385600}, {"epoch": 17.33309371911223, "grad_norm": 6.5009765625, "learning_rate": 3.6582938732593865e-05, "loss": 0.4266, "step": 385800}, {"epoch": 17.34207925240363, "grad_norm": 7.348703384399414, "learning_rate": 
3.657042872015168e-05, "loss": 0.4209, "step": 386000}, {"epoch": 17.34207925240363, "eval_loss": 3.80428147315979, "eval_runtime": 1088.4654, "eval_samples_per_second": 9.099, "eval_steps_per_second": 0.036, "step": 386000}, {"epoch": 17.35106478569503, "grad_norm": 5.27815580368042, "learning_rate": 3.655791501962559e-05, "loss": 0.3811, "step": 386200}, {"epoch": 17.36005031898643, "grad_norm": 10.278822898864746, "learning_rate": 3.654539763500433e-05, "loss": 0.3897, "step": 386400}, {"epoch": 17.369035852277833, "grad_norm": 7.166937351226807, "learning_rate": 3.653287657027783e-05, "loss": 0.4025, "step": 386600}, {"epoch": 17.378021385569234, "grad_norm": 15.087567329406738, "learning_rate": 3.652035182943721e-05, "loss": 0.333, "step": 386800}, {"epoch": 17.387006918860635, "grad_norm": 18.905258178710938, "learning_rate": 3.6507823416474715e-05, "loss": 0.3743, "step": 387000}, {"epoch": 17.387006918860635, "eval_loss": 3.854860782623291, "eval_runtime": 1149.6352, "eval_samples_per_second": 8.615, "eval_steps_per_second": 0.034, "step": 387000}, {"epoch": 17.395992452152036, "grad_norm": 14.928525924682617, "learning_rate": 3.6495291335383805e-05, "loss": 0.4021, "step": 387200}, {"epoch": 17.404977985443438, "grad_norm": 3.540318012237549, "learning_rate": 3.648275559015909e-05, "loss": 0.4007, "step": 387400}, {"epoch": 17.413963518734835, "grad_norm": 1.0011667013168335, "learning_rate": 3.647021618479634e-05, "loss": 0.3821, "step": 387600}, {"epoch": 17.422949052026237, "grad_norm": 9.072355270385742, "learning_rate": 3.6457673123292504e-05, "loss": 0.4013, "step": 387800}, {"epoch": 17.431934585317638, "grad_norm": 5.886098861694336, "learning_rate": 3.644512640964569e-05, "loss": 0.3763, "step": 388000}, {"epoch": 17.431934585317638, "eval_loss": 3.810971260070801, "eval_runtime": 1130.6573, "eval_samples_per_second": 8.76, "eval_steps_per_second": 0.034, "step": 388000}, {"epoch": 17.44092011860904, "grad_norm": 7.5825514793396, "learning_rate": 3.643257604785518e-05, "loss": 0.4158, "step": 388200}, {"epoch": 17.44990565190044, "grad_norm": 4.319643020629883, "learning_rate": 3.642002204192142e-05, "loss": 0.3819, "step": 388400}, {"epoch": 17.458891185191842, "grad_norm": 12.306256294250488, "learning_rate": 3.6407464395845996e-05, "loss": 0.4156, "step": 388600}, {"epoch": 17.467876718483243, "grad_norm": 22.988723754882812, "learning_rate": 3.639490311363167e-05, "loss": 0.4123, "step": 388800}, {"epoch": 17.476862251774644, "grad_norm": 7.2487359046936035, "learning_rate": 3.638233819928237e-05, "loss": 0.4258, "step": 389000}, {"epoch": 17.476862251774644, "eval_loss": 3.8038196563720703, "eval_runtime": 1126.3212, "eval_samples_per_second": 8.793, "eval_steps_per_second": 0.035, "step": 389000}, {"epoch": 17.485847785066042, "grad_norm": 13.96484088897705, "learning_rate": 3.6369769656803165e-05, "loss": 0.3725, "step": 389200}, {"epoch": 17.494833318357443, "grad_norm": 6.461380958557129, "learning_rate": 3.63571974902003e-05, "loss": 0.4061, "step": 389400}, {"epoch": 17.503818851648845, "grad_norm": 8.86327075958252, "learning_rate": 3.6344621703481146e-05, "loss": 0.3814, "step": 389600}, {"epoch": 17.512804384940246, "grad_norm": 1.6969479322433472, "learning_rate": 3.6332042300654255e-05, "loss": 0.3937, "step": 389800}, {"epoch": 17.521789918231647, "grad_norm": 6.137419700622559, "learning_rate": 3.631945928572932e-05, "loss": 0.3711, "step": 390000}, {"epoch": 17.521789918231647, "eval_loss": 3.819227457046509, "eval_runtime": 1126.304, 
"eval_samples_per_second": 8.793, "eval_steps_per_second": 0.035, "step": 390000}, {"epoch": 17.53077545152305, "grad_norm": 13.840421676635742, "learning_rate": 3.6306872662717195e-05, "loss": 0.4058, "step": 390200}, {"epoch": 17.53976098481445, "grad_norm": 9.404634475708008, "learning_rate": 3.6294282435629865e-05, "loss": 0.425, "step": 390400}, {"epoch": 17.54874651810585, "grad_norm": 13.545289993286133, "learning_rate": 3.6281688608480486e-05, "loss": 0.3879, "step": 390600}, {"epoch": 17.55773205139725, "grad_norm": 10.073009490966797, "learning_rate": 3.6269091185283345e-05, "loss": 0.4131, "step": 390800}, {"epoch": 17.56671758468865, "grad_norm": 4.1348676681518555, "learning_rate": 3.6256490170053885e-05, "loss": 0.4094, "step": 391000}, {"epoch": 17.56671758468865, "eval_loss": 3.8144443035125732, "eval_runtime": 1125.7795, "eval_samples_per_second": 8.797, "eval_steps_per_second": 0.035, "step": 391000}, {"epoch": 17.57570311798005, "grad_norm": 12.360026359558105, "learning_rate": 3.624388556680869e-05, "loss": 0.3895, "step": 391200}, {"epoch": 17.584688651271453, "grad_norm": 3.9698164463043213, "learning_rate": 3.6231277379565476e-05, "loss": 0.4149, "step": 391400}, {"epoch": 17.593674184562854, "grad_norm": 13.396862030029297, "learning_rate": 3.621866561234314e-05, "loss": 0.3643, "step": 391600}, {"epoch": 17.602659717854255, "grad_norm": 5.373486518859863, "learning_rate": 3.620605026916166e-05, "loss": 0.4009, "step": 391800}, {"epoch": 17.611645251145656, "grad_norm": 5.472818374633789, "learning_rate": 3.619343135404221e-05, "loss": 0.401, "step": 392000}, {"epoch": 17.611645251145656, "eval_loss": 3.7937300205230713, "eval_runtime": 1126.5045, "eval_samples_per_second": 8.792, "eval_steps_per_second": 0.035, "step": 392000}, {"epoch": 17.620630784437058, "grad_norm": 11.465763092041016, "learning_rate": 3.6180808871007076e-05, "loss": 0.3799, "step": 392200}, {"epoch": 17.629616317728455, "grad_norm": 1.5130301713943481, "learning_rate": 3.6168182824079684e-05, "loss": 0.3873, "step": 392400}, {"epoch": 17.638601851019857, "grad_norm": 4.5390143394470215, "learning_rate": 3.61555532172846e-05, "loss": 0.4056, "step": 392600}, {"epoch": 17.647587384311258, "grad_norm": 5.865408897399902, "learning_rate": 3.6142920054647514e-05, "loss": 0.4667, "step": 392800}, {"epoch": 17.65657291760266, "grad_norm": 11.054267883300781, "learning_rate": 3.613028334019526e-05, "loss": 0.4056, "step": 393000}, {"epoch": 17.65657291760266, "eval_loss": 3.8446738719940186, "eval_runtime": 1128.0658, "eval_samples_per_second": 8.78, "eval_steps_per_second": 0.035, "step": 393000}, {"epoch": 17.66555845089406, "grad_norm": 1.73776376247406, "learning_rate": 3.6117643077955795e-05, "loss": 0.3956, "step": 393200}, {"epoch": 17.674543984185462, "grad_norm": 8.85155200958252, "learning_rate": 3.610499927195823e-05, "loss": 0.4032, "step": 393400}, {"epoch": 17.683529517476863, "grad_norm": 0.8997072577476501, "learning_rate": 3.6092351926232784e-05, "loss": 0.4166, "step": 393600}, {"epoch": 17.692515050768264, "grad_norm": 5.855953216552734, "learning_rate": 3.6079701044810796e-05, "loss": 0.3818, "step": 393800}, {"epoch": 17.701500584059666, "grad_norm": 5.543238162994385, "learning_rate": 3.606704663172476e-05, "loss": 0.3927, "step": 394000}, {"epoch": 17.701500584059666, "eval_loss": 3.8253390789031982, "eval_runtime": 1130.3479, "eval_samples_per_second": 8.762, "eval_steps_per_second": 0.035, "step": 394000}, {"epoch": 17.710486117351063, "grad_norm": 9.299339294433594, 
"learning_rate": 3.6054388691008264e-05, "loss": 0.3598, "step": 394200}, {"epoch": 17.719471650642465, "grad_norm": 16.317785263061523, "learning_rate": 3.604172722669607e-05, "loss": 0.3629, "step": 394400}, {"epoch": 17.728457183933866, "grad_norm": 11.917454719543457, "learning_rate": 3.602906224282398e-05, "loss": 0.4213, "step": 394600}, {"epoch": 17.737442717225267, "grad_norm": 6.563929080963135, "learning_rate": 3.6016393743429024e-05, "loss": 0.3994, "step": 394800}, {"epoch": 17.74642825051667, "grad_norm": 8.417221069335938, "learning_rate": 3.6003721732549254e-05, "loss": 0.3833, "step": 395000}, {"epoch": 17.74642825051667, "eval_loss": 3.8368141651153564, "eval_runtime": 1125.9952, "eval_samples_per_second": 8.796, "eval_steps_per_second": 0.035, "step": 395000}, {"epoch": 17.75541378380807, "grad_norm": 18.441783905029297, "learning_rate": 3.59910462142239e-05, "loss": 0.3396, "step": 395200}, {"epoch": 17.76439931709947, "grad_norm": 13.164015769958496, "learning_rate": 3.59783671924933e-05, "loss": 0.4187, "step": 395400}, {"epoch": 17.77338485039087, "grad_norm": 14.248663902282715, "learning_rate": 3.59656846713989e-05, "loss": 0.4077, "step": 395600}, {"epoch": 17.78237038368227, "grad_norm": 11.191965103149414, "learning_rate": 3.595299865498325e-05, "loss": 0.3516, "step": 395800}, {"epoch": 17.79135591697367, "grad_norm": 1.773537039756775, "learning_rate": 3.594030914729005e-05, "loss": 0.3653, "step": 396000}, {"epoch": 17.79135591697367, "eval_loss": 3.8245689868927, "eval_runtime": 1126.8022, "eval_samples_per_second": 8.789, "eval_steps_per_second": 0.035, "step": 396000}, {"epoch": 17.800341450265073, "grad_norm": 3.224982261657715, "learning_rate": 3.592761615236407e-05, "loss": 0.3715, "step": 396200}, {"epoch": 17.809326983556474, "grad_norm": 11.764269828796387, "learning_rate": 3.591491967425123e-05, "loss": 0.4247, "step": 396400}, {"epoch": 17.818312516847875, "grad_norm": 28.149105072021484, "learning_rate": 3.5902219716998545e-05, "loss": 0.4073, "step": 396600}, {"epoch": 17.827298050139277, "grad_norm": 5.350660800933838, "learning_rate": 3.5889516284654115e-05, "loss": 0.4157, "step": 396800}, {"epoch": 17.836283583430678, "grad_norm": 3.0195703506469727, "learning_rate": 3.587680938126719e-05, "loss": 0.4154, "step": 397000}, {"epoch": 17.836283583430678, "eval_loss": 3.830150842666626, "eval_runtime": 1126.5253, "eval_samples_per_second": 8.792, "eval_steps_per_second": 0.035, "step": 397000}, {"epoch": 17.84526911672208, "grad_norm": 16.077167510986328, "learning_rate": 3.58640990108881e-05, "loss": 0.3934, "step": 397200}, {"epoch": 17.854254650013477, "grad_norm": 7.119049072265625, "learning_rate": 3.5851385177568287e-05, "loss": 0.3933, "step": 397400}, {"epoch": 17.863240183304878, "grad_norm": 4.785800933837891, "learning_rate": 3.583866788536029e-05, "loss": 0.4054, "step": 397600}, {"epoch": 17.87222571659628, "grad_norm": 15.827156066894531, "learning_rate": 3.582594713831777e-05, "loss": 0.3705, "step": 397800}, {"epoch": 17.88121124988768, "grad_norm": 8.269429206848145, "learning_rate": 3.581322294049546e-05, "loss": 0.3958, "step": 398000}, {"epoch": 17.88121124988768, "eval_loss": 3.8027560710906982, "eval_runtime": 1224.91, "eval_samples_per_second": 8.085, "eval_steps_per_second": 0.032, "step": 398000}, {"epoch": 17.890196783179082, "grad_norm": 8.487425804138184, "learning_rate": 3.580049529594922e-05, "loss": 0.3931, "step": 398200}, {"epoch": 17.899182316470483, "grad_norm": 18.79955291748047, "learning_rate": 
3.5787764208736e-05, "loss": 0.4494, "step": 398400}, {"epoch": 17.908167849761885, "grad_norm": 12.001044273376465, "learning_rate": 3.577502968291383e-05, "loss": 0.4309, "step": 398600}, {"epoch": 17.917153383053286, "grad_norm": 5.9302873611450195, "learning_rate": 3.576229172254186e-05, "loss": 0.415, "step": 398800}, {"epoch": 17.926138916344684, "grad_norm": 6.8387346267700195, "learning_rate": 3.574955033168033e-05, "loss": 0.392, "step": 399000}, {"epoch": 17.926138916344684, "eval_loss": 3.784846544265747, "eval_runtime": 1204.623, "eval_samples_per_second": 8.222, "eval_steps_per_second": 0.032, "step": 399000}, {"epoch": 17.935124449636085, "grad_norm": 3.8658130168914795, "learning_rate": 3.573680551439056e-05, "loss": 0.382, "step": 399200}, {"epoch": 17.944109982927486, "grad_norm": 2.803126573562622, "learning_rate": 3.572405727473498e-05, "loss": 0.3711, "step": 399400}, {"epoch": 17.953095516218887, "grad_norm": 0.6691089272499084, "learning_rate": 3.5711305616777095e-05, "loss": 0.3527, "step": 399600}, {"epoch": 17.96208104951029, "grad_norm": 5.192505836486816, "learning_rate": 3.569855054458151e-05, "loss": 0.4064, "step": 399800}, {"epoch": 17.97106658280169, "grad_norm": 10.876336097717285, "learning_rate": 3.568579206221392e-05, "loss": 0.4061, "step": 400000}, {"epoch": 17.97106658280169, "eval_loss": 3.802236557006836, "eval_runtime": 1204.5349, "eval_samples_per_second": 8.222, "eval_steps_per_second": 0.032, "step": 400000}, {"epoch": 17.98005211609309, "grad_norm": 10.837194442749023, "learning_rate": 3.5673030173741085e-05, "loss": 0.3892, "step": 400200}, {"epoch": 17.989037649384493, "grad_norm": 19.335147857666016, "learning_rate": 3.566026488323089e-05, "loss": 0.4285, "step": 400400}, {"epoch": 17.99802318267589, "grad_norm": 7.6052470207214355, "learning_rate": 3.5647496194752264e-05, "loss": 0.4123, "step": 400600}, {"epoch": 18.00700871596729, "grad_norm": 1.3463623523712158, "learning_rate": 3.5634724112375236e-05, "loss": 0.3767, "step": 400800}, {"epoch": 18.015994249258693, "grad_norm": 6.778363227844238, "learning_rate": 3.5621948640170944e-05, "loss": 0.3737, "step": 401000}, {"epoch": 18.015994249258693, "eval_loss": 3.854170083999634, "eval_runtime": 1204.5669, "eval_samples_per_second": 8.222, "eval_steps_per_second": 0.032, "step": 401000}, {"epoch": 18.024979782550094, "grad_norm": 6.250158309936523, "learning_rate": 3.560916978221156e-05, "loss": 0.3642, "step": 401200}, {"epoch": 18.033965315841495, "grad_norm": 12.505826950073242, "learning_rate": 3.559638754257035e-05, "loss": 0.3701, "step": 401400}, {"epoch": 18.042950849132897, "grad_norm": 18.78114891052246, "learning_rate": 3.558360192532168e-05, "loss": 0.3628, "step": 401600}, {"epoch": 18.051936382424298, "grad_norm": 2.8729214668273926, "learning_rate": 3.557081293454097e-05, "loss": 0.3777, "step": 401800}, {"epoch": 18.0609219157157, "grad_norm": 8.019610404968262, "learning_rate": 3.555802057430471e-05, "loss": 0.3402, "step": 402000}, {"epoch": 18.0609219157157, "eval_loss": 3.8658034801483154, "eval_runtime": 1205.5991, "eval_samples_per_second": 8.215, "eval_steps_per_second": 0.032, "step": 402000}, {"epoch": 18.069907449007097, "grad_norm": 0.7817026376724243, "learning_rate": 3.5545224848690495e-05, "loss": 0.3799, "step": 402200}, {"epoch": 18.0788929822985, "grad_norm": 5.083946704864502, "learning_rate": 3.553242576177697e-05, "loss": 0.3577, "step": 402400}, {"epoch": 18.0878785155899, "grad_norm": 7.09104061126709, "learning_rate": 3.5519623317643834e-05, "loss": 
0.3819, "step": 402600}, {"epoch": 18.0968640488813, "grad_norm": 8.251867294311523, "learning_rate": 3.55068175203719e-05, "loss": 0.3898, "step": 402800}, {"epoch": 18.105849582172702, "grad_norm": 29.634862899780273, "learning_rate": 3.549400837404302e-05, "loss": 0.3648, "step": 403000}, {"epoch": 18.105849582172702, "eval_loss": 3.867095947265625, "eval_runtime": 1203.7886, "eval_samples_per_second": 8.227, "eval_steps_per_second": 0.032, "step": 403000}, {"epoch": 18.114835115464103, "grad_norm": 8.83678913116455, "learning_rate": 3.548119588274012e-05, "loss": 0.3644, "step": 403200}, {"epoch": 18.123820648755505, "grad_norm": 3.9877867698669434, "learning_rate": 3.5468380050547185e-05, "loss": 0.3518, "step": 403400}, {"epoch": 18.132806182046906, "grad_norm": 12.110077857971191, "learning_rate": 3.545556088154928e-05, "loss": 0.4015, "step": 403600}, {"epoch": 18.141791715338304, "grad_norm": 20.395000457763672, "learning_rate": 3.544273837983253e-05, "loss": 0.356, "step": 403800}, {"epoch": 18.150777248629705, "grad_norm": 7.915891170501709, "learning_rate": 3.5429912549484114e-05, "loss": 0.3513, "step": 404000}, {"epoch": 18.150777248629705, "eval_loss": 3.825883626937866, "eval_runtime": 1205.146, "eval_samples_per_second": 8.218, "eval_steps_per_second": 0.032, "step": 404000}, {"epoch": 18.159762781921106, "grad_norm": 2.465219736099243, "learning_rate": 3.541708339459227e-05, "loss": 0.3469, "step": 404200}, {"epoch": 18.168748315212508, "grad_norm": 16.333881378173828, "learning_rate": 3.54042509192463e-05, "loss": 0.3947, "step": 404400}, {"epoch": 18.17773384850391, "grad_norm": 6.627115249633789, "learning_rate": 3.539141512753658e-05, "loss": 0.4071, "step": 404600}, {"epoch": 18.18671938179531, "grad_norm": 9.679762840270996, "learning_rate": 3.5378576023554524e-05, "loss": 0.382, "step": 404800}, {"epoch": 18.19570491508671, "grad_norm": 4.362650394439697, "learning_rate": 3.536573361139261e-05, "loss": 0.3896, "step": 405000}, {"epoch": 18.19570491508671, "eval_loss": 3.831510543823242, "eval_runtime": 1203.2249, "eval_samples_per_second": 8.231, "eval_steps_per_second": 0.032, "step": 405000}, {"epoch": 18.204690448378113, "grad_norm": 3.280683994293213, "learning_rate": 3.5352887895144354e-05, "loss": 0.3867, "step": 405200}, {"epoch": 18.21367598166951, "grad_norm": 25.597644805908203, "learning_rate": 3.534003887890435e-05, "loss": 0.3474, "step": 405400}, {"epoch": 18.22266151496091, "grad_norm": 15.584162712097168, "learning_rate": 3.532718656676824e-05, "loss": 0.377, "step": 405600}, {"epoch": 18.231647048252313, "grad_norm": 5.3182053565979, "learning_rate": 3.5314330962832696e-05, "loss": 0.3463, "step": 405800}, {"epoch": 18.240632581543714, "grad_norm": 3.7088468074798584, "learning_rate": 3.5301472071195454e-05, "loss": 0.3678, "step": 406000}, {"epoch": 18.240632581543714, "eval_loss": 3.8044979572296143, "eval_runtime": 1210.8568, "eval_samples_per_second": 8.179, "eval_steps_per_second": 0.032, "step": 406000}, {"epoch": 18.249618114835116, "grad_norm": 7.514823913574219, "learning_rate": 3.5288609895955304e-05, "loss": 0.357, "step": 406200}, {"epoch": 18.258603648126517, "grad_norm": 2.4954440593719482, "learning_rate": 3.527574444121207e-05, "loss": 0.3982, "step": 406400}, {"epoch": 18.267589181417918, "grad_norm": 3.856297016143799, "learning_rate": 3.5262875711066625e-05, "loss": 0.3921, "step": 406600}, {"epoch": 18.27657471470932, "grad_norm": 3.8277928829193115, "learning_rate": 3.525000370962089e-05, "loss": 0.387, "step": 406800}, 
{"epoch": 18.285560248000717, "grad_norm": 1.290062665939331, "learning_rate": 3.523712844097783e-05, "loss": 0.3554, "step": 407000}, {"epoch": 18.285560248000717, "eval_loss": 3.9154751300811768, "eval_runtime": 1217.0508, "eval_samples_per_second": 8.138, "eval_steps_per_second": 0.032, "step": 407000}, {"epoch": 18.29454578129212, "grad_norm": 8.983039855957031, "learning_rate": 3.522424990924145e-05, "loss": 0.3989, "step": 407200}, {"epoch": 18.30353131458352, "grad_norm": 15.448911666870117, "learning_rate": 3.5211368118516774e-05, "loss": 0.395, "step": 407400}, {"epoch": 18.31251684787492, "grad_norm": 6.722110271453857, "learning_rate": 3.51984830729099e-05, "loss": 0.3846, "step": 407600}, {"epoch": 18.321502381166322, "grad_norm": 5.694580554962158, "learning_rate": 3.5185594776527945e-05, "loss": 0.3845, "step": 407800}, {"epoch": 18.330487914457724, "grad_norm": 4.475128173828125, "learning_rate": 3.517270323347907e-05, "loss": 0.4102, "step": 408000}, {"epoch": 18.330487914457724, "eval_loss": 3.8598849773406982, "eval_runtime": 1097.2151, "eval_samples_per_second": 9.026, "eval_steps_per_second": 0.036, "step": 408000}, {"epoch": 18.339473447749125, "grad_norm": 7.8763933181762695, "learning_rate": 3.5159808447872456e-05, "loss": 0.3745, "step": 408200}, {"epoch": 18.348458981040526, "grad_norm": 35.217857360839844, "learning_rate": 3.5146910423818324e-05, "loss": 0.3821, "step": 408400}, {"epoch": 18.357444514331924, "grad_norm": 7.480992794036865, "learning_rate": 3.513400916542793e-05, "loss": 0.3777, "step": 408600}, {"epoch": 18.366430047623325, "grad_norm": 1.083188772201538, "learning_rate": 3.5121104676813575e-05, "loss": 0.353, "step": 408800}, {"epoch": 18.375415580914726, "grad_norm": 5.977663040161133, "learning_rate": 3.510819696208857e-05, "loss": 0.3875, "step": 409000}, {"epoch": 18.375415580914726, "eval_loss": 3.8312559127807617, "eval_runtime": 1097.7017, "eval_samples_per_second": 9.022, "eval_steps_per_second": 0.036, "step": 409000}, {"epoch": 18.384401114206128, "grad_norm": 5.178797721862793, "learning_rate": 3.509528602536725e-05, "loss": 0.3846, "step": 409200}, {"epoch": 18.39338664749753, "grad_norm": 0.88429194688797, "learning_rate": 3.5082371870764997e-05, "loss": 0.3766, "step": 409400}, {"epoch": 18.40237218078893, "grad_norm": 1.1388074159622192, "learning_rate": 3.50694545023982e-05, "loss": 0.4182, "step": 409600}, {"epoch": 18.41135771408033, "grad_norm": 10.69584846496582, "learning_rate": 3.50565339243843e-05, "loss": 0.3962, "step": 409800}, {"epoch": 18.420343247371733, "grad_norm": 3.2189548015594482, "learning_rate": 3.5043610140841716e-05, "loss": 0.3745, "step": 410000}, {"epoch": 18.420343247371733, "eval_loss": 3.84757399559021, "eval_runtime": 1096.3132, "eval_samples_per_second": 9.034, "eval_steps_per_second": 0.036, "step": 410000}, {"epoch": 18.429328780663134, "grad_norm": 4.857696056365967, "learning_rate": 3.503068315588993e-05, "loss": 0.3714, "step": 410200}, {"epoch": 18.438314313954532, "grad_norm": 22.0413875579834, "learning_rate": 3.501775297364943e-05, "loss": 0.3584, "step": 410400}, {"epoch": 18.447299847245933, "grad_norm": 12.368648529052734, "learning_rate": 3.5004819598241725e-05, "loss": 0.3731, "step": 410600}, {"epoch": 18.456285380537334, "grad_norm": 7.075397968292236, "learning_rate": 3.4991883033789316e-05, "loss": 0.3521, "step": 410800}, {"epoch": 18.465270913828736, "grad_norm": 10.172215461730957, "learning_rate": 3.4978943284415784e-05, "loss": 0.3916, "step": 411000}, {"epoch": 
18.465270913828736, "eval_loss": 3.8483147621154785, "eval_runtime": 1094.622, "eval_samples_per_second": 9.048, "eval_steps_per_second": 0.036, "step": 411000}, {"epoch": 18.474256447120137, "grad_norm": 5.510894775390625, "learning_rate": 3.496600035424565e-05, "loss": 0.3889, "step": 411200}, {"epoch": 18.483241980411538, "grad_norm": 7.840881824493408, "learning_rate": 3.495305424740449e-05, "loss": 0.3941, "step": 411400}, {"epoch": 18.49222751370294, "grad_norm": 2.5886456966400146, "learning_rate": 3.4940104968018904e-05, "loss": 0.3836, "step": 411600}, {"epoch": 18.50121304699434, "grad_norm": 7.37034273147583, "learning_rate": 3.4927152520216474e-05, "loss": 0.3475, "step": 411800}, {"epoch": 18.51019858028574, "grad_norm": 6.969428062438965, "learning_rate": 3.49141969081258e-05, "loss": 0.3713, "step": 412000}, {"epoch": 18.51019858028574, "eval_loss": 3.88724684715271, "eval_runtime": 1095.691, "eval_samples_per_second": 9.039, "eval_steps_per_second": 0.036, "step": 412000}, {"epoch": 18.51918411357714, "grad_norm": 10.182289123535156, "learning_rate": 3.49012381358765e-05, "loss": 0.3692, "step": 412200}, {"epoch": 18.52816964686854, "grad_norm": 11.804682731628418, "learning_rate": 3.4888276207599194e-05, "loss": 0.3947, "step": 412400}, {"epoch": 18.537155180159942, "grad_norm": 12.905986785888672, "learning_rate": 3.48753111274255e-05, "loss": 0.3867, "step": 412600}, {"epoch": 18.546140713451344, "grad_norm": 3.650761842727661, "learning_rate": 3.4862342899488066e-05, "loss": 0.3821, "step": 412800}, {"epoch": 18.555126246742745, "grad_norm": 14.769987106323242, "learning_rate": 3.484937152792051e-05, "loss": 0.3525, "step": 413000}, {"epoch": 18.555126246742745, "eval_loss": 3.798630475997925, "eval_runtime": 1096.711, "eval_samples_per_second": 9.031, "eval_steps_per_second": 0.036, "step": 413000}, {"epoch": 18.564111780034146, "grad_norm": 12.465880393981934, "learning_rate": 3.483639701685746e-05, "loss": 0.3876, "step": 413200}, {"epoch": 18.573097313325547, "grad_norm": 19.23861312866211, "learning_rate": 3.4823419370434574e-05, "loss": 0.3585, "step": 413400}, {"epoch": 18.582082846616945, "grad_norm": 2.4888880252838135, "learning_rate": 3.481043859278847e-05, "loss": 0.3783, "step": 413600}, {"epoch": 18.591068379908346, "grad_norm": 12.582083702087402, "learning_rate": 3.4797454688056804e-05, "loss": 0.3861, "step": 413800}, {"epoch": 18.600053913199748, "grad_norm": 0.991515576839447, "learning_rate": 3.4784467660378174e-05, "loss": 0.4015, "step": 414000}, {"epoch": 18.600053913199748, "eval_loss": 3.845909833908081, "eval_runtime": 1096.418, "eval_samples_per_second": 9.033, "eval_steps_per_second": 0.036, "step": 414000}, {"epoch": 18.60903944649115, "grad_norm": 0.9095927476882935, "learning_rate": 3.4771477513892234e-05, "loss": 0.357, "step": 414200}, {"epoch": 18.61802497978255, "grad_norm": 8.816062927246094, "learning_rate": 3.47584842527396e-05, "loss": 0.3994, "step": 414400}, {"epoch": 18.62701051307395, "grad_norm": 12.012443542480469, "learning_rate": 3.4745487881061865e-05, "loss": 0.39, "step": 414600}, {"epoch": 18.635996046365353, "grad_norm": 31.449888229370117, "learning_rate": 3.473248840300165e-05, "loss": 0.357, "step": 414800}, {"epoch": 18.644981579656754, "grad_norm": 4.814366817474365, "learning_rate": 3.471948582270256e-05, "loss": 0.3608, "step": 415000}, {"epoch": 18.644981579656754, "eval_loss": 3.8544886112213135, "eval_runtime": 1097.8947, "eval_samples_per_second": 9.021, "eval_steps_per_second": 0.036, "step": 415000}, 
{"epoch": 18.653967112948152, "grad_norm": 3.825913429260254, "learning_rate": 3.470648014430915e-05, "loss": 0.3929, "step": 415200}, {"epoch": 18.662952646239553, "grad_norm": 12.636445045471191, "learning_rate": 3.4693471371967014e-05, "loss": 0.3701, "step": 415400}, {"epoch": 18.671938179530954, "grad_norm": 9.792268753051758, "learning_rate": 3.4680459509822696e-05, "loss": 0.4264, "step": 415600}, {"epoch": 18.680923712822356, "grad_norm": 2.876805305480957, "learning_rate": 3.466744456202375e-05, "loss": 0.4097, "step": 415800}, {"epoch": 18.689909246113757, "grad_norm": 3.836838722229004, "learning_rate": 3.4654426532718695e-05, "loss": 0.4236, "step": 416000}, {"epoch": 18.689909246113757, "eval_loss": 3.790830135345459, "eval_runtime": 1100.7008, "eval_samples_per_second": 8.998, "eval_steps_per_second": 0.035, "step": 416000}, {"epoch": 18.69889477940516, "grad_norm": 11.140382766723633, "learning_rate": 3.4641405426057034e-05, "loss": 0.388, "step": 416200}, {"epoch": 18.70788031269656, "grad_norm": 6.716423034667969, "learning_rate": 3.462838124618926e-05, "loss": 0.366, "step": 416400}, {"epoch": 18.71686584598796, "grad_norm": 5.123216152191162, "learning_rate": 3.461535399726685e-05, "loss": 0.4019, "step": 416600}, {"epoch": 18.72585137927936, "grad_norm": 0.5618104338645935, "learning_rate": 3.460232368344224e-05, "loss": 0.3711, "step": 416800}, {"epoch": 18.73483691257076, "grad_norm": 3.904057264328003, "learning_rate": 3.458929030886885e-05, "loss": 0.4017, "step": 417000}, {"epoch": 18.73483691257076, "eval_loss": 3.819016695022583, "eval_runtime": 1097.3118, "eval_samples_per_second": 9.026, "eval_steps_per_second": 0.036, "step": 417000}, {"epoch": 18.74382244586216, "grad_norm": 9.883956909179688, "learning_rate": 3.457625387770109e-05, "loss": 0.3891, "step": 417200}, {"epoch": 18.752807979153562, "grad_norm": 22.456649780273438, "learning_rate": 3.456321439409432e-05, "loss": 0.4144, "step": 417400}, {"epoch": 18.761793512444964, "grad_norm": 12.037010192871094, "learning_rate": 3.455017186220491e-05, "loss": 0.3706, "step": 417600}, {"epoch": 18.770779045736365, "grad_norm": 30.236738204956055, "learning_rate": 3.4537126286190155e-05, "loss": 0.4131, "step": 417800}, {"epoch": 18.779764579027766, "grad_norm": 6.0100321769714355, "learning_rate": 3.452407767020835e-05, "loss": 0.4224, "step": 418000}, {"epoch": 18.779764579027766, "eval_loss": 3.8412423133850098, "eval_runtime": 1095.5506, "eval_samples_per_second": 9.04, "eval_steps_per_second": 0.036, "step": 418000}, {"epoch": 18.788750112319168, "grad_norm": 16.41374969482422, "learning_rate": 3.4511026018418765e-05, "loss": 0.3991, "step": 418200}, {"epoch": 18.797735645610565, "grad_norm": 15.420040130615234, "learning_rate": 3.4497971334981596e-05, "loss": 0.4127, "step": 418400}, {"epoch": 18.806721178901967, "grad_norm": 13.536659240722656, "learning_rate": 3.448491362405807e-05, "loss": 0.3659, "step": 418600}, {"epoch": 18.815706712193368, "grad_norm": 20.171710968017578, "learning_rate": 3.447185288981031e-05, "loss": 0.4017, "step": 418800}, {"epoch": 18.82469224548477, "grad_norm": 9.69514274597168, "learning_rate": 3.445878913640146e-05, "loss": 0.38, "step": 419000}, {"epoch": 18.82469224548477, "eval_loss": 3.8335611820220947, "eval_runtime": 1094.6714, "eval_samples_per_second": 9.047, "eval_steps_per_second": 0.036, "step": 419000}, {"epoch": 18.83367777877617, "grad_norm": 1.9153423309326172, "learning_rate": 3.444572236799559e-05, "loss": 0.4292, "step": 419200}, {"epoch": 
18.84266331206757, "grad_norm": 16.780864715576172, "learning_rate": 3.443265258875776e-05, "loss": 0.386, "step": 419400}, {"epoch": 18.851648845358973, "grad_norm": 7.751341819763184, "learning_rate": 3.4419579802853946e-05, "loss": 0.4026, "step": 419600}, {"epoch": 18.860634378650374, "grad_norm": 10.850844383239746, "learning_rate": 3.440650401445113e-05, "loss": 0.3684, "step": 419800}, {"epoch": 18.869619911941776, "grad_norm": 10.96944522857666, "learning_rate": 3.439342522771722e-05, "loss": 0.3631, "step": 420000}, {"epoch": 18.869619911941776, "eval_loss": 3.8032419681549072, "eval_runtime": 1188.8528, "eval_samples_per_second": 8.331, "eval_steps_per_second": 0.033, "step": 420000}, {"epoch": 18.878605445233173, "grad_norm": 61.311546325683594, "learning_rate": 3.43803434468211e-05, "loss": 0.3718, "step": 420200}, {"epoch": 18.887590978524575, "grad_norm": 0.1739572435617447, "learning_rate": 3.43672586759326e-05, "loss": 0.3735, "step": 420400}, {"epoch": 18.896576511815976, "grad_norm": 1.1089012622833252, "learning_rate": 3.4354170919222484e-05, "loss": 0.383, "step": 420600}, {"epoch": 18.905562045107377, "grad_norm": 3.8840813636779785, "learning_rate": 3.43410801808625e-05, "loss": 0.3992, "step": 420800}, {"epoch": 18.91454757839878, "grad_norm": 10.133760452270508, "learning_rate": 3.432798646502533e-05, "loss": 0.383, "step": 421000}, {"epoch": 18.91454757839878, "eval_loss": 3.857928514480591, "eval_runtime": 1170.5487, "eval_samples_per_second": 8.461, "eval_steps_per_second": 0.033, "step": 421000}, {"epoch": 18.92353311169018, "grad_norm": 12.687873840332031, "learning_rate": 3.4314889775884615e-05, "loss": 0.3884, "step": 421200}, {"epoch": 18.93251864498158, "grad_norm": 3.658750534057617, "learning_rate": 3.4301790117614906e-05, "loss": 0.372, "step": 421400}, {"epoch": 18.94150417827298, "grad_norm": 24.821044921875, "learning_rate": 3.4288687494391766e-05, "loss": 0.398, "step": 421600}, {"epoch": 18.95048971156438, "grad_norm": 1.3283342123031616, "learning_rate": 3.427558191039165e-05, "loss": 0.3814, "step": 421800}, {"epoch": 18.95947524485578, "grad_norm": 4.043994426727295, "learning_rate": 3.426247336979198e-05, "loss": 0.383, "step": 422000}, {"epoch": 18.95947524485578, "eval_loss": 3.8787529468536377, "eval_runtime": 1167.8307, "eval_samples_per_second": 8.481, "eval_steps_per_second": 0.033, "step": 422000}, {"epoch": 18.968460778147183, "grad_norm": 10.233535766601562, "learning_rate": 3.4249361876771106e-05, "loss": 0.3636, "step": 422200}, {"epoch": 18.977446311438584, "grad_norm": 7.685864448547363, "learning_rate": 3.423624743550833e-05, "loss": 0.3719, "step": 422400}, {"epoch": 18.986431844729985, "grad_norm": 4.338862895965576, "learning_rate": 3.422313005018389e-05, "loss": 0.3908, "step": 422600}, {"epoch": 18.995417378021386, "grad_norm": 6.173080921173096, "learning_rate": 3.421000972497897e-05, "loss": 0.4272, "step": 422800}, {"epoch": 19.004402911312788, "grad_norm": 9.796375274658203, "learning_rate": 3.419688646407569e-05, "loss": 0.405, "step": 423000}, {"epoch": 19.004402911312788, "eval_loss": 3.8710274696350098, "eval_runtime": 1174.2983, "eval_samples_per_second": 8.434, "eval_steps_per_second": 0.033, "step": 423000}, {"epoch": 19.01338844460419, "grad_norm": 16.157901763916016, "learning_rate": 3.418376027165708e-05, "loss": 0.3669, "step": 423200}, {"epoch": 19.022373977895587, "grad_norm": 6.099151134490967, "learning_rate": 3.417063115190714e-05, "loss": 0.3595, "step": 423400}, {"epoch": 19.031359511186988, "grad_norm": 
18.236555099487305, "learning_rate": 3.4157499109010786e-05, "loss": 0.3571, "step": 423600}, {"epoch": 19.04034504447839, "grad_norm": 0.8889177441596985, "learning_rate": 3.414436414715386e-05, "loss": 0.3457, "step": 423800}, {"epoch": 19.04933057776979, "grad_norm": 10.380514144897461, "learning_rate": 3.413122627052316e-05, "loss": 0.3385, "step": 424000}, {"epoch": 19.04933057776979, "eval_loss": 3.8625006675720215, "eval_runtime": 1174.9973, "eval_samples_per_second": 8.429, "eval_steps_per_second": 0.033, "step": 424000}, {"epoch": 19.058316111061192, "grad_norm": 1.4684069156646729, "learning_rate": 3.4118085483306375e-05, "loss": 0.3354, "step": 424200}, {"epoch": 19.067301644352593, "grad_norm": 7.4322075843811035, "learning_rate": 3.4104941789692156e-05, "loss": 0.3579, "step": 424400}, {"epoch": 19.076287177643994, "grad_norm": 10.02495002746582, "learning_rate": 3.409179519387006e-05, "loss": 0.3629, "step": 424600}, {"epoch": 19.085272710935396, "grad_norm": 4.068674564361572, "learning_rate": 3.4078645700030575e-05, "loss": 0.3463, "step": 424800}, {"epoch": 19.094258244226793, "grad_norm": 0.7052398920059204, "learning_rate": 3.406549331236511e-05, "loss": 0.393, "step": 425000}, {"epoch": 19.094258244226793, "eval_loss": 3.8474578857421875, "eval_runtime": 1176.9074, "eval_samples_per_second": 8.415, "eval_steps_per_second": 0.033, "step": 425000}, {"epoch": 19.103243777518195, "grad_norm": 9.41407585144043, "learning_rate": 3.405233803506602e-05, "loss": 0.3732, "step": 425200}, {"epoch": 19.112229310809596, "grad_norm": 9.691625595092773, "learning_rate": 3.403917987232653e-05, "loss": 0.3649, "step": 425400}, {"epoch": 19.121214844100997, "grad_norm": 3.508151054382324, "learning_rate": 3.4026018828340846e-05, "loss": 0.3801, "step": 425600}, {"epoch": 19.1302003773924, "grad_norm": 10.020624160766602, "learning_rate": 3.401285490730404e-05, "loss": 0.3543, "step": 425800}, {"epoch": 19.1391859106838, "grad_norm": 32.40066909790039, "learning_rate": 3.399968811341212e-05, "loss": 0.3514, "step": 426000}, {"epoch": 19.1391859106838, "eval_loss": 3.8292617797851562, "eval_runtime": 1170.371, "eval_samples_per_second": 8.462, "eval_steps_per_second": 0.033, "step": 426000}, {"epoch": 19.1481714439752, "grad_norm": 16.520408630371094, "learning_rate": 3.398651845086203e-05, "loss": 0.3583, "step": 426200}, {"epoch": 19.157156977266602, "grad_norm": 9.090585708618164, "learning_rate": 3.3973345923851604e-05, "loss": 0.3934, "step": 426400}, {"epoch": 19.166142510558, "grad_norm": 11.521536827087402, "learning_rate": 3.39601705365796e-05, "loss": 0.351, "step": 426600}, {"epoch": 19.1751280438494, "grad_norm": 8.667354583740234, "learning_rate": 3.394699229324567e-05, "loss": 0.3621, "step": 426800}, {"epoch": 19.184113577140803, "grad_norm": 28.831558227539062, "learning_rate": 3.3933811198050405e-05, "loss": 0.3502, "step": 427000}, {"epoch": 19.184113577140803, "eval_loss": 3.881221055984497, "eval_runtime": 1173.9735, "eval_samples_per_second": 8.436, "eval_steps_per_second": 0.033, "step": 427000}, {"epoch": 19.193099110432204, "grad_norm": 8.013230323791504, "learning_rate": 3.392062725519529e-05, "loss": 0.3609, "step": 427200}, {"epoch": 19.202084643723605, "grad_norm": 11.29799747467041, "learning_rate": 3.390744046888271e-05, "loss": 0.4193, "step": 427400}, {"epoch": 19.211070177015007, "grad_norm": 3.9097185134887695, "learning_rate": 3.389425084331596e-05, "loss": 0.3746, "step": 427600}, {"epoch": 19.220055710306408, "grad_norm": 11.717888832092285, 
"learning_rate": 3.388105838269925e-05, "loss": 0.3999, "step": 427800}, {"epoch": 19.22904124359781, "grad_norm": 12.494455337524414, "learning_rate": 3.386786309123769e-05, "loss": 0.3875, "step": 428000}, {"epoch": 19.22904124359781, "eval_loss": 3.8519411087036133, "eval_runtime": 1173.1781, "eval_samples_per_second": 8.442, "eval_steps_per_second": 0.033, "step": 428000}, {"epoch": 19.238026776889207, "grad_norm": 3.4043800830841064, "learning_rate": 3.38546649731373e-05, "loss": 0.3683, "step": 428200}, {"epoch": 19.247012310180608, "grad_norm": 12.774907112121582, "learning_rate": 3.3841464032604974e-05, "loss": 0.3805, "step": 428400}, {"epoch": 19.25599784347201, "grad_norm": 7.213978290557861, "learning_rate": 3.382826027384853e-05, "loss": 0.3526, "step": 428600}, {"epoch": 19.26498337676341, "grad_norm": 8.512626647949219, "learning_rate": 3.3815053701076674e-05, "loss": 0.3925, "step": 428800}, {"epoch": 19.273968910054812, "grad_norm": 3.8123066425323486, "learning_rate": 3.3801844318499024e-05, "loss": 0.3349, "step": 429000}, {"epoch": 19.273968910054812, "eval_loss": 3.8657233715057373, "eval_runtime": 1171.8186, "eval_samples_per_second": 8.452, "eval_steps_per_second": 0.033, "step": 429000}, {"epoch": 19.282954443346213, "grad_norm": 1.9035091400146484, "learning_rate": 3.378863213032607e-05, "loss": 0.3481, "step": 429200}, {"epoch": 19.291939976637615, "grad_norm": 14.608076095581055, "learning_rate": 3.37754171407692e-05, "loss": 0.3859, "step": 429400}, {"epoch": 19.300925509929016, "grad_norm": 6.863801002502441, "learning_rate": 3.376219935404072e-05, "loss": 0.3843, "step": 429600}, {"epoch": 19.309911043220414, "grad_norm": 11.920736312866211, "learning_rate": 3.374897877435381e-05, "loss": 0.3549, "step": 429800}, {"epoch": 19.318896576511815, "grad_norm": 4.002532482147217, "learning_rate": 3.373575540592253e-05, "loss": 0.4075, "step": 430000}, {"epoch": 19.318896576511815, "eval_loss": 3.8724846839904785, "eval_runtime": 1110.6742, "eval_samples_per_second": 8.917, "eval_steps_per_second": 0.035, "step": 430000}, {"epoch": 19.327882109803216, "grad_norm": 19.618444442749023, "learning_rate": 3.372252925296186e-05, "loss": 0.3922, "step": 430200}, {"epoch": 19.336867643094617, "grad_norm": 3.7305030822753906, "learning_rate": 3.370930031968762e-05, "loss": 0.3698, "step": 430400}, {"epoch": 19.34585317638602, "grad_norm": 4.330793380737305, "learning_rate": 3.3696068610316556e-05, "loss": 0.3633, "step": 430600}, {"epoch": 19.35483870967742, "grad_norm": 0.21204280853271484, "learning_rate": 3.368283412906629e-05, "loss": 0.3499, "step": 430800}, {"epoch": 19.36382424296882, "grad_norm": 6.117523193359375, "learning_rate": 3.366959688015531e-05, "loss": 0.3454, "step": 431000}, {"epoch": 19.36382424296882, "eval_loss": 3.8316211700439453, "eval_runtime": 1087.1061, "eval_samples_per_second": 9.11, "eval_steps_per_second": 0.036, "step": 431000}, {"epoch": 19.372809776260222, "grad_norm": 3.591719627380371, "learning_rate": 3.365635686780303e-05, "loss": 0.3373, "step": 431200}, {"epoch": 19.38179530955162, "grad_norm": 8.026259422302246, "learning_rate": 3.364311409622969e-05, "loss": 0.3859, "step": 431400}, {"epoch": 19.39078084284302, "grad_norm": 4.9064836502075195, "learning_rate": 3.362986856965644e-05, "loss": 0.3662, "step": 431600}, {"epoch": 19.399766376134423, "grad_norm": 2.1227197647094727, "learning_rate": 3.3616620292305304e-05, "loss": 0.345, "step": 431800}, {"epoch": 19.408751909425824, "grad_norm": 14.224973678588867, "learning_rate": 
3.3603369268399174e-05, "loss": 0.398, "step": 432000}, {"epoch": 19.408751909425824, "eval_loss": 3.853020191192627, "eval_runtime": 1079.4522, "eval_samples_per_second": 9.175, "eval_steps_per_second": 0.036, "step": 432000}, {"epoch": 19.417737442717225, "grad_norm": 8.285384178161621, "learning_rate": 3.359011550216184e-05, "loss": 0.3661, "step": 432200}, {"epoch": 19.426722976008627, "grad_norm": 8.617288589477539, "learning_rate": 3.3576858997817936e-05, "loss": 0.3613, "step": 432400}, {"epoch": 19.435708509300028, "grad_norm": 3.534817934036255, "learning_rate": 3.3563599759593007e-05, "loss": 0.3901, "step": 432600}, {"epoch": 19.44469404259143, "grad_norm": 0.19126541912555695, "learning_rate": 3.3550337791713426e-05, "loss": 0.3549, "step": 432800}, {"epoch": 19.453679575882827, "grad_norm": 10.775198936462402, "learning_rate": 3.353707309840646e-05, "loss": 0.3864, "step": 433000}, {"epoch": 19.453679575882827, "eval_loss": 3.870607376098633, "eval_runtime": 1102.0643, "eval_samples_per_second": 8.987, "eval_steps_per_second": 0.035, "step": 433000}, {"epoch": 19.462665109174228, "grad_norm": 10.87759780883789, "learning_rate": 3.352380568390024e-05, "loss": 0.3797, "step": 433200}, {"epoch": 19.47165064246563, "grad_norm": 8.955763816833496, "learning_rate": 3.351053555242376e-05, "loss": 0.3572, "step": 433400}, {"epoch": 19.48063617575703, "grad_norm": 11.83018684387207, "learning_rate": 3.349726270820691e-05, "loss": 0.3859, "step": 433600}, {"epoch": 19.489621709048432, "grad_norm": 29.993505477905273, "learning_rate": 3.3483987155480396e-05, "loss": 0.4068, "step": 433800}, {"epoch": 19.498607242339833, "grad_norm": 7.300692081451416, "learning_rate": 3.347070889847582e-05, "loss": 0.3916, "step": 434000}, {"epoch": 19.498607242339833, "eval_loss": 3.8529105186462402, "eval_runtime": 1098.1299, "eval_samples_per_second": 9.019, "eval_steps_per_second": 0.036, "step": 434000}, {"epoch": 19.507592775631235, "grad_norm": 21.306541442871094, "learning_rate": 3.345742794142564e-05, "loss": 0.3635, "step": 434200}, {"epoch": 19.516578308922636, "grad_norm": 0.5357521772384644, "learning_rate": 3.3444144288563174e-05, "loss": 0.3509, "step": 434400}, {"epoch": 19.525563842214034, "grad_norm": 10.118279457092285, "learning_rate": 3.343085794412258e-05, "loss": 0.3619, "step": 434600}, {"epoch": 19.534549375505435, "grad_norm": 8.305274963378906, "learning_rate": 3.341756891233891e-05, "loss": 0.3737, "step": 434800}, {"epoch": 19.543534908796836, "grad_norm": 0.6471884846687317, "learning_rate": 3.3404277197448054e-05, "loss": 0.3445, "step": 435000}, {"epoch": 19.543534908796836, "eval_loss": 3.916043281555176, "eval_runtime": 1098.0537, "eval_samples_per_second": 9.02, "eval_steps_per_second": 0.036, "step": 435000}, {"epoch": 19.552520442088237, "grad_norm": 9.640978813171387, "learning_rate": 3.339098280368675e-05, "loss": 0.3829, "step": 435200}, {"epoch": 19.56150597537964, "grad_norm": 28.039609909057617, "learning_rate": 3.33776857352926e-05, "loss": 0.403, "step": 435400}, {"epoch": 19.57049150867104, "grad_norm": 1.782164216041565, "learning_rate": 3.3364385996504055e-05, "loss": 0.3996, "step": 435600}, {"epoch": 19.57947704196244, "grad_norm": 15.381430625915527, "learning_rate": 3.335108359156042e-05, "loss": 0.358, "step": 435800}, {"epoch": 19.588462575253843, "grad_norm": 6.020942211151123, "learning_rate": 3.3337778524701835e-05, "loss": 0.3816, "step": 436000}, {"epoch": 19.588462575253843, "eval_loss": 3.842747449874878, "eval_runtime": 1082.6766, 
"eval_samples_per_second": 9.148, "eval_steps_per_second": 0.036, "step": 436000}, {"epoch": 19.597448108545244, "grad_norm": 15.338593482971191, "learning_rate": 3.332447080016932e-05, "loss": 0.3869, "step": 436200}, {"epoch": 19.60643364183664, "grad_norm": 11.474835395812988, "learning_rate": 3.3311160422204715e-05, "loss": 0.3966, "step": 436400}, {"epoch": 19.615419175128043, "grad_norm": 2.0930511951446533, "learning_rate": 3.329784739505072e-05, "loss": 0.3639, "step": 436600}, {"epoch": 19.624404708419444, "grad_norm": 3.015812635421753, "learning_rate": 3.3284531722950855e-05, "loss": 0.3951, "step": 436800}, {"epoch": 19.633390241710845, "grad_norm": 6.570770740509033, "learning_rate": 3.3271213410149524e-05, "loss": 0.3735, "step": 437000}, {"epoch": 19.633390241710845, "eval_loss": 3.8144209384918213, "eval_runtime": 1090.0308, "eval_samples_per_second": 9.086, "eval_steps_per_second": 0.036, "step": 437000}, {"epoch": 19.642375775002247, "grad_norm": 3.2332072257995605, "learning_rate": 3.325789246089195e-05, "loss": 0.3631, "step": 437200}, {"epoch": 19.651361308293648, "grad_norm": 3.6440892219543457, "learning_rate": 3.324456887942417e-05, "loss": 0.3675, "step": 437400}, {"epoch": 19.66034684158505, "grad_norm": 11.325727462768555, "learning_rate": 3.323124266999312e-05, "loss": 0.3748, "step": 437600}, {"epoch": 19.66933237487645, "grad_norm": 1.8451133966445923, "learning_rate": 3.3217913836846524e-05, "loss": 0.3727, "step": 437800}, {"epoch": 19.67831790816785, "grad_norm": 6.25849723815918, "learning_rate": 3.320458238423295e-05, "loss": 0.4164, "step": 438000}, {"epoch": 19.67831790816785, "eval_loss": 3.8024802207946777, "eval_runtime": 1094.4447, "eval_samples_per_second": 9.049, "eval_steps_per_second": 0.036, "step": 438000}, {"epoch": 19.68730344145925, "grad_norm": 22.77155113220215, "learning_rate": 3.319124831640183e-05, "loss": 0.3534, "step": 438200}, {"epoch": 19.69628897475065, "grad_norm": 9.079693794250488, "learning_rate": 3.31779116376034e-05, "loss": 0.3323, "step": 438400}, {"epoch": 19.705274508042052, "grad_norm": 5.9739813804626465, "learning_rate": 3.316457235208873e-05, "loss": 0.3551, "step": 438600}, {"epoch": 19.714260041333453, "grad_norm": 7.636072635650635, "learning_rate": 3.315123046410974e-05, "loss": 0.3599, "step": 438800}, {"epoch": 19.723245574624855, "grad_norm": 8.846769332885742, "learning_rate": 3.313788597791917e-05, "loss": 0.3778, "step": 439000}, {"epoch": 19.723245574624855, "eval_loss": 3.8162496089935303, "eval_runtime": 1105.6042, "eval_samples_per_second": 8.958, "eval_steps_per_second": 0.035, "step": 439000}, {"epoch": 19.732231107916256, "grad_norm": 5.736910343170166, "learning_rate": 3.312453889777057e-05, "loss": 0.3947, "step": 439200}, {"epoch": 19.741216641207657, "grad_norm": 13.45654582977295, "learning_rate": 3.311118922791835e-05, "loss": 0.3551, "step": 439400}, {"epoch": 19.750202174499055, "grad_norm": 2.0433974266052246, "learning_rate": 3.309783697261771e-05, "loss": 0.3922, "step": 439600}, {"epoch": 19.759187707790456, "grad_norm": 7.121521949768066, "learning_rate": 3.3084482136124716e-05, "loss": 0.3869, "step": 439800}, {"epoch": 19.768173241081858, "grad_norm": 0.8535615801811218, "learning_rate": 3.3071124722696224e-05, "loss": 0.401, "step": 440000}, {"epoch": 19.768173241081858, "eval_loss": 3.806692361831665, "eval_runtime": 1098.742, "eval_samples_per_second": 9.014, "eval_steps_per_second": 0.035, "step": 440000}, {"epoch": 19.77715877437326, "grad_norm": 13.158157348632812, 
"learning_rate": 3.305776473658991e-05, "loss": 0.3573, "step": 440200}, {"epoch": 19.78614430766466, "grad_norm": 10.366994857788086, "learning_rate": 3.304440218206429e-05, "loss": 0.3676, "step": 440400}, {"epoch": 19.79512984095606, "grad_norm": 11.056921005249023, "learning_rate": 3.3031037063378695e-05, "loss": 0.3905, "step": 440600}, {"epoch": 19.804115374247463, "grad_norm": 3.31510066986084, "learning_rate": 3.301766938479325e-05, "loss": 0.3789, "step": 440800}, {"epoch": 19.813100907538864, "grad_norm": 0.25016453862190247, "learning_rate": 3.300429915056894e-05, "loss": 0.35, "step": 441000}, {"epoch": 19.813100907538864, "eval_loss": 3.828049421310425, "eval_runtime": 1104.6838, "eval_samples_per_second": 8.965, "eval_steps_per_second": 0.035, "step": 441000}, {"epoch": 19.82208644083026, "grad_norm": 5.278088569641113, "learning_rate": 3.299092636496751e-05, "loss": 0.372, "step": 441200}, {"epoch": 19.831071974121663, "grad_norm": 7.003445625305176, "learning_rate": 3.297755103225157e-05, "loss": 0.3633, "step": 441400}, {"epoch": 19.840057507413064, "grad_norm": 18.454580307006836, "learning_rate": 3.296417315668451e-05, "loss": 0.3645, "step": 441600}, {"epoch": 19.849043040704466, "grad_norm": 6.675582408905029, "learning_rate": 3.2950792742530536e-05, "loss": 0.3794, "step": 441800}, {"epoch": 19.858028573995867, "grad_norm": 3.7882144451141357, "learning_rate": 3.293740979405467e-05, "loss": 0.3936, "step": 442000}, {"epoch": 19.858028573995867, "eval_loss": 3.856177806854248, "eval_runtime": 1169.3786, "eval_samples_per_second": 8.469, "eval_steps_per_second": 0.033, "step": 442000}, {"epoch": 19.867014107287268, "grad_norm": 2.224478006362915, "learning_rate": 3.292402431552273e-05, "loss": 0.3826, "step": 442200}, {"epoch": 19.87599964057867, "grad_norm": 1.1260976791381836, "learning_rate": 3.291063631120137e-05, "loss": 0.367, "step": 442400}, {"epoch": 19.88498517387007, "grad_norm": 7.941216468811035, "learning_rate": 3.2897245785357995e-05, "loss": 0.4042, "step": 442600}, {"epoch": 19.89397070716147, "grad_norm": 8.846776008605957, "learning_rate": 3.288385274226088e-05, "loss": 0.3933, "step": 442800}, {"epoch": 19.90295624045287, "grad_norm": 16.292428970336914, "learning_rate": 3.287045718617904e-05, "loss": 0.3749, "step": 443000}, {"epoch": 19.90295624045287, "eval_loss": 3.854950428009033, "eval_runtime": 1159.3263, "eval_samples_per_second": 8.543, "eval_steps_per_second": 0.034, "step": 443000}, {"epoch": 19.91194177374427, "grad_norm": 12.939181327819824, "learning_rate": 3.285705912138234e-05, "loss": 0.3701, "step": 443200}, {"epoch": 19.920927307035672, "grad_norm": 3.3179798126220703, "learning_rate": 3.284365855214141e-05, "loss": 0.427, "step": 443400}, {"epoch": 19.929912840327074, "grad_norm": 4.160244941711426, "learning_rate": 3.283025548272771e-05, "loss": 0.3636, "step": 443600}, {"epoch": 19.938898373618475, "grad_norm": 1.0800896883010864, "learning_rate": 3.281684991741347e-05, "loss": 0.4054, "step": 443800}, {"epoch": 19.947883906909876, "grad_norm": 10.361804962158203, "learning_rate": 3.2803441860471725e-05, "loss": 0.4003, "step": 444000}, {"epoch": 19.947883906909876, "eval_loss": 3.795114517211914, "eval_runtime": 1157.2871, "eval_samples_per_second": 8.558, "eval_steps_per_second": 0.034, "step": 444000}, {"epoch": 19.956869440201277, "grad_norm": 2.5146071910858154, "learning_rate": 3.27900313161763e-05, "loss": 0.3784, "step": 444200}, {"epoch": 19.965854973492675, "grad_norm": 2.567941904067993, "learning_rate": 
3.277661828880182e-05, "loss": 0.3757, "step": 444400}, {"epoch": 19.974840506784076, "grad_norm": 7.472506046295166, "learning_rate": 3.276320278262371e-05, "loss": 0.383, "step": 444600}, {"epoch": 19.983826040075478, "grad_norm": 1.7942224740982056, "learning_rate": 3.2749784801918155e-05, "loss": 0.3547, "step": 444800}, {"epoch": 19.99281157336688, "grad_norm": 12.670038223266602, "learning_rate": 3.273636435096216e-05, "loss": 0.4145, "step": 445000}, {"epoch": 19.99281157336688, "eval_loss": 3.7545852661132812, "eval_runtime": 1143.5493, "eval_samples_per_second": 8.661, "eval_steps_per_second": 0.034, "step": 445000}, {"epoch": 20.00179710665828, "grad_norm": 0.7427432537078857, "learning_rate": 3.27229414340335e-05, "loss": 0.3815, "step": 445200}, {"epoch": 20.01078263994968, "grad_norm": 2.870213270187378, "learning_rate": 3.270951605541075e-05, "loss": 0.3358, "step": 445400}, {"epoch": 20.019768173241083, "grad_norm": 7.560419082641602, "learning_rate": 3.269608821937325e-05, "loss": 0.3451, "step": 445600}, {"epoch": 20.028753706532484, "grad_norm": 6.4001078605651855, "learning_rate": 3.268265793020114e-05, "loss": 0.3516, "step": 445800}, {"epoch": 20.037739239823882, "grad_norm": 21.972902297973633, "learning_rate": 3.2669225192175334e-05, "loss": 0.3828, "step": 446000}, {"epoch": 20.037739239823882, "eval_loss": 3.8768162727355957, "eval_runtime": 1147.0252, "eval_samples_per_second": 8.635, "eval_steps_per_second": 0.034, "step": 446000}, {"epoch": 20.046724773115283, "grad_norm": 13.854667663574219, "learning_rate": 3.265579000957753e-05, "loss": 0.3745, "step": 446200}, {"epoch": 20.055710306406684, "grad_norm": 1.945226788520813, "learning_rate": 3.26423523866902e-05, "loss": 0.3407, "step": 446400}, {"epoch": 20.064695839698086, "grad_norm": 2.497396469116211, "learning_rate": 3.26289123277966e-05, "loss": 0.3409, "step": 446600}, {"epoch": 20.073681372989487, "grad_norm": 17.679908752441406, "learning_rate": 3.261546983718077e-05, "loss": 0.3555, "step": 446800}, {"epoch": 20.08266690628089, "grad_norm": 12.340278625488281, "learning_rate": 3.2602024919127495e-05, "loss": 0.3559, "step": 447000}, {"epoch": 20.08266690628089, "eval_loss": 3.868159532546997, "eval_runtime": 1144.6119, "eval_samples_per_second": 8.653, "eval_steps_per_second": 0.034, "step": 447000}, {"epoch": 20.09165243957229, "grad_norm": 7.965939521789551, "learning_rate": 3.2588577577922366e-05, "loss": 0.3499, "step": 447200}, {"epoch": 20.10063797286369, "grad_norm": 1.9072184562683105, "learning_rate": 3.2575127817851734e-05, "loss": 0.3428, "step": 447400}, {"epoch": 20.10962350615509, "grad_norm": 6.992972373962402, "learning_rate": 3.256167564320272e-05, "loss": 0.3544, "step": 447600}, {"epoch": 20.11860903944649, "grad_norm": 5.526668548583984, "learning_rate": 3.2548221058263214e-05, "loss": 0.3596, "step": 447800}, {"epoch": 20.12759457273789, "grad_norm": 8.724543571472168, "learning_rate": 3.2534764067321874e-05, "loss": 0.3359, "step": 448000}, {"epoch": 20.12759457273789, "eval_loss": 3.878002882003784, "eval_runtime": 1143.5931, "eval_samples_per_second": 8.66, "eval_steps_per_second": 0.034, "step": 448000}, {"epoch": 20.136580106029292, "grad_norm": 5.3289361000061035, "learning_rate": 3.252130467466814e-05, "loss": 0.3555, "step": 448200}, {"epoch": 20.145565639320694, "grad_norm": 2.90199875831604, "learning_rate": 3.25078428845922e-05, "loss": 0.3167, "step": 448400}, {"epoch": 20.154551172612095, "grad_norm": 4.369307041168213, "learning_rate": 3.2494378701385e-05, "loss": 
0.3423, "step": 448600}, {"epoch": 20.163536705903496, "grad_norm": 6.077184677124023, "learning_rate": 3.248091212933827e-05, "loss": 0.3617, "step": 448800}, {"epoch": 20.172522239194898, "grad_norm": 4.385313034057617, "learning_rate": 3.246744317274449e-05, "loss": 0.3382, "step": 449000}, {"epoch": 20.172522239194898, "eval_loss": 3.871030807495117, "eval_runtime": 1143.6866, "eval_samples_per_second": 8.66, "eval_steps_per_second": 0.034, "step": 449000}, {"epoch": 20.1815077724863, "grad_norm": 4.845536708831787, "learning_rate": 3.24539718358969e-05, "loss": 0.3544, "step": 449200}, {"epoch": 20.190493305777697, "grad_norm": 9.48888111114502, "learning_rate": 3.2440498123089496e-05, "loss": 0.3651, "step": 449400}, {"epoch": 20.199478839069098, "grad_norm": 16.708328247070312, "learning_rate": 3.242702203861704e-05, "loss": 0.3364, "step": 449600}, {"epoch": 20.2084643723605, "grad_norm": 31.345827102661133, "learning_rate": 3.241354358677505e-05, "loss": 0.3687, "step": 449800}, {"epoch": 20.2174499056519, "grad_norm": 6.827626705169678, "learning_rate": 3.240006277185978e-05, "loss": 0.3804, "step": 450000}, {"epoch": 20.2174499056519, "eval_loss": 3.9251058101654053, "eval_runtime": 1154.8423, "eval_samples_per_second": 8.576, "eval_steps_per_second": 0.034, "step": 450000}, {"epoch": 20.2264354389433, "grad_norm": 6.233980178833008, "learning_rate": 3.2386579598168266e-05, "loss": 0.3687, "step": 450200}, {"epoch": 20.235420972234703, "grad_norm": 6.345924377441406, "learning_rate": 3.237309406999827e-05, "loss": 0.3432, "step": 450400}, {"epoch": 20.244406505526104, "grad_norm": 1.4343754053115845, "learning_rate": 3.235960619164832e-05, "loss": 0.3801, "step": 450600}, {"epoch": 20.253392038817505, "grad_norm": 17.45358657836914, "learning_rate": 3.234611596741769e-05, "loss": 0.365, "step": 450800}, {"epoch": 20.262377572108903, "grad_norm": 16.016883850097656, "learning_rate": 3.23326234016064e-05, "loss": 0.3624, "step": 451000}, {"epoch": 20.262377572108903, "eval_loss": 3.8094112873077393, "eval_runtime": 1142.5451, "eval_samples_per_second": 8.668, "eval_steps_per_second": 0.034, "step": 451000}, {"epoch": 20.271363105400305, "grad_norm": 17.484983444213867, "learning_rate": 3.2319128498515214e-05, "loss": 0.3379, "step": 451200}, {"epoch": 20.280348638691706, "grad_norm": 17.760513305664062, "learning_rate": 3.230563126244564e-05, "loss": 0.371, "step": 451400}, {"epoch": 20.289334171983107, "grad_norm": 6.531546592712402, "learning_rate": 3.229213169769995e-05, "loss": 0.3737, "step": 451600}, {"epoch": 20.29831970527451, "grad_norm": 10.28607177734375, "learning_rate": 3.227862980858112e-05, "loss": 0.3628, "step": 451800}, {"epoch": 20.30730523856591, "grad_norm": 5.768312454223633, "learning_rate": 3.22651255993929e-05, "loss": 0.377, "step": 452000}, {"epoch": 20.30730523856591, "eval_loss": 3.835094690322876, "eval_runtime": 1150.0337, "eval_samples_per_second": 8.612, "eval_steps_per_second": 0.034, "step": 452000}, {"epoch": 20.31629077185731, "grad_norm": 9.820401191711426, "learning_rate": 3.2251619074439776e-05, "loss": 0.3633, "step": 452200}, {"epoch": 20.325276305148712, "grad_norm": 9.445414543151855, "learning_rate": 3.2238110238026944e-05, "loss": 0.3547, "step": 452400}, {"epoch": 20.33426183844011, "grad_norm": 5.395224571228027, "learning_rate": 3.2224599094460376e-05, "loss": 0.3578, "step": 452600}, {"epoch": 20.34324737173151, "grad_norm": 12.77868938446045, "learning_rate": 3.221108564804675e-05, "loss": 0.3832, "step": 452800}, {"epoch": 
20.352232905022912, "grad_norm": 5.215237617492676, "learning_rate": 3.219756990309349e-05, "loss": 0.3757, "step": 453000}, {"epoch": 20.352232905022912, "eval_loss": 3.832378625869751, "eval_runtime": 1145.081, "eval_samples_per_second": 8.649, "eval_steps_per_second": 0.034, "step": 453000}, {"epoch": 20.361218438314314, "grad_norm": 8.17989730834961, "learning_rate": 3.2184051863908746e-05, "loss": 0.3425, "step": 453200}, {"epoch": 20.370203971605715, "grad_norm": 8.778077125549316, "learning_rate": 3.217053153480142e-05, "loss": 0.3502, "step": 453400}, {"epoch": 20.379189504897116, "grad_norm": 22.368091583251953, "learning_rate": 3.2157008920081115e-05, "loss": 0.373, "step": 453600}, {"epoch": 20.388175038188518, "grad_norm": 2.329055070877075, "learning_rate": 3.2143484024058186e-05, "loss": 0.3252, "step": 453800}, {"epoch": 20.39716057147992, "grad_norm": 8.0297269821167, "learning_rate": 3.212995685104369e-05, "loss": 0.3704, "step": 454000}, {"epoch": 20.39716057147992, "eval_loss": 3.886225938796997, "eval_runtime": 1143.4802, "eval_samples_per_second": 8.661, "eval_steps_per_second": 0.034, "step": 454000}, {"epoch": 20.406146104771317, "grad_norm": 4.103653430938721, "learning_rate": 3.2116427405349437e-05, "loss": 0.3638, "step": 454200}, {"epoch": 20.415131638062718, "grad_norm": 12.913371086120605, "learning_rate": 3.210289569128795e-05, "loss": 0.3766, "step": 454400}, {"epoch": 20.42411717135412, "grad_norm": 8.67467975616455, "learning_rate": 3.208936171317246e-05, "loss": 0.3515, "step": 454600}, {"epoch": 20.43310270464552, "grad_norm": 14.403546333312988, "learning_rate": 3.2075825475316954e-05, "loss": 0.3751, "step": 454800}, {"epoch": 20.44208823793692, "grad_norm": 4.453256607055664, "learning_rate": 3.20622869820361e-05, "loss": 0.37, "step": 455000}, {"epoch": 20.44208823793692, "eval_loss": 3.873455762863159, "eval_runtime": 1125.7815, "eval_samples_per_second": 8.797, "eval_steps_per_second": 0.035, "step": 455000}, {"epoch": 20.451073771228323, "grad_norm": 12.016096115112305, "learning_rate": 3.204874623764532e-05, "loss": 0.3539, "step": 455200}, {"epoch": 20.460059304519724, "grad_norm": 10.212580680847168, "learning_rate": 3.2035203246460725e-05, "loss": 0.3843, "step": 455400}, {"epoch": 20.469044837811126, "grad_norm": 6.088382720947266, "learning_rate": 3.2021658012799166e-05, "loss": 0.3938, "step": 455600}, {"epoch": 20.478030371102523, "grad_norm": 11.492984771728516, "learning_rate": 3.200811054097819e-05, "loss": 0.372, "step": 455800}, {"epoch": 20.487015904393925, "grad_norm": 12.331425666809082, "learning_rate": 3.1994560835316073e-05, "loss": 0.3457, "step": 456000}, {"epoch": 20.487015904393925, "eval_loss": 3.8303720951080322, "eval_runtime": 1114.4203, "eval_samples_per_second": 8.887, "eval_steps_per_second": 0.035, "step": 456000}, {"epoch": 20.496001437685326, "grad_norm": 28.88426399230957, "learning_rate": 3.198100890013178e-05, "loss": 0.3414, "step": 456200}, {"epoch": 20.504986970976727, "grad_norm": 12.088685989379883, "learning_rate": 3.196745473974502e-05, "loss": 0.3848, "step": 456400}, {"epoch": 20.51397250426813, "grad_norm": 15.99104118347168, "learning_rate": 3.195389835847619e-05, "loss": 0.3815, "step": 456600}, {"epoch": 20.52295803755953, "grad_norm": 7.567880153656006, "learning_rate": 3.194033976064637e-05, "loss": 0.3409, "step": 456800}, {"epoch": 20.53194357085093, "grad_norm": 0.6070024371147156, "learning_rate": 3.192677895057742e-05, "loss": 0.3422, "step": 457000}, {"epoch": 20.53194357085093, "eval_loss": 
3.879889726638794, "eval_runtime": 1114.428, "eval_samples_per_second": 8.887, "eval_steps_per_second": 0.035, "step": 457000}, {"epoch": 20.540929104142332, "grad_norm": 1.9777508974075317, "learning_rate": 3.1913215932591826e-05, "loss": 0.3976, "step": 457200}, {"epoch": 20.54991463743373, "grad_norm": 2.3788673877716064, "learning_rate": 3.189965071101282e-05, "loss": 0.3776, "step": 457400}, {"epoch": 20.55890017072513, "grad_norm": 10.905414581298828, "learning_rate": 3.188608329016433e-05, "loss": 0.374, "step": 457600}, {"epoch": 20.567885704016533, "grad_norm": 9.221813201904297, "learning_rate": 3.187251367437099e-05, "loss": 0.3753, "step": 457800}, {"epoch": 20.576871237307934, "grad_norm": 35.775840759277344, "learning_rate": 3.185894186795811e-05, "loss": 0.3513, "step": 458000}, {"epoch": 20.576871237307934, "eval_loss": 3.8578977584838867, "eval_runtime": 1114.7006, "eval_samples_per_second": 8.885, "eval_steps_per_second": 0.035, "step": 458000}, {"epoch": 20.585856770599335, "grad_norm": 8.585643768310547, "learning_rate": 3.184536787525173e-05, "loss": 0.3549, "step": 458200}, {"epoch": 20.594842303890736, "grad_norm": 7.512677192687988, "learning_rate": 3.183179170057857e-05, "loss": 0.3572, "step": 458400}, {"epoch": 20.603827837182138, "grad_norm": 11.871265411376953, "learning_rate": 3.1818213348266035e-05, "loss": 0.3588, "step": 458600}, {"epoch": 20.61281337047354, "grad_norm": 4.45906925201416, "learning_rate": 3.180463282264225e-05, "loss": 0.3437, "step": 458800}, {"epoch": 20.621798903764937, "grad_norm": 3.7630507946014404, "learning_rate": 3.179105012803601e-05, "loss": 0.3904, "step": 459000}, {"epoch": 20.621798903764937, "eval_loss": 3.8454971313476562, "eval_runtime": 1116.6233, "eval_samples_per_second": 8.87, "eval_steps_per_second": 0.035, "step": 459000}, {"epoch": 20.630784437056338, "grad_norm": 9.435053825378418, "learning_rate": 3.1777465268776805e-05, "loss": 0.3552, "step": 459200}, {"epoch": 20.63976997034774, "grad_norm": 0.3744598925113678, "learning_rate": 3.176387824919484e-05, "loss": 0.3446, "step": 459400}, {"epoch": 20.64875550363914, "grad_norm": 2.1311497688293457, "learning_rate": 3.175028907362097e-05, "loss": 0.3755, "step": 459600}, {"epoch": 20.657741036930542, "grad_norm": 7.7464141845703125, "learning_rate": 3.173669774638677e-05, "loss": 0.3599, "step": 459800}, {"epoch": 20.666726570221943, "grad_norm": 18.331575393676758, "learning_rate": 3.172310427182448e-05, "loss": 0.3311, "step": 460000}, {"epoch": 20.666726570221943, "eval_loss": 3.899061918258667, "eval_runtime": 1122.1771, "eval_samples_per_second": 8.826, "eval_steps_per_second": 0.035, "step": 460000}, {"epoch": 20.675712103513344, "grad_norm": 4.977959156036377, "learning_rate": 3.1709508654267026e-05, "loss": 0.3996, "step": 460200}, {"epoch": 20.684697636804746, "grad_norm": 6.856226921081543, "learning_rate": 3.169591089804804e-05, "loss": 0.3761, "step": 460400}, {"epoch": 20.693683170096143, "grad_norm": 8.389673233032227, "learning_rate": 3.1682311007501795e-05, "loss": 0.3726, "step": 460600}, {"epoch": 20.702668703387545, "grad_norm": 3.833249807357788, "learning_rate": 3.1668708986963284e-05, "loss": 0.3422, "step": 460800}, {"epoch": 20.711654236678946, "grad_norm": 7.320929527282715, "learning_rate": 3.165510484076816e-05, "loss": 0.3855, "step": 461000}, {"epoch": 20.711654236678946, "eval_loss": 3.8244404792785645, "eval_runtime": 1128.0561, "eval_samples_per_second": 8.78, "eval_steps_per_second": 0.035, "step": 461000}, {"epoch": 
20.720639769970347, "grad_norm": 3.787951946258545, "learning_rate": 3.164149857325276e-05, "loss": 0.3799, "step": 461200}, {"epoch": 20.72962530326175, "grad_norm": 5.104145526885986, "learning_rate": 3.162789018875408e-05, "loss": 0.3677, "step": 461400}, {"epoch": 20.73861083655315, "grad_norm": 6.0579962730407715, "learning_rate": 3.1614279691609804e-05, "loss": 0.3492, "step": 461600}, {"epoch": 20.74759636984455, "grad_norm": 5.607633590698242, "learning_rate": 3.1600667086158315e-05, "loss": 0.3562, "step": 461800}, {"epoch": 20.756581903135952, "grad_norm": 13.053763389587402, "learning_rate": 3.158705237673861e-05, "loss": 0.3833, "step": 462000}, {"epoch": 20.756581903135952, "eval_loss": 3.8414077758789062, "eval_runtime": 1119.0904, "eval_samples_per_second": 8.85, "eval_steps_per_second": 0.035, "step": 462000}, {"epoch": 20.765567436427354, "grad_norm": 8.402251243591309, "learning_rate": 3.157343556769041e-05, "loss": 0.412, "step": 462200}, {"epoch": 20.77455296971875, "grad_norm": 21.891206741333008, "learning_rate": 3.1559816663354076e-05, "loss": 0.3489, "step": 462400}, {"epoch": 20.783538503010153, "grad_norm": 6.903267860412598, "learning_rate": 3.1546195668070646e-05, "loss": 0.389, "step": 462600}, {"epoch": 20.792524036301554, "grad_norm": 5.88771915435791, "learning_rate": 3.153257258618183e-05, "loss": 0.3546, "step": 462800}, {"epoch": 20.801509569592955, "grad_norm": 5.859227657318115, "learning_rate": 3.151894742202999e-05, "loss": 0.3742, "step": 463000}, {"epoch": 20.801509569592955, "eval_loss": 3.807049512863159, "eval_runtime": 1121.8109, "eval_samples_per_second": 8.829, "eval_steps_per_second": 0.035, "step": 463000}, {"epoch": 20.810495102884357, "grad_norm": 9.092805862426758, "learning_rate": 3.150532017995816e-05, "loss": 0.3714, "step": 463200}, {"epoch": 20.819480636175758, "grad_norm": 32.67975997924805, "learning_rate": 3.149169086431003e-05, "loss": 0.4, "step": 463400}, {"epoch": 20.82846616946716, "grad_norm": 8.08678913116455, "learning_rate": 3.1478059479429966e-05, "loss": 0.3589, "step": 463600}, {"epoch": 20.83745170275856, "grad_norm": 2.283585548400879, "learning_rate": 3.146442602966297e-05, "loss": 0.3339, "step": 463800}, {"epoch": 20.846437236049958, "grad_norm": 8.233623504638672, "learning_rate": 3.145079051935475e-05, "loss": 0.3761, "step": 464000}, {"epoch": 20.846437236049958, "eval_loss": 3.8668360710144043, "eval_runtime": 1173.3335, "eval_samples_per_second": 8.441, "eval_steps_per_second": 0.033, "step": 464000}, {"epoch": 20.85542276934136, "grad_norm": 5.021024703979492, "learning_rate": 3.143715295285158e-05, "loss": 0.339, "step": 464200}, {"epoch": 20.86440830263276, "grad_norm": 7.741531848907471, "learning_rate": 3.142351333450049e-05, "loss": 0.3532, "step": 464400}, {"epoch": 20.873393835924162, "grad_norm": 3.023864984512329, "learning_rate": 3.140987166864911e-05, "loss": 0.3614, "step": 464600}, {"epoch": 20.882379369215563, "grad_norm": 5.5194549560546875, "learning_rate": 3.1396227959645717e-05, "loss": 0.3642, "step": 464800}, {"epoch": 20.891364902506965, "grad_norm": 0.732132613658905, "learning_rate": 3.138258221183928e-05, "loss": 0.3897, "step": 465000}, {"epoch": 20.891364902506965, "eval_loss": 3.830918073654175, "eval_runtime": 1150.322, "eval_samples_per_second": 8.61, "eval_steps_per_second": 0.034, "step": 465000}, {"epoch": 20.900350435798366, "grad_norm": 4.300996780395508, "learning_rate": 3.1368934429579376e-05, "loss": 0.302, "step": 465200}, {"epoch": 20.909335969089767, "grad_norm": 
5.096749782562256, "learning_rate": 3.135528461721624e-05, "loss": 0.3462, "step": 465400}, {"epoch": 20.918321502381165, "grad_norm": 13.806108474731445, "learning_rate": 3.134163277910078e-05, "loss": 0.3477, "step": 465600}, {"epoch": 20.927307035672566, "grad_norm": 1.5174065828323364, "learning_rate": 3.1327978919584526e-05, "loss": 0.3579, "step": 465800}, {"epoch": 20.936292568963967, "grad_norm": 4.7623395919799805, "learning_rate": 3.131432304301965e-05, "loss": 0.3539, "step": 466000}, {"epoch": 20.936292568963967, "eval_loss": 3.8357908725738525, "eval_runtime": 1154.0612, "eval_samples_per_second": 8.582, "eval_steps_per_second": 0.034, "step": 466000}, {"epoch": 20.94527810225537, "grad_norm": 13.757698059082031, "learning_rate": 3.130066515375897e-05, "loss": 0.3352, "step": 466200}, {"epoch": 20.95426363554677, "grad_norm": 4.73702335357666, "learning_rate": 3.1287005256155964e-05, "loss": 0.3747, "step": 466400}, {"epoch": 20.96324916883817, "grad_norm": 0.19603075087070465, "learning_rate": 3.1273343354564734e-05, "loss": 0.382, "step": 466600}, {"epoch": 20.972234702129573, "grad_norm": 2.0142762660980225, "learning_rate": 3.1259679453340006e-05, "loss": 0.3544, "step": 466800}, {"epoch": 20.981220235420974, "grad_norm": 13.178425788879395, "learning_rate": 3.1246013556837184e-05, "loss": 0.3255, "step": 467000}, {"epoch": 20.981220235420974, "eval_loss": 3.835940361022949, "eval_runtime": 1155.7445, "eval_samples_per_second": 8.569, "eval_steps_per_second": 0.034, "step": 467000}, {"epoch": 20.99020576871237, "grad_norm": 9.660638809204102, "learning_rate": 3.1232345669412265e-05, "loss": 0.3552, "step": 467200}, {"epoch": 20.999191302003773, "grad_norm": 5.755095958709717, "learning_rate": 3.121867579542191e-05, "loss": 0.3652, "step": 467400}, {"epoch": 21.008176835295174, "grad_norm": 23.942413330078125, "learning_rate": 3.1205003939223395e-05, "loss": 0.3479, "step": 467600}, {"epoch": 21.017162368586575, "grad_norm": 5.542444229125977, "learning_rate": 3.119133010517465e-05, "loss": 0.3158, "step": 467800}, {"epoch": 21.026147901877977, "grad_norm": 3.515453815460205, "learning_rate": 3.1177654297634203e-05, "loss": 0.2882, "step": 468000}, {"epoch": 21.026147901877977, "eval_loss": 3.8817296028137207, "eval_runtime": 1153.4188, "eval_samples_per_second": 8.587, "eval_steps_per_second": 0.034, "step": 468000}, {"epoch": 21.035133435169378, "grad_norm": 3.5313735008239746, "learning_rate": 3.116397652096124e-05, "loss": 0.3262, "step": 468200}, {"epoch": 21.04411896846078, "grad_norm": 10.718170166015625, "learning_rate": 3.1150296779515566e-05, "loss": 0.337, "step": 468400}, {"epoch": 21.05310450175218, "grad_norm": 8.422656059265137, "learning_rate": 3.11366150776576e-05, "loss": 0.3319, "step": 468600}, {"epoch": 21.06209003504358, "grad_norm": 7.027642726898193, "learning_rate": 3.11229314197484e-05, "loss": 0.3825, "step": 468800}, {"epoch": 21.07107556833498, "grad_norm": 2.228684902191162, "learning_rate": 3.110924581014964e-05, "loss": 0.329, "step": 469000}, {"epoch": 21.07107556833498, "eval_loss": 3.8373589515686035, "eval_runtime": 1150.8556, "eval_samples_per_second": 8.606, "eval_steps_per_second": 0.034, "step": 469000}, {"epoch": 21.08006110162638, "grad_norm": 6.492588996887207, "learning_rate": 3.109555825322364e-05, "loss": 0.3721, "step": 469200}, {"epoch": 21.089046634917782, "grad_norm": 5.467384338378906, "learning_rate": 3.1081868753333306e-05, "loss": 0.3371, "step": 469400}, {"epoch": 21.098032168209183, "grad_norm": 19.02194595336914, 
"learning_rate": 3.106817731484216e-05, "loss": 0.3575, "step": 469600}, {"epoch": 21.107017701500585, "grad_norm": 5.688388347625732, "learning_rate": 3.105448394211439e-05, "loss": 0.3323, "step": 469800}, {"epoch": 21.116003234791986, "grad_norm": 6.124304294586182, "learning_rate": 3.104078863951475e-05, "loss": 0.3399, "step": 470000}, {"epoch": 21.116003234791986, "eval_loss": 3.8396663665771484, "eval_runtime": 1148.2714, "eval_samples_per_second": 8.625, "eval_steps_per_second": 0.034, "step": 470000}, {"epoch": 21.124988768083387, "grad_norm": 14.203096389770508, "learning_rate": 3.1027091411408634e-05, "loss": 0.3087, "step": 470200}, {"epoch": 21.133974301374785, "grad_norm": 10.170199394226074, "learning_rate": 3.101339226216205e-05, "loss": 0.3511, "step": 470400}, {"epoch": 21.142959834666186, "grad_norm": 3.682291030883789, "learning_rate": 3.099969119614161e-05, "loss": 0.3443, "step": 470600}, {"epoch": 21.151945367957588, "grad_norm": 3.399019718170166, "learning_rate": 3.098598821771454e-05, "loss": 0.329, "step": 470800}, {"epoch": 21.16093090124899, "grad_norm": 4.879147052764893, "learning_rate": 3.0972283331248675e-05, "loss": 0.3404, "step": 471000}, {"epoch": 21.16093090124899, "eval_loss": 3.8527607917785645, "eval_runtime": 1154.1744, "eval_samples_per_second": 8.581, "eval_steps_per_second": 0.034, "step": 471000}, {"epoch": 21.16991643454039, "grad_norm": 14.056867599487305, "learning_rate": 3.095857654111246e-05, "loss": 0.367, "step": 471200}, {"epoch": 21.17890196783179, "grad_norm": 2.038222312927246, "learning_rate": 3.094486785167495e-05, "loss": 0.3434, "step": 471400}, {"epoch": 21.187887501123193, "grad_norm": 5.393631458282471, "learning_rate": 3.09311572673058e-05, "loss": 0.3316, "step": 471600}, {"epoch": 21.196873034414594, "grad_norm": 9.57490348815918, "learning_rate": 3.091744479237526e-05, "loss": 0.3618, "step": 471800}, {"epoch": 21.20585856770599, "grad_norm": 6.818603515625, "learning_rate": 3.090373043125421e-05, "loss": 0.3651, "step": 472000}, {"epoch": 21.20585856770599, "eval_loss": 3.847317695617676, "eval_runtime": 1155.725, "eval_samples_per_second": 8.57, "eval_steps_per_second": 0.034, "step": 472000}, {"epoch": 21.214844100997393, "grad_norm": 2.522334575653076, "learning_rate": 3.0890014188314095e-05, "loss": 0.3264, "step": 472200}, {"epoch": 21.223829634288794, "grad_norm": 25.88078498840332, "learning_rate": 3.0876296067927e-05, "loss": 0.3423, "step": 472400}, {"epoch": 21.232815167580195, "grad_norm": 0.09056749939918518, "learning_rate": 3.0862576074465566e-05, "loss": 0.3413, "step": 472600}, {"epoch": 21.241800700871597, "grad_norm": 28.01805305480957, "learning_rate": 3.0848854212303065e-05, "loss": 0.3273, "step": 472800}, {"epoch": 21.250786234162998, "grad_norm": 6.097854137420654, "learning_rate": 3.083513048581335e-05, "loss": 0.3848, "step": 473000}, {"epoch": 21.250786234162998, "eval_loss": 3.879460334777832, "eval_runtime": 1149.2535, "eval_samples_per_second": 8.618, "eval_steps_per_second": 0.034, "step": 473000}, {"epoch": 21.2597717674544, "grad_norm": 0.36335647106170654, "learning_rate": 3.082140489937088e-05, "loss": 0.3841, "step": 473200}, {"epoch": 21.2687573007458, "grad_norm": 2.704850435256958, "learning_rate": 3.080767745735067e-05, "loss": 0.3488, "step": 473400}, {"epoch": 21.2777428340372, "grad_norm": 0.6730875968933105, "learning_rate": 3.079394816412839e-05, "loss": 0.3457, "step": 473600}, {"epoch": 21.2867283673286, "grad_norm": 16.261018753051758, "learning_rate": 3.078021702408024e-05, 
"loss": 0.3444, "step": 473800}, {"epoch": 21.29571390062, "grad_norm": 8.230804443359375, "learning_rate": 3.076648404158303e-05, "loss": 0.3606, "step": 474000}, {"epoch": 21.29571390062, "eval_loss": 3.8442225456237793, "eval_runtime": 1152.5751, "eval_samples_per_second": 8.593, "eval_steps_per_second": 0.034, "step": 474000}, {"epoch": 21.304699433911402, "grad_norm": 6.650168418884277, "learning_rate": 3.075274922101418e-05, "loss": 0.3307, "step": 474200}, {"epoch": 21.313684967202803, "grad_norm": 9.012650489807129, "learning_rate": 3.073901256675166e-05, "loss": 0.3595, "step": 474400}, {"epoch": 21.322670500494205, "grad_norm": 3.0658600330352783, "learning_rate": 3.072527408317403e-05, "loss": 0.365, "step": 474600}, {"epoch": 21.331656033785606, "grad_norm": 8.665407180786133, "learning_rate": 3.071153377466047e-05, "loss": 0.3393, "step": 474800}, {"epoch": 21.340641567077007, "grad_norm": 0.1144244521856308, "learning_rate": 3.0697791645590696e-05, "loss": 0.3567, "step": 475000}, {"epoch": 21.340641567077007, "eval_loss": 3.848034143447876, "eval_runtime": 1168.8081, "eval_samples_per_second": 8.474, "eval_steps_per_second": 0.033, "step": 475000}, {"epoch": 21.34962710036841, "grad_norm": 9.049808502197266, "learning_rate": 3.068404770034503e-05, "loss": 0.3773, "step": 475200}, {"epoch": 21.358612633659806, "grad_norm": 5.73265266418457, "learning_rate": 3.067030194330437e-05, "loss": 0.3476, "step": 475400}, {"epoch": 21.367598166951208, "grad_norm": 12.6224365234375, "learning_rate": 3.065655437885018e-05, "loss": 0.3389, "step": 475600}, {"epoch": 21.37658370024261, "grad_norm": 19.895153045654297, "learning_rate": 3.06428050113645e-05, "loss": 0.3646, "step": 475800}, {"epoch": 21.38556923353401, "grad_norm": 9.202630043029785, "learning_rate": 3.062905384522998e-05, "loss": 0.4052, "step": 476000}, {"epoch": 21.38556923353401, "eval_loss": 3.8101115226745605, "eval_runtime": 1161.6908, "eval_samples_per_second": 8.526, "eval_steps_per_second": 0.034, "step": 476000}, {"epoch": 21.39455476682541, "grad_norm": 24.745006561279297, "learning_rate": 3.0615300884829785e-05, "loss": 0.3686, "step": 476200}, {"epoch": 21.403540300116813, "grad_norm": 2.2949283123016357, "learning_rate": 3.060154613454771e-05, "loss": 0.3118, "step": 476400}, {"epoch": 21.412525833408214, "grad_norm": 1.272202491760254, "learning_rate": 3.058778959876807e-05, "loss": 0.3484, "step": 476600}, {"epoch": 21.421511366699615, "grad_norm": 0.6712559461593628, "learning_rate": 3.057403128187578e-05, "loss": 0.3196, "step": 476800}, {"epoch": 21.430496899991013, "grad_norm": 4.88563346862793, "learning_rate": 3.056027118825632e-05, "loss": 0.3432, "step": 477000}, {"epoch": 21.430496899991013, "eval_loss": 3.836414098739624, "eval_runtime": 1156.4826, "eval_samples_per_second": 8.564, "eval_steps_per_second": 0.034, "step": 477000}, {"epoch": 21.439482433282414, "grad_norm": 5.171449661254883, "learning_rate": 3.054650932229573e-05, "loss": 0.3461, "step": 477200}, {"epoch": 21.448467966573816, "grad_norm": 6.105608940124512, "learning_rate": 3.053274568838061e-05, "loss": 0.3616, "step": 477400}, {"epoch": 21.457453499865217, "grad_norm": 0.032906968146562576, "learning_rate": 3.051898029089814e-05, "loss": 0.3433, "step": 477600}, {"epoch": 21.466439033156618, "grad_norm": 15.590333938598633, "learning_rate": 3.0505213134236043e-05, "loss": 0.3356, "step": 477800}, {"epoch": 21.47542456644802, "grad_norm": 4.688640117645264, "learning_rate": 3.0491444222782616e-05, "loss": 0.3906, "step": 478000}, 
{"epoch": 21.47542456644802, "eval_loss": 3.85675048828125, "eval_runtime": 1155.4131, "eval_samples_per_second": 8.572, "eval_steps_per_second": 0.034, "step": 478000}, {"epoch": 21.48441009973942, "grad_norm": 10.541050910949707, "learning_rate": 3.0477673560926723e-05, "loss": 0.3419, "step": 478200}, {"epoch": 21.493395633030822, "grad_norm": 2.6476938724517822, "learning_rate": 3.046390115305775e-05, "loss": 0.3415, "step": 478400}, {"epoch": 21.50238116632222, "grad_norm": 14.356165885925293, "learning_rate": 3.0450127003565676e-05, "loss": 0.3367, "step": 478600}, {"epoch": 21.51136669961362, "grad_norm": 16.879222869873047, "learning_rate": 3.043635111684102e-05, "loss": 0.3584, "step": 478800}, {"epoch": 21.520352232905022, "grad_norm": 7.5179009437561035, "learning_rate": 3.0422573497274865e-05, "loss": 0.3594, "step": 479000}, {"epoch": 21.520352232905022, "eval_loss": 3.820604085922241, "eval_runtime": 1154.9865, "eval_samples_per_second": 8.575, "eval_steps_per_second": 0.034, "step": 479000}, {"epoch": 21.529337766196424, "grad_norm": 14.661418914794922, "learning_rate": 3.040879414925883e-05, "loss": 0.3627, "step": 479200}, {"epoch": 21.538323299487825, "grad_norm": 38.703025817871094, "learning_rate": 3.0395013077185103e-05, "loss": 0.3574, "step": 479400}, {"epoch": 21.547308832779226, "grad_norm": 4.57069730758667, "learning_rate": 3.0381230285446395e-05, "loss": 0.2861, "step": 479600}, {"epoch": 21.556294366070627, "grad_norm": 15.500905990600586, "learning_rate": 3.036744577843601e-05, "loss": 0.3579, "step": 479800}, {"epoch": 21.56527989936203, "grad_norm": 5.1388959884643555, "learning_rate": 3.0353659560547748e-05, "loss": 0.3689, "step": 480000}, {"epoch": 21.56527989936203, "eval_loss": 3.8755042552948, "eval_runtime": 1153.7667, "eval_samples_per_second": 8.584, "eval_steps_per_second": 0.034, "step": 480000}, {"epoch": 21.574265432653426, "grad_norm": 0.9813115000724792, "learning_rate": 3.0339871636175982e-05, "loss": 0.3489, "step": 480200}, {"epoch": 21.583250965944828, "grad_norm": 10.196927070617676, "learning_rate": 3.0326082009715636e-05, "loss": 0.3901, "step": 480400}, {"epoch": 21.59223649923623, "grad_norm": 14.794051170349121, "learning_rate": 3.031229068556215e-05, "loss": 0.3294, "step": 480600}, {"epoch": 21.60122203252763, "grad_norm": 14.24916934967041, "learning_rate": 3.029849766811153e-05, "loss": 0.387, "step": 480800}, {"epoch": 21.61020756581903, "grad_norm": 15.70306396484375, "learning_rate": 3.0284702961760304e-05, "loss": 0.3595, "step": 481000}, {"epoch": 21.61020756581903, "eval_loss": 3.8320348262786865, "eval_runtime": 1154.7214, "eval_samples_per_second": 8.577, "eval_steps_per_second": 0.034, "step": 481000}, {"epoch": 21.619193099110433, "grad_norm": 16.37736701965332, "learning_rate": 3.027090657090556e-05, "loss": 0.3717, "step": 481200}, {"epoch": 21.628178632401834, "grad_norm": 3.5008671283721924, "learning_rate": 3.025710849994489e-05, "loss": 0.3668, "step": 481400}, {"epoch": 21.637164165693235, "grad_norm": 9.52043628692627, "learning_rate": 3.024330875327646e-05, "loss": 0.3244, "step": 481600}, {"epoch": 21.646149698984633, "grad_norm": 8.85307502746582, "learning_rate": 3.022950733529894e-05, "loss": 0.3817, "step": 481800}, {"epoch": 21.655135232276034, "grad_norm": 18.641752243041992, "learning_rate": 3.0215704250411542e-05, "loss": 0.3254, "step": 482000}, {"epoch": 21.655135232276034, "eval_loss": 3.8365020751953125, "eval_runtime": 1155.1846, "eval_samples_per_second": 8.574, "eval_steps_per_second": 0.034, 
"step": 482000}, {"epoch": 21.664120765567436, "grad_norm": 11.407354354858398, "learning_rate": 3.0201899503014013e-05, "loss": 0.3427, "step": 482200}, {"epoch": 21.673106298858837, "grad_norm": 20.381561279296875, "learning_rate": 3.0188093097506642e-05, "loss": 0.3127, "step": 482400}, {"epoch": 21.68209183215024, "grad_norm": 11.307368278503418, "learning_rate": 3.0174285038290208e-05, "loss": 0.356, "step": 482600}, {"epoch": 21.69107736544164, "grad_norm": 4.448453903198242, "learning_rate": 3.016047532976606e-05, "loss": 0.3319, "step": 482800}, {"epoch": 21.70006289873304, "grad_norm": 14.862668991088867, "learning_rate": 3.0146663976336036e-05, "loss": 0.3684, "step": 483000}, {"epoch": 21.70006289873304, "eval_loss": 3.879840135574341, "eval_runtime": 1155.6614, "eval_samples_per_second": 8.57, "eval_steps_per_second": 0.034, "step": 483000}, {"epoch": 21.709048432024442, "grad_norm": 7.227370738983154, "learning_rate": 3.0132850982402538e-05, "loss": 0.3515, "step": 483200}, {"epoch": 21.71803396531584, "grad_norm": 1.9134999513626099, "learning_rate": 3.0119036352368463e-05, "loss": 0.3544, "step": 483400}, {"epoch": 21.72701949860724, "grad_norm": 5.353797912597656, "learning_rate": 3.010522009063722e-05, "loss": 0.325, "step": 483600}, {"epoch": 21.736005031898642, "grad_norm": 3.9726414680480957, "learning_rate": 3.0091402201612785e-05, "loss": 0.3743, "step": 483800}, {"epoch": 21.744990565190044, "grad_norm": 7.579124927520752, "learning_rate": 3.007758268969959e-05, "loss": 0.3347, "step": 484000}, {"epoch": 21.744990565190044, "eval_loss": 3.8592593669891357, "eval_runtime": 1154.5705, "eval_samples_per_second": 8.578, "eval_steps_per_second": 0.034, "step": 484000}, {"epoch": 21.753976098481445, "grad_norm": 2.528778076171875, "learning_rate": 3.0063761559302626e-05, "loss": 0.3497, "step": 484200}, {"epoch": 21.762961631772846, "grad_norm": 7.943315029144287, "learning_rate": 3.0049938814827405e-05, "loss": 0.3666, "step": 484400}, {"epoch": 21.771947165064248, "grad_norm": 33.58492660522461, "learning_rate": 3.0036114460679926e-05, "loss": 0.3457, "step": 484600}, {"epoch": 21.78093269835565, "grad_norm": 1.3153636455535889, "learning_rate": 3.002228850126671e-05, "loss": 0.3493, "step": 484800}, {"epoch": 21.789918231647047, "grad_norm": 8.177019119262695, "learning_rate": 3.00084609409948e-05, "loss": 0.3624, "step": 485000}, {"epoch": 21.789918231647047, "eval_loss": 3.820582389831543, "eval_runtime": 1154.2343, "eval_samples_per_second": 8.581, "eval_steps_per_second": 0.034, "step": 485000}, {"epoch": 21.798903764938448, "grad_norm": 3.7506697177886963, "learning_rate": 2.9994631784271743e-05, "loss": 0.3678, "step": 485200}, {"epoch": 21.80788929822985, "grad_norm": 14.741352081298828, "learning_rate": 2.998080103550558e-05, "loss": 0.3489, "step": 485400}, {"epoch": 21.81687483152125, "grad_norm": 9.07077693939209, "learning_rate": 2.9966968699104896e-05, "loss": 0.325, "step": 485600}, {"epoch": 21.82586036481265, "grad_norm": 56.59426498413086, "learning_rate": 2.995313477947875e-05, "loss": 0.3738, "step": 485800}, {"epoch": 21.834845898104053, "grad_norm": 16.987424850463867, "learning_rate": 2.993929928103671e-05, "loss": 0.3698, "step": 486000}, {"epoch": 21.834845898104053, "eval_loss": 3.7959418296813965, "eval_runtime": 1183.6378, "eval_samples_per_second": 8.367, "eval_steps_per_second": 0.033, "step": 486000}, {"epoch": 21.843831431395454, "grad_norm": 23.582782745361328, "learning_rate": 2.992546220818886e-05, "loss": 0.3545, "step": 486200}, 
{"epoch": 21.852816964686856, "grad_norm": 8.88424301147461, "learning_rate": 2.991162356534577e-05, "loss": 0.3428, "step": 486400}, {"epoch": 21.861802497978253, "grad_norm": 9.823083877563477, "learning_rate": 2.9897783356918536e-05, "loss": 0.3352, "step": 486600}, {"epoch": 21.870788031269655, "grad_norm": 1.0258564949035645, "learning_rate": 2.988394158731872e-05, "loss": 0.3661, "step": 486800}, {"epoch": 21.879773564561056, "grad_norm": 2.3258697986602783, "learning_rate": 2.98700982609584e-05, "loss": 0.3484, "step": 487000}, {"epoch": 21.879773564561056, "eval_loss": 3.8458335399627686, "eval_runtime": 1171.3081, "eval_samples_per_second": 8.456, "eval_steps_per_second": 0.033, "step": 487000}, {"epoch": 21.888759097852457, "grad_norm": 16.876636505126953, "learning_rate": 2.985625338225016e-05, "loss": 0.356, "step": 487200}, {"epoch": 21.89774463114386, "grad_norm": 1.0593225955963135, "learning_rate": 2.9842406955607054e-05, "loss": 0.3426, "step": 487400}, {"epoch": 21.90673016443526, "grad_norm": 0.3930041491985321, "learning_rate": 2.9828558985442647e-05, "loss": 0.3712, "step": 487600}, {"epoch": 21.91571569772666, "grad_norm": 47.871334075927734, "learning_rate": 2.9814709476170988e-05, "loss": 0.3656, "step": 487800}, {"epoch": 21.924701231018062, "grad_norm": 7.659090042114258, "learning_rate": 2.9800858432206625e-05, "loss": 0.3934, "step": 488000}, {"epoch": 21.924701231018062, "eval_loss": 3.867889881134033, "eval_runtime": 1172.2377, "eval_samples_per_second": 8.449, "eval_steps_per_second": 0.033, "step": 488000}, {"epoch": 21.933686764309464, "grad_norm": 11.335125923156738, "learning_rate": 2.9787005857964583e-05, "loss": 0.3697, "step": 488200}, {"epoch": 21.94267229760086, "grad_norm": 5.224600791931152, "learning_rate": 2.977315175786039e-05, "loss": 0.3876, "step": 488400}, {"epoch": 21.951657830892263, "grad_norm": 0.7447425723075867, "learning_rate": 2.9759296136310048e-05, "loss": 0.3723, "step": 488600}, {"epoch": 21.960643364183664, "grad_norm": 13.654375076293945, "learning_rate": 2.9745438997730045e-05, "loss": 0.3389, "step": 488800}, {"epoch": 21.969628897475065, "grad_norm": 3.7496023178100586, "learning_rate": 2.9731580346537357e-05, "loss": 0.3349, "step": 489000}, {"epoch": 21.969628897475065, "eval_loss": 3.8698184490203857, "eval_runtime": 1168.8312, "eval_samples_per_second": 8.473, "eval_steps_per_second": 0.033, "step": 489000}, {"epoch": 21.978614430766466, "grad_norm": 1.3468828201293945, "learning_rate": 2.971772018714945e-05, "loss": 0.3456, "step": 489200}, {"epoch": 21.987599964057868, "grad_norm": 6.780975341796875, "learning_rate": 2.9703858523984245e-05, "loss": 0.3457, "step": 489400}, {"epoch": 21.99658549734927, "grad_norm": 5.41343355178833, "learning_rate": 2.9689995361460175e-05, "loss": 0.3758, "step": 489600}, {"epoch": 22.00557103064067, "grad_norm": 4.552206993103027, "learning_rate": 2.9676130703996124e-05, "loss": 0.3399, "step": 489800}, {"epoch": 22.014556563932068, "grad_norm": 9.643780708312988, "learning_rate": 2.9662264556011465e-05, "loss": 0.3381, "step": 490000}, {"epoch": 22.014556563932068, "eval_loss": 3.8691928386688232, "eval_runtime": 1170.7785, "eval_samples_per_second": 8.459, "eval_steps_per_second": 0.033, "step": 490000}, {"epoch": 22.02354209722347, "grad_norm": 7.726506233215332, "learning_rate": 2.9648396921926047e-05, "loss": 0.3159, "step": 490200}, {"epoch": 22.03252763051487, "grad_norm": 4.900279521942139, "learning_rate": 2.963452780616019e-05, "loss": 0.3327, "step": 490400}, {"epoch": 
22.041513163806272, "grad_norm": 6.858339786529541, "learning_rate": 2.9620657213134684e-05, "loss": 0.3054, "step": 490600}, {"epoch": 22.050498697097673, "grad_norm": 1.6258982419967651, "learning_rate": 2.9606785147270798e-05, "loss": 0.3267, "step": 490800}, {"epoch": 22.059484230389074, "grad_norm": 0.9190937876701355, "learning_rate": 2.959291161299026e-05, "loss": 0.3167, "step": 491000}, {"epoch": 22.059484230389074, "eval_loss": 3.9671905040740967, "eval_runtime": 1171.7463, "eval_samples_per_second": 8.452, "eval_steps_per_second": 0.033, "step": 491000}, {"epoch": 22.068469763680476, "grad_norm": 10.989773750305176, "learning_rate": 2.9579036614715267e-05, "loss": 0.3332, "step": 491200}, {"epoch": 22.077455296971877, "grad_norm": 10.96854305267334, "learning_rate": 2.95651601568685e-05, "loss": 0.3212, "step": 491400}, {"epoch": 22.086440830263275, "grad_norm": 5.382962703704834, "learning_rate": 2.9551282243873068e-05, "loss": 0.3327, "step": 491600}, {"epoch": 22.095426363554676, "grad_norm": 13.09936237335205, "learning_rate": 2.953740288015259e-05, "loss": 0.3301, "step": 491800}, {"epoch": 22.104411896846077, "grad_norm": 2.1858365535736084, "learning_rate": 2.9523522070131116e-05, "loss": 0.3324, "step": 492000}, {"epoch": 22.104411896846077, "eval_loss": 3.9012913703918457, "eval_runtime": 1170.9398, "eval_samples_per_second": 8.458, "eval_steps_per_second": 0.033, "step": 492000}, {"epoch": 22.11339743013748, "grad_norm": 2.50134015083313, "learning_rate": 2.9509639818233166e-05, "loss": 0.2969, "step": 492200}, {"epoch": 22.12238296342888, "grad_norm": 1.286801815032959, "learning_rate": 2.9495756128883716e-05, "loss": 0.2918, "step": 492400}, {"epoch": 22.13136849672028, "grad_norm": 2.6734347343444824, "learning_rate": 2.9481871006508215e-05, "loss": 0.3323, "step": 492600}, {"epoch": 22.140354030011682, "grad_norm": 6.276237487792969, "learning_rate": 2.946798445553254e-05, "loss": 0.323, "step": 492800}, {"epoch": 22.149339563303084, "grad_norm": 1.7359256744384766, "learning_rate": 2.945409648038306e-05, "loss": 0.3305, "step": 493000}, {"epoch": 22.149339563303084, "eval_loss": 3.8641602993011475, "eval_runtime": 1172.5282, "eval_samples_per_second": 8.447, "eval_steps_per_second": 0.033, "step": 493000}, {"epoch": 22.15832509659448, "grad_norm": 17.382686614990234, "learning_rate": 2.9440207085486565e-05, "loss": 0.3097, "step": 493200}, {"epoch": 22.167310629885883, "grad_norm": 5.912476062774658, "learning_rate": 2.9426316275270316e-05, "loss": 0.3329, "step": 493400}, {"epoch": 22.176296163177284, "grad_norm": 9.099150657653809, "learning_rate": 2.941242405416203e-05, "loss": 0.3517, "step": 493600}, {"epoch": 22.185281696468685, "grad_norm": 1.9675058126449585, "learning_rate": 2.9398530426589843e-05, "loss": 0.3251, "step": 493800}, {"epoch": 22.194267229760086, "grad_norm": 3.559220552444458, "learning_rate": 2.9384635396982373e-05, "loss": 0.3182, "step": 494000}, {"epoch": 22.194267229760086, "eval_loss": 3.8617329597473145, "eval_runtime": 1172.2551, "eval_samples_per_second": 8.449, "eval_steps_per_second": 0.033, "step": 494000}, {"epoch": 22.203252763051488, "grad_norm": 1.4313397407531738, "learning_rate": 2.937073896976868e-05, "loss": 0.3291, "step": 494200}, {"epoch": 22.21223829634289, "grad_norm": 10.649069786071777, "learning_rate": 2.9356841149378243e-05, "loss": 0.3143, "step": 494400}, {"epoch": 22.22122382963429, "grad_norm": 2.5395827293395996, "learning_rate": 2.934294194024102e-05, "loss": 0.3239, "step": 494600}, {"epoch": 
22.230209362925688, "grad_norm": 16.162391662597656, "learning_rate": 2.9329041346787393e-05, "loss": 0.3264, "step": 494800}, {"epoch": 22.23919489621709, "grad_norm": 4.001119136810303, "learning_rate": 2.9315139373448187e-05, "loss": 0.3633, "step": 495000}, {"epoch": 22.23919489621709, "eval_loss": 3.887908935546875, "eval_runtime": 1171.3046, "eval_samples_per_second": 8.456, "eval_steps_per_second": 0.033, "step": 495000}, {"epoch": 22.24818042950849, "grad_norm": 3.224276065826416, "learning_rate": 2.930123602465466e-05, "loss": 0.3412, "step": 495200}, {"epoch": 22.257165962799892, "grad_norm": 8.406235694885254, "learning_rate": 2.9287331304838526e-05, "loss": 0.3101, "step": 495400}, {"epoch": 22.266151496091293, "grad_norm": 0.37792113423347473, "learning_rate": 2.927342521843191e-05, "loss": 0.313, "step": 495600}, {"epoch": 22.275137029382694, "grad_norm": 7.6752119064331055, "learning_rate": 2.925951776986742e-05, "loss": 0.3194, "step": 495800}, {"epoch": 22.284122562674096, "grad_norm": 8.115521430969238, "learning_rate": 2.9245608963578035e-05, "loss": 0.3282, "step": 496000}, {"epoch": 22.284122562674096, "eval_loss": 3.8440310955047607, "eval_runtime": 1171.3815, "eval_samples_per_second": 8.455, "eval_steps_per_second": 0.033, "step": 496000}, {"epoch": 22.293108095965497, "grad_norm": 6.122352123260498, "learning_rate": 2.9231698803997214e-05, "loss": 0.3584, "step": 496200}, {"epoch": 29.735234215885946, "grad_norm": 6.727758884429932, "learning_rate": 1.76713460327016e-05, "loss": 0.4305, "step": 496400}, {"epoch": 29.7472145681083, "grad_norm": 27.16288185119629, "learning_rate": 1.7653356059332797e-05, "loss": 0.4504, "step": 496600}, {"epoch": 29.759194920330657, "grad_norm": 20.496925354003906, "learning_rate": 1.7635370248836235e-05, "loss": 0.4269, "step": 496800}, {"epoch": 29.771175272553013, "grad_norm": 11.9760160446167, "learning_rate": 1.7617388611403342e-05, "loss": 0.4121, "step": 497000}, {"epoch": 29.771175272553013, "eval_loss": 1.3001623153686523, "eval_runtime": 1179.5019, "eval_samples_per_second": 8.397, "eval_steps_per_second": 0.525, "step": 497000}, {"epoch": 29.783155624775368, "grad_norm": 18.339258193969727, "learning_rate": 1.7599411157223162e-05, "loss": 0.3986, "step": 497200}, {"epoch": 29.795135976997724, "grad_norm": 13.581840515136719, "learning_rate": 1.758143789648235e-05, "loss": 0.4327, "step": 497400}, {"epoch": 29.80711632922008, "grad_norm": 7.681920528411865, "learning_rate": 1.7563468839365203e-05, "loss": 0.4123, "step": 497600}, {"epoch": 29.819096681442435, "grad_norm": 9.169760704040527, "learning_rate": 1.7545503996053654e-05, "loss": 0.414, "step": 497800}, {"epoch": 29.83107703366479, "grad_norm": 14.092098236083984, "learning_rate": 1.7527543376727206e-05, "loss": 0.4185, "step": 498000}, {"epoch": 29.83107703366479, "eval_loss": 1.3006553649902344, "eval_runtime": 1179.444, "eval_samples_per_second": 8.397, "eval_steps_per_second": 0.525, "step": 498000}, {"epoch": 29.843057385887146, "grad_norm": 5.654545783996582, "learning_rate": 1.7509586991563e-05, "loss": 0.4006, "step": 498200}, {"epoch": 29.855037738109502, "grad_norm": 13.537749290466309, "learning_rate": 1.7491634850735765e-05, "loss": 0.4088, "step": 498400}, {"epoch": 29.867018090331857, "grad_norm": 24.24238395690918, "learning_rate": 1.7473686964417836e-05, "loss": 0.432, "step": 498600}, {"epoch": 29.87899844255421, "grad_norm": 9.747505187988281, "learning_rate": 1.745574334277912e-05, "loss": 0.4162, "step": 498800}, {"epoch": 29.890978794776565, 
"grad_norm": 17.57337188720703, "learning_rate": 1.743780399598713e-05, "loss": 0.4, "step": 499000}, {"epoch": 29.890978794776565, "eval_loss": 1.2909830808639526, "eval_runtime": 1174.8573, "eval_samples_per_second": 8.43, "eval_steps_per_second": 0.527, "step": 499000}, {"epoch": 29.90295914699892, "grad_norm": 20.43497657775879, "learning_rate": 1.7419868934206927e-05, "loss": 0.3781, "step": 499200}, {"epoch": 29.914939499221276, "grad_norm": 6.868372917175293, "learning_rate": 1.7401938167601173e-05, "loss": 0.3713, "step": 499400}, {"epoch": 29.926919851443632, "grad_norm": 3.9050910472869873, "learning_rate": 1.7384011706330083e-05, "loss": 0.3943, "step": 499600}, {"epoch": 29.938900203665987, "grad_norm": 4.61909294128418, "learning_rate": 1.7366089560551432e-05, "loss": 0.4047, "step": 499800}, {"epoch": 29.950880555888343, "grad_norm": 14.102638244628906, "learning_rate": 1.7348171740420547e-05, "loss": 0.4009, "step": 500000}, {"epoch": 29.950880555888343, "eval_loss": 1.2899349927902222, "eval_runtime": 1176.47, "eval_samples_per_second": 8.418, "eval_steps_per_second": 0.526, "step": 500000}, {"epoch": 29.9628609081107, "grad_norm": 16.03158187866211, "learning_rate": 1.7330258256090326e-05, "loss": 0.3929, "step": 500200}, {"epoch": 29.974841260333054, "grad_norm": 12.243492126464844, "learning_rate": 1.731234911771117e-05, "loss": 0.423, "step": 500400}, {"epoch": 29.98682161255541, "grad_norm": 17.75141143798828, "learning_rate": 1.7294444335431046e-05, "loss": 0.3905, "step": 500600}, {"epoch": 29.998801964777766, "grad_norm": 14.251209259033203, "learning_rate": 1.7276543919395454e-05, "loss": 0.4274, "step": 500800}, {"epoch": 30.01078231700012, "grad_norm": 5.90828275680542, "learning_rate": 1.725864787974741e-05, "loss": 0.3744, "step": 501000}, {"epoch": 30.01078231700012, "eval_loss": 1.2981280088424683, "eval_runtime": 1177.1036, "eval_samples_per_second": 8.414, "eval_steps_per_second": 0.526, "step": 501000}, {"epoch": 30.022762669222477, "grad_norm": 7.459860324859619, "learning_rate": 1.724075622662745e-05, "loss": 0.3641, "step": 501200}, {"epoch": 30.03474302144483, "grad_norm": 6.359617710113525, "learning_rate": 1.7222868970173625e-05, "loss": 0.3961, "step": 501400}, {"epoch": 30.046723373667184, "grad_norm": 8.468971252441406, "learning_rate": 1.72049861205215e-05, "loss": 0.3861, "step": 501600}, {"epoch": 30.05870372588954, "grad_norm": 9.226763725280762, "learning_rate": 1.718710768780414e-05, "loss": 0.3803, "step": 501800}, {"epoch": 30.070684078111896, "grad_norm": 6.459045886993408, "learning_rate": 1.7169233682152108e-05, "loss": 0.3691, "step": 502000}, {"epoch": 30.070684078111896, "eval_loss": 1.2914437055587769, "eval_runtime": 1176.221, "eval_samples_per_second": 8.42, "eval_steps_per_second": 0.526, "step": 502000}, {"epoch": 30.08266443033425, "grad_norm": 0.5821087956428528, "learning_rate": 1.7151364113693456e-05, "loss": 0.3721, "step": 502200}, {"epoch": 30.094644782556607, "grad_norm": 0.9501954317092896, "learning_rate": 1.713349899255372e-05, "loss": 0.4402, "step": 502400}, {"epoch": 30.106625134778962, "grad_norm": 4.453815460205078, "learning_rate": 1.7115638328855927e-05, "loss": 0.4195, "step": 502600}, {"epoch": 30.118605487001318, "grad_norm": 5.928565502166748, "learning_rate": 1.709778213272056e-05, "loss": 0.4023, "step": 502800}, {"epoch": 30.130585839223674, "grad_norm": 12.186752319335938, "learning_rate": 1.7079930414265587e-05, "loss": 0.3775, "step": 503000}, {"epoch": 30.130585839223674, "eval_loss": 
1.2876982688903809, "eval_runtime": 1177.2126, "eval_samples_per_second": 8.413, "eval_steps_per_second": 0.526, "step": 503000}, {"epoch": 30.14256619144603, "grad_norm": 6.3686017990112305, "learning_rate": 1.706208318360644e-05, "loss": 0.3965, "step": 503200}, {"epoch": 30.154546543668385, "grad_norm": 5.7197089195251465, "learning_rate": 1.7044240450855985e-05, "loss": 0.3283, "step": 503400}, {"epoch": 30.16652689589074, "grad_norm": 9.594609260559082, "learning_rate": 1.7026402226124558e-05, "loss": 0.4004, "step": 503600}, {"epoch": 30.178507248113096, "grad_norm": 4.027350425720215, "learning_rate": 1.7008568519519958e-05, "loss": 0.4013, "step": 503800}, {"epoch": 30.19048760033545, "grad_norm": 5.989893913269043, "learning_rate": 1.6990739341147378e-05, "loss": 0.3604, "step": 504000}, {"epoch": 30.19048760033545, "eval_loss": 1.2966716289520264, "eval_runtime": 1178.6668, "eval_samples_per_second": 8.403, "eval_steps_per_second": 0.525, "step": 504000}, {"epoch": 30.202467952557804, "grad_norm": 3.6295764446258545, "learning_rate": 1.6972914701109475e-05, "loss": 0.4039, "step": 504200}, {"epoch": 30.21444830478016, "grad_norm": 22.197795867919922, "learning_rate": 1.6955094609506355e-05, "loss": 0.3813, "step": 504400}, {"epoch": 30.226428657002515, "grad_norm": 16.731632232666016, "learning_rate": 1.6937279076435488e-05, "loss": 0.4041, "step": 504600}, {"epoch": 30.23840900922487, "grad_norm": 9.170949935913086, "learning_rate": 1.6919468111991805e-05, "loss": 0.3707, "step": 504800}, {"epoch": 30.250389361447226, "grad_norm": 10.209980010986328, "learning_rate": 1.690166172626766e-05, "loss": 0.3934, "step": 505000}, {"epoch": 30.250389361447226, "eval_loss": 1.289827585220337, "eval_runtime": 1172.8257, "eval_samples_per_second": 8.445, "eval_steps_per_second": 0.528, "step": 505000}, {"epoch": 30.26236971366958, "grad_norm": 4.348522663116455, "learning_rate": 1.6883859929352756e-05, "loss": 0.3851, "step": 505200}, {"epoch": 30.274350065891937, "grad_norm": 4.488011360168457, "learning_rate": 1.6866062731334254e-05, "loss": 0.402, "step": 505400}, {"epoch": 30.286330418114293, "grad_norm": 9.877191543579102, "learning_rate": 1.6848270142296684e-05, "loss": 0.4081, "step": 505600}, {"epoch": 30.29831077033665, "grad_norm": 8.008275032043457, "learning_rate": 1.683048217232195e-05, "loss": 0.3914, "step": 505800}, {"epoch": 37.8884312991389, "grad_norm": 9.622276306152344, "learning_rate": 3.428469915162767e-05, "loss": 0.4741, "step": 506000}, {"epoch": 37.8884312991389, "eval_loss": 1.4929417371749878, "eval_runtime": 1176.5683, "eval_samples_per_second": 8.418, "eval_steps_per_second": 0.422, "step": 506000}, {"epoch": 37.90340696368401, "grad_norm": 9.39040756225586, "learning_rate": 3.4273777458497844e-05, "loss": 0.5071, "step": 506200}, {"epoch": 37.918382628229125, "grad_norm": 5.257010459899902, "learning_rate": 3.426285371263784e-05, "loss": 0.5165, "step": 506400}, {"epoch": 37.933358292774244, "grad_norm": 11.673007011413574, "learning_rate": 3.425192791646561e-05, "loss": 0.4934, "step": 506600}, {"epoch": 37.948333957319356, "grad_norm": 8.285956382751465, "learning_rate": 3.424100007239956e-05, "loss": 0.4938, "step": 506800}, {"epoch": 37.96330962186447, "grad_norm": 12.166071891784668, "learning_rate": 3.423007018285853e-05, "loss": 0.5294, "step": 507000}, {"epoch": 37.96330962186447, "eval_loss": 1.4884788990020752, "eval_runtime": 1175.8977, "eval_samples_per_second": 8.423, "eval_steps_per_second": 0.422, "step": 507000}, {"epoch": 37.97828528640959, 
"grad_norm": 8.446301460266113, "learning_rate": 3.4219138250261844e-05, "loss": 0.4972, "step": 507200}, {"epoch": 37.9932609509547, "grad_norm": 11.271819114685059, "learning_rate": 3.4208204277029254e-05, "loss": 0.4955, "step": 507400}, {"epoch": 38.00823661549981, "grad_norm": 3.9000062942504883, "learning_rate": 3.419726826558097e-05, "loss": 0.5022, "step": 507600}, {"epoch": 38.02321228004493, "grad_norm": 5.071476936340332, "learning_rate": 3.418633021833766e-05, "loss": 0.5001, "step": 507800}, {"epoch": 38.03818794459004, "grad_norm": 6.885400295257568, "learning_rate": 3.4175390137720426e-05, "loss": 0.5128, "step": 508000}, {"epoch": 38.03818794459004, "eval_loss": 1.5250493288040161, "eval_runtime": 1200.985, "eval_samples_per_second": 8.247, "eval_steps_per_second": 0.413, "step": 508000}, {"epoch": 38.053163609135154, "grad_norm": 10.252553939819336, "learning_rate": 3.4164448026150833e-05, "loss": 0.5061, "step": 508200}, {"epoch": 38.068139273680266, "grad_norm": 13.395514488220215, "learning_rate": 3.41535038860509e-05, "loss": 0.4998, "step": 508400}, {"epoch": 38.083114938225386, "grad_norm": 8.786001205444336, "learning_rate": 3.414255771984308e-05, "loss": 0.4754, "step": 508600}, {"epoch": 38.0980906027705, "grad_norm": 4.015108108520508, "learning_rate": 3.413160952995029e-05, "loss": 0.5136, "step": 508800}, {"epoch": 38.11306626731561, "grad_norm": 12.217897415161133, "learning_rate": 3.412065931879589e-05, "loss": 0.5201, "step": 509000}, {"epoch": 38.11306626731561, "eval_loss": 1.5008865594863892, "eval_runtime": 1179.3365, "eval_samples_per_second": 8.398, "eval_steps_per_second": 0.421, "step": 509000}, {"epoch": 38.12804193186073, "grad_norm": 6.388108730316162, "learning_rate": 3.4109707088803675e-05, "loss": 0.4798, "step": 509200}, {"epoch": 38.14301759640584, "grad_norm": 11.298495292663574, "learning_rate": 3.40987528423979e-05, "loss": 0.5075, "step": 509400}, {"epoch": 38.15799326095095, "grad_norm": 10.524277687072754, "learning_rate": 3.4087796582003275e-05, "loss": 0.5069, "step": 509600}, {"epoch": 38.17296892549607, "grad_norm": 4.701446533203125, "learning_rate": 3.407683831004492e-05, "loss": 0.5348, "step": 509800}, {"epoch": 38.187944590041184, "grad_norm": 6.099429130554199, "learning_rate": 3.4065878028948444e-05, "loss": 0.5197, "step": 510000}, {"epoch": 38.187944590041184, "eval_loss": 1.4903627634048462, "eval_runtime": 1176.9696, "eval_samples_per_second": 8.415, "eval_steps_per_second": 0.421, "step": 510000}, {"epoch": 38.202920254586296, "grad_norm": 2.2168385982513428, "learning_rate": 3.405491574113988e-05, "loss": 0.5274, "step": 510200}, {"epoch": 38.217895919131415, "grad_norm": 5.571523666381836, "learning_rate": 3.4043951449045695e-05, "loss": 0.508, "step": 510400}, {"epoch": 38.23287158367653, "grad_norm": 6.537966728210449, "learning_rate": 3.403298515509283e-05, "loss": 0.4882, "step": 510600}, {"epoch": 38.24784724822164, "grad_norm": 28.721155166625977, "learning_rate": 3.4022016861708624e-05, "loss": 0.5249, "step": 510800}, {"epoch": 38.26282291276675, "grad_norm": 42.41488265991211, "learning_rate": 3.401104657132091e-05, "loss": 0.5189, "step": 511000}, {"epoch": 38.26282291276675, "eval_loss": 1.4894089698791504, "eval_runtime": 1177.1322, "eval_samples_per_second": 8.414, "eval_steps_per_second": 0.421, "step": 511000}, {"epoch": 38.27779857731187, "grad_norm": 4.228972434997559, "learning_rate": 3.4000074286357915e-05, "loss": 0.5163, "step": 511200}, {"epoch": 38.29277424185698, "grad_norm": 15.363574028015137, 
"learning_rate": 3.398910000924834e-05, "loss": 0.5183, "step": 511400}, {"epoch": 38.307749906402094, "grad_norm": 9.935384750366211, "learning_rate": 3.3978123742421324e-05, "loss": 0.5591, "step": 511600}, {"epoch": 38.32272557094721, "grad_norm": 4.404662609100342, "learning_rate": 3.396714548830643e-05, "loss": 0.5022, "step": 511800}, {"epoch": 38.337701235492325, "grad_norm": 11.139054298400879, "learning_rate": 3.395616524933368e-05, "loss": 0.5119, "step": 512000}, {"epoch": 38.337701235492325, "eval_loss": 1.499504804611206, "eval_runtime": 1176.9511, "eval_samples_per_second": 8.415, "eval_steps_per_second": 0.421, "step": 512000}, {"epoch": 38.35267690003744, "grad_norm": 6.452362537384033, "learning_rate": 3.39451830279335e-05, "loss": 0.5166, "step": 512200}, {"epoch": 38.367652564582556, "grad_norm": 18.25565528869629, "learning_rate": 3.3934198826536816e-05, "loss": 0.5368, "step": 512400}, {"epoch": 38.38262822912767, "grad_norm": 23.89322280883789, "learning_rate": 3.3923212647574944e-05, "loss": 0.5608, "step": 512600}, {"epoch": 38.39760389367278, "grad_norm": 31.064790725708008, "learning_rate": 3.3912224493479636e-05, "loss": 0.5323, "step": 512800}, {"epoch": 38.4125795582179, "grad_norm": 5.670620918273926, "learning_rate": 3.390123436668312e-05, "loss": 0.5253, "step": 513000}, {"epoch": 38.4125795582179, "eval_loss": 1.4939533472061157, "eval_runtime": 1175.7739, "eval_samples_per_second": 8.423, "eval_steps_per_second": 0.422, "step": 513000}, {"epoch": 38.42755522276301, "grad_norm": 13.84334945678711, "learning_rate": 3.389024226961801e-05, "loss": 0.5102, "step": 513200}, {"epoch": 38.44253088730812, "grad_norm": 6.627730369567871, "learning_rate": 3.38792482047174e-05, "loss": 0.5247, "step": 513400}, {"epoch": 38.457506551853236, "grad_norm": 6.677562236785889, "learning_rate": 3.3868252174414795e-05, "loss": 0.5242, "step": 513600}, {"epoch": 38.472482216398355, "grad_norm": 12.594396591186523, "learning_rate": 3.385725418114414e-05, "loss": 0.5081, "step": 513800}, {"epoch": 38.48745788094347, "grad_norm": 11.885519981384277, "learning_rate": 3.384625422733983e-05, "loss": 0.5153, "step": 514000}, {"epoch": 38.48745788094347, "eval_loss": 1.4921842813491821, "eval_runtime": 1175.5948, "eval_samples_per_second": 8.425, "eval_steps_per_second": 0.422, "step": 514000}], "logging_steps": 200, "max_steps": 1335500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 200, "stateful_callbacks": {"TrainerControl": {"args": {"should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false}, "attributes": {}}}, "total_flos": 7.492972918996992e+18, "train_batch_size": 10, "trial_name": null, "trial_params": null} \ No newline at end of file +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 39.10146012729315, + "eval_steps": 1000, + "global_step": 522200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008985533291400845, + "grad_norm": 0.0016701683634892106, + "learning_rate": 4.9999996212343494e-05, + "loss": 1.1088, + "step": 200 + }, + { + "epoch": 0.01797106658280169, + "grad_norm": 0.011517546139657497, + "learning_rate": 4.999998445840733e-05, + "loss": 1.0986, + "step": 400 + }, + { + "epoch": 0.026956599874202535, + "grad_norm": 0.008341927081346512, + "learning_rate": 4.999996473570505e-05, + "loss": 1.0995, + "step": 600 + }, + { + "epoch": 0.03594213316560338, + "grad_norm": 
0.00020909779414068907, + "learning_rate": 4.999993704424292e-05, + "loss": 1.1061, + "step": 800 + }, + { + "epoch": 0.044927666457004224, + "grad_norm": 0.0004861349880229682, + "learning_rate": 4.999990138402976e-05, + "loss": 1.0986, + "step": 1000 + }, + { + "epoch": 0.044927666457004224, + "eval_loss": 2.7725830078125, + "eval_runtime": 1124.9446, + "eval_samples_per_second": 8.804, + "eval_steps_per_second": 0.276, + "step": 1000 + }, + { + "epoch": 0.05391319974840507, + "grad_norm": 0.0002919256512541324, + "learning_rate": 4.999985775507695e-05, + "loss": 1.0986, + "step": 1200 + }, + { + "epoch": 0.0628987330398059, + "grad_norm": 0.0018022151198238134, + "learning_rate": 4.99998061573984e-05, + "loss": 1.0986, + "step": 1400 + }, + { + "epoch": 0.07188426633120676, + "grad_norm": 1.6316088438034058, + "learning_rate": 4.9999746591010545e-05, + "loss": 1.0974, + "step": 1600 + }, + { + "epoch": 0.0808697996226076, + "grad_norm": 3.418214797973633, + "learning_rate": 4.999967905593237e-05, + "loss": 1.104, + "step": 1800 + }, + { + "epoch": 0.08985533291400845, + "grad_norm": 0.019139207899570465, + "learning_rate": 4.9999603552185416e-05, + "loss": 1.1005, + "step": 2000 + }, + { + "epoch": 0.08985533291400845, + "eval_loss": 3.4651877880096436, + "eval_runtime": 1072.8319, + "eval_samples_per_second": 9.232, + "eval_steps_per_second": 0.144, + "step": 2000 + }, + { + "epoch": 0.09884086620540929, + "grad_norm": 0.47359538078308105, + "learning_rate": 4.999952007979374e-05, + "loss": 1.1032, + "step": 2200 + }, + { + "epoch": 0.10782639949681014, + "grad_norm": 1.0411008596420288, + "learning_rate": 4.999942863878394e-05, + "loss": 1.0966, + "step": 2400 + }, + { + "epoch": 0.11681193278821098, + "grad_norm": 2.402155876159668, + "learning_rate": 4.999932922918519e-05, + "loss": 1.0791, + "step": 2600 + }, + { + "epoch": 0.1257974660796118, + "grad_norm": 1.485827088356018, + "learning_rate": 4.999922185102915e-05, + "loss": 1.0514, + "step": 2800 + }, + { + "epoch": 0.13478299937101268, + "grad_norm": 2.352109432220459, + "learning_rate": 4.9999106504350065e-05, + "loss": 1.0327, + "step": 3000 + }, + { + "epoch": 0.13478299937101268, + "eval_loss": 3.369852066040039, + "eval_runtime": 1064.8233, + "eval_samples_per_second": 9.301, + "eval_steps_per_second": 0.146, + "step": 3000 + }, + { + "epoch": 0.14376853266241352, + "grad_norm": 0.7272612452507019, + "learning_rate": 4.999898318918469e-05, + "loss": 1.0379, + "step": 3200 + }, + { + "epoch": 0.15275406595381436, + "grad_norm": 1.021616816520691, + "learning_rate": 4.999885190557234e-05, + "loss": 1.0416, + "step": 3400 + }, + { + "epoch": 0.1617395992452152, + "grad_norm": 2.4565377235412598, + "learning_rate": 4.999871265355485e-05, + "loss": 1.0212, + "step": 3600 + }, + { + "epoch": 0.17072513253661606, + "grad_norm": 20.56285858154297, + "learning_rate": 4.9998565433176624e-05, + "loss": 1.0219, + "step": 3800 + }, + { + "epoch": 0.1797106658280169, + "grad_norm": 0.7909038662910461, + "learning_rate": 4.9998410244484574e-05, + "loss": 1.0075, + "step": 4000 + }, + { + "epoch": 0.1797106658280169, + "eval_loss": 3.339078903198242, + "eval_runtime": 1066.4833, + "eval_samples_per_second": 9.287, + "eval_steps_per_second": 0.145, + "step": 4000 + }, + { + "epoch": 0.18869619911941773, + "grad_norm": 2.09454607963562, + "learning_rate": 4.999824708752817e-05, + "loss": 0.9825, + "step": 4200 + }, + { + "epoch": 0.19768173241081857, + "grad_norm": 2.223658323287964, + "learning_rate": 4.999807596235943e-05, + "loss": 
0.9851, + "step": 4400 + }, + { + "epoch": 0.20666726570221944, + "grad_norm": 1.121969223022461, + "learning_rate": 4.999789686903289e-05, + "loss": 1.0041, + "step": 4600 + }, + { + "epoch": 0.21565279899362028, + "grad_norm": 4.0251312255859375, + "learning_rate": 4.9997709807605626e-05, + "loss": 0.9841, + "step": 4800 + }, + { + "epoch": 0.22463833228502111, + "grad_norm": 1.6437472105026245, + "learning_rate": 4.9997514778137275e-05, + "loss": 0.9483, + "step": 5000 + }, + { + "epoch": 0.22463833228502111, + "eval_loss": 3.2980644702911377, + "eval_runtime": 1067.9785, + "eval_samples_per_second": 9.274, + "eval_steps_per_second": 0.145, + "step": 5000 + }, + { + "epoch": 0.23362386557642195, + "grad_norm": 0.8991021513938904, + "learning_rate": 4.999731178069001e-05, + "loss": 0.9541, + "step": 5200 + }, + { + "epoch": 0.24260939886782282, + "grad_norm": 3.1451597213745117, + "learning_rate": 4.999710081532853e-05, + "loss": 0.9589, + "step": 5400 + }, + { + "epoch": 0.2515949321592236, + "grad_norm": 2.142390489578247, + "learning_rate": 4.999688188212007e-05, + "loss": 0.9677, + "step": 5600 + }, + { + "epoch": 0.2605804654506245, + "grad_norm": 2.2872331142425537, + "learning_rate": 4.999665498113444e-05, + "loss": 0.962, + "step": 5800 + }, + { + "epoch": 0.26956599874202536, + "grad_norm": 2.730259418487549, + "learning_rate": 4.999642011244394e-05, + "loss": 0.9581, + "step": 6000 + }, + { + "epoch": 0.26956599874202536, + "eval_loss": 3.3341598510742188, + "eval_runtime": 1066.5406, + "eval_samples_per_second": 9.286, + "eval_steps_per_second": 0.145, + "step": 6000 + }, + { + "epoch": 0.2785515320334262, + "grad_norm": 2.8416945934295654, + "learning_rate": 4.999617727612344e-05, + "loss": 0.9675, + "step": 6200 + }, + { + "epoch": 0.28753706532482703, + "grad_norm": 2.8148677349090576, + "learning_rate": 4.9995926472250356e-05, + "loss": 0.9411, + "step": 6400 + }, + { + "epoch": 0.2965225986162279, + "grad_norm": 1.3317234516143799, + "learning_rate": 4.999566770090462e-05, + "loss": 0.9279, + "step": 6600 + }, + { + "epoch": 0.3055081319076287, + "grad_norm": 3.403902053833008, + "learning_rate": 4.999540096216872e-05, + "loss": 0.9293, + "step": 6800 + }, + { + "epoch": 0.31449366519902955, + "grad_norm": 1.70892333984375, + "learning_rate": 4.9995126256127675e-05, + "loss": 0.9475, + "step": 7000 + }, + { + "epoch": 0.31449366519902955, + "eval_loss": 3.238970994949341, + "eval_runtime": 1068.527, + "eval_samples_per_second": 9.269, + "eval_steps_per_second": 0.145, + "step": 7000 + }, + { + "epoch": 0.3234791984904304, + "grad_norm": 3.11971378326416, + "learning_rate": 4.999484358286907e-05, + "loss": 0.9465, + "step": 7200 + }, + { + "epoch": 0.3324647317818312, + "grad_norm": 1.395370364189148, + "learning_rate": 4.9994552942482975e-05, + "loss": 0.9445, + "step": 7400 + }, + { + "epoch": 0.3414502650732321, + "grad_norm": 6.5639424324035645, + "learning_rate": 4.999425433506204e-05, + "loss": 0.9263, + "step": 7600 + }, + { + "epoch": 0.35043579836463296, + "grad_norm": 2.2011075019836426, + "learning_rate": 4.999394776070146e-05, + "loss": 0.9193, + "step": 7800 + }, + { + "epoch": 0.3594213316560338, + "grad_norm": 2.9525458812713623, + "learning_rate": 4.999363321949895e-05, + "loss": 0.9405, + "step": 8000 + }, + { + "epoch": 0.3594213316560338, + "eval_loss": 3.2370519638061523, + "eval_runtime": 1068.6545, + "eval_samples_per_second": 9.268, + "eval_steps_per_second": 0.145, + "step": 8000 + }, + { + "epoch": 0.36840686494743463, + "grad_norm": 
4.726866245269775, + "learning_rate": 4.999331071155477e-05, + "loss": 0.9391, + "step": 8200 + }, + { + "epoch": 0.37739239823883547, + "grad_norm": 2.23179292678833, + "learning_rate": 4.9992980236971723e-05, + "loss": 0.9352, + "step": 8400 + }, + { + "epoch": 0.3863779315302363, + "grad_norm": 2.175626516342163, + "learning_rate": 4.9992641795855134e-05, + "loss": 0.9359, + "step": 8600 + }, + { + "epoch": 0.39536346482163714, + "grad_norm": 5.489994525909424, + "learning_rate": 4.9992295388312895e-05, + "loss": 0.918, + "step": 8800 + }, + { + "epoch": 0.404348998113038, + "grad_norm": 1.484823226928711, + "learning_rate": 4.9991941014455414e-05, + "loss": 0.9075, + "step": 9000 + }, + { + "epoch": 0.404348998113038, + "eval_loss": 3.1722910404205322, + "eval_runtime": 1070.0307, + "eval_samples_per_second": 9.256, + "eval_steps_per_second": 0.145, + "step": 9000 + }, + { + "epoch": 0.4133345314044389, + "grad_norm": 1.1743195056915283, + "learning_rate": 4.9991578674395656e-05, + "loss": 0.9116, + "step": 9200 + }, + { + "epoch": 0.4223200646958397, + "grad_norm": 4.027889728546143, + "learning_rate": 4.999120836824912e-05, + "loss": 0.9023, + "step": 9400 + }, + { + "epoch": 0.43130559798724055, + "grad_norm": 3.1647088527679443, + "learning_rate": 4.9990830096133826e-05, + "loss": 0.8992, + "step": 9600 + }, + { + "epoch": 0.4402911312786414, + "grad_norm": 1.6494026184082031, + "learning_rate": 4.9990443858170366e-05, + "loss": 0.8881, + "step": 9800 + }, + { + "epoch": 0.44927666457004223, + "grad_norm": 2.5967679023742676, + "learning_rate": 4.999004965448184e-05, + "loss": 0.8889, + "step": 10000 + }, + { + "epoch": 0.44927666457004223, + "eval_loss": 3.1767914295196533, + "eval_runtime": 1067.4091, + "eval_samples_per_second": 9.279, + "eval_steps_per_second": 0.145, + "step": 10000 + }, + { + "epoch": 0.45826219786144307, + "grad_norm": 2.703774929046631, + "learning_rate": 4.998964748519391e-05, + "loss": 0.8845, + "step": 10200 + }, + { + "epoch": 0.4672477311528439, + "grad_norm": 5.934618949890137, + "learning_rate": 4.998923735043477e-05, + "loss": 0.899, + "step": 10400 + }, + { + "epoch": 0.47623326444424474, + "grad_norm": 7.952963352203369, + "learning_rate": 4.9988819250335136e-05, + "loss": 0.8968, + "step": 10600 + }, + { + "epoch": 0.48521879773564563, + "grad_norm": 3.2846908569335938, + "learning_rate": 4.99883931850283e-05, + "loss": 0.8687, + "step": 10800 + }, + { + "epoch": 0.4942043310270465, + "grad_norm": 1.9633086919784546, + "learning_rate": 4.998795915465005e-05, + "loss": 0.8537, + "step": 11000 + }, + { + "epoch": 0.4942043310270465, + "eval_loss": 3.1828198432922363, + "eval_runtime": 1068.8128, + "eval_samples_per_second": 9.266, + "eval_steps_per_second": 0.145, + "step": 11000 + }, + { + "epoch": 0.5031898643184473, + "grad_norm": 6.807458400726318, + "learning_rate": 4.9987517159338744e-05, + "loss": 0.8482, + "step": 11200 + }, + { + "epoch": 0.5121753976098481, + "grad_norm": 2.9921388626098633, + "learning_rate": 4.998706719923526e-05, + "loss": 0.8662, + "step": 11400 + }, + { + "epoch": 0.521160930901249, + "grad_norm": 0.7828212380409241, + "learning_rate": 4.998660927448304e-05, + "loss": 0.88, + "step": 11600 + }, + { + "epoch": 0.5301464641926499, + "grad_norm": 3.1086294651031494, + "learning_rate": 4.9986143385228026e-05, + "loss": 0.8536, + "step": 11800 + }, + { + "epoch": 0.5391319974840507, + "grad_norm": 3.759007453918457, + "learning_rate": 4.998566953161874e-05, + "loss": 0.8321, + "step": 12000 + }, + { + "epoch": 
0.5391319974840507, + "eval_loss": 3.1765565872192383, + "eval_runtime": 1069.9445, + "eval_samples_per_second": 9.257, + "eval_steps_per_second": 0.145, + "step": 12000 + }, + { + "epoch": 0.5481175307754516, + "grad_norm": 4.347619533538818, + "learning_rate": 4.9985187713806206e-05, + "loss": 0.8713, + "step": 12200 + }, + { + "epoch": 0.5571030640668524, + "grad_norm": 2.748655080795288, + "learning_rate": 4.9984697931944024e-05, + "loss": 0.8457, + "step": 12400 + }, + { + "epoch": 0.5660885973582532, + "grad_norm": 2.891540288925171, + "learning_rate": 4.998420018618829e-05, + "loss": 0.8212, + "step": 12600 + }, + { + "epoch": 0.5750741306496541, + "grad_norm": 4.089766025543213, + "learning_rate": 4.998369447669768e-05, + "loss": 0.8288, + "step": 12800 + }, + { + "epoch": 0.5840596639410549, + "grad_norm": 4.722995758056641, + "learning_rate": 4.9983180803633376e-05, + "loss": 0.8757, + "step": 13000 + }, + { + "epoch": 0.5840596639410549, + "eval_loss": 3.168459892272949, + "eval_runtime": 1070.7464, + "eval_samples_per_second": 9.25, + "eval_steps_per_second": 0.145, + "step": 13000 + }, + { + "epoch": 0.5930451972324557, + "grad_norm": 7.390491008758545, + "learning_rate": 4.998265916715912e-05, + "loss": 0.8477, + "step": 13200 + }, + { + "epoch": 0.6020307305238566, + "grad_norm": 2.4633262157440186, + "learning_rate": 4.9982129567441185e-05, + "loss": 0.8415, + "step": 13400 + }, + { + "epoch": 0.6110162638152574, + "grad_norm": 5.4892473220825195, + "learning_rate": 4.998159200464837e-05, + "loss": 0.8176, + "step": 13600 + }, + { + "epoch": 0.6200017971066583, + "grad_norm": 4.862381458282471, + "learning_rate": 4.998104647895203e-05, + "loss": 0.8336, + "step": 13800 + }, + { + "epoch": 0.6289873303980591, + "grad_norm": 8.079172134399414, + "learning_rate": 4.998049299052606e-05, + "loss": 0.8147, + "step": 14000 + }, + { + "epoch": 0.6289873303980591, + "eval_loss": 3.1354148387908936, + "eval_runtime": 1070.1274, + "eval_samples_per_second": 9.255, + "eval_steps_per_second": 0.145, + "step": 14000 + }, + { + "epoch": 0.6379728636894599, + "grad_norm": 2.196859359741211, + "learning_rate": 4.997993153954688e-05, + "loss": 0.8196, + "step": 14200 + }, + { + "epoch": 0.6469583969808608, + "grad_norm": 2.802729606628418, + "learning_rate": 4.997936212619344e-05, + "loss": 0.8218, + "step": 14400 + }, + { + "epoch": 0.6559439302722616, + "grad_norm": 5.947813510894775, + "learning_rate": 4.997878475064726e-05, + "loss": 0.8178, + "step": 14600 + }, + { + "epoch": 0.6649294635636624, + "grad_norm": 4.929244041442871, + "learning_rate": 4.9978199413092364e-05, + "loss": 0.849, + "step": 14800 + }, + { + "epoch": 0.6739149968550634, + "grad_norm": 3.7185091972351074, + "learning_rate": 4.9977606113715336e-05, + "loss": 0.8132, + "step": 15000 + }, + { + "epoch": 0.6739149968550634, + "eval_loss": 3.086395263671875, + "eval_runtime": 1123.3847, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 0.138, + "step": 15000 + }, + { + "epoch": 0.6829005301464642, + "grad_norm": 3.6919984817504883, + "learning_rate": 4.9977004852705293e-05, + "loss": 0.8171, + "step": 15200 + }, + { + "epoch": 0.6918860634378651, + "grad_norm": 3.0211970806121826, + "learning_rate": 4.997639563025388e-05, + "loss": 0.8394, + "step": 15400 + }, + { + "epoch": 0.7008715967292659, + "grad_norm": 3.166466236114502, + "learning_rate": 4.99757784465553e-05, + "loss": 0.7978, + "step": 15600 + }, + { + "epoch": 0.7098571300206667, + "grad_norm": 3.316209554672241, + "learning_rate": 
4.997515330180627e-05, + "loss": 0.8196, + "step": 15800 + }, + { + "epoch": 0.7188426633120676, + "grad_norm": 3.4489612579345703, + "learning_rate": 4.997452019620606e-05, + "loss": 0.8218, + "step": 16000 + }, + { + "epoch": 0.7188426633120676, + "eval_loss": 3.1093759536743164, + "eval_runtime": 1119.6409, + "eval_samples_per_second": 8.846, + "eval_steps_per_second": 0.138, + "step": 16000 + }, + { + "epoch": 0.7278281966034684, + "grad_norm": 7.543302059173584, + "learning_rate": 4.997387912995647e-05, + "loss": 0.7442, + "step": 16200 + }, + { + "epoch": 0.7368137298948693, + "grad_norm": 5.488494873046875, + "learning_rate": 4.9973230103261834e-05, + "loss": 0.8101, + "step": 16400 + }, + { + "epoch": 0.7457992631862701, + "grad_norm": 6.828782081604004, + "learning_rate": 4.997257311632905e-05, + "loss": 0.796, + "step": 16600 + }, + { + "epoch": 0.7547847964776709, + "grad_norm": 3.4980998039245605, + "learning_rate": 4.997190816936751e-05, + "loss": 0.8147, + "step": 16800 + }, + { + "epoch": 0.7637703297690718, + "grad_norm": 4.646483421325684, + "learning_rate": 4.9971235262589175e-05, + "loss": 0.8082, + "step": 17000 + }, + { + "epoch": 0.7637703297690718, + "eval_loss": 3.0615007877349854, + "eval_runtime": 1118.9871, + "eval_samples_per_second": 8.851, + "eval_steps_per_second": 0.139, + "step": 17000 + }, + { + "epoch": 0.7727558630604726, + "grad_norm": 4.960477828979492, + "learning_rate": 4.997055439620854e-05, + "loss": 0.7868, + "step": 17200 + }, + { + "epoch": 0.7817413963518735, + "grad_norm": 5.231990337371826, + "learning_rate": 4.9969865570442634e-05, + "loss": 0.7698, + "step": 17400 + }, + { + "epoch": 0.7907269296432743, + "grad_norm": 6.0175065994262695, + "learning_rate": 4.9969168785511e-05, + "loss": 0.7753, + "step": 17600 + }, + { + "epoch": 0.7997124629346751, + "grad_norm": 1.7933512926101685, + "learning_rate": 4.9968464041635765e-05, + "loss": 0.8048, + "step": 17800 + }, + { + "epoch": 0.808697996226076, + "grad_norm": 2.3188130855560303, + "learning_rate": 4.996775133904156e-05, + "loss": 0.8065, + "step": 18000 + }, + { + "epoch": 0.808697996226076, + "eval_loss": 2.9708292484283447, + "eval_runtime": 1121.2171, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 0.138, + "step": 18000 + }, + { + "epoch": 0.8176835295174769, + "grad_norm": 6.4882049560546875, + "learning_rate": 4.996703067795554e-05, + "loss": 0.7768, + "step": 18200 + }, + { + "epoch": 0.8266690628088778, + "grad_norm": 6.340662956237793, + "learning_rate": 4.996630205860744e-05, + "loss": 0.7618, + "step": 18400 + }, + { + "epoch": 0.8356545961002786, + "grad_norm": 2.5629725456237793, + "learning_rate": 4.99655654812295e-05, + "loss": 0.7907, + "step": 18600 + }, + { + "epoch": 0.8446401293916794, + "grad_norm": 2.3929648399353027, + "learning_rate": 4.99648209460565e-05, + "loss": 0.7728, + "step": 18800 + }, + { + "epoch": 0.8536256626830803, + "grad_norm": 8.27813720703125, + "learning_rate": 4.9964068453325776e-05, + "loss": 0.7344, + "step": 19000 + }, + { + "epoch": 0.8536256626830803, + "eval_loss": 2.9753618240356445, + "eval_runtime": 1119.6944, + "eval_samples_per_second": 8.845, + "eval_steps_per_second": 0.138, + "step": 19000 + }, + { + "epoch": 0.8626111959744811, + "grad_norm": 3.184513568878174, + "learning_rate": 4.996330800327716e-05, + "loss": 0.7734, + "step": 19200 + }, + { + "epoch": 0.8715967292658819, + "grad_norm": 6.273008823394775, + "learning_rate": 4.9962539596153065e-05, + "loss": 0.7692, + "step": 19400 + }, + { + "epoch": 
0.8805822625572828, + "grad_norm": 5.725162506103516, + "learning_rate": 4.996176323219842e-05, + "loss": 0.7814, + "step": 19600 + }, + { + "epoch": 0.8895677958486836, + "grad_norm": 5.493536949157715, + "learning_rate": 4.996097891166069e-05, + "loss": 0.7704, + "step": 19800 + }, + { + "epoch": 0.8985533291400845, + "grad_norm": 5.661196708679199, + "learning_rate": 4.9960186634789874e-05, + "loss": 0.8059, + "step": 20000 + }, + { + "epoch": 0.8985533291400845, + "eval_loss": 2.985053062438965, + "eval_runtime": 1118.2825, + "eval_samples_per_second": 8.856, + "eval_steps_per_second": 0.139, + "step": 20000 + }, + { + "epoch": 0.9075388624314853, + "grad_norm": 6.618274211883545, + "learning_rate": 4.995938640183851e-05, + "loss": 0.7728, + "step": 20200 + }, + { + "epoch": 0.9165243957228861, + "grad_norm": 17.2467041015625, + "learning_rate": 4.995857821306169e-05, + "loss": 0.7402, + "step": 20400 + }, + { + "epoch": 0.925509929014287, + "grad_norm": 4.441402912139893, + "learning_rate": 4.9957762068717e-05, + "loss": 0.7789, + "step": 20600 + }, + { + "epoch": 0.9344954623056878, + "grad_norm": 2.338825225830078, + "learning_rate": 4.99569379690646e-05, + "loss": 0.7656, + "step": 20800 + }, + { + "epoch": 0.9434809955970886, + "grad_norm": 3.987342357635498, + "learning_rate": 4.9956105914367175e-05, + "loss": 0.7412, + "step": 21000 + }, + { + "epoch": 0.9434809955970886, + "eval_loss": 2.933100700378418, + "eval_runtime": 1131.2007, + "eval_samples_per_second": 8.755, + "eval_steps_per_second": 0.137, + "step": 21000 + }, + { + "epoch": 0.9524665288884895, + "grad_norm": 9.93287467956543, + "learning_rate": 4.9955265904889936e-05, + "loss": 0.7687, + "step": 21200 + }, + { + "epoch": 0.9614520621798903, + "grad_norm": 3.2046945095062256, + "learning_rate": 4.995441794090064e-05, + "loss": 0.7305, + "step": 21400 + }, + { + "epoch": 0.9704375954712913, + "grad_norm": 2.932640790939331, + "learning_rate": 4.9953562022669575e-05, + "loss": 0.7675, + "step": 21600 + }, + { + "epoch": 0.9794231287626921, + "grad_norm": 1.4578217267990112, + "learning_rate": 4.995269815046957e-05, + "loss": 0.7412, + "step": 21800 + }, + { + "epoch": 0.988408662054093, + "grad_norm": 3.856112480163574, + "learning_rate": 4.9951826324575974e-05, + "loss": 0.7751, + "step": 22000 + }, + { + "epoch": 0.988408662054093, + "eval_loss": 3.065196990966797, + "eval_runtime": 1131.3352, + "eval_samples_per_second": 8.754, + "eval_steps_per_second": 0.137, + "step": 22000 + }, + { + "epoch": 0.9973941953454938, + "grad_norm": 5.718069076538086, + "learning_rate": 4.9950946545266695e-05, + "loss": 0.7576, + "step": 22200 + }, + { + "epoch": 1.0063797286368945, + "grad_norm": 7.1981401443481445, + "learning_rate": 4.9950058812822154e-05, + "loss": 0.7669, + "step": 22400 + }, + { + "epoch": 1.0153652619282953, + "grad_norm": 3.5773613452911377, + "learning_rate": 4.994916312752532e-05, + "loss": 0.7544, + "step": 22600 + }, + { + "epoch": 1.0243507952196962, + "grad_norm": 4.548768043518066, + "learning_rate": 4.9948259489661695e-05, + "loss": 0.7895, + "step": 22800 + }, + { + "epoch": 1.0333363285110972, + "grad_norm": 3.69889497756958, + "learning_rate": 4.994734789951932e-05, + "loss": 0.7491, + "step": 23000 + }, + { + "epoch": 1.0333363285110972, + "eval_loss": 3.0196194648742676, + "eval_runtime": 1131.3469, + "eval_samples_per_second": 8.754, + "eval_steps_per_second": 0.137, + "step": 23000 + }, + { + "epoch": 1.042321861802498, + "grad_norm": 3.7836413383483887, + "learning_rate": 
4.994642835738875e-05, + "loss": 0.7269, + "step": 23200 + }, + { + "epoch": 1.051307395093899, + "grad_norm": 6.627780914306641, + "learning_rate": 4.9945500863563105e-05, + "loss": 0.6858, + "step": 23400 + }, + { + "epoch": 1.0602929283852998, + "grad_norm": 4.019529819488525, + "learning_rate": 4.994456541833802e-05, + "loss": 0.742, + "step": 23600 + }, + { + "epoch": 1.0692784616767006, + "grad_norm": 5.022628307342529, + "learning_rate": 4.994362202201166e-05, + "loss": 0.7332, + "step": 23800 + }, + { + "epoch": 1.0782639949681014, + "grad_norm": 12.518102645874023, + "learning_rate": 4.994267067488474e-05, + "loss": 0.7081, + "step": 24000 + }, + { + "epoch": 1.0782639949681014, + "eval_loss": 3.018568992614746, + "eval_runtime": 1130.4061, + "eval_samples_per_second": 8.761, + "eval_steps_per_second": 0.137, + "step": 24000 + }, + { + "epoch": 1.0872495282595023, + "grad_norm": 2.7211592197418213, + "learning_rate": 4.9941711377260506e-05, + "loss": 0.7172, + "step": 24200 + }, + { + "epoch": 1.0962350615509031, + "grad_norm": 3.2140583992004395, + "learning_rate": 4.994074412944473e-05, + "loss": 0.7231, + "step": 24400 + }, + { + "epoch": 1.105220594842304, + "grad_norm": 0.7109707593917847, + "learning_rate": 4.993976893174572e-05, + "loss": 0.7293, + "step": 24600 + }, + { + "epoch": 1.1142061281337048, + "grad_norm": 9.078465461730957, + "learning_rate": 4.993878578447433e-05, + "loss": 0.7207, + "step": 24800 + }, + { + "epoch": 1.1231916614251056, + "grad_norm": 5.582509994506836, + "learning_rate": 4.993779468794394e-05, + "loss": 0.7292, + "step": 25000 + }, + { + "epoch": 1.1231916614251056, + "eval_loss": 2.892444133758545, + "eval_runtime": 1130.6944, + "eval_samples_per_second": 8.759, + "eval_steps_per_second": 0.137, + "step": 25000 + }, + { + "epoch": 1.1321771947165065, + "grad_norm": 3.1292569637298584, + "learning_rate": 4.9936795642470444e-05, + "loss": 0.7389, + "step": 25200 + }, + { + "epoch": 1.1411627280079073, + "grad_norm": 2.5674803256988525, + "learning_rate": 4.993578864837232e-05, + "loss": 0.7215, + "step": 25400 + }, + { + "epoch": 1.1501482612993081, + "grad_norm": 2.9022293090820312, + "learning_rate": 4.9934773705970514e-05, + "loss": 0.7025, + "step": 25600 + }, + { + "epoch": 1.159133794590709, + "grad_norm": 10.041083335876465, + "learning_rate": 4.9933750815588566e-05, + "loss": 0.7249, + "step": 25800 + }, + { + "epoch": 1.1681193278821098, + "grad_norm": 5.979797840118408, + "learning_rate": 4.9932719977552514e-05, + "loss": 0.7304, + "step": 26000 + }, + { + "epoch": 1.1681193278821098, + "eval_loss": 2.932370185852051, + "eval_runtime": 1084.371, + "eval_samples_per_second": 9.133, + "eval_steps_per_second": 0.143, + "step": 26000 + }, + { + "epoch": 1.1771048611735107, + "grad_norm": 2.0028152465820312, + "learning_rate": 4.993168119219093e-05, + "loss": 0.7482, + "step": 26200 + }, + { + "epoch": 1.1860903944649115, + "grad_norm": 2.630038022994995, + "learning_rate": 4.993063445983495e-05, + "loss": 0.7324, + "step": 26400 + }, + { + "epoch": 1.1950759277563123, + "grad_norm": 6.610321044921875, + "learning_rate": 4.992957978081819e-05, + "loss": 0.7263, + "step": 26600 + }, + { + "epoch": 1.2040614610477132, + "grad_norm": 3.0929627418518066, + "learning_rate": 4.992851715547685e-05, + "loss": 0.7191, + "step": 26800 + }, + { + "epoch": 1.213046994339114, + "grad_norm": 5.623810768127441, + "learning_rate": 4.992744658414964e-05, + "loss": 0.7092, + "step": 27000 + }, + { + "epoch": 1.213046994339114, + "eval_loss": 
2.992058038711548, + "eval_runtime": 1088.476, + "eval_samples_per_second": 9.099, + "eval_steps_per_second": 0.142, + "step": 27000 + }, + { + "epoch": 1.2220325276305148, + "grad_norm": 4.626497745513916, + "learning_rate": 4.9926368067177806e-05, + "loss": 0.7309, + "step": 27200 + }, + { + "epoch": 1.2310180609219157, + "grad_norm": 2.491546630859375, + "learning_rate": 4.9925281604905126e-05, + "loss": 0.7215, + "step": 27400 + }, + { + "epoch": 1.2400035942133165, + "grad_norm": 5.404864311218262, + "learning_rate": 4.992418719767791e-05, + "loss": 0.6825, + "step": 27600 + }, + { + "epoch": 1.2489891275047174, + "grad_norm": 3.231696605682373, + "learning_rate": 4.9923084845845e-05, + "loss": 0.7371, + "step": 27800 + }, + { + "epoch": 1.2579746607961182, + "grad_norm": 3.4389524459838867, + "learning_rate": 4.992197454975778e-05, + "loss": 0.7055, + "step": 28000 + }, + { + "epoch": 1.2579746607961182, + "eval_loss": 2.9535281658172607, + "eval_runtime": 1087.0884, + "eval_samples_per_second": 9.111, + "eval_steps_per_second": 0.143, + "step": 28000 + }, + { + "epoch": 1.266960194087519, + "grad_norm": 2.275574207305908, + "learning_rate": 4.992085630977014e-05, + "loss": 0.722, + "step": 28200 + }, + { + "epoch": 1.2759457273789199, + "grad_norm": 3.3943276405334473, + "learning_rate": 4.991973012623853e-05, + "loss": 0.7129, + "step": 28400 + }, + { + "epoch": 1.2849312606703207, + "grad_norm": 3.186497688293457, + "learning_rate": 4.9918595999521924e-05, + "loss": 0.7351, + "step": 28600 + }, + { + "epoch": 1.2939167939617215, + "grad_norm": 10.006003379821777, + "learning_rate": 4.991745392998182e-05, + "loss": 0.7021, + "step": 28800 + }, + { + "epoch": 1.3029023272531224, + "grad_norm": 4.930509567260742, + "learning_rate": 4.991630391798227e-05, + "loss": 0.7292, + "step": 29000 + }, + { + "epoch": 1.3029023272531224, + "eval_loss": 2.9845774173736572, + "eval_runtime": 1084.0245, + "eval_samples_per_second": 9.136, + "eval_steps_per_second": 0.143, + "step": 29000 + }, + { + "epoch": 1.3118878605445232, + "grad_norm": 1.6518604755401611, + "learning_rate": 4.991514596388981e-05, + "loss": 0.7086, + "step": 29200 + }, + { + "epoch": 1.320873393835924, + "grad_norm": 4.181282043457031, + "learning_rate": 4.991398006807357e-05, + "loss": 0.7083, + "step": 29400 + }, + { + "epoch": 1.329858927127325, + "grad_norm": 10.062579154968262, + "learning_rate": 4.991280623090516e-05, + "loss": 0.753, + "step": 29600 + }, + { + "epoch": 1.3388444604187257, + "grad_norm": 6.119633197784424, + "learning_rate": 4.991162445275876e-05, + "loss": 0.6906, + "step": 29800 + }, + { + "epoch": 1.3478299937101266, + "grad_norm": 7.6824822425842285, + "learning_rate": 4.9910434734011046e-05, + "loss": 0.7234, + "step": 30000 + }, + { + "epoch": 1.3478299937101266, + "eval_loss": 2.945618152618408, + "eval_runtime": 1085.7029, + "eval_samples_per_second": 9.122, + "eval_steps_per_second": 0.143, + "step": 30000 + }, + { + "epoch": 1.3568155270015274, + "grad_norm": 4.914371490478516, + "learning_rate": 4.990923707504125e-05, + "loss": 0.6996, + "step": 30200 + }, + { + "epoch": 1.3658010602929282, + "grad_norm": 4.89448881149292, + "learning_rate": 4.9908031476231124e-05, + "loss": 0.7198, + "step": 30400 + }, + { + "epoch": 1.3747865935843293, + "grad_norm": 1.3539308309555054, + "learning_rate": 4.990681793796495e-05, + "loss": 0.698, + "step": 30600 + }, + { + "epoch": 1.3837721268757301, + "grad_norm": 3.3933920860290527, + "learning_rate": 4.9905596460629555e-05, + "loss": 0.7112, + "step": 
30800 + }, + { + "epoch": 1.392757660167131, + "grad_norm": 3.926790952682495, + "learning_rate": 4.9904367044614275e-05, + "loss": 0.7554, + "step": 31000 + }, + { + "epoch": 1.392757660167131, + "eval_loss": 2.94183611869812, + "eval_runtime": 1086.8024, + "eval_samples_per_second": 9.113, + "eval_steps_per_second": 0.143, + "step": 31000 + }, + { + "epoch": 1.4017431934585318, + "grad_norm": 2.5616230964660645, + "learning_rate": 4.9903129690311e-05, + "loss": 0.7149, + "step": 31200 + }, + { + "epoch": 1.4107287267499327, + "grad_norm": 2.269793748855591, + "learning_rate": 4.990188439811412e-05, + "loss": 0.7309, + "step": 31400 + }, + { + "epoch": 1.4197142600413335, + "grad_norm": 4.201299667358398, + "learning_rate": 4.990063116842059e-05, + "loss": 0.7157, + "step": 31600 + }, + { + "epoch": 1.4286997933327343, + "grad_norm": 3.891510009765625, + "learning_rate": 4.989937000162987e-05, + "loss": 0.7113, + "step": 31800 + }, + { + "epoch": 1.4376853266241352, + "grad_norm": 8.882272720336914, + "learning_rate": 4.9898100898143955e-05, + "loss": 0.6696, + "step": 32000 + }, + { + "epoch": 1.4376853266241352, + "eval_loss": 2.988067626953125, + "eval_runtime": 1086.6628, + "eval_samples_per_second": 9.114, + "eval_steps_per_second": 0.143, + "step": 32000 + }, + { + "epoch": 1.446670859915536, + "grad_norm": 5.083052158355713, + "learning_rate": 4.989682385836738e-05, + "loss": 0.7092, + "step": 32200 + }, + { + "epoch": 1.4556563932069368, + "grad_norm": 7.371493339538574, + "learning_rate": 4.989553888270719e-05, + "loss": 0.7188, + "step": 32400 + }, + { + "epoch": 1.4646419264983377, + "grad_norm": 2.6267755031585693, + "learning_rate": 4.989424597157299e-05, + "loss": 0.6744, + "step": 32600 + }, + { + "epoch": 1.4736274597897385, + "grad_norm": 5.069836616516113, + "learning_rate": 4.9892945125376896e-05, + "loss": 0.7124, + "step": 32800 + }, + { + "epoch": 1.4826129930811394, + "grad_norm": 18.678049087524414, + "learning_rate": 4.989163634453353e-05, + "loss": 0.6928, + "step": 33000 + }, + { + "epoch": 1.4826129930811394, + "eval_loss": 2.9007580280303955, + "eval_runtime": 1085.795, + "eval_samples_per_second": 9.121, + "eval_steps_per_second": 0.143, + "step": 33000 + }, + { + "epoch": 1.4915985263725402, + "grad_norm": 7.033535957336426, + "learning_rate": 4.989031962946009e-05, + "loss": 0.7045, + "step": 33200 + }, + { + "epoch": 1.500584059663941, + "grad_norm": 2.6740469932556152, + "learning_rate": 4.988899498057628e-05, + "loss": 0.7225, + "step": 33400 + }, + { + "epoch": 1.5095695929553419, + "grad_norm": 5.661626815795898, + "learning_rate": 4.988766239830431e-05, + "loss": 0.7058, + "step": 33600 + }, + { + "epoch": 1.5185551262467427, + "grad_norm": 10.127273559570312, + "learning_rate": 4.988632188306896e-05, + "loss": 0.7044, + "step": 33800 + }, + { + "epoch": 1.5275406595381436, + "grad_norm": 9.424492835998535, + "learning_rate": 4.988497343529753e-05, + "loss": 0.6702, + "step": 34000 + }, + { + "epoch": 1.5275406595381436, + "eval_loss": 2.8689780235290527, + "eval_runtime": 1086.8402, + "eval_samples_per_second": 9.113, + "eval_steps_per_second": 0.143, + "step": 34000 + }, + { + "epoch": 1.5365261928295444, + "grad_norm": 4.340188503265381, + "learning_rate": 4.988361705541982e-05, + "loss": 0.663, + "step": 34200 + }, + { + "epoch": 1.5455117261209452, + "grad_norm": 5.512271881103516, + "learning_rate": 4.988225274386819e-05, + "loss": 0.7331, + "step": 34400 + }, + { + "epoch": 1.5544972594123463, + "grad_norm": 5.91928243637085, + "learning_rate": 
4.9880880501077496e-05, + "loss": 0.7175, + "step": 34600 + }, + { + "epoch": 1.5634827927037471, + "grad_norm": 2.7053489685058594, + "learning_rate": 4.987950032748516e-05, + "loss": 0.6993, + "step": 34800 + }, + { + "epoch": 1.572468325995148, + "grad_norm": 6.583710670471191, + "learning_rate": 4.9878112223531106e-05, + "loss": 0.6826, + "step": 35000 + }, + { + "epoch": 1.572468325995148, + "eval_loss": 2.9143316745758057, + "eval_runtime": 1083.335, + "eval_samples_per_second": 9.142, + "eval_steps_per_second": 0.143, + "step": 35000 + }, + { + "epoch": 1.5814538592865488, + "grad_norm": 3.8892221450805664, + "learning_rate": 4.98767161896578e-05, + "loss": 0.7215, + "step": 35200 + }, + { + "epoch": 1.5904393925779496, + "grad_norm": 5.868275165557861, + "learning_rate": 4.987531222631022e-05, + "loss": 0.6736, + "step": 35400 + }, + { + "epoch": 1.5994249258693505, + "grad_norm": 4.020185947418213, + "learning_rate": 4.9873900333935886e-05, + "loss": 0.7027, + "step": 35600 + }, + { + "epoch": 1.6084104591607513, + "grad_norm": 6.451934814453125, + "learning_rate": 4.987248051298484e-05, + "loss": 0.7045, + "step": 35800 + }, + { + "epoch": 1.6173959924521522, + "grad_norm": 8.390814781188965, + "learning_rate": 4.987105276390965e-05, + "loss": 0.6964, + "step": 36000 + }, + { + "epoch": 1.6173959924521522, + "eval_loss": 2.856686592102051, + "eval_runtime": 1080.9016, + "eval_samples_per_second": 9.163, + "eval_steps_per_second": 0.143, + "step": 36000 + }, + { + "epoch": 1.626381525743553, + "grad_norm": 8.42429256439209, + "learning_rate": 4.9869617087165424e-05, + "loss": 0.6867, + "step": 36200 + }, + { + "epoch": 1.6353670590349538, + "grad_norm": 3.3174638748168945, + "learning_rate": 4.9868173483209756e-05, + "loss": 0.6841, + "step": 36400 + }, + { + "epoch": 1.6443525923263547, + "grad_norm": 5.016312122344971, + "learning_rate": 4.986672195250282e-05, + "loss": 0.6902, + "step": 36600 + }, + { + "epoch": 1.6533381256177555, + "grad_norm": 2.4442625045776367, + "learning_rate": 4.986526249550729e-05, + "loss": 0.7003, + "step": 36800 + }, + { + "epoch": 1.6623236589091563, + "grad_norm": 7.444258213043213, + "learning_rate": 4.9863795112688364e-05, + "loss": 0.6872, + "step": 37000 + }, + { + "epoch": 1.6623236589091563, + "eval_loss": 2.9427731037139893, + "eval_runtime": 1046.5686, + "eval_samples_per_second": 9.463, + "eval_steps_per_second": 0.148, + "step": 37000 + }, + { + "epoch": 1.6713091922005572, + "grad_norm": 5.738009452819824, + "learning_rate": 4.986231980451376e-05, + "loss": 0.7106, + "step": 37200 + }, + { + "epoch": 1.680294725491958, + "grad_norm": 4.871852397918701, + "learning_rate": 4.986083657145376e-05, + "loss": 0.6893, + "step": 37400 + }, + { + "epoch": 1.6892802587833589, + "grad_norm": 4.325986862182617, + "learning_rate": 4.985934541398113e-05, + "loss": 0.6657, + "step": 37600 + }, + { + "epoch": 1.6982657920747597, + "grad_norm": 3.812180757522583, + "learning_rate": 4.985784633257118e-05, + "loss": 0.6489, + "step": 37800 + }, + { + "epoch": 1.7072513253661605, + "grad_norm": 3.503493309020996, + "learning_rate": 4.985633932770174e-05, + "loss": 0.7538, + "step": 38000 + }, + { + "epoch": 1.7072513253661605, + "eval_loss": 2.824307441711426, + "eval_runtime": 1047.4182, + "eval_samples_per_second": 9.456, + "eval_steps_per_second": 0.148, + "step": 38000 + }, + { + "epoch": 1.7162368586575614, + "grad_norm": 3.583653450012207, + "learning_rate": 4.985482439985317e-05, + "loss": 0.6612, + "step": 38200 + }, + { + "epoch": 
1.7252223919489622, + "grad_norm": 3.160301446914673, + "learning_rate": 4.9853301549508364e-05, + "loss": 0.6933, + "step": 38400 + }, + { + "epoch": 1.734207925240363, + "grad_norm": 4.189894199371338, + "learning_rate": 4.9851770777152716e-05, + "loss": 0.6824, + "step": 38600 + }, + { + "epoch": 1.7431934585317639, + "grad_norm": 0.5203965902328491, + "learning_rate": 4.985023208327419e-05, + "loss": 0.674, + "step": 38800 + }, + { + "epoch": 1.7521789918231647, + "grad_norm": 4.871167182922363, + "learning_rate": 4.98486854683632e-05, + "loss": 0.6908, + "step": 39000 + }, + { + "epoch": 1.7521789918231647, + "eval_loss": 2.880004405975342, + "eval_runtime": 1044.7953, + "eval_samples_per_second": 9.479, + "eval_steps_per_second": 0.148, + "step": 39000 + }, + { + "epoch": 1.7611645251145656, + "grad_norm": 3.4473588466644287, + "learning_rate": 4.9847130932912765e-05, + "loss": 0.652, + "step": 39200 + }, + { + "epoch": 1.7701500584059664, + "grad_norm": 12.704270362854004, + "learning_rate": 4.984556847741839e-05, + "loss": 0.674, + "step": 39400 + }, + { + "epoch": 1.7791355916973672, + "grad_norm": 9.541321754455566, + "learning_rate": 4.984399810237811e-05, + "loss": 0.7046, + "step": 39600 + }, + { + "epoch": 1.788121124988768, + "grad_norm": 5.383360385894775, + "learning_rate": 4.9842419808292473e-05, + "loss": 0.6338, + "step": 39800 + }, + { + "epoch": 1.797106658280169, + "grad_norm": 7.993824005126953, + "learning_rate": 4.9840833595664566e-05, + "loss": 0.6627, + "step": 40000 + }, + { + "epoch": 1.797106658280169, + "eval_loss": 2.934129238128662, + "eval_runtime": 1044.8474, + "eval_samples_per_second": 9.479, + "eval_steps_per_second": 0.148, + "step": 40000 + }, + { + "epoch": 1.8060921915715697, + "grad_norm": 2.7325427532196045, + "learning_rate": 4.9839239464999996e-05, + "loss": 0.6752, + "step": 40200 + }, + { + "epoch": 1.8150777248629706, + "grad_norm": 6.341977119445801, + "learning_rate": 4.9837637416806895e-05, + "loss": 0.671, + "step": 40400 + }, + { + "epoch": 1.8240632581543714, + "grad_norm": 10.8590726852417, + "learning_rate": 4.9836027451595916e-05, + "loss": 0.6901, + "step": 40600 + }, + { + "epoch": 1.8330487914457723, + "grad_norm": 10.971672058105469, + "learning_rate": 4.983440956988023e-05, + "loss": 0.6905, + "step": 40800 + }, + { + "epoch": 1.842034324737173, + "grad_norm": 8.158576011657715, + "learning_rate": 4.983278377217556e-05, + "loss": 0.698, + "step": 41000 + }, + { + "epoch": 1.842034324737173, + "eval_loss": 2.8494818210601807, + "eval_runtime": 1044.9004, + "eval_samples_per_second": 9.478, + "eval_steps_per_second": 0.148, + "step": 41000 + }, + { + "epoch": 1.851019858028574, + "grad_norm": 7.720126628875732, + "learning_rate": 4.983115005900011e-05, + "loss": 0.6763, + "step": 41200 + }, + { + "epoch": 1.8600053913199748, + "grad_norm": 2.961477279663086, + "learning_rate": 4.982950843087463e-05, + "loss": 0.6895, + "step": 41400 + }, + { + "epoch": 1.8689909246113756, + "grad_norm": 2.009765148162842, + "learning_rate": 4.98278588883224e-05, + "loss": 0.7122, + "step": 41600 + }, + { + "epoch": 1.8779764579027765, + "grad_norm": 12.237375259399414, + "learning_rate": 4.9826201431869205e-05, + "loss": 0.6626, + "step": 41800 + }, + { + "epoch": 1.8869619911941773, + "grad_norm": 5.94899845123291, + "learning_rate": 4.9824536062043356e-05, + "loss": 0.6641, + "step": 42000 + }, + { + "epoch": 1.8869619911941773, + "eval_loss": 2.8374111652374268, + "eval_runtime": 1044.7426, + "eval_samples_per_second": 9.48, + 
"eval_steps_per_second": 0.148, + "step": 42000 + }, + { + "epoch": 1.8959475244855781, + "grad_norm": 5.839437961578369, + "learning_rate": 4.98228627793757e-05, + "loss": 0.6554, + "step": 42200 + }, + { + "epoch": 1.904933057776979, + "grad_norm": 1.118190050125122, + "learning_rate": 4.982118158439959e-05, + "loss": 0.7005, + "step": 42400 + }, + { + "epoch": 1.9139185910683798, + "grad_norm": 3.554232358932495, + "learning_rate": 4.981949247765092e-05, + "loss": 0.7039, + "step": 42600 + }, + { + "epoch": 1.9229041243597806, + "grad_norm": 4.364952087402344, + "learning_rate": 4.981779545966808e-05, + "loss": 0.6665, + "step": 42800 + }, + { + "epoch": 1.9318896576511815, + "grad_norm": 5.755943775177002, + "learning_rate": 4.981609053099201e-05, + "loss": 0.6746, + "step": 43000 + }, + { + "epoch": 1.9318896576511815, + "eval_loss": 2.8288111686706543, + "eval_runtime": 1043.7899, + "eval_samples_per_second": 9.488, + "eval_steps_per_second": 0.148, + "step": 43000 + }, + { + "epoch": 1.9408751909425823, + "grad_norm": 4.873472213745117, + "learning_rate": 4.9814377692166145e-05, + "loss": 0.691, + "step": 43200 + }, + { + "epoch": 1.9498607242339832, + "grad_norm": 3.6146950721740723, + "learning_rate": 4.981265694373647e-05, + "loss": 0.6707, + "step": 43400 + }, + { + "epoch": 1.958846257525384, + "grad_norm": 6.156956195831299, + "learning_rate": 4.981092828625145e-05, + "loss": 0.6618, + "step": 43600 + }, + { + "epoch": 1.9678317908167848, + "grad_norm": 4.361949920654297, + "learning_rate": 4.980919172026211e-05, + "loss": 0.6791, + "step": 43800 + }, + { + "epoch": 1.9768173241081857, + "grad_norm": 3.5817549228668213, + "learning_rate": 4.9807447246321994e-05, + "loss": 0.7073, + "step": 44000 + }, + { + "epoch": 1.9768173241081857, + "eval_loss": 2.869600296020508, + "eval_runtime": 1043.5043, + "eval_samples_per_second": 9.491, + "eval_steps_per_second": 0.149, + "step": 44000 + }, + { + "epoch": 1.9858028573995865, + "grad_norm": 4.531149387359619, + "learning_rate": 4.980569486498714e-05, + "loss": 0.7056, + "step": 44200 + }, + { + "epoch": 1.9947883906909873, + "grad_norm": 4.764667987823486, + "learning_rate": 4.980393457681612e-05, + "loss": 0.678, + "step": 44400 + }, + { + "epoch": 2.003773923982388, + "grad_norm": 4.271178722381592, + "learning_rate": 4.980216638237003e-05, + "loss": 0.6399, + "step": 44600 + }, + { + "epoch": 2.012759457273789, + "grad_norm": 10.754460334777832, + "learning_rate": 4.9800390282212484e-05, + "loss": 0.6687, + "step": 44800 + }, + { + "epoch": 2.02174499056519, + "grad_norm": 2.3163371086120605, + "learning_rate": 4.9798606276909623e-05, + "loss": 0.6427, + "step": 45000 + }, + { + "epoch": 2.02174499056519, + "eval_loss": 2.8302671909332275, + "eval_runtime": 1044.3702, + "eval_samples_per_second": 9.483, + "eval_steps_per_second": 0.148, + "step": 45000 + }, + { + "epoch": 2.0307305238565907, + "grad_norm": 6.137772083282471, + "learning_rate": 4.9796814367030085e-05, + "loss": 0.6573, + "step": 45200 + }, + { + "epoch": 2.0397160571479915, + "grad_norm": 9.637032508850098, + "learning_rate": 4.979501455314506e-05, + "loss": 0.6663, + "step": 45400 + }, + { + "epoch": 2.0487015904393924, + "grad_norm": 9.139311790466309, + "learning_rate": 4.979320683582822e-05, + "loss": 0.651, + "step": 45600 + }, + { + "epoch": 2.057687123730793, + "grad_norm": 5.3387017250061035, + "learning_rate": 4.979139121565579e-05, + "loss": 0.6698, + "step": 45800 + }, + { + "epoch": 2.0666726570221945, + "grad_norm": 3.5355489253997803, + 
"learning_rate": 4.9789567693206504e-05, + "loss": 0.6951, + "step": 46000 + }, + { + "epoch": 2.0666726570221945, + "eval_loss": 2.905496835708618, + "eval_runtime": 1044.3998, + "eval_samples_per_second": 9.483, + "eval_steps_per_second": 0.148, + "step": 46000 + }, + { + "epoch": 2.075658190313595, + "grad_norm": 5.952988147735596, + "learning_rate": 4.9787736269061604e-05, + "loss": 0.6716, + "step": 46200 + }, + { + "epoch": 2.084643723604996, + "grad_norm": 3.8913867473602295, + "learning_rate": 4.978589694380485e-05, + "loss": 0.6543, + "step": 46400 + }, + { + "epoch": 2.093629256896397, + "grad_norm": 9.004631996154785, + "learning_rate": 4.978404971802255e-05, + "loss": 0.6471, + "step": 46600 + }, + { + "epoch": 2.102614790187798, + "grad_norm": 5.533471584320068, + "learning_rate": 4.9782194592303485e-05, + "loss": 0.6461, + "step": 46800 + }, + { + "epoch": 2.1116003234791987, + "grad_norm": 3.112337589263916, + "learning_rate": 4.9780331567239005e-05, + "loss": 0.6432, + "step": 47000 + }, + { + "epoch": 2.1116003234791987, + "eval_loss": 2.845529556274414, + "eval_runtime": 1043.8826, + "eval_samples_per_second": 9.488, + "eval_steps_per_second": 0.148, + "step": 47000 + }, + { + "epoch": 2.1205858567705995, + "grad_norm": 8.843466758728027, + "learning_rate": 4.977846064342292e-05, + "loss": 0.6744, + "step": 47200 + }, + { + "epoch": 2.1295713900620004, + "grad_norm": 5.125086307525635, + "learning_rate": 4.977658182145161e-05, + "loss": 0.6604, + "step": 47400 + }, + { + "epoch": 2.138556923353401, + "grad_norm": 2.8930840492248535, + "learning_rate": 4.9774695101923945e-05, + "loss": 0.6688, + "step": 47600 + }, + { + "epoch": 2.147542456644802, + "grad_norm": 2.3682479858398438, + "learning_rate": 4.9772800485441317e-05, + "loss": 0.6755, + "step": 47800 + }, + { + "epoch": 2.156527989936203, + "grad_norm": 3.7809925079345703, + "learning_rate": 4.977089797260764e-05, + "loss": 0.6596, + "step": 48000 + }, + { + "epoch": 2.156527989936203, + "eval_loss": 2.806736946105957, + "eval_runtime": 1045.1893, + "eval_samples_per_second": 9.476, + "eval_steps_per_second": 0.148, + "step": 48000 + }, + { + "epoch": 2.1655135232276037, + "grad_norm": 9.784541130065918, + "learning_rate": 4.976898756402934e-05, + "loss": 0.6993, + "step": 48200 + }, + { + "epoch": 2.1744990565190045, + "grad_norm": 3.151435136795044, + "learning_rate": 4.976706926031536e-05, + "loss": 0.657, + "step": 48400 + }, + { + "epoch": 2.1834845898104054, + "grad_norm": 4.002162456512451, + "learning_rate": 4.976514306207716e-05, + "loss": 0.6691, + "step": 48600 + }, + { + "epoch": 2.1924701231018062, + "grad_norm": 3.7456023693084717, + "learning_rate": 4.976320896992872e-05, + "loss": 0.6524, + "step": 48800 + }, + { + "epoch": 2.201455656393207, + "grad_norm": 7.874242782592773, + "learning_rate": 4.9761266984486534e-05, + "loss": 0.6828, + "step": 49000 + }, + { + "epoch": 2.201455656393207, + "eval_loss": 2.799010992050171, + "eval_runtime": 1112.993, + "eval_samples_per_second": 8.899, + "eval_steps_per_second": 0.139, + "step": 49000 + }, + { + "epoch": 2.210441189684608, + "grad_norm": 2.5422885417938232, + "learning_rate": 4.975931710636961e-05, + "loss": 0.6353, + "step": 49200 + }, + { + "epoch": 2.2194267229760087, + "grad_norm": 7.764764308929443, + "learning_rate": 4.9757359336199466e-05, + "loss": 0.6586, + "step": 49400 + }, + { + "epoch": 2.2284122562674096, + "grad_norm": 3.0725579261779785, + "learning_rate": 4.975539367460016e-05, + "loss": 0.6556, + "step": 49600 + }, + { + "epoch": 
2.2373977895588104, + "grad_norm": 3.268784523010254, + "learning_rate": 4.9753420122198237e-05, + "loss": 0.6571, + "step": 49800 + }, + { + "epoch": 2.2463833228502113, + "grad_norm": 7.206459045410156, + "learning_rate": 4.9751438679622764e-05, + "loss": 0.6115, + "step": 50000 + }, + { + "epoch": 2.2463833228502113, + "eval_loss": 2.912787675857544, + "eval_runtime": 1110.2376, + "eval_samples_per_second": 8.921, + "eval_steps_per_second": 0.14, + "step": 50000 + }, + { + "epoch": 2.255368856141612, + "grad_norm": 1.150863766670227, + "learning_rate": 4.974944934750534e-05, + "loss": 0.6575, + "step": 50200 + }, + { + "epoch": 2.264354389433013, + "grad_norm": 4.235318183898926, + "learning_rate": 4.974745212648006e-05, + "loss": 0.649, + "step": 50400 + }, + { + "epoch": 2.2733399227244138, + "grad_norm": 3.499100923538208, + "learning_rate": 4.974544701718353e-05, + "loss": 0.6316, + "step": 50600 + }, + { + "epoch": 2.2823254560158146, + "grad_norm": 5.036466121673584, + "learning_rate": 4.97434340202549e-05, + "loss": 0.649, + "step": 50800 + }, + { + "epoch": 2.2913109893072154, + "grad_norm": 5.665818214416504, + "learning_rate": 4.9741413136335794e-05, + "loss": 0.6628, + "step": 51000 + }, + { + "epoch": 2.2913109893072154, + "eval_loss": 2.809664726257324, + "eval_runtime": 1108.6765, + "eval_samples_per_second": 8.933, + "eval_steps_per_second": 0.14, + "step": 51000 + }, + { + "epoch": 2.3002965225986163, + "grad_norm": 6.9531779289245605, + "learning_rate": 4.973938436607039e-05, + "loss": 0.6451, + "step": 51200 + }, + { + "epoch": 2.309282055890017, + "grad_norm": 8.631576538085938, + "learning_rate": 4.9737347710105346e-05, + "loss": 0.648, + "step": 51400 + }, + { + "epoch": 2.318267589181418, + "grad_norm": 7.7942376136779785, + "learning_rate": 4.973530316908986e-05, + "loss": 0.6289, + "step": 51600 + }, + { + "epoch": 2.327253122472819, + "grad_norm": 4.3523688316345215, + "learning_rate": 4.973325074367562e-05, + "loss": 0.6838, + "step": 51800 + }, + { + "epoch": 2.3362386557642196, + "grad_norm": 4.113776206970215, + "learning_rate": 4.973119043451684e-05, + "loss": 0.6776, + "step": 52000 + }, + { + "epoch": 2.3362386557642196, + "eval_loss": 2.8563921451568604, + "eval_runtime": 1110.1423, + "eval_samples_per_second": 8.921, + "eval_steps_per_second": 0.14, + "step": 52000 + }, + { + "epoch": 2.3452241890556205, + "grad_norm": 2.6197564601898193, + "learning_rate": 4.972912224227025e-05, + "loss": 0.6495, + "step": 52200 + }, + { + "epoch": 2.3542097223470213, + "grad_norm": 4.007927417755127, + "learning_rate": 4.972704616759509e-05, + "loss": 0.6299, + "step": 52400 + }, + { + "epoch": 2.363195255638422, + "grad_norm": 6.33441686630249, + "learning_rate": 4.97249622111531e-05, + "loss": 0.6444, + "step": 52600 + }, + { + "epoch": 2.372180788929823, + "grad_norm": 6.773642539978027, + "learning_rate": 4.9722870373608556e-05, + "loss": 0.658, + "step": 52800 + }, + { + "epoch": 2.381166322221224, + "grad_norm": 2.790375232696533, + "learning_rate": 4.972077065562821e-05, + "loss": 0.6435, + "step": 53000 + }, + { + "epoch": 2.381166322221224, + "eval_loss": 2.807753562927246, + "eval_runtime": 1109.9528, + "eval_samples_per_second": 8.923, + "eval_steps_per_second": 0.14, + "step": 53000 + }, + { + "epoch": 2.3901518555126247, + "grad_norm": 4.388117790222168, + "learning_rate": 4.971866305788138e-05, + "loss": 0.6147, + "step": 53200 + }, + { + "epoch": 2.3991373888040255, + "grad_norm": 4.960672378540039, + "learning_rate": 4.9716547581039854e-05, + "loss": 
0.6465, + "step": 53400 + }, + { + "epoch": 2.4081229220954263, + "grad_norm": 3.5351078510284424, + "learning_rate": 4.9714424225777925e-05, + "loss": 0.6336, + "step": 53600 + }, + { + "epoch": 2.417108455386827, + "grad_norm": 6.359066009521484, + "learning_rate": 4.971229299277243e-05, + "loss": 0.6607, + "step": 53800 + }, + { + "epoch": 2.426093988678228, + "grad_norm": 7.120554447174072, + "learning_rate": 4.9710153882702706e-05, + "loss": 0.6299, + "step": 54000 + }, + { + "epoch": 2.426093988678228, + "eval_loss": 2.8412070274353027, + "eval_runtime": 1110.5443, + "eval_samples_per_second": 8.918, + "eval_steps_per_second": 0.14, + "step": 54000 + }, + { + "epoch": 2.435079521969629, + "grad_norm": 2.599130630493164, + "learning_rate": 4.970800689625058e-05, + "loss": 0.6324, + "step": 54200 + }, + { + "epoch": 2.4440650552610297, + "grad_norm": 12.322335243225098, + "learning_rate": 4.970585203410041e-05, + "loss": 0.6611, + "step": 54400 + }, + { + "epoch": 2.4530505885524305, + "grad_norm": 8.429553031921387, + "learning_rate": 4.970368929693907e-05, + "loss": 0.6683, + "step": 54600 + }, + { + "epoch": 2.4620361218438314, + "grad_norm": 5.938534259796143, + "learning_rate": 4.970151868545593e-05, + "loss": 0.615, + "step": 54800 + }, + { + "epoch": 2.471021655135232, + "grad_norm": 5.379678249359131, + "learning_rate": 4.969934020034288e-05, + "loss": 0.6439, + "step": 55000 + }, + { + "epoch": 2.471021655135232, + "eval_loss": 2.902723789215088, + "eval_runtime": 1111.1081, + "eval_samples_per_second": 8.914, + "eval_steps_per_second": 0.14, + "step": 55000 + }, + { + "epoch": 2.480007188426633, + "grad_norm": 2.5961101055145264, + "learning_rate": 4.96971538422943e-05, + "loss": 0.6392, + "step": 55200 + }, + { + "epoch": 2.488992721718034, + "grad_norm": 2.440741777420044, + "learning_rate": 4.9694959612007094e-05, + "loss": 0.6433, + "step": 55400 + }, + { + "epoch": 2.4979782550094347, + "grad_norm": 2.6657445430755615, + "learning_rate": 4.9692757510180686e-05, + "loss": 0.6544, + "step": 55600 + }, + { + "epoch": 2.5069637883008355, + "grad_norm": 3.9788851737976074, + "learning_rate": 4.969054753751699e-05, + "loss": 0.6231, + "step": 55800 + }, + { + "epoch": 2.5159493215922364, + "grad_norm": 2.831127643585205, + "learning_rate": 4.968832969472044e-05, + "loss": 0.6441, + "step": 56000 + }, + { + "epoch": 2.5159493215922364, + "eval_loss": 2.836225986480713, + "eval_runtime": 1110.9174, + "eval_samples_per_second": 8.915, + "eval_steps_per_second": 0.14, + "step": 56000 + }, + { + "epoch": 2.5249348548836372, + "grad_norm": 2.4856066703796387, + "learning_rate": 4.968610398249798e-05, + "loss": 0.6819, + "step": 56200 + }, + { + "epoch": 2.533920388175038, + "grad_norm": 6.462665557861328, + "learning_rate": 4.9683870401559054e-05, + "loss": 0.5954, + "step": 56400 + }, + { + "epoch": 2.542905921466439, + "grad_norm": 8.044194221496582, + "learning_rate": 4.96816289526156e-05, + "loss": 0.6849, + "step": 56600 + }, + { + "epoch": 2.5518914547578397, + "grad_norm": 1.6285322904586792, + "learning_rate": 4.9679379636382115e-05, + "loss": 0.6492, + "step": 56800 + }, + { + "epoch": 2.5608769880492406, + "grad_norm": 1.74399733543396, + "learning_rate": 4.9677122453575544e-05, + "loss": 0.6574, + "step": 57000 + }, + { + "epoch": 2.5608769880492406, + "eval_loss": 2.7768375873565674, + "eval_runtime": 1110.2066, + "eval_samples_per_second": 8.921, + "eval_steps_per_second": 0.14, + "step": 57000 + }, + { + "epoch": 2.5698625213406414, + "grad_norm": 4.567875385284424, + 
"learning_rate": 4.967485740491538e-05, + "loss": 0.6247, + "step": 57200 + }, + { + "epoch": 2.5788480546320423, + "grad_norm": 2.1420087814331055, + "learning_rate": 4.967258449112361e-05, + "loss": 0.6101, + "step": 57400 + }, + { + "epoch": 2.587833587923443, + "grad_norm": 4.842061519622803, + "learning_rate": 4.967030371292471e-05, + "loss": 0.6361, + "step": 57600 + }, + { + "epoch": 2.5968191212148444, + "grad_norm": 7.400786876678467, + "learning_rate": 4.9668015071045695e-05, + "loss": 0.6456, + "step": 57800 + }, + { + "epoch": 2.6058046545062448, + "grad_norm": 8.932103157043457, + "learning_rate": 4.966571856621607e-05, + "loss": 0.6232, + "step": 58000 + }, + { + "epoch": 2.6058046545062448, + "eval_loss": 2.8550527095794678, + "eval_runtime": 1110.8669, + "eval_samples_per_second": 8.916, + "eval_steps_per_second": 0.14, + "step": 58000 + }, + { + "epoch": 2.614790187797646, + "grad_norm": 2.9970428943634033, + "learning_rate": 4.9663414199167845e-05, + "loss": 0.6917, + "step": 58200 + }, + { + "epoch": 2.6237757210890464, + "grad_norm": 4.401594638824463, + "learning_rate": 4.966110197063554e-05, + "loss": 0.6321, + "step": 58400 + }, + { + "epoch": 2.6327612543804477, + "grad_norm": 8.229362487792969, + "learning_rate": 4.965878188135618e-05, + "loss": 0.6288, + "step": 58600 + }, + { + "epoch": 2.641746787671848, + "grad_norm": 1.6570228338241577, + "learning_rate": 4.965645393206929e-05, + "loss": 0.5909, + "step": 58800 + }, + { + "epoch": 2.6507323209632494, + "grad_norm": 8.355649948120117, + "learning_rate": 4.9654118123516925e-05, + "loss": 0.6708, + "step": 59000 + }, + { + "epoch": 2.6507323209632494, + "eval_loss": 2.7752935886383057, + "eval_runtime": 1109.8773, + "eval_samples_per_second": 8.924, + "eval_steps_per_second": 0.14, + "step": 59000 + }, + { + "epoch": 2.65971785425465, + "grad_norm": 3.5462231636047363, + "learning_rate": 4.96517744564436e-05, + "loss": 0.6037, + "step": 59200 + }, + { + "epoch": 2.668703387546051, + "grad_norm": 4.182783603668213, + "learning_rate": 4.964942293159637e-05, + "loss": 0.6271, + "step": 59400 + }, + { + "epoch": 2.6776889208374515, + "grad_norm": 17.542783737182617, + "learning_rate": 4.9647063549724796e-05, + "loss": 0.6915, + "step": 59600 + }, + { + "epoch": 2.6866744541288528, + "grad_norm": 2.8875606060028076, + "learning_rate": 4.9644696311580926e-05, + "loss": 0.6154, + "step": 59800 + }, + { + "epoch": 2.695659987420253, + "grad_norm": 3.598609209060669, + "learning_rate": 4.964232121791932e-05, + "loss": 0.6308, + "step": 60000 + }, + { + "epoch": 2.695659987420253, + "eval_loss": 2.770158529281616, + "eval_runtime": 1103.6022, + "eval_samples_per_second": 8.974, + "eval_steps_per_second": 0.14, + "step": 60000 + }, + { + "epoch": 2.7046455207116544, + "grad_norm": 4.902860164642334, + "learning_rate": 4.963993826949703e-05, + "loss": 0.6449, + "step": 60200 + }, + { + "epoch": 2.713631054003055, + "grad_norm": 1.6854755878448486, + "learning_rate": 4.9637547467073634e-05, + "loss": 0.6189, + "step": 60400 + }, + { + "epoch": 2.722616587294456, + "grad_norm": 3.137181520462036, + "learning_rate": 4.96351488114112e-05, + "loss": 0.6118, + "step": 60600 + }, + { + "epoch": 2.7316021205858565, + "grad_norm": 12.390292167663574, + "learning_rate": 4.963274230327432e-05, + "loss": 0.6407, + "step": 60800 + }, + { + "epoch": 2.740587653877258, + "grad_norm": 5.263106822967529, + "learning_rate": 4.963032794343003e-05, + "loss": 0.6426, + "step": 61000 + }, + { + "epoch": 2.740587653877258, + "eval_loss": 
2.787389039993286, + "eval_runtime": 1105.6052, + "eval_samples_per_second": 8.958, + "eval_steps_per_second": 0.14, + "step": 61000 + }, + { + "epoch": 2.7495731871686586, + "grad_norm": 5.193811416625977, + "learning_rate": 4.962790573264794e-05, + "loss": 0.6199, + "step": 61200 + }, + { + "epoch": 2.7585587204600595, + "grad_norm": 2.3068435192108154, + "learning_rate": 4.962547567170013e-05, + "loss": 0.6299, + "step": 61400 + }, + { + "epoch": 2.7675442537514603, + "grad_norm": 7.189493656158447, + "learning_rate": 4.9623037761361166e-05, + "loss": 0.6591, + "step": 61600 + }, + { + "epoch": 2.776529787042861, + "grad_norm": 3.9445478916168213, + "learning_rate": 4.962059200240815e-05, + "loss": 0.6282, + "step": 61800 + }, + { + "epoch": 2.785515320334262, + "grad_norm": 8.275954246520996, + "learning_rate": 4.9618138395620666e-05, + "loss": 0.6209, + "step": 62000 + }, + { + "epoch": 2.785515320334262, + "eval_loss": 2.711536407470703, + "eval_runtime": 1103.3019, + "eval_samples_per_second": 8.977, + "eval_steps_per_second": 0.14, + "step": 62000 + }, + { + "epoch": 2.794500853625663, + "grad_norm": 6.457345008850098, + "learning_rate": 4.96156769417808e-05, + "loss": 0.6178, + "step": 62200 + }, + { + "epoch": 2.8034863869170636, + "grad_norm": 6.9077253341674805, + "learning_rate": 4.961320764167316e-05, + "loss": 0.62, + "step": 62400 + }, + { + "epoch": 2.8124719202084645, + "grad_norm": 1.4460822343826294, + "learning_rate": 4.96107304960848e-05, + "loss": 0.6681, + "step": 62600 + }, + { + "epoch": 2.8214574534998653, + "grad_norm": 5.170135021209717, + "learning_rate": 4.9608245505805345e-05, + "loss": 0.6137, + "step": 62800 + }, + { + "epoch": 2.830442986791266, + "grad_norm": 7.249731540679932, + "learning_rate": 4.960575267162688e-05, + "loss": 0.6175, + "step": 63000 + }, + { + "epoch": 2.830442986791266, + "eval_loss": 2.7555394172668457, + "eval_runtime": 1103.5103, + "eval_samples_per_second": 8.975, + "eval_steps_per_second": 0.14, + "step": 63000 + }, + { + "epoch": 2.839428520082667, + "grad_norm": 8.970303535461426, + "learning_rate": 4.960325199434399e-05, + "loss": 0.5958, + "step": 63200 + }, + { + "epoch": 2.848414053374068, + "grad_norm": 9.521201133728027, + "learning_rate": 4.960074347475377e-05, + "loss": 0.6608, + "step": 63400 + }, + { + "epoch": 2.8573995866654687, + "grad_norm": 1.2697712182998657, + "learning_rate": 4.9598227113655826e-05, + "loss": 0.6367, + "step": 63600 + }, + { + "epoch": 2.8663851199568695, + "grad_norm": 6.463663578033447, + "learning_rate": 4.959570291185224e-05, + "loss": 0.6198, + "step": 63800 + }, + { + "epoch": 2.8753706532482703, + "grad_norm": 2.3747761249542236, + "learning_rate": 4.95931708701476e-05, + "loss": 0.656, + "step": 64000 + }, + { + "epoch": 2.8753706532482703, + "eval_loss": 2.7699778079986572, + "eval_runtime": 1103.4164, + "eval_samples_per_second": 8.976, + "eval_steps_per_second": 0.14, + "step": 64000 + }, + { + "epoch": 2.884356186539671, + "grad_norm": 2.689181089401245, + "learning_rate": 4.9590630989349e-05, + "loss": 0.6433, + "step": 64200 + }, + { + "epoch": 2.893341719831072, + "grad_norm": 2.685288429260254, + "learning_rate": 4.958808327026603e-05, + "loss": 0.6643, + "step": 64400 + }, + { + "epoch": 2.902327253122473, + "grad_norm": 3.243163824081421, + "learning_rate": 4.9585527713710777e-05, + "loss": 0.6203, + "step": 64600 + }, + { + "epoch": 2.9113127864138737, + "grad_norm": 4.437738418579102, + "learning_rate": 4.9582964320497824e-05, + "loss": 0.6351, + "step": 64800 + }, + { + 
"epoch": 2.9202983197052745, + "grad_norm": 5.811532497406006, + "learning_rate": 4.9580393091444266e-05, + "loss": 0.6257, + "step": 65000 + }, + { + "epoch": 2.9202983197052745, + "eval_loss": 2.783703327178955, + "eval_runtime": 1103.9347, + "eval_samples_per_second": 8.972, + "eval_steps_per_second": 0.14, + "step": 65000 + }, + { + "epoch": 2.9292838529966754, + "grad_norm": 3.7145042419433594, + "learning_rate": 4.957781402736967e-05, + "loss": 0.6402, + "step": 65200 + }, + { + "epoch": 2.938269386288076, + "grad_norm": 8.268646240234375, + "learning_rate": 4.957522712909612e-05, + "loss": 0.5925, + "step": 65400 + }, + { + "epoch": 2.947254919579477, + "grad_norm": 4.354446887969971, + "learning_rate": 4.9572632397448196e-05, + "loss": 0.6588, + "step": 65600 + }, + { + "epoch": 2.956240452870878, + "grad_norm": 4.316616058349609, + "learning_rate": 4.957002983325297e-05, + "loss": 0.6173, + "step": 65800 + }, + { + "epoch": 2.9652259861622787, + "grad_norm": 7.808084011077881, + "learning_rate": 4.956741943734e-05, + "loss": 0.6157, + "step": 66000 + }, + { + "epoch": 2.9652259861622787, + "eval_loss": 2.8421056270599365, + "eval_runtime": 1104.1736, + "eval_samples_per_second": 8.97, + "eval_steps_per_second": 0.14, + "step": 66000 + }, + { + "epoch": 2.9742115194536796, + "grad_norm": 26.778465270996094, + "learning_rate": 4.956480121054137e-05, + "loss": 0.6378, + "step": 66200 + }, + { + "epoch": 2.9831970527450804, + "grad_norm": 5.89031457901001, + "learning_rate": 4.956217515369163e-05, + "loss": 0.5759, + "step": 66400 + }, + { + "epoch": 2.9921825860364812, + "grad_norm": 3.110283613204956, + "learning_rate": 4.955954126762784e-05, + "loss": 0.6221, + "step": 66600 + }, + { + "epoch": 3.001168119327882, + "grad_norm": 6.0229668617248535, + "learning_rate": 4.955689955318956e-05, + "loss": 0.6276, + "step": 66800 + }, + { + "epoch": 3.010153652619283, + "grad_norm": 4.137844562530518, + "learning_rate": 4.955425001121883e-05, + "loss": 0.5943, + "step": 67000 + }, + { + "epoch": 3.010153652619283, + "eval_loss": 2.781846523284912, + "eval_runtime": 1104.5447, + "eval_samples_per_second": 8.967, + "eval_steps_per_second": 0.14, + "step": 67000 + }, + { + "epoch": 3.0191391859106838, + "grad_norm": 4.880155563354492, + "learning_rate": 4.955159264256019e-05, + "loss": 0.6199, + "step": 67200 + }, + { + "epoch": 3.0281247192020846, + "grad_norm": 4.160552024841309, + "learning_rate": 4.9548927448060686e-05, + "loss": 0.6228, + "step": 67400 + }, + { + "epoch": 3.0371102524934854, + "grad_norm": 4.420809745788574, + "learning_rate": 4.954625442856986e-05, + "loss": 0.5729, + "step": 67600 + }, + { + "epoch": 3.0460957857848863, + "grad_norm": 2.833252429962158, + "learning_rate": 4.954357358493973e-05, + "loss": 0.6168, + "step": 67800 + }, + { + "epoch": 3.055081319076287, + "grad_norm": 4.240931034088135, + "learning_rate": 4.954088491802481e-05, + "loss": 0.6033, + "step": 68000 + }, + { + "epoch": 3.055081319076287, + "eval_loss": 2.8714144229888916, + "eval_runtime": 1105.2254, + "eval_samples_per_second": 8.961, + "eval_steps_per_second": 0.14, + "step": 68000 + }, + { + "epoch": 3.064066852367688, + "grad_norm": 9.208168983459473, + "learning_rate": 4.953818842868212e-05, + "loss": 0.5893, + "step": 68200 + }, + { + "epoch": 3.073052385659089, + "grad_norm": 3.6979544162750244, + "learning_rate": 4.953548411777117e-05, + "loss": 0.6, + "step": 68400 + }, + { + "epoch": 3.0820379189504896, + "grad_norm": 5.291320323944092, + "learning_rate": 4.953277198615397e-05, + 
"loss": 0.5899, + "step": 68600 + }, + { + "epoch": 3.0910234522418905, + "grad_norm": 3.7340753078460693, + "learning_rate": 4.9530052034695e-05, + "loss": 0.6183, + "step": 68800 + }, + { + "epoch": 3.1000089855332913, + "grad_norm": 2.6057052612304688, + "learning_rate": 4.952732426426126e-05, + "loss": 0.6176, + "step": 69000 + }, + { + "epoch": 3.1000089855332913, + "eval_loss": 2.7742364406585693, + "eval_runtime": 1104.5457, + "eval_samples_per_second": 8.967, + "eval_steps_per_second": 0.14, + "step": 69000 + }, + { + "epoch": 3.108994518824692, + "grad_norm": 11.468999862670898, + "learning_rate": 4.9524588675722205e-05, + "loss": 0.5958, + "step": 69200 + }, + { + "epoch": 3.117980052116093, + "grad_norm": 4.5051374435424805, + "learning_rate": 4.952184526994983e-05, + "loss": 0.6213, + "step": 69400 + }, + { + "epoch": 3.126965585407494, + "grad_norm": 4.247747421264648, + "learning_rate": 4.951909404781859e-05, + "loss": 0.6011, + "step": 69600 + }, + { + "epoch": 3.1359511186988946, + "grad_norm": 6.309694290161133, + "learning_rate": 4.951633501020545e-05, + "loss": 0.6028, + "step": 69800 + }, + { + "epoch": 3.1449366519902955, + "grad_norm": 1.6225708723068237, + "learning_rate": 4.951356815798983e-05, + "loss": 0.6235, + "step": 70000 + }, + { + "epoch": 3.1449366519902955, + "eval_loss": 2.717803478240967, + "eval_runtime": 1104.1485, + "eval_samples_per_second": 8.97, + "eval_steps_per_second": 0.14, + "step": 70000 + }, + { + "epoch": 3.1539221852816963, + "grad_norm": 4.1915106773376465, + "learning_rate": 4.95107934920537e-05, + "loss": 0.5785, + "step": 70200 + }, + { + "epoch": 3.162907718573097, + "grad_norm": 3.8733890056610107, + "learning_rate": 4.9508011013281454e-05, + "loss": 0.6236, + "step": 70400 + }, + { + "epoch": 3.171893251864498, + "grad_norm": 8.979776382446289, + "learning_rate": 4.950522072256003e-05, + "loss": 0.6158, + "step": 70600 + }, + { + "epoch": 3.180878785155899, + "grad_norm": 4.072059154510498, + "learning_rate": 4.950242262077883e-05, + "loss": 0.627, + "step": 70800 + }, + { + "epoch": 3.1898643184472997, + "grad_norm": 5.936033248901367, + "learning_rate": 4.9499616708829744e-05, + "loss": 0.5612, + "step": 71000 + }, + { + "epoch": 3.1898643184472997, + "eval_loss": 2.694528579711914, + "eval_runtime": 1096.847, + "eval_samples_per_second": 9.03, + "eval_steps_per_second": 0.141, + "step": 71000 + }, + { + "epoch": 3.1988498517387005, + "grad_norm": 7.062220573425293, + "learning_rate": 4.9496802987607174e-05, + "loss": 0.5959, + "step": 71200 + }, + { + "epoch": 3.2078353850301013, + "grad_norm": 4.436807155609131, + "learning_rate": 4.9493981458007986e-05, + "loss": 0.6131, + "step": 71400 + }, + { + "epoch": 3.216820918321502, + "grad_norm": 4.5539021492004395, + "learning_rate": 4.949115212093155e-05, + "loss": 0.5965, + "step": 71600 + }, + { + "epoch": 3.225806451612903, + "grad_norm": 13.243054389953613, + "learning_rate": 4.9488314977279716e-05, + "loss": 0.5439, + "step": 71800 + }, + { + "epoch": 3.234791984904304, + "grad_norm": 11.988075256347656, + "learning_rate": 4.948547002795682e-05, + "loss": 0.6139, + "step": 72000 + }, + { + "epoch": 3.234791984904304, + "eval_loss": 2.7093992233276367, + "eval_runtime": 1096.9087, + "eval_samples_per_second": 9.029, + "eval_steps_per_second": 0.141, + "step": 72000 + }, + { + "epoch": 3.2437775181957047, + "grad_norm": 2.3277647495269775, + "learning_rate": 4.9482617273869705e-05, + "loss": 0.618, + "step": 72200 + }, + { + "epoch": 3.252763051487106, + "grad_norm": 
6.193905830383301, + "learning_rate": 4.947975671592768e-05, + "loss": 0.5845, + "step": 72400 + }, + { + "epoch": 3.2617485847785064, + "grad_norm": 3.807849884033203, + "learning_rate": 4.9476888355042555e-05, + "loss": 0.6207, + "step": 72600 + }, + { + "epoch": 3.2707341180699077, + "grad_norm": 13.691109657287598, + "learning_rate": 4.9474012192128615e-05, + "loss": 0.5921, + "step": 72800 + }, + { + "epoch": 3.279719651361308, + "grad_norm": 8.186936378479004, + "learning_rate": 4.947112822810265e-05, + "loss": 0.6381, + "step": 73000 + }, + { + "epoch": 3.279719651361308, + "eval_loss": 2.7966694831848145, + "eval_runtime": 1103.5256, + "eval_samples_per_second": 8.975, + "eval_steps_per_second": 0.14, + "step": 73000 + }, + { + "epoch": 3.2887051846527093, + "grad_norm": 2.7031075954437256, + "learning_rate": 4.946823646388392e-05, + "loss": 0.6346, + "step": 73200 + }, + { + "epoch": 3.29769071794411, + "grad_norm": 1.7532190084457397, + "learning_rate": 4.9465336900394174e-05, + "loss": 0.5815, + "step": 73400 + }, + { + "epoch": 3.306676251235511, + "grad_norm": 5.828246116638184, + "learning_rate": 4.946242953855765e-05, + "loss": 0.6277, + "step": 73600 + }, + { + "epoch": 3.315661784526912, + "grad_norm": 3.648778200149536, + "learning_rate": 4.9459514379301084e-05, + "loss": 0.5939, + "step": 73800 + }, + { + "epoch": 3.3246473178183127, + "grad_norm": 4.8969597816467285, + "learning_rate": 4.945659142355368e-05, + "loss": 0.6147, + "step": 74000 + }, + { + "epoch": 3.3246473178183127, + "eval_loss": 2.834960460662842, + "eval_runtime": 1095.2072, + "eval_samples_per_second": 9.043, + "eval_steps_per_second": 0.142, + "step": 74000 + }, + { + "epoch": 3.3336328511097135, + "grad_norm": 12.062762260437012, + "learning_rate": 4.9453660672247124e-05, + "loss": 0.6336, + "step": 74200 + }, + { + "epoch": 3.3426183844011144, + "grad_norm": 10.92843246459961, + "learning_rate": 4.945072212631561e-05, + "loss": 0.638, + "step": 74400 + }, + { + "epoch": 3.351603917692515, + "grad_norm": 7.536855220794678, + "learning_rate": 4.9447775786695785e-05, + "loss": 0.6045, + "step": 74600 + }, + { + "epoch": 3.360589450983916, + "grad_norm": 3.968078136444092, + "learning_rate": 4.94448216543268e-05, + "loss": 0.5983, + "step": 74800 + }, + { + "epoch": 3.369574984275317, + "grad_norm": 2.125988006591797, + "learning_rate": 4.94418597301503e-05, + "loss": 0.6118, + "step": 75000 + }, + { + "epoch": 3.369574984275317, + "eval_loss": 2.783966064453125, + "eval_runtime": 1095.5505, + "eval_samples_per_second": 9.04, + "eval_steps_per_second": 0.141, + "step": 75000 + }, + { + "epoch": 3.3785605175667177, + "grad_norm": 5.085707187652588, + "learning_rate": 4.9438890015110395e-05, + "loss": 0.5765, + "step": 75200 + }, + { + "epoch": 3.3875460508581186, + "grad_norm": 4.397859573364258, + "learning_rate": 4.943591251015368e-05, + "loss": 0.6046, + "step": 75400 + }, + { + "epoch": 3.3965315841495194, + "grad_norm": 2.367764711380005, + "learning_rate": 4.943292721622925e-05, + "loss": 0.6331, + "step": 75600 + }, + { + "epoch": 3.4055171174409202, + "grad_norm": 7.137909889221191, + "learning_rate": 4.942993413428865e-05, + "loss": 0.5902, + "step": 75800 + }, + { + "epoch": 3.414502650732321, + "grad_norm": 4.154844760894775, + "learning_rate": 4.942693326528594e-05, + "loss": 0.5684, + "step": 76000 + }, + { + "epoch": 3.414502650732321, + "eval_loss": 2.7368874549865723, + "eval_runtime": 1095.0529, + "eval_samples_per_second": 9.044, + "eval_steps_per_second": 0.142, + "step": 76000 + }, + 
{ + "epoch": 3.423488184023722, + "grad_norm": 2.66355299949646, + "learning_rate": 4.9423924610177645e-05, + "loss": 0.6279, + "step": 76200 + }, + { + "epoch": 3.4324737173151227, + "grad_norm": 4.36577033996582, + "learning_rate": 4.942090816992278e-05, + "loss": 0.6016, + "step": 76400 + }, + { + "epoch": 3.4414592506065236, + "grad_norm": 5.2936625480651855, + "learning_rate": 4.9417883945482835e-05, + "loss": 0.6143, + "step": 76600 + }, + { + "epoch": 3.4504447838979244, + "grad_norm": 7.122065544128418, + "learning_rate": 4.9414851937821794e-05, + "loss": 0.6202, + "step": 76800 + }, + { + "epoch": 3.4594303171893253, + "grad_norm": 6.634164333343506, + "learning_rate": 4.941181214790609e-05, + "loss": 0.582, + "step": 77000 + }, + { + "epoch": 3.4594303171893253, + "eval_loss": 2.721560478210449, + "eval_runtime": 1095.5312, + "eval_samples_per_second": 9.04, + "eval_steps_per_second": 0.141, + "step": 77000 + }, + { + "epoch": 3.468415850480726, + "grad_norm": 7.679781436920166, + "learning_rate": 4.940876457670468e-05, + "loss": 0.6062, + "step": 77200 + }, + { + "epoch": 3.477401383772127, + "grad_norm": 4.641097068786621, + "learning_rate": 4.9405709225188966e-05, + "loss": 0.5853, + "step": 77400 + }, + { + "epoch": 3.4863869170635278, + "grad_norm": 4.262377738952637, + "learning_rate": 4.940264609433286e-05, + "loss": 0.6164, + "step": 77600 + }, + { + "epoch": 3.4953724503549286, + "grad_norm": 2.9696292877197266, + "learning_rate": 4.939957518511272e-05, + "loss": 0.6181, + "step": 77800 + }, + { + "epoch": 3.5043579836463294, + "grad_norm": 2.491093158721924, + "learning_rate": 4.9396496498507414e-05, + "loss": 0.6236, + "step": 78000 + }, + { + "epoch": 3.5043579836463294, + "eval_loss": 2.689380407333374, + "eval_runtime": 1095.9701, + "eval_samples_per_second": 9.037, + "eval_steps_per_second": 0.141, + "step": 78000 + }, + { + "epoch": 3.5133435169377303, + "grad_norm": 3.549752950668335, + "learning_rate": 4.9393410035498264e-05, + "loss": 0.6144, + "step": 78200 + }, + { + "epoch": 3.522329050229131, + "grad_norm": 33.26611328125, + "learning_rate": 4.9390315797069084e-05, + "loss": 0.6332, + "step": 78400 + }, + { + "epoch": 3.531314583520532, + "grad_norm": 4.73014497756958, + "learning_rate": 4.9387213784206185e-05, + "loss": 0.6195, + "step": 78600 + }, + { + "epoch": 3.540300116811933, + "grad_norm": 11.499771118164062, + "learning_rate": 4.938410399789831e-05, + "loss": 0.6105, + "step": 78800 + }, + { + "epoch": 3.5492856501033336, + "grad_norm": 9.83093547821045, + "learning_rate": 4.9380986439136725e-05, + "loss": 0.6256, + "step": 79000 + }, + { + "epoch": 3.5492856501033336, + "eval_loss": 2.74749493598938, + "eval_runtime": 1097.8988, + "eval_samples_per_second": 9.021, + "eval_steps_per_second": 0.141, + "step": 79000 + }, + { + "epoch": 3.5582711833947345, + "grad_norm": 5.551429271697998, + "learning_rate": 4.9377861108915136e-05, + "loss": 0.6412, + "step": 79200 + }, + { + "epoch": 3.5672567166861353, + "grad_norm": 5.982589244842529, + "learning_rate": 4.937472800822976e-05, + "loss": 0.5878, + "step": 79400 + }, + { + "epoch": 3.576242249977536, + "grad_norm": 5.788779258728027, + "learning_rate": 4.937158713807927e-05, + "loss": 0.6077, + "step": 79600 + }, + { + "epoch": 3.585227783268937, + "grad_norm": 5.566563129425049, + "learning_rate": 4.9368438499464826e-05, + "loss": 0.6108, + "step": 79800 + }, + { + "epoch": 3.594213316560338, + "grad_norm": 1.8803223371505737, + "learning_rate": 4.9365282093390055e-05, + "loss": 0.5926, + "step": 
80000 + }, + { + "epoch": 3.594213316560338, + "eval_loss": 2.700577974319458, + "eval_runtime": 1096.7835, + "eval_samples_per_second": 9.03, + "eval_steps_per_second": 0.141, + "step": 80000 + }, + { + "epoch": 3.6031988498517387, + "grad_norm": 5.282078742980957, + "learning_rate": 4.9362117920861063e-05, + "loss": 0.5906, + "step": 80200 + }, + { + "epoch": 3.6121843831431395, + "grad_norm": 3.943328380584717, + "learning_rate": 4.935894598288643e-05, + "loss": 0.6109, + "step": 80400 + }, + { + "epoch": 3.6211699164345403, + "grad_norm": 19.697898864746094, + "learning_rate": 4.935576628047722e-05, + "loss": 0.5673, + "step": 80600 + }, + { + "epoch": 3.630155449725941, + "grad_norm": 7.314117908477783, + "learning_rate": 4.935257881464696e-05, + "loss": 0.6112, + "step": 80800 + }, + { + "epoch": 3.639140983017342, + "grad_norm": 8.926667213439941, + "learning_rate": 4.934938358641167e-05, + "loss": 0.5875, + "step": 81000 + }, + { + "epoch": 3.639140983017342, + "eval_loss": 2.7504782676696777, + "eval_runtime": 1097.743, + "eval_samples_per_second": 9.022, + "eval_steps_per_second": 0.141, + "step": 81000 + }, + { + "epoch": 3.648126516308743, + "grad_norm": 1.6228649616241455, + "learning_rate": 4.934618059678981e-05, + "loss": 0.5964, + "step": 81200 + }, + { + "epoch": 3.6571120496001437, + "grad_norm": 7.490013599395752, + "learning_rate": 4.934296984680236e-05, + "loss": 0.605, + "step": 81400 + }, + { + "epoch": 3.6660975828915445, + "grad_norm": 5.786327362060547, + "learning_rate": 4.933975133747273e-05, + "loss": 0.5523, + "step": 81600 + }, + { + "epoch": 3.6750831161829454, + "grad_norm": 6.276517868041992, + "learning_rate": 4.9336525069826834e-05, + "loss": 0.6328, + "step": 81800 + }, + { + "epoch": 3.684068649474346, + "grad_norm": 4.784965515136719, + "learning_rate": 4.933329104489304e-05, + "loss": 0.6267, + "step": 82000 + }, + { + "epoch": 3.684068649474346, + "eval_loss": 2.812925338745117, + "eval_runtime": 1084.0469, + "eval_samples_per_second": 9.136, + "eval_steps_per_second": 0.143, + "step": 82000 + }, + { + "epoch": 3.693054182765747, + "grad_norm": 1.2591400146484375, + "learning_rate": 4.9330049263702205e-05, + "loss": 0.6042, + "step": 82200 + }, + { + "epoch": 3.702039716057148, + "grad_norm": 2.7729320526123047, + "learning_rate": 4.932679972728764e-05, + "loss": 0.591, + "step": 82400 + }, + { + "epoch": 3.7110252493485487, + "grad_norm": 2.3185465335845947, + "learning_rate": 4.9323542436685144e-05, + "loss": 0.5797, + "step": 82600 + }, + { + "epoch": 3.7200107826399496, + "grad_norm": 7.948742389678955, + "learning_rate": 4.932027739293298e-05, + "loss": 0.6366, + "step": 82800 + }, + { + "epoch": 3.7289963159313504, + "grad_norm": 7.0373992919921875, + "learning_rate": 4.931700459707188e-05, + "loss": 0.6231, + "step": 83000 + }, + { + "epoch": 3.7289963159313504, + "eval_loss": 2.6898717880249023, + "eval_runtime": 1082.2616, + "eval_samples_per_second": 9.151, + "eval_steps_per_second": 0.143, + "step": 83000 + }, + { + "epoch": 3.7379818492227512, + "grad_norm": 2.6516005992889404, + "learning_rate": 4.931372405014505e-05, + "loss": 0.5767, + "step": 83200 + }, + { + "epoch": 3.746967382514152, + "grad_norm": 3.6714022159576416, + "learning_rate": 4.9310435753198174e-05, + "loss": 0.6415, + "step": 83400 + }, + { + "epoch": 3.755952915805553, + "grad_norm": 2.8350040912628174, + "learning_rate": 4.930713970727939e-05, + "loss": 0.6196, + "step": 83600 + }, + { + "epoch": 3.7649384490969537, + "grad_norm": 6.588120937347412, + 
"learning_rate": 4.930383591343933e-05, + "loss": 0.6076, + "step": 83800 + }, + { + "epoch": 3.7739239823883546, + "grad_norm": 10.156900405883789, + "learning_rate": 4.930052437273107e-05, + "loss": 0.5944, + "step": 84000 + }, + { + "epoch": 3.7739239823883546, + "eval_loss": 2.7181143760681152, + "eval_runtime": 1080.4885, + "eval_samples_per_second": 9.166, + "eval_steps_per_second": 0.143, + "step": 84000 + }, + { + "epoch": 3.782909515679756, + "grad_norm": 7.760807037353516, + "learning_rate": 4.9297205086210166e-05, + "loss": 0.6227, + "step": 84200 + }, + { + "epoch": 3.7918950489711563, + "grad_norm": 4.258764266967773, + "learning_rate": 4.929387805493464e-05, + "loss": 0.5706, + "step": 84400 + }, + { + "epoch": 3.8008805822625575, + "grad_norm": 1.825241208076477, + "learning_rate": 4.9290543279965e-05, + "loss": 0.6034, + "step": 84600 + }, + { + "epoch": 3.809866115553958, + "grad_norm": 6.256824493408203, + "learning_rate": 4.9287200762364196e-05, + "loss": 0.5564, + "step": 84800 + }, + { + "epoch": 3.818851648845359, + "grad_norm": 3.7286887168884277, + "learning_rate": 4.9283850503197657e-05, + "loss": 0.5849, + "step": 85000 + }, + { + "epoch": 3.818851648845359, + "eval_loss": 2.7389979362487793, + "eval_runtime": 1084.0935, + "eval_samples_per_second": 9.136, + "eval_steps_per_second": 0.143, + "step": 85000 + }, + { + "epoch": 3.8278371821367596, + "grad_norm": 7.849632740020752, + "learning_rate": 4.928049250353329e-05, + "loss": 0.6199, + "step": 85200 + }, + { + "epoch": 3.836822715428161, + "grad_norm": 6.8108439445495605, + "learning_rate": 4.927712676444146e-05, + "loss": 0.5899, + "step": 85400 + }, + { + "epoch": 3.8458082487195613, + "grad_norm": 10.76682186126709, + "learning_rate": 4.9273753286995e-05, + "loss": 0.5788, + "step": 85600 + }, + { + "epoch": 3.8547937820109626, + "grad_norm": 3.199047088623047, + "learning_rate": 4.9270372072269195e-05, + "loss": 0.5883, + "step": 85800 + }, + { + "epoch": 3.863779315302363, + "grad_norm": 9.04162883758545, + "learning_rate": 4.926698312134183e-05, + "loss": 0.5848, + "step": 86000 + }, + { + "epoch": 3.863779315302363, + "eval_loss": 2.729203939437866, + "eval_runtime": 1081.4692, + "eval_samples_per_second": 9.158, + "eval_steps_per_second": 0.143, + "step": 86000 + }, + { + "epoch": 3.8727648485937642, + "grad_norm": 4.6888909339904785, + "learning_rate": 4.926358643529311e-05, + "loss": 0.6202, + "step": 86200 + }, + { + "epoch": 3.8817503818851646, + "grad_norm": 4.689401149749756, + "learning_rate": 4.9260182015205756e-05, + "loss": 0.5842, + "step": 86400 + }, + { + "epoch": 3.890735915176566, + "grad_norm": 5.316648483276367, + "learning_rate": 4.925676986216492e-05, + "loss": 0.639, + "step": 86600 + }, + { + "epoch": 3.8997214484679663, + "grad_norm": 8.970780372619629, + "learning_rate": 4.9253349977258224e-05, + "loss": 0.5849, + "step": 86800 + }, + { + "epoch": 3.9087069817593676, + "grad_norm": 6.301709175109863, + "learning_rate": 4.924992236157577e-05, + "loss": 0.6302, + "step": 87000 + }, + { + "epoch": 3.9087069817593676, + "eval_loss": 2.6868460178375244, + "eval_runtime": 1082.2018, + "eval_samples_per_second": 9.152, + "eval_steps_per_second": 0.143, + "step": 87000 + }, + { + "epoch": 3.917692515050768, + "grad_norm": 7.46571159362793, + "learning_rate": 4.9246487016210105e-05, + "loss": 0.6067, + "step": 87200 + }, + { + "epoch": 3.9266780483421693, + "grad_norm": 2.6615748405456543, + "learning_rate": 4.924304394225626e-05, + "loss": 0.5964, + "step": 87400 + }, + { + "epoch": 
3.93566358163357, + "grad_norm": 1.640554666519165, + "learning_rate": 4.92395931408117e-05, + "loss": 0.594, + "step": 87600 + }, + { + "epoch": 3.944649114924971, + "grad_norm": 6.6660919189453125, + "learning_rate": 4.923613461297638e-05, + "loss": 0.5728, + "step": 87800 + }, + { + "epoch": 3.953634648216372, + "grad_norm": 8.77531909942627, + "learning_rate": 4.923266835985271e-05, + "loss": 0.5873, + "step": 88000 + }, + { + "epoch": 3.953634648216372, + "eval_loss": 2.6699206829071045, + "eval_runtime": 1089.8325, + "eval_samples_per_second": 9.088, + "eval_steps_per_second": 0.142, + "step": 88000 + }, + { + "epoch": 3.9626201815077726, + "grad_norm": 9.528241157531738, + "learning_rate": 4.922919438254556e-05, + "loss": 0.5803, + "step": 88200 + }, + { + "epoch": 3.9716057147991735, + "grad_norm": 1.9404816627502441, + "learning_rate": 4.9225712682162265e-05, + "loss": 0.5529, + "step": 88400 + }, + { + "epoch": 3.9805912480905743, + "grad_norm": 10.01131820678711, + "learning_rate": 4.922222325981262e-05, + "loss": 0.6296, + "step": 88600 + }, + { + "epoch": 3.989576781381975, + "grad_norm": 12.538310050964355, + "learning_rate": 4.921872611660887e-05, + "loss": 0.5903, + "step": 88800 + }, + { + "epoch": 3.998562314673376, + "grad_norm": 1.599368691444397, + "learning_rate": 4.921522125366574e-05, + "loss": 0.6081, + "step": 89000 + }, + { + "epoch": 3.998562314673376, + "eval_loss": 2.7178070545196533, + "eval_runtime": 1080.1856, + "eval_samples_per_second": 9.169, + "eval_steps_per_second": 0.143, + "step": 89000 + }, + { + "epoch": 4.007547847964776, + "grad_norm": 11.243287086486816, + "learning_rate": 4.921170867210042e-05, + "loss": 0.5604, + "step": 89200 + }, + { + "epoch": 4.016533381256178, + "grad_norm": 4.789255619049072, + "learning_rate": 4.920818837303253e-05, + "loss": 0.5699, + "step": 89400 + }, + { + "epoch": 4.025518914547578, + "grad_norm": 14.564445495605469, + "learning_rate": 4.920466035758418e-05, + "loss": 0.5595, + "step": 89600 + }, + { + "epoch": 4.034504447838979, + "grad_norm": 8.886981010437012, + "learning_rate": 4.920112462687993e-05, + "loss": 0.5749, + "step": 89800 + }, + { + "epoch": 4.04348998113038, + "grad_norm": 8.778055191040039, + "learning_rate": 4.919758118204678e-05, + "loss": 0.5711, + "step": 90000 + }, + { + "epoch": 4.04348998113038, + "eval_loss": 2.7640573978424072, + "eval_runtime": 1082.5818, + "eval_samples_per_second": 9.148, + "eval_steps_per_second": 0.143, + "step": 90000 + }, + { + "epoch": 4.052475514421781, + "grad_norm": 3.818753242492676, + "learning_rate": 4.9194030024214225e-05, + "loss": 0.5166, + "step": 90200 + }, + { + "epoch": 4.061461047713181, + "grad_norm": 6.440443992614746, + "learning_rate": 4.919047115451418e-05, + "loss": 0.5528, + "step": 90400 + }, + { + "epoch": 4.070446581004583, + "grad_norm": 6.763418197631836, + "learning_rate": 4.918690457408106e-05, + "loss": 0.5533, + "step": 90600 + }, + { + "epoch": 4.079432114295983, + "grad_norm": 4.209813117980957, + "learning_rate": 4.9183330284051695e-05, + "loss": 0.5437, + "step": 90800 + }, + { + "epoch": 4.088417647587384, + "grad_norm": 10.399232864379883, + "learning_rate": 4.917974828556541e-05, + "loss": 0.5665, + "step": 91000 + }, + { + "epoch": 4.088417647587384, + "eval_loss": 2.688040256500244, + "eval_runtime": 1080.6131, + "eval_samples_per_second": 9.165, + "eval_steps_per_second": 0.143, + "step": 91000 + }, + { + "epoch": 4.097403180878785, + "grad_norm": 2.827580213546753, + "learning_rate": 4.917615857976396e-05, + "loss": 
0.5812, + "step": 91200 + }, + { + "epoch": 4.106388714170186, + "grad_norm": 3.4965403079986572, + "learning_rate": 4.917256116779157e-05, + "loss": 0.6076, + "step": 91400 + }, + { + "epoch": 4.115374247461586, + "grad_norm": 4.934850692749023, + "learning_rate": 4.916895605079492e-05, + "loss": 0.5613, + "step": 91600 + }, + { + "epoch": 4.124359780752988, + "grad_norm": 6.726780891418457, + "learning_rate": 4.916534322992314e-05, + "loss": 0.6017, + "step": 91800 + }, + { + "epoch": 4.133345314044389, + "grad_norm": 2.464892625808716, + "learning_rate": 4.9161722706327826e-05, + "loss": 0.5902, + "step": 92000 + }, + { + "epoch": 4.133345314044389, + "eval_loss": 2.6801517009735107, + "eval_runtime": 1082.5084, + "eval_samples_per_second": 9.149, + "eval_steps_per_second": 0.143, + "step": 92000 + }, + { + "epoch": 4.142330847335789, + "grad_norm": 4.2705254554748535, + "learning_rate": 4.915809448116302e-05, + "loss": 0.558, + "step": 92200 + }, + { + "epoch": 4.15131638062719, + "grad_norm": 11.47816276550293, + "learning_rate": 4.915445855558522e-05, + "loss": 0.5689, + "step": 92400 + }, + { + "epoch": 4.160301913918591, + "grad_norm": 8.396933555603027, + "learning_rate": 4.9150814930753374e-05, + "loss": 0.5982, + "step": 92600 + }, + { + "epoch": 4.169287447209992, + "grad_norm": 5.501452922821045, + "learning_rate": 4.914716360782889e-05, + "loss": 0.5738, + "step": 92800 + }, + { + "epoch": 4.178272980501393, + "grad_norm": 8.553749084472656, + "learning_rate": 4.914350458797565e-05, + "loss": 0.5496, + "step": 93000 + }, + { + "epoch": 4.178272980501393, + "eval_loss": 2.7101192474365234, + "eval_runtime": 1082.8384, + "eval_samples_per_second": 9.146, + "eval_steps_per_second": 0.143, + "step": 93000 + }, + { + "epoch": 4.187258513792794, + "grad_norm": 18.494911193847656, + "learning_rate": 4.913983787235996e-05, + "loss": 0.5905, + "step": 93200 + }, + { + "epoch": 4.196244047084194, + "grad_norm": 4.566243648529053, + "learning_rate": 4.913616346215057e-05, + "loss": 0.5712, + "step": 93400 + }, + { + "epoch": 4.205229580375596, + "grad_norm": 5.748531818389893, + "learning_rate": 4.9132481358518735e-05, + "loss": 0.558, + "step": 93600 + }, + { + "epoch": 4.214215113666996, + "grad_norm": 3.77885365486145, + "learning_rate": 4.9128791562638096e-05, + "loss": 0.5927, + "step": 93800 + }, + { + "epoch": 4.223200646958397, + "grad_norm": 2.6284022331237793, + "learning_rate": 4.9125094075684805e-05, + "loss": 0.5953, + "step": 94000 + }, + { + "epoch": 4.223200646958397, + "eval_loss": 2.712245225906372, + "eval_runtime": 1088.8302, + "eval_samples_per_second": 9.096, + "eval_steps_per_second": 0.142, + "step": 94000 + }, + { + "epoch": 4.232186180249798, + "grad_norm": 5.8867645263671875, + "learning_rate": 4.9121388898837415e-05, + "loss": 0.5895, + "step": 94200 + }, + { + "epoch": 4.241171713541199, + "grad_norm": 6.118598937988281, + "learning_rate": 4.911767603327698e-05, + "loss": 0.6138, + "step": 94400 + }, + { + "epoch": 4.250157246832599, + "grad_norm": 7.058086395263672, + "learning_rate": 4.911395548018696e-05, + "loss": 0.5921, + "step": 94600 + }, + { + "epoch": 4.259142780124001, + "grad_norm": 6.587648391723633, + "learning_rate": 4.911022724075329e-05, + "loss": 0.5778, + "step": 94800 + }, + { + "epoch": 4.268128313415401, + "grad_norm": 1.6069397926330566, + "learning_rate": 4.910649131616435e-05, + "loss": 0.6262, + "step": 95000 + }, + { + "epoch": 4.268128313415401, + "eval_loss": 2.6547911167144775, + "eval_runtime": 1085.8261, + 
"eval_samples_per_second": 9.121, + "eval_steps_per_second": 0.143, + "step": 95000 + }, + { + "epoch": 4.277113846706802, + "grad_norm": 6.686661243438721, + "learning_rate": 4.910274770761096e-05, + "loss": 0.5864, + "step": 95200 + }, + { + "epoch": 4.286099379998203, + "grad_norm": 7.897719860076904, + "learning_rate": 4.909899641628641e-05, + "loss": 0.5884, + "step": 95400 + }, + { + "epoch": 4.295084913289604, + "grad_norm": 7.400073528289795, + "learning_rate": 4.9095237443386435e-05, + "loss": 0.6021, + "step": 95600 + }, + { + "epoch": 4.3040704465810045, + "grad_norm": 4.220474720001221, + "learning_rate": 4.9091470790109196e-05, + "loss": 0.5518, + "step": 95800 + }, + { + "epoch": 4.313055979872406, + "grad_norm": 1.6574774980545044, + "learning_rate": 4.908769645765532e-05, + "loss": 0.5867, + "step": 96000 + }, + { + "epoch": 4.313055979872406, + "eval_loss": 2.691925525665283, + "eval_runtime": 1089.0317, + "eval_samples_per_second": 9.094, + "eval_steps_per_second": 0.142, + "step": 96000 + }, + { + "epoch": 4.322041513163806, + "grad_norm": 3.5609164237976074, + "learning_rate": 4.908391444722787e-05, + "loss": 0.5803, + "step": 96200 + }, + { + "epoch": 4.331027046455207, + "grad_norm": 3.427290201187134, + "learning_rate": 4.908012476003239e-05, + "loss": 0.554, + "step": 96400 + }, + { + "epoch": 4.340012579746608, + "grad_norm": 52.728878021240234, + "learning_rate": 4.907632739727682e-05, + "loss": 0.5962, + "step": 96600 + }, + { + "epoch": 4.348998113038009, + "grad_norm": 12.754006385803223, + "learning_rate": 4.907252236017159e-05, + "loss": 0.5742, + "step": 96800 + }, + { + "epoch": 4.3579836463294095, + "grad_norm": 8.12136173248291, + "learning_rate": 4.9068709649929544e-05, + "loss": 0.6085, + "step": 97000 + }, + { + "epoch": 4.3579836463294095, + "eval_loss": 2.6768929958343506, + "eval_runtime": 1090.8411, + "eval_samples_per_second": 9.079, + "eval_steps_per_second": 0.142, + "step": 97000 + }, + { + "epoch": 4.366969179620811, + "grad_norm": 5.45872688293457, + "learning_rate": 4.9064889267766e-05, + "loss": 0.5137, + "step": 97200 + }, + { + "epoch": 4.375954712912211, + "grad_norm": 3.9804370403289795, + "learning_rate": 4.9061061214898707e-05, + "loss": 0.5567, + "step": 97400 + }, + { + "epoch": 4.3849402462036124, + "grad_norm": 29.226791381835938, + "learning_rate": 4.9057225492547846e-05, + "loss": 0.5694, + "step": 97600 + }, + { + "epoch": 4.393925779495013, + "grad_norm": 6.9307169914245605, + "learning_rate": 4.9053382101936076e-05, + "loss": 0.5909, + "step": 97800 + }, + { + "epoch": 4.402911312786414, + "grad_norm": 5.833766937255859, + "learning_rate": 4.904953104428846e-05, + "loss": 0.5692, + "step": 98000 + }, + { + "epoch": 4.402911312786414, + "eval_loss": 2.714953660964966, + "eval_runtime": 1094.2189, + "eval_samples_per_second": 9.051, + "eval_steps_per_second": 0.142, + "step": 98000 + }, + { + "epoch": 4.4118968460778145, + "grad_norm": 9.674918174743652, + "learning_rate": 4.904567232083255e-05, + "loss": 0.5795, + "step": 98200 + }, + { + "epoch": 4.420882379369216, + "grad_norm": 17.37355613708496, + "learning_rate": 4.9041805932798295e-05, + "loss": 0.581, + "step": 98400 + }, + { + "epoch": 4.429867912660616, + "grad_norm": 2.3987767696380615, + "learning_rate": 4.9037931881418126e-05, + "loss": 0.5911, + "step": 98600 + }, + { + "epoch": 4.4388534459520175, + "grad_norm": 6.0703558921813965, + "learning_rate": 4.903405016792689e-05, + "loss": 0.6068, + "step": 98800 + }, + { + "epoch": 4.447838979243418, + "grad_norm": 
3.4397573471069336, + "learning_rate": 4.9030160793561886e-05, + "loss": 0.5542, + "step": 99000 + }, + { + "epoch": 4.447838979243418, + "eval_loss": 2.6832633018493652, + "eval_runtime": 1085.7638, + "eval_samples_per_second": 9.122, + "eval_steps_per_second": 0.143, + "step": 99000 + }, + { + "epoch": 4.456824512534819, + "grad_norm": 1.5094788074493408, + "learning_rate": 4.902626375956287e-05, + "loss": 0.575, + "step": 99200 + }, + { + "epoch": 4.4658100458262195, + "grad_norm": 1.8952089548110962, + "learning_rate": 4.902235906717201e-05, + "loss": 0.5773, + "step": 99400 + }, + { + "epoch": 4.474795579117621, + "grad_norm": 6.439733505249023, + "learning_rate": 4.9018446717633923e-05, + "loss": 0.5653, + "step": 99600 + }, + { + "epoch": 4.483781112409021, + "grad_norm": 6.996722221374512, + "learning_rate": 4.90145267121957e-05, + "loss": 0.5823, + "step": 99800 + }, + { + "epoch": 4.4927666457004225, + "grad_norm": 8.791942596435547, + "learning_rate": 4.901059905210682e-05, + "loss": 0.5978, + "step": 100000 + }, + { + "epoch": 4.4927666457004225, + "eval_loss": 2.696164608001709, + "eval_runtime": 1086.8043, + "eval_samples_per_second": 9.113, + "eval_steps_per_second": 0.143, + "step": 100000 + }, + { + "epoch": 4.501752178991823, + "grad_norm": 1.378144383430481, + "learning_rate": 4.900666373861924e-05, + "loss": 0.5769, + "step": 100200 + }, + { + "epoch": 4.510737712283224, + "grad_norm": 11.897534370422363, + "learning_rate": 4.9002720772987345e-05, + "loss": 0.6066, + "step": 100400 + }, + { + "epoch": 4.519723245574625, + "grad_norm": 5.889138698577881, + "learning_rate": 4.899877015646795e-05, + "loss": 0.5708, + "step": 100600 + }, + { + "epoch": 4.528708778866026, + "grad_norm": 8.439177513122559, + "learning_rate": 4.899481189032034e-05, + "loss": 0.5529, + "step": 100800 + }, + { + "epoch": 4.537694312157426, + "grad_norm": 5.41510534286499, + "learning_rate": 4.899084597580619e-05, + "loss": 0.5933, + "step": 101000 + }, + { + "epoch": 4.537694312157426, + "eval_loss": 2.7135655879974365, + "eval_runtime": 1086.9924, + "eval_samples_per_second": 9.111, + "eval_steps_per_second": 0.143, + "step": 101000 + }, + { + "epoch": 4.5466798454488275, + "grad_norm": 6.926478385925293, + "learning_rate": 4.898687241418965e-05, + "loss": 0.5591, + "step": 101200 + }, + { + "epoch": 4.555665378740228, + "grad_norm": 4.796566963195801, + "learning_rate": 4.89828912067373e-05, + "loss": 0.5589, + "step": 101400 + }, + { + "epoch": 4.564650912031629, + "grad_norm": 12.869160652160645, + "learning_rate": 4.897890235471814e-05, + "loss": 0.5826, + "step": 101600 + }, + { + "epoch": 4.57363644532303, + "grad_norm": 9.72813892364502, + "learning_rate": 4.897490585940363e-05, + "loss": 0.5718, + "step": 101800 + }, + { + "epoch": 4.582621978614431, + "grad_norm": 5.5949201583862305, + "learning_rate": 4.8970901722067654e-05, + "loss": 0.5363, + "step": 102000 + }, + { + "epoch": 4.582621978614431, + "eval_loss": 2.71557879447937, + "eval_runtime": 1083.3139, + "eval_samples_per_second": 9.142, + "eval_steps_per_second": 0.143, + "step": 102000 + }, + { + "epoch": 4.591607511905831, + "grad_norm": 4.014338970184326, + "learning_rate": 4.8966889943986524e-05, + "loss": 0.5851, + "step": 102200 + }, + { + "epoch": 4.6005930451972326, + "grad_norm": 8.909133911132812, + "learning_rate": 4.896287052643902e-05, + "loss": 0.5962, + "step": 102400 + }, + { + "epoch": 4.609578578488633, + "grad_norm": 8.902458190917969, + "learning_rate": 4.8958843470706326e-05, + "loss": 0.5596, + "step": 
102600 + }, + { + "epoch": 4.618564111780034, + "grad_norm": 8.509809494018555, + "learning_rate": 4.895480877807206e-05, + "loss": 0.6035, + "step": 102800 + }, + { + "epoch": 4.627549645071435, + "grad_norm": 5.119136333465576, + "learning_rate": 4.895076644982229e-05, + "loss": 0.6273, + "step": 103000 + }, + { + "epoch": 4.627549645071435, + "eval_loss": 2.675107002258301, + "eval_runtime": 1083.9625, + "eval_samples_per_second": 9.137, + "eval_steps_per_second": 0.143, + "step": 103000 + }, + { + "epoch": 4.636535178362836, + "grad_norm": 2.670029640197754, + "learning_rate": 4.894671648724551e-05, + "loss": 0.554, + "step": 103200 + }, + { + "epoch": 4.645520711654236, + "grad_norm": 1.9858131408691406, + "learning_rate": 4.8942658891632654e-05, + "loss": 0.5506, + "step": 103400 + }, + { + "epoch": 4.654506244945638, + "grad_norm": 4.778411388397217, + "learning_rate": 4.893859366427708e-05, + "loss": 0.5714, + "step": 103600 + }, + { + "epoch": 4.663491778237038, + "grad_norm": 13.496174812316895, + "learning_rate": 4.893452080647457e-05, + "loss": 0.5609, + "step": 103800 + }, + { + "epoch": 4.672477311528439, + "grad_norm": 3.933356285095215, + "learning_rate": 4.893044031952338e-05, + "loss": 0.5461, + "step": 104000 + }, + { + "epoch": 4.672477311528439, + "eval_loss": 2.6608850955963135, + "eval_runtime": 1085.6954, + "eval_samples_per_second": 9.122, + "eval_steps_per_second": 0.143, + "step": 104000 + }, + { + "epoch": 4.6814628448198405, + "grad_norm": 6.484622001647949, + "learning_rate": 4.8926352204724145e-05, + "loss": 0.5888, + "step": 104200 + }, + { + "epoch": 4.690448378111241, + "grad_norm": 13.072513580322266, + "learning_rate": 4.892225646337996e-05, + "loss": 0.6129, + "step": 104400 + }, + { + "epoch": 4.699433911402641, + "grad_norm": 9.19959545135498, + "learning_rate": 4.891815309679636e-05, + "loss": 0.5822, + "step": 104600 + }, + { + "epoch": 4.708419444694043, + "grad_norm": 2.801856517791748, + "learning_rate": 4.8914042106281264e-05, + "loss": 0.6029, + "step": 104800 + }, + { + "epoch": 4.717404977985444, + "grad_norm": 10.685206413269043, + "learning_rate": 4.8909923493145096e-05, + "loss": 0.5901, + "step": 105000 + }, + { + "epoch": 4.717404977985444, + "eval_loss": 2.635706901550293, + "eval_runtime": 1084.0059, + "eval_samples_per_second": 9.136, + "eval_steps_per_second": 0.143, + "step": 105000 + }, + { + "epoch": 4.726390511276844, + "grad_norm": 3.1026599407196045, + "learning_rate": 4.8905797258700634e-05, + "loss": 0.5829, + "step": 105200 + }, + { + "epoch": 4.735376044568245, + "grad_norm": 11.270343780517578, + "learning_rate": 4.890166340426313e-05, + "loss": 0.5699, + "step": 105400 + }, + { + "epoch": 4.744361577859646, + "grad_norm": 7.997730731964111, + "learning_rate": 4.8897521931150266e-05, + "loss": 0.5969, + "step": 105600 + }, + { + "epoch": 4.753347111151047, + "grad_norm": 9.27990436553955, + "learning_rate": 4.8893372840682116e-05, + "loss": 0.5781, + "step": 105800 + }, + { + "epoch": 4.762332644442448, + "grad_norm": 6.486850261688232, + "learning_rate": 4.888921613418122e-05, + "loss": 0.5926, + "step": 106000 + }, + { + "epoch": 4.762332644442448, + "eval_loss": 2.67816424369812, + "eval_runtime": 1076.6519, + "eval_samples_per_second": 9.199, + "eval_steps_per_second": 0.144, + "step": 106000 + }, + { + "epoch": 4.771318177733848, + "grad_norm": 7.903515338897705, + "learning_rate": 4.8885051812972536e-05, + "loss": 0.5706, + "step": 106200 + }, + { + "epoch": 4.780303711025249, + "grad_norm": 4.940199375152588, + 
"learning_rate": 4.8880879878383436e-05, + "loss": 0.5647, + "step": 106400 + }, + { + "epoch": 4.789289244316651, + "grad_norm": 9.641985893249512, + "learning_rate": 4.887670033174373e-05, + "loss": 0.5661, + "step": 106600 + }, + { + "epoch": 4.798274777608051, + "grad_norm": 6.985136985778809, + "learning_rate": 4.887251317438566e-05, + "loss": 0.5938, + "step": 106800 + }, + { + "epoch": 4.807260310899451, + "grad_norm": 3.396899700164795, + "learning_rate": 4.886831840764387e-05, + "loss": 0.572, + "step": 107000 + }, + { + "epoch": 4.807260310899451, + "eval_loss": 2.6387288570404053, + "eval_runtime": 1076.2791, + "eval_samples_per_second": 9.202, + "eval_steps_per_second": 0.144, + "step": 107000 + }, + { + "epoch": 4.816245844190853, + "grad_norm": 12.026623725891113, + "learning_rate": 4.8864116032855455e-05, + "loss": 0.5438, + "step": 107200 + }, + { + "epoch": 4.825231377482254, + "grad_norm": 5.219661712646484, + "learning_rate": 4.885990605135993e-05, + "loss": 0.558, + "step": 107400 + }, + { + "epoch": 4.834216910773654, + "grad_norm": 10.39129638671875, + "learning_rate": 4.8855688464499215e-05, + "loss": 0.5929, + "step": 107600 + }, + { + "epoch": 4.843202444065056, + "grad_norm": 2.12060546875, + "learning_rate": 4.8851463273617694e-05, + "loss": 0.5864, + "step": 107800 + }, + { + "epoch": 4.852187977356456, + "grad_norm": 15.424951553344727, + "learning_rate": 4.884723048006212e-05, + "loss": 0.585, + "step": 108000 + }, + { + "epoch": 4.852187977356456, + "eval_loss": 2.6704163551330566, + "eval_runtime": 1076.6628, + "eval_samples_per_second": 9.199, + "eval_steps_per_second": 0.144, + "step": 108000 + }, + { + "epoch": 4.861173510647857, + "grad_norm": 4.717384338378906, + "learning_rate": 4.8842990085181725e-05, + "loss": 0.5606, + "step": 108200 + }, + { + "epoch": 4.870159043939258, + "grad_norm": 8.064077377319336, + "learning_rate": 4.883874209032813e-05, + "loss": 0.5986, + "step": 108400 + }, + { + "epoch": 4.879144577230659, + "grad_norm": 3.4180448055267334, + "learning_rate": 4.8834486496855374e-05, + "loss": 0.5765, + "step": 108600 + }, + { + "epoch": 4.888130110522059, + "grad_norm": 6.318375110626221, + "learning_rate": 4.883022330611995e-05, + "loss": 0.5866, + "step": 108800 + }, + { + "epoch": 4.897115643813461, + "grad_norm": 8.343177795410156, + "learning_rate": 4.8825952519480745e-05, + "loss": 0.5684, + "step": 109000 + }, + { + "epoch": 4.897115643813461, + "eval_loss": 2.612858533859253, + "eval_runtime": 1076.4447, + "eval_samples_per_second": 9.201, + "eval_steps_per_second": 0.144, + "step": 109000 + }, + { + "epoch": 4.906101177104861, + "grad_norm": 13.54843807220459, + "learning_rate": 4.882167413829908e-05, + "loss": 0.5689, + "step": 109200 + }, + { + "epoch": 4.915086710396262, + "grad_norm": 1.2996422052383423, + "learning_rate": 4.8817388163938685e-05, + "loss": 0.5665, + "step": 109400 + }, + { + "epoch": 4.924072243687663, + "grad_norm": 1.4910564422607422, + "learning_rate": 4.881309459776572e-05, + "loss": 0.5883, + "step": 109600 + }, + { + "epoch": 4.933057776979064, + "grad_norm": 4.319411754608154, + "learning_rate": 4.880879344114877e-05, + "loss": 0.5886, + "step": 109800 + }, + { + "epoch": 4.942043310270464, + "grad_norm": 9.951111793518066, + "learning_rate": 4.880448469545882e-05, + "loss": 0.5587, + "step": 110000 + }, + { + "epoch": 4.942043310270464, + "eval_loss": 2.679171323776245, + "eval_runtime": 1075.904, + "eval_samples_per_second": 9.205, + "eval_steps_per_second": 0.144, + "step": 110000 + }, + { + 
"epoch": 4.951028843561866, + "grad_norm": 5.12622594833374, + "learning_rate": 4.8800168362069295e-05, + "loss": 0.6082, + "step": 110200 + }, + { + "epoch": 4.960014376853266, + "grad_norm": 9.128108978271484, + "learning_rate": 4.8795844442356036e-05, + "loss": 0.5774, + "step": 110400 + }, + { + "epoch": 4.968999910144667, + "grad_norm": 13.645403861999512, + "learning_rate": 4.879151293769729e-05, + "loss": 0.6136, + "step": 110600 + }, + { + "epoch": 4.977985443436068, + "grad_norm": 4.305540084838867, + "learning_rate": 4.878717384947372e-05, + "loss": 0.6004, + "step": 110800 + }, + { + "epoch": 4.986970976727469, + "grad_norm": 2.3471438884735107, + "learning_rate": 4.878282717906843e-05, + "loss": 0.5718, + "step": 111000 + }, + { + "epoch": 4.986970976727469, + "eval_loss": 2.6824982166290283, + "eval_runtime": 1076.2318, + "eval_samples_per_second": 9.202, + "eval_steps_per_second": 0.144, + "step": 111000 + }, + { + "epoch": 4.995956510018869, + "grad_norm": 3.578322172164917, + "learning_rate": 4.8778472927866905e-05, + "loss": 0.5599, + "step": 111200 + }, + { + "epoch": 5.004942043310271, + "grad_norm": 8.115492820739746, + "learning_rate": 4.877411109725707e-05, + "loss": 0.5391, + "step": 111400 + }, + { + "epoch": 5.013927576601671, + "grad_norm": 5.805984020233154, + "learning_rate": 4.8769741688629276e-05, + "loss": 0.5613, + "step": 111600 + }, + { + "epoch": 5.022913109893072, + "grad_norm": 15.611380577087402, + "learning_rate": 4.8765364703376275e-05, + "loss": 0.57, + "step": 111800 + }, + { + "epoch": 5.031898643184473, + "grad_norm": 14.959733009338379, + "learning_rate": 4.876098014289322e-05, + "loss": 0.5168, + "step": 112000 + }, + { + "epoch": 5.031898643184473, + "eval_loss": 2.672183036804199, + "eval_runtime": 1076.4621, + "eval_samples_per_second": 9.201, + "eval_steps_per_second": 0.144, + "step": 112000 + }, + { + "epoch": 5.040884176475874, + "grad_norm": 6.3477864265441895, + "learning_rate": 4.875658800857771e-05, + "loss": 0.5427, + "step": 112200 + }, + { + "epoch": 5.0498697097672745, + "grad_norm": 5.391243934631348, + "learning_rate": 4.8752188301829726e-05, + "loss": 0.5698, + "step": 112400 + }, + { + "epoch": 5.058855243058676, + "grad_norm": 6.428415298461914, + "learning_rate": 4.8747781024051686e-05, + "loss": 0.551, + "step": 112600 + }, + { + "epoch": 5.067840776350076, + "grad_norm": 6.255007266998291, + "learning_rate": 4.874336617664842e-05, + "loss": 0.5098, + "step": 112800 + }, + { + "epoch": 5.076826309641477, + "grad_norm": 4.247288703918457, + "learning_rate": 4.873894376102715e-05, + "loss": 0.5399, + "step": 113000 + }, + { + "epoch": 5.076826309641477, + "eval_loss": 2.692117214202881, + "eval_runtime": 1077.848, + "eval_samples_per_second": 9.189, + "eval_steps_per_second": 0.144, + "step": 113000 + }, + { + "epoch": 5.085811842932878, + "grad_norm": 4.478646755218506, + "learning_rate": 4.873451377859753e-05, + "loss": 0.5266, + "step": 113200 + }, + { + "epoch": 5.094797376224279, + "grad_norm": 4.759102821350098, + "learning_rate": 4.873007623077162e-05, + "loss": 0.5708, + "step": 113400 + }, + { + "epoch": 5.1037829095156795, + "grad_norm": 6.76074743270874, + "learning_rate": 4.872563111896391e-05, + "loss": 0.5347, + "step": 113600 + }, + { + "epoch": 5.112768442807081, + "grad_norm": 13.389432907104492, + "learning_rate": 4.872117844459126e-05, + "loss": 0.5058, + "step": 113800 + }, + { + "epoch": 5.121753976098481, + "grad_norm": 7.0974297523498535, + "learning_rate": 4.871671820907296e-05, + "loss": 0.549, + 
"step": 114000 + }, + { + "epoch": 5.121753976098481, + "eval_loss": 2.6620500087738037, + "eval_runtime": 1077.5471, + "eval_samples_per_second": 9.191, + "eval_steps_per_second": 0.144, + "step": 114000 + }, + { + "epoch": 5.130739509389882, + "grad_norm": 3.2014670372009277, + "learning_rate": 4.871225041383074e-05, + "loss": 0.5409, + "step": 114200 + }, + { + "epoch": 5.139725042681283, + "grad_norm": 6.361083984375, + "learning_rate": 4.8707775060288695e-05, + "loss": 0.5407, + "step": 114400 + }, + { + "epoch": 5.148710575972684, + "grad_norm": 12.352490425109863, + "learning_rate": 4.8703292149873356e-05, + "loss": 0.5898, + "step": 114600 + }, + { + "epoch": 5.1576961092640845, + "grad_norm": 6.829831123352051, + "learning_rate": 4.869880168401364e-05, + "loss": 0.5598, + "step": 114800 + }, + { + "epoch": 5.166681642555486, + "grad_norm": 9.012941360473633, + "learning_rate": 4.86943036641409e-05, + "loss": 0.5792, + "step": 115000 + }, + { + "epoch": 5.166681642555486, + "eval_loss": 2.6695964336395264, + "eval_runtime": 1076.6032, + "eval_samples_per_second": 9.199, + "eval_steps_per_second": 0.144, + "step": 115000 + }, + { + "epoch": 5.175667175846886, + "grad_norm": 5.5551838874816895, + "learning_rate": 4.868979809168889e-05, + "loss": 0.5334, + "step": 115200 + }, + { + "epoch": 5.1846527091382875, + "grad_norm": 5.080362796783447, + "learning_rate": 4.8685284968093745e-05, + "loss": 0.5476, + "step": 115400 + }, + { + "epoch": 5.193638242429688, + "grad_norm": 3.391294479370117, + "learning_rate": 4.868076429479403e-05, + "loss": 0.541, + "step": 115600 + }, + { + "epoch": 5.202623775721089, + "grad_norm": 5.813953399658203, + "learning_rate": 4.867623607323074e-05, + "loss": 0.5506, + "step": 115800 + }, + { + "epoch": 5.2116093090124895, + "grad_norm": 3.1033880710601807, + "learning_rate": 4.8671700304847216e-05, + "loss": 0.5843, + "step": 116000 + }, + { + "epoch": 5.2116093090124895, + "eval_loss": 2.706368923187256, + "eval_runtime": 1124.9655, + "eval_samples_per_second": 8.804, + "eval_steps_per_second": 0.138, + "step": 116000 + }, + { + "epoch": 5.220594842303891, + "grad_norm": 2.261789321899414, + "learning_rate": 4.866715699108926e-05, + "loss": 0.5736, + "step": 116200 + }, + { + "epoch": 5.229580375595291, + "grad_norm": 6.052493095397949, + "learning_rate": 4.866260613340504e-05, + "loss": 0.5848, + "step": 116400 + }, + { + "epoch": 5.2385659088866925, + "grad_norm": 12.537518501281738, + "learning_rate": 4.8658047733245166e-05, + "loss": 0.5431, + "step": 116600 + }, + { + "epoch": 5.247551442178093, + "grad_norm": 4.784250736236572, + "learning_rate": 4.8653481792062615e-05, + "loss": 0.5338, + "step": 116800 + }, + { + "epoch": 5.256536975469494, + "grad_norm": 5.308268070220947, + "learning_rate": 4.8648908311312794e-05, + "loss": 0.607, + "step": 117000 + }, + { + "epoch": 5.256536975469494, + "eval_loss": 2.680147647857666, + "eval_runtime": 1125.8958, + "eval_samples_per_second": 8.797, + "eval_steps_per_second": 0.138, + "step": 117000 + }, + { + "epoch": 5.265522508760895, + "grad_norm": 2.42497181892395, + "learning_rate": 4.86443272924535e-05, + "loss": 0.5626, + "step": 117200 + }, + { + "epoch": 5.274508042052296, + "grad_norm": 4.430539131164551, + "learning_rate": 4.8639738736944934e-05, + "loss": 0.5452, + "step": 117400 + }, + { + "epoch": 5.283493575343696, + "grad_norm": 2.8931050300598145, + "learning_rate": 4.863514264624971e-05, + "loss": 0.5511, + "step": 117600 + }, + { + "epoch": 5.2924791086350975, + "grad_norm": 
4.152849197387695, + "learning_rate": 4.8630539021832824e-05, + "loss": 0.5992, + "step": 117800 + }, + { + "epoch": 5.301464641926499, + "grad_norm": 4.759932518005371, + "learning_rate": 4.8625927865161694e-05, + "loss": 0.562, + "step": 118000 + }, + { + "epoch": 5.301464641926499, + "eval_loss": 2.679501533508301, + "eval_runtime": 1123.4329, + "eval_samples_per_second": 8.816, + "eval_steps_per_second": 0.138, + "step": 118000 + }, + { + "epoch": 5.310450175217899, + "grad_norm": 3.476011037826538, + "learning_rate": 4.862130917770613e-05, + "loss": 0.5785, + "step": 118200 + }, + { + "epoch": 5.3194357085093, + "grad_norm": 5.236737251281738, + "learning_rate": 4.861668296093834e-05, + "loss": 0.567, + "step": 118400 + }, + { + "epoch": 5.328421241800701, + "grad_norm": 4.2177348136901855, + "learning_rate": 4.8612049216332935e-05, + "loss": 0.5841, + "step": 118600 + }, + { + "epoch": 5.337406775092102, + "grad_norm": 11.418831825256348, + "learning_rate": 4.8607407945366924e-05, + "loss": 0.5766, + "step": 118800 + }, + { + "epoch": 5.3463923083835025, + "grad_norm": 3.5538837909698486, + "learning_rate": 4.8602759149519716e-05, + "loss": 0.564, + "step": 119000 + }, + { + "epoch": 5.3463923083835025, + "eval_loss": 2.6711316108703613, + "eval_runtime": 1126.4665, + "eval_samples_per_second": 8.792, + "eval_steps_per_second": 0.138, + "step": 119000 + }, + { + "epoch": 5.355377841674903, + "grad_norm": 4.001996994018555, + "learning_rate": 4.859810283027312e-05, + "loss": 0.5761, + "step": 119200 + }, + { + "epoch": 5.364363374966304, + "grad_norm": 3.8045248985290527, + "learning_rate": 4.8593438989111345e-05, + "loss": 0.556, + "step": 119400 + }, + { + "epoch": 5.3733489082577055, + "grad_norm": 4.172726154327393, + "learning_rate": 4.858876762752099e-05, + "loss": 0.532, + "step": 119600 + }, + { + "epoch": 5.382334441549106, + "grad_norm": 3.246440887451172, + "learning_rate": 4.858408874699105e-05, + "loss": 0.5384, + "step": 119800 + }, + { + "epoch": 5.391319974840507, + "grad_norm": 4.557338714599609, + "learning_rate": 4.8579402349012936e-05, + "loss": 0.5814, + "step": 120000 + }, + { + "epoch": 5.391319974840507, + "eval_loss": 2.5864908695220947, + "eval_runtime": 1127.6464, + "eval_samples_per_second": 8.783, + "eval_steps_per_second": 0.137, + "step": 120000 + }, + { + "epoch": 5.400305508131908, + "grad_norm": 4.541125297546387, + "learning_rate": 4.857470843508043e-05, + "loss": 0.5676, + "step": 120200 + }, + { + "epoch": 5.409291041423309, + "grad_norm": 5.430272579193115, + "learning_rate": 4.857000700668973e-05, + "loss": 0.5563, + "step": 120400 + }, + { + "epoch": 5.418276574714709, + "grad_norm": 6.92936372756958, + "learning_rate": 4.8565298065339405e-05, + "loss": 0.549, + "step": 120600 + }, + { + "epoch": 5.4272621080061105, + "grad_norm": 7.017961025238037, + "learning_rate": 4.856058161253045e-05, + "loss": 0.5848, + "step": 120800 + }, + { + "epoch": 5.436247641297511, + "grad_norm": 9.248579978942871, + "learning_rate": 4.855585764976623e-05, + "loss": 0.5389, + "step": 121000 + }, + { + "epoch": 5.436247641297511, + "eval_loss": 2.6353914737701416, + "eval_runtime": 1126.4128, + "eval_samples_per_second": 8.793, + "eval_steps_per_second": 0.138, + "step": 121000 + }, + { + "epoch": 5.445233174588912, + "grad_norm": 4.005666255950928, + "learning_rate": 4.8551126178552514e-05, + "loss": 0.5066, + "step": 121200 + }, + { + "epoch": 5.454218707880313, + "grad_norm": 8.623493194580078, + "learning_rate": 4.854638720039746e-05, + "loss": 0.6034, + 
"step": 121400 + }, + { + "epoch": 5.463204241171714, + "grad_norm": 2.6416425704956055, + "learning_rate": 4.854164071681163e-05, + "loss": 0.6142, + "step": 121600 + }, + { + "epoch": 5.472189774463114, + "grad_norm": 10.089157104492188, + "learning_rate": 4.853688672930796e-05, + "loss": 0.5622, + "step": 121800 + }, + { + "epoch": 5.481175307754516, + "grad_norm": 4.700775146484375, + "learning_rate": 4.853212523940179e-05, + "loss": 0.5023, + "step": 122000 + }, + { + "epoch": 5.481175307754516, + "eval_loss": 2.6258456707000732, + "eval_runtime": 1126.3011, + "eval_samples_per_second": 8.793, + "eval_steps_per_second": 0.138, + "step": 122000 + }, + { + "epoch": 5.490160841045916, + "grad_norm": 3.110429048538208, + "learning_rate": 4.852735624861086e-05, + "loss": 0.5401, + "step": 122200 + }, + { + "epoch": 5.499146374337317, + "grad_norm": 3.0017948150634766, + "learning_rate": 4.8522579758455274e-05, + "loss": 0.5053, + "step": 122400 + }, + { + "epoch": 5.508131907628718, + "grad_norm": 32.01022720336914, + "learning_rate": 4.851779577045754e-05, + "loss": 0.5696, + "step": 122600 + }, + { + "epoch": 5.517117440920119, + "grad_norm": 3.6444568634033203, + "learning_rate": 4.8513004286142575e-05, + "loss": 0.5667, + "step": 122800 + }, + { + "epoch": 5.526102974211519, + "grad_norm": 3.843571424484253, + "learning_rate": 4.850820530703766e-05, + "loss": 0.5343, + "step": 123000 + }, + { + "epoch": 5.526102974211519, + "eval_loss": 2.6320242881774902, + "eval_runtime": 1124.7644, + "eval_samples_per_second": 8.805, + "eval_steps_per_second": 0.138, + "step": 123000 + }, + { + "epoch": 5.535088507502921, + "grad_norm": 8.31619930267334, + "learning_rate": 4.8503398834672475e-05, + "loss": 0.5359, + "step": 123200 + }, + { + "epoch": 5.544074040794321, + "grad_norm": 7.517163276672363, + "learning_rate": 4.849858487057908e-05, + "loss": 0.5299, + "step": 123400 + }, + { + "epoch": 5.553059574085722, + "grad_norm": 8.95091724395752, + "learning_rate": 4.849376341629194e-05, + "loss": 0.5113, + "step": 123600 + }, + { + "epoch": 5.562045107377123, + "grad_norm": 4.462621212005615, + "learning_rate": 4.848893447334789e-05, + "loss": 0.5366, + "step": 123800 + }, + { + "epoch": 5.571030640668524, + "grad_norm": 10.940470695495605, + "learning_rate": 4.848409804328617e-05, + "loss": 0.5379, + "step": 124000 + }, + { + "epoch": 5.571030640668524, + "eval_loss": 2.6875741481781006, + "eval_runtime": 1125.7965, + "eval_samples_per_second": 8.797, + "eval_steps_per_second": 0.138, + "step": 124000 + }, + { + "epoch": 5.580016173959924, + "grad_norm": 6.110741138458252, + "learning_rate": 4.847925412764838e-05, + "loss": 0.5844, + "step": 124200 + }, + { + "epoch": 5.589001707251326, + "grad_norm": 8.463932037353516, + "learning_rate": 4.847440272797854e-05, + "loss": 0.5432, + "step": 124400 + }, + { + "epoch": 5.597987240542726, + "grad_norm": 5.193777561187744, + "learning_rate": 4.846954384582303e-05, + "loss": 0.5529, + "step": 124600 + }, + { + "epoch": 5.606972773834127, + "grad_norm": 20.273698806762695, + "learning_rate": 4.8464677482730616e-05, + "loss": 0.5491, + "step": 124800 + }, + { + "epoch": 5.615958307125528, + "grad_norm": 13.971944808959961, + "learning_rate": 4.845980364025246e-05, + "loss": 0.521, + "step": 125000 + }, + { + "epoch": 5.615958307125528, + "eval_loss": 2.638272523880005, + "eval_runtime": 1125.3953, + "eval_samples_per_second": 8.8, + "eval_steps_per_second": 0.138, + "step": 125000 + }, + { + "epoch": 5.624943840416929, + "grad_norm": 9.242423057556152, + 
"learning_rate": 4.845492231994211e-05, + "loss": 0.5348, + "step": 125200 + }, + { + "epoch": 5.633929373708329, + "grad_norm": 11.727241516113281, + "learning_rate": 4.8450033523355484e-05, + "loss": 0.5712, + "step": 125400 + }, + { + "epoch": 5.642914906999731, + "grad_norm": 6.178032875061035, + "learning_rate": 4.8445137252050885e-05, + "loss": 0.5304, + "step": 125600 + }, + { + "epoch": 5.651900440291131, + "grad_norm": 2.3145875930786133, + "learning_rate": 4.844023350758902e-05, + "loss": 0.5708, + "step": 125800 + }, + { + "epoch": 5.660885973582532, + "grad_norm": 10.514315605163574, + "learning_rate": 4.843532229153295e-05, + "loss": 0.5351, + "step": 126000 + }, + { + "epoch": 5.660885973582532, + "eval_loss": 2.6288137435913086, + "eval_runtime": 1125.1485, + "eval_samples_per_second": 8.802, + "eval_steps_per_second": 0.138, + "step": 126000 + }, + { + "epoch": 5.669871506873933, + "grad_norm": 4.7612762451171875, + "learning_rate": 4.843040360544813e-05, + "loss": 0.5437, + "step": 126200 + }, + { + "epoch": 5.678857040165334, + "grad_norm": 10.429271697998047, + "learning_rate": 4.84254774509024e-05, + "loss": 0.5677, + "step": 126400 + }, + { + "epoch": 5.687842573456734, + "grad_norm": 9.046426773071289, + "learning_rate": 4.842054382946597e-05, + "loss": 0.5346, + "step": 126600 + }, + { + "epoch": 5.696828106748136, + "grad_norm": 6.291619777679443, + "learning_rate": 4.8415602742711444e-05, + "loss": 0.5429, + "step": 126800 + }, + { + "epoch": 5.705813640039536, + "grad_norm": 4.383120059967041, + "learning_rate": 4.8410654192213786e-05, + "loss": 0.5791, + "step": 127000 + }, + { + "epoch": 5.705813640039536, + "eval_loss": 2.6114344596862793, + "eval_runtime": 1111.2202, + "eval_samples_per_second": 8.913, + "eval_steps_per_second": 0.139, + "step": 127000 + }, + { + "epoch": 5.714799173330937, + "grad_norm": 7.231574058532715, + "learning_rate": 4.840569817955035e-05, + "loss": 0.549, + "step": 127200 + }, + { + "epoch": 5.723784706622338, + "grad_norm": 6.7952752113342285, + "learning_rate": 4.840073470630089e-05, + "loss": 0.5701, + "step": 127400 + }, + { + "epoch": 5.732770239913739, + "grad_norm": 13.880270957946777, + "learning_rate": 4.83957637740475e-05, + "loss": 0.5792, + "step": 127600 + }, + { + "epoch": 5.741755773205139, + "grad_norm": 3.9061381816864014, + "learning_rate": 4.8390785384374664e-05, + "loss": 0.5452, + "step": 127800 + }, + { + "epoch": 5.750741306496541, + "grad_norm": 5.482219696044922, + "learning_rate": 4.838579953886927e-05, + "loss": 0.5535, + "step": 128000 + }, + { + "epoch": 5.750741306496541, + "eval_loss": 2.6782829761505127, + "eval_runtime": 1109.7824, + "eval_samples_per_second": 8.924, + "eval_steps_per_second": 0.14, + "step": 128000 + }, + { + "epoch": 5.759726839787941, + "grad_norm": 10.9642972946167, + "learning_rate": 4.838080623912054e-05, + "loss": 0.5603, + "step": 128200 + }, + { + "epoch": 5.768712373079342, + "grad_norm": 8.078912734985352, + "learning_rate": 4.8375805486720086e-05, + "loss": 0.5436, + "step": 128400 + }, + { + "epoch": 5.777697906370743, + "grad_norm": 4.08800745010376, + "learning_rate": 4.8370797283261925e-05, + "loss": 0.5288, + "step": 128600 + }, + { + "epoch": 5.786683439662144, + "grad_norm": 3.705470561981201, + "learning_rate": 4.836578163034242e-05, + "loss": 0.5173, + "step": 128800 + }, + { + "epoch": 5.795668972953544, + "grad_norm": 5.712687015533447, + "learning_rate": 4.8360758529560314e-05, + "loss": 0.5144, + "step": 129000 + }, + { + "epoch": 5.795668972953544, + 
"eval_loss": 2.654538631439209, + "eval_runtime": 1110.9444, + "eval_samples_per_second": 8.915, + "eval_steps_per_second": 0.14, + "step": 129000 + }, + { + "epoch": 5.804654506244946, + "grad_norm": 4.038150310516357, + "learning_rate": 4.835572798251671e-05, + "loss": 0.5622, + "step": 129200 + }, + { + "epoch": 5.813640039536346, + "grad_norm": 8.389162063598633, + "learning_rate": 4.8350689990815124e-05, + "loss": 0.5431, + "step": 129400 + }, + { + "epoch": 5.822625572827747, + "grad_norm": 9.799603462219238, + "learning_rate": 4.8345644556061396e-05, + "loss": 0.5496, + "step": 129600 + }, + { + "epoch": 5.831611106119148, + "grad_norm": 44.71828842163086, + "learning_rate": 4.8340591679863776e-05, + "loss": 0.5837, + "step": 129800 + }, + { + "epoch": 5.840596639410549, + "grad_norm": 5.973487854003906, + "learning_rate": 4.833553136383287e-05, + "loss": 0.5761, + "step": 130000 + }, + { + "epoch": 5.840596639410549, + "eval_loss": 2.5852513313293457, + "eval_runtime": 1110.4328, + "eval_samples_per_second": 8.919, + "eval_steps_per_second": 0.14, + "step": 130000 + }, + { + "epoch": 5.84958217270195, + "grad_norm": 2.016286611557007, + "learning_rate": 4.833046360958165e-05, + "loss": 0.5219, + "step": 130200 + }, + { + "epoch": 5.858567705993351, + "grad_norm": 2.8672537803649902, + "learning_rate": 4.832538841872549e-05, + "loss": 0.5476, + "step": 130400 + }, + { + "epoch": 5.867553239284751, + "grad_norm": 17.733501434326172, + "learning_rate": 4.832030579288209e-05, + "loss": 0.5759, + "step": 130600 + }, + { + "epoch": 5.876538772576152, + "grad_norm": 3.3349339962005615, + "learning_rate": 4.831521573367154e-05, + "loss": 0.5417, + "step": 130800 + }, + { + "epoch": 5.885524305867554, + "grad_norm": 8.842341423034668, + "learning_rate": 4.8310118242716315e-05, + "loss": 0.5808, + "step": 131000 + }, + { + "epoch": 5.885524305867554, + "eval_loss": 2.6102592945098877, + "eval_runtime": 1109.8113, + "eval_samples_per_second": 8.924, + "eval_steps_per_second": 0.14, + "step": 131000 + }, + { + "epoch": 5.894509839158954, + "grad_norm": 17.3737850189209, + "learning_rate": 4.830501332164124e-05, + "loss": 0.5337, + "step": 131200 + }, + { + "epoch": 5.9034953724503545, + "grad_norm": 2.934797525405884, + "learning_rate": 4.829990097207351e-05, + "loss": 0.557, + "step": 131400 + }, + { + "epoch": 5.912480905741756, + "grad_norm": 3.8777339458465576, + "learning_rate": 4.829478119564269e-05, + "loss": 0.551, + "step": 131600 + }, + { + "epoch": 5.921466439033157, + "grad_norm": 4.155474662780762, + "learning_rate": 4.828965399398071e-05, + "loss": 0.5124, + "step": 131800 + }, + { + "epoch": 5.9304519723245575, + "grad_norm": 129.3715057373047, + "learning_rate": 4.828451936872187e-05, + "loss": 0.5903, + "step": 132000 + }, + { + "epoch": 5.9304519723245575, + "eval_loss": 2.62882924079895, + "eval_runtime": 1109.5966, + "eval_samples_per_second": 8.926, + "eval_steps_per_second": 0.14, + "step": 132000 + }, + { + "epoch": 5.939437505615958, + "grad_norm": 15.213759422302246, + "learning_rate": 4.827937732150285e-05, + "loss": 0.5439, + "step": 132200 + }, + { + "epoch": 5.948423038907359, + "grad_norm": 5.646575450897217, + "learning_rate": 4.827422785396267e-05, + "loss": 0.5778, + "step": 132400 + }, + { + "epoch": 5.95740857219876, + "grad_norm": 14.637299537658691, + "learning_rate": 4.8269070967742725e-05, + "loss": 0.5321, + "step": 132600 + }, + { + "epoch": 5.966394105490161, + "grad_norm": 5.925998687744141, + "learning_rate": 4.826390666448679e-05, + "loss": 0.5413, 
+ "step": 132800 + }, + { + "epoch": 5.975379638781561, + "grad_norm": 15.88015079498291, + "learning_rate": 4.825873494584097e-05, + "loss": 0.5342, + "step": 133000 + }, + { + "epoch": 5.975379638781561, + "eval_loss": 2.6159465312957764, + "eval_runtime": 1111.9916, + "eval_samples_per_second": 8.907, + "eval_steps_per_second": 0.139, + "step": 133000 + }, + { + "epoch": 5.9843651720729625, + "grad_norm": 5.7126359939575195, + "learning_rate": 4.8253555813453775e-05, + "loss": 0.5362, + "step": 133200 + }, + { + "epoch": 5.993350705364364, + "grad_norm": 6.177489757537842, + "learning_rate": 4.824836926897604e-05, + "loss": 0.5586, + "step": 133400 + }, + { + "epoch": 6.002336238655764, + "grad_norm": 4.75473165512085, + "learning_rate": 4.8243175314061e-05, + "loss": 0.5288, + "step": 133600 + }, + { + "epoch": 6.011321771947165, + "grad_norm": 2.6426875591278076, + "learning_rate": 4.8237973950364225e-05, + "loss": 0.5172, + "step": 133800 + }, + { + "epoch": 6.020307305238566, + "grad_norm": 4.771461009979248, + "learning_rate": 4.823276517954365e-05, + "loss": 0.553, + "step": 134000 + }, + { + "epoch": 6.020307305238566, + "eval_loss": 2.6342790126800537, + "eval_runtime": 1109.0332, + "eval_samples_per_second": 8.93, + "eval_steps_per_second": 0.14, + "step": 134000 + }, + { + "epoch": 6.029292838529967, + "grad_norm": 6.850405216217041, + "learning_rate": 4.822754900325958e-05, + "loss": 0.5677, + "step": 134200 + }, + { + "epoch": 6.0382783718213675, + "grad_norm": 6.183258533477783, + "learning_rate": 4.822232542317466e-05, + "loss": 0.5072, + "step": 134400 + }, + { + "epoch": 6.047263905112769, + "grad_norm": 8.269383430480957, + "learning_rate": 4.821709444095393e-05, + "loss": 0.5206, + "step": 134600 + }, + { + "epoch": 6.056249438404169, + "grad_norm": 1.2506552934646606, + "learning_rate": 4.821185605826476e-05, + "loss": 0.4931, + "step": 134800 + }, + { + "epoch": 6.0652349716955705, + "grad_norm": 5.354737281799316, + "learning_rate": 4.820661027677689e-05, + "loss": 0.5413, + "step": 135000 + }, + { + "epoch": 6.0652349716955705, + "eval_loss": 2.612915515899658, + "eval_runtime": 1109.5309, + "eval_samples_per_second": 8.926, + "eval_steps_per_second": 0.14, + "step": 135000 + }, + { + "epoch": 6.074220504986971, + "grad_norm": 3.7436015605926514, + "learning_rate": 4.820135709816242e-05, + "loss": 0.5262, + "step": 135200 + }, + { + "epoch": 6.083206038278372, + "grad_norm": 2.3418149948120117, + "learning_rate": 4.8196096524095815e-05, + "loss": 0.4969, + "step": 135400 + }, + { + "epoch": 6.0921915715697725, + "grad_norm": 3.5079879760742188, + "learning_rate": 4.8190828556253864e-05, + "loss": 0.5307, + "step": 135600 + }, + { + "epoch": 6.101177104861174, + "grad_norm": 5.637112140655518, + "learning_rate": 4.8185553196315755e-05, + "loss": 0.4973, + "step": 135800 + }, + { + "epoch": 6.110162638152574, + "grad_norm": 9.889835357666016, + "learning_rate": 4.8180270445963004e-05, + "loss": 0.5798, + "step": 136000 + }, + { + "epoch": 6.110162638152574, + "eval_loss": 2.644315481185913, + "eval_runtime": 1108.8674, + "eval_samples_per_second": 8.932, + "eval_steps_per_second": 0.14, + "step": 136000 + }, + { + "epoch": 6.1191481714439755, + "grad_norm": 5.801605701446533, + "learning_rate": 4.817498030687949e-05, + "loss": 0.5192, + "step": 136200 + }, + { + "epoch": 6.128133704735376, + "grad_norm": 7.900972843170166, + "learning_rate": 4.8169682780751465e-05, + "loss": 0.4924, + "step": 136400 + }, + { + "epoch": 6.137119238026777, + "grad_norm": 
4.622593879699707, + "learning_rate": 4.816437786926751e-05, + "loss": 0.5523, + "step": 136600 + }, + { + "epoch": 6.146104771318178, + "grad_norm": 5.807979106903076, + "learning_rate": 4.815906557411856e-05, + "loss": 0.5208, + "step": 136800 + }, + { + "epoch": 6.155090304609579, + "grad_norm": 42.20900344848633, + "learning_rate": 4.8153745896997926e-05, + "loss": 0.5296, + "step": 137000 + }, + { + "epoch": 6.155090304609579, + "eval_loss": 2.6667978763580322, + "eval_runtime": 1109.2515, + "eval_samples_per_second": 8.929, + "eval_steps_per_second": 0.14, + "step": 137000 + }, + { + "epoch": 6.164075837900979, + "grad_norm": 7.494675636291504, + "learning_rate": 4.814841883960126e-05, + "loss": 0.5432, + "step": 137200 + }, + { + "epoch": 6.1730613711923805, + "grad_norm": 24.198781967163086, + "learning_rate": 4.814308440362656e-05, + "loss": 0.5392, + "step": 137400 + }, + { + "epoch": 6.182046904483781, + "grad_norm": 4.07385778427124, + "learning_rate": 4.8137742590774195e-05, + "loss": 0.5453, + "step": 137600 + }, + { + "epoch": 6.191032437775182, + "grad_norm": 3.366076707839966, + "learning_rate": 4.813239340274685e-05, + "loss": 0.5586, + "step": 137800 + }, + { + "epoch": 6.200017971066583, + "grad_norm": 2.3177366256713867, + "learning_rate": 4.8127036841249596e-05, + "loss": 0.516, + "step": 138000 + }, + { + "epoch": 6.200017971066583, + "eval_loss": 2.58992862701416, + "eval_runtime": 1042.972, + "eval_samples_per_second": 9.496, + "eval_steps_per_second": 0.149, + "step": 138000 + }, + { + "epoch": 6.209003504357984, + "grad_norm": 7.948215007781982, + "learning_rate": 4.812167290798984e-05, + "loss": 0.5612, + "step": 138200 + }, + { + "epoch": 6.217989037649384, + "grad_norm": 4.769832611083984, + "learning_rate": 4.811630160467735e-05, + "loss": 0.5632, + "step": 138400 + }, + { + "epoch": 6.2269745709407855, + "grad_norm": 3.1266725063323975, + "learning_rate": 4.8110922933024214e-05, + "loss": 0.5323, + "step": 138600 + }, + { + "epoch": 6.235960104232186, + "grad_norm": 3.03983211517334, + "learning_rate": 4.8105536894744904e-05, + "loss": 0.5069, + "step": 138800 + }, + { + "epoch": 6.244945637523587, + "grad_norm": 13.369333267211914, + "learning_rate": 4.810014349155621e-05, + "loss": 0.5327, + "step": 139000 + }, + { + "epoch": 6.244945637523587, + "eval_loss": 2.632561683654785, + "eval_runtime": 1042.6567, + "eval_samples_per_second": 9.499, + "eval_steps_per_second": 0.149, + "step": 139000 + }, + { + "epoch": 6.253931170814988, + "grad_norm": 4.6813836097717285, + "learning_rate": 4.809474272517731e-05, + "loss": 0.5188, + "step": 139200 + }, + { + "epoch": 6.262916704106389, + "grad_norm": 8.677014350891113, + "learning_rate": 4.8089334597329674e-05, + "loss": 0.5233, + "step": 139400 + }, + { + "epoch": 6.271902237397789, + "grad_norm": 10.864197731018066, + "learning_rate": 4.8083919109737165e-05, + "loss": 0.5193, + "step": 139600 + }, + { + "epoch": 6.280887770689191, + "grad_norm": 5.195317268371582, + "learning_rate": 4.807849626412596e-05, + "loss": 0.5343, + "step": 139800 + }, + { + "epoch": 6.289873303980591, + "grad_norm": 2.9889798164367676, + "learning_rate": 4.8073066062224605e-05, + "loss": 0.5322, + "step": 140000 + }, + { + "epoch": 6.289873303980591, + "eval_loss": 2.6202876567840576, + "eval_runtime": 1042.8692, + "eval_samples_per_second": 9.497, + "eval_steps_per_second": 0.149, + "step": 140000 + }, + { + "epoch": 6.298858837271992, + "grad_norm": 2.6103203296661377, + "learning_rate": 4.8067628505763986e-05, + "loss": 0.5202, + 
"step": 140200 + }, + { + "epoch": 6.307844370563393, + "grad_norm": 4.392446517944336, + "learning_rate": 4.806218359647732e-05, + "loss": 0.5528, + "step": 140400 + }, + { + "epoch": 6.316829903854794, + "grad_norm": 12.344572067260742, + "learning_rate": 4.8056731336100175e-05, + "loss": 0.5158, + "step": 140600 + }, + { + "epoch": 6.325815437146194, + "grad_norm": 4.688963413238525, + "learning_rate": 4.8051271726370474e-05, + "loss": 0.5684, + "step": 140800 + }, + { + "epoch": 6.334800970437596, + "grad_norm": 5.1644134521484375, + "learning_rate": 4.8045804769028454e-05, + "loss": 0.5473, + "step": 141000 + }, + { + "epoch": 6.334800970437596, + "eval_loss": 2.647378921508789, + "eval_runtime": 1042.5176, + "eval_samples_per_second": 9.5, + "eval_steps_per_second": 0.149, + "step": 141000 + }, + { + "epoch": 6.343786503728996, + "grad_norm": 4.703906059265137, + "learning_rate": 4.804033046581674e-05, + "loss": 0.5046, + "step": 141200 + }, + { + "epoch": 6.352772037020397, + "grad_norm": 5.541541576385498, + "learning_rate": 4.803484881848025e-05, + "loss": 0.5424, + "step": 141400 + }, + { + "epoch": 6.361757570311798, + "grad_norm": 8.089109420776367, + "learning_rate": 4.802935982876626e-05, + "loss": 0.5066, + "step": 141600 + }, + { + "epoch": 6.370743103603199, + "grad_norm": 7.817598819732666, + "learning_rate": 4.802386349842441e-05, + "loss": 0.4951, + "step": 141800 + }, + { + "epoch": 6.379728636894599, + "grad_norm": 14.34579086303711, + "learning_rate": 4.8018359829206646e-05, + "loss": 0.5504, + "step": 142000 + }, + { + "epoch": 6.379728636894599, + "eval_loss": 2.6440494060516357, + "eval_runtime": 1042.2395, + "eval_samples_per_second": 9.503, + "eval_steps_per_second": 0.149, + "step": 142000 + }, + { + "epoch": 6.388714170186001, + "grad_norm": 1.8953040838241577, + "learning_rate": 4.801284882286727e-05, + "loss": 0.5236, + "step": 142200 + }, + { + "epoch": 6.397699703477401, + "grad_norm": 7.690189838409424, + "learning_rate": 4.800733048116291e-05, + "loss": 0.5286, + "step": 142400 + }, + { + "epoch": 6.406685236768802, + "grad_norm": 4.344729423522949, + "learning_rate": 4.8001804805852566e-05, + "loss": 0.5673, + "step": 142600 + }, + { + "epoch": 6.415670770060203, + "grad_norm": 4.415552139282227, + "learning_rate": 4.7996271798697534e-05, + "loss": 0.5343, + "step": 142800 + }, + { + "epoch": 6.424656303351604, + "grad_norm": 8.222256660461426, + "learning_rate": 4.799073146146147e-05, + "loss": 0.5271, + "step": 143000 + }, + { + "epoch": 6.424656303351604, + "eval_loss": 2.661680221557617, + "eval_runtime": 1042.5056, + "eval_samples_per_second": 9.5, + "eval_steps_per_second": 0.149, + "step": 143000 + }, + { + "epoch": 6.433641836643004, + "grad_norm": 10.482327461242676, + "learning_rate": 4.798518379591035e-05, + "loss": 0.5422, + "step": 143200 + }, + { + "epoch": 6.442627369934406, + "grad_norm": 5.589601516723633, + "learning_rate": 4.7979628803812516e-05, + "loss": 0.4927, + "step": 143400 + }, + { + "epoch": 6.451612903225806, + "grad_norm": 5.369229793548584, + "learning_rate": 4.7974066486938613e-05, + "loss": 0.5206, + "step": 143600 + }, + { + "epoch": 6.460598436517207, + "grad_norm": 10.578944206237793, + "learning_rate": 4.796849684706164e-05, + "loss": 0.5118, + "step": 143800 + }, + { + "epoch": 6.469583969808608, + "grad_norm": 5.688765525817871, + "learning_rate": 4.7962919885956916e-05, + "loss": 0.5278, + "step": 144000 + }, + { + "epoch": 6.469583969808608, + "eval_loss": 2.5855579376220703, + "eval_runtime": 1042.8155, + 
"eval_samples_per_second": 9.497, + "eval_steps_per_second": 0.149, + "step": 144000 + }, + { + "epoch": 6.478569503100009, + "grad_norm": 13.294556617736816, + "learning_rate": 4.795733560540211e-05, + "loss": 0.5206, + "step": 144200 + }, + { + "epoch": 6.487555036391409, + "grad_norm": 23.359086990356445, + "learning_rate": 4.7951744007177226e-05, + "loss": 0.5141, + "step": 144400 + }, + { + "epoch": 6.496540569682811, + "grad_norm": 7.575876712799072, + "learning_rate": 4.794614509306457e-05, + "loss": 0.5391, + "step": 144600 + }, + { + "epoch": 6.505526102974212, + "grad_norm": 11.292476654052734, + "learning_rate": 4.794053886484882e-05, + "loss": 0.5605, + "step": 144800 + }, + { + "epoch": 6.514511636265612, + "grad_norm": 3.0334506034851074, + "learning_rate": 4.7934925324316944e-05, + "loss": 0.5455, + "step": 145000 + }, + { + "epoch": 6.514511636265612, + "eval_loss": 2.6387248039245605, + "eval_runtime": 1043.1059, + "eval_samples_per_second": 9.495, + "eval_steps_per_second": 0.149, + "step": 145000 + }, + { + "epoch": 6.523497169557013, + "grad_norm": 7.96580171585083, + "learning_rate": 4.792930447325827e-05, + "loss": 0.5582, + "step": 145200 + }, + { + "epoch": 6.532482702848414, + "grad_norm": 9.228450775146484, + "learning_rate": 4.792367631346447e-05, + "loss": 0.5611, + "step": 145400 + }, + { + "epoch": 6.541468236139815, + "grad_norm": 7.638996124267578, + "learning_rate": 4.79180408467295e-05, + "loss": 0.4968, + "step": 145600 + }, + { + "epoch": 6.550453769431216, + "grad_norm": 3.997795343399048, + "learning_rate": 4.791239807484968e-05, + "loss": 0.5158, + "step": 145800 + }, + { + "epoch": 6.559439302722616, + "grad_norm": 6.292296886444092, + "learning_rate": 4.7906747999623644e-05, + "loss": 0.4836, + "step": 146000 + }, + { + "epoch": 6.559439302722616, + "eval_loss": 2.7034900188446045, + "eval_runtime": 1041.7965, + "eval_samples_per_second": 9.507, + "eval_steps_per_second": 0.149, + "step": 146000 + }, + { + "epoch": 6.568424836014017, + "grad_norm": 4.545322418212891, + "learning_rate": 4.790109062285236e-05, + "loss": 0.513, + "step": 146200 + }, + { + "epoch": 6.577410369305419, + "grad_norm": 7.309622287750244, + "learning_rate": 4.789542594633913e-05, + "loss": 0.5276, + "step": 146400 + }, + { + "epoch": 6.586395902596819, + "grad_norm": 6.452086925506592, + "learning_rate": 4.788975397188956e-05, + "loss": 0.5494, + "step": 146600 + }, + { + "epoch": 6.59538143588822, + "grad_norm": 11.666097640991211, + "learning_rate": 4.788407470131161e-05, + "loss": 0.5539, + "step": 146800 + }, + { + "epoch": 6.604366969179621, + "grad_norm": 2.6482343673706055, + "learning_rate": 4.787838813641554e-05, + "loss": 0.5519, + "step": 147000 + }, + { + "epoch": 6.604366969179621, + "eval_loss": 2.6106491088867188, + "eval_runtime": 1043.6396, + "eval_samples_per_second": 9.49, + "eval_steps_per_second": 0.149, + "step": 147000 + }, + { + "epoch": 6.613352502471022, + "grad_norm": 3.5646355152130127, + "learning_rate": 4.787269427901395e-05, + "loss": 0.5185, + "step": 147200 + }, + { + "epoch": 6.622338035762422, + "grad_norm": 4.31544303894043, + "learning_rate": 4.786699313092177e-05, + "loss": 0.5319, + "step": 147400 + }, + { + "epoch": 6.631323569053824, + "grad_norm": 9.14370346069336, + "learning_rate": 4.786128469395624e-05, + "loss": 0.5371, + "step": 147600 + }, + { + "epoch": 6.640309102345224, + "grad_norm": 8.601165771484375, + "learning_rate": 4.785556896993693e-05, + "loss": 0.5623, + "step": 147800 + }, + { + "epoch": 6.649294635636625, + 
"grad_norm": 0.5740114450454712, + "learning_rate": 4.7849845960685735e-05, + "loss": 0.5514, + "step": 148000 + }, + { + "epoch": 6.649294635636625, + "eval_loss": 2.6822104454040527, + "eval_runtime": 1041.3572, + "eval_samples_per_second": 9.511, + "eval_steps_per_second": 0.149, + "step": 148000 + }, + { + "epoch": 6.658280168928026, + "grad_norm": 4.371459007263184, + "learning_rate": 4.7844115668026865e-05, + "loss": 0.5426, + "step": 148200 + }, + { + "epoch": 6.667265702219427, + "grad_norm": 8.560872077941895, + "learning_rate": 4.783837809378685e-05, + "loss": 0.5398, + "step": 148400 + }, + { + "epoch": 6.676251235510827, + "grad_norm": 17.999832153320312, + "learning_rate": 4.783263323979456e-05, + "loss": 0.5235, + "step": 148600 + }, + { + "epoch": 6.685236768802229, + "grad_norm": 5.890347003936768, + "learning_rate": 4.782688110788116e-05, + "loss": 0.5353, + "step": 148800 + }, + { + "epoch": 6.694222302093629, + "grad_norm": 11.35936450958252, + "learning_rate": 4.782112169988015e-05, + "loss": 0.5331, + "step": 149000 + }, + { + "epoch": 6.694222302093629, + "eval_loss": 2.594395637512207, + "eval_runtime": 1042.7844, + "eval_samples_per_second": 9.498, + "eval_steps_per_second": 0.149, + "step": 149000 + }, + { + "epoch": 6.70320783538503, + "grad_norm": 8.832243919372559, + "learning_rate": 4.781535501762735e-05, + "loss": 0.5508, + "step": 149200 + }, + { + "epoch": 6.712193368676431, + "grad_norm": 5.891073226928711, + "learning_rate": 4.780958106296089e-05, + "loss": 0.5123, + "step": 149400 + }, + { + "epoch": 6.721178901967832, + "grad_norm": 4.517889976501465, + "learning_rate": 4.780379983772124e-05, + "loss": 0.5073, + "step": 149600 + }, + { + "epoch": 6.7301644352592325, + "grad_norm": 10.936097145080566, + "learning_rate": 4.7798011343751146e-05, + "loss": 0.5241, + "step": 149800 + }, + { + "epoch": 6.739149968550634, + "grad_norm": 11.331624031066895, + "learning_rate": 4.7792215582895705e-05, + "loss": 0.5371, + "step": 150000 + }, + { + "epoch": 6.739149968550634, + "eval_loss": 2.5754590034484863, + "eval_runtime": 1074.2776, + "eval_samples_per_second": 9.219, + "eval_steps_per_second": 0.144, + "step": 150000 + }, + { + "epoch": 6.748135501842034, + "grad_norm": 1.8488596677780151, + "learning_rate": 4.778641255700233e-05, + "loss": 0.5524, + "step": 150200 + }, + { + "epoch": 6.757121035133435, + "grad_norm": 14.553401947021484, + "learning_rate": 4.7780602267920716e-05, + "loss": 0.5227, + "step": 150400 + }, + { + "epoch": 6.766106568424836, + "grad_norm": 8.445063591003418, + "learning_rate": 4.777478471750292e-05, + "loss": 0.5523, + "step": 150600 + }, + { + "epoch": 6.775092101716237, + "grad_norm": 4.426443576812744, + "learning_rate": 4.776895990760328e-05, + "loss": 0.5313, + "step": 150800 + }, + { + "epoch": 6.7840776350076375, + "grad_norm": 4.786408424377441, + "learning_rate": 4.776312784007848e-05, + "loss": 0.544, + "step": 151000 + }, + { + "epoch": 6.7840776350076375, + "eval_loss": 2.580105781555176, + "eval_runtime": 1072.5697, + "eval_samples_per_second": 9.234, + "eval_steps_per_second": 0.145, + "step": 151000 + }, + { + "epoch": 6.793063168299039, + "grad_norm": 8.09899616241455, + "learning_rate": 4.775728851678747e-05, + "loss": 0.5373, + "step": 151200 + }, + { + "epoch": 6.802048701590439, + "grad_norm": 8.726985931396484, + "learning_rate": 4.775144193959155e-05, + "loss": 0.5123, + "step": 151400 + }, + { + "epoch": 6.8110342348818405, + "grad_norm": 5.333522319793701, + "learning_rate": 4.774558811035431e-05, + "loss": 
0.5382, + "step": 151600 + }, + { + "epoch": 6.820019768173241, + "grad_norm": 2.5918726921081543, + "learning_rate": 4.773972703094168e-05, + "loss": 0.5008, + "step": 151800 + }, + { + "epoch": 6.829005301464642, + "grad_norm": 13.181851387023926, + "learning_rate": 4.7733858703221876e-05, + "loss": 0.535, + "step": 152000 + }, + { + "epoch": 6.829005301464642, + "eval_loss": 2.6217567920684814, + "eval_runtime": 1073.9356, + "eval_samples_per_second": 9.222, + "eval_steps_per_second": 0.144, + "step": 152000 + }, + { + "epoch": 6.8379908347560425, + "grad_norm": 3.6828906536102295, + "learning_rate": 4.772798312906545e-05, + "loss": 0.5334, + "step": 152200 + }, + { + "epoch": 6.846976368047444, + "grad_norm": 11.301506042480469, + "learning_rate": 4.772210031034521e-05, + "loss": 0.5278, + "step": 152400 + }, + { + "epoch": 6.855961901338844, + "grad_norm": 2.866434097290039, + "learning_rate": 4.771621024893633e-05, + "loss": 0.5196, + "step": 152600 + }, + { + "epoch": 6.8649474346302455, + "grad_norm": 2.977900266647339, + "learning_rate": 4.7710312946716286e-05, + "loss": 0.5131, + "step": 152800 + }, + { + "epoch": 6.873932967921646, + "grad_norm": 4.671950340270996, + "learning_rate": 4.770440840556483e-05, + "loss": 0.5423, + "step": 153000 + }, + { + "epoch": 6.873932967921646, + "eval_loss": 2.61964750289917, + "eval_runtime": 1072.5606, + "eval_samples_per_second": 9.234, + "eval_steps_per_second": 0.145, + "step": 153000 + }, + { + "epoch": 6.882918501213047, + "grad_norm": 9.421769142150879, + "learning_rate": 4.769849662736403e-05, + "loss": 0.5413, + "step": 153200 + }, + { + "epoch": 6.8919040345044476, + "grad_norm": 4.872519493103027, + "learning_rate": 4.7692577613998295e-05, + "loss": 0.5212, + "step": 153400 + }, + { + "epoch": 6.900889567795849, + "grad_norm": 4.424411296844482, + "learning_rate": 4.7686651367354304e-05, + "loss": 0.5071, + "step": 153600 + }, + { + "epoch": 6.909875101087249, + "grad_norm": 12.917271614074707, + "learning_rate": 4.7680717889321046e-05, + "loss": 0.5451, + "step": 153800 + }, + { + "epoch": 6.9188606343786505, + "grad_norm": 5.820809841156006, + "learning_rate": 4.767477718178983e-05, + "loss": 0.5204, + "step": 154000 + }, + { + "epoch": 6.9188606343786505, + "eval_loss": 2.657820463180542, + "eval_runtime": 1071.7187, + "eval_samples_per_second": 9.241, + "eval_steps_per_second": 0.145, + "step": 154000 + }, + { + "epoch": 6.927846167670051, + "grad_norm": 6.326610088348389, + "learning_rate": 4.7668829246654266e-05, + "loss": 0.5737, + "step": 154200 + }, + { + "epoch": 6.936831700961452, + "grad_norm": 6.599421977996826, + "learning_rate": 4.766287408581026e-05, + "loss": 0.5191, + "step": 154400 + }, + { + "epoch": 6.945817234252853, + "grad_norm": 1.006998062133789, + "learning_rate": 4.7656911701156016e-05, + "loss": 0.5727, + "step": 154600 + }, + { + "epoch": 6.954802767544254, + "grad_norm": 10.324342727661133, + "learning_rate": 4.7650942094592055e-05, + "loss": 0.5666, + "step": 154800 + }, + { + "epoch": 6.963788300835654, + "grad_norm": 4.480410099029541, + "learning_rate": 4.76449652680212e-05, + "loss": 0.5732, + "step": 155000 + }, + { + "epoch": 6.963788300835654, + "eval_loss": 2.6091678142547607, + "eval_runtime": 1071.6772, + "eval_samples_per_second": 9.242, + "eval_steps_per_second": 0.145, + "step": 155000 + }, + { + "epoch": 6.9727738341270555, + "grad_norm": 6.651985168457031, + "learning_rate": 4.7638981223348565e-05, + "loss": 0.5241, + "step": 155200 + }, + { + "epoch": 6.981759367418456, + "grad_norm": 
5.644140720367432, + "learning_rate": 4.7632989962481565e-05, + "loss": 0.5446, + "step": 155400 + }, + { + "epoch": 6.990744900709857, + "grad_norm": 13.221419334411621, + "learning_rate": 4.762699148732992e-05, + "loss": 0.5281, + "step": 155600 + }, + { + "epoch": 6.999730434001258, + "grad_norm": 9.8410005569458, + "learning_rate": 4.762098579980566e-05, + "loss": 0.5165, + "step": 155800 + }, + { + "epoch": 7.008715967292659, + "grad_norm": 7.277264595031738, + "learning_rate": 4.761497290182309e-05, + "loss": 0.4809, + "step": 156000 + }, + { + "epoch": 7.008715967292659, + "eval_loss": 2.6050195693969727, + "eval_runtime": 1071.5521, + "eval_samples_per_second": 9.243, + "eval_steps_per_second": 0.145, + "step": 156000 + }, + { + "epoch": 7.017701500584059, + "grad_norm": 4.4227776527404785, + "learning_rate": 4.760895279529883e-05, + "loss": 0.5146, + "step": 156200 + }, + { + "epoch": 7.026687033875461, + "grad_norm": 4.779057502746582, + "learning_rate": 4.76029254821518e-05, + "loss": 0.526, + "step": 156400 + }, + { + "epoch": 7.035672567166861, + "grad_norm": 3.8437089920043945, + "learning_rate": 4.7596890964303206e-05, + "loss": 0.4857, + "step": 156600 + }, + { + "epoch": 7.044658100458262, + "grad_norm": 5.413717746734619, + "learning_rate": 4.759084924367655e-05, + "loss": 0.5221, + "step": 156800 + }, + { + "epoch": 7.053643633749663, + "grad_norm": 13.871551513671875, + "learning_rate": 4.758480032219765e-05, + "loss": 0.511, + "step": 157000 + }, + { + "epoch": 7.053643633749663, + "eval_loss": 2.6103718280792236, + "eval_runtime": 1071.8769, + "eval_samples_per_second": 9.24, + "eval_steps_per_second": 0.145, + "step": 157000 + }, + { + "epoch": 7.062629167041064, + "grad_norm": 10.212960243225098, + "learning_rate": 4.757874420179459e-05, + "loss": 0.476, + "step": 157200 + }, + { + "epoch": 7.071614700332464, + "grad_norm": 6.196323871612549, + "learning_rate": 4.757268088439777e-05, + "loss": 0.5268, + "step": 157400 + }, + { + "epoch": 7.080600233623866, + "grad_norm": 7.164575576782227, + "learning_rate": 4.756661037193988e-05, + "loss": 0.5259, + "step": 157600 + }, + { + "epoch": 7.089585766915266, + "grad_norm": 8.652503967285156, + "learning_rate": 4.756053266635591e-05, + "loss": 0.4922, + "step": 157800 + }, + { + "epoch": 7.098571300206667, + "grad_norm": 4.017261028289795, + "learning_rate": 4.75544477695831e-05, + "loss": 0.5387, + "step": 158000 + }, + { + "epoch": 7.098571300206667, + "eval_loss": 2.6241016387939453, + "eval_runtime": 1072.8511, + "eval_samples_per_second": 9.231, + "eval_steps_per_second": 0.144, + "step": 158000 + }, + { + "epoch": 7.107556833498068, + "grad_norm": 4.347532272338867, + "learning_rate": 4.7548355683561054e-05, + "loss": 0.5527, + "step": 158200 + }, + { + "epoch": 7.116542366789469, + "grad_norm": 1.523980736732483, + "learning_rate": 4.754225641023161e-05, + "loss": 0.5129, + "step": 158400 + }, + { + "epoch": 7.12552790008087, + "grad_norm": 12.395309448242188, + "learning_rate": 4.753614995153892e-05, + "loss": 0.5365, + "step": 158600 + }, + { + "epoch": 7.134513433372271, + "grad_norm": 13.86411190032959, + "learning_rate": 4.753003630942942e-05, + "loss": 0.5008, + "step": 158800 + }, + { + "epoch": 7.143498966663672, + "grad_norm": 2.280458450317383, + "learning_rate": 4.7523915485851846e-05, + "loss": 0.4832, + "step": 159000 + }, + { + "epoch": 7.143498966663672, + "eval_loss": 2.6097371578216553, + "eval_runtime": 1072.2002, + "eval_samples_per_second": 9.237, + "eval_steps_per_second": 0.145, + "step": 
159000 + }, + { + "epoch": 7.152484499955072, + "grad_norm": 4.316972255706787, + "learning_rate": 4.751778748275721e-05, + "loss": 0.5307, + "step": 159200 + }, + { + "epoch": 7.161470033246474, + "grad_norm": 5.86752462387085, + "learning_rate": 4.751165230209882e-05, + "loss": 0.5302, + "step": 159400 + }, + { + "epoch": 7.170455566537874, + "grad_norm": 14.792780876159668, + "learning_rate": 4.750550994583227e-05, + "loss": 0.5341, + "step": 159600 + }, + { + "epoch": 7.179441099829275, + "grad_norm": 9.056463241577148, + "learning_rate": 4.749936041591544e-05, + "loss": 0.5453, + "step": 159800 + }, + { + "epoch": 7.188426633120676, + "grad_norm": 6.764106750488281, + "learning_rate": 4.74932037143085e-05, + "loss": 0.4882, + "step": 160000 + }, + { + "epoch": 7.188426633120676, + "eval_loss": 2.592075824737549, + "eval_runtime": 1072.2539, + "eval_samples_per_second": 9.237, + "eval_steps_per_second": 0.145, + "step": 160000 + }, + { + "epoch": 7.197412166412077, + "grad_norm": 10.36343765258789, + "learning_rate": 4.74870398429739e-05, + "loss": 0.5078, + "step": 160200 + }, + { + "epoch": 7.206397699703477, + "grad_norm": 3.3423054218292236, + "learning_rate": 4.748086880387638e-05, + "loss": 0.5265, + "step": 160400 + }, + { + "epoch": 7.215383232994879, + "grad_norm": 7.084263801574707, + "learning_rate": 4.7474690598982975e-05, + "loss": 0.5367, + "step": 160600 + }, + { + "epoch": 7.224368766286279, + "grad_norm": 7.648595333099365, + "learning_rate": 4.7468505230262974e-05, + "loss": 0.5392, + "step": 160800 + }, + { + "epoch": 7.23335429957768, + "grad_norm": 1.4495679140090942, + "learning_rate": 4.746231269968798e-05, + "loss": 0.5099, + "step": 161000 + }, + { + "epoch": 7.23335429957768, + "eval_loss": 2.630073070526123, + "eval_runtime": 1049.8697, + "eval_samples_per_second": 9.434, + "eval_steps_per_second": 0.148, + "step": 161000 + }, + { + "epoch": 7.242339832869081, + "grad_norm": 2.1218910217285156, + "learning_rate": 4.745611300923187e-05, + "loss": 0.5101, + "step": 161200 + }, + { + "epoch": 7.251325366160482, + "grad_norm": 27.048370361328125, + "learning_rate": 4.744990616087079e-05, + "loss": 0.5328, + "step": 161400 + }, + { + "epoch": 7.260310899451882, + "grad_norm": 9.959211349487305, + "learning_rate": 4.7443692156583194e-05, + "loss": 0.5176, + "step": 161600 + }, + { + "epoch": 7.269296432743284, + "grad_norm": 8.372459411621094, + "learning_rate": 4.7437470998349785e-05, + "loss": 0.5379, + "step": 161800 + }, + { + "epoch": 7.278281966034684, + "grad_norm": 12.155389785766602, + "learning_rate": 4.7431242688153564e-05, + "loss": 0.5518, + "step": 162000 + }, + { + "epoch": 7.278281966034684, + "eval_loss": 2.5808417797088623, + "eval_runtime": 1051.3983, + "eval_samples_per_second": 9.42, + "eval_steps_per_second": 0.147, + "step": 162000 + }, + { + "epoch": 7.287267499326085, + "grad_norm": 12.06241226196289, + "learning_rate": 4.7425007227979826e-05, + "loss": 0.5364, + "step": 162200 + }, + { + "epoch": 7.296253032617486, + "grad_norm": 7.406551837921143, + "learning_rate": 4.741876461981611e-05, + "loss": 0.4916, + "step": 162400 + }, + { + "epoch": 7.305238565908887, + "grad_norm": 4.847611904144287, + "learning_rate": 4.741251486565226e-05, + "loss": 0.4856, + "step": 162600 + }, + { + "epoch": 7.314224099200287, + "grad_norm": 4.857258319854736, + "learning_rate": 4.740625796748039e-05, + "loss": 0.5113, + "step": 162800 + }, + { + "epoch": 7.323209632491689, + "grad_norm": 3.5690536499023438, + "learning_rate": 4.7399993927294904e-05, + 
"loss": 0.5447, + "step": 163000 + }, + { + "epoch": 7.323209632491689, + "eval_loss": 2.5550215244293213, + "eval_runtime": 1050.4921, + "eval_samples_per_second": 9.428, + "eval_steps_per_second": 0.148, + "step": 163000 + }, + { + "epoch": 7.332195165783089, + "grad_norm": 2.832630157470703, + "learning_rate": 4.739372274709245e-05, + "loss": 0.5102, + "step": 163200 + }, + { + "epoch": 7.34118069907449, + "grad_norm": 6.479580879211426, + "learning_rate": 4.7387444428871985e-05, + "loss": 0.49, + "step": 163400 + }, + { + "epoch": 7.350166232365891, + "grad_norm": 5.155001640319824, + "learning_rate": 4.738115897463472e-05, + "loss": 0.5256, + "step": 163600 + }, + { + "epoch": 7.359151765657292, + "grad_norm": 10.935525894165039, + "learning_rate": 4.7374866386384155e-05, + "loss": 0.5168, + "step": 163800 + }, + { + "epoch": 7.368137298948692, + "grad_norm": 3.9100871086120605, + "learning_rate": 4.736856666612605e-05, + "loss": 0.5287, + "step": 164000 + }, + { + "epoch": 7.368137298948692, + "eval_loss": 2.5780515670776367, + "eval_runtime": 1051.1987, + "eval_samples_per_second": 9.422, + "eval_steps_per_second": 0.147, + "step": 164000 + }, + { + "epoch": 7.377122832240094, + "grad_norm": 16.054746627807617, + "learning_rate": 4.736225981586846e-05, + "loss": 0.5182, + "step": 164200 + }, + { + "epoch": 7.386108365531494, + "grad_norm": 8.413787841796875, + "learning_rate": 4.735594583762169e-05, + "loss": 0.5142, + "step": 164400 + }, + { + "epoch": 7.395093898822895, + "grad_norm": 10.230764389038086, + "learning_rate": 4.7349624733398324e-05, + "loss": 0.532, + "step": 164600 + }, + { + "epoch": 7.404079432114296, + "grad_norm": 6.237130641937256, + "learning_rate": 4.734329650521322e-05, + "loss": 0.5217, + "step": 164800 + }, + { + "epoch": 7.413064965405697, + "grad_norm": 12.266544342041016, + "learning_rate": 4.733696115508351e-05, + "loss": 0.5514, + "step": 165000 + }, + { + "epoch": 7.413064965405697, + "eval_loss": 2.5827889442443848, + "eval_runtime": 1050.6343, + "eval_samples_per_second": 9.427, + "eval_steps_per_second": 0.148, + "step": 165000 + }, + { + "epoch": 7.422050498697097, + "grad_norm": 8.876433372497559, + "learning_rate": 4.7330618685028585e-05, + "loss": 0.5055, + "step": 165200 + }, + { + "epoch": 7.431036031988499, + "grad_norm": 4.292701244354248, + "learning_rate": 4.732426909707013e-05, + "loss": 0.5443, + "step": 165400 + }, + { + "epoch": 7.440021565279899, + "grad_norm": 11.186918258666992, + "learning_rate": 4.731791239323205e-05, + "loss": 0.5327, + "step": 165600 + }, + { + "epoch": 7.4490070985713, + "grad_norm": 2.4021294116973877, + "learning_rate": 4.7311548575540586e-05, + "loss": 0.5159, + "step": 165800 + }, + { + "epoch": 7.457992631862701, + "grad_norm": 13.129263877868652, + "learning_rate": 4.730517764602419e-05, + "loss": 0.5135, + "step": 166000 + }, + { + "epoch": 7.457992631862701, + "eval_loss": 2.5977518558502197, + "eval_runtime": 1050.7073, + "eval_samples_per_second": 9.426, + "eval_steps_per_second": 0.148, + "step": 166000 + }, + { + "epoch": 7.466978165154102, + "grad_norm": 1.4429153203964233, + "learning_rate": 4.7298799606713606e-05, + "loss": 0.5522, + "step": 166200 + }, + { + "epoch": 7.4759636984455025, + "grad_norm": 8.0523042678833, + "learning_rate": 4.729241445964183e-05, + "loss": 0.5187, + "step": 166400 + }, + { + "epoch": 7.484949231736904, + "grad_norm": 8.555193901062012, + "learning_rate": 4.728602220684415e-05, + "loss": 0.5157, + "step": 166600 + }, + { + "epoch": 7.493934765028304, + "grad_norm": 
4.992981910705566, + "learning_rate": 4.727962285035809e-05, + "loss": 0.5323, + "step": 166800 + }, + { + "epoch": 7.502920298319705, + "grad_norm": 8.440316200256348, + "learning_rate": 4.727321639222345e-05, + "loss": 0.5371, + "step": 167000 + }, + { + "epoch": 7.502920298319705, + "eval_loss": 2.536879062652588, + "eval_runtime": 1050.1243, + "eval_samples_per_second": 9.431, + "eval_steps_per_second": 0.148, + "step": 167000 + }, + { + "epoch": 7.511905831611106, + "grad_norm": 14.163543701171875, + "learning_rate": 4.7266802834482296e-05, + "loss": 0.5096, + "step": 167200 + }, + { + "epoch": 7.520891364902507, + "grad_norm": 2.259485960006714, + "learning_rate": 4.726038217917896e-05, + "loss": 0.5099, + "step": 167400 + }, + { + "epoch": 7.5298768981939075, + "grad_norm": 10.735986709594727, + "learning_rate": 4.7253954428360024e-05, + "loss": 0.5192, + "step": 167600 + }, + { + "epoch": 7.538862431485309, + "grad_norm": 3.719405174255371, + "learning_rate": 4.7247519584074343e-05, + "loss": 0.5043, + "step": 167800 + }, + { + "epoch": 7.547847964776709, + "grad_norm": 2.679960012435913, + "learning_rate": 4.724107764837303e-05, + "loss": 0.5153, + "step": 168000 + }, + { + "epoch": 7.547847964776709, + "eval_loss": 2.623818874359131, + "eval_runtime": 1050.9471, + "eval_samples_per_second": 9.424, + "eval_steps_per_second": 0.147, + "step": 168000 + }, + { + "epoch": 7.55683349806811, + "grad_norm": 18.183778762817383, + "learning_rate": 4.723462862330945e-05, + "loss": 0.5054, + "step": 168200 + }, + { + "epoch": 7.565819031359511, + "grad_norm": 1.4932595491409302, + "learning_rate": 4.722817251093925e-05, + "loss": 0.5461, + "step": 168400 + }, + { + "epoch": 7.574804564650912, + "grad_norm": 10.546357154846191, + "learning_rate": 4.722170931332031e-05, + "loss": 0.544, + "step": 168600 + }, + { + "epoch": 7.5837900979423125, + "grad_norm": 1.394518256187439, + "learning_rate": 4.721523903251278e-05, + "loss": 0.4983, + "step": 168800 + }, + { + "epoch": 7.592775631233714, + "grad_norm": 6.905360698699951, + "learning_rate": 4.720876167057907e-05, + "loss": 0.5109, + "step": 169000 + }, + { + "epoch": 7.592775631233714, + "eval_loss": 2.588412284851074, + "eval_runtime": 1050.4908, + "eval_samples_per_second": 9.428, + "eval_steps_per_second": 0.148, + "step": 169000 + }, + { + "epoch": 7.601761164525114, + "grad_norm": 19.295528411865234, + "learning_rate": 4.7202277229583846e-05, + "loss": 0.5174, + "step": 169200 + }, + { + "epoch": 7.6107466978165155, + "grad_norm": 22.249040603637695, + "learning_rate": 4.719578571159402e-05, + "loss": 0.5101, + "step": 169400 + }, + { + "epoch": 7.619732231107916, + "grad_norm": 7.415430068969727, + "learning_rate": 4.718928711867878e-05, + "loss": 0.4998, + "step": 169600 + }, + { + "epoch": 7.628717764399317, + "grad_norm": 2.853653907775879, + "learning_rate": 4.718278145290955e-05, + "loss": 0.5099, + "step": 169800 + }, + { + "epoch": 7.637703297690718, + "grad_norm": 4.130895137786865, + "learning_rate": 4.7176268716360026e-05, + "loss": 0.4822, + "step": 170000 + }, + { + "epoch": 7.637703297690718, + "eval_loss": 2.6600334644317627, + "eval_runtime": 1049.8197, + "eval_samples_per_second": 9.434, + "eval_steps_per_second": 0.148, + "step": 170000 + }, + { + "epoch": 7.646688830982119, + "grad_norm": 2.998149871826172, + "learning_rate": 4.7169748911106146e-05, + "loss": 0.514, + "step": 170200 + }, + { + "epoch": 7.655674364273519, + "grad_norm": 2.742155075073242, + "learning_rate": 4.71632220392261e-05, + "loss": 0.5168, + 
"step": 170400 + }, + { + "epoch": 7.6646598975649205, + "grad_norm": 1.7436096668243408, + "learning_rate": 4.7156688102800326e-05, + "loss": 0.5029, + "step": 170600 + }, + { + "epoch": 7.673645430856322, + "grad_norm": 4.7532806396484375, + "learning_rate": 4.715014710391153e-05, + "loss": 0.5279, + "step": 170800 + }, + { + "epoch": 7.682630964147722, + "grad_norm": 8.532057762145996, + "learning_rate": 4.714359904464466e-05, + "loss": 0.5241, + "step": 171000 + }, + { + "epoch": 7.682630964147722, + "eval_loss": 2.546463966369629, + "eval_runtime": 1051.0534, + "eval_samples_per_second": 9.423, + "eval_steps_per_second": 0.147, + "step": 171000 + }, + { + "epoch": 7.691616497439123, + "grad_norm": 5.461520195007324, + "learning_rate": 4.713704392708692e-05, + "loss": 0.5415, + "step": 171200 + }, + { + "epoch": 7.700602030730524, + "grad_norm": 5.665705680847168, + "learning_rate": 4.713048175332775e-05, + "loss": 0.5263, + "step": 171400 + }, + { + "epoch": 7.709587564021925, + "grad_norm": 8.942784309387207, + "learning_rate": 4.7123912525458865e-05, + "loss": 0.5518, + "step": 171600 + }, + { + "epoch": 7.7185730973133255, + "grad_norm": 9.14636516571045, + "learning_rate": 4.7117336245574186e-05, + "loss": 0.5277, + "step": 171800 + }, + { + "epoch": 7.727558630604726, + "grad_norm": 4.771318435668945, + "learning_rate": 4.7110752915769934e-05, + "loss": 0.4941, + "step": 172000 + }, + { + "epoch": 7.727558630604726, + "eval_loss": 2.600043296813965, + "eval_runtime": 1049.7614, + "eval_samples_per_second": 9.435, + "eval_steps_per_second": 0.148, + "step": 172000 + }, + { + "epoch": 7.736544163896127, + "grad_norm": 4.336336135864258, + "learning_rate": 4.710416253814454e-05, + "loss": 0.5547, + "step": 172200 + }, + { + "epoch": 7.7455296971875285, + "grad_norm": 13.351747512817383, + "learning_rate": 4.709756511479868e-05, + "loss": 0.4655, + "step": 172400 + }, + { + "epoch": 7.754515230478929, + "grad_norm": 14.320053100585938, + "learning_rate": 4.7090960647835305e-05, + "loss": 0.5079, + "step": 172600 + }, + { + "epoch": 7.763500763770329, + "grad_norm": 9.463343620300293, + "learning_rate": 4.708434913935959e-05, + "loss": 0.5139, + "step": 172800 + }, + { + "epoch": 7.7724862970617306, + "grad_norm": 6.440632343292236, + "learning_rate": 4.707773059147896e-05, + "loss": 0.5042, + "step": 173000 + }, + { + "epoch": 7.7724862970617306, + "eval_loss": 2.626408576965332, + "eval_runtime": 1128.6913, + "eval_samples_per_second": 8.775, + "eval_steps_per_second": 0.137, + "step": 173000 + }, + { + "epoch": 7.781471830353132, + "grad_norm": 7.2138261795043945, + "learning_rate": 4.707110500630308e-05, + "loss": 0.5522, + "step": 173200 + }, + { + "epoch": 7.790457363644532, + "grad_norm": 7.865017890930176, + "learning_rate": 4.706447238594386e-05, + "loss": 0.5161, + "step": 173400 + }, + { + "epoch": 7.7994428969359335, + "grad_norm": 18.77448844909668, + "learning_rate": 4.7057832732515464e-05, + "loss": 0.5437, + "step": 173600 + }, + { + "epoch": 7.808428430227334, + "grad_norm": 2.390789031982422, + "learning_rate": 4.705118604813426e-05, + "loss": 0.5101, + "step": 173800 + }, + { + "epoch": 7.817413963518735, + "grad_norm": 9.706137657165527, + "learning_rate": 4.7044532334918915e-05, + "loss": 0.5106, + "step": 174000 + }, + { + "epoch": 7.817413963518735, + "eval_loss": 2.6232926845550537, + "eval_runtime": 1128.6235, + "eval_samples_per_second": 8.775, + "eval_steps_per_second": 0.137, + "step": 174000 + }, + { + "epoch": 7.826399496810136, + "grad_norm": 
1.1721101999282837, + "learning_rate": 4.70378715949903e-05, + "loss": 0.5015, + "step": 174200 + }, + { + "epoch": 7.835385030101537, + "grad_norm": 15.840973854064941, + "learning_rate": 4.703120383047151e-05, + "loss": 0.4983, + "step": 174400 + }, + { + "epoch": 7.844370563392937, + "grad_norm": 11.476134300231934, + "learning_rate": 4.702452904348792e-05, + "loss": 0.5375, + "step": 174600 + }, + { + "epoch": 7.8533560966843385, + "grad_norm": 1.3802037239074707, + "learning_rate": 4.701784723616712e-05, + "loss": 0.5123, + "step": 174800 + }, + { + "epoch": 7.862341629975739, + "grad_norm": 8.808523178100586, + "learning_rate": 4.7011158410638944e-05, + "loss": 0.5052, + "step": 175000 + }, + { + "epoch": 7.862341629975739, + "eval_loss": 2.5762908458709717, + "eval_runtime": 1129.5094, + "eval_samples_per_second": 8.768, + "eval_steps_per_second": 0.137, + "step": 175000 + }, + { + "epoch": 7.87132716326714, + "grad_norm": 3.9836955070495605, + "learning_rate": 4.7004462569035456e-05, + "loss": 0.521, + "step": 175200 + }, + { + "epoch": 7.880312696558541, + "grad_norm": 3.1506991386413574, + "learning_rate": 4.6997759713490966e-05, + "loss": 0.5264, + "step": 175400 + }, + { + "epoch": 7.889298229849942, + "grad_norm": 6.831039905548096, + "learning_rate": 4.6991049846142e-05, + "loss": 0.5244, + "step": 175600 + }, + { + "epoch": 7.898283763141342, + "grad_norm": 3.348510503768921, + "learning_rate": 4.698433296912736e-05, + "loss": 0.4787, + "step": 175800 + }, + { + "epoch": 7.907269296432744, + "grad_norm": 3.6049258708953857, + "learning_rate": 4.697760908458804e-05, + "loss": 0.5266, + "step": 176000 + }, + { + "epoch": 7.907269296432744, + "eval_loss": 2.573176622390747, + "eval_runtime": 1129.0656, + "eval_samples_per_second": 8.772, + "eval_steps_per_second": 0.137, + "step": 176000 + }, + { + "epoch": 7.916254829724144, + "grad_norm": 13.29443073272705, + "learning_rate": 4.697087819466728e-05, + "loss": 0.4962, + "step": 176200 + }, + { + "epoch": 7.925240363015545, + "grad_norm": 7.278706073760986, + "learning_rate": 4.696414030151056e-05, + "loss": 0.5111, + "step": 176400 + }, + { + "epoch": 7.934225896306946, + "grad_norm": 5.561307907104492, + "learning_rate": 4.695739540726559e-05, + "loss": 0.5019, + "step": 176600 + }, + { + "epoch": 7.943211429598347, + "grad_norm": 7.39556884765625, + "learning_rate": 4.695064351408232e-05, + "loss": 0.5252, + "step": 176800 + }, + { + "epoch": 7.952196962889747, + "grad_norm": 8.245197296142578, + "learning_rate": 4.694388462411291e-05, + "loss": 0.5361, + "step": 177000 + }, + { + "epoch": 7.952196962889747, + "eval_loss": 2.5876715183258057, + "eval_runtime": 1129.3784, + "eval_samples_per_second": 8.769, + "eval_steps_per_second": 0.137, + "step": 177000 + }, + { + "epoch": 7.961182496181149, + "grad_norm": 4.86469841003418, + "learning_rate": 4.693711873951177e-05, + "loss": 0.4771, + "step": 177200 + }, + { + "epoch": 7.970168029472549, + "grad_norm": 13.049267768859863, + "learning_rate": 4.6930345862435527e-05, + "loss": 0.5369, + "step": 177400 + }, + { + "epoch": 7.97915356276395, + "grad_norm": 6.7220258712768555, + "learning_rate": 4.692356599504304e-05, + "loss": 0.529, + "step": 177600 + }, + { + "epoch": 7.988139096055351, + "grad_norm": 10.31705379486084, + "learning_rate": 4.69167791394954e-05, + "loss": 0.5603, + "step": 177800 + }, + { + "epoch": 7.997124629346752, + "grad_norm": 6.541712760925293, + "learning_rate": 4.690998529795592e-05, + "loss": 0.5193, + "step": 178000 + }, + { + "epoch": 
7.997124629346752, + "eval_loss": 2.6211884021759033, + "eval_runtime": 1127.8197, + "eval_samples_per_second": 8.782, + "eval_steps_per_second": 0.137, + "step": 178000 + }, + { + "epoch": 8.006110162638153, + "grad_norm": 7.912782192230225, + "learning_rate": 4.6903184472590145e-05, + "loss": 0.5203, + "step": 178200 + }, + { + "epoch": 8.015095695929553, + "grad_norm": 4.079019546508789, + "learning_rate": 4.6896376665565843e-05, + "loss": 0.4817, + "step": 178400 + }, + { + "epoch": 8.024081229220954, + "grad_norm": 3.5934817790985107, + "learning_rate": 4.6889561879053014e-05, + "loss": 0.4757, + "step": 178600 + }, + { + "epoch": 8.033066762512355, + "grad_norm": 5.87857723236084, + "learning_rate": 4.6882740115223864e-05, + "loss": 0.5184, + "step": 178800 + }, + { + "epoch": 8.042052295803757, + "grad_norm": 10.092915534973145, + "learning_rate": 4.687591137625285e-05, + "loss": 0.475, + "step": 179000 + }, + { + "epoch": 8.042052295803757, + "eval_loss": 2.614030599594116, + "eval_runtime": 1129.8602, + "eval_samples_per_second": 8.766, + "eval_steps_per_second": 0.137, + "step": 179000 + }, + { + "epoch": 8.051037829095156, + "grad_norm": 5.135852813720703, + "learning_rate": 4.686907566431663e-05, + "loss": 0.5036, + "step": 179200 + }, + { + "epoch": 8.060023362386557, + "grad_norm": 8.39755630493164, + "learning_rate": 4.686223298159409e-05, + "loss": 0.4812, + "step": 179400 + }, + { + "epoch": 8.069008895677959, + "grad_norm": 9.086663246154785, + "learning_rate": 4.685538333026636e-05, + "loss": 0.494, + "step": 179600 + }, + { + "epoch": 8.07799442896936, + "grad_norm": 4.75005578994751, + "learning_rate": 4.6848526712516744e-05, + "loss": 0.514, + "step": 179800 + }, + { + "epoch": 8.08697996226076, + "grad_norm": 5.841987133026123, + "learning_rate": 4.684166313053081e-05, + "loss": 0.5183, + "step": 180000 + }, + { + "epoch": 8.08697996226076, + "eval_loss": 2.6352553367614746, + "eval_runtime": 1129.1046, + "eval_samples_per_second": 8.772, + "eval_steps_per_second": 0.137, + "step": 180000 + }, + { + "epoch": 8.09596549555216, + "grad_norm": 6.5779852867126465, + "learning_rate": 4.683479258649633e-05, + "loss": 0.515, + "step": 180200 + }, + { + "epoch": 8.104951028843562, + "grad_norm": 10.88022232055664, + "learning_rate": 4.6827915082603304e-05, + "loss": 0.4703, + "step": 180400 + }, + { + "epoch": 8.113936562134963, + "grad_norm": 4.6330366134643555, + "learning_rate": 4.6821030621043927e-05, + "loss": 0.5193, + "step": 180600 + }, + { + "epoch": 8.122922095426363, + "grad_norm": 6.782657146453857, + "learning_rate": 4.681413920401263e-05, + "loss": 0.4852, + "step": 180800 + }, + { + "epoch": 8.131907628717764, + "grad_norm": 15.633230209350586, + "learning_rate": 4.680724083370606e-05, + "loss": 0.5076, + "step": 181000 + }, + { + "epoch": 8.131907628717764, + "eval_loss": 2.5747714042663574, + "eval_runtime": 1129.3837, + "eval_samples_per_second": 8.769, + "eval_steps_per_second": 0.137, + "step": 181000 + }, + { + "epoch": 8.140893162009165, + "grad_norm": 13.606180191040039, + "learning_rate": 4.680033551232308e-05, + "loss": 0.4894, + "step": 181200 + }, + { + "epoch": 8.149878695300567, + "grad_norm": 6.643714904785156, + "learning_rate": 4.679342324206478e-05, + "loss": 0.5166, + "step": 181400 + }, + { + "epoch": 8.158864228591966, + "grad_norm": 30.02402687072754, + "learning_rate": 4.678650402513442e-05, + "loss": 0.5312, + "step": 181600 + }, + { + "epoch": 8.167849761883367, + "grad_norm": 3.5424320697784424, + "learning_rate": 
4.6779577863737534e-05, + "loss": 0.485, + "step": 181800 + }, + { + "epoch": 8.176835295174769, + "grad_norm": 3.954418897628784, + "learning_rate": 4.677264476008183e-05, + "loss": 0.4791, + "step": 182000 + }, + { + "epoch": 8.176835295174769, + "eval_loss": 2.621889114379883, + "eval_runtime": 1127.9131, + "eval_samples_per_second": 8.781, + "eval_steps_per_second": 0.137, + "step": 182000 + }, + { + "epoch": 8.18582082846617, + "grad_norm": 8.198515892028809, + "learning_rate": 4.6765704716377244e-05, + "loss": 0.5274, + "step": 182200 + }, + { + "epoch": 8.19480636175757, + "grad_norm": 7.865370750427246, + "learning_rate": 4.6758757734835925e-05, + "loss": 0.478, + "step": 182400 + }, + { + "epoch": 8.20379189504897, + "grad_norm": 22.58502769470215, + "learning_rate": 4.6751803817672214e-05, + "loss": 0.4986, + "step": 182600 + }, + { + "epoch": 8.212777428340372, + "grad_norm": 1.826743245124817, + "learning_rate": 4.6744842967102695e-05, + "loss": 0.526, + "step": 182800 + }, + { + "epoch": 8.221762961631773, + "grad_norm": 1.7866239547729492, + "learning_rate": 4.6737875185346134e-05, + "loss": 0.4812, + "step": 183000 + }, + { + "epoch": 8.221762961631773, + "eval_loss": 2.6146905422210693, + "eval_runtime": 1120.9053, + "eval_samples_per_second": 8.836, + "eval_steps_per_second": 0.138, + "step": 183000 + }, + { + "epoch": 8.230748494923173, + "grad_norm": 17.78580093383789, + "learning_rate": 4.6730900474623525e-05, + "loss": 0.4622, + "step": 183200 + }, + { + "epoch": 8.239734028214574, + "grad_norm": 2.1143832206726074, + "learning_rate": 4.672391883715805e-05, + "loss": 0.5061, + "step": 183400 + }, + { + "epoch": 8.248719561505975, + "grad_norm": 5.723171710968018, + "learning_rate": 4.671693027517513e-05, + "loss": 0.4791, + "step": 183600 + }, + { + "epoch": 8.257705094797377, + "grad_norm": 8.541521072387695, + "learning_rate": 4.670993479090237e-05, + "loss": 0.4839, + "step": 183800 + }, + { + "epoch": 8.266690628088778, + "grad_norm": 4.935067653656006, + "learning_rate": 4.670293238656958e-05, + "loss": 0.4801, + "step": 184000 + }, + { + "epoch": 8.266690628088778, + "eval_loss": 2.671586751937866, + "eval_runtime": 1093.7184, + "eval_samples_per_second": 9.055, + "eval_steps_per_second": 0.142, + "step": 184000 + }, + { + "epoch": 8.275676161380177, + "grad_norm": 10.030083656311035, + "learning_rate": 4.6695923064408776e-05, + "loss": 0.5172, + "step": 184200 + }, + { + "epoch": 8.284661694671579, + "grad_norm": 5.141510486602783, + "learning_rate": 4.66889068266542e-05, + "loss": 0.5185, + "step": 184400 + }, + { + "epoch": 8.29364722796298, + "grad_norm": 1.1735432147979736, + "learning_rate": 4.668188367554228e-05, + "loss": 0.463, + "step": 184600 + }, + { + "epoch": 8.30263276125438, + "grad_norm": 12.648009300231934, + "learning_rate": 4.667485361331165e-05, + "loss": 0.5135, + "step": 184800 + }, + { + "epoch": 8.31161829454578, + "grad_norm": 10.014856338500977, + "learning_rate": 4.6667816642203146e-05, + "loss": 0.4898, + "step": 185000 + }, + { + "epoch": 8.31161829454578, + "eval_loss": 2.5692856311798096, + "eval_runtime": 1092.5868, + "eval_samples_per_second": 9.065, + "eval_steps_per_second": 0.142, + "step": 185000 + }, + { + "epoch": 8.320603827837182, + "grad_norm": 0.6926993131637573, + "learning_rate": 4.66607727644598e-05, + "loss": 0.5116, + "step": 185200 + }, + { + "epoch": 8.329589361128583, + "grad_norm": 8.623538970947266, + "learning_rate": 4.665372198232688e-05, + "loss": 0.5403, + "step": 185400 + }, + { + "epoch": 
8.338574894419985, + "grad_norm": 10.916993141174316, + "learning_rate": 4.664666429805181e-05, + "loss": 0.4905, + "step": 185600 + }, + { + "epoch": 8.347560427711384, + "grad_norm": 13.056023597717285, + "learning_rate": 4.663959971388423e-05, + "loss": 0.523, + "step": 185800 + }, + { + "epoch": 8.356545961002785, + "grad_norm": 9.11626148223877, + "learning_rate": 4.663252823207599e-05, + "loss": 0.5183, + "step": 186000 + }, + { + "epoch": 8.356545961002785, + "eval_loss": 2.5466091632843018, + "eval_runtime": 1090.823, + "eval_samples_per_second": 9.079, + "eval_steps_per_second": 0.142, + "step": 186000 + }, + { + "epoch": 8.365531494294187, + "grad_norm": 4.152465343475342, + "learning_rate": 4.6625449854881124e-05, + "loss": 0.4888, + "step": 186200 + }, + { + "epoch": 8.374517027585588, + "grad_norm": 3.7355167865753174, + "learning_rate": 4.661836458455588e-05, + "loss": 0.5065, + "step": 186400 + }, + { + "epoch": 8.383502560876988, + "grad_norm": 4.155386447906494, + "learning_rate": 4.661127242335869e-05, + "loss": 0.5209, + "step": 186600 + }, + { + "epoch": 8.392488094168389, + "grad_norm": 16.843454360961914, + "learning_rate": 4.660417337355018e-05, + "loss": 0.4961, + "step": 186800 + }, + { + "epoch": 8.40147362745979, + "grad_norm": 8.681642532348633, + "learning_rate": 4.659706743739319e-05, + "loss": 0.5324, + "step": 187000 + }, + { + "epoch": 8.40147362745979, + "eval_loss": 2.5965471267700195, + "eval_runtime": 1091.868, + "eval_samples_per_second": 9.071, + "eval_steps_per_second": 0.142, + "step": 187000 + }, + { + "epoch": 8.410459160751191, + "grad_norm": 16.07400131225586, + "learning_rate": 4.658995461715273e-05, + "loss": 0.4946, + "step": 187200 + }, + { + "epoch": 8.41944469404259, + "grad_norm": 3.314675807952881, + "learning_rate": 4.658283491509603e-05, + "loss": 0.4955, + "step": 187400 + }, + { + "epoch": 8.428430227333992, + "grad_norm": 8.137290000915527, + "learning_rate": 4.6575708333492495e-05, + "loss": 0.5202, + "step": 187600 + }, + { + "epoch": 8.437415760625393, + "grad_norm": 3.797729730606079, + "learning_rate": 4.6568574874613725e-05, + "loss": 0.542, + "step": 187800 + }, + { + "epoch": 8.446401293916795, + "grad_norm": 10.251813888549805, + "learning_rate": 4.6561434540733525e-05, + "loss": 0.4847, + "step": 188000 + }, + { + "epoch": 8.446401293916795, + "eval_loss": 2.5656449794769287, + "eval_runtime": 1090.1823, + "eval_samples_per_second": 9.085, + "eval_steps_per_second": 0.142, + "step": 188000 + }, + { + "epoch": 8.455386827208194, + "grad_norm": 8.841021537780762, + "learning_rate": 4.6554287334127874e-05, + "loss": 0.4929, + "step": 188200 + }, + { + "epoch": 8.464372360499596, + "grad_norm": 3.129969596862793, + "learning_rate": 4.654713325707496e-05, + "loss": 0.5191, + "step": 188400 + }, + { + "epoch": 8.473357893790997, + "grad_norm": 4.764856815338135, + "learning_rate": 4.653997231185514e-05, + "loss": 0.4668, + "step": 188600 + }, + { + "epoch": 8.482343427082398, + "grad_norm": 2.219456195831299, + "learning_rate": 4.653280450075097e-05, + "loss": 0.4939, + "step": 188800 + }, + { + "epoch": 8.491328960373798, + "grad_norm": 15.745511054992676, + "learning_rate": 4.652562982604721e-05, + "loss": 0.5246, + "step": 189000 + }, + { + "epoch": 8.491328960373798, + "eval_loss": 2.595158576965332, + "eval_runtime": 1091.4106, + "eval_samples_per_second": 9.074, + "eval_steps_per_second": 0.142, + "step": 189000 + }, + { + "epoch": 8.500314493665199, + "grad_norm": 28.447345733642578, + "learning_rate": 
4.651844829003078e-05, + "loss": 0.5212, + "step": 189200 + }, + { + "epoch": 8.5093000269566, + "grad_norm": 5.278013229370117, + "learning_rate": 4.651125989499081e-05, + "loss": 0.5092, + "step": 189400 + }, + { + "epoch": 8.518285560248001, + "grad_norm": 7.048742294311523, + "learning_rate": 4.65040646432186e-05, + "loss": 0.484, + "step": 189600 + }, + { + "epoch": 8.527271093539401, + "grad_norm": 1.3166794776916504, + "learning_rate": 4.6496862537007655e-05, + "loss": 0.4682, + "step": 189800 + }, + { + "epoch": 8.536256626830802, + "grad_norm": 2.944568634033203, + "learning_rate": 4.6489653578653636e-05, + "loss": 0.4905, + "step": 190000 + }, + { + "epoch": 8.536256626830802, + "eval_loss": 2.6485064029693604, + "eval_runtime": 1090.2995, + "eval_samples_per_second": 9.084, + "eval_steps_per_second": 0.142, + "step": 190000 + }, + { + "epoch": 8.545242160122204, + "grad_norm": 12.636077880859375, + "learning_rate": 4.6482437770454415e-05, + "loss": 0.4857, + "step": 190200 + }, + { + "epoch": 8.554227693413605, + "grad_norm": 8.520101547241211, + "learning_rate": 4.647521511471003e-05, + "loss": 0.529, + "step": 190400 + }, + { + "epoch": 8.563213226705004, + "grad_norm": 3.0266263484954834, + "learning_rate": 4.646798561372272e-05, + "loss": 0.5178, + "step": 190600 + }, + { + "epoch": 8.572198759996406, + "grad_norm": 6.245327949523926, + "learning_rate": 4.6460749269796875e-05, + "loss": 0.49, + "step": 190800 + }, + { + "epoch": 8.581184293287807, + "grad_norm": 11.986411094665527, + "learning_rate": 4.645350608523911e-05, + "loss": 0.4862, + "step": 191000 + }, + { + "epoch": 8.581184293287807, + "eval_loss": 2.6468417644500732, + "eval_runtime": 1089.985, + "eval_samples_per_second": 9.086, + "eval_steps_per_second": 0.142, + "step": 191000 + }, + { + "epoch": 8.590169826579208, + "grad_norm": 33.56387710571289, + "learning_rate": 4.6446256062358175e-05, + "loss": 0.477, + "step": 191200 + }, + { + "epoch": 8.599155359870608, + "grad_norm": 6.720004558563232, + "learning_rate": 4.6438999203465036e-05, + "loss": 0.5533, + "step": 191400 + }, + { + "epoch": 8.608140893162009, + "grad_norm": 5.972818374633789, + "learning_rate": 4.643173551087281e-05, + "loss": 0.4685, + "step": 191600 + }, + { + "epoch": 8.61712642645341, + "grad_norm": 4.098087787628174, + "learning_rate": 4.6424464986896814e-05, + "loss": 0.5085, + "step": 191800 + }, + { + "epoch": 8.626111959744811, + "grad_norm": 9.735739707946777, + "learning_rate": 4.641718763385454e-05, + "loss": 0.5209, + "step": 192000 + }, + { + "epoch": 8.626111959744811, + "eval_loss": 2.538106679916382, + "eval_runtime": 1089.8225, + "eval_samples_per_second": 9.088, + "eval_steps_per_second": 0.142, + "step": 192000 + }, + { + "epoch": 8.635097493036211, + "grad_norm": 17.28936004638672, + "learning_rate": 4.640990345406563e-05, + "loss": 0.4939, + "step": 192200 + }, + { + "epoch": 8.644083026327612, + "grad_norm": 5.040442943572998, + "learning_rate": 4.640261244985194e-05, + "loss": 0.5788, + "step": 192400 + }, + { + "epoch": 8.653068559619014, + "grad_norm": 5.635134220123291, + "learning_rate": 4.639531462353748e-05, + "loss": 0.5067, + "step": 192600 + }, + { + "epoch": 8.662054092910415, + "grad_norm": 9.026660919189453, + "learning_rate": 4.638800997744843e-05, + "loss": 0.5487, + "step": 192800 + }, + { + "epoch": 8.671039626201814, + "grad_norm": 14.188516616821289, + "learning_rate": 4.6380698513913154e-05, + "loss": 0.5135, + "step": 193000 + }, + { + "epoch": 8.671039626201814, + "eval_loss": 2.6619675159454346, 
+ "eval_runtime": 1089.9555, + "eval_samples_per_second": 9.087, + "eval_steps_per_second": 0.142, + "step": 193000 + }, + { + "epoch": 8.680025159493216, + "grad_norm": 3.390214204788208, + "learning_rate": 4.6373380235262206e-05, + "loss": 0.494, + "step": 193200 + }, + { + "epoch": 8.689010692784617, + "grad_norm": 6.442393779754639, + "learning_rate": 4.636605514382827e-05, + "loss": 0.476, + "step": 193400 + }, + { + "epoch": 8.697996226076018, + "grad_norm": 2.047686815261841, + "learning_rate": 4.635872324194624e-05, + "loss": 0.4956, + "step": 193600 + }, + { + "epoch": 8.706981759367418, + "grad_norm": 14.76450252532959, + "learning_rate": 4.635138453195316e-05, + "loss": 0.508, + "step": 193800 + }, + { + "epoch": 8.715967292658819, + "grad_norm": 12.547980308532715, + "learning_rate": 4.634403901618824e-05, + "loss": 0.493, + "step": 194000 + }, + { + "epoch": 8.715967292658819, + "eval_loss": 2.619582414627075, + "eval_runtime": 1090.1869, + "eval_samples_per_second": 9.085, + "eval_steps_per_second": 0.142, + "step": 194000 + }, + { + "epoch": 8.72495282595022, + "grad_norm": 7.085901260375977, + "learning_rate": 4.633668669699289e-05, + "loss": 0.5181, + "step": 194200 + }, + { + "epoch": 8.733938359241622, + "grad_norm": 2.719491958618164, + "learning_rate": 4.6329327576710654e-05, + "loss": 0.4997, + "step": 194400 + }, + { + "epoch": 8.742923892533021, + "grad_norm": 1.1107314825057983, + "learning_rate": 4.632196165768726e-05, + "loss": 0.5234, + "step": 194600 + }, + { + "epoch": 8.751909425824422, + "grad_norm": 8.07888126373291, + "learning_rate": 4.63145889422706e-05, + "loss": 0.5515, + "step": 194800 + }, + { + "epoch": 8.760894959115824, + "grad_norm": 8.861418724060059, + "learning_rate": 4.6307209432810736e-05, + "loss": 0.491, + "step": 195000 + }, + { + "epoch": 8.760894959115824, + "eval_loss": 2.562807559967041, + "eval_runtime": 1047.6466, + "eval_samples_per_second": 9.454, + "eval_steps_per_second": 0.148, + "step": 195000 + }, + { + "epoch": 8.769880492407225, + "grad_norm": 15.92845344543457, + "learning_rate": 4.62998231316599e-05, + "loss": 0.4595, + "step": 195200 + }, + { + "epoch": 8.778866025698624, + "grad_norm": 13.050873756408691, + "learning_rate": 4.629243004117246e-05, + "loss": 0.486, + "step": 195400 + }, + { + "epoch": 8.787851558990026, + "grad_norm": 2.353410005569458, + "learning_rate": 4.6285030163705004e-05, + "loss": 0.5059, + "step": 195600 + }, + { + "epoch": 8.796837092281427, + "grad_norm": 6.4239501953125, + "learning_rate": 4.6277623501616206e-05, + "loss": 0.5145, + "step": 195800 + }, + { + "epoch": 8.805822625572828, + "grad_norm": 10.336437225341797, + "learning_rate": 4.627021005726698e-05, + "loss": 0.4984, + "step": 196000 + }, + { + "epoch": 8.805822625572828, + "eval_loss": 2.643347978591919, + "eval_runtime": 1054.0102, + "eval_samples_per_second": 9.396, + "eval_steps_per_second": 0.147, + "step": 196000 + }, + { + "epoch": 8.814808158864228, + "grad_norm": 1.9258716106414795, + "learning_rate": 4.6262789833020356e-05, + "loss": 0.503, + "step": 196200 + }, + { + "epoch": 8.823793692155629, + "grad_norm": 1.0549428462982178, + "learning_rate": 4.625536283124154e-05, + "loss": 0.5193, + "step": 196400 + }, + { + "epoch": 8.83277922544703, + "grad_norm": 8.691810607910156, + "learning_rate": 4.624792905429789e-05, + "loss": 0.4829, + "step": 196600 + }, + { + "epoch": 8.841764758738432, + "grad_norm": 2.745849370956421, + "learning_rate": 4.624048850455893e-05, + "loss": 0.5121, + "step": 196800 + }, + { + "epoch": 
8.850750292029833, + "grad_norm": 4.562199115753174, + "learning_rate": 4.623304118439635e-05, + "loss": 0.4943, + "step": 197000 + }, + { + "epoch": 8.850750292029833, + "eval_loss": 2.5749173164367676, + "eval_runtime": 1045.0959, + "eval_samples_per_second": 9.477, + "eval_steps_per_second": 0.148, + "step": 197000 + }, + { + "epoch": 8.859735825321232, + "grad_norm": 9.411834716796875, + "learning_rate": 4.622558709618397e-05, + "loss": 0.5262, + "step": 197200 + }, + { + "epoch": 8.868721358612634, + "grad_norm": 35.47937774658203, + "learning_rate": 4.62181262422978e-05, + "loss": 0.529, + "step": 197400 + }, + { + "epoch": 8.877706891904035, + "grad_norm": 3.0108392238616943, + "learning_rate": 4.6210658625116e-05, + "loss": 0.4835, + "step": 197600 + }, + { + "epoch": 8.886692425195434, + "grad_norm": 9.288346290588379, + "learning_rate": 4.620318424701887e-05, + "loss": 0.5115, + "step": 197800 + }, + { + "epoch": 8.895677958486836, + "grad_norm": 4.2439045906066895, + "learning_rate": 4.6195703110388875e-05, + "loss": 0.5205, + "step": 198000 + }, + { + "epoch": 8.895677958486836, + "eval_loss": 2.5893914699554443, + "eval_runtime": 1047.7304, + "eval_samples_per_second": 9.453, + "eval_steps_per_second": 0.148, + "step": 198000 + }, + { + "epoch": 8.904663491778237, + "grad_norm": 15.511228561401367, + "learning_rate": 4.618821521761063e-05, + "loss": 0.501, + "step": 198200 + }, + { + "epoch": 8.913649025069638, + "grad_norm": 27.06317710876465, + "learning_rate": 4.618072057107091e-05, + "loss": 0.4678, + "step": 198400 + }, + { + "epoch": 8.92263455836104, + "grad_norm": 9.34231185913086, + "learning_rate": 4.6173219173158646e-05, + "loss": 0.5284, + "step": 198600 + }, + { + "epoch": 8.931620091652439, + "grad_norm": 3.9095022678375244, + "learning_rate": 4.6165711026264914e-05, + "loss": 0.5517, + "step": 198800 + }, + { + "epoch": 8.94060562494384, + "grad_norm": 16.16065788269043, + "learning_rate": 4.6158196132782935e-05, + "loss": 0.459, + "step": 199000 + }, + { + "epoch": 8.94060562494384, + "eval_loss": 2.5856435298919678, + "eval_runtime": 1051.5622, + "eval_samples_per_second": 9.418, + "eval_steps_per_second": 0.147, + "step": 199000 + }, + { + "epoch": 8.949591158235242, + "grad_norm": 7.442063331604004, + "learning_rate": 4.615067449510809e-05, + "loss": 0.5037, + "step": 199200 + }, + { + "epoch": 8.958576691526643, + "grad_norm": 8.311750411987305, + "learning_rate": 4.6143146115637915e-05, + "loss": 0.5125, + "step": 199400 + }, + { + "epoch": 8.967562224818042, + "grad_norm": 12.351191520690918, + "learning_rate": 4.613561099677207e-05, + "loss": 0.5011, + "step": 199600 + }, + { + "epoch": 8.976547758109444, + "grad_norm": 4.787649154663086, + "learning_rate": 4.61280691409124e-05, + "loss": 0.502, + "step": 199800 + }, + { + "epoch": 8.985533291400845, + "grad_norm": 2.0292952060699463, + "learning_rate": 4.612052055046287e-05, + "loss": 0.51, + "step": 200000 + }, + { + "epoch": 8.985533291400845, + "eval_loss": 2.5565273761749268, + "eval_runtime": 1056.4909, + "eval_samples_per_second": 9.374, + "eval_steps_per_second": 0.147, + "step": 200000 + }, + { + "epoch": 8.994518824692246, + "grad_norm": 17.331127166748047, + "learning_rate": 4.61129652278296e-05, + "loss": 0.4937, + "step": 200200 + }, + { + "epoch": 9.003504357983646, + "grad_norm": 1.4020780324935913, + "learning_rate": 4.6105403175420844e-05, + "loss": 0.5383, + "step": 200400 + }, + { + "epoch": 9.012489891275047, + "grad_norm": 8.02592658996582, + "learning_rate": 4.6097834395647034e-05, 
+ "loss": 0.5085, + "step": 200600 + }, + { + "epoch": 9.021475424566448, + "grad_norm": 4.4860358238220215, + "learning_rate": 4.6090258890920706e-05, + "loss": 0.4802, + "step": 200800 + }, + { + "epoch": 9.03046095785785, + "grad_norm": 38.50815963745117, + "learning_rate": 4.6082676663656575e-05, + "loss": 0.4924, + "step": 201000 + }, + { + "epoch": 9.03046095785785, + "eval_loss": 2.609539031982422, + "eval_runtime": 1047.6211, + "eval_samples_per_second": 9.454, + "eval_steps_per_second": 0.148, + "step": 201000 + }, + { + "epoch": 9.03944649114925, + "grad_norm": 6.612710952758789, + "learning_rate": 4.607508771627146e-05, + "loss": 0.4848, + "step": 201200 + }, + { + "epoch": 9.04843202444065, + "grad_norm": 6.748866558074951, + "learning_rate": 4.606749205118437e-05, + "loss": 0.4901, + "step": 201400 + }, + { + "epoch": 9.057417557732052, + "grad_norm": 8.580459594726562, + "learning_rate": 4.6059889670816415e-05, + "loss": 0.4836, + "step": 201600 + }, + { + "epoch": 9.066403091023453, + "grad_norm": 12.98373794555664, + "learning_rate": 4.605228057759087e-05, + "loss": 0.5037, + "step": 201800 + }, + { + "epoch": 9.075388624314852, + "grad_norm": 12.246403694152832, + "learning_rate": 4.604466477393312e-05, + "loss": 0.5253, + "step": 202000 + }, + { + "epoch": 9.075388624314852, + "eval_loss": 2.579723358154297, + "eval_runtime": 1049.0403, + "eval_samples_per_second": 9.441, + "eval_steps_per_second": 0.148, + "step": 202000 + }, + { + "epoch": 9.084374157606254, + "grad_norm": 4.6200995445251465, + "learning_rate": 4.603704226227072e-05, + "loss": 0.5103, + "step": 202200 + }, + { + "epoch": 9.093359690897655, + "grad_norm": 2.7461910247802734, + "learning_rate": 4.6029413045033366e-05, + "loss": 0.5191, + "step": 202400 + }, + { + "epoch": 9.102345224189056, + "grad_norm": 9.832839965820312, + "learning_rate": 4.602177712465286e-05, + "loss": 0.441, + "step": 202600 + }, + { + "epoch": 9.111330757480456, + "grad_norm": 38.25431823730469, + "learning_rate": 4.6014134503563164e-05, + "loss": 0.4912, + "step": 202800 + }, + { + "epoch": 9.120316290771857, + "grad_norm": 4.103306293487549, + "learning_rate": 4.6006485184200365e-05, + "loss": 0.5063, + "step": 203000 + }, + { + "epoch": 9.120316290771857, + "eval_loss": 2.5657711029052734, + "eval_runtime": 1049.2539, + "eval_samples_per_second": 9.439, + "eval_steps_per_second": 0.148, + "step": 203000 + }, + { + "epoch": 9.129301824063258, + "grad_norm": 4.588971138000488, + "learning_rate": 4.59988291690027e-05, + "loss": 0.4868, + "step": 203200 + }, + { + "epoch": 9.13828735735466, + "grad_norm": 4.60148811340332, + "learning_rate": 4.599116646041052e-05, + "loss": 0.4724, + "step": 203400 + }, + { + "epoch": 9.14727289064606, + "grad_norm": 9.302680969238281, + "learning_rate": 4.5983497060866334e-05, + "loss": 0.4685, + "step": 203600 + }, + { + "epoch": 9.15625842393746, + "grad_norm": 15.227461814880371, + "learning_rate": 4.597582097281475e-05, + "loss": 0.4643, + "step": 203800 + }, + { + "epoch": 9.165243957228862, + "grad_norm": 3.3283636569976807, + "learning_rate": 4.596813819870254e-05, + "loss": 0.4851, + "step": 204000 + }, + { + "epoch": 9.165243957228862, + "eval_loss": 2.586775779724121, + "eval_runtime": 1044.1753, + "eval_samples_per_second": 9.485, + "eval_steps_per_second": 0.148, + "step": 204000 + }, + { + "epoch": 9.174229490520263, + "grad_norm": 13.116498947143555, + "learning_rate": 4.596044874097859e-05, + "loss": 0.4914, + "step": 204200 + }, + { + "epoch": 9.183215023811663, + "grad_norm": 
4.156534194946289, + "learning_rate": 4.595275260209392e-05, + "loss": 0.4347, + "step": 204400 + }, + { + "epoch": 9.192200557103064, + "grad_norm": 13.453794479370117, + "learning_rate": 4.594504978450169e-05, + "loss": 0.5118, + "step": 204600 + }, + { + "epoch": 9.201186090394465, + "grad_norm": 7.623902320861816, + "learning_rate": 4.5937340290657175e-05, + "loss": 0.4727, + "step": 204800 + }, + { + "epoch": 9.210171623685866, + "grad_norm": 1.6703872680664062, + "learning_rate": 4.592962412301778e-05, + "loss": 0.4967, + "step": 205000 + }, + { + "epoch": 9.210171623685866, + "eval_loss": 2.5800576210021973, + "eval_runtime": 1046.6856, + "eval_samples_per_second": 9.462, + "eval_steps_per_second": 0.148, + "step": 205000 + }, + { + "epoch": 9.219157156977266, + "grad_norm": 5.957919120788574, + "learning_rate": 4.5921901284043033e-05, + "loss": 0.5113, + "step": 205200 + }, + { + "epoch": 9.228142690268667, + "grad_norm": 1.301614761352539, + "learning_rate": 4.5914171776194615e-05, + "loss": 0.4691, + "step": 205400 + }, + { + "epoch": 9.237128223560068, + "grad_norm": 10.48454475402832, + "learning_rate": 4.59064356019363e-05, + "loss": 0.4726, + "step": 205600 + }, + { + "epoch": 9.24611375685147, + "grad_norm": 6.0278825759887695, + "learning_rate": 4.5898692763734e-05, + "loss": 0.558, + "step": 205800 + }, + { + "epoch": 9.25509929014287, + "grad_norm": 5.763274192810059, + "learning_rate": 4.5890943264055754e-05, + "loss": 0.5259, + "step": 206000 + }, + { + "epoch": 9.25509929014287, + "eval_loss": 2.604665756225586, + "eval_runtime": 1046.3791, + "eval_samples_per_second": 9.465, + "eval_steps_per_second": 0.148, + "step": 206000 + }, + { + "epoch": 9.26408482343427, + "grad_norm": 10.876523971557617, + "learning_rate": 4.588318710537172e-05, + "loss": 0.4809, + "step": 206200 + }, + { + "epoch": 9.273070356725672, + "grad_norm": 0.9701793789863586, + "learning_rate": 4.5875424290154175e-05, + "loss": 0.4769, + "step": 206400 + }, + { + "epoch": 9.282055890017073, + "grad_norm": 1.0843396186828613, + "learning_rate": 4.5867654820877534e-05, + "loss": 0.463, + "step": 206600 + }, + { + "epoch": 9.291041423308473, + "grad_norm": 5.901642799377441, + "learning_rate": 4.585987870001831e-05, + "loss": 0.4497, + "step": 206800 + }, + { + "epoch": 9.300026956599874, + "grad_norm": 3.498466968536377, + "learning_rate": 4.585209593005516e-05, + "loss": 0.503, + "step": 207000 + }, + { + "epoch": 9.300026956599874, + "eval_loss": 2.567307472229004, + "eval_runtime": 1105.7578, + "eval_samples_per_second": 8.957, + "eval_steps_per_second": 0.14, + "step": 207000 + }, + { + "epoch": 9.309012489891275, + "grad_norm": 6.869686603546143, + "learning_rate": 4.5844306513468846e-05, + "loss": 0.5243, + "step": 207200 + }, + { + "epoch": 9.317998023182676, + "grad_norm": 6.0725579261779785, + "learning_rate": 4.583651045274225e-05, + "loss": 0.4945, + "step": 207400 + }, + { + "epoch": 9.326983556474076, + "grad_norm": 7.266490936279297, + "learning_rate": 4.582870775036037e-05, + "loss": 0.5574, + "step": 207600 + }, + { + "epoch": 9.335969089765477, + "grad_norm": 9.448139190673828, + "learning_rate": 4.582089840881032e-05, + "loss": 0.4698, + "step": 207800 + }, + { + "epoch": 9.344954623056879, + "grad_norm": 22.13079071044922, + "learning_rate": 4.581308243058134e-05, + "loss": 0.4998, + "step": 208000 + }, + { + "epoch": 9.344954623056879, + "eval_loss": 2.5886385440826416, + "eval_runtime": 1087.2015, + "eval_samples_per_second": 9.11, + "eval_steps_per_second": 0.143, + "step": 
208000 + }, + { + "epoch": 9.35394015634828, + "grad_norm": 5.0014214515686035, + "learning_rate": 4.580525981816478e-05, + "loss": 0.4776, + "step": 208200 + }, + { + "epoch": 9.36292568963968, + "grad_norm": 14.449097633361816, + "learning_rate": 4.57974305740541e-05, + "loss": 0.496, + "step": 208400 + }, + { + "epoch": 9.37191122293108, + "grad_norm": 9.349239349365234, + "learning_rate": 4.5789594700744885e-05, + "loss": 0.4866, + "step": 208600 + }, + { + "epoch": 9.380896756222482, + "grad_norm": 11.318212509155273, + "learning_rate": 4.5781752200734826e-05, + "loss": 0.5278, + "step": 208800 + }, + { + "epoch": 9.389882289513883, + "grad_norm": 5.554197311401367, + "learning_rate": 4.5773903076523715e-05, + "loss": 0.5253, + "step": 209000 + }, + { + "epoch": 9.389882289513883, + "eval_loss": 2.599957227706909, + "eval_runtime": 1086.4312, + "eval_samples_per_second": 9.116, + "eval_steps_per_second": 0.143, + "step": 209000 + }, + { + "epoch": 9.398867822805283, + "grad_norm": 6.716334342956543, + "learning_rate": 4.5766047330613484e-05, + "loss": 0.5018, + "step": 209200 + }, + { + "epoch": 9.407853356096684, + "grad_norm": 6.921196937561035, + "learning_rate": 4.5758184965508145e-05, + "loss": 0.492, + "step": 209400 + }, + { + "epoch": 9.416838889388085, + "grad_norm": 9.290867805480957, + "learning_rate": 4.5750315983713845e-05, + "loss": 0.4961, + "step": 209600 + }, + { + "epoch": 9.425824422679487, + "grad_norm": 4.696343898773193, + "learning_rate": 4.574244038773881e-05, + "loss": 0.5124, + "step": 209800 + }, + { + "epoch": 9.434809955970888, + "grad_norm": 7.172698020935059, + "learning_rate": 4.5734558180093414e-05, + "loss": 0.5043, + "step": 210000 + }, + { + "epoch": 9.434809955970888, + "eval_loss": 2.5734574794769287, + "eval_runtime": 1084.6494, + "eval_samples_per_second": 9.131, + "eval_steps_per_second": 0.143, + "step": 210000 + }, + { + "epoch": 9.443795489262287, + "grad_norm": 11.656425476074219, + "learning_rate": 4.5726669363290106e-05, + "loss": 0.4677, + "step": 210200 + }, + { + "epoch": 9.452781022553689, + "grad_norm": 5.8166327476501465, + "learning_rate": 4.571877393984345e-05, + "loss": 0.5262, + "step": 210400 + }, + { + "epoch": 9.46176655584509, + "grad_norm": 9.039112091064453, + "learning_rate": 4.571087191227013e-05, + "loss": 0.4918, + "step": 210600 + }, + { + "epoch": 9.47075208913649, + "grad_norm": 5.360496520996094, + "learning_rate": 4.570296328308892e-05, + "loss": 0.4785, + "step": 210800 + }, + { + "epoch": 9.47973762242789, + "grad_norm": 3.4371631145477295, + "learning_rate": 4.569504805482069e-05, + "loss": 0.5008, + "step": 211000 + }, + { + "epoch": 9.47973762242789, + "eval_loss": 2.543621778488159, + "eval_runtime": 1079.1033, + "eval_samples_per_second": 9.178, + "eval_steps_per_second": 0.144, + "step": 211000 + }, + { + "epoch": 9.488723155719292, + "grad_norm": 34.157020568847656, + "learning_rate": 4.568712622998844e-05, + "loss": 0.4958, + "step": 211200 + }, + { + "epoch": 9.497708689010693, + "grad_norm": 15.599651336669922, + "learning_rate": 4.567919781111726e-05, + "loss": 0.4775, + "step": 211400 + }, + { + "epoch": 9.506694222302094, + "grad_norm": 7.594967842102051, + "learning_rate": 4.567126280073433e-05, + "loss": 0.4781, + "step": 211600 + }, + { + "epoch": 9.515679755593494, + "grad_norm": 6.20011043548584, + "learning_rate": 4.566332120136895e-05, + "loss": 0.5039, + "step": 211800 + }, + { + "epoch": 9.524665288884895, + "grad_norm": 3.579672336578369, + "learning_rate": 4.56553730155525e-05, + "loss": 
0.5192, + "step": 212000 + }, + { + "epoch": 9.524665288884895, + "eval_loss": 2.5792055130004883, + "eval_runtime": 1068.6939, + "eval_samples_per_second": 9.267, + "eval_steps_per_second": 0.145, + "step": 212000 + }, + { + "epoch": 9.533650822176297, + "grad_norm": 16.665241241455078, + "learning_rate": 4.564741824581848e-05, + "loss": 0.4815, + "step": 212200 + }, + { + "epoch": 9.542636355467698, + "grad_norm": 2.774914503097534, + "learning_rate": 4.563945689470247e-05, + "loss": 0.5013, + "step": 212400 + }, + { + "epoch": 9.551621888759097, + "grad_norm": 5.757125377655029, + "learning_rate": 4.563148896474218e-05, + "loss": 0.4649, + "step": 212600 + }, + { + "epoch": 9.560607422050499, + "grad_norm": 6.996931552886963, + "learning_rate": 4.562351445847737e-05, + "loss": 0.4774, + "step": 212800 + }, + { + "epoch": 9.5695929553419, + "grad_norm": 8.286883354187012, + "learning_rate": 4.561553337844994e-05, + "loss": 0.4759, + "step": 213000 + }, + { + "epoch": 9.5695929553419, + "eval_loss": 2.6342129707336426, + "eval_runtime": 1062.477, + "eval_samples_per_second": 9.322, + "eval_steps_per_second": 0.146, + "step": 213000 + }, + { + "epoch": 9.578578488633301, + "grad_norm": 16.222797393798828, + "learning_rate": 4.560754572720385e-05, + "loss": 0.4855, + "step": 213200 + }, + { + "epoch": 9.5875640219247, + "grad_norm": 3.249690532684326, + "learning_rate": 4.559955150728517e-05, + "loss": 0.4865, + "step": 213400 + }, + { + "epoch": 9.596549555216102, + "grad_norm": 1.507887601852417, + "learning_rate": 4.559155072124208e-05, + "loss": 0.4639, + "step": 213600 + }, + { + "epoch": 9.605535088507503, + "grad_norm": 5.217645645141602, + "learning_rate": 4.558354337162482e-05, + "loss": 0.4814, + "step": 213800 + }, + { + "epoch": 9.614520621798905, + "grad_norm": 8.47757339477539, + "learning_rate": 4.557552946098575e-05, + "loss": 0.4777, + "step": 214000 + }, + { + "epoch": 9.614520621798905, + "eval_loss": 2.528547525405884, + "eval_runtime": 1060.9351, + "eval_samples_per_second": 9.335, + "eval_steps_per_second": 0.146, + "step": 214000 + }, + { + "epoch": 9.623506155090304, + "grad_norm": 5.725880146026611, + "learning_rate": 4.556750899187932e-05, + "loss": 0.4685, + "step": 214200 + }, + { + "epoch": 9.632491688381705, + "grad_norm": 2.501408100128174, + "learning_rate": 4.555948196686204e-05, + "loss": 0.4731, + "step": 214400 + }, + { + "epoch": 9.641477221673107, + "grad_norm": 5.393123626708984, + "learning_rate": 4.555144838849253e-05, + "loss": 0.4806, + "step": 214600 + }, + { + "epoch": 9.650462754964508, + "grad_norm": 9.90498161315918, + "learning_rate": 4.5543408259331534e-05, + "loss": 0.5061, + "step": 214800 + }, + { + "epoch": 9.659448288255907, + "grad_norm": 9.07691764831543, + "learning_rate": 4.553536158194181e-05, + "loss": 0.5264, + "step": 215000 + }, + { + "epoch": 9.659448288255907, + "eval_loss": 2.618248462677002, + "eval_runtime": 1061.3051, + "eval_samples_per_second": 9.332, + "eval_steps_per_second": 0.146, + "step": 215000 + }, + { + "epoch": 9.668433821547309, + "grad_norm": 12.091426849365234, + "learning_rate": 4.552730835888827e-05, + "loss": 0.4808, + "step": 215200 + }, + { + "epoch": 9.67741935483871, + "grad_norm": 10.131613731384277, + "learning_rate": 4.551924859273786e-05, + "loss": 0.4742, + "step": 215400 + }, + { + "epoch": 9.686404888130111, + "grad_norm": 7.796463966369629, + "learning_rate": 4.551118228605966e-05, + "loss": 0.4831, + "step": 215600 + }, + { + "epoch": 9.69539042142151, + "grad_norm": 9.690413475036621, + 
"learning_rate": 4.550310944142481e-05, + "loss": 0.4876, + "step": 215800 + }, + { + "epoch": 9.704375954712912, + "grad_norm": 23.55455207824707, + "learning_rate": 4.549503006140653e-05, + "loss": 0.5262, + "step": 216000 + }, + { + "epoch": 9.704375954712912, + "eval_loss": 2.5615086555480957, + "eval_runtime": 1066.0893, + "eval_samples_per_second": 9.29, + "eval_steps_per_second": 0.145, + "step": 216000 + }, + { + "epoch": 9.713361488004313, + "grad_norm": 4.3534674644470215, + "learning_rate": 4.548694414858012e-05, + "loss": 0.4968, + "step": 216200 + }, + { + "epoch": 9.722347021295715, + "grad_norm": 2.0972509384155273, + "learning_rate": 4.5478851705523e-05, + "loss": 0.4623, + "step": 216400 + }, + { + "epoch": 9.731332554587114, + "grad_norm": 7.557238578796387, + "learning_rate": 4.547075273481461e-05, + "loss": 0.4959, + "step": 216600 + }, + { + "epoch": 9.740318087878515, + "grad_norm": 4.63540506362915, + "learning_rate": 4.546264723903652e-05, + "loss": 0.4961, + "step": 216800 + }, + { + "epoch": 9.749303621169917, + "grad_norm": 6.184654712677002, + "learning_rate": 4.545453522077237e-05, + "loss": 0.4631, + "step": 217000 + }, + { + "epoch": 9.749303621169917, + "eval_loss": 2.5767123699188232, + "eval_runtime": 1069.0714, + "eval_samples_per_second": 9.264, + "eval_steps_per_second": 0.145, + "step": 217000 + }, + { + "epoch": 9.758289154461318, + "grad_norm": 1.6774091720581055, + "learning_rate": 4.544641668260785e-05, + "loss": 0.4835, + "step": 217200 + }, + { + "epoch": 9.767274687752717, + "grad_norm": 13.404745101928711, + "learning_rate": 4.543829162713078e-05, + "loss": 0.4959, + "step": 217400 + }, + { + "epoch": 9.776260221044119, + "grad_norm": 6.530130386352539, + "learning_rate": 4.5430160056931004e-05, + "loss": 0.5029, + "step": 217600 + }, + { + "epoch": 9.78524575433552, + "grad_norm": 9.423506736755371, + "learning_rate": 4.5422021974600484e-05, + "loss": 0.4966, + "step": 217800 + }, + { + "epoch": 9.794231287626921, + "grad_norm": 12.464203834533691, + "learning_rate": 4.5413877382733226e-05, + "loss": 0.447, + "step": 218000 + }, + { + "epoch": 9.794231287626921, + "eval_loss": 2.601382255554199, + "eval_runtime": 1079.2743, + "eval_samples_per_second": 9.177, + "eval_steps_per_second": 0.144, + "step": 218000 + }, + { + "epoch": 9.80321682091832, + "grad_norm": 3.708329439163208, + "learning_rate": 4.540572628392534e-05, + "loss": 0.4721, + "step": 218200 + }, + { + "epoch": 9.812202354209722, + "grad_norm": 3.581702947616577, + "learning_rate": 4.539756868077498e-05, + "loss": 0.5079, + "step": 218400 + }, + { + "epoch": 9.821187887501123, + "grad_norm": 2.959970235824585, + "learning_rate": 4.53894045758824e-05, + "loss": 0.5195, + "step": 218600 + }, + { + "epoch": 9.830173420792525, + "grad_norm": 3.9296224117279053, + "learning_rate": 4.5381233971849915e-05, + "loss": 0.4751, + "step": 218800 + }, + { + "epoch": 9.839158954083924, + "grad_norm": 5.21635103225708, + "learning_rate": 4.53730568712819e-05, + "loss": 0.4505, + "step": 219000 + }, + { + "epoch": 9.839158954083924, + "eval_loss": 2.5183651447296143, + "eval_runtime": 1079.5489, + "eval_samples_per_second": 9.174, + "eval_steps_per_second": 0.144, + "step": 219000 + }, + { + "epoch": 9.848144487375325, + "grad_norm": 10.114027976989746, + "learning_rate": 4.536487327678484e-05, + "loss": 0.4909, + "step": 219200 + }, + { + "epoch": 9.857130020666727, + "grad_norm": 4.078984260559082, + "learning_rate": 4.535668319096723e-05, + "loss": 0.5135, + "step": 219400 + }, + { + "epoch": 
9.866115553958128, + "grad_norm": 9.926795959472656, + "learning_rate": 4.534848661643969e-05, + "loss": 0.5231, + "step": 219600 + }, + { + "epoch": 9.875101087249528, + "grad_norm": 6.326144218444824, + "learning_rate": 4.534028355581488e-05, + "loss": 0.5147, + "step": 219800 + }, + { + "epoch": 9.884086620540929, + "grad_norm": 7.665927410125732, + "learning_rate": 4.5332074011707515e-05, + "loss": 0.4863, + "step": 220000 + }, + { + "epoch": 9.884086620540929, + "eval_loss": 2.528228998184204, + "eval_runtime": 1079.0365, + "eval_samples_per_second": 9.179, + "eval_steps_per_second": 0.144, + "step": 220000 + }, + { + "epoch": 9.89307215383233, + "grad_norm": 13.316097259521484, + "learning_rate": 4.532385798673442e-05, + "loss": 0.517, + "step": 220200 + }, + { + "epoch": 9.902057687123731, + "grad_norm": 6.809960842132568, + "learning_rate": 4.531563548351444e-05, + "loss": 0.5025, + "step": 220400 + }, + { + "epoch": 9.91104322041513, + "grad_norm": 130.9669189453125, + "learning_rate": 4.530740650466852e-05, + "loss": 0.4974, + "step": 220600 + }, + { + "epoch": 9.920028753706532, + "grad_norm": 8.149009704589844, + "learning_rate": 4.529917105281964e-05, + "loss": 0.475, + "step": 220800 + }, + { + "epoch": 9.929014286997933, + "grad_norm": 9.56112289428711, + "learning_rate": 4.529092913059287e-05, + "loss": 0.5231, + "step": 221000 + }, + { + "epoch": 9.929014286997933, + "eval_loss": 2.5265750885009766, + "eval_runtime": 1080.8883, + "eval_samples_per_second": 9.163, + "eval_steps_per_second": 0.143, + "step": 221000 + }, + { + "epoch": 9.937999820289335, + "grad_norm": 2.8517773151397705, + "learning_rate": 4.5282680740615324e-05, + "loss": 0.447, + "step": 221200 + }, + { + "epoch": 9.946985353580734, + "grad_norm": 9.419743537902832, + "learning_rate": 4.527442588551618e-05, + "loss": 0.5271, + "step": 221400 + }, + { + "epoch": 9.955970886872135, + "grad_norm": 5.280923366546631, + "learning_rate": 4.5266164567926686e-05, + "loss": 0.4949, + "step": 221600 + }, + { + "epoch": 9.964956420163537, + "grad_norm": 2.162322521209717, + "learning_rate": 4.525789679048014e-05, + "loss": 0.5058, + "step": 221800 + }, + { + "epoch": 9.973941953454938, + "grad_norm": 12.884297370910645, + "learning_rate": 4.52496225558119e-05, + "loss": 0.4859, + "step": 222000 + }, + { + "epoch": 9.973941953454938, + "eval_loss": 2.5312891006469727, + "eval_runtime": 1083.1979, + "eval_samples_per_second": 9.143, + "eval_steps_per_second": 0.143, + "step": 222000 + }, + { + "epoch": 9.982927486746338, + "grad_norm": 12.709576606750488, + "learning_rate": 4.52413418665594e-05, + "loss": 0.504, + "step": 222200 + }, + { + "epoch": 9.991913020037739, + "grad_norm": 3.7961857318878174, + "learning_rate": 4.523305472536209e-05, + "loss": 0.4957, + "step": 222400 + }, + { + "epoch": 10.00089855332914, + "grad_norm": 9.928500175476074, + "learning_rate": 4.522476113486153e-05, + "loss": 0.497, + "step": 222600 + }, + { + "epoch": 10.009884086620541, + "grad_norm": 2.6933352947235107, + "learning_rate": 4.52164610977013e-05, + "loss": 0.4644, + "step": 222800 + }, + { + "epoch": 10.018869619911941, + "grad_norm": 2.5882034301757812, + "learning_rate": 4.520815461652704e-05, + "loss": 0.4717, + "step": 223000 + }, + { + "epoch": 10.018869619911941, + "eval_loss": 2.542062997817993, + "eval_runtime": 1081.4133, + "eval_samples_per_second": 9.158, + "eval_steps_per_second": 0.143, + "step": 223000 + }, + { + "epoch": 10.027855153203342, + "grad_norm": 1.036136269569397, + "learning_rate": 
4.5199841693986446e-05, + "loss": 0.4663, + "step": 223200 + }, + { + "epoch": 10.036840686494743, + "grad_norm": 3.3049538135528564, + "learning_rate": 4.5191522332729276e-05, + "loss": 0.4899, + "step": 223400 + }, + { + "epoch": 10.045826219786145, + "grad_norm": 3.9398066997528076, + "learning_rate": 4.518319653540733e-05, + "loss": 0.4902, + "step": 223600 + }, + { + "epoch": 10.054811753077544, + "grad_norm": 7.958073139190674, + "learning_rate": 4.517486430467446e-05, + "loss": 0.4853, + "step": 223800 + }, + { + "epoch": 10.063797286368946, + "grad_norm": 6.440467357635498, + "learning_rate": 4.516652564318658e-05, + "loss": 0.4674, + "step": 224000 + }, + { + "epoch": 10.063797286368946, + "eval_loss": 2.563239097595215, + "eval_runtime": 1080.8507, + "eval_samples_per_second": 9.163, + "eval_steps_per_second": 0.143, + "step": 224000 + }, + { + "epoch": 10.072782819660347, + "grad_norm": 3.7625374794006348, + "learning_rate": 4.5158180553601635e-05, + "loss": 0.4607, + "step": 224200 + }, + { + "epoch": 10.081768352951748, + "grad_norm": 2.02681303024292, + "learning_rate": 4.514982903857964e-05, + "loss": 0.4737, + "step": 224400 + }, + { + "epoch": 10.09075388624315, + "grad_norm": 15.780081748962402, + "learning_rate": 4.514147110078264e-05, + "loss": 0.4451, + "step": 224600 + }, + { + "epoch": 10.099739419534549, + "grad_norm": 9.11990737915039, + "learning_rate": 4.513310674287474e-05, + "loss": 0.4585, + "step": 224800 + }, + { + "epoch": 10.10872495282595, + "grad_norm": 19.485971450805664, + "learning_rate": 4.512473596752208e-05, + "loss": 0.4777, + "step": 225000 + }, + { + "epoch": 10.10872495282595, + "eval_loss": 2.589509963989258, + "eval_runtime": 1080.2805, + "eval_samples_per_second": 9.168, + "eval_steps_per_second": 0.143, + "step": 225000 + }, + { + "epoch": 10.117710486117351, + "grad_norm": 7.728920936584473, + "learning_rate": 4.511635877739285e-05, + "loss": 0.452, + "step": 225200 + }, + { + "epoch": 10.126696019408753, + "grad_norm": 6.3267412185668945, + "learning_rate": 4.51079751751573e-05, + "loss": 0.4296, + "step": 225400 + }, + { + "epoch": 10.135681552700152, + "grad_norm": 7.468375205993652, + "learning_rate": 4.50995851634877e-05, + "loss": 0.4678, + "step": 225600 + }, + { + "epoch": 10.144667085991554, + "grad_norm": 5.496447563171387, + "learning_rate": 4.509118874505837e-05, + "loss": 0.4364, + "step": 225800 + }, + { + "epoch": 10.153652619282955, + "grad_norm": 1.2194163799285889, + "learning_rate": 4.508278592254568e-05, + "loss": 0.4963, + "step": 226000 + }, + { + "epoch": 10.153652619282955, + "eval_loss": 2.564985513687134, + "eval_runtime": 1079.4368, + "eval_samples_per_second": 9.175, + "eval_steps_per_second": 0.144, + "step": 226000 + }, + { + "epoch": 10.162638152574356, + "grad_norm": 4.605660438537598, + "learning_rate": 4.507437669862804e-05, + "loss": 0.5033, + "step": 226200 + }, + { + "epoch": 10.171623685865756, + "grad_norm": 7.148728370666504, + "learning_rate": 4.5065961075985894e-05, + "loss": 0.46, + "step": 226400 + }, + { + "epoch": 10.180609219157157, + "grad_norm": 6.414613246917725, + "learning_rate": 4.505753905730173e-05, + "loss": 0.4905, + "step": 226600 + }, + { + "epoch": 10.189594752448558, + "grad_norm": 17.29862403869629, + "learning_rate": 4.504911064526007e-05, + "loss": 0.4554, + "step": 226800 + }, + { + "epoch": 10.19858028573996, + "grad_norm": 26.544200897216797, + "learning_rate": 4.504067584254748e-05, + "loss": 0.446, + "step": 227000 + }, + { + "epoch": 10.19858028573996, + "eval_loss": 
2.5394065380096436, + "eval_runtime": 1081.1162, + "eval_samples_per_second": 9.161, + "eval_steps_per_second": 0.143, + "step": 227000 + }, + { + "epoch": 10.207565819031359, + "grad_norm": 2.5992953777313232, + "learning_rate": 4.503223465185257e-05, + "loss": 0.4749, + "step": 227200 + }, + { + "epoch": 10.21655135232276, + "grad_norm": 5.341890811920166, + "learning_rate": 4.5023787075865955e-05, + "loss": 0.4482, + "step": 227400 + }, + { + "epoch": 10.225536885614162, + "grad_norm": 1.8888834714889526, + "learning_rate": 4.5015333117280324e-05, + "loss": 0.465, + "step": 227600 + }, + { + "epoch": 10.234522418905563, + "grad_norm": 7.757589817047119, + "learning_rate": 4.500687277879038e-05, + "loss": 0.4819, + "step": 227800 + }, + { + "epoch": 10.243507952196962, + "grad_norm": 8.244403839111328, + "learning_rate": 4.499840606309285e-05, + "loss": 0.4512, + "step": 228000 + }, + { + "epoch": 10.243507952196962, + "eval_loss": 2.5606801509857178, + "eval_runtime": 1079.9496, + "eval_samples_per_second": 9.171, + "eval_steps_per_second": 0.144, + "step": 228000 + }, + { + "epoch": 10.252493485488364, + "grad_norm": 9.635261535644531, + "learning_rate": 4.498993297288653e-05, + "loss": 0.4661, + "step": 228200 + }, + { + "epoch": 10.261479018779765, + "grad_norm": 0.8005920648574829, + "learning_rate": 4.498145351087221e-05, + "loss": 0.4503, + "step": 228400 + }, + { + "epoch": 10.270464552071166, + "grad_norm": 13.759466171264648, + "learning_rate": 4.497296767975273e-05, + "loss": 0.4807, + "step": 228600 + }, + { + "epoch": 10.279450085362566, + "grad_norm": 8.74666976928711, + "learning_rate": 4.496447548223295e-05, + "loss": 0.4259, + "step": 228800 + }, + { + "epoch": 10.288435618653967, + "grad_norm": 2.4805383682250977, + "learning_rate": 4.495597692101977e-05, + "loss": 0.4893, + "step": 229000 + }, + { + "epoch": 10.288435618653967, + "eval_loss": 2.536832809448242, + "eval_runtime": 1080.0948, + "eval_samples_per_second": 9.17, + "eval_steps_per_second": 0.144, + "step": 229000 + }, + { + "epoch": 10.297421151945368, + "grad_norm": 16.94227409362793, + "learning_rate": 4.494747199882212e-05, + "loss": 0.5009, + "step": 229200 + }, + { + "epoch": 10.30640668523677, + "grad_norm": 28.570947647094727, + "learning_rate": 4.4938960718350945e-05, + "loss": 0.4331, + "step": 229400 + }, + { + "epoch": 10.315392218528169, + "grad_norm": 9.431313514709473, + "learning_rate": 4.493044308231921e-05, + "loss": 0.4823, + "step": 229600 + }, + { + "epoch": 10.32437775181957, + "grad_norm": 6.612549304962158, + "learning_rate": 4.4921919093441944e-05, + "loss": 0.4985, + "step": 229800 + }, + { + "epoch": 10.333363285110972, + "grad_norm": 4.512430667877197, + "learning_rate": 4.4913388754436156e-05, + "loss": 0.4586, + "step": 230000 + }, + { + "epoch": 10.333363285110972, + "eval_loss": 2.5845720767974854, + "eval_runtime": 1086.1502, + "eval_samples_per_second": 9.118, + "eval_steps_per_second": 0.143, + "step": 230000 + }, + { + "epoch": 10.342348818402373, + "grad_norm": 8.223472595214844, + "learning_rate": 4.4904852068020906e-05, + "loss": 0.4548, + "step": 230200 + }, + { + "epoch": 10.351334351693772, + "grad_norm": 4.4741530418396, + "learning_rate": 4.4896309036917264e-05, + "loss": 0.4753, + "step": 230400 + }, + { + "epoch": 10.360319884985174, + "grad_norm": 8.382828712463379, + "learning_rate": 4.488775966384834e-05, + "loss": 0.4858, + "step": 230600 + }, + { + "epoch": 10.369305418276575, + "grad_norm": 5.764524459838867, + "learning_rate": 4.4879203951539246e-05, + 
"loss": 0.462, + "step": 230800 + }, + { + "epoch": 10.378290951567976, + "grad_norm": 9.164348602294922, + "learning_rate": 4.4870641902717126e-05, + "loss": 0.4565, + "step": 231000 + }, + { + "epoch": 10.378290951567976, + "eval_loss": 2.533195972442627, + "eval_runtime": 1076.7261, + "eval_samples_per_second": 9.198, + "eval_steps_per_second": 0.144, + "step": 231000 + }, + { + "epoch": 10.387276484859376, + "grad_norm": 7.0318732261657715, + "learning_rate": 4.486207352011113e-05, + "loss": 0.4456, + "step": 231200 + }, + { + "epoch": 10.396262018150777, + "grad_norm": 8.506872177124023, + "learning_rate": 4.4853498806452454e-05, + "loss": 0.4627, + "step": 231400 + }, + { + "epoch": 10.405247551442178, + "grad_norm": 8.952465057373047, + "learning_rate": 4.484491776447428e-05, + "loss": 0.4674, + "step": 231600 + }, + { + "epoch": 10.41423308473358, + "grad_norm": 56.0440559387207, + "learning_rate": 4.483633039691184e-05, + "loss": 0.4451, + "step": 231800 + }, + { + "epoch": 10.423218618024979, + "grad_norm": 2.9122977256774902, + "learning_rate": 4.4827736706502344e-05, + "loss": 0.4789, + "step": 232000 + }, + { + "epoch": 10.423218618024979, + "eval_loss": 2.555021286010742, + "eval_runtime": 1072.7806, + "eval_samples_per_second": 9.232, + "eval_steps_per_second": 0.144, + "step": 232000 + }, + { + "epoch": 10.43220415131638, + "grad_norm": 11.758764266967773, + "learning_rate": 4.481913669598505e-05, + "loss": 0.5142, + "step": 232200 + }, + { + "epoch": 10.441189684607782, + "grad_norm": 4.137763023376465, + "learning_rate": 4.481053036810121e-05, + "loss": 0.4642, + "step": 232400 + }, + { + "epoch": 10.450175217899183, + "grad_norm": 4.821073055267334, + "learning_rate": 4.4801917725594113e-05, + "loss": 0.4967, + "step": 232600 + }, + { + "epoch": 10.459160751190582, + "grad_norm": 3.3275232315063477, + "learning_rate": 4.4793298771209036e-05, + "loss": 0.4814, + "step": 232800 + }, + { + "epoch": 10.468146284481984, + "grad_norm": 10.877018928527832, + "learning_rate": 4.4784673507693284e-05, + "loss": 0.4652, + "step": 233000 + }, + { + "epoch": 10.468146284481984, + "eval_loss": 2.536766529083252, + "eval_runtime": 1073.3016, + "eval_samples_per_second": 9.228, + "eval_steps_per_second": 0.144, + "step": 233000 + }, + { + "epoch": 10.477131817773385, + "grad_norm": 10.973562240600586, + "learning_rate": 4.477604193779615e-05, + "loss": 0.4667, + "step": 233200 + }, + { + "epoch": 10.486117351064786, + "grad_norm": 6.547046661376953, + "learning_rate": 4.476740406426898e-05, + "loss": 0.4834, + "step": 233400 + }, + { + "epoch": 10.495102884356186, + "grad_norm": 11.464012145996094, + "learning_rate": 4.475875988986509e-05, + "loss": 0.4755, + "step": 233600 + }, + { + "epoch": 10.504088417647587, + "grad_norm": 4.013788223266602, + "learning_rate": 4.475010941733981e-05, + "loss": 0.4742, + "step": 233800 + }, + { + "epoch": 10.513073950938988, + "grad_norm": 0.9032938480377197, + "learning_rate": 4.474145264945049e-05, + "loss": 0.5054, + "step": 234000 + }, + { + "epoch": 10.513073950938988, + "eval_loss": 2.5643973350524902, + "eval_runtime": 1071.8884, + "eval_samples_per_second": 9.24, + "eval_steps_per_second": 0.145, + "step": 234000 + }, + { + "epoch": 10.52205948423039, + "grad_norm": 12.91777229309082, + "learning_rate": 4.47327895889565e-05, + "loss": 0.4666, + "step": 234200 + }, + { + "epoch": 10.53104501752179, + "grad_norm": 15.215625762939453, + "learning_rate": 4.472412023861917e-05, + "loss": 0.4704, + "step": 234400 + }, + { + "epoch": 
10.54003055081319, + "grad_norm": 8.357992172241211, + "learning_rate": 4.4715444601201884e-05, + "loss": 0.4887, + "step": 234600 + }, + { + "epoch": 10.549016084104592, + "grad_norm": 10.161919593811035, + "learning_rate": 4.470676267947e-05, + "loss": 0.4796, + "step": 234800 + }, + { + "epoch": 10.558001617395993, + "grad_norm": 14.575705528259277, + "learning_rate": 4.4698074476190885e-05, + "loss": 0.4384, + "step": 235000 + }, + { + "epoch": 10.558001617395993, + "eval_loss": 2.5507290363311768, + "eval_runtime": 1070.9659, + "eval_samples_per_second": 9.248, + "eval_steps_per_second": 0.145, + "step": 235000 + }, + { + "epoch": 10.566987150687392, + "grad_norm": 4.9642109870910645, + "learning_rate": 4.4689379994133915e-05, + "loss": 0.4849, + "step": 235200 + }, + { + "epoch": 10.575972683978794, + "grad_norm": 6.950181007385254, + "learning_rate": 4.468067923607047e-05, + "loss": 0.4751, + "step": 235400 + }, + { + "epoch": 10.584958217270195, + "grad_norm": 9.092172622680664, + "learning_rate": 4.4671972204773913e-05, + "loss": 0.4987, + "step": 235600 + }, + { + "epoch": 10.593943750561596, + "grad_norm": 2.7059104442596436, + "learning_rate": 4.466325890301963e-05, + "loss": 0.5025, + "step": 235800 + }, + { + "epoch": 10.602929283852998, + "grad_norm": 0.9468827247619629, + "learning_rate": 4.465453933358498e-05, + "loss": 0.449, + "step": 236000 + }, + { + "epoch": 10.602929283852998, + "eval_loss": 2.53763747215271, + "eval_runtime": 1070.9813, + "eval_samples_per_second": 9.248, + "eval_steps_per_second": 0.145, + "step": 236000 + }, + { + "epoch": 10.611914817144397, + "grad_norm": 6.531583309173584, + "learning_rate": 4.464581349924933e-05, + "loss": 0.513, + "step": 236200 + }, + { + "epoch": 10.620900350435798, + "grad_norm": 10.116623878479004, + "learning_rate": 4.4637081402794065e-05, + "loss": 0.4852, + "step": 236400 + }, + { + "epoch": 10.6298858837272, + "grad_norm": 6.903548240661621, + "learning_rate": 4.462834304700253e-05, + "loss": 0.4906, + "step": 236600 + }, + { + "epoch": 10.6388714170186, + "grad_norm": 14.256983757019043, + "learning_rate": 4.4619598434660103e-05, + "loss": 0.4823, + "step": 236800 + }, + { + "epoch": 10.64785695031, + "grad_norm": 4.879205703735352, + "learning_rate": 4.461084756855411e-05, + "loss": 0.4704, + "step": 237000 + }, + { + "epoch": 10.64785695031, + "eval_loss": 2.573296546936035, + "eval_runtime": 1070.7212, + "eval_samples_per_second": 9.25, + "eval_steps_per_second": 0.145, + "step": 237000 + }, + { + "epoch": 10.656842483601402, + "grad_norm": 7.068393230438232, + "learning_rate": 4.460209045147393e-05, + "loss": 0.4907, + "step": 237200 + }, + { + "epoch": 10.665828016892803, + "grad_norm": 9.679513931274414, + "learning_rate": 4.459332708621088e-05, + "loss": 0.458, + "step": 237400 + }, + { + "epoch": 10.674813550184204, + "grad_norm": 3.086480140686035, + "learning_rate": 4.458455747555829e-05, + "loss": 0.4512, + "step": 237600 + }, + { + "epoch": 10.683799083475604, + "grad_norm": 7.147046089172363, + "learning_rate": 4.4575781622311483e-05, + "loss": 0.4981, + "step": 237800 + }, + { + "epoch": 10.692784616767005, + "grad_norm": 7.950299263000488, + "learning_rate": 4.456699952926777e-05, + "loss": 0.5095, + "step": 238000 + }, + { + "epoch": 10.692784616767005, + "eval_loss": 2.5305910110473633, + "eval_runtime": 1069.7405, + "eval_samples_per_second": 9.258, + "eval_steps_per_second": 0.145, + "step": 238000 + }, + { + "epoch": 10.701770150058406, + "grad_norm": 7.476064205169678, + "learning_rate": 
4.455821119922646e-05, + "loss": 0.4871, + "step": 238200 + }, + { + "epoch": 10.710755683349806, + "grad_norm": 0.6263104677200317, + "learning_rate": 4.454941663498882e-05, + "loss": 0.487, + "step": 238400 + }, + { + "epoch": 10.719741216641207, + "grad_norm": 12.403650283813477, + "learning_rate": 4.4540615839358144e-05, + "loss": 0.4504, + "step": 238600 + }, + { + "epoch": 10.728726749932608, + "grad_norm": 4.677651882171631, + "learning_rate": 4.4531808815139685e-05, + "loss": 0.4703, + "step": 238800 + }, + { + "epoch": 10.73771228322401, + "grad_norm": 3.9398200511932373, + "learning_rate": 4.45229955651407e-05, + "loss": 0.4882, + "step": 239000 + }, + { + "epoch": 10.73771228322401, + "eval_loss": 2.5735087394714355, + "eval_runtime": 1071.2709, + "eval_samples_per_second": 9.245, + "eval_steps_per_second": 0.145, + "step": 239000 + }, + { + "epoch": 10.746697816515411, + "grad_norm": 7.807620525360107, + "learning_rate": 4.45141760921704e-05, + "loss": 0.4666, + "step": 239200 + }, + { + "epoch": 10.75568334980681, + "grad_norm": 3.5220091342926025, + "learning_rate": 4.450535039904001e-05, + "loss": 0.4507, + "step": 239400 + }, + { + "epoch": 10.764668883098212, + "grad_norm": 5.474115371704102, + "learning_rate": 4.4496518488562735e-05, + "loss": 0.5232, + "step": 239600 + }, + { + "epoch": 10.773654416389613, + "grad_norm": 3.3102242946624756, + "learning_rate": 4.448768036355374e-05, + "loss": 0.4838, + "step": 239800 + }, + { + "epoch": 10.782639949681014, + "grad_norm": 6.073796272277832, + "learning_rate": 4.447883602683019e-05, + "loss": 0.5051, + "step": 240000 + }, + { + "epoch": 10.782639949681014, + "eval_loss": 2.6252071857452393, + "eval_runtime": 1070.75, + "eval_samples_per_second": 9.25, + "eval_steps_per_second": 0.145, + "step": 240000 + }, + { + "epoch": 10.791625482972414, + "grad_norm": 11.76477336883545, + "learning_rate": 4.446998548121123e-05, + "loss": 0.4978, + "step": 240200 + }, + { + "epoch": 10.800611016263815, + "grad_norm": 9.04162311553955, + "learning_rate": 4.446112872951798e-05, + "loss": 0.4882, + "step": 240400 + }, + { + "epoch": 10.809596549555216, + "grad_norm": 7.809966564178467, + "learning_rate": 4.445226577457351e-05, + "loss": 0.4747, + "step": 240600 + }, + { + "epoch": 10.818582082846618, + "grad_norm": 10.286615371704102, + "learning_rate": 4.4443396619202936e-05, + "loss": 0.4706, + "step": 240800 + }, + { + "epoch": 10.827567616138017, + "grad_norm": 4.194571018218994, + "learning_rate": 4.4434521266233284e-05, + "loss": 0.4912, + "step": 241000 + }, + { + "epoch": 10.827567616138017, + "eval_loss": 2.5471911430358887, + "eval_runtime": 1122.4761, + "eval_samples_per_second": 8.823, + "eval_steps_per_second": 0.138, + "step": 241000 + }, + { + "epoch": 10.836553149429418, + "grad_norm": 8.166125297546387, + "learning_rate": 4.442563971849358e-05, + "loss": 0.4689, + "step": 241200 + }, + { + "epoch": 10.84553868272082, + "grad_norm": 0.8636496663093567, + "learning_rate": 4.441675197881483e-05, + "loss": 0.5064, + "step": 241400 + }, + { + "epoch": 10.854524216012221, + "grad_norm": 7.717101573944092, + "learning_rate": 4.440785805003002e-05, + "loss": 0.4968, + "step": 241600 + }, + { + "epoch": 10.86350974930362, + "grad_norm": 6.4478440284729, + "learning_rate": 4.439895793497407e-05, + "loss": 0.4771, + "step": 241800 + }, + { + "epoch": 10.872495282595022, + "grad_norm": 6.758020877838135, + "learning_rate": 4.439005163648393e-05, + "loss": 0.464, + "step": 242000 + }, + { + "epoch": 10.872495282595022, + "eval_loss": 
2.5376241207122803, + "eval_runtime": 1093.847, + "eval_samples_per_second": 9.054, + "eval_steps_per_second": 0.142, + "step": 242000 + }, + { + "epoch": 10.881480815886423, + "grad_norm": 3.514791488647461, + "learning_rate": 4.438113915739847e-05, + "loss": 0.4488, + "step": 242200 + }, + { + "epoch": 10.890466349177824, + "grad_norm": 5.87647008895874, + "learning_rate": 4.437222050055855e-05, + "loss": 0.4547, + "step": 242400 + }, + { + "epoch": 10.899451882469224, + "grad_norm": 7.898502826690674, + "learning_rate": 4.4363295668807006e-05, + "loss": 0.5082, + "step": 242600 + }, + { + "epoch": 10.908437415760625, + "grad_norm": 23.251298904418945, + "learning_rate": 4.435436466498863e-05, + "loss": 0.5251, + "step": 242800 + }, + { + "epoch": 10.917422949052026, + "grad_norm": 12.48715877532959, + "learning_rate": 4.4345427491950194e-05, + "loss": 0.5158, + "step": 243000 + }, + { + "epoch": 10.917422949052026, + "eval_loss": 2.5292649269104004, + "eval_runtime": 1091.8273, + "eval_samples_per_second": 9.071, + "eval_steps_per_second": 0.142, + "step": 243000 + }, + { + "epoch": 10.926408482343428, + "grad_norm": 4.933159351348877, + "learning_rate": 4.433648415254043e-05, + "loss": 0.4988, + "step": 243200 + }, + { + "epoch": 10.935394015634827, + "grad_norm": 8.043121337890625, + "learning_rate": 4.432753464961003e-05, + "loss": 0.4807, + "step": 243400 + }, + { + "epoch": 10.944379548926229, + "grad_norm": 5.658725738525391, + "learning_rate": 4.431857898601166e-05, + "loss": 0.5186, + "step": 243600 + }, + { + "epoch": 10.95336508221763, + "grad_norm": 4.071963787078857, + "learning_rate": 4.4309617164599935e-05, + "loss": 0.4554, + "step": 243800 + }, + { + "epoch": 10.962350615509031, + "grad_norm": 11.117284774780273, + "learning_rate": 4.430064918823146e-05, + "loss": 0.4819, + "step": 244000 + }, + { + "epoch": 10.962350615509031, + "eval_loss": 2.524524211883545, + "eval_runtime": 1093.0541, + "eval_samples_per_second": 9.061, + "eval_steps_per_second": 0.142, + "step": 244000 + }, + { + "epoch": 10.97133614880043, + "grad_norm": 2.5072007179260254, + "learning_rate": 4.429167505976477e-05, + "loss": 0.462, + "step": 244200 + }, + { + "epoch": 10.980321682091832, + "grad_norm": 0.8460531830787659, + "learning_rate": 4.428269478206038e-05, + "loss": 0.4288, + "step": 244400 + }, + { + "epoch": 10.989307215383233, + "grad_norm": 14.47143840789795, + "learning_rate": 4.4273708357980767e-05, + "loss": 0.5106, + "step": 244600 + }, + { + "epoch": 10.998292748674634, + "grad_norm": 7.705573558807373, + "learning_rate": 4.426471579039037e-05, + "loss": 0.4879, + "step": 244800 + }, + { + "epoch": 11.007278281966034, + "grad_norm": 2.811030626296997, + "learning_rate": 4.4255717082155545e-05, + "loss": 0.4478, + "step": 245000 + }, + { + "epoch": 11.007278281966034, + "eval_loss": 2.5267140865325928, + "eval_runtime": 1093.249, + "eval_samples_per_second": 9.059, + "eval_steps_per_second": 0.142, + "step": 245000 + }, + { + "epoch": 11.016263815257435, + "grad_norm": 2.7444190979003906, + "learning_rate": 4.424671223614466e-05, + "loss": 0.4124, + "step": 245200 + }, + { + "epoch": 11.025249348548837, + "grad_norm": 4.81060266494751, + "learning_rate": 4.423770125522802e-05, + "loss": 0.4267, + "step": 245400 + }, + { + "epoch": 11.034234881840238, + "grad_norm": 8.938187599182129, + "learning_rate": 4.4228684142277874e-05, + "loss": 0.4374, + "step": 245600 + }, + { + "epoch": 11.043220415131637, + "grad_norm": 2.805171012878418, + "learning_rate": 4.421966090016844e-05, + 
"loss": 0.4774, + "step": 245800 + }, + { + "epoch": 11.052205948423039, + "grad_norm": 0.964135468006134, + "learning_rate": 4.421063153177588e-05, + "loss": 0.4706, + "step": 246000 + }, + { + "epoch": 11.052205948423039, + "eval_loss": 2.5728235244750977, + "eval_runtime": 1091.2334, + "eval_samples_per_second": 9.076, + "eval_steps_per_second": 0.142, + "step": 246000 + }, + { + "epoch": 11.06119148171444, + "grad_norm": 14.399362564086914, + "learning_rate": 4.420159603997832e-05, + "loss": 0.4882, + "step": 246200 + }, + { + "epoch": 11.070177015005841, + "grad_norm": 10.316938400268555, + "learning_rate": 4.4192554427655824e-05, + "loss": 0.4716, + "step": 246400 + }, + { + "epoch": 11.07916254829724, + "grad_norm": 6.025542259216309, + "learning_rate": 4.418350669769041e-05, + "loss": 0.4675, + "step": 246600 + }, + { + "epoch": 11.088148081588642, + "grad_norm": 4.75909948348999, + "learning_rate": 4.417445285296606e-05, + "loss": 0.4213, + "step": 246800 + }, + { + "epoch": 11.097133614880043, + "grad_norm": 1.9783635139465332, + "learning_rate": 4.416539289636869e-05, + "loss": 0.4627, + "step": 247000 + }, + { + "epoch": 11.097133614880043, + "eval_loss": 2.543732166290283, + "eval_runtime": 1092.6379, + "eval_samples_per_second": 9.064, + "eval_steps_per_second": 0.142, + "step": 247000 + }, + { + "epoch": 11.106119148171445, + "grad_norm": 15.855208396911621, + "learning_rate": 4.415632683078615e-05, + "loss": 0.4413, + "step": 247200 + }, + { + "epoch": 11.115104681462844, + "grad_norm": 10.875030517578125, + "learning_rate": 4.41472546591083e-05, + "loss": 0.462, + "step": 247400 + }, + { + "epoch": 11.124090214754245, + "grad_norm": 12.176704406738281, + "learning_rate": 4.413817638422686e-05, + "loss": 0.4606, + "step": 247600 + }, + { + "epoch": 11.133075748045647, + "grad_norm": 9.033163070678711, + "learning_rate": 4.412909200903555e-05, + "loss": 0.4772, + "step": 247800 + }, + { + "epoch": 11.142061281337048, + "grad_norm": 3.4691646099090576, + "learning_rate": 4.4120001536430045e-05, + "loss": 0.4675, + "step": 248000 + }, + { + "epoch": 11.142061281337048, + "eval_loss": 2.5572187900543213, + "eval_runtime": 1093.238, + "eval_samples_per_second": 9.059, + "eval_steps_per_second": 0.142, + "step": 248000 + }, + { + "epoch": 11.151046814628447, + "grad_norm": 5.028947830200195, + "learning_rate": 4.411090496930791e-05, + "loss": 0.4654, + "step": 248200 + }, + { + "epoch": 11.160032347919849, + "grad_norm": 13.782191276550293, + "learning_rate": 4.410180231056869e-05, + "loss": 0.4893, + "step": 248400 + }, + { + "epoch": 11.16901788121125, + "grad_norm": 18.2941837310791, + "learning_rate": 4.4092693563113886e-05, + "loss": 0.4495, + "step": 248600 + }, + { + "epoch": 11.178003414502651, + "grad_norm": 3.19677734375, + "learning_rate": 4.40835787298469e-05, + "loss": 0.4599, + "step": 248800 + }, + { + "epoch": 11.18698894779405, + "grad_norm": 5.5048956871032715, + "learning_rate": 4.4074457813673085e-05, + "loss": 0.4923, + "step": 249000 + }, + { + "epoch": 11.18698894779405, + "eval_loss": 2.5093724727630615, + "eval_runtime": 1090.7596, + "eval_samples_per_second": 9.08, + "eval_steps_per_second": 0.142, + "step": 249000 + }, + { + "epoch": 11.195974481085452, + "grad_norm": 6.13324499130249, + "learning_rate": 4.406533081749976e-05, + "loss": 0.4531, + "step": 249200 + }, + { + "epoch": 11.204960014376853, + "grad_norm": 7.9370012283325195, + "learning_rate": 4.4056197744236146e-05, + "loss": 0.471, + "step": 249400 + }, + { + "epoch": 11.213945547668255, + 
"grad_norm": 8.390715599060059, + "learning_rate": 4.404705859679345e-05, + "loss": 0.4765, + "step": 249600 + }, + { + "epoch": 11.222931080959654, + "grad_norm": 5.003363609313965, + "learning_rate": 4.403791337808474e-05, + "loss": 0.4939, + "step": 249800 + }, + { + "epoch": 11.231916614251055, + "grad_norm": 27.854265213012695, + "learning_rate": 4.4028762091025085e-05, + "loss": 0.4676, + "step": 250000 + }, + { + "epoch": 11.231916614251055, + "eval_loss": 2.5488498210906982, + "eval_runtime": 1093.4053, + "eval_samples_per_second": 9.058, + "eval_steps_per_second": 0.142, + "step": 250000 + }, + { + "epoch": 11.240902147542457, + "grad_norm": 20.608421325683594, + "learning_rate": 4.401960473853146e-05, + "loss": 0.4464, + "step": 250200 + }, + { + "epoch": 11.249887680833858, + "grad_norm": 2.9301233291625977, + "learning_rate": 4.401044132352279e-05, + "loss": 0.4746, + "step": 250400 + }, + { + "epoch": 11.25887321412526, + "grad_norm": 13.66663646697998, + "learning_rate": 4.400127184891991e-05, + "loss": 0.474, + "step": 250600 + }, + { + "epoch": 11.267858747416659, + "grad_norm": 19.16084098815918, + "learning_rate": 4.399209631764559e-05, + "loss": 0.4846, + "step": 250800 + }, + { + "epoch": 11.27684428070806, + "grad_norm": 5.497101306915283, + "learning_rate": 4.398291473262456e-05, + "loss": 0.4921, + "step": 251000 + }, + { + "epoch": 11.27684428070806, + "eval_loss": 2.606623411178589, + "eval_runtime": 1091.4454, + "eval_samples_per_second": 9.074, + "eval_steps_per_second": 0.142, + "step": 251000 + }, + { + "epoch": 11.285829813999461, + "grad_norm": 16.50528335571289, + "learning_rate": 4.397372709678344e-05, + "loss": 0.4951, + "step": 251200 + }, + { + "epoch": 11.294815347290863, + "grad_norm": 3.4211204051971436, + "learning_rate": 4.3964533413050805e-05, + "loss": 0.4456, + "step": 251400 + }, + { + "epoch": 11.303800880582262, + "grad_norm": 4.113375186920166, + "learning_rate": 4.3955333684357145e-05, + "loss": 0.4471, + "step": 251600 + }, + { + "epoch": 11.312786413873663, + "grad_norm": 6.673891067504883, + "learning_rate": 4.3946127913634894e-05, + "loss": 0.5014, + "step": 251800 + }, + { + "epoch": 11.321771947165065, + "grad_norm": 16.668277740478516, + "learning_rate": 4.393691610381838e-05, + "loss": 0.4654, + "step": 252000 + }, + { + "epoch": 11.321771947165065, + "eval_loss": 2.590348243713379, + "eval_runtime": 1090.7216, + "eval_samples_per_second": 9.08, + "eval_steps_per_second": 0.142, + "step": 252000 + }, + { + "epoch": 11.330757480456466, + "grad_norm": 8.572153091430664, + "learning_rate": 4.392769825784389e-05, + "loss": 0.4574, + "step": 252200 + }, + { + "epoch": 11.339743013747865, + "grad_norm": 14.801168441772461, + "learning_rate": 4.391847437864961e-05, + "loss": 0.4844, + "step": 252400 + }, + { + "epoch": 11.348728547039267, + "grad_norm": 10.526625633239746, + "learning_rate": 4.390924446917566e-05, + "loss": 0.4687, + "step": 252600 + }, + { + "epoch": 11.357714080330668, + "grad_norm": 4.2288126945495605, + "learning_rate": 4.390000853236409e-05, + "loss": 0.4693, + "step": 252800 + }, + { + "epoch": 11.36669961362207, + "grad_norm": 4.500141143798828, + "learning_rate": 4.389076657115886e-05, + "loss": 0.4602, + "step": 253000 + }, + { + "epoch": 11.36669961362207, + "eval_loss": 2.5286338329315186, + "eval_runtime": 1088.5161, + "eval_samples_per_second": 9.099, + "eval_steps_per_second": 0.142, + "step": 253000 + }, + { + "epoch": 11.375685146913469, + "grad_norm": 4.990228176116943, + "learning_rate": 
4.3881518588505846e-05, + "loss": 0.4347, + "step": 253200 + }, + { + "epoch": 11.38467068020487, + "grad_norm": 2.7549238204956055, + "learning_rate": 4.3872264587352864e-05, + "loss": 0.445, + "step": 253400 + }, + { + "epoch": 11.393656213496271, + "grad_norm": 4.3550519943237305, + "learning_rate": 4.3863004570649614e-05, + "loss": 0.4574, + "step": 253600 + }, + { + "epoch": 11.402641746787673, + "grad_norm": 2.8987128734588623, + "learning_rate": 4.385373854134775e-05, + "loss": 0.4668, + "step": 253800 + }, + { + "epoch": 11.411627280079072, + "grad_norm": 11.990416526794434, + "learning_rate": 4.384446650240082e-05, + "loss": 0.4634, + "step": 254000 + }, + { + "epoch": 11.411627280079072, + "eval_loss": 2.5327000617980957, + "eval_runtime": 1087.7639, + "eval_samples_per_second": 9.105, + "eval_steps_per_second": 0.142, + "step": 254000 + }, + { + "epoch": 11.420612813370473, + "grad_norm": 11.864954948425293, + "learning_rate": 4.38351884567643e-05, + "loss": 0.4627, + "step": 254200 + }, + { + "epoch": 11.429598346661875, + "grad_norm": 8.507243156433105, + "learning_rate": 4.3825904407395574e-05, + "loss": 0.4492, + "step": 254400 + }, + { + "epoch": 11.438583879953276, + "grad_norm": 3.335512399673462, + "learning_rate": 4.3816614357253935e-05, + "loss": 0.5134, + "step": 254600 + }, + { + "epoch": 11.447569413244675, + "grad_norm": 9.387479782104492, + "learning_rate": 4.38073183093006e-05, + "loss": 0.4559, + "step": 254800 + }, + { + "epoch": 11.456554946536077, + "grad_norm": 8.435622215270996, + "learning_rate": 4.379801626649869e-05, + "loss": 0.4588, + "step": 255000 + }, + { + "epoch": 11.456554946536077, + "eval_loss": 2.593653917312622, + "eval_runtime": 1084.7817, + "eval_samples_per_second": 9.13, + "eval_steps_per_second": 0.143, + "step": 255000 + }, + { + "epoch": 11.465540479827478, + "grad_norm": 1.6870744228363037, + "learning_rate": 4.378870823181323e-05, + "loss": 0.4554, + "step": 255200 + }, + { + "epoch": 11.47452601311888, + "grad_norm": 6.257181644439697, + "learning_rate": 4.3779394208211174e-05, + "loss": 0.4805, + "step": 255400 + }, + { + "epoch": 11.483511546410279, + "grad_norm": 2.434807062149048, + "learning_rate": 4.3770074198661385e-05, + "loss": 0.4651, + "step": 255600 + }, + { + "epoch": 11.49249707970168, + "grad_norm": 3.8635079860687256, + "learning_rate": 4.37607482061346e-05, + "loss": 0.4393, + "step": 255800 + }, + { + "epoch": 11.501482612993081, + "grad_norm": 16.132322311401367, + "learning_rate": 4.37514162336035e-05, + "loss": 0.483, + "step": 256000 + }, + { + "epoch": 11.501482612993081, + "eval_loss": 2.567880153656006, + "eval_runtime": 1085.3827, + "eval_samples_per_second": 9.125, + "eval_steps_per_second": 0.143, + "step": 256000 + }, + { + "epoch": 11.510468146284483, + "grad_norm": 18.950214385986328, + "learning_rate": 4.374207828404267e-05, + "loss": 0.4645, + "step": 256200 + }, + { + "epoch": 11.519453679575882, + "grad_norm": 30.078716278076172, + "learning_rate": 4.373273436042857e-05, + "loss": 0.4436, + "step": 256400 + }, + { + "epoch": 11.528439212867283, + "grad_norm": 11.811574935913086, + "learning_rate": 4.3723384465739594e-05, + "loss": 0.4611, + "step": 256600 + }, + { + "epoch": 11.537424746158685, + "grad_norm": 7.034965515136719, + "learning_rate": 4.371402860295601e-05, + "loss": 0.4889, + "step": 256800 + }, + { + "epoch": 11.546410279450086, + "grad_norm": 12.620630264282227, + "learning_rate": 4.3704666775060045e-05, + "loss": 0.4649, + "step": 257000 + }, + { + "epoch": 11.546410279450086, + 
"eval_loss": 2.515794038772583, + "eval_runtime": 1084.1853, + "eval_samples_per_second": 9.135, + "eval_steps_per_second": 0.143, + "step": 257000 + }, + { + "epoch": 11.555395812741486, + "grad_norm": 2.5326550006866455, + "learning_rate": 4.369529898503576e-05, + "loss": 0.4934, + "step": 257200 + }, + { + "epoch": 11.564381346032887, + "grad_norm": 8.968504905700684, + "learning_rate": 4.3685925235869155e-05, + "loss": 0.4643, + "step": 257400 + }, + { + "epoch": 11.573366879324288, + "grad_norm": 3.6532328128814697, + "learning_rate": 4.367654553054811e-05, + "loss": 0.4552, + "step": 257600 + }, + { + "epoch": 11.58235241261569, + "grad_norm": 14.925705909729004, + "learning_rate": 4.3667159872062434e-05, + "loss": 0.4879, + "step": 257800 + }, + { + "epoch": 11.591337945907089, + "grad_norm": 4.690251350402832, + "learning_rate": 4.36577682634038e-05, + "loss": 0.4709, + "step": 258000 + }, + { + "epoch": 11.591337945907089, + "eval_loss": 2.600820541381836, + "eval_runtime": 1083.5624, + "eval_samples_per_second": 9.14, + "eval_steps_per_second": 0.143, + "step": 258000 + }, + { + "epoch": 11.60032347919849, + "grad_norm": 14.12942123413086, + "learning_rate": 4.3648370707565786e-05, + "loss": 0.4925, + "step": 258200 + }, + { + "epoch": 11.609309012489891, + "grad_norm": 10.568379402160645, + "learning_rate": 4.363896720754389e-05, + "loss": 0.4636, + "step": 258400 + }, + { + "epoch": 11.618294545781293, + "grad_norm": 6.521212100982666, + "learning_rate": 4.362955776633546e-05, + "loss": 0.5114, + "step": 258600 + }, + { + "epoch": 11.627280079072692, + "grad_norm": 5.636810302734375, + "learning_rate": 4.362014238693979e-05, + "loss": 0.4439, + "step": 258800 + }, + { + "epoch": 11.636265612364094, + "grad_norm": 9.390134811401367, + "learning_rate": 4.361072107235803e-05, + "loss": 0.4771, + "step": 259000 + }, + { + "epoch": 11.636265612364094, + "eval_loss": 2.567819118499756, + "eval_runtime": 1083.8444, + "eval_samples_per_second": 9.138, + "eval_steps_per_second": 0.143, + "step": 259000 + }, + { + "epoch": 11.645251145655495, + "grad_norm": 6.163935661315918, + "learning_rate": 4.360129382559323e-05, + "loss": 0.4715, + "step": 259200 + }, + { + "epoch": 11.654236678946896, + "grad_norm": 8.139466285705566, + "learning_rate": 4.359186064965032e-05, + "loss": 0.4934, + "step": 259400 + }, + { + "epoch": 11.663222212238296, + "grad_norm": 19.77556610107422, + "learning_rate": 4.358242154753615e-05, + "loss": 0.4945, + "step": 259600 + }, + { + "epoch": 11.672207745529697, + "grad_norm": 1.9366395473480225, + "learning_rate": 4.357297652225943e-05, + "loss": 0.4604, + "step": 259800 + }, + { + "epoch": 11.681193278821098, + "grad_norm": 5.113880157470703, + "learning_rate": 4.356352557683079e-05, + "loss": 0.4671, + "step": 260000 + }, + { + "epoch": 11.681193278821098, + "eval_loss": 2.564166307449341, + "eval_runtime": 1084.7483, + "eval_samples_per_second": 9.13, + "eval_steps_per_second": 0.143, + "step": 260000 + }, + { + "epoch": 11.6901788121125, + "grad_norm": 1.103203535079956, + "learning_rate": 4.355406871426271e-05, + "loss": 0.4809, + "step": 260200 + }, + { + "epoch": 11.699164345403899, + "grad_norm": 3.9322304725646973, + "learning_rate": 4.3544605937569585e-05, + "loss": 0.5147, + "step": 260400 + }, + { + "epoch": 11.7081498786953, + "grad_norm": 14.528691291809082, + "learning_rate": 4.353513724976765e-05, + "loss": 0.46, + "step": 260600 + }, + { + "epoch": 11.717135411986701, + "grad_norm": 4.72658634185791, + "learning_rate": 4.3525662653875105e-05, + 
"loss": 0.5064, + "step": 260800 + }, + { + "epoch": 11.726120945278103, + "grad_norm": 1.3560961484909058, + "learning_rate": 4.351618215291196e-05, + "loss": 0.4535, + "step": 261000 + }, + { + "epoch": 11.726120945278103, + "eval_loss": 2.5357089042663574, + "eval_runtime": 1084.1462, + "eval_samples_per_second": 9.135, + "eval_steps_per_second": 0.143, + "step": 261000 + }, + { + "epoch": 11.735106478569502, + "grad_norm": 14.868110656738281, + "learning_rate": 4.350669574990013e-05, + "loss": 0.4626, + "step": 261200 + }, + { + "epoch": 11.744092011860904, + "grad_norm": 5.739045143127441, + "learning_rate": 4.3497203447863415e-05, + "loss": 0.5111, + "step": 261400 + }, + { + "epoch": 11.753077545152305, + "grad_norm": 7.391199111938477, + "learning_rate": 4.34877052498275e-05, + "loss": 0.485, + "step": 261600 + }, + { + "epoch": 11.762063078443706, + "grad_norm": 7.108745098114014, + "learning_rate": 4.347820115881994e-05, + "loss": 0.4663, + "step": 261800 + }, + { + "epoch": 11.771048611735107, + "grad_norm": 15.372479438781738, + "learning_rate": 4.346869117787018e-05, + "loss": 0.4235, + "step": 262000 + }, + { + "epoch": 11.771048611735107, + "eval_loss": 2.5822150707244873, + "eval_runtime": 1083.6043, + "eval_samples_per_second": 9.14, + "eval_steps_per_second": 0.143, + "step": 262000 + }, + { + "epoch": 11.780034145026507, + "grad_norm": 4.675400257110596, + "learning_rate": 4.345917531000952e-05, + "loss": 0.5049, + "step": 262200 + }, + { + "epoch": 11.789019678317908, + "grad_norm": 7.368799209594727, + "learning_rate": 4.344965355827117e-05, + "loss": 0.4666, + "step": 262400 + }, + { + "epoch": 11.79800521160931, + "grad_norm": 24.108701705932617, + "learning_rate": 4.344012592569018e-05, + "loss": 0.4994, + "step": 262600 + }, + { + "epoch": 11.806990744900709, + "grad_norm": 3.419159412384033, + "learning_rate": 4.34305924153035e-05, + "loss": 0.473, + "step": 262800 + }, + { + "epoch": 11.81597627819211, + "grad_norm": 29.086864471435547, + "learning_rate": 4.3421053030149936e-05, + "loss": 0.4757, + "step": 263000 + }, + { + "epoch": 11.81597627819211, + "eval_loss": 2.5641908645629883, + "eval_runtime": 1084.8454, + "eval_samples_per_second": 9.129, + "eval_steps_per_second": 0.143, + "step": 263000 + }, + { + "epoch": 11.824961811483512, + "grad_norm": 11.448222160339355, + "learning_rate": 4.341150777327019e-05, + "loss": 0.4729, + "step": 263200 + }, + { + "epoch": 11.833947344774913, + "grad_norm": 4.488698482513428, + "learning_rate": 4.34019566477068e-05, + "loss": 0.4513, + "step": 263400 + }, + { + "epoch": 11.842932878066314, + "grad_norm": 2.3001222610473633, + "learning_rate": 4.3392399656504214e-05, + "loss": 0.4475, + "step": 263600 + }, + { + "epoch": 11.851918411357714, + "grad_norm": 6.0910844802856445, + "learning_rate": 4.3382836802708715e-05, + "loss": 0.5439, + "step": 263800 + }, + { + "epoch": 11.860903944649115, + "grad_norm": 4.601564407348633, + "learning_rate": 4.337326808936848e-05, + "loss": 0.4688, + "step": 264000 + }, + { + "epoch": 11.860903944649115, + "eval_loss": 2.945237874984741, + "eval_runtime": 1100.7652, + "eval_samples_per_second": 8.997, + "eval_steps_per_second": 0.091, + "step": 264000 + }, + { + "epoch": 11.869889477940516, + "grad_norm": 5.200575828552246, + "learning_rate": 4.336369351953354e-05, + "loss": 0.4502, + "step": 264200 + }, + { + "epoch": 11.878875011231916, + "grad_norm": 0.4828265905380249, + "learning_rate": 4.335411309625581e-05, + "loss": 0.4914, + "step": 264400 + }, + { + "epoch": 
11.887860544523317, + "grad_norm": 6.368671894073486, + "learning_rate": 4.334452682258905e-05, + "loss": 0.47, + "step": 264600 + }, + { + "epoch": 11.896846077814718, + "grad_norm": 11.522847175598145, + "learning_rate": 4.333493470158888e-05, + "loss": 0.4316, + "step": 264800 + }, + { + "epoch": 11.90583161110612, + "grad_norm": 5.565563678741455, + "learning_rate": 4.3325336736312814e-05, + "loss": 0.5091, + "step": 265000 + }, + { + "epoch": 11.90583161110612, + "eval_loss": 2.9430134296417236, + "eval_runtime": 1099.498, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.091, + "step": 265000 + }, + { + "epoch": 11.91481714439752, + "grad_norm": 2.104519844055176, + "learning_rate": 4.331573292982021e-05, + "loss": 0.4338, + "step": 265200 + }, + { + "epoch": 11.92380267768892, + "grad_norm": 5.740574836730957, + "learning_rate": 4.3306123285172275e-05, + "loss": 0.4399, + "step": 265400 + }, + { + "epoch": 11.932788210980322, + "grad_norm": 5.429746150970459, + "learning_rate": 4.329650780543211e-05, + "loss": 0.479, + "step": 265600 + }, + { + "epoch": 11.941773744271723, + "grad_norm": 1.9795042276382446, + "learning_rate": 4.328688649366465e-05, + "loss": 0.4407, + "step": 265800 + }, + { + "epoch": 11.950759277563124, + "grad_norm": 7.313149452209473, + "learning_rate": 4.327725935293668e-05, + "loss": 0.4642, + "step": 266000 + }, + { + "epoch": 11.950759277563124, + "eval_loss": 3.0007801055908203, + "eval_runtime": 1098.5023, + "eval_samples_per_second": 9.016, + "eval_steps_per_second": 0.091, + "step": 266000 + }, + { + "epoch": 11.959744810854524, + "grad_norm": 3.4922845363616943, + "learning_rate": 4.3267626386316884e-05, + "loss": 0.4454, + "step": 266200 + }, + { + "epoch": 11.968730344145925, + "grad_norm": 20.564990997314453, + "learning_rate": 4.325798759687577e-05, + "loss": 0.4763, + "step": 266400 + }, + { + "epoch": 11.977715877437326, + "grad_norm": 15.71061897277832, + "learning_rate": 4.324834298768571e-05, + "loss": 0.4989, + "step": 266600 + }, + { + "epoch": 11.986701410728728, + "grad_norm": 5.444253921508789, + "learning_rate": 4.323869256182092e-05, + "loss": 0.4474, + "step": 266800 + }, + { + "epoch": 11.995686944020127, + "grad_norm": 7.9454216957092285, + "learning_rate": 4.3229036322357505e-05, + "loss": 0.4415, + "step": 267000 + }, + { + "epoch": 11.995686944020127, + "eval_loss": 2.9907069206237793, + "eval_runtime": 1098.2527, + "eval_samples_per_second": 9.018, + "eval_steps_per_second": 0.091, + "step": 267000 + }, + { + "epoch": 12.004672477311528, + "grad_norm": 10.628538131713867, + "learning_rate": 4.3219374272373375e-05, + "loss": 0.4892, + "step": 267200 + }, + { + "epoch": 12.01365801060293, + "grad_norm": 11.927538871765137, + "learning_rate": 4.3209706414948326e-05, + "loss": 0.4157, + "step": 267400 + }, + { + "epoch": 12.02264354389433, + "grad_norm": 4.5106682777404785, + "learning_rate": 4.3200032753164004e-05, + "loss": 0.4235, + "step": 267600 + }, + { + "epoch": 12.03162907718573, + "grad_norm": 9.342924118041992, + "learning_rate": 4.319035329010389e-05, + "loss": 0.4333, + "step": 267800 + }, + { + "epoch": 12.040614610477132, + "grad_norm": 5.0819244384765625, + "learning_rate": 4.3180668028853314e-05, + "loss": 0.4374, + "step": 268000 + }, + { + "epoch": 12.040614610477132, + "eval_loss": 2.9819138050079346, + "eval_runtime": 1099.2643, + "eval_samples_per_second": 9.01, + "eval_steps_per_second": 0.091, + "step": 268000 + }, + { + "epoch": 12.049600143768533, + "grad_norm": 11.678213119506836, + 
"learning_rate": 4.317097697249948e-05, + "loss": 0.4525, + "step": 268200 + }, + { + "epoch": 12.058585677059934, + "grad_norm": 5.52247428894043, + "learning_rate": 4.31612801241314e-05, + "loss": 0.4444, + "step": 268400 + }, + { + "epoch": 12.067571210351334, + "grad_norm": 6.6727190017700195, + "learning_rate": 4.315157748683996e-05, + "loss": 0.4566, + "step": 268600 + }, + { + "epoch": 12.076556743642735, + "grad_norm": 5.082212448120117, + "learning_rate": 4.314186906371788e-05, + "loss": 0.4681, + "step": 268800 + }, + { + "epoch": 12.085542276934136, + "grad_norm": 12.604265213012695, + "learning_rate": 4.3132154857859744e-05, + "loss": 0.4056, + "step": 269000 + }, + { + "epoch": 12.085542276934136, + "eval_loss": 2.960404634475708, + "eval_runtime": 1098.0453, + "eval_samples_per_second": 9.02, + "eval_steps_per_second": 0.091, + "step": 269000 + }, + { + "epoch": 12.094527810225538, + "grad_norm": 10.235774993896484, + "learning_rate": 4.312243487236194e-05, + "loss": 0.4455, + "step": 269200 + }, + { + "epoch": 12.103513343516937, + "grad_norm": 7.912709712982178, + "learning_rate": 4.3112709110322744e-05, + "loss": 0.4643, + "step": 269400 + }, + { + "epoch": 12.112498876808338, + "grad_norm": 4.5928473472595215, + "learning_rate": 4.310297757484224e-05, + "loss": 0.4281, + "step": 269600 + }, + { + "epoch": 12.12148441009974, + "grad_norm": 1.3474705219268799, + "learning_rate": 4.309324026902236e-05, + "loss": 0.4354, + "step": 269800 + }, + { + "epoch": 12.130469943391141, + "grad_norm": 7.204748153686523, + "learning_rate": 4.3083497195966887e-05, + "loss": 0.42, + "step": 270000 + }, + { + "epoch": 12.130469943391141, + "eval_loss": 3.0123867988586426, + "eval_runtime": 1098.9017, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 0.091, + "step": 270000 + }, + { + "epoch": 12.13945547668254, + "grad_norm": 3.3051373958587646, + "learning_rate": 4.3073748358781424e-05, + "loss": 0.4633, + "step": 270200 + }, + { + "epoch": 12.148441009973942, + "grad_norm": 3.480196952819824, + "learning_rate": 4.306399376057343e-05, + "loss": 0.4057, + "step": 270400 + }, + { + "epoch": 12.157426543265343, + "grad_norm": 14.72482681274414, + "learning_rate": 4.305423340445218e-05, + "loss": 0.4233, + "step": 270600 + }, + { + "epoch": 12.166412076556744, + "grad_norm": 8.279642105102539, + "learning_rate": 4.304446729352881e-05, + "loss": 0.4694, + "step": 270800 + }, + { + "epoch": 12.175397609848144, + "grad_norm": 4.855335712432861, + "learning_rate": 4.303469543091627e-05, + "loss": 0.4497, + "step": 271000 + }, + { + "epoch": 12.175397609848144, + "eval_loss": 2.980236291885376, + "eval_runtime": 1098.5437, + "eval_samples_per_second": 9.016, + "eval_steps_per_second": 0.091, + "step": 271000 + }, + { + "epoch": 12.184383143139545, + "grad_norm": 9.080001831054688, + "learning_rate": 4.302491781972935e-05, + "loss": 0.4435, + "step": 271200 + }, + { + "epoch": 12.193368676430946, + "grad_norm": 2.5085525512695312, + "learning_rate": 4.301513446308466e-05, + "loss": 0.4243, + "step": 271400 + }, + { + "epoch": 12.202354209722348, + "grad_norm": 10.801093101501465, + "learning_rate": 4.300534536410068e-05, + "loss": 0.4641, + "step": 271600 + }, + { + "epoch": 12.211339743013747, + "grad_norm": 2.8049042224884033, + "learning_rate": 4.2995550525897667e-05, + "loss": 0.4632, + "step": 271800 + }, + { + "epoch": 12.220325276305148, + "grad_norm": 4.995143413543701, + "learning_rate": 4.298574995159774e-05, + "loss": 0.4471, + "step": 272000 + }, + { + "epoch": 
12.220325276305148, + "eval_loss": 2.955246686935425, + "eval_runtime": 1098.9794, + "eval_samples_per_second": 9.012, + "eval_steps_per_second": 0.091, + "step": 272000 + }, + { + "epoch": 12.22931080959655, + "grad_norm": 2.9934492111206055, + "learning_rate": 4.297594364432486e-05, + "loss": 0.4534, + "step": 272200 + }, + { + "epoch": 12.238296342887951, + "grad_norm": 6.686132907867432, + "learning_rate": 4.2966131607204764e-05, + "loss": 0.4186, + "step": 272400 + }, + { + "epoch": 12.24728187617935, + "grad_norm": 7.996724605560303, + "learning_rate": 4.295631384336507e-05, + "loss": 0.4452, + "step": 272600 + }, + { + "epoch": 12.256267409470752, + "grad_norm": 3.5460829734802246, + "learning_rate": 4.294649035593519e-05, + "loss": 0.4479, + "step": 272800 + }, + { + "epoch": 12.265252942762153, + "grad_norm": 6.196242809295654, + "learning_rate": 4.2936661148046375e-05, + "loss": 0.5112, + "step": 273000 + }, + { + "epoch": 12.265252942762153, + "eval_loss": 2.9934980869293213, + "eval_runtime": 1098.838, + "eval_samples_per_second": 9.013, + "eval_steps_per_second": 0.091, + "step": 273000 + }, + { + "epoch": 12.274238476053554, + "grad_norm": 3.0045993328094482, + "learning_rate": 4.292682622283168e-05, + "loss": 0.4462, + "step": 273200 + }, + { + "epoch": 12.283224009344954, + "grad_norm": 5.161373138427734, + "learning_rate": 4.2916985583426016e-05, + "loss": 0.459, + "step": 273400 + }, + { + "epoch": 12.292209542636355, + "grad_norm": 2.4376187324523926, + "learning_rate": 4.290713923296607e-05, + "loss": 0.4572, + "step": 273600 + }, + { + "epoch": 12.301195075927756, + "grad_norm": 1.416688323020935, + "learning_rate": 4.289728717459041e-05, + "loss": 0.4842, + "step": 273800 + }, + { + "epoch": 12.310180609219158, + "grad_norm": 7.329530715942383, + "learning_rate": 4.288742941143935e-05, + "loss": 0.4582, + "step": 274000 + }, + { + "epoch": 12.310180609219158, + "eval_loss": 3.067824125289917, + "eval_runtime": 1099.4168, + "eval_samples_per_second": 9.008, + "eval_steps_per_second": 0.091, + "step": 274000 + }, + { + "epoch": 12.319166142510557, + "grad_norm": 12.674388885498047, + "learning_rate": 4.287756594665508e-05, + "loss": 0.4969, + "step": 274200 + }, + { + "epoch": 12.328151675801958, + "grad_norm": 12.752253532409668, + "learning_rate": 4.286769678338159e-05, + "loss": 0.4488, + "step": 274400 + }, + { + "epoch": 12.33713720909336, + "grad_norm": 22.549896240234375, + "learning_rate": 4.285782192476467e-05, + "loss": 0.4084, + "step": 274600 + }, + { + "epoch": 12.346122742384761, + "grad_norm": 18.12051010131836, + "learning_rate": 4.284794137395195e-05, + "loss": 0.4575, + "step": 274800 + }, + { + "epoch": 12.35510827567616, + "grad_norm": 0.43731093406677246, + "learning_rate": 4.283805513409287e-05, + "loss": 0.4361, + "step": 275000 + }, + { + "epoch": 12.35510827567616, + "eval_loss": 2.9659314155578613, + "eval_runtime": 1099.8228, + "eval_samples_per_second": 9.005, + "eval_steps_per_second": 0.091, + "step": 275000 + }, + { + "epoch": 12.364093808967562, + "grad_norm": 19.862689971923828, + "learning_rate": 4.282816320833866e-05, + "loss": 0.4251, + "step": 275200 + }, + { + "epoch": 12.373079342258963, + "grad_norm": 10.183892250061035, + "learning_rate": 4.281826559984239e-05, + "loss": 0.4746, + "step": 275400 + }, + { + "epoch": 12.382064875550364, + "grad_norm": 5.8187642097473145, + "learning_rate": 4.280836231175893e-05, + "loss": 0.4471, + "step": 275600 + }, + { + "epoch": 12.391050408841764, + "grad_norm": 15.410677909851074, + 
"learning_rate": 4.279845334724496e-05, + "loss": 0.4219, + "step": 275800 + }, + { + "epoch": 12.400035942133165, + "grad_norm": 3.4729344844818115, + "learning_rate": 4.2788538709458984e-05, + "loss": 0.4493, + "step": 276000 + }, + { + "epoch": 12.400035942133165, + "eval_loss": 3.924736499786377, + "eval_runtime": 1200.9974, + "eval_samples_per_second": 8.246, + "eval_steps_per_second": 0.032, + "step": 276000 + }, + { + "epoch": 12.409021475424566, + "grad_norm": 3.802396059036255, + "learning_rate": 4.277861840156128e-05, + "loss": 0.4697, + "step": 276200 + }, + { + "epoch": 12.418007008715968, + "grad_norm": 3.487226963043213, + "learning_rate": 4.276869242671396e-05, + "loss": 0.4842, + "step": 276400 + }, + { + "epoch": 12.426992542007369, + "grad_norm": 15.522408485412598, + "learning_rate": 4.275876078808095e-05, + "loss": 0.4582, + "step": 276600 + }, + { + "epoch": 12.435978075298769, + "grad_norm": 4.422022819519043, + "learning_rate": 4.274882348882795e-05, + "loss": 0.4654, + "step": 276800 + }, + { + "epoch": 12.44496360859017, + "grad_norm": 7.4790520668029785, + "learning_rate": 4.27388805321225e-05, + "loss": 0.4306, + "step": 277000 + }, + { + "epoch": 12.44496360859017, + "eval_loss": 3.912114143371582, + "eval_runtime": 1203.2852, + "eval_samples_per_second": 8.231, + "eval_steps_per_second": 0.032, + "step": 277000 + }, + { + "epoch": 12.453949141881571, + "grad_norm": 23.840351104736328, + "learning_rate": 4.272893192113391e-05, + "loss": 0.4198, + "step": 277200 + }, + { + "epoch": 12.46293467517297, + "grad_norm": 2.9730992317199707, + "learning_rate": 4.271897765903332e-05, + "loss": 0.4503, + "step": 277400 + }, + { + "epoch": 12.471920208464372, + "grad_norm": 5.045375823974609, + "learning_rate": 4.2709017748993654e-05, + "loss": 0.4917, + "step": 277600 + }, + { + "epoch": 12.480905741755773, + "grad_norm": 5.691855430603027, + "learning_rate": 4.269905219418964e-05, + "loss": 0.4699, + "step": 277800 + }, + { + "epoch": 12.489891275047174, + "grad_norm": 3.8715128898620605, + "learning_rate": 4.2689080997797815e-05, + "loss": 0.4549, + "step": 278000 + }, + { + "epoch": 12.489891275047174, + "eval_loss": 3.87607741355896, + "eval_runtime": 1200.8926, + "eval_samples_per_second": 8.247, + "eval_steps_per_second": 0.032, + "step": 278000 + }, + { + "epoch": 12.498876808338576, + "grad_norm": 6.021407604217529, + "learning_rate": 4.2679104162996495e-05, + "loss": 0.4249, + "step": 278200 + }, + { + "epoch": 12.507862341629975, + "grad_norm": 7.7932538986206055, + "learning_rate": 4.266912169296581e-05, + "loss": 0.4297, + "step": 278400 + }, + { + "epoch": 12.516847874921377, + "grad_norm": 20.896095275878906, + "learning_rate": 4.265913359088769e-05, + "loss": 0.4688, + "step": 278600 + }, + { + "epoch": 12.525833408212778, + "grad_norm": 17.99188804626465, + "learning_rate": 4.264913985994583e-05, + "loss": 0.4563, + "step": 278800 + }, + { + "epoch": 12.534818941504179, + "grad_norm": 1.572239875793457, + "learning_rate": 4.263914050332576e-05, + "loss": 0.4485, + "step": 279000 + }, + { + "epoch": 12.534818941504179, + "eval_loss": 3.8942999839782715, + "eval_runtime": 1200.7162, + "eval_samples_per_second": 8.248, + "eval_steps_per_second": 0.032, + "step": 279000 + }, + { + "epoch": 12.543804474795579, + "grad_norm": 1.1558527946472168, + "learning_rate": 4.2629135524214777e-05, + "loss": 0.4433, + "step": 279200 + }, + { + "epoch": 12.55279000808698, + "grad_norm": 10.34830379486084, + "learning_rate": 4.261912492580197e-05, + "loss": 0.4556, + "step": 
279400 + }, + { + "epoch": 12.561775541378381, + "grad_norm": 8.091256141662598, + "learning_rate": 4.260910871127823e-05, + "loss": 0.4459, + "step": 279600 + }, + { + "epoch": 12.570761074669782, + "grad_norm": 5.710160732269287, + "learning_rate": 4.2599086883836236e-05, + "loss": 0.4667, + "step": 279800 + }, + { + "epoch": 12.579746607961182, + "grad_norm": 11.081522941589355, + "learning_rate": 4.2589059446670454e-05, + "loss": 0.4969, + "step": 280000 + }, + { + "epoch": 12.579746607961182, + "eval_loss": 3.8589940071105957, + "eval_runtime": 1206.2897, + "eval_samples_per_second": 8.21, + "eval_steps_per_second": 0.032, + "step": 280000 + }, + { + "epoch": 12.588732141252583, + "grad_norm": 17.533634185791016, + "learning_rate": 4.257902640297714e-05, + "loss": 0.4725, + "step": 280200 + }, + { + "epoch": 12.597717674543985, + "grad_norm": 2.660717487335205, + "learning_rate": 4.256898775595432e-05, + "loss": 0.4301, + "step": 280400 + }, + { + "epoch": 12.606703207835386, + "grad_norm": 22.708642959594727, + "learning_rate": 4.255894350880185e-05, + "loss": 0.4595, + "step": 280600 + }, + { + "epoch": 12.615688741126785, + "grad_norm": 8.68639087677002, + "learning_rate": 4.254889366472131e-05, + "loss": 0.512, + "step": 280800 + }, + { + "epoch": 12.624674274418187, + "grad_norm": 9.3152494430542, + "learning_rate": 4.253883822691612e-05, + "loss": 0.4898, + "step": 281000 + }, + { + "epoch": 12.624674274418187, + "eval_loss": 3.836467981338501, + "eval_runtime": 1202.5642, + "eval_samples_per_second": 8.236, + "eval_steps_per_second": 0.032, + "step": 281000 + }, + { + "epoch": 12.633659807709588, + "grad_norm": 1.8501704931259155, + "learning_rate": 4.252877719859145e-05, + "loss": 0.4381, + "step": 281200 + }, + { + "epoch": 12.64264534100099, + "grad_norm": 9.407011032104492, + "learning_rate": 4.2518710582954255e-05, + "loss": 0.4878, + "step": 281400 + }, + { + "epoch": 12.651630874292389, + "grad_norm": 18.41656494140625, + "learning_rate": 4.2508638383213296e-05, + "loss": 0.4736, + "step": 281600 + }, + { + "epoch": 12.66061640758379, + "grad_norm": 8.159863471984863, + "learning_rate": 4.249856060257908e-05, + "loss": 0.4956, + "step": 281800 + }, + { + "epoch": 12.669601940875191, + "grad_norm": 2.042884588241577, + "learning_rate": 4.248847724426391e-05, + "loss": 0.4835, + "step": 282000 + }, + { + "epoch": 12.669601940875191, + "eval_loss": 3.9126241207122803, + "eval_runtime": 1205.3139, + "eval_samples_per_second": 8.217, + "eval_steps_per_second": 0.032, + "step": 282000 + }, + { + "epoch": 12.678587474166592, + "grad_norm": 6.690242767333984, + "learning_rate": 4.247838831148186e-05, + "loss": 0.4672, + "step": 282200 + }, + { + "epoch": 12.687573007457992, + "grad_norm": 1.212893009185791, + "learning_rate": 4.24682938074488e-05, + "loss": 0.4522, + "step": 282400 + }, + { + "epoch": 12.696558540749393, + "grad_norm": 6.8718581199646, + "learning_rate": 4.245819373538235e-05, + "loss": 0.4921, + "step": 282600 + }, + { + "epoch": 12.705544074040795, + "grad_norm": 5.218339920043945, + "learning_rate": 4.244808809850193e-05, + "loss": 0.4412, + "step": 282800 + }, + { + "epoch": 12.714529607332196, + "grad_norm": 3.228175401687622, + "learning_rate": 4.24379769000287e-05, + "loss": 0.4452, + "step": 283000 + }, + { + "epoch": 12.714529607332196, + "eval_loss": 3.84609055519104, + "eval_runtime": 1201.2158, + "eval_samples_per_second": 8.245, + "eval_steps_per_second": 0.032, + "step": 283000 + }, + { + "epoch": 12.723515140623595, + "grad_norm": 
10.501503944396973, + "learning_rate": 4.2427860143185625e-05, + "loss": 0.4471, + "step": 283200 + }, + { + "epoch": 12.732500673914997, + "grad_norm": 10.110664367675781, + "learning_rate": 4.241773783119742e-05, + "loss": 0.4441, + "step": 283400 + }, + { + "epoch": 12.741486207206398, + "grad_norm": 5.942151069641113, + "learning_rate": 4.240760996729061e-05, + "loss": 0.4631, + "step": 283600 + }, + { + "epoch": 12.7504717404978, + "grad_norm": 17.07978057861328, + "learning_rate": 4.2397476554693427e-05, + "loss": 0.4466, + "step": 283800 + }, + { + "epoch": 12.759457273789199, + "grad_norm": 6.301132678985596, + "learning_rate": 4.238733759663592e-05, + "loss": 0.4957, + "step": 284000 + }, + { + "epoch": 12.759457273789199, + "eval_loss": 3.8400514125823975, + "eval_runtime": 1202.3941, + "eval_samples_per_second": 8.237, + "eval_steps_per_second": 0.032, + "step": 284000 + }, + { + "epoch": 12.7684428070806, + "grad_norm": 4.1205573081970215, + "learning_rate": 4.237719309634989e-05, + "loss": 0.4325, + "step": 284200 + }, + { + "epoch": 12.777428340372001, + "grad_norm": 2.6801910400390625, + "learning_rate": 4.236704305706889e-05, + "loss": 0.478, + "step": 284400 + }, + { + "epoch": 12.786413873663403, + "grad_norm": 5.553824424743652, + "learning_rate": 4.235688748202828e-05, + "loss": 0.4462, + "step": 284600 + }, + { + "epoch": 12.795399406954802, + "grad_norm": 4.970882415771484, + "learning_rate": 4.234672637446514e-05, + "loss": 0.4544, + "step": 284800 + }, + { + "epoch": 12.804384940246203, + "grad_norm": 7.782638072967529, + "learning_rate": 4.233655973761833e-05, + "loss": 0.4713, + "step": 285000 + }, + { + "epoch": 12.804384940246203, + "eval_loss": 3.8344786167144775, + "eval_runtime": 1202.9038, + "eval_samples_per_second": 8.233, + "eval_steps_per_second": 0.032, + "step": 285000 + }, + { + "epoch": 12.813370473537605, + "grad_norm": 4.948213577270508, + "learning_rate": 4.232638757472849e-05, + "loss": 0.452, + "step": 285200 + }, + { + "epoch": 12.822356006829006, + "grad_norm": 16.379188537597656, + "learning_rate": 4.2316209889037986e-05, + "loss": 0.4633, + "step": 285400 + }, + { + "epoch": 12.831341540120405, + "grad_norm": 3.503868341445923, + "learning_rate": 4.230602668379098e-05, + "loss": 0.467, + "step": 285600 + }, + { + "epoch": 12.840327073411807, + "grad_norm": 1.0399272441864014, + "learning_rate": 4.229583796223337e-05, + "loss": 0.43, + "step": 285800 + }, + { + "epoch": 12.849312606703208, + "grad_norm": 1.698477029800415, + "learning_rate": 4.228564372761281e-05, + "loss": 0.4586, + "step": 286000 + }, + { + "epoch": 12.849312606703208, + "eval_loss": 3.8653202056884766, + "eval_runtime": 1185.291, + "eval_samples_per_second": 8.356, + "eval_steps_per_second": 0.033, + "step": 286000 + }, + { + "epoch": 12.85829813999461, + "grad_norm": 10.822354316711426, + "learning_rate": 4.2275443983178744e-05, + "loss": 0.4417, + "step": 286200 + }, + { + "epoch": 12.867283673286009, + "grad_norm": 8.866846084594727, + "learning_rate": 4.2265238732182334e-05, + "loss": 0.4166, + "step": 286400 + }, + { + "epoch": 12.87626920657741, + "grad_norm": 4.1137261390686035, + "learning_rate": 4.225502797787651e-05, + "loss": 0.4994, + "step": 286600 + }, + { + "epoch": 12.885254739868811, + "grad_norm": 3.115154266357422, + "learning_rate": 4.224481172351596e-05, + "loss": 0.4336, + "step": 286800 + }, + { + "epoch": 12.894240273160213, + "grad_norm": 7.953911304473877, + "learning_rate": 4.2234589972357144e-05, + "loss": 0.4433, + "step": 287000 + }, + { + 
"epoch": 12.894240273160213, + "eval_loss": 3.8534297943115234, + "eval_runtime": 1184.3457, + "eval_samples_per_second": 8.362, + "eval_steps_per_second": 0.033, + "step": 287000 + }, + { + "epoch": 12.903225806451612, + "grad_norm": 3.455723524093628, + "learning_rate": 4.222436272765822e-05, + "loss": 0.4541, + "step": 287200 + }, + { + "epoch": 12.912211339743013, + "grad_norm": 9.256354331970215, + "learning_rate": 4.221412999267915e-05, + "loss": 0.4282, + "step": 287400 + }, + { + "epoch": 12.921196873034415, + "grad_norm": 5.0986409187316895, + "learning_rate": 4.220389177068163e-05, + "loss": 0.4577, + "step": 287600 + }, + { + "epoch": 12.930182406325816, + "grad_norm": 10.405719757080078, + "learning_rate": 4.2193648064929094e-05, + "loss": 0.4245, + "step": 287800 + }, + { + "epoch": 12.939167939617215, + "grad_norm": 6.69377326965332, + "learning_rate": 4.218339887868673e-05, + "loss": 0.4955, + "step": 288000 + }, + { + "epoch": 12.939167939617215, + "eval_loss": 3.7864327430725098, + "eval_runtime": 1165.7975, + "eval_samples_per_second": 8.495, + "eval_steps_per_second": 0.033, + "step": 288000 + }, + { + "epoch": 12.948153472908617, + "grad_norm": 4.542316436767578, + "learning_rate": 4.2173144215221475e-05, + "loss": 0.4509, + "step": 288200 + }, + { + "epoch": 12.957139006200018, + "grad_norm": 9.559526443481445, + "learning_rate": 4.216288407780202e-05, + "loss": 0.426, + "step": 288400 + }, + { + "epoch": 12.96612453949142, + "grad_norm": 7.886917591094971, + "learning_rate": 4.21526184696988e-05, + "loss": 0.4613, + "step": 288600 + }, + { + "epoch": 12.975110072782819, + "grad_norm": 4.012725353240967, + "learning_rate": 4.214234739418396e-05, + "loss": 0.4668, + "step": 288800 + }, + { + "epoch": 12.98409560607422, + "grad_norm": 10.49506664276123, + "learning_rate": 4.213207085453143e-05, + "loss": 0.4632, + "step": 289000 + }, + { + "epoch": 12.98409560607422, + "eval_loss": 3.8832597732543945, + "eval_runtime": 1163.551, + "eval_samples_per_second": 8.512, + "eval_steps_per_second": 0.034, + "step": 289000 + }, + { + "epoch": 12.993081139365621, + "grad_norm": 14.843647956848145, + "learning_rate": 4.2121788854016864e-05, + "loss": 0.487, + "step": 289200 + }, + { + "epoch": 13.002066672657023, + "grad_norm": 12.702319145202637, + "learning_rate": 4.211150139591766e-05, + "loss": 0.4755, + "step": 289400 + }, + { + "epoch": 13.011052205948422, + "grad_norm": 12.583155632019043, + "learning_rate": 4.2101208483512954e-05, + "loss": 0.4325, + "step": 289600 + }, + { + "epoch": 13.020037739239823, + "grad_norm": 1.6690092086791992, + "learning_rate": 4.209091012008362e-05, + "loss": 0.4279, + "step": 289800 + }, + { + "epoch": 13.029023272531225, + "grad_norm": 13.319869995117188, + "learning_rate": 4.208060630891226e-05, + "loss": 0.459, + "step": 290000 + }, + { + "epoch": 13.029023272531225, + "eval_loss": 3.850545883178711, + "eval_runtime": 1164.1167, + "eval_samples_per_second": 8.508, + "eval_steps_per_second": 0.034, + "step": 290000 + }, + { + "epoch": 13.038008805822626, + "grad_norm": 11.082257270812988, + "learning_rate": 4.207029705328324e-05, + "loss": 0.4205, + "step": 290200 + }, + { + "epoch": 13.046994339114027, + "grad_norm": 3.647700309753418, + "learning_rate": 4.2059982356482636e-05, + "loss": 0.4541, + "step": 290400 + }, + { + "epoch": 13.055979872405427, + "grad_norm": 6.96566104888916, + "learning_rate": 4.204966222179826e-05, + "loss": 0.448, + "step": 290600 + }, + { + "epoch": 13.064965405696828, + "grad_norm": 4.0198235511779785, + 
"learning_rate": 4.2039336652519665e-05, + "loss": 0.4345, + "step": 290800 + }, + { + "epoch": 13.07395093898823, + "grad_norm": 5.543626308441162, + "learning_rate": 4.2029005651938146e-05, + "loss": 0.4483, + "step": 291000 + }, + { + "epoch": 13.07395093898823, + "eval_loss": 3.8965601921081543, + "eval_runtime": 1165.1251, + "eval_samples_per_second": 8.5, + "eval_steps_per_second": 0.033, + "step": 291000 + }, + { + "epoch": 13.08293647227963, + "grad_norm": 13.703949928283691, + "learning_rate": 4.201866922334672e-05, + "loss": 0.4145, + "step": 291200 + }, + { + "epoch": 13.09192200557103, + "grad_norm": 28.786453247070312, + "learning_rate": 4.20083273700401e-05, + "loss": 0.4455, + "step": 291400 + }, + { + "epoch": 13.100907538862431, + "grad_norm": 9.806286811828613, + "learning_rate": 4.199798009531481e-05, + "loss": 0.4122, + "step": 291600 + }, + { + "epoch": 13.109893072153833, + "grad_norm": 6.537720203399658, + "learning_rate": 4.198762740246901e-05, + "loss": 0.4223, + "step": 291800 + }, + { + "epoch": 13.118878605445234, + "grad_norm": 8.785443305969238, + "learning_rate": 4.1977269294802645e-05, + "loss": 0.4664, + "step": 292000 + }, + { + "epoch": 13.118878605445234, + "eval_loss": 3.8596513271331787, + "eval_runtime": 1165.6454, + "eval_samples_per_second": 8.497, + "eval_steps_per_second": 0.033, + "step": 292000 + }, + { + "epoch": 13.127864138736633, + "grad_norm": 6.35100793838501, + "learning_rate": 4.196690577561738e-05, + "loss": 0.4475, + "step": 292200 + }, + { + "epoch": 13.136849672028035, + "grad_norm": 6.956860065460205, + "learning_rate": 4.195653684821658e-05, + "loss": 0.4396, + "step": 292400 + }, + { + "epoch": 13.145835205319436, + "grad_norm": 5.264865875244141, + "learning_rate": 4.1946162515905364e-05, + "loss": 0.4265, + "step": 292600 + }, + { + "epoch": 13.154820738610837, + "grad_norm": 12.176240921020508, + "learning_rate": 4.193578278199054e-05, + "loss": 0.4379, + "step": 292800 + }, + { + "epoch": 13.163806271902237, + "grad_norm": 6.024650573730469, + "learning_rate": 4.192539764978068e-05, + "loss": 0.4243, + "step": 293000 + }, + { + "epoch": 13.163806271902237, + "eval_loss": 3.8728034496307373, + "eval_runtime": 1170.4759, + "eval_samples_per_second": 8.462, + "eval_steps_per_second": 0.033, + "step": 293000 + }, + { + "epoch": 13.172791805193638, + "grad_norm": 1.1849206686019897, + "learning_rate": 4.191500712258604e-05, + "loss": 0.4381, + "step": 293200 + }, + { + "epoch": 13.18177733848504, + "grad_norm": 3.522000789642334, + "learning_rate": 4.190461120371861e-05, + "loss": 0.472, + "step": 293400 + }, + { + "epoch": 13.19076287177644, + "grad_norm": 2.328458309173584, + "learning_rate": 4.1894209896492096e-05, + "loss": 0.4262, + "step": 293600 + }, + { + "epoch": 13.19974840506784, + "grad_norm": 9.86052131652832, + "learning_rate": 4.188380320422193e-05, + "loss": 0.442, + "step": 293800 + }, + { + "epoch": 13.208733938359241, + "grad_norm": 4.702374458312988, + "learning_rate": 4.187339113022525e-05, + "loss": 0.3967, + "step": 294000 + }, + { + "epoch": 13.208733938359241, + "eval_loss": 3.881704568862915, + "eval_runtime": 1178.457, + "eval_samples_per_second": 8.404, + "eval_steps_per_second": 0.033, + "step": 294000 + }, + { + "epoch": 13.217719471650643, + "grad_norm": 7.168625354766846, + "learning_rate": 4.186297367782091e-05, + "loss": 0.4736, + "step": 294200 + }, + { + "epoch": 13.226705004942044, + "grad_norm": 9.348653793334961, + "learning_rate": 4.1852550850329494e-05, + "loss": 0.4496, + "step": 294400 + 
}, + { + "epoch": 13.235690538233444, + "grad_norm": 6.130259990692139, + "learning_rate": 4.184212265107328e-05, + "loss": 0.4574, + "step": 294600 + }, + { + "epoch": 13.244676071524845, + "grad_norm": 8.369153022766113, + "learning_rate": 4.1831689083376256e-05, + "loss": 0.4083, + "step": 294800 + }, + { + "epoch": 13.253661604816246, + "grad_norm": 7.550708770751953, + "learning_rate": 4.182125015056415e-05, + "loss": 0.4462, + "step": 295000 + }, + { + "epoch": 13.253661604816246, + "eval_loss": 3.848435163497925, + "eval_runtime": 1171.7179, + "eval_samples_per_second": 8.453, + "eval_steps_per_second": 0.033, + "step": 295000 + }, + { + "epoch": 13.262647138107647, + "grad_norm": 4.578621864318848, + "learning_rate": 4.181080585596436e-05, + "loss": 0.4379, + "step": 295200 + }, + { + "epoch": 13.271632671399047, + "grad_norm": 5.007719039916992, + "learning_rate": 4.1800356202906024e-05, + "loss": 0.4498, + "step": 295400 + }, + { + "epoch": 13.280618204690448, + "grad_norm": 20.014347076416016, + "learning_rate": 4.178990119471998e-05, + "loss": 0.454, + "step": 295600 + }, + { + "epoch": 13.28960373798185, + "grad_norm": 7.8681254386901855, + "learning_rate": 4.1779440834738757e-05, + "loss": 0.451, + "step": 295800 + }, + { + "epoch": 13.29858927127325, + "grad_norm": 6.996041774749756, + "learning_rate": 4.176897512629663e-05, + "loss": 0.4109, + "step": 296000 + }, + { + "epoch": 13.29858927127325, + "eval_loss": 3.9298160076141357, + "eval_runtime": 1180.5598, + "eval_samples_per_second": 8.389, + "eval_steps_per_second": 0.033, + "step": 296000 + }, + { + "epoch": 13.30757480456465, + "grad_norm": 3.667933464050293, + "learning_rate": 4.175850407272953e-05, + "loss": 0.417, + "step": 296200 + }, + { + "epoch": 13.316560337856052, + "grad_norm": 4.346782684326172, + "learning_rate": 4.1748027677375116e-05, + "loss": 0.4439, + "step": 296400 + }, + { + "epoch": 13.325545871147453, + "grad_norm": 7.255468368530273, + "learning_rate": 4.1737545943572756e-05, + "loss": 0.4517, + "step": 296600 + }, + { + "epoch": 13.334531404438854, + "grad_norm": 1.1761934757232666, + "learning_rate": 4.172705887466351e-05, + "loss": 0.4611, + "step": 296800 + }, + { + "epoch": 13.343516937730254, + "grad_norm": 2.3793375492095947, + "learning_rate": 4.171656647399014e-05, + "loss": 0.4535, + "step": 297000 + }, + { + "epoch": 13.343516937730254, + "eval_loss": 3.8182103633880615, + "eval_runtime": 1137.4266, + "eval_samples_per_second": 8.707, + "eval_steps_per_second": 0.034, + "step": 297000 + }, + { + "epoch": 13.352502471021655, + "grad_norm": 8.53345775604248, + "learning_rate": 4.17060687448971e-05, + "loss": 0.416, + "step": 297200 + }, + { + "epoch": 13.361488004313056, + "grad_norm": 4.831078052520752, + "learning_rate": 4.169556569073056e-05, + "loss": 0.4341, + "step": 297400 + }, + { + "epoch": 13.370473537604457, + "grad_norm": 9.299762725830078, + "learning_rate": 4.168505731483837e-05, + "loss": 0.3995, + "step": 297600 + }, + { + "epoch": 13.379459070895857, + "grad_norm": 11.03166389465332, + "learning_rate": 4.167454362057008e-05, + "loss": 0.4338, + "step": 297800 + }, + { + "epoch": 13.388444604187258, + "grad_norm": 6.606450080871582, + "learning_rate": 4.166402461127696e-05, + "loss": 0.4563, + "step": 298000 + }, + { + "epoch": 13.388444604187258, + "eval_loss": 3.860046863555908, + "eval_runtime": 1114.1874, + "eval_samples_per_second": 8.889, + "eval_steps_per_second": 0.035, + "step": 298000 + }, + { + "epoch": 13.39743013747866, + "grad_norm": 9.79546070098877, + 
"learning_rate": 4.1653500290311934e-05, + "loss": 0.4505, + "step": 298200 + }, + { + "epoch": 13.40641567077006, + "grad_norm": 5.0448832511901855, + "learning_rate": 4.1642970661029634e-05, + "loss": 0.4342, + "step": 298400 + }, + { + "epoch": 13.41540120406146, + "grad_norm": 15.43664836883545, + "learning_rate": 4.163243572678641e-05, + "loss": 0.4311, + "step": 298600 + }, + { + "epoch": 13.424386737352862, + "grad_norm": 5.8657612800598145, + "learning_rate": 4.162189549094026e-05, + "loss": 0.4572, + "step": 298800 + }, + { + "epoch": 13.433372270644263, + "grad_norm": 8.958415031433105, + "learning_rate": 4.161134995685091e-05, + "loss": 0.4754, + "step": 299000 + }, + { + "epoch": 13.433372270644263, + "eval_loss": 3.8714182376861572, + "eval_runtime": 1117.5357, + "eval_samples_per_second": 8.862, + "eval_steps_per_second": 0.035, + "step": 299000 + }, + { + "epoch": 13.442357803935664, + "grad_norm": 12.89301586151123, + "learning_rate": 4.160079912787974e-05, + "loss": 0.4224, + "step": 299200 + }, + { + "epoch": 13.451343337227064, + "grad_norm": 30.66848373413086, + "learning_rate": 4.1590243007389845e-05, + "loss": 0.4751, + "step": 299400 + }, + { + "epoch": 13.460328870518465, + "grad_norm": 9.195915222167969, + "learning_rate": 4.1579681598746e-05, + "loss": 0.4678, + "step": 299600 + }, + { + "epoch": 13.469314403809866, + "grad_norm": 9.206331253051758, + "learning_rate": 4.156911490531466e-05, + "loss": 0.4399, + "step": 299800 + }, + { + "epoch": 13.478299937101268, + "grad_norm": 4.251493453979492, + "learning_rate": 4.1558542930463965e-05, + "loss": 0.4103, + "step": 300000 + }, + { + "epoch": 13.478299937101268, + "eval_loss": 3.946397542953491, + "eval_runtime": 1115.2299, + "eval_samples_per_second": 8.881, + "eval_steps_per_second": 0.035, + "step": 300000 + }, + { + "epoch": 13.487285470392667, + "grad_norm": 12.777297973632812, + "learning_rate": 4.154796567756375e-05, + "loss": 0.5246, + "step": 300200 + }, + { + "epoch": 13.496271003684068, + "grad_norm": 2.6797468662261963, + "learning_rate": 4.1537383149985506e-05, + "loss": 0.4457, + "step": 300400 + }, + { + "epoch": 13.50525653697547, + "grad_norm": 5.52931547164917, + "learning_rate": 4.1526795351102444e-05, + "loss": 0.4505, + "step": 300600 + }, + { + "epoch": 13.51424207026687, + "grad_norm": 12.613361358642578, + "learning_rate": 4.151620228428942e-05, + "loss": 0.4745, + "step": 300800 + }, + { + "epoch": 13.52322760355827, + "grad_norm": 7.806926727294922, + "learning_rate": 4.150560395292298e-05, + "loss": 0.4347, + "step": 301000 + }, + { + "epoch": 13.52322760355827, + "eval_loss": 3.85687255859375, + "eval_runtime": 1114.6959, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 0.035, + "step": 301000 + }, + { + "epoch": 13.532213136849672, + "grad_norm": 4.979412078857422, + "learning_rate": 4.1495000360381363e-05, + "loss": 0.4813, + "step": 301200 + }, + { + "epoch": 13.541198670141073, + "grad_norm": 13.663886070251465, + "learning_rate": 4.1484391510044475e-05, + "loss": 0.4744, + "step": 301400 + }, + { + "epoch": 13.550184203432474, + "grad_norm": 6.1580681800842285, + "learning_rate": 4.147377740529388e-05, + "loss": 0.4415, + "step": 301600 + }, + { + "epoch": 13.559169736723874, + "grad_norm": 13.568781852722168, + "learning_rate": 4.146315804951284e-05, + "loss": 0.4407, + "step": 301800 + }, + { + "epoch": 13.568155270015275, + "grad_norm": 1.211671233177185, + "learning_rate": 4.145253344608628e-05, + "loss": 0.4566, + "step": 302000 + }, + { + "epoch": 
13.568155270015275, + "eval_loss": 3.837907552719116, + "eval_runtime": 1113.6432, + "eval_samples_per_second": 8.893, + "eval_steps_per_second": 0.035, + "step": 302000 + }, + { + "epoch": 13.577140803306676, + "grad_norm": 1.426780343055725, + "learning_rate": 4.1441903598400814e-05, + "loss": 0.4497, + "step": 302200 + }, + { + "epoch": 13.586126336598078, + "grad_norm": 7.560256004333496, + "learning_rate": 4.1431268509844706e-05, + "loss": 0.4683, + "step": 302400 + }, + { + "epoch": 13.595111869889479, + "grad_norm": 20.501848220825195, + "learning_rate": 4.1420628183807896e-05, + "loss": 0.4646, + "step": 302600 + }, + { + "epoch": 13.604097403180878, + "grad_norm": 3.325043201446533, + "learning_rate": 4.140998262368201e-05, + "loss": 0.443, + "step": 302800 + }, + { + "epoch": 13.61308293647228, + "grad_norm": 2.9573566913604736, + "learning_rate": 4.139933183286031e-05, + "loss": 0.4471, + "step": 303000 + }, + { + "epoch": 13.61308293647228, + "eval_loss": 3.8605709075927734, + "eval_runtime": 1118.1313, + "eval_samples_per_second": 8.858, + "eval_steps_per_second": 0.035, + "step": 303000 + }, + { + "epoch": 13.622068469763681, + "grad_norm": 4.5685319900512695, + "learning_rate": 4.138867581473776e-05, + "loss": 0.4583, + "step": 303200 + }, + { + "epoch": 13.63105400305508, + "grad_norm": 0.45331665873527527, + "learning_rate": 4.1378014572710974e-05, + "loss": 0.4281, + "step": 303400 + }, + { + "epoch": 13.640039536346482, + "grad_norm": 8.040594100952148, + "learning_rate": 4.136734811017822e-05, + "loss": 0.4353, + "step": 303600 + }, + { + "epoch": 13.649025069637883, + "grad_norm": 7.731649398803711, + "learning_rate": 4.135667643053945e-05, + "loss": 0.4867, + "step": 303800 + }, + { + "epoch": 13.658010602929284, + "grad_norm": 13.919236183166504, + "learning_rate": 4.1345999537196275e-05, + "loss": 0.4752, + "step": 304000 + }, + { + "epoch": 13.658010602929284, + "eval_loss": 3.850292444229126, + "eval_runtime": 1113.3609, + "eval_samples_per_second": 8.896, + "eval_steps_per_second": 0.035, + "step": 304000 + }, + { + "epoch": 13.666996136220686, + "grad_norm": 7.589078426361084, + "learning_rate": 4.1335317433551954e-05, + "loss": 0.4251, + "step": 304200 + }, + { + "epoch": 13.675981669512085, + "grad_norm": 10.349044799804688, + "learning_rate": 4.132463012301143e-05, + "loss": 0.4303, + "step": 304400 + }, + { + "epoch": 13.684967202803486, + "grad_norm": 1.0288686752319336, + "learning_rate": 4.131393760898128e-05, + "loss": 0.4318, + "step": 304600 + }, + { + "epoch": 13.693952736094888, + "grad_norm": 13.238295555114746, + "learning_rate": 4.130323989486976e-05, + "loss": 0.4539, + "step": 304800 + }, + { + "epoch": 13.702938269386289, + "grad_norm": 17.6412410736084, + "learning_rate": 4.1292536984086764e-05, + "loss": 0.4484, + "step": 305000 + }, + { + "epoch": 13.702938269386289, + "eval_loss": 3.859189033508301, + "eval_runtime": 1112.8183, + "eval_samples_per_second": 8.9, + "eval_steps_per_second": 0.035, + "step": 305000 + }, + { + "epoch": 13.711923802677688, + "grad_norm": 2.382539749145508, + "learning_rate": 4.128182888004387e-05, + "loss": 0.4026, + "step": 305200 + }, + { + "epoch": 13.72090933596909, + "grad_norm": 7.253118515014648, + "learning_rate": 4.127111558615427e-05, + "loss": 0.4531, + "step": 305400 + }, + { + "epoch": 13.729894869260491, + "grad_norm": 8.220928192138672, + "learning_rate": 4.126039710583287e-05, + "loss": 0.4339, + "step": 305600 + }, + { + "epoch": 13.738880402551892, + "grad_norm": 4.559962749481201, + 
"learning_rate": 4.124967344249617e-05, + "loss": 0.4274, + "step": 305800 + }, + { + "epoch": 13.747865935843292, + "grad_norm": 25.09603500366211, + "learning_rate": 4.1238944599562354e-05, + "loss": 0.451, + "step": 306000 + }, + { + "epoch": 13.747865935843292, + "eval_loss": 3.9123668670654297, + "eval_runtime": 1113.8568, + "eval_samples_per_second": 8.892, + "eval_steps_per_second": 0.035, + "step": 306000 + }, + { + "epoch": 13.756851469134693, + "grad_norm": 7.623703479766846, + "learning_rate": 4.122821058045125e-05, + "loss": 0.4204, + "step": 306200 + }, + { + "epoch": 13.765837002426094, + "grad_norm": 16.578161239624023, + "learning_rate": 4.121747138858433e-05, + "loss": 0.4556, + "step": 306400 + }, + { + "epoch": 13.774822535717496, + "grad_norm": 39.884002685546875, + "learning_rate": 4.120672702738473e-05, + "loss": 0.4342, + "step": 306600 + }, + { + "epoch": 13.783808069008895, + "grad_norm": 6.272052764892578, + "learning_rate": 4.1195977500277215e-05, + "loss": 0.4377, + "step": 306800 + }, + { + "epoch": 13.792793602300296, + "grad_norm": 4.232491970062256, + "learning_rate": 4.1185222810688214e-05, + "loss": 0.4948, + "step": 307000 + }, + { + "epoch": 13.792793602300296, + "eval_loss": 3.866061210632324, + "eval_runtime": 1113.1102, + "eval_samples_per_second": 8.898, + "eval_steps_per_second": 0.035, + "step": 307000 + }, + { + "epoch": 13.801779135591698, + "grad_norm": 7.848074913024902, + "learning_rate": 4.1174462962045784e-05, + "loss": 0.4657, + "step": 307200 + }, + { + "epoch": 13.810764668883099, + "grad_norm": 11.766325950622559, + "learning_rate": 4.1163697957779644e-05, + "loss": 0.4369, + "step": 307400 + }, + { + "epoch": 13.819750202174498, + "grad_norm": 4.907791614532471, + "learning_rate": 4.115292780132115e-05, + "loss": 0.4427, + "step": 307600 + }, + { + "epoch": 13.8287357354659, + "grad_norm": 2.2997195720672607, + "learning_rate": 4.114215249610329e-05, + "loss": 0.4261, + "step": 307800 + }, + { + "epoch": 13.837721268757301, + "grad_norm": 4.029343605041504, + "learning_rate": 4.1131372045560704e-05, + "loss": 0.4393, + "step": 308000 + }, + { + "epoch": 13.837721268757301, + "eval_loss": 3.869534969329834, + "eval_runtime": 1145.7345, + "eval_samples_per_second": 8.644, + "eval_steps_per_second": 0.034, + "step": 308000 + }, + { + "epoch": 13.846706802048702, + "grad_norm": 3.6049351692199707, + "learning_rate": 4.112058645312967e-05, + "loss": 0.4413, + "step": 308200 + }, + { + "epoch": 13.855692335340102, + "grad_norm": 0.6825031638145447, + "learning_rate": 4.110979572224811e-05, + "loss": 0.4046, + "step": 308400 + }, + { + "epoch": 13.864677868631503, + "grad_norm": 11.253166198730469, + "learning_rate": 4.109899985635558e-05, + "loss": 0.4877, + "step": 308600 + }, + { + "epoch": 13.873663401922904, + "grad_norm": 3.120997428894043, + "learning_rate": 4.108819885889326e-05, + "loss": 0.4409, + "step": 308800 + }, + { + "epoch": 13.882648935214306, + "grad_norm": 18.108745574951172, + "learning_rate": 4.107739273330398e-05, + "loss": 0.4455, + "step": 309000 + }, + { + "epoch": 13.882648935214306, + "eval_loss": 3.858532667160034, + "eval_runtime": 1133.8734, + "eval_samples_per_second": 8.735, + "eval_steps_per_second": 0.034, + "step": 309000 + }, + { + "epoch": 13.891634468505705, + "grad_norm": 4.392665863037109, + "learning_rate": 4.1066581483032206e-05, + "loss": 0.4946, + "step": 309200 + }, + { + "epoch": 13.900620001797106, + "grad_norm": 0.8881078958511353, + "learning_rate": 4.1055765111524036e-05, + "loss": 0.4265, + 
"step": 309400 + }, + { + "epoch": 13.909605535088508, + "grad_norm": 1.4993141889572144, + "learning_rate": 4.104494362222719e-05, + "loss": 0.4309, + "step": 309600 + }, + { + "epoch": 13.918591068379909, + "grad_norm": 5.614892959594727, + "learning_rate": 4.103411701859103e-05, + "loss": 0.4848, + "step": 309800 + }, + { + "epoch": 13.927576601671309, + "grad_norm": 6.294254779815674, + "learning_rate": 4.102328530406655e-05, + "loss": 0.4334, + "step": 310000 + }, + { + "epoch": 13.927576601671309, + "eval_loss": 3.8455817699432373, + "eval_runtime": 1137.7256, + "eval_samples_per_second": 8.705, + "eval_steps_per_second": 0.034, + "step": 310000 + }, + { + "epoch": 13.93656213496271, + "grad_norm": 2.6192963123321533, + "learning_rate": 4.101244848210636e-05, + "loss": 0.4564, + "step": 310200 + }, + { + "epoch": 13.945547668254111, + "grad_norm": 17.42061424255371, + "learning_rate": 4.100160655616471e-05, + "loss": 0.4186, + "step": 310400 + }, + { + "epoch": 13.954533201545512, + "grad_norm": 13.576807022094727, + "learning_rate": 4.099075952969747e-05, + "loss": 0.4534, + "step": 310600 + }, + { + "epoch": 13.963518734836912, + "grad_norm": 7.059383392333984, + "learning_rate": 4.097990740616214e-05, + "loss": 0.4483, + "step": 310800 + }, + { + "epoch": 13.972504268128313, + "grad_norm": 6.2722978591918945, + "learning_rate": 4.096905018901785e-05, + "loss": 0.448, + "step": 311000 + }, + { + "epoch": 13.972504268128313, + "eval_loss": 3.86065673828125, + "eval_runtime": 1127.0444, + "eval_samples_per_second": 8.788, + "eval_steps_per_second": 0.035, + "step": 311000 + }, + { + "epoch": 13.981489801419714, + "grad_norm": 0.11190976202487946, + "learning_rate": 4.095818788172534e-05, + "loss": 0.4484, + "step": 311200 + }, + { + "epoch": 13.990475334711116, + "grad_norm": 11.270726203918457, + "learning_rate": 4.094732048774698e-05, + "loss": 0.4496, + "step": 311400 + }, + { + "epoch": 13.999460868002515, + "grad_norm": 25.78597640991211, + "learning_rate": 4.093644801054676e-05, + "loss": 0.4627, + "step": 311600 + }, + { + "epoch": 14.008446401293916, + "grad_norm": 7.157655239105225, + "learning_rate": 4.09255704535903e-05, + "loss": 0.4073, + "step": 311800 + }, + { + "epoch": 14.017431934585318, + "grad_norm": 6.422256946563721, + "learning_rate": 4.0914687820344824e-05, + "loss": 0.3854, + "step": 312000 + }, + { + "epoch": 14.017431934585318, + "eval_loss": 3.9006946086883545, + "eval_runtime": 1133.3391, + "eval_samples_per_second": 8.739, + "eval_steps_per_second": 0.034, + "step": 312000 + }, + { + "epoch": 14.026417467876719, + "grad_norm": 2.7464749813079834, + "learning_rate": 4.090380011427918e-05, + "loss": 0.435, + "step": 312200 + }, + { + "epoch": 14.035403001168119, + "grad_norm": 9.64920425415039, + "learning_rate": 4.0892907338863833e-05, + "loss": 0.4341, + "step": 312400 + }, + { + "epoch": 14.04438853445952, + "grad_norm": 28.953222274780273, + "learning_rate": 4.088200949757087e-05, + "loss": 0.4119, + "step": 312600 + }, + { + "epoch": 14.053374067750921, + "grad_norm": 11.050024032592773, + "learning_rate": 4.0871106593873975e-05, + "loss": 0.4425, + "step": 312800 + }, + { + "epoch": 14.062359601042322, + "grad_norm": 7.281927585601807, + "learning_rate": 4.086019863124847e-05, + "loss": 0.4323, + "step": 313000 + }, + { + "epoch": 14.062359601042322, + "eval_loss": 3.8579936027526855, + "eval_runtime": 1129.0178, + "eval_samples_per_second": 8.772, + "eval_steps_per_second": 0.035, + "step": 313000 + }, + { + "epoch": 14.071345134333722, + 
"grad_norm": 9.319841384887695, + "learning_rate": 4.084928561317127e-05, + "loss": 0.4312, + "step": 313200 + }, + { + "epoch": 14.080330667625123, + "grad_norm": 4.579616069793701, + "learning_rate": 4.0838367543120916e-05, + "loss": 0.4136, + "step": 313400 + }, + { + "epoch": 14.089316200916524, + "grad_norm": 10.863465309143066, + "learning_rate": 4.0827444424577543e-05, + "loss": 0.4331, + "step": 313600 + }, + { + "epoch": 14.098301734207926, + "grad_norm": 6.145780086517334, + "learning_rate": 4.0816516261022915e-05, + "loss": 0.425, + "step": 313800 + }, + { + "epoch": 14.107287267499325, + "grad_norm": 6.644456386566162, + "learning_rate": 4.080558305594039e-05, + "loss": 0.4153, + "step": 314000 + }, + { + "epoch": 14.107287267499325, + "eval_loss": 3.8607418537139893, + "eval_runtime": 1121.8494, + "eval_samples_per_second": 8.828, + "eval_steps_per_second": 0.035, + "step": 314000 + }, + { + "epoch": 14.116272800790727, + "grad_norm": 20.19847297668457, + "learning_rate": 4.079464481281493e-05, + "loss": 0.3909, + "step": 314200 + }, + { + "epoch": 14.125258334082128, + "grad_norm": 11.029516220092773, + "learning_rate": 4.07837015351331e-05, + "loss": 0.4105, + "step": 314400 + }, + { + "epoch": 14.13424386737353, + "grad_norm": 9.190872192382812, + "learning_rate": 4.077275322638311e-05, + "loss": 0.4244, + "step": 314600 + }, + { + "epoch": 14.143229400664929, + "grad_norm": 15.798444747924805, + "learning_rate": 4.076179989005471e-05, + "loss": 0.4464, + "step": 314800 + }, + { + "epoch": 14.15221493395633, + "grad_norm": 7.170180797576904, + "learning_rate": 4.07508415296393e-05, + "loss": 0.4383, + "step": 315000 + }, + { + "epoch": 14.15221493395633, + "eval_loss": 3.8738784790039062, + "eval_runtime": 1126.1206, + "eval_samples_per_second": 8.795, + "eval_steps_per_second": 0.035, + "step": 315000 + }, + { + "epoch": 14.161200467247731, + "grad_norm": 3.4297237396240234, + "learning_rate": 4.073987814862988e-05, + "loss": 0.4147, + "step": 315200 + }, + { + "epoch": 14.170186000539132, + "grad_norm": 17.3597469329834, + "learning_rate": 4.072890975052102e-05, + "loss": 0.4264, + "step": 315400 + }, + { + "epoch": 14.179171533830532, + "grad_norm": 3.725116014480591, + "learning_rate": 4.071793633880891e-05, + "loss": 0.3873, + "step": 315600 + }, + { + "epoch": 14.188157067121933, + "grad_norm": 8.087611198425293, + "learning_rate": 4.070695791699132e-05, + "loss": 0.4188, + "step": 315800 + }, + { + "epoch": 14.197142600413335, + "grad_norm": 2.207904577255249, + "learning_rate": 4.069597448856765e-05, + "loss": 0.4476, + "step": 316000 + }, + { + "epoch": 14.197142600413335, + "eval_loss": 3.8536148071289062, + "eval_runtime": 1123.8487, + "eval_samples_per_second": 8.813, + "eval_steps_per_second": 0.035, + "step": 316000 + }, + { + "epoch": 14.206128133704736, + "grad_norm": 4.730515956878662, + "learning_rate": 4.0684986057038876e-05, + "loss": 0.4299, + "step": 316200 + }, + { + "epoch": 14.215113666996135, + "grad_norm": 17.80805015563965, + "learning_rate": 4.067399262590757e-05, + "loss": 0.452, + "step": 316400 + }, + { + "epoch": 14.224099200287537, + "grad_norm": 5.914919853210449, + "learning_rate": 4.0662994198677883e-05, + "loss": 0.4265, + "step": 316600 + }, + { + "epoch": 14.233084733578938, + "grad_norm": 7.017390251159668, + "learning_rate": 4.065199077885559e-05, + "loss": 0.4424, + "step": 316800 + }, + { + "epoch": 14.24207026687034, + "grad_norm": 2.4039924144744873, + "learning_rate": 4.064098236994803e-05, + "loss": 0.3815, + "step": 317000 + 
}, + { + "epoch": 14.24207026687034, + "eval_loss": 3.8721015453338623, + "eval_runtime": 1123.2832, + "eval_samples_per_second": 8.817, + "eval_steps_per_second": 0.035, + "step": 317000 + }, + { + "epoch": 14.25105580016174, + "grad_norm": 25.048295974731445, + "learning_rate": 4.062996897546415e-05, + "loss": 0.4516, + "step": 317200 + }, + { + "epoch": 14.26004133345314, + "grad_norm": 10.468742370605469, + "learning_rate": 4.0618950598914475e-05, + "loss": 0.3964, + "step": 317400 + }, + { + "epoch": 14.269026866744541, + "grad_norm": 5.206949710845947, + "learning_rate": 4.060792724381112e-05, + "loss": 0.405, + "step": 317600 + }, + { + "epoch": 14.278012400035943, + "grad_norm": 6.171004772186279, + "learning_rate": 4.0596898913667795e-05, + "loss": 0.4015, + "step": 317800 + }, + { + "epoch": 14.286997933327344, + "grad_norm": 7.8683905601501465, + "learning_rate": 4.0585865611999775e-05, + "loss": 0.4184, + "step": 318000 + }, + { + "epoch": 14.286997933327344, + "eval_loss": 3.863692045211792, + "eval_runtime": 1121.258, + "eval_samples_per_second": 8.833, + "eval_steps_per_second": 0.035, + "step": 318000 + }, + { + "epoch": 14.295983466618743, + "grad_norm": 17.344314575195312, + "learning_rate": 4.0574827342323945e-05, + "loss": 0.4423, + "step": 318200 + }, + { + "epoch": 14.304968999910145, + "grad_norm": 7.545623302459717, + "learning_rate": 4.056378410815877e-05, + "loss": 0.4582, + "step": 318400 + }, + { + "epoch": 14.313954533201546, + "grad_norm": 4.13499641418457, + "learning_rate": 4.055273591302427e-05, + "loss": 0.4233, + "step": 318600 + }, + { + "epoch": 14.322940066492947, + "grad_norm": 1.984163761138916, + "learning_rate": 4.054168276044209e-05, + "loss": 0.4549, + "step": 318800 + }, + { + "epoch": 14.331925599784347, + "grad_norm": 8.898198127746582, + "learning_rate": 4.053062465393542e-05, + "loss": 0.4277, + "step": 319000 + }, + { + "epoch": 14.331925599784347, + "eval_loss": 3.831319808959961, + "eval_runtime": 1136.9161, + "eval_samples_per_second": 8.711, + "eval_steps_per_second": 0.034, + "step": 319000 + }, + { + "epoch": 14.340911133075748, + "grad_norm": 4.621338367462158, + "learning_rate": 4.0519561597029036e-05, + "loss": 0.4108, + "step": 319200 + }, + { + "epoch": 14.34989666636715, + "grad_norm": 6.966736793518066, + "learning_rate": 4.050849359324931e-05, + "loss": 0.4347, + "step": 319400 + }, + { + "epoch": 14.35888219965855, + "grad_norm": 2.585519313812256, + "learning_rate": 4.0497420646124157e-05, + "loss": 0.4252, + "step": 319600 + }, + { + "epoch": 14.36786773294995, + "grad_norm": 10.04625415802002, + "learning_rate": 4.0486342759183115e-05, + "loss": 0.4074, + "step": 319800 + }, + { + "epoch": 14.376853266241351, + "grad_norm": 6.281806945800781, + "learning_rate": 4.047525993595724e-05, + "loss": 0.4581, + "step": 320000 + }, + { + "epoch": 14.376853266241351, + "eval_loss": 3.7998464107513428, + "eval_runtime": 1123.5798, + "eval_samples_per_second": 8.815, + "eval_steps_per_second": 0.035, + "step": 320000 + }, + { + "epoch": 14.385838799532753, + "grad_norm": 16.557212829589844, + "learning_rate": 4.046417217997922e-05, + "loss": 0.4741, + "step": 320200 + }, + { + "epoch": 14.394824332824154, + "grad_norm": 7.429055213928223, + "learning_rate": 4.045307949478326e-05, + "loss": 0.4885, + "step": 320400 + }, + { + "epoch": 14.403809866115553, + "grad_norm": 13.883950233459473, + "learning_rate": 4.044198188390519e-05, + "loss": 0.3895, + "step": 320600 + }, + { + "epoch": 14.412795399406955, + "grad_norm": 
7.166148662567139, + "learning_rate": 4.0430879350882364e-05, + "loss": 0.4325, + "step": 320800 + }, + { + "epoch": 14.421780932698356, + "grad_norm": 24.932443618774414, + "learning_rate": 4.0419771899253724e-05, + "loss": 0.4677, + "step": 321000 + }, + { + "epoch": 14.421780932698356, + "eval_loss": 3.8351047039031982, + "eval_runtime": 1104.1188, + "eval_samples_per_second": 8.97, + "eval_steps_per_second": 0.035, + "step": 321000 + }, + { + "epoch": 14.430766465989757, + "grad_norm": 1.9560954570770264, + "learning_rate": 4.040865953255979e-05, + "loss": 0.421, + "step": 321200 + }, + { + "epoch": 14.439751999281157, + "grad_norm": 14.022553443908691, + "learning_rate": 4.0397542254342624e-05, + "loss": 0.447, + "step": 321400 + }, + { + "epoch": 14.448737532572558, + "grad_norm": 7.733597755432129, + "learning_rate": 4.0386420068145886e-05, + "loss": 0.4134, + "step": 321600 + }, + { + "epoch": 14.45772306586396, + "grad_norm": 9.011775016784668, + "learning_rate": 4.0375292977514765e-05, + "loss": 0.4656, + "step": 321800 + }, + { + "epoch": 14.46670859915536, + "grad_norm": 3.5252091884613037, + "learning_rate": 4.036416098599605e-05, + "loss": 0.4171, + "step": 322000 + }, + { + "epoch": 14.46670859915536, + "eval_loss": 3.8441038131713867, + "eval_runtime": 1104.159, + "eval_samples_per_second": 8.97, + "eval_steps_per_second": 0.035, + "step": 322000 + }, + { + "epoch": 14.47569413244676, + "grad_norm": 1.1404999494552612, + "learning_rate": 4.035302409713805e-05, + "loss": 0.3627, + "step": 322200 + }, + { + "epoch": 14.484679665738161, + "grad_norm": 5.832608699798584, + "learning_rate": 4.034188231449067e-05, + "loss": 0.4487, + "step": 322400 + }, + { + "epoch": 14.493665199029563, + "grad_norm": 8.705142974853516, + "learning_rate": 4.033073564160535e-05, + "loss": 0.4353, + "step": 322600 + }, + { + "epoch": 14.502650732320964, + "grad_norm": 14.9191312789917, + "learning_rate": 4.0319584082035136e-05, + "loss": 0.4538, + "step": 322800 + }, + { + "epoch": 14.511636265612363, + "grad_norm": 6.388049602508545, + "learning_rate": 4.030842763933456e-05, + "loss": 0.4367, + "step": 323000 + }, + { + "epoch": 14.511636265612363, + "eval_loss": 3.840134382247925, + "eval_runtime": 1105.0884, + "eval_samples_per_second": 8.962, + "eval_steps_per_second": 0.035, + "step": 323000 + }, + { + "epoch": 14.520621798903765, + "grad_norm": 5.0418524742126465, + "learning_rate": 4.0297266317059765e-05, + "loss": 0.4324, + "step": 323200 + }, + { + "epoch": 14.529607332195166, + "grad_norm": 9.340652465820312, + "learning_rate": 4.0286100118768426e-05, + "loss": 0.427, + "step": 323400 + }, + { + "epoch": 14.538592865486567, + "grad_norm": 25.69853973388672, + "learning_rate": 4.027492904801978e-05, + "loss": 0.4492, + "step": 323600 + }, + { + "epoch": 14.547578398777967, + "grad_norm": 1.1400892734527588, + "learning_rate": 4.026375310837461e-05, + "loss": 0.4793, + "step": 323800 + }, + { + "epoch": 14.556563932069368, + "grad_norm": 4.694724082946777, + "learning_rate": 4.025257230339527e-05, + "loss": 0.4572, + "step": 324000 + }, + { + "epoch": 14.556563932069368, + "eval_loss": 3.8130171298980713, + "eval_runtime": 1105.0408, + "eval_samples_per_second": 8.963, + "eval_steps_per_second": 0.035, + "step": 324000 + }, + { + "epoch": 14.56554946536077, + "grad_norm": 8.171147346496582, + "learning_rate": 4.024138663664564e-05, + "loss": 0.4274, + "step": 324200 + }, + { + "epoch": 14.57453499865217, + "grad_norm": 6.94440221786499, + "learning_rate": 4.023019611169116e-05, + "loss": 
0.4361, + "step": 324400 + }, + { + "epoch": 14.58352053194357, + "grad_norm": 5.78433084487915, + "learning_rate": 4.021900073209882e-05, + "loss": 0.431, + "step": 324600 + }, + { + "epoch": 14.592506065234971, + "grad_norm": 10.060790061950684, + "learning_rate": 4.020780050143717e-05, + "loss": 0.4193, + "step": 324800 + }, + { + "epoch": 14.601491598526373, + "grad_norm": 2.9336678981781006, + "learning_rate": 4.0196595423276276e-05, + "loss": 0.4811, + "step": 325000 + }, + { + "epoch": 14.601491598526373, + "eval_loss": 3.8441808223724365, + "eval_runtime": 1105.4679, + "eval_samples_per_second": 8.959, + "eval_steps_per_second": 0.035, + "step": 325000 + }, + { + "epoch": 14.610477131817774, + "grad_norm": 11.331477165222168, + "learning_rate": 4.018538550118777e-05, + "loss": 0.4118, + "step": 325200 + }, + { + "epoch": 14.619462665109173, + "grad_norm": 4.01665735244751, + "learning_rate": 4.017417073874482e-05, + "loss": 0.43, + "step": 325400 + }, + { + "epoch": 14.628448198400575, + "grad_norm": 3.0681374073028564, + "learning_rate": 4.016295113952216e-05, + "loss": 0.411, + "step": 325600 + }, + { + "epoch": 14.637433731691976, + "grad_norm": 0.3734178841114044, + "learning_rate": 4.015172670709603e-05, + "loss": 0.4073, + "step": 325800 + }, + { + "epoch": 14.646419264983377, + "grad_norm": 14.095786094665527, + "learning_rate": 4.0140497445044234e-05, + "loss": 0.4476, + "step": 326000 + }, + { + "epoch": 14.646419264983377, + "eval_loss": 3.848971366882324, + "eval_runtime": 1104.6646, + "eval_samples_per_second": 8.966, + "eval_steps_per_second": 0.035, + "step": 326000 + }, + { + "epoch": 14.655404798274777, + "grad_norm": 19.044757843017578, + "learning_rate": 4.01292633569461e-05, + "loss": 0.4564, + "step": 326200 + }, + { + "epoch": 14.664390331566178, + "grad_norm": 6.487691402435303, + "learning_rate": 4.011802444638251e-05, + "loss": 0.4744, + "step": 326400 + }, + { + "epoch": 14.67337586485758, + "grad_norm": 5.221654891967773, + "learning_rate": 4.0106780716935875e-05, + "loss": 0.4423, + "step": 326600 + }, + { + "epoch": 14.68236139814898, + "grad_norm": 17.094696044921875, + "learning_rate": 4.009553217219015e-05, + "loss": 0.4425, + "step": 326800 + }, + { + "epoch": 14.69134693144038, + "grad_norm": 3.616652488708496, + "learning_rate": 4.008427881573081e-05, + "loss": 0.5084, + "step": 327000 + }, + { + "epoch": 14.69134693144038, + "eval_loss": 3.8496687412261963, + "eval_runtime": 1107.6478, + "eval_samples_per_second": 8.941, + "eval_steps_per_second": 0.035, + "step": 327000 + }, + { + "epoch": 14.700332464731781, + "grad_norm": 5.430749893188477, + "learning_rate": 4.0073020651144864e-05, + "loss": 0.4159, + "step": 327200 + }, + { + "epoch": 14.709317998023183, + "grad_norm": 5.325740814208984, + "learning_rate": 4.0061757682020886e-05, + "loss": 0.4361, + "step": 327400 + }, + { + "epoch": 14.718303531314584, + "grad_norm": 10.217351913452148, + "learning_rate": 4.005048991194893e-05, + "loss": 0.4284, + "step": 327600 + }, + { + "epoch": 14.727289064605984, + "grad_norm": 18.080963134765625, + "learning_rate": 4.003921734452063e-05, + "loss": 0.4282, + "step": 327800 + }, + { + "epoch": 14.736274597897385, + "grad_norm": 14.644773483276367, + "learning_rate": 4.00279399833291e-05, + "loss": 0.4241, + "step": 328000 + }, + { + "epoch": 14.736274597897385, + "eval_loss": 3.9514822959899902, + "eval_runtime": 1105.1163, + "eval_samples_per_second": 8.962, + "eval_steps_per_second": 0.035, + "step": 328000 + }, + { + "epoch": 14.745260131188786, + 
"grad_norm": 6.811315536499023, + "learning_rate": 4.001665783196904e-05, + "loss": 0.4371, + "step": 328200 + }, + { + "epoch": 14.754245664480187, + "grad_norm": 2.8421096801757812, + "learning_rate": 4.000537089403662e-05, + "loss": 0.386, + "step": 328400 + }, + { + "epoch": 14.763231197771589, + "grad_norm": 9.394848823547363, + "learning_rate": 3.999407917312957e-05, + "loss": 0.4609, + "step": 328600 + }, + { + "epoch": 14.772216731062988, + "grad_norm": 4.573288440704346, + "learning_rate": 3.998278267284714e-05, + "loss": 0.4733, + "step": 328800 + }, + { + "epoch": 14.78120226435439, + "grad_norm": 7.103633880615234, + "learning_rate": 3.997148139679009e-05, + "loss": 0.4596, + "step": 329000 + }, + { + "epoch": 14.78120226435439, + "eval_loss": 3.844900131225586, + "eval_runtime": 1104.3562, + "eval_samples_per_second": 8.968, + "eval_steps_per_second": 0.035, + "step": 329000 + }, + { + "epoch": 14.79018779764579, + "grad_norm": 21.354633331298828, + "learning_rate": 3.996017534856072e-05, + "loss": 0.4149, + "step": 329200 + }, + { + "epoch": 14.79917333093719, + "grad_norm": 3.860731363296509, + "learning_rate": 3.9948864531762833e-05, + "loss": 0.43, + "step": 329400 + }, + { + "epoch": 14.808158864228592, + "grad_norm": 9.424334526062012, + "learning_rate": 3.9937548950001775e-05, + "loss": 0.4443, + "step": 329600 + }, + { + "epoch": 14.817144397519993, + "grad_norm": 4.933842658996582, + "learning_rate": 3.992622860688439e-05, + "loss": 0.4222, + "step": 329800 + }, + { + "epoch": 14.826129930811394, + "grad_norm": 5.060630798339844, + "learning_rate": 3.9914903506019036e-05, + "loss": 0.4871, + "step": 330000 + }, + { + "epoch": 14.826129930811394, + "eval_loss": 3.873565673828125, + "eval_runtime": 1110.331, + "eval_samples_per_second": 8.92, + "eval_steps_per_second": 0.035, + "step": 330000 + }, + { + "epoch": 14.835115464102795, + "grad_norm": 14.746922492980957, + "learning_rate": 3.990357365101561e-05, + "loss": 0.4373, + "step": 330200 + }, + { + "epoch": 14.844100997394195, + "grad_norm": 15.675421714782715, + "learning_rate": 3.989223904548551e-05, + "loss": 0.4631, + "step": 330400 + }, + { + "epoch": 14.853086530685596, + "grad_norm": 9.67367935180664, + "learning_rate": 3.988089969304166e-05, + "loss": 0.4458, + "step": 330600 + }, + { + "epoch": 14.862072063976997, + "grad_norm": 3.0517771244049072, + "learning_rate": 3.986955559729848e-05, + "loss": 0.4513, + "step": 330800 + }, + { + "epoch": 14.871057597268399, + "grad_norm": 1.9877949953079224, + "learning_rate": 3.985820676187191e-05, + "loss": 0.4313, + "step": 331000 + }, + { + "epoch": 14.871057597268399, + "eval_loss": 3.8447208404541016, + "eval_runtime": 1163.0107, + "eval_samples_per_second": 8.516, + "eval_steps_per_second": 0.034, + "step": 331000 + }, + { + "epoch": 14.880043130559798, + "grad_norm": 7.18410587310791, + "learning_rate": 3.9846853190379394e-05, + "loss": 0.4369, + "step": 331200 + }, + { + "epoch": 14.8890286638512, + "grad_norm": 10.671833992004395, + "learning_rate": 3.9835494886439914e-05, + "loss": 0.3974, + "step": 331400 + }, + { + "epoch": 14.8980141971426, + "grad_norm": 4.593978404998779, + "learning_rate": 3.9824131853673904e-05, + "loss": 0.4512, + "step": 331600 + }, + { + "epoch": 14.906999730434002, + "grad_norm": 9.309211730957031, + "learning_rate": 3.981276409570338e-05, + "loss": 0.4041, + "step": 331800 + }, + { + "epoch": 14.915985263725402, + "grad_norm": 5.8800435066223145, + "learning_rate": 3.980139161615179e-05, + "loss": 0.4698, + "step": 332000 + }, + 
{ + "epoch": 14.915985263725402, + "eval_loss": 3.8392350673675537, + "eval_runtime": 1142.4653, + "eval_samples_per_second": 8.669, + "eval_steps_per_second": 0.034, + "step": 332000 + }, + { + "epoch": 14.924970797016803, + "grad_norm": 4.226430892944336, + "learning_rate": 3.979001441864416e-05, + "loss": 0.4409, + "step": 332200 + }, + { + "epoch": 14.933956330308204, + "grad_norm": 3.3841519355773926, + "learning_rate": 3.977863250680694e-05, + "loss": 0.4371, + "step": 332400 + }, + { + "epoch": 14.942941863599605, + "grad_norm": 7.70395040512085, + "learning_rate": 3.976724588426815e-05, + "loss": 0.4421, + "step": 332600 + }, + { + "epoch": 14.951927396891005, + "grad_norm": 10.1765718460083, + "learning_rate": 3.975585455465727e-05, + "loss": 0.4105, + "step": 332800 + }, + { + "epoch": 14.960912930182406, + "grad_norm": 6.869187355041504, + "learning_rate": 3.974445852160531e-05, + "loss": 0.4158, + "step": 333000 + }, + { + "epoch": 14.960912930182406, + "eval_loss": 3.8126509189605713, + "eval_runtime": 1144.9743, + "eval_samples_per_second": 8.65, + "eval_steps_per_second": 0.034, + "step": 333000 + }, + { + "epoch": 14.969898463473807, + "grad_norm": 5.523416042327881, + "learning_rate": 3.973305778874475e-05, + "loss": 0.4251, + "step": 333200 + }, + { + "epoch": 14.978883996765209, + "grad_norm": 5.1718950271606445, + "learning_rate": 3.97216523597096e-05, + "loss": 0.4309, + "step": 333400 + }, + { + "epoch": 14.987869530056608, + "grad_norm": 5.314184188842773, + "learning_rate": 3.971024223813535e-05, + "loss": 0.4442, + "step": 333600 + }, + { + "epoch": 14.99685506334801, + "grad_norm": 5.813663482666016, + "learning_rate": 3.969882742765897e-05, + "loss": 0.4774, + "step": 333800 + }, + { + "epoch": 15.00584059663941, + "grad_norm": 4.15483283996582, + "learning_rate": 3.968740793191895e-05, + "loss": 0.386, + "step": 334000 + }, + { + "epoch": 15.00584059663941, + "eval_loss": 3.831601619720459, + "eval_runtime": 1157.4903, + "eval_samples_per_second": 8.556, + "eval_steps_per_second": 0.034, + "step": 334000 + }, + { + "epoch": 15.014826129930812, + "grad_norm": 4.984675407409668, + "learning_rate": 3.9675983754555257e-05, + "loss": 0.3864, + "step": 334200 + }, + { + "epoch": 15.023811663222212, + "grad_norm": 8.731829643249512, + "learning_rate": 3.966455489920937e-05, + "loss": 0.3777, + "step": 334400 + }, + { + "epoch": 15.032797196513613, + "grad_norm": 9.469175338745117, + "learning_rate": 3.9653121369524234e-05, + "loss": 0.4377, + "step": 334600 + }, + { + "epoch": 15.041782729805014, + "grad_norm": 16.434850692749023, + "learning_rate": 3.9641683169144304e-05, + "loss": 0.4178, + "step": 334800 + }, + { + "epoch": 15.050768263096415, + "grad_norm": 2.574371099472046, + "learning_rate": 3.9630240301715516e-05, + "loss": 0.4114, + "step": 335000 + }, + { + "epoch": 15.050768263096415, + "eval_loss": 3.860501289367676, + "eval_runtime": 1146.1338, + "eval_samples_per_second": 8.641, + "eval_steps_per_second": 0.034, + "step": 335000 + }, + { + "epoch": 15.059753796387815, + "grad_norm": 5.90514612197876, + "learning_rate": 3.961879277088529e-05, + "loss": 0.4158, + "step": 335200 + }, + { + "epoch": 15.068739329679216, + "grad_norm": 4.330122470855713, + "learning_rate": 3.9607340580302535e-05, + "loss": 0.398, + "step": 335400 + }, + { + "epoch": 15.077724862970618, + "grad_norm": 0.6313864588737488, + "learning_rate": 3.9595883733617646e-05, + "loss": 0.4184, + "step": 335600 + }, + { + "epoch": 15.086710396262019, + "grad_norm": 1.5892980098724365, + 
"learning_rate": 3.9584422234482505e-05, + "loss": 0.3704, + "step": 335800 + }, + { + "epoch": 15.095695929553418, + "grad_norm": 13.559605598449707, + "learning_rate": 3.957295608655047e-05, + "loss": 0.4061, + "step": 336000 + }, + { + "epoch": 15.095695929553418, + "eval_loss": 3.878929853439331, + "eval_runtime": 1159.8964, + "eval_samples_per_second": 8.539, + "eval_steps_per_second": 0.034, + "step": 336000 + }, + { + "epoch": 15.10468146284482, + "grad_norm": 4.454782009124756, + "learning_rate": 3.95614852934764e-05, + "loss": 0.4292, + "step": 336200 + }, + { + "epoch": 15.11366699613622, + "grad_norm": 12.67405891418457, + "learning_rate": 3.9550009858916606e-05, + "loss": 0.4449, + "step": 336400 + }, + { + "epoch": 15.122652529427622, + "grad_norm": 7.279116153717041, + "learning_rate": 3.9538529786528896e-05, + "loss": 0.4239, + "step": 336600 + }, + { + "epoch": 15.131638062719022, + "grad_norm": 8.419065475463867, + "learning_rate": 3.952704507997256e-05, + "loss": 0.3916, + "step": 336800 + }, + { + "epoch": 15.140623596010423, + "grad_norm": 7.502383232116699, + "learning_rate": 3.951555574290834e-05, + "loss": 0.4076, + "step": 337000 + }, + { + "epoch": 15.140623596010423, + "eval_loss": 3.861605167388916, + "eval_runtime": 1176.4609, + "eval_samples_per_second": 8.418, + "eval_steps_per_second": 0.033, + "step": 337000 + }, + { + "epoch": 15.149609129301824, + "grad_norm": 5.945129871368408, + "learning_rate": 3.950406177899849e-05, + "loss": 0.416, + "step": 337200 + }, + { + "epoch": 15.158594662593226, + "grad_norm": 14.246264457702637, + "learning_rate": 3.9492563191906706e-05, + "loss": 0.3824, + "step": 337400 + }, + { + "epoch": 15.167580195884625, + "grad_norm": 2.2644824981689453, + "learning_rate": 3.9481059985298186e-05, + "loss": 0.4079, + "step": 337600 + }, + { + "epoch": 15.176565729176026, + "grad_norm": 6.7229204177856445, + "learning_rate": 3.946955216283958e-05, + "loss": 0.4154, + "step": 337800 + }, + { + "epoch": 15.185551262467428, + "grad_norm": 5.469477653503418, + "learning_rate": 3.9458039728199016e-05, + "loss": 0.3919, + "step": 338000 + }, + { + "epoch": 15.185551262467428, + "eval_loss": 3.9068820476531982, + "eval_runtime": 1146.6357, + "eval_samples_per_second": 8.637, + "eval_steps_per_second": 0.034, + "step": 338000 + }, + { + "epoch": 15.194536795758829, + "grad_norm": 0.9827006459236145, + "learning_rate": 3.944652268504609e-05, + "loss": 0.3947, + "step": 338200 + }, + { + "epoch": 15.203522329050228, + "grad_norm": 8.862197875976562, + "learning_rate": 3.943500103705188e-05, + "loss": 0.4456, + "step": 338400 + }, + { + "epoch": 15.21250786234163, + "grad_norm": 9.226635932922363, + "learning_rate": 3.94234747878889e-05, + "loss": 0.4429, + "step": 338600 + }, + { + "epoch": 15.221493395633031, + "grad_norm": 9.727663040161133, + "learning_rate": 3.9411943941231175e-05, + "loss": 0.4261, + "step": 338800 + }, + { + "epoch": 15.230478928924432, + "grad_norm": 6.154589653015137, + "learning_rate": 3.940040850075416e-05, + "loss": 0.4575, + "step": 339000 + }, + { + "epoch": 15.230478928924432, + "eval_loss": 3.8878021240234375, + "eval_runtime": 1146.7256, + "eval_samples_per_second": 8.637, + "eval_steps_per_second": 0.034, + "step": 339000 + }, + { + "epoch": 15.239464462215832, + "grad_norm": 5.461616039276123, + "learning_rate": 3.938886847013479e-05, + "loss": 0.413, + "step": 339200 + }, + { + "epoch": 15.248449995507233, + "grad_norm": 12.906144142150879, + "learning_rate": 3.937732385305145e-05, + "loss": 0.4228, + "step": 
339400 + }, + { + "epoch": 15.257435528798634, + "grad_norm": 21.305442810058594, + "learning_rate": 3.936577465318402e-05, + "loss": 0.4037, + "step": 339600 + }, + { + "epoch": 15.266421062090036, + "grad_norm": 7.382744789123535, + "learning_rate": 3.9354220874213785e-05, + "loss": 0.3948, + "step": 339800 + }, + { + "epoch": 15.275406595381435, + "grad_norm": 5.708733558654785, + "learning_rate": 3.9342662519823545e-05, + "loss": 0.4167, + "step": 340000 + }, + { + "epoch": 15.275406595381435, + "eval_loss": 3.8730831146240234, + "eval_runtime": 1143.9137, + "eval_samples_per_second": 8.658, + "eval_steps_per_second": 0.034, + "step": 340000 + }, + { + "epoch": 15.284392128672836, + "grad_norm": 4.250601768493652, + "learning_rate": 3.933109959369753e-05, + "loss": 0.3798, + "step": 340200 + }, + { + "epoch": 15.293377661964238, + "grad_norm": 8.226158142089844, + "learning_rate": 3.9319532099521434e-05, + "loss": 0.3839, + "step": 340400 + }, + { + "epoch": 15.302363195255639, + "grad_norm": 30.672576904296875, + "learning_rate": 3.9307960040982396e-05, + "loss": 0.4016, + "step": 340600 + }, + { + "epoch": 15.311348728547038, + "grad_norm": 12.382901191711426, + "learning_rate": 3.929638342176902e-05, + "loss": 0.411, + "step": 340800 + }, + { + "epoch": 15.32033426183844, + "grad_norm": 5.150439262390137, + "learning_rate": 3.9284802245571385e-05, + "loss": 0.4006, + "step": 341000 + }, + { + "epoch": 15.32033426183844, + "eval_loss": 3.9192259311676025, + "eval_runtime": 1145.0085, + "eval_samples_per_second": 8.65, + "eval_steps_per_second": 0.034, + "step": 341000 + }, + { + "epoch": 15.329319795129841, + "grad_norm": 6.119823932647705, + "learning_rate": 3.927321651608097e-05, + "loss": 0.4234, + "step": 341200 + }, + { + "epoch": 15.338305328421242, + "grad_norm": 2.2303431034088135, + "learning_rate": 3.926162623699077e-05, + "loss": 0.393, + "step": 341400 + }, + { + "epoch": 15.347290861712642, + "grad_norm": 19.413272857666016, + "learning_rate": 3.9250031411995155e-05, + "loss": 0.4275, + "step": 341600 + }, + { + "epoch": 15.356276395004043, + "grad_norm": 2.270556688308716, + "learning_rate": 3.923843204479002e-05, + "loss": 0.4144, + "step": 341800 + }, + { + "epoch": 15.365261928295444, + "grad_norm": 10.509578704833984, + "learning_rate": 3.922682813907265e-05, + "loss": 0.4045, + "step": 342000 + }, + { + "epoch": 15.365261928295444, + "eval_loss": 3.8500490188598633, + "eval_runtime": 1170.295, + "eval_samples_per_second": 8.463, + "eval_steps_per_second": 0.033, + "step": 342000 + }, + { + "epoch": 15.374247461586846, + "grad_norm": 9.872151374816895, + "learning_rate": 3.921521969854182e-05, + "loss": 0.4156, + "step": 342200 + }, + { + "epoch": 15.383232994878245, + "grad_norm": 7.011927604675293, + "learning_rate": 3.9203606726897724e-05, + "loss": 0.4073, + "step": 342400 + }, + { + "epoch": 15.392218528169646, + "grad_norm": 8.124802589416504, + "learning_rate": 3.919198922784199e-05, + "loss": 0.4099, + "step": 342600 + }, + { + "epoch": 15.401204061461048, + "grad_norm": 9.334155082702637, + "learning_rate": 3.918036720507773e-05, + "loss": 0.423, + "step": 342800 + }, + { + "epoch": 15.410189594752449, + "grad_norm": 3.0574357509613037, + "learning_rate": 3.916874066230945e-05, + "loss": 0.4416, + "step": 343000 + }, + { + "epoch": 15.410189594752449, + "eval_loss": 3.8163387775421143, + "eval_runtime": 1150.3405, + "eval_samples_per_second": 8.61, + "eval_steps_per_second": 0.034, + "step": 343000 + }, + { + "epoch": 15.41917512804385, + "grad_norm": 
4.572579383850098, + "learning_rate": 3.915710960324314e-05, + "loss": 0.4077, + "step": 343200 + }, + { + "epoch": 15.42816066133525, + "grad_norm": 60.36442184448242, + "learning_rate": 3.91454740315862e-05, + "loss": 0.4761, + "step": 343400 + }, + { + "epoch": 15.437146194626651, + "grad_norm": 7.321791172027588, + "learning_rate": 3.913383395104748e-05, + "loss": 0.393, + "step": 343600 + }, + { + "epoch": 15.446131727918052, + "grad_norm": 8.782684326171875, + "learning_rate": 3.912218936533727e-05, + "loss": 0.4361, + "step": 343800 + }, + { + "epoch": 15.455117261209454, + "grad_norm": 17.37846565246582, + "learning_rate": 3.911054027816729e-05, + "loss": 0.4088, + "step": 344000 + }, + { + "epoch": 15.455117261209454, + "eval_loss": 3.8347713947296143, + "eval_runtime": 1150.0338, + "eval_samples_per_second": 8.612, + "eval_steps_per_second": 0.034, + "step": 344000 + }, + { + "epoch": 15.464102794500853, + "grad_norm": 4.234193325042725, + "learning_rate": 3.909888669325068e-05, + "loss": 0.4399, + "step": 344200 + }, + { + "epoch": 15.473088327792254, + "grad_norm": 6.374758720397949, + "learning_rate": 3.908722861430205e-05, + "loss": 0.4039, + "step": 344400 + }, + { + "epoch": 15.482073861083656, + "grad_norm": 34.553226470947266, + "learning_rate": 3.907556604503743e-05, + "loss": 0.4337, + "step": 344600 + }, + { + "epoch": 15.491059394375057, + "grad_norm": 10.942513465881348, + "learning_rate": 3.906389898917424e-05, + "loss": 0.4693, + "step": 344800 + }, + { + "epoch": 15.500044927666456, + "grad_norm": 8.577802658081055, + "learning_rate": 3.905222745043139e-05, + "loss": 0.3982, + "step": 345000 + }, + { + "epoch": 15.500044927666456, + "eval_loss": 3.816509962081909, + "eval_runtime": 1149.9103, + "eval_samples_per_second": 8.613, + "eval_steps_per_second": 0.034, + "step": 345000 + }, + { + "epoch": 15.509030460957858, + "grad_norm": 6.402909278869629, + "learning_rate": 3.9040551432529195e-05, + "loss": 0.4115, + "step": 345200 + }, + { + "epoch": 15.518015994249259, + "grad_norm": 6.276604175567627, + "learning_rate": 3.902887093918938e-05, + "loss": 0.4154, + "step": 345400 + }, + { + "epoch": 15.52700152754066, + "grad_norm": 7.94034481048584, + "learning_rate": 3.9017185974135115e-05, + "loss": 0.3947, + "step": 345600 + }, + { + "epoch": 15.53598706083206, + "grad_norm": 1.8332997560501099, + "learning_rate": 3.900549654109101e-05, + "loss": 0.41, + "step": 345800 + }, + { + "epoch": 15.544972594123461, + "grad_norm": 19.339252471923828, + "learning_rate": 3.899380264378305e-05, + "loss": 0.4381, + "step": 346000 + }, + { + "epoch": 15.544972594123461, + "eval_loss": 3.820833206176758, + "eval_runtime": 1150.5308, + "eval_samples_per_second": 8.608, + "eval_steps_per_second": 0.034, + "step": 346000 + }, + { + "epoch": 15.553958127414862, + "grad_norm": 23.56734275817871, + "learning_rate": 3.898210428593872e-05, + "loss": 0.411, + "step": 346200 + }, + { + "epoch": 15.562943660706264, + "grad_norm": 6.649259567260742, + "learning_rate": 3.897040147128683e-05, + "loss": 0.424, + "step": 346400 + }, + { + "epoch": 15.571929193997663, + "grad_norm": 5.427579879760742, + "learning_rate": 3.89586942035577e-05, + "loss": 0.4441, + "step": 346600 + }, + { + "epoch": 15.580914727289064, + "grad_norm": 5.252974510192871, + "learning_rate": 3.8946982486483015e-05, + "loss": 0.4452, + "step": 346800 + }, + { + "epoch": 15.589900260580466, + "grad_norm": 3.2411303520202637, + "learning_rate": 3.8935266323795895e-05, + "loss": 0.3956, + "step": 347000 + }, + { + "epoch": 
15.589900260580466, + "eval_loss": 3.8776004314422607, + "eval_runtime": 1148.9182, + "eval_samples_per_second": 8.62, + "eval_steps_per_second": 0.034, + "step": 347000 + }, + { + "epoch": 15.598885793871867, + "grad_norm": 9.3895902633667, + "learning_rate": 3.892354571923088e-05, + "loss": 0.4057, + "step": 347200 + }, + { + "epoch": 15.607871327163267, + "grad_norm": 3.1582448482513428, + "learning_rate": 3.8911820676523925e-05, + "loss": 0.4189, + "step": 347400 + }, + { + "epoch": 15.616856860454668, + "grad_norm": 9.8271484375, + "learning_rate": 3.890009119941239e-05, + "loss": 0.4239, + "step": 347600 + }, + { + "epoch": 15.625842393746069, + "grad_norm": 2.3805694580078125, + "learning_rate": 3.888835729163507e-05, + "loss": 0.4121, + "step": 347800 + }, + { + "epoch": 15.63482792703747, + "grad_norm": 12.050047874450684, + "learning_rate": 3.887661895693214e-05, + "loss": 0.4411, + "step": 348000 + }, + { + "epoch": 15.63482792703747, + "eval_loss": 3.842379570007324, + "eval_runtime": 1150.1946, + "eval_samples_per_second": 8.611, + "eval_steps_per_second": 0.034, + "step": 348000 + }, + { + "epoch": 15.64381346032887, + "grad_norm": 12.517159461975098, + "learning_rate": 3.886487619904521e-05, + "loss": 0.4285, + "step": 348200 + }, + { + "epoch": 15.652798993620271, + "grad_norm": 8.59961223602295, + "learning_rate": 3.88531290217173e-05, + "loss": 0.4315, + "step": 348400 + }, + { + "epoch": 15.661784526911672, + "grad_norm": 9.657811164855957, + "learning_rate": 3.8841377428692835e-05, + "loss": 0.4277, + "step": 348600 + }, + { + "epoch": 15.670770060203074, + "grad_norm": 4.169412136077881, + "learning_rate": 3.882962142371763e-05, + "loss": 0.4158, + "step": 348800 + }, + { + "epoch": 15.679755593494473, + "grad_norm": 5.746458530426025, + "learning_rate": 3.881786101053894e-05, + "loss": 0.4112, + "step": 349000 + }, + { + "epoch": 15.679755593494473, + "eval_loss": 3.84271240234375, + "eval_runtime": 1152.7298, + "eval_samples_per_second": 8.592, + "eval_steps_per_second": 0.034, + "step": 349000 + }, + { + "epoch": 15.688741126785875, + "grad_norm": 5.669808387756348, + "learning_rate": 3.880609619290538e-05, + "loss": 0.4544, + "step": 349200 + }, + { + "epoch": 15.697726660077276, + "grad_norm": 2.429694652557373, + "learning_rate": 3.879432697456703e-05, + "loss": 0.4341, + "step": 349400 + }, + { + "epoch": 15.706712193368677, + "grad_norm": 2.860553026199341, + "learning_rate": 3.8782553359275315e-05, + "loss": 0.4342, + "step": 349600 + }, + { + "epoch": 15.715697726660077, + "grad_norm": 11.57726001739502, + "learning_rate": 3.877077535078309e-05, + "loss": 0.4178, + "step": 349800 + }, + { + "epoch": 15.724683259951478, + "grad_norm": 2.3827250003814697, + "learning_rate": 3.8758992952844605e-05, + "loss": 0.4078, + "step": 350000 + }, + { + "epoch": 15.724683259951478, + "eval_loss": 3.8592307567596436, + "eval_runtime": 1149.9252, + "eval_samples_per_second": 8.613, + "eval_steps_per_second": 0.034, + "step": 350000 + }, + { + "epoch": 15.73366879324288, + "grad_norm": 28.76621437072754, + "learning_rate": 3.8747206169215516e-05, + "loss": 0.4289, + "step": 350200 + }, + { + "epoch": 15.74265432653428, + "grad_norm": 1.1635797023773193, + "learning_rate": 3.873541500365286e-05, + "loss": 0.4409, + "step": 350400 + }, + { + "epoch": 15.75163985982568, + "grad_norm": 9.564525604248047, + "learning_rate": 3.872361945991509e-05, + "loss": 0.4339, + "step": 350600 + }, + { + "epoch": 15.760625393117081, + "grad_norm": 3.1764824390411377, + "learning_rate": 
3.871181954176204e-05, + "loss": 0.4069, + "step": 350800 + }, + { + "epoch": 15.769610926408482, + "grad_norm": 5.794785499572754, + "learning_rate": 3.870001525295494e-05, + "loss": 0.4446, + "step": 351000 + }, + { + "epoch": 15.769610926408482, + "eval_loss": 3.835042953491211, + "eval_runtime": 1150.8003, + "eval_samples_per_second": 8.606, + "eval_steps_per_second": 0.034, + "step": 351000 + }, + { + "epoch": 15.778596459699884, + "grad_norm": 3.9470226764678955, + "learning_rate": 3.868820659725642e-05, + "loss": 0.4118, + "step": 351200 + }, + { + "epoch": 15.787581992991283, + "grad_norm": 25.599266052246094, + "learning_rate": 3.86763935784305e-05, + "loss": 0.3989, + "step": 351400 + }, + { + "epoch": 15.796567526282685, + "grad_norm": 11.884906768798828, + "learning_rate": 3.8664576200242604e-05, + "loss": 0.4074, + "step": 351600 + }, + { + "epoch": 15.805553059574086, + "grad_norm": 4.182280540466309, + "learning_rate": 3.8652754466459504e-05, + "loss": 0.4018, + "step": 351800 + }, + { + "epoch": 15.814538592865487, + "grad_norm": 2.89786696434021, + "learning_rate": 3.8640928380849406e-05, + "loss": 0.4295, + "step": 352000 + }, + { + "epoch": 15.814538592865487, + "eval_loss": 3.835994005203247, + "eval_runtime": 1149.5102, + "eval_samples_per_second": 8.616, + "eval_steps_per_second": 0.034, + "step": 352000 + }, + { + "epoch": 15.823524126156887, + "grad_norm": 2.728250741958618, + "learning_rate": 3.862909794718188e-05, + "loss": 0.4141, + "step": 352200 + }, + { + "epoch": 15.832509659448288, + "grad_norm": 5.0473456382751465, + "learning_rate": 3.861726316922789e-05, + "loss": 0.4068, + "step": 352400 + }, + { + "epoch": 15.84149519273969, + "grad_norm": 4.916729927062988, + "learning_rate": 3.860542405075978e-05, + "loss": 0.4048, + "step": 352600 + }, + { + "epoch": 15.85048072603109, + "grad_norm": 5.58930778503418, + "learning_rate": 3.859358059555127e-05, + "loss": 0.431, + "step": 352800 + }, + { + "epoch": 15.85946625932249, + "grad_norm": 2.4550957679748535, + "learning_rate": 3.858173280737748e-05, + "loss": 0.434, + "step": 353000 + }, + { + "epoch": 15.85946625932249, + "eval_loss": 3.8414108753204346, + "eval_runtime": 1140.739, + "eval_samples_per_second": 8.682, + "eval_steps_per_second": 0.034, + "step": 353000 + }, + { + "epoch": 15.868451792613891, + "grad_norm": 1.504676342010498, + "learning_rate": 3.85698806900149e-05, + "loss": 0.4354, + "step": 353200 + }, + { + "epoch": 15.877437325905293, + "grad_norm": 5.374175071716309, + "learning_rate": 3.8558024247241414e-05, + "loss": 0.458, + "step": 353400 + }, + { + "epoch": 15.886422859196694, + "grad_norm": 14.35389518737793, + "learning_rate": 3.854616348283625e-05, + "loss": 0.4403, + "step": 353600 + }, + { + "epoch": 15.895408392488093, + "grad_norm": 4.4372148513793945, + "learning_rate": 3.853429840058006e-05, + "loss": 0.4214, + "step": 353800 + }, + { + "epoch": 15.904393925779495, + "grad_norm": 10.166844367980957, + "learning_rate": 3.852242900425483e-05, + "loss": 0.43, + "step": 354000 + }, + { + "epoch": 15.904393925779495, + "eval_loss": 3.879225492477417, + "eval_runtime": 1145.2973, + "eval_samples_per_second": 8.648, + "eval_steps_per_second": 0.034, + "step": 354000 + }, + { + "epoch": 15.913379459070896, + "grad_norm": 3.3060805797576904, + "learning_rate": 3.8510555297643956e-05, + "loss": 0.4449, + "step": 354200 + }, + { + "epoch": 15.922364992362297, + "grad_norm": 17.104143142700195, + "learning_rate": 3.849867728453218e-05, + "loss": 0.4431, + "step": 354400 + }, + { + 
"epoch": 15.931350525653698, + "grad_norm": 5.082907676696777, + "learning_rate": 3.848679496870563e-05, + "loss": 0.4273, + "step": 354600 + }, + { + "epoch": 15.940336058945098, + "grad_norm": 9.734619140625, + "learning_rate": 3.847490835395181e-05, + "loss": 0.4214, + "step": 354800 + }, + { + "epoch": 15.9493215922365, + "grad_norm": 10.629302024841309, + "learning_rate": 3.846301744405959e-05, + "loss": 0.4601, + "step": 355000 + }, + { + "epoch": 15.9493215922365, + "eval_loss": 3.8631420135498047, + "eval_runtime": 1142.5819, + "eval_samples_per_second": 8.668, + "eval_steps_per_second": 0.034, + "step": 355000 + }, + { + "epoch": 15.9583071255279, + "grad_norm": 15.07685375213623, + "learning_rate": 3.84511222428192e-05, + "loss": 0.4517, + "step": 355200 + }, + { + "epoch": 15.9672926588193, + "grad_norm": 2.141556978225708, + "learning_rate": 3.843922275402225e-05, + "loss": 0.4253, + "step": 355400 + }, + { + "epoch": 15.976278192110701, + "grad_norm": 9.05489444732666, + "learning_rate": 3.842731898146171e-05, + "loss": 0.4403, + "step": 355600 + }, + { + "epoch": 15.985263725402103, + "grad_norm": 7.7289557456970215, + "learning_rate": 3.841541092893191e-05, + "loss": 0.4053, + "step": 355800 + }, + { + "epoch": 15.994249258693504, + "grad_norm": 16.47095489501953, + "learning_rate": 3.8403498600228574e-05, + "loss": 0.4137, + "step": 356000 + }, + { + "epoch": 15.994249258693504, + "eval_loss": 3.8049228191375732, + "eval_runtime": 1141.3474, + "eval_samples_per_second": 8.677, + "eval_steps_per_second": 0.034, + "step": 356000 + }, + { + "epoch": 16.003234791984905, + "grad_norm": 7.816695213317871, + "learning_rate": 3.839158199914874e-05, + "loss": 0.4137, + "step": 356200 + }, + { + "epoch": 16.012220325276306, + "grad_norm": 2.7365758419036865, + "learning_rate": 3.837966112949086e-05, + "loss": 0.4017, + "step": 356400 + }, + { + "epoch": 16.021205858567708, + "grad_norm": 8.747932434082031, + "learning_rate": 3.8367735995054704e-05, + "loss": 0.3901, + "step": 356600 + }, + { + "epoch": 16.030191391859105, + "grad_norm": 4.3832106590271, + "learning_rate": 3.835580659964142e-05, + "loss": 0.3867, + "step": 356800 + }, + { + "epoch": 16.039176925150507, + "grad_norm": 12.593661308288574, + "learning_rate": 3.834387294705352e-05, + "loss": 0.4276, + "step": 357000 + }, + { + "epoch": 16.039176925150507, + "eval_loss": 3.8479878902435303, + "eval_runtime": 1145.2444, + "eval_samples_per_second": 8.648, + "eval_steps_per_second": 0.034, + "step": 357000 + }, + { + "epoch": 16.048162458441908, + "grad_norm": 4.510431289672852, + "learning_rate": 3.833193504109487e-05, + "loss": 0.4091, + "step": 357200 + }, + { + "epoch": 16.05714799173331, + "grad_norm": 14.032699584960938, + "learning_rate": 3.831999288557067e-05, + "loss": 0.382, + "step": 357400 + }, + { + "epoch": 16.06613352502471, + "grad_norm": 8.67285442352295, + "learning_rate": 3.83080464842875e-05, + "loss": 0.4095, + "step": 357600 + }, + { + "epoch": 16.075119058316112, + "grad_norm": 11.347421646118164, + "learning_rate": 3.8296095841053295e-05, + "loss": 0.4026, + "step": 357800 + }, + { + "epoch": 16.084104591607513, + "grad_norm": 2.454707622528076, + "learning_rate": 3.8284140959677315e-05, + "loss": 0.3763, + "step": 358000 + }, + { + "epoch": 16.084104591607513, + "eval_loss": 3.891216993331909, + "eval_runtime": 1143.6428, + "eval_samples_per_second": 8.66, + "eval_steps_per_second": 0.034, + "step": 358000 + }, + { + "epoch": 16.093090124898914, + "grad_norm": 6.182559490203857, + "learning_rate": 
3.827218184397021e-05, + "loss": 0.3719, + "step": 358200 + }, + { + "epoch": 16.102075658190312, + "grad_norm": 8.535185813903809, + "learning_rate": 3.826021849774394e-05, + "loss": 0.3971, + "step": 358400 + }, + { + "epoch": 16.111061191481713, + "grad_norm": 4.548397064208984, + "learning_rate": 3.8248250924811843e-05, + "loss": 0.371, + "step": 358600 + }, + { + "epoch": 16.120046724773115, + "grad_norm": 10.030683517456055, + "learning_rate": 3.8236279128988584e-05, + "loss": 0.4092, + "step": 358800 + }, + { + "epoch": 16.129032258064516, + "grad_norm": 5.520787239074707, + "learning_rate": 3.8224303114090196e-05, + "loss": 0.436, + "step": 359000 + }, + { + "epoch": 16.129032258064516, + "eval_loss": 3.845858573913574, + "eval_runtime": 1151.3773, + "eval_samples_per_second": 8.602, + "eval_steps_per_second": 0.034, + "step": 359000 + }, + { + "epoch": 16.138017791355917, + "grad_norm": 0.6454381346702576, + "learning_rate": 3.8212322883934026e-05, + "loss": 0.4252, + "step": 359200 + }, + { + "epoch": 16.14700332464732, + "grad_norm": 10.40180492401123, + "learning_rate": 3.82003384423388e-05, + "loss": 0.3774, + "step": 359400 + }, + { + "epoch": 16.15598885793872, + "grad_norm": 1.8541001081466675, + "learning_rate": 3.8188349793124554e-05, + "loss": 0.3787, + "step": 359600 + }, + { + "epoch": 16.16497439123012, + "grad_norm": 9.01765251159668, + "learning_rate": 3.817635694011268e-05, + "loss": 0.4182, + "step": 359800 + }, + { + "epoch": 16.17395992452152, + "grad_norm": 1.7692986726760864, + "learning_rate": 3.8164359887125935e-05, + "loss": 0.4164, + "step": 360000 + }, + { + "epoch": 16.17395992452152, + "eval_loss": 3.8807284832000732, + "eval_runtime": 1141.9331, + "eval_samples_per_second": 8.673, + "eval_steps_per_second": 0.034, + "step": 360000 + }, + { + "epoch": 16.18294545781292, + "grad_norm": 13.624265670776367, + "learning_rate": 3.815235863798836e-05, + "loss": 0.3842, + "step": 360200 + }, + { + "epoch": 16.19193099110432, + "grad_norm": 4.887984275817871, + "learning_rate": 3.814035319652538e-05, + "loss": 0.3879, + "step": 360400 + }, + { + "epoch": 16.200916524395723, + "grad_norm": 0.7442801594734192, + "learning_rate": 3.8128343566563726e-05, + "loss": 0.3995, + "step": 360600 + }, + { + "epoch": 16.209902057687124, + "grad_norm": 10.681866645812988, + "learning_rate": 3.811632975193149e-05, + "loss": 0.4225, + "step": 360800 + }, + { + "epoch": 16.218887590978525, + "grad_norm": 0.09919462352991104, + "learning_rate": 3.8104311756458085e-05, + "loss": 0.4133, + "step": 361000 + }, + { + "epoch": 16.218887590978525, + "eval_loss": 3.8468129634857178, + "eval_runtime": 1141.1126, + "eval_samples_per_second": 8.679, + "eval_steps_per_second": 0.034, + "step": 361000 + }, + { + "epoch": 16.227873124269927, + "grad_norm": 2.938690185546875, + "learning_rate": 3.809228958397425e-05, + "loss": 0.4147, + "step": 361200 + }, + { + "epoch": 16.236858657561328, + "grad_norm": 5.6593828201293945, + "learning_rate": 3.808026323831208e-05, + "loss": 0.3787, + "step": 361400 + }, + { + "epoch": 16.245844190852726, + "grad_norm": 4.981930255889893, + "learning_rate": 3.806823272330495e-05, + "loss": 0.3999, + "step": 361600 + }, + { + "epoch": 16.254829724144127, + "grad_norm": 5.699765205383301, + "learning_rate": 3.805619804278763e-05, + "loss": 0.4093, + "step": 361800 + }, + { + "epoch": 16.263815257435528, + "grad_norm": 1.215476155281067, + "learning_rate": 3.804415920059616e-05, + "loss": 0.4021, + "step": 362000 + }, + { + "epoch": 16.263815257435528, + 
"eval_loss": 3.8529727458953857, + "eval_runtime": 1150.9758, + "eval_samples_per_second": 8.605, + "eval_steps_per_second": 0.034, + "step": 362000 + }, + { + "epoch": 16.27280079072693, + "grad_norm": 15.102256774902344, + "learning_rate": 3.8032116200567944e-05, + "loss": 0.4041, + "step": 362200 + }, + { + "epoch": 16.28178632401833, + "grad_norm": 8.938138008117676, + "learning_rate": 3.80200690465417e-05, + "loss": 0.4056, + "step": 362400 + }, + { + "epoch": 16.290771857309732, + "grad_norm": 0.7558520436286926, + "learning_rate": 3.800801774235746e-05, + "loss": 0.3967, + "step": 362600 + }, + { + "epoch": 16.299757390601133, + "grad_norm": 3.1432087421417236, + "learning_rate": 3.79959622918566e-05, + "loss": 0.4021, + "step": 362800 + }, + { + "epoch": 16.308742923892535, + "grad_norm": 11.30734920501709, + "learning_rate": 3.798390269888179e-05, + "loss": 0.39, + "step": 363000 + }, + { + "epoch": 16.308742923892535, + "eval_loss": 3.8927652835845947, + "eval_runtime": 1141.2518, + "eval_samples_per_second": 8.678, + "eval_steps_per_second": 0.034, + "step": 363000 + }, + { + "epoch": 16.317728457183932, + "grad_norm": 11.273520469665527, + "learning_rate": 3.797183896727704e-05, + "loss": 0.4538, + "step": 363200 + }, + { + "epoch": 16.326713990475334, + "grad_norm": 17.33855438232422, + "learning_rate": 3.7959771100887685e-05, + "loss": 0.4019, + "step": 363400 + }, + { + "epoch": 16.335699523766735, + "grad_norm": 9.408929824829102, + "learning_rate": 3.794769910356036e-05, + "loss": 0.4173, + "step": 363600 + }, + { + "epoch": 16.344685057058136, + "grad_norm": 5.125523567199707, + "learning_rate": 3.793562297914302e-05, + "loss": 0.4259, + "step": 363800 + }, + { + "epoch": 16.353670590349537, + "grad_norm": 17.848237991333008, + "learning_rate": 3.792354273148495e-05, + "loss": 0.4109, + "step": 364000 + }, + { + "epoch": 16.353670590349537, + "eval_loss": 3.8154456615448, + "eval_runtime": 1133.9853, + "eval_samples_per_second": 8.734, + "eval_steps_per_second": 0.034, + "step": 364000 + }, + { + "epoch": 16.36265612364094, + "grad_norm": 7.285728931427002, + "learning_rate": 3.791145836443673e-05, + "loss": 0.4203, + "step": 364200 + }, + { + "epoch": 16.37164165693234, + "grad_norm": 0.5706067681312561, + "learning_rate": 3.7899369881850264e-05, + "loss": 0.4326, + "step": 364400 + }, + { + "epoch": 16.38062719022374, + "grad_norm": 6.83461856842041, + "learning_rate": 3.788727728757876e-05, + "loss": 0.415, + "step": 364600 + }, + { + "epoch": 16.38961272351514, + "grad_norm": 3.2358269691467285, + "learning_rate": 3.7875180585476754e-05, + "loss": 0.4249, + "step": 364800 + }, + { + "epoch": 16.39859825680654, + "grad_norm": 4.388341903686523, + "learning_rate": 3.786307977940008e-05, + "loss": 0.4001, + "step": 365000 + }, + { + "epoch": 16.39859825680654, + "eval_loss": 3.87809681892395, + "eval_runtime": 1106.541, + "eval_samples_per_second": 8.95, + "eval_steps_per_second": 0.035, + "step": 365000 + }, + { + "epoch": 16.40758379009794, + "grad_norm": 10.232439994812012, + "learning_rate": 3.785097487320588e-05, + "loss": 0.4246, + "step": 365200 + }, + { + "epoch": 16.416569323389343, + "grad_norm": 21.1503849029541, + "learning_rate": 3.783886587075259e-05, + "loss": 0.4109, + "step": 365400 + }, + { + "epoch": 16.425554856680744, + "grad_norm": 15.055440902709961, + "learning_rate": 3.782675277589998e-05, + "loss": 0.4047, + "step": 365600 + }, + { + "epoch": 16.434540389972145, + "grad_norm": 5.9024128913879395, + "learning_rate": 3.78146355925091e-05, + 
"loss": 0.4365, + "step": 365800 + }, + { + "epoch": 16.443525923263547, + "grad_norm": 3.827387571334839, + "learning_rate": 3.780251432444232e-05, + "loss": 0.3897, + "step": 366000 + }, + { + "epoch": 16.443525923263547, + "eval_loss": 3.8388655185699463, + "eval_runtime": 1105.7998, + "eval_samples_per_second": 8.956, + "eval_steps_per_second": 0.035, + "step": 366000 + }, + { + "epoch": 16.452511456554948, + "grad_norm": 5.388125419616699, + "learning_rate": 3.7790388975563296e-05, + "loss": 0.4402, + "step": 366200 + }, + { + "epoch": 16.461496989846346, + "grad_norm": 1.5944033861160278, + "learning_rate": 3.777825954973699e-05, + "loss": 0.4247, + "step": 366400 + }, + { + "epoch": 16.470482523137747, + "grad_norm": 3.2299532890319824, + "learning_rate": 3.7766126050829683e-05, + "loss": 0.4161, + "step": 366600 + }, + { + "epoch": 16.47946805642915, + "grad_norm": 4.81660270690918, + "learning_rate": 3.7753988482708923e-05, + "loss": 0.4256, + "step": 366800 + }, + { + "epoch": 16.48845358972055, + "grad_norm": 12.131381034851074, + "learning_rate": 3.774184684924359e-05, + "loss": 0.4218, + "step": 367000 + }, + { + "epoch": 16.48845358972055, + "eval_loss": 3.8612823486328125, + "eval_runtime": 1100.2738, + "eval_samples_per_second": 9.001, + "eval_steps_per_second": 0.035, + "step": 367000 + }, + { + "epoch": 16.49743912301195, + "grad_norm": 2.8556697368621826, + "learning_rate": 3.772970115430381e-05, + "loss": 0.4187, + "step": 367200 + }, + { + "epoch": 16.506424656303352, + "grad_norm": 8.463600158691406, + "learning_rate": 3.7717551401761055e-05, + "loss": 0.3736, + "step": 367400 + }, + { + "epoch": 16.515410189594753, + "grad_norm": 0.5444090962409973, + "learning_rate": 3.770539759548806e-05, + "loss": 0.4075, + "step": 367600 + }, + { + "epoch": 16.524395722886155, + "grad_norm": 16.545907974243164, + "learning_rate": 3.7693239739358865e-05, + "loss": 0.4065, + "step": 367800 + }, + { + "epoch": 16.533381256177556, + "grad_norm": 17.78046989440918, + "learning_rate": 3.76810778372488e-05, + "loss": 0.4137, + "step": 368000 + }, + { + "epoch": 16.533381256177556, + "eval_loss": 3.8438374996185303, + "eval_runtime": 1102.6952, + "eval_samples_per_second": 8.982, + "eval_steps_per_second": 0.035, + "step": 368000 + }, + { + "epoch": 16.542366789468954, + "grad_norm": 5.933611869812012, + "learning_rate": 3.766891189303448e-05, + "loss": 0.4089, + "step": 368200 + }, + { + "epoch": 16.551352322760355, + "grad_norm": 2.965001106262207, + "learning_rate": 3.76567419105938e-05, + "loss": 0.3756, + "step": 368400 + }, + { + "epoch": 16.560337856051756, + "grad_norm": 12.640633583068848, + "learning_rate": 3.764456789380596e-05, + "loss": 0.4273, + "step": 368600 + }, + { + "epoch": 16.569323389343158, + "grad_norm": 7.198838233947754, + "learning_rate": 3.763238984655144e-05, + "loss": 0.4022, + "step": 368800 + }, + { + "epoch": 16.57830892263456, + "grad_norm": 3.5390090942382812, + "learning_rate": 3.7620207772712e-05, + "loss": 0.4116, + "step": 369000 + }, + { + "epoch": 16.57830892263456, + "eval_loss": 3.8293216228485107, + "eval_runtime": 1099.8945, + "eval_samples_per_second": 9.005, + "eval_steps_per_second": 0.035, + "step": 369000 + }, + { + "epoch": 16.58729445592596, + "grad_norm": 5.592366695404053, + "learning_rate": 3.7608021676170695e-05, + "loss": 0.4036, + "step": 369200 + }, + { + "epoch": 16.59627998921736, + "grad_norm": 12.47636890411377, + "learning_rate": 3.759583156081184e-05, + "loss": 0.3893, + "step": 369400 + }, + { + "epoch": 16.60526552250876, 
+ "grad_norm": 3.6026880741119385, + "learning_rate": 3.758363743052105e-05, + "loss": 0.4395, + "step": 369600 + }, + { + "epoch": 16.61425105580016, + "grad_norm": 8.781318664550781, + "learning_rate": 3.7571439289185204e-05, + "loss": 0.3842, + "step": 369800 + }, + { + "epoch": 16.62323658909156, + "grad_norm": 1.9131399393081665, + "learning_rate": 3.75592371406925e-05, + "loss": 0.4082, + "step": 370000 + }, + { + "epoch": 16.62323658909156, + "eval_loss": 3.8365583419799805, + "eval_runtime": 1106.4819, + "eval_samples_per_second": 8.951, + "eval_steps_per_second": 0.035, + "step": 370000 + }, + { + "epoch": 16.632222122382963, + "grad_norm": 9.32291030883789, + "learning_rate": 3.754703098893235e-05, + "loss": 0.4044, + "step": 370200 + }, + { + "epoch": 16.641207655674364, + "grad_norm": 7.453135013580322, + "learning_rate": 3.753482083779549e-05, + "loss": 0.4132, + "step": 370400 + }, + { + "epoch": 16.650193188965766, + "grad_norm": 13.478267669677734, + "learning_rate": 3.752260669117392e-05, + "loss": 0.4149, + "step": 370600 + }, + { + "epoch": 16.659178722257167, + "grad_norm": 4.782924652099609, + "learning_rate": 3.7510388552960895e-05, + "loss": 0.4303, + "step": 370800 + }, + { + "epoch": 16.668164255548568, + "grad_norm": 6.732643127441406, + "learning_rate": 3.749816642705098e-05, + "loss": 0.4386, + "step": 371000 + }, + { + "epoch": 16.668164255548568, + "eval_loss": 3.8590922355651855, + "eval_runtime": 1101.0023, + "eval_samples_per_second": 8.995, + "eval_steps_per_second": 0.035, + "step": 371000 + }, + { + "epoch": 16.67714978883997, + "grad_norm": 11.248590469360352, + "learning_rate": 3.748594031733996e-05, + "loss": 0.4137, + "step": 371200 + }, + { + "epoch": 16.686135322131367, + "grad_norm": 7.598705768585205, + "learning_rate": 3.747371022772494e-05, + "loss": 0.415, + "step": 371400 + }, + { + "epoch": 16.69512085542277, + "grad_norm": 2.1938705444335938, + "learning_rate": 3.746147616210426e-05, + "loss": 0.4304, + "step": 371600 + }, + { + "epoch": 16.70410638871417, + "grad_norm": 4.91569185256958, + "learning_rate": 3.7449238124377536e-05, + "loss": 0.4076, + "step": 371800 + }, + { + "epoch": 16.71309192200557, + "grad_norm": 20.976909637451172, + "learning_rate": 3.743699611844567e-05, + "loss": 0.405, + "step": 372000 + }, + { + "epoch": 16.71309192200557, + "eval_loss": 3.873788595199585, + "eval_runtime": 1101.0887, + "eval_samples_per_second": 8.995, + "eval_steps_per_second": 0.035, + "step": 372000 + }, + { + "epoch": 16.722077455296972, + "grad_norm": 8.065682411193848, + "learning_rate": 3.7424750148210794e-05, + "loss": 0.4384, + "step": 372200 + }, + { + "epoch": 16.731062988588373, + "grad_norm": 13.42385482788086, + "learning_rate": 3.741250021757633e-05, + "loss": 0.4002, + "step": 372400 + }, + { + "epoch": 16.740048521879775, + "grad_norm": 14.792691230773926, + "learning_rate": 3.7400246330446954e-05, + "loss": 0.3998, + "step": 372600 + }, + { + "epoch": 16.749034055171176, + "grad_norm": 28.727434158325195, + "learning_rate": 3.7387988490728595e-05, + "loss": 0.4238, + "step": 372800 + }, + { + "epoch": 16.758019588462574, + "grad_norm": 10.067317008972168, + "learning_rate": 3.7375726702328454e-05, + "loss": 0.4134, + "step": 373000 + }, + { + "epoch": 16.758019588462574, + "eval_loss": 3.951530933380127, + "eval_runtime": 1102.4686, + "eval_samples_per_second": 8.983, + "eval_steps_per_second": 0.035, + "step": 373000 + }, + { + "epoch": 16.767005121753975, + "grad_norm": 9.972529411315918, + "learning_rate": 
3.736346096915499e-05, + "loss": 0.4335, + "step": 373200 + }, + { + "epoch": 16.775990655045376, + "grad_norm": 2.3625543117523193, + "learning_rate": 3.735119129511792e-05, + "loss": 0.4357, + "step": 373400 + }, + { + "epoch": 16.784976188336778, + "grad_norm": 5.44252347946167, + "learning_rate": 3.733891768412819e-05, + "loss": 0.4042, + "step": 373600 + }, + { + "epoch": 16.79396172162818, + "grad_norm": 14.719382286071777, + "learning_rate": 3.7326640140098056e-05, + "loss": 0.379, + "step": 373800 + }, + { + "epoch": 16.80294725491958, + "grad_norm": 12.511571884155273, + "learning_rate": 3.731435866694097e-05, + "loss": 0.4258, + "step": 374000 + }, + { + "epoch": 16.80294725491958, + "eval_loss": 3.8407986164093018, + "eval_runtime": 1100.7682, + "eval_samples_per_second": 8.997, + "eval_steps_per_second": 0.035, + "step": 374000 + }, + { + "epoch": 16.81193278821098, + "grad_norm": 2.9213812351226807, + "learning_rate": 3.7302073268571673e-05, + "loss": 0.4111, + "step": 374200 + }, + { + "epoch": 16.820918321502383, + "grad_norm": 40.420196533203125, + "learning_rate": 3.728978394890615e-05, + "loss": 0.4209, + "step": 374400 + }, + { + "epoch": 16.82990385479378, + "grad_norm": 1.4034184217453003, + "learning_rate": 3.727749071186162e-05, + "loss": 0.4118, + "step": 374600 + }, + { + "epoch": 16.83888938808518, + "grad_norm": 10.61877727508545, + "learning_rate": 3.7265193561356576e-05, + "loss": 0.3717, + "step": 374800 + }, + { + "epoch": 16.847874921376583, + "grad_norm": 15.831500053405762, + "learning_rate": 3.725289250131074e-05, + "loss": 0.4242, + "step": 375000 + }, + { + "epoch": 16.847874921376583, + "eval_loss": 3.901285171508789, + "eval_runtime": 1085.5255, + "eval_samples_per_second": 9.124, + "eval_steps_per_second": 0.036, + "step": 375000 + }, + { + "epoch": 16.856860454667984, + "grad_norm": 19.590776443481445, + "learning_rate": 3.724058753564507e-05, + "loss": 0.4149, + "step": 375200 + }, + { + "epoch": 16.865845987959386, + "grad_norm": 12.736054420471191, + "learning_rate": 3.722827866828181e-05, + "loss": 0.4186, + "step": 375400 + }, + { + "epoch": 16.874831521250787, + "grad_norm": 18.651493072509766, + "learning_rate": 3.721596590314441e-05, + "loss": 0.4529, + "step": 375600 + }, + { + "epoch": 16.883817054542188, + "grad_norm": 9.52115535736084, + "learning_rate": 3.720364924415757e-05, + "loss": 0.4294, + "step": 375800 + }, + { + "epoch": 16.89280258783359, + "grad_norm": 11.281582832336426, + "learning_rate": 3.719132869524723e-05, + "loss": 0.4451, + "step": 376000 + }, + { + "epoch": 16.89280258783359, + "eval_loss": 3.8090622425079346, + "eval_runtime": 1084.0102, + "eval_samples_per_second": 9.136, + "eval_steps_per_second": 0.036, + "step": 376000 + }, + { + "epoch": 16.901788121124987, + "grad_norm": 17.860044479370117, + "learning_rate": 3.71790042603406e-05, + "loss": 0.4197, + "step": 376200 + }, + { + "epoch": 16.91077365441639, + "grad_norm": 2.703660488128662, + "learning_rate": 3.716667594336608e-05, + "loss": 0.4291, + "step": 376400 + }, + { + "epoch": 16.91975918770779, + "grad_norm": 6.559628486633301, + "learning_rate": 3.715434374825334e-05, + "loss": 0.4271, + "step": 376600 + }, + { + "epoch": 16.92874472099919, + "grad_norm": 17.741317749023438, + "learning_rate": 3.7142007678933286e-05, + "loss": 0.4216, + "step": 376800 + }, + { + "epoch": 16.937730254290592, + "grad_norm": 14.408329963684082, + "learning_rate": 3.7129667739338035e-05, + "loss": 0.3846, + "step": 377000 + }, + { + "epoch": 16.937730254290592, + 
"eval_loss": 3.846365213394165, + "eval_runtime": 1084.0168, + "eval_samples_per_second": 9.136, + "eval_steps_per_second": 0.036, + "step": 377000 + }, + { + "epoch": 16.946715787581994, + "grad_norm": 6.594641208648682, + "learning_rate": 3.711732393340097e-05, + "loss": 0.4175, + "step": 377200 + }, + { + "epoch": 16.955701320873395, + "grad_norm": 22.12388801574707, + "learning_rate": 3.710497626505666e-05, + "loss": 0.4371, + "step": 377400 + }, + { + "epoch": 16.964686854164796, + "grad_norm": 18.402645111083984, + "learning_rate": 3.7092624738240974e-05, + "loss": 0.3814, + "step": 377600 + }, + { + "epoch": 16.973672387456194, + "grad_norm": 0.5258151888847351, + "learning_rate": 3.708026935689094e-05, + "loss": 0.3426, + "step": 377800 + }, + { + "epoch": 16.982657920747595, + "grad_norm": 13.795966148376465, + "learning_rate": 3.7067910124944866e-05, + "loss": 0.3805, + "step": 378000 + }, + { + "epoch": 16.982657920747595, + "eval_loss": 3.942888021469116, + "eval_runtime": 1083.5357, + "eval_samples_per_second": 9.14, + "eval_steps_per_second": 0.036, + "step": 378000 + }, + { + "epoch": 16.991643454038996, + "grad_norm": 15.092402458190918, + "learning_rate": 3.7055547046342257e-05, + "loss": 0.4181, + "step": 378200 + }, + { + "epoch": 17.000628987330398, + "grad_norm": 8.252157211303711, + "learning_rate": 3.704318012502386e-05, + "loss": 0.4221, + "step": 378400 + }, + { + "epoch": 17.0096145206218, + "grad_norm": 7.719264030456543, + "learning_rate": 3.703080936493163e-05, + "loss": 0.3772, + "step": 378600 + }, + { + "epoch": 17.0186000539132, + "grad_norm": 9.026861190795898, + "learning_rate": 3.701843477000879e-05, + "loss": 0.3988, + "step": 378800 + }, + { + "epoch": 17.0275855872046, + "grad_norm": 6.281711101531982, + "learning_rate": 3.7006056344199716e-05, + "loss": 0.3912, + "step": 379000 + }, + { + "epoch": 17.0275855872046, + "eval_loss": 3.819859504699707, + "eval_runtime": 1085.6011, + "eval_samples_per_second": 9.123, + "eval_steps_per_second": 0.036, + "step": 379000 + }, + { + "epoch": 17.036571120496003, + "grad_norm": 2.070225954055786, + "learning_rate": 3.699367409145005e-05, + "loss": 0.4107, + "step": 379200 + }, + { + "epoch": 17.0455566537874, + "grad_norm": 8.535941123962402, + "learning_rate": 3.698128801570665e-05, + "loss": 0.3904, + "step": 379400 + }, + { + "epoch": 17.054542187078802, + "grad_norm": 6.998322486877441, + "learning_rate": 3.69688981209176e-05, + "loss": 0.4092, + "step": 379600 + }, + { + "epoch": 17.063527720370203, + "grad_norm": 1.5596981048583984, + "learning_rate": 3.6956504411032165e-05, + "loss": 0.4072, + "step": 379800 + }, + { + "epoch": 17.072513253661604, + "grad_norm": 11.192583084106445, + "learning_rate": 3.694410689000087e-05, + "loss": 0.3701, + "step": 380000 + }, + { + "epoch": 17.072513253661604, + "eval_loss": 3.847810745239258, + "eval_runtime": 1083.6619, + "eval_samples_per_second": 9.139, + "eval_steps_per_second": 0.036, + "step": 380000 + }, + { + "epoch": 17.081498786953006, + "grad_norm": 21.050588607788086, + "learning_rate": 3.693170556177542e-05, + "loss": 0.3933, + "step": 380200 + }, + { + "epoch": 17.090484320244407, + "grad_norm": 6.3362016677856445, + "learning_rate": 3.691930043030877e-05, + "loss": 0.3821, + "step": 380400 + }, + { + "epoch": 17.09946985353581, + "grad_norm": 7.509994029998779, + "learning_rate": 3.6906891499555054e-05, + "loss": 0.3792, + "step": 380600 + }, + { + "epoch": 17.10845538682721, + "grad_norm": 13.802506446838379, + "learning_rate": 3.6894478773469624e-05, 
+ "loss": 0.3725, + "step": 380800 + }, + { + "epoch": 17.117440920118607, + "grad_norm": 9.925665855407715, + "learning_rate": 3.688206225600904e-05, + "loss": 0.3727, + "step": 381000 + }, + { + "epoch": 17.117440920118607, + "eval_loss": 3.851689100265503, + "eval_runtime": 1083.8981, + "eval_samples_per_second": 9.137, + "eval_steps_per_second": 0.036, + "step": 381000 + }, + { + "epoch": 17.12642645341001, + "grad_norm": 0.7609677910804749, + "learning_rate": 3.68696419511311e-05, + "loss": 0.3871, + "step": 381200 + }, + { + "epoch": 17.13541198670141, + "grad_norm": 11.126961708068848, + "learning_rate": 3.685721786279478e-05, + "loss": 0.4077, + "step": 381400 + }, + { + "epoch": 17.14439751999281, + "grad_norm": 5.107800006866455, + "learning_rate": 3.684478999496026e-05, + "loss": 0.4096, + "step": 381600 + }, + { + "epoch": 17.153383053284212, + "grad_norm": 4.639297008514404, + "learning_rate": 3.6832358351588945e-05, + "loss": 0.3921, + "step": 381800 + }, + { + "epoch": 17.162368586575614, + "grad_norm": 5.009506702423096, + "learning_rate": 3.681992293664341e-05, + "loss": 0.3988, + "step": 382000 + }, + { + "epoch": 17.162368586575614, + "eval_loss": 3.8172054290771484, + "eval_runtime": 1088.2423, + "eval_samples_per_second": 9.101, + "eval_steps_per_second": 0.036, + "step": 382000 + }, + { + "epoch": 17.171354119867015, + "grad_norm": 2.0426735877990723, + "learning_rate": 3.6807483754087476e-05, + "loss": 0.3995, + "step": 382200 + }, + { + "epoch": 17.180339653158416, + "grad_norm": 0.8747676014900208, + "learning_rate": 3.679504080788614e-05, + "loss": 0.3465, + "step": 382400 + }, + { + "epoch": 17.189325186449818, + "grad_norm": 9.304901123046875, + "learning_rate": 3.678259410200558e-05, + "loss": 0.3792, + "step": 382600 + }, + { + "epoch": 17.198310719741215, + "grad_norm": 5.541252136230469, + "learning_rate": 3.677014364041323e-05, + "loss": 0.3944, + "step": 382800 + }, + { + "epoch": 17.207296253032617, + "grad_norm": 7.812130451202393, + "learning_rate": 3.675768942707767e-05, + "loss": 0.4363, + "step": 383000 + }, + { + "epoch": 17.207296253032617, + "eval_loss": 3.8186628818511963, + "eval_runtime": 1085.5035, + "eval_samples_per_second": 9.124, + "eval_steps_per_second": 0.036, + "step": 383000 + }, + { + "epoch": 17.216281786324018, + "grad_norm": 8.80836296081543, + "learning_rate": 3.6745231465968674e-05, + "loss": 0.3704, + "step": 383200 + }, + { + "epoch": 17.22526731961542, + "grad_norm": 2.294656276702881, + "learning_rate": 3.673276976105724e-05, + "loss": 0.3851, + "step": 383400 + }, + { + "epoch": 17.23425285290682, + "grad_norm": 0.8409772515296936, + "learning_rate": 3.6720304316315556e-05, + "loss": 0.365, + "step": 383600 + }, + { + "epoch": 17.24323838619822, + "grad_norm": 7.286799430847168, + "learning_rate": 3.670783513571698e-05, + "loss": 0.3604, + "step": 383800 + }, + { + "epoch": 17.252223919489623, + "grad_norm": 11.555950164794922, + "learning_rate": 3.6695362223236086e-05, + "loss": 0.3812, + "step": 384000 + }, + { + "epoch": 17.252223919489623, + "eval_loss": 3.913374185562134, + "eval_runtime": 1084.6125, + "eval_samples_per_second": 9.131, + "eval_steps_per_second": 0.036, + "step": 384000 + }, + { + "epoch": 17.261209452781024, + "grad_norm": 2.9781994819641113, + "learning_rate": 3.668288558284861e-05, + "loss": 0.3923, + "step": 384200 + }, + { + "epoch": 17.270194986072422, + "grad_norm": 7.835712432861328, + "learning_rate": 3.66704052185315e-05, + "loss": 0.4073, + "step": 384400 + }, + { + "epoch": 
17.279180519363823, + "grad_norm": 9.055235862731934, + "learning_rate": 3.6657921134262885e-05, + "loss": 0.382, + "step": 384600 + }, + { + "epoch": 17.288166052655225, + "grad_norm": 27.968557357788086, + "learning_rate": 3.664543333402207e-05, + "loss": 0.4148, + "step": 384800 + }, + { + "epoch": 17.297151585946626, + "grad_norm": 12.404014587402344, + "learning_rate": 3.663294182178956e-05, + "loss": 0.3557, + "step": 385000 + }, + { + "epoch": 17.297151585946626, + "eval_loss": 3.8852949142456055, + "eval_runtime": 1086.2089, + "eval_samples_per_second": 9.118, + "eval_steps_per_second": 0.036, + "step": 385000 + }, + { + "epoch": 17.306137119238027, + "grad_norm": 10.516440391540527, + "learning_rate": 3.662044660154703e-05, + "loss": 0.4145, + "step": 385200 + }, + { + "epoch": 17.31512265252943, + "grad_norm": 2.42533278465271, + "learning_rate": 3.660794767727735e-05, + "loss": 0.3952, + "step": 385400 + }, + { + "epoch": 17.32410818582083, + "grad_norm": 1.5313594341278076, + "learning_rate": 3.659544505296456e-05, + "loss": 0.3634, + "step": 385600 + }, + { + "epoch": 17.33309371911223, + "grad_norm": 6.5009765625, + "learning_rate": 3.6582938732593865e-05, + "loss": 0.4266, + "step": 385800 + }, + { + "epoch": 17.34207925240363, + "grad_norm": 7.348703384399414, + "learning_rate": 3.657042872015168e-05, + "loss": 0.4209, + "step": 386000 + }, + { + "epoch": 17.34207925240363, + "eval_loss": 3.80428147315979, + "eval_runtime": 1088.4654, + "eval_samples_per_second": 9.099, + "eval_steps_per_second": 0.036, + "step": 386000 + }, + { + "epoch": 17.35106478569503, + "grad_norm": 5.27815580368042, + "learning_rate": 3.655791501962559e-05, + "loss": 0.3811, + "step": 386200 + }, + { + "epoch": 17.36005031898643, + "grad_norm": 10.278822898864746, + "learning_rate": 3.654539763500433e-05, + "loss": 0.3897, + "step": 386400 + }, + { + "epoch": 17.369035852277833, + "grad_norm": 7.166937351226807, + "learning_rate": 3.653287657027783e-05, + "loss": 0.4025, + "step": 386600 + }, + { + "epoch": 17.378021385569234, + "grad_norm": 15.087567329406738, + "learning_rate": 3.652035182943721e-05, + "loss": 0.333, + "step": 386800 + }, + { + "epoch": 17.387006918860635, + "grad_norm": 18.905258178710938, + "learning_rate": 3.6507823416474715e-05, + "loss": 0.3743, + "step": 387000 + }, + { + "epoch": 17.387006918860635, + "eval_loss": 3.854860782623291, + "eval_runtime": 1149.6352, + "eval_samples_per_second": 8.615, + "eval_steps_per_second": 0.034, + "step": 387000 + }, + { + "epoch": 17.395992452152036, + "grad_norm": 14.928525924682617, + "learning_rate": 3.6495291335383805e-05, + "loss": 0.4021, + "step": 387200 + }, + { + "epoch": 17.404977985443438, + "grad_norm": 3.540318012237549, + "learning_rate": 3.648275559015909e-05, + "loss": 0.4007, + "step": 387400 + }, + { + "epoch": 17.413963518734835, + "grad_norm": 1.0011667013168335, + "learning_rate": 3.647021618479634e-05, + "loss": 0.3821, + "step": 387600 + }, + { + "epoch": 17.422949052026237, + "grad_norm": 9.072355270385742, + "learning_rate": 3.6457673123292504e-05, + "loss": 0.4013, + "step": 387800 + }, + { + "epoch": 17.431934585317638, + "grad_norm": 5.886098861694336, + "learning_rate": 3.644512640964569e-05, + "loss": 0.3763, + "step": 388000 + }, + { + "epoch": 17.431934585317638, + "eval_loss": 3.810971260070801, + "eval_runtime": 1130.6573, + "eval_samples_per_second": 8.76, + "eval_steps_per_second": 0.034, + "step": 388000 + }, + { + "epoch": 17.44092011860904, + "grad_norm": 7.5825514793396, + "learning_rate": 
3.643257604785518e-05, + "loss": 0.4158, + "step": 388200 + }, + { + "epoch": 17.44990565190044, + "grad_norm": 4.319643020629883, + "learning_rate": 3.642002204192142e-05, + "loss": 0.3819, + "step": 388400 + }, + { + "epoch": 17.458891185191842, + "grad_norm": 12.306256294250488, + "learning_rate": 3.6407464395845996e-05, + "loss": 0.4156, + "step": 388600 + }, + { + "epoch": 17.467876718483243, + "grad_norm": 22.988723754882812, + "learning_rate": 3.639490311363167e-05, + "loss": 0.4123, + "step": 388800 + }, + { + "epoch": 17.476862251774644, + "grad_norm": 7.2487359046936035, + "learning_rate": 3.638233819928237e-05, + "loss": 0.4258, + "step": 389000 + }, + { + "epoch": 17.476862251774644, + "eval_loss": 3.8038196563720703, + "eval_runtime": 1126.3212, + "eval_samples_per_second": 8.793, + "eval_steps_per_second": 0.035, + "step": 389000 + }, + { + "epoch": 17.485847785066042, + "grad_norm": 13.96484088897705, + "learning_rate": 3.6369769656803165e-05, + "loss": 0.3725, + "step": 389200 + }, + { + "epoch": 17.494833318357443, + "grad_norm": 6.461380958557129, + "learning_rate": 3.63571974902003e-05, + "loss": 0.4061, + "step": 389400 + }, + { + "epoch": 17.503818851648845, + "grad_norm": 8.86327075958252, + "learning_rate": 3.6344621703481146e-05, + "loss": 0.3814, + "step": 389600 + }, + { + "epoch": 17.512804384940246, + "grad_norm": 1.6969479322433472, + "learning_rate": 3.6332042300654255e-05, + "loss": 0.3937, + "step": 389800 + }, + { + "epoch": 17.521789918231647, + "grad_norm": 6.137419700622559, + "learning_rate": 3.631945928572932e-05, + "loss": 0.3711, + "step": 390000 + }, + { + "epoch": 17.521789918231647, + "eval_loss": 3.819227457046509, + "eval_runtime": 1126.304, + "eval_samples_per_second": 8.793, + "eval_steps_per_second": 0.035, + "step": 390000 + }, + { + "epoch": 17.53077545152305, + "grad_norm": 13.840421676635742, + "learning_rate": 3.6306872662717195e-05, + "loss": 0.4058, + "step": 390200 + }, + { + "epoch": 17.53976098481445, + "grad_norm": 9.404634475708008, + "learning_rate": 3.6294282435629865e-05, + "loss": 0.425, + "step": 390400 + }, + { + "epoch": 17.54874651810585, + "grad_norm": 13.545289993286133, + "learning_rate": 3.6281688608480486e-05, + "loss": 0.3879, + "step": 390600 + }, + { + "epoch": 17.55773205139725, + "grad_norm": 10.073009490966797, + "learning_rate": 3.6269091185283345e-05, + "loss": 0.4131, + "step": 390800 + }, + { + "epoch": 17.56671758468865, + "grad_norm": 4.1348676681518555, + "learning_rate": 3.6256490170053885e-05, + "loss": 0.4094, + "step": 391000 + }, + { + "epoch": 17.56671758468865, + "eval_loss": 3.8144443035125732, + "eval_runtime": 1125.7795, + "eval_samples_per_second": 8.797, + "eval_steps_per_second": 0.035, + "step": 391000 + }, + { + "epoch": 17.57570311798005, + "grad_norm": 12.360026359558105, + "learning_rate": 3.624388556680869e-05, + "loss": 0.3895, + "step": 391200 + }, + { + "epoch": 17.584688651271453, + "grad_norm": 3.9698164463043213, + "learning_rate": 3.6231277379565476e-05, + "loss": 0.4149, + "step": 391400 + }, + { + "epoch": 17.593674184562854, + "grad_norm": 13.396862030029297, + "learning_rate": 3.621866561234314e-05, + "loss": 0.3643, + "step": 391600 + }, + { + "epoch": 17.602659717854255, + "grad_norm": 5.373486518859863, + "learning_rate": 3.620605026916166e-05, + "loss": 0.4009, + "step": 391800 + }, + { + "epoch": 17.611645251145656, + "grad_norm": 5.472818374633789, + "learning_rate": 3.619343135404221e-05, + "loss": 0.401, + "step": 392000 + }, + { + "epoch": 17.611645251145656, + 
"eval_loss": 3.7937300205230713, + "eval_runtime": 1126.5045, + "eval_samples_per_second": 8.792, + "eval_steps_per_second": 0.035, + "step": 392000 + }, + { + "epoch": 17.620630784437058, + "grad_norm": 11.465763092041016, + "learning_rate": 3.6180808871007076e-05, + "loss": 0.3799, + "step": 392200 + }, + { + "epoch": 17.629616317728455, + "grad_norm": 1.5130301713943481, + "learning_rate": 3.6168182824079684e-05, + "loss": 0.3873, + "step": 392400 + }, + { + "epoch": 17.638601851019857, + "grad_norm": 4.5390143394470215, + "learning_rate": 3.61555532172846e-05, + "loss": 0.4056, + "step": 392600 + }, + { + "epoch": 17.647587384311258, + "grad_norm": 5.865408897399902, + "learning_rate": 3.6142920054647514e-05, + "loss": 0.4667, + "step": 392800 + }, + { + "epoch": 17.65657291760266, + "grad_norm": 11.054267883300781, + "learning_rate": 3.613028334019526e-05, + "loss": 0.4056, + "step": 393000 + }, + { + "epoch": 17.65657291760266, + "eval_loss": 3.8446738719940186, + "eval_runtime": 1128.0658, + "eval_samples_per_second": 8.78, + "eval_steps_per_second": 0.035, + "step": 393000 + }, + { + "epoch": 17.66555845089406, + "grad_norm": 1.73776376247406, + "learning_rate": 3.6117643077955795e-05, + "loss": 0.3956, + "step": 393200 + }, + { + "epoch": 17.674543984185462, + "grad_norm": 8.85155200958252, + "learning_rate": 3.610499927195823e-05, + "loss": 0.4032, + "step": 393400 + }, + { + "epoch": 17.683529517476863, + "grad_norm": 0.8997072577476501, + "learning_rate": 3.6092351926232784e-05, + "loss": 0.4166, + "step": 393600 + }, + { + "epoch": 17.692515050768264, + "grad_norm": 5.855953216552734, + "learning_rate": 3.6079701044810796e-05, + "loss": 0.3818, + "step": 393800 + }, + { + "epoch": 17.701500584059666, + "grad_norm": 5.543238162994385, + "learning_rate": 3.606704663172476e-05, + "loss": 0.3927, + "step": 394000 + }, + { + "epoch": 17.701500584059666, + "eval_loss": 3.8253390789031982, + "eval_runtime": 1130.3479, + "eval_samples_per_second": 8.762, + "eval_steps_per_second": 0.035, + "step": 394000 + }, + { + "epoch": 17.710486117351063, + "grad_norm": 9.299339294433594, + "learning_rate": 3.6054388691008264e-05, + "loss": 0.3598, + "step": 394200 + }, + { + "epoch": 17.719471650642465, + "grad_norm": 16.317785263061523, + "learning_rate": 3.604172722669607e-05, + "loss": 0.3629, + "step": 394400 + }, + { + "epoch": 17.728457183933866, + "grad_norm": 11.917454719543457, + "learning_rate": 3.602906224282398e-05, + "loss": 0.4213, + "step": 394600 + }, + { + "epoch": 17.737442717225267, + "grad_norm": 6.563929080963135, + "learning_rate": 3.6016393743429024e-05, + "loss": 0.3994, + "step": 394800 + }, + { + "epoch": 17.74642825051667, + "grad_norm": 8.417221069335938, + "learning_rate": 3.6003721732549254e-05, + "loss": 0.3833, + "step": 395000 + }, + { + "epoch": 17.74642825051667, + "eval_loss": 3.8368141651153564, + "eval_runtime": 1125.9952, + "eval_samples_per_second": 8.796, + "eval_steps_per_second": 0.035, + "step": 395000 + }, + { + "epoch": 17.75541378380807, + "grad_norm": 18.441783905029297, + "learning_rate": 3.59910462142239e-05, + "loss": 0.3396, + "step": 395200 + }, + { + "epoch": 17.76439931709947, + "grad_norm": 13.164015769958496, + "learning_rate": 3.59783671924933e-05, + "loss": 0.4187, + "step": 395400 + }, + { + "epoch": 17.77338485039087, + "grad_norm": 14.248663902282715, + "learning_rate": 3.59656846713989e-05, + "loss": 0.4077, + "step": 395600 + }, + { + "epoch": 17.78237038368227, + "grad_norm": 11.191965103149414, + "learning_rate": 
3.595299865498325e-05, + "loss": 0.3516, + "step": 395800 + }, + { + "epoch": 17.79135591697367, + "grad_norm": 1.773537039756775, + "learning_rate": 3.594030914729005e-05, + "loss": 0.3653, + "step": 396000 + }, + { + "epoch": 17.79135591697367, + "eval_loss": 3.8245689868927, + "eval_runtime": 1126.8022, + "eval_samples_per_second": 8.789, + "eval_steps_per_second": 0.035, + "step": 396000 + }, + { + "epoch": 17.800341450265073, + "grad_norm": 3.224982261657715, + "learning_rate": 3.592761615236407e-05, + "loss": 0.3715, + "step": 396200 + }, + { + "epoch": 17.809326983556474, + "grad_norm": 11.764269828796387, + "learning_rate": 3.591491967425123e-05, + "loss": 0.4247, + "step": 396400 + }, + { + "epoch": 17.818312516847875, + "grad_norm": 28.149105072021484, + "learning_rate": 3.5902219716998545e-05, + "loss": 0.4073, + "step": 396600 + }, + { + "epoch": 17.827298050139277, + "grad_norm": 5.350660800933838, + "learning_rate": 3.5889516284654115e-05, + "loss": 0.4157, + "step": 396800 + }, + { + "epoch": 17.836283583430678, + "grad_norm": 3.0195703506469727, + "learning_rate": 3.587680938126719e-05, + "loss": 0.4154, + "step": 397000 + }, + { + "epoch": 17.836283583430678, + "eval_loss": 3.830150842666626, + "eval_runtime": 1126.5253, + "eval_samples_per_second": 8.792, + "eval_steps_per_second": 0.035, + "step": 397000 + }, + { + "epoch": 17.84526911672208, + "grad_norm": 16.077167510986328, + "learning_rate": 3.58640990108881e-05, + "loss": 0.3934, + "step": 397200 + }, + { + "epoch": 17.854254650013477, + "grad_norm": 7.119049072265625, + "learning_rate": 3.5851385177568287e-05, + "loss": 0.3933, + "step": 397400 + }, + { + "epoch": 17.863240183304878, + "grad_norm": 4.785800933837891, + "learning_rate": 3.583866788536029e-05, + "loss": 0.4054, + "step": 397600 + }, + { + "epoch": 17.87222571659628, + "grad_norm": 15.827156066894531, + "learning_rate": 3.582594713831777e-05, + "loss": 0.3705, + "step": 397800 + }, + { + "epoch": 17.88121124988768, + "grad_norm": 8.269429206848145, + "learning_rate": 3.581322294049546e-05, + "loss": 0.3958, + "step": 398000 + }, + { + "epoch": 17.88121124988768, + "eval_loss": 3.8027560710906982, + "eval_runtime": 1224.91, + "eval_samples_per_second": 8.085, + "eval_steps_per_second": 0.032, + "step": 398000 + }, + { + "epoch": 17.890196783179082, + "grad_norm": 8.487425804138184, + "learning_rate": 3.580049529594922e-05, + "loss": 0.3931, + "step": 398200 + }, + { + "epoch": 17.899182316470483, + "grad_norm": 18.79955291748047, + "learning_rate": 3.5787764208736e-05, + "loss": 0.4494, + "step": 398400 + }, + { + "epoch": 17.908167849761885, + "grad_norm": 12.001044273376465, + "learning_rate": 3.577502968291383e-05, + "loss": 0.4309, + "step": 398600 + }, + { + "epoch": 17.917153383053286, + "grad_norm": 5.9302873611450195, + "learning_rate": 3.576229172254186e-05, + "loss": 0.415, + "step": 398800 + }, + { + "epoch": 17.926138916344684, + "grad_norm": 6.8387346267700195, + "learning_rate": 3.574955033168033e-05, + "loss": 0.392, + "step": 399000 + }, + { + "epoch": 17.926138916344684, + "eval_loss": 3.784846544265747, + "eval_runtime": 1204.623, + "eval_samples_per_second": 8.222, + "eval_steps_per_second": 0.032, + "step": 399000 + }, + { + "epoch": 17.935124449636085, + "grad_norm": 3.8658130168914795, + "learning_rate": 3.573680551439056e-05, + "loss": 0.382, + "step": 399200 + }, + { + "epoch": 17.944109982927486, + "grad_norm": 2.803126573562622, + "learning_rate": 3.572405727473498e-05, + "loss": 0.3711, + "step": 399400 + }, + { + "epoch": 
17.953095516218887, + "grad_norm": 0.6691089272499084, + "learning_rate": 3.5711305616777095e-05, + "loss": 0.3527, + "step": 399600 + }, + { + "epoch": 17.96208104951029, + "grad_norm": 5.192505836486816, + "learning_rate": 3.569855054458151e-05, + "loss": 0.4064, + "step": 399800 + }, + { + "epoch": 17.97106658280169, + "grad_norm": 10.876336097717285, + "learning_rate": 3.568579206221392e-05, + "loss": 0.4061, + "step": 400000 + }, + { + "epoch": 17.97106658280169, + "eval_loss": 3.802236557006836, + "eval_runtime": 1204.5349, + "eval_samples_per_second": 8.222, + "eval_steps_per_second": 0.032, + "step": 400000 + }, + { + "epoch": 17.98005211609309, + "grad_norm": 10.837194442749023, + "learning_rate": 3.5673030173741085e-05, + "loss": 0.3892, + "step": 400200 + }, + { + "epoch": 17.989037649384493, + "grad_norm": 19.335147857666016, + "learning_rate": 3.566026488323089e-05, + "loss": 0.4285, + "step": 400400 + }, + { + "epoch": 17.99802318267589, + "grad_norm": 7.6052470207214355, + "learning_rate": 3.5647496194752264e-05, + "loss": 0.4123, + "step": 400600 + }, + { + "epoch": 18.00700871596729, + "grad_norm": 1.3463623523712158, + "learning_rate": 3.5634724112375236e-05, + "loss": 0.3767, + "step": 400800 + }, + { + "epoch": 18.015994249258693, + "grad_norm": 6.778363227844238, + "learning_rate": 3.5621948640170944e-05, + "loss": 0.3737, + "step": 401000 + }, + { + "epoch": 18.015994249258693, + "eval_loss": 3.854170083999634, + "eval_runtime": 1204.5669, + "eval_samples_per_second": 8.222, + "eval_steps_per_second": 0.032, + "step": 401000 + }, + { + "epoch": 18.024979782550094, + "grad_norm": 6.250158309936523, + "learning_rate": 3.560916978221156e-05, + "loss": 0.3642, + "step": 401200 + }, + { + "epoch": 18.033965315841495, + "grad_norm": 12.505826950073242, + "learning_rate": 3.559638754257035e-05, + "loss": 0.3701, + "step": 401400 + }, + { + "epoch": 18.042950849132897, + "grad_norm": 18.78114891052246, + "learning_rate": 3.558360192532168e-05, + "loss": 0.3628, + "step": 401600 + }, + { + "epoch": 18.051936382424298, + "grad_norm": 2.8729214668273926, + "learning_rate": 3.557081293454097e-05, + "loss": 0.3777, + "step": 401800 + }, + { + "epoch": 18.0609219157157, + "grad_norm": 8.019610404968262, + "learning_rate": 3.555802057430471e-05, + "loss": 0.3402, + "step": 402000 + }, + { + "epoch": 18.0609219157157, + "eval_loss": 3.8658034801483154, + "eval_runtime": 1205.5991, + "eval_samples_per_second": 8.215, + "eval_steps_per_second": 0.032, + "step": 402000 + }, + { + "epoch": 18.069907449007097, + "grad_norm": 0.7817026376724243, + "learning_rate": 3.5545224848690495e-05, + "loss": 0.3799, + "step": 402200 + }, + { + "epoch": 18.0788929822985, + "grad_norm": 5.083946704864502, + "learning_rate": 3.553242576177697e-05, + "loss": 0.3577, + "step": 402400 + }, + { + "epoch": 18.0878785155899, + "grad_norm": 7.09104061126709, + "learning_rate": 3.5519623317643834e-05, + "loss": 0.3819, + "step": 402600 + }, + { + "epoch": 18.0968640488813, + "grad_norm": 8.251867294311523, + "learning_rate": 3.55068175203719e-05, + "loss": 0.3898, + "step": 402800 + }, + { + "epoch": 18.105849582172702, + "grad_norm": 29.634862899780273, + "learning_rate": 3.549400837404302e-05, + "loss": 0.3648, + "step": 403000 + }, + { + "epoch": 18.105849582172702, + "eval_loss": 3.867095947265625, + "eval_runtime": 1203.7886, + "eval_samples_per_second": 8.227, + "eval_steps_per_second": 0.032, + "step": 403000 + }, + { + "epoch": 18.114835115464103, + "grad_norm": 8.83678913116455, + "learning_rate": 
3.548119588274012e-05, + "loss": 0.3644, + "step": 403200 + }, + { + "epoch": 18.123820648755505, + "grad_norm": 3.9877867698669434, + "learning_rate": 3.5468380050547185e-05, + "loss": 0.3518, + "step": 403400 + }, + { + "epoch": 18.132806182046906, + "grad_norm": 12.110077857971191, + "learning_rate": 3.545556088154928e-05, + "loss": 0.4015, + "step": 403600 + }, + { + "epoch": 18.141791715338304, + "grad_norm": 20.395000457763672, + "learning_rate": 3.544273837983253e-05, + "loss": 0.356, + "step": 403800 + }, + { + "epoch": 18.150777248629705, + "grad_norm": 7.915891170501709, + "learning_rate": 3.5429912549484114e-05, + "loss": 0.3513, + "step": 404000 + }, + { + "epoch": 18.150777248629705, + "eval_loss": 3.825883626937866, + "eval_runtime": 1205.146, + "eval_samples_per_second": 8.218, + "eval_steps_per_second": 0.032, + "step": 404000 + }, + { + "epoch": 18.159762781921106, + "grad_norm": 2.465219736099243, + "learning_rate": 3.541708339459227e-05, + "loss": 0.3469, + "step": 404200 + }, + { + "epoch": 18.168748315212508, + "grad_norm": 16.333881378173828, + "learning_rate": 3.54042509192463e-05, + "loss": 0.3947, + "step": 404400 + }, + { + "epoch": 18.17773384850391, + "grad_norm": 6.627115249633789, + "learning_rate": 3.539141512753658e-05, + "loss": 0.4071, + "step": 404600 + }, + { + "epoch": 18.18671938179531, + "grad_norm": 9.679762840270996, + "learning_rate": 3.5378576023554524e-05, + "loss": 0.382, + "step": 404800 + }, + { + "epoch": 18.19570491508671, + "grad_norm": 4.362650394439697, + "learning_rate": 3.536573361139261e-05, + "loss": 0.3896, + "step": 405000 + }, + { + "epoch": 18.19570491508671, + "eval_loss": 3.831510543823242, + "eval_runtime": 1203.2249, + "eval_samples_per_second": 8.231, + "eval_steps_per_second": 0.032, + "step": 405000 + }, + { + "epoch": 18.204690448378113, + "grad_norm": 3.280683994293213, + "learning_rate": 3.5352887895144354e-05, + "loss": 0.3867, + "step": 405200 + }, + { + "epoch": 18.21367598166951, + "grad_norm": 25.597644805908203, + "learning_rate": 3.534003887890435e-05, + "loss": 0.3474, + "step": 405400 + }, + { + "epoch": 18.22266151496091, + "grad_norm": 15.584162712097168, + "learning_rate": 3.532718656676824e-05, + "loss": 0.377, + "step": 405600 + }, + { + "epoch": 18.231647048252313, + "grad_norm": 5.3182053565979, + "learning_rate": 3.5314330962832696e-05, + "loss": 0.3463, + "step": 405800 + }, + { + "epoch": 18.240632581543714, + "grad_norm": 3.7088468074798584, + "learning_rate": 3.5301472071195454e-05, + "loss": 0.3678, + "step": 406000 + }, + { + "epoch": 18.240632581543714, + "eval_loss": 3.8044979572296143, + "eval_runtime": 1210.8568, + "eval_samples_per_second": 8.179, + "eval_steps_per_second": 0.032, + "step": 406000 + }, + { + "epoch": 18.249618114835116, + "grad_norm": 7.514823913574219, + "learning_rate": 3.5288609895955304e-05, + "loss": 0.357, + "step": 406200 + }, + { + "epoch": 18.258603648126517, + "grad_norm": 2.4954440593719482, + "learning_rate": 3.527574444121207e-05, + "loss": 0.3982, + "step": 406400 + }, + { + "epoch": 18.267589181417918, + "grad_norm": 3.856297016143799, + "learning_rate": 3.5262875711066625e-05, + "loss": 0.3921, + "step": 406600 + }, + { + "epoch": 18.27657471470932, + "grad_norm": 3.8277928829193115, + "learning_rate": 3.525000370962089e-05, + "loss": 0.387, + "step": 406800 + }, + { + "epoch": 18.285560248000717, + "grad_norm": 1.290062665939331, + "learning_rate": 3.523712844097783e-05, + "loss": 0.3554, + "step": 407000 + }, + { + "epoch": 18.285560248000717, + "eval_loss": 
3.9154751300811768, + "eval_runtime": 1217.0508, + "eval_samples_per_second": 8.138, + "eval_steps_per_second": 0.032, + "step": 407000 + }, + { + "epoch": 18.29454578129212, + "grad_norm": 8.983039855957031, + "learning_rate": 3.522424990924145e-05, + "loss": 0.3989, + "step": 407200 + }, + { + "epoch": 18.30353131458352, + "grad_norm": 15.448911666870117, + "learning_rate": 3.5211368118516774e-05, + "loss": 0.395, + "step": 407400 + }, + { + "epoch": 18.31251684787492, + "grad_norm": 6.722110271453857, + "learning_rate": 3.51984830729099e-05, + "loss": 0.3846, + "step": 407600 + }, + { + "epoch": 18.321502381166322, + "grad_norm": 5.694580554962158, + "learning_rate": 3.5185594776527945e-05, + "loss": 0.3845, + "step": 407800 + }, + { + "epoch": 18.330487914457724, + "grad_norm": 4.475128173828125, + "learning_rate": 3.517270323347907e-05, + "loss": 0.4102, + "step": 408000 + }, + { + "epoch": 18.330487914457724, + "eval_loss": 3.8598849773406982, + "eval_runtime": 1097.2151, + "eval_samples_per_second": 9.026, + "eval_steps_per_second": 0.036, + "step": 408000 + }, + { + "epoch": 18.339473447749125, + "grad_norm": 7.8763933181762695, + "learning_rate": 3.5159808447872456e-05, + "loss": 0.3745, + "step": 408200 + }, + { + "epoch": 18.348458981040526, + "grad_norm": 35.217857360839844, + "learning_rate": 3.5146910423818324e-05, + "loss": 0.3821, + "step": 408400 + }, + { + "epoch": 18.357444514331924, + "grad_norm": 7.480992794036865, + "learning_rate": 3.513400916542793e-05, + "loss": 0.3777, + "step": 408600 + }, + { + "epoch": 18.366430047623325, + "grad_norm": 1.083188772201538, + "learning_rate": 3.5121104676813575e-05, + "loss": 0.353, + "step": 408800 + }, + { + "epoch": 18.375415580914726, + "grad_norm": 5.977663040161133, + "learning_rate": 3.510819696208857e-05, + "loss": 0.3875, + "step": 409000 + }, + { + "epoch": 18.375415580914726, + "eval_loss": 3.8312559127807617, + "eval_runtime": 1097.7017, + "eval_samples_per_second": 9.022, + "eval_steps_per_second": 0.036, + "step": 409000 + }, + { + "epoch": 18.384401114206128, + "grad_norm": 5.178797721862793, + "learning_rate": 3.509528602536725e-05, + "loss": 0.3846, + "step": 409200 + }, + { + "epoch": 18.39338664749753, + "grad_norm": 0.88429194688797, + "learning_rate": 3.5082371870764997e-05, + "loss": 0.3766, + "step": 409400 + }, + { + "epoch": 18.40237218078893, + "grad_norm": 1.1388074159622192, + "learning_rate": 3.50694545023982e-05, + "loss": 0.4182, + "step": 409600 + }, + { + "epoch": 18.41135771408033, + "grad_norm": 10.69584846496582, + "learning_rate": 3.50565339243843e-05, + "loss": 0.3962, + "step": 409800 + }, + { + "epoch": 18.420343247371733, + "grad_norm": 3.2189548015594482, + "learning_rate": 3.5043610140841716e-05, + "loss": 0.3745, + "step": 410000 + }, + { + "epoch": 18.420343247371733, + "eval_loss": 3.84757399559021, + "eval_runtime": 1096.3132, + "eval_samples_per_second": 9.034, + "eval_steps_per_second": 0.036, + "step": 410000 + }, + { + "epoch": 18.429328780663134, + "grad_norm": 4.857696056365967, + "learning_rate": 3.503068315588993e-05, + "loss": 0.3714, + "step": 410200 + }, + { + "epoch": 18.438314313954532, + "grad_norm": 22.0413875579834, + "learning_rate": 3.501775297364943e-05, + "loss": 0.3584, + "step": 410400 + }, + { + "epoch": 18.447299847245933, + "grad_norm": 12.368648529052734, + "learning_rate": 3.5004819598241725e-05, + "loss": 0.3731, + "step": 410600 + }, + { + "epoch": 18.456285380537334, + "grad_norm": 7.075397968292236, + "learning_rate": 3.4991883033789316e-05, + "loss": 
0.3521, + "step": 410800 + }, + { + "epoch": 18.465270913828736, + "grad_norm": 10.172215461730957, + "learning_rate": 3.4978943284415784e-05, + "loss": 0.3916, + "step": 411000 + }, + { + "epoch": 18.465270913828736, + "eval_loss": 3.8483147621154785, + "eval_runtime": 1094.622, + "eval_samples_per_second": 9.048, + "eval_steps_per_second": 0.036, + "step": 411000 + }, + { + "epoch": 18.474256447120137, + "grad_norm": 5.510894775390625, + "learning_rate": 3.496600035424565e-05, + "loss": 0.3889, + "step": 411200 + }, + { + "epoch": 18.483241980411538, + "grad_norm": 7.840881824493408, + "learning_rate": 3.495305424740449e-05, + "loss": 0.3941, + "step": 411400 + }, + { + "epoch": 18.49222751370294, + "grad_norm": 2.5886456966400146, + "learning_rate": 3.4940104968018904e-05, + "loss": 0.3836, + "step": 411600 + }, + { + "epoch": 18.50121304699434, + "grad_norm": 7.37034273147583, + "learning_rate": 3.4927152520216474e-05, + "loss": 0.3475, + "step": 411800 + }, + { + "epoch": 18.51019858028574, + "grad_norm": 6.969428062438965, + "learning_rate": 3.49141969081258e-05, + "loss": 0.3713, + "step": 412000 + }, + { + "epoch": 18.51019858028574, + "eval_loss": 3.88724684715271, + "eval_runtime": 1095.691, + "eval_samples_per_second": 9.039, + "eval_steps_per_second": 0.036, + "step": 412000 + }, + { + "epoch": 18.51918411357714, + "grad_norm": 10.182289123535156, + "learning_rate": 3.49012381358765e-05, + "loss": 0.3692, + "step": 412200 + }, + { + "epoch": 18.52816964686854, + "grad_norm": 11.804682731628418, + "learning_rate": 3.4888276207599194e-05, + "loss": 0.3947, + "step": 412400 + }, + { + "epoch": 18.537155180159942, + "grad_norm": 12.905986785888672, + "learning_rate": 3.48753111274255e-05, + "loss": 0.3867, + "step": 412600 + }, + { + "epoch": 18.546140713451344, + "grad_norm": 3.650761842727661, + "learning_rate": 3.4862342899488066e-05, + "loss": 0.3821, + "step": 412800 + }, + { + "epoch": 18.555126246742745, + "grad_norm": 14.769987106323242, + "learning_rate": 3.484937152792051e-05, + "loss": 0.3525, + "step": 413000 + }, + { + "epoch": 18.555126246742745, + "eval_loss": 3.798630475997925, + "eval_runtime": 1096.711, + "eval_samples_per_second": 9.031, + "eval_steps_per_second": 0.036, + "step": 413000 + }, + { + "epoch": 18.564111780034146, + "grad_norm": 12.465880393981934, + "learning_rate": 3.483639701685746e-05, + "loss": 0.3876, + "step": 413200 + }, + { + "epoch": 18.573097313325547, + "grad_norm": 19.23861312866211, + "learning_rate": 3.4823419370434574e-05, + "loss": 0.3585, + "step": 413400 + }, + { + "epoch": 18.582082846616945, + "grad_norm": 2.4888880252838135, + "learning_rate": 3.481043859278847e-05, + "loss": 0.3783, + "step": 413600 + }, + { + "epoch": 18.591068379908346, + "grad_norm": 12.582083702087402, + "learning_rate": 3.4797454688056804e-05, + "loss": 0.3861, + "step": 413800 + }, + { + "epoch": 18.600053913199748, + "grad_norm": 0.991515576839447, + "learning_rate": 3.4784467660378174e-05, + "loss": 0.4015, + "step": 414000 + }, + { + "epoch": 18.600053913199748, + "eval_loss": 3.845909833908081, + "eval_runtime": 1096.418, + "eval_samples_per_second": 9.033, + "eval_steps_per_second": 0.036, + "step": 414000 + }, + { + "epoch": 18.60903944649115, + "grad_norm": 0.9095927476882935, + "learning_rate": 3.4771477513892234e-05, + "loss": 0.357, + "step": 414200 + }, + { + "epoch": 18.61802497978255, + "grad_norm": 8.816062927246094, + "learning_rate": 3.47584842527396e-05, + "loss": 0.3994, + "step": 414400 + }, + { + "epoch": 18.62701051307395, + 
"grad_norm": 12.012443542480469, + "learning_rate": 3.4745487881061865e-05, + "loss": 0.39, + "step": 414600 + }, + { + "epoch": 18.635996046365353, + "grad_norm": 31.449888229370117, + "learning_rate": 3.473248840300165e-05, + "loss": 0.357, + "step": 414800 + }, + { + "epoch": 18.644981579656754, + "grad_norm": 4.814366817474365, + "learning_rate": 3.471948582270256e-05, + "loss": 0.3608, + "step": 415000 + }, + { + "epoch": 18.644981579656754, + "eval_loss": 3.8544886112213135, + "eval_runtime": 1097.8947, + "eval_samples_per_second": 9.021, + "eval_steps_per_second": 0.036, + "step": 415000 + }, + { + "epoch": 18.653967112948152, + "grad_norm": 3.825913429260254, + "learning_rate": 3.470648014430915e-05, + "loss": 0.3929, + "step": 415200 + }, + { + "epoch": 18.662952646239553, + "grad_norm": 12.636445045471191, + "learning_rate": 3.4693471371967014e-05, + "loss": 0.3701, + "step": 415400 + }, + { + "epoch": 18.671938179530954, + "grad_norm": 9.792268753051758, + "learning_rate": 3.4680459509822696e-05, + "loss": 0.4264, + "step": 415600 + }, + { + "epoch": 18.680923712822356, + "grad_norm": 2.876805305480957, + "learning_rate": 3.466744456202375e-05, + "loss": 0.4097, + "step": 415800 + }, + { + "epoch": 18.689909246113757, + "grad_norm": 3.836838722229004, + "learning_rate": 3.4654426532718695e-05, + "loss": 0.4236, + "step": 416000 + }, + { + "epoch": 18.689909246113757, + "eval_loss": 3.790830135345459, + "eval_runtime": 1100.7008, + "eval_samples_per_second": 8.998, + "eval_steps_per_second": 0.035, + "step": 416000 + }, + { + "epoch": 18.69889477940516, + "grad_norm": 11.140382766723633, + "learning_rate": 3.4641405426057034e-05, + "loss": 0.388, + "step": 416200 + }, + { + "epoch": 18.70788031269656, + "grad_norm": 6.716423034667969, + "learning_rate": 3.462838124618926e-05, + "loss": 0.366, + "step": 416400 + }, + { + "epoch": 18.71686584598796, + "grad_norm": 5.123216152191162, + "learning_rate": 3.461535399726685e-05, + "loss": 0.4019, + "step": 416600 + }, + { + "epoch": 18.72585137927936, + "grad_norm": 0.5618104338645935, + "learning_rate": 3.460232368344224e-05, + "loss": 0.3711, + "step": 416800 + }, + { + "epoch": 18.73483691257076, + "grad_norm": 3.904057264328003, + "learning_rate": 3.458929030886885e-05, + "loss": 0.4017, + "step": 417000 + }, + { + "epoch": 18.73483691257076, + "eval_loss": 3.819016695022583, + "eval_runtime": 1097.3118, + "eval_samples_per_second": 9.026, + "eval_steps_per_second": 0.036, + "step": 417000 + }, + { + "epoch": 18.74382244586216, + "grad_norm": 9.883956909179688, + "learning_rate": 3.457625387770109e-05, + "loss": 0.3891, + "step": 417200 + }, + { + "epoch": 18.752807979153562, + "grad_norm": 22.456649780273438, + "learning_rate": 3.456321439409432e-05, + "loss": 0.4144, + "step": 417400 + }, + { + "epoch": 18.761793512444964, + "grad_norm": 12.037010192871094, + "learning_rate": 3.455017186220491e-05, + "loss": 0.3706, + "step": 417600 + }, + { + "epoch": 18.770779045736365, + "grad_norm": 30.236738204956055, + "learning_rate": 3.4537126286190155e-05, + "loss": 0.4131, + "step": 417800 + }, + { + "epoch": 18.779764579027766, + "grad_norm": 6.0100321769714355, + "learning_rate": 3.452407767020835e-05, + "loss": 0.4224, + "step": 418000 + }, + { + "epoch": 18.779764579027766, + "eval_loss": 3.8412423133850098, + "eval_runtime": 1095.5506, + "eval_samples_per_second": 9.04, + "eval_steps_per_second": 0.036, + "step": 418000 + }, + { + "epoch": 18.788750112319168, + "grad_norm": 16.41374969482422, + "learning_rate": 
3.4511026018418765e-05, + "loss": 0.3991, + "step": 418200 + }, + { + "epoch": 18.797735645610565, + "grad_norm": 15.420040130615234, + "learning_rate": 3.4497971334981596e-05, + "loss": 0.4127, + "step": 418400 + }, + { + "epoch": 18.806721178901967, + "grad_norm": 13.536659240722656, + "learning_rate": 3.448491362405807e-05, + "loss": 0.3659, + "step": 418600 + }, + { + "epoch": 18.815706712193368, + "grad_norm": 20.171710968017578, + "learning_rate": 3.447185288981031e-05, + "loss": 0.4017, + "step": 418800 + }, + { + "epoch": 18.82469224548477, + "grad_norm": 9.69514274597168, + "learning_rate": 3.445878913640146e-05, + "loss": 0.38, + "step": 419000 + }, + { + "epoch": 18.82469224548477, + "eval_loss": 3.8335611820220947, + "eval_runtime": 1094.6714, + "eval_samples_per_second": 9.047, + "eval_steps_per_second": 0.036, + "step": 419000 + }, + { + "epoch": 18.83367777877617, + "grad_norm": 1.9153423309326172, + "learning_rate": 3.444572236799559e-05, + "loss": 0.4292, + "step": 419200 + }, + { + "epoch": 18.84266331206757, + "grad_norm": 16.780864715576172, + "learning_rate": 3.443265258875776e-05, + "loss": 0.386, + "step": 419400 + }, + { + "epoch": 18.851648845358973, + "grad_norm": 7.751341819763184, + "learning_rate": 3.4419579802853946e-05, + "loss": 0.4026, + "step": 419600 + }, + { + "epoch": 18.860634378650374, + "grad_norm": 10.850844383239746, + "learning_rate": 3.440650401445113e-05, + "loss": 0.3684, + "step": 419800 + }, + { + "epoch": 18.869619911941776, + "grad_norm": 10.96944522857666, + "learning_rate": 3.439342522771722e-05, + "loss": 0.3631, + "step": 420000 + }, + { + "epoch": 18.869619911941776, + "eval_loss": 3.8032419681549072, + "eval_runtime": 1188.8528, + "eval_samples_per_second": 8.331, + "eval_steps_per_second": 0.033, + "step": 420000 + }, + { + "epoch": 18.878605445233173, + "grad_norm": 61.311546325683594, + "learning_rate": 3.43803434468211e-05, + "loss": 0.3718, + "step": 420200 + }, + { + "epoch": 18.887590978524575, + "grad_norm": 0.1739572435617447, + "learning_rate": 3.43672586759326e-05, + "loss": 0.3735, + "step": 420400 + }, + { + "epoch": 18.896576511815976, + "grad_norm": 1.1089012622833252, + "learning_rate": 3.4354170919222484e-05, + "loss": 0.383, + "step": 420600 + }, + { + "epoch": 18.905562045107377, + "grad_norm": 3.8840813636779785, + "learning_rate": 3.43410801808625e-05, + "loss": 0.3992, + "step": 420800 + }, + { + "epoch": 18.91454757839878, + "grad_norm": 10.133760452270508, + "learning_rate": 3.432798646502533e-05, + "loss": 0.383, + "step": 421000 + }, + { + "epoch": 18.91454757839878, + "eval_loss": 3.857928514480591, + "eval_runtime": 1170.5487, + "eval_samples_per_second": 8.461, + "eval_steps_per_second": 0.033, + "step": 421000 + }, + { + "epoch": 18.92353311169018, + "grad_norm": 12.687873840332031, + "learning_rate": 3.4314889775884615e-05, + "loss": 0.3884, + "step": 421200 + }, + { + "epoch": 18.93251864498158, + "grad_norm": 3.658750534057617, + "learning_rate": 3.4301790117614906e-05, + "loss": 0.372, + "step": 421400 + }, + { + "epoch": 18.94150417827298, + "grad_norm": 24.821044921875, + "learning_rate": 3.4288687494391766e-05, + "loss": 0.398, + "step": 421600 + }, + { + "epoch": 18.95048971156438, + "grad_norm": 1.3283342123031616, + "learning_rate": 3.427558191039165e-05, + "loss": 0.3814, + "step": 421800 + }, + { + "epoch": 18.95947524485578, + "grad_norm": 4.043994426727295, + "learning_rate": 3.426247336979198e-05, + "loss": 0.383, + "step": 422000 + }, + { + "epoch": 18.95947524485578, + "eval_loss": 
3.8787529468536377, + "eval_runtime": 1167.8307, + "eval_samples_per_second": 8.481, + "eval_steps_per_second": 0.033, + "step": 422000 + }, + { + "epoch": 18.968460778147183, + "grad_norm": 10.233535766601562, + "learning_rate": 3.4249361876771106e-05, + "loss": 0.3636, + "step": 422200 + }, + { + "epoch": 18.977446311438584, + "grad_norm": 7.685864448547363, + "learning_rate": 3.423624743550833e-05, + "loss": 0.3719, + "step": 422400 + }, + { + "epoch": 18.986431844729985, + "grad_norm": 4.338862895965576, + "learning_rate": 3.422313005018389e-05, + "loss": 0.3908, + "step": 422600 + }, + { + "epoch": 18.995417378021386, + "grad_norm": 6.173080921173096, + "learning_rate": 3.421000972497897e-05, + "loss": 0.4272, + "step": 422800 + }, + { + "epoch": 19.004402911312788, + "grad_norm": 9.796375274658203, + "learning_rate": 3.419688646407569e-05, + "loss": 0.405, + "step": 423000 + }, + { + "epoch": 19.004402911312788, + "eval_loss": 3.8710274696350098, + "eval_runtime": 1174.2983, + "eval_samples_per_second": 8.434, + "eval_steps_per_second": 0.033, + "step": 423000 + }, + { + "epoch": 19.01338844460419, + "grad_norm": 16.157901763916016, + "learning_rate": 3.418376027165708e-05, + "loss": 0.3669, + "step": 423200 + }, + { + "epoch": 19.022373977895587, + "grad_norm": 6.099151134490967, + "learning_rate": 3.417063115190714e-05, + "loss": 0.3595, + "step": 423400 + }, + { + "epoch": 19.031359511186988, + "grad_norm": 18.236555099487305, + "learning_rate": 3.4157499109010786e-05, + "loss": 0.3571, + "step": 423600 + }, + { + "epoch": 19.04034504447839, + "grad_norm": 0.8889177441596985, + "learning_rate": 3.414436414715386e-05, + "loss": 0.3457, + "step": 423800 + }, + { + "epoch": 19.04933057776979, + "grad_norm": 10.380514144897461, + "learning_rate": 3.413122627052316e-05, + "loss": 0.3385, + "step": 424000 + }, + { + "epoch": 19.04933057776979, + "eval_loss": 3.8625006675720215, + "eval_runtime": 1174.9973, + "eval_samples_per_second": 8.429, + "eval_steps_per_second": 0.033, + "step": 424000 + }, + { + "epoch": 19.058316111061192, + "grad_norm": 1.4684069156646729, + "learning_rate": 3.4118085483306375e-05, + "loss": 0.3354, + "step": 424200 + }, + { + "epoch": 19.067301644352593, + "grad_norm": 7.4322075843811035, + "learning_rate": 3.4104941789692156e-05, + "loss": 0.3579, + "step": 424400 + }, + { + "epoch": 19.076287177643994, + "grad_norm": 10.02495002746582, + "learning_rate": 3.409179519387006e-05, + "loss": 0.3629, + "step": 424600 + }, + { + "epoch": 19.085272710935396, + "grad_norm": 4.068674564361572, + "learning_rate": 3.4078645700030575e-05, + "loss": 0.3463, + "step": 424800 + }, + { + "epoch": 19.094258244226793, + "grad_norm": 0.7052398920059204, + "learning_rate": 3.406549331236511e-05, + "loss": 0.393, + "step": 425000 + }, + { + "epoch": 19.094258244226793, + "eval_loss": 3.8474578857421875, + "eval_runtime": 1176.9074, + "eval_samples_per_second": 8.415, + "eval_steps_per_second": 0.033, + "step": 425000 + }, + { + "epoch": 19.103243777518195, + "grad_norm": 9.41407585144043, + "learning_rate": 3.405233803506602e-05, + "loss": 0.3732, + "step": 425200 + }, + { + "epoch": 19.112229310809596, + "grad_norm": 9.691625595092773, + "learning_rate": 3.403917987232653e-05, + "loss": 0.3649, + "step": 425400 + }, + { + "epoch": 19.121214844100997, + "grad_norm": 3.508151054382324, + "learning_rate": 3.4026018828340846e-05, + "loss": 0.3801, + "step": 425600 + }, + { + "epoch": 19.1302003773924, + "grad_norm": 10.020624160766602, + "learning_rate": 3.401285490730404e-05, + 
"loss": 0.3543, + "step": 425800 + }, + { + "epoch": 19.1391859106838, + "grad_norm": 32.40066909790039, + "learning_rate": 3.399968811341212e-05, + "loss": 0.3514, + "step": 426000 + }, + { + "epoch": 19.1391859106838, + "eval_loss": 3.8292617797851562, + "eval_runtime": 1170.371, + "eval_samples_per_second": 8.462, + "eval_steps_per_second": 0.033, + "step": 426000 + }, + { + "epoch": 19.1481714439752, + "grad_norm": 16.520408630371094, + "learning_rate": 3.398651845086203e-05, + "loss": 0.3583, + "step": 426200 + }, + { + "epoch": 19.157156977266602, + "grad_norm": 9.090585708618164, + "learning_rate": 3.3973345923851604e-05, + "loss": 0.3934, + "step": 426400 + }, + { + "epoch": 19.166142510558, + "grad_norm": 11.521536827087402, + "learning_rate": 3.39601705365796e-05, + "loss": 0.351, + "step": 426600 + }, + { + "epoch": 19.1751280438494, + "grad_norm": 8.667354583740234, + "learning_rate": 3.394699229324567e-05, + "loss": 0.3621, + "step": 426800 + }, + { + "epoch": 19.184113577140803, + "grad_norm": 28.831558227539062, + "learning_rate": 3.3933811198050405e-05, + "loss": 0.3502, + "step": 427000 + }, + { + "epoch": 19.184113577140803, + "eval_loss": 3.881221055984497, + "eval_runtime": 1173.9735, + "eval_samples_per_second": 8.436, + "eval_steps_per_second": 0.033, + "step": 427000 + }, + { + "epoch": 19.193099110432204, + "grad_norm": 8.013230323791504, + "learning_rate": 3.392062725519529e-05, + "loss": 0.3609, + "step": 427200 + }, + { + "epoch": 19.202084643723605, + "grad_norm": 11.29799747467041, + "learning_rate": 3.390744046888271e-05, + "loss": 0.4193, + "step": 427400 + }, + { + "epoch": 19.211070177015007, + "grad_norm": 3.9097185134887695, + "learning_rate": 3.389425084331596e-05, + "loss": 0.3746, + "step": 427600 + }, + { + "epoch": 19.220055710306408, + "grad_norm": 11.717888832092285, + "learning_rate": 3.388105838269925e-05, + "loss": 0.3999, + "step": 427800 + }, + { + "epoch": 19.22904124359781, + "grad_norm": 12.494455337524414, + "learning_rate": 3.386786309123769e-05, + "loss": 0.3875, + "step": 428000 + }, + { + "epoch": 19.22904124359781, + "eval_loss": 3.8519411087036133, + "eval_runtime": 1173.1781, + "eval_samples_per_second": 8.442, + "eval_steps_per_second": 0.033, + "step": 428000 + }, + { + "epoch": 19.238026776889207, + "grad_norm": 3.4043800830841064, + "learning_rate": 3.38546649731373e-05, + "loss": 0.3683, + "step": 428200 + }, + { + "epoch": 19.247012310180608, + "grad_norm": 12.774907112121582, + "learning_rate": 3.3841464032604974e-05, + "loss": 0.3805, + "step": 428400 + }, + { + "epoch": 19.25599784347201, + "grad_norm": 7.213978290557861, + "learning_rate": 3.382826027384853e-05, + "loss": 0.3526, + "step": 428600 + }, + { + "epoch": 19.26498337676341, + "grad_norm": 8.512626647949219, + "learning_rate": 3.3815053701076674e-05, + "loss": 0.3925, + "step": 428800 + }, + { + "epoch": 19.273968910054812, + "grad_norm": 3.8123066425323486, + "learning_rate": 3.3801844318499024e-05, + "loss": 0.3349, + "step": 429000 + }, + { + "epoch": 19.273968910054812, + "eval_loss": 3.8657233715057373, + "eval_runtime": 1171.8186, + "eval_samples_per_second": 8.452, + "eval_steps_per_second": 0.033, + "step": 429000 + }, + { + "epoch": 19.282954443346213, + "grad_norm": 1.9035091400146484, + "learning_rate": 3.378863213032607e-05, + "loss": 0.3481, + "step": 429200 + }, + { + "epoch": 19.291939976637615, + "grad_norm": 14.608076095581055, + "learning_rate": 3.37754171407692e-05, + "loss": 0.3859, + "step": 429400 + }, + { + "epoch": 19.300925509929016, + 
"grad_norm": 6.863801002502441, + "learning_rate": 3.376219935404072e-05, + "loss": 0.3843, + "step": 429600 + }, + { + "epoch": 19.309911043220414, + "grad_norm": 11.920736312866211, + "learning_rate": 3.374897877435381e-05, + "loss": 0.3549, + "step": 429800 + }, + { + "epoch": 19.318896576511815, + "grad_norm": 4.002532482147217, + "learning_rate": 3.373575540592253e-05, + "loss": 0.4075, + "step": 430000 + }, + { + "epoch": 19.318896576511815, + "eval_loss": 3.8724846839904785, + "eval_runtime": 1110.6742, + "eval_samples_per_second": 8.917, + "eval_steps_per_second": 0.035, + "step": 430000 + }, + { + "epoch": 19.327882109803216, + "grad_norm": 19.618444442749023, + "learning_rate": 3.372252925296186e-05, + "loss": 0.3922, + "step": 430200 + }, + { + "epoch": 19.336867643094617, + "grad_norm": 3.7305030822753906, + "learning_rate": 3.370930031968762e-05, + "loss": 0.3698, + "step": 430400 + }, + { + "epoch": 19.34585317638602, + "grad_norm": 4.330793380737305, + "learning_rate": 3.3696068610316556e-05, + "loss": 0.3633, + "step": 430600 + }, + { + "epoch": 19.35483870967742, + "grad_norm": 0.21204280853271484, + "learning_rate": 3.368283412906629e-05, + "loss": 0.3499, + "step": 430800 + }, + { + "epoch": 19.36382424296882, + "grad_norm": 6.117523193359375, + "learning_rate": 3.366959688015531e-05, + "loss": 0.3454, + "step": 431000 + }, + { + "epoch": 19.36382424296882, + "eval_loss": 3.8316211700439453, + "eval_runtime": 1087.1061, + "eval_samples_per_second": 9.11, + "eval_steps_per_second": 0.036, + "step": 431000 + }, + { + "epoch": 19.372809776260222, + "grad_norm": 3.591719627380371, + "learning_rate": 3.365635686780303e-05, + "loss": 0.3373, + "step": 431200 + }, + { + "epoch": 19.38179530955162, + "grad_norm": 8.026259422302246, + "learning_rate": 3.364311409622969e-05, + "loss": 0.3859, + "step": 431400 + }, + { + "epoch": 19.39078084284302, + "grad_norm": 4.9064836502075195, + "learning_rate": 3.362986856965644e-05, + "loss": 0.3662, + "step": 431600 + }, + { + "epoch": 19.399766376134423, + "grad_norm": 2.1227197647094727, + "learning_rate": 3.3616620292305304e-05, + "loss": 0.345, + "step": 431800 + }, + { + "epoch": 19.408751909425824, + "grad_norm": 14.224973678588867, + "learning_rate": 3.3603369268399174e-05, + "loss": 0.398, + "step": 432000 + }, + { + "epoch": 19.408751909425824, + "eval_loss": 3.853020191192627, + "eval_runtime": 1079.4522, + "eval_samples_per_second": 9.175, + "eval_steps_per_second": 0.036, + "step": 432000 + }, + { + "epoch": 19.417737442717225, + "grad_norm": 8.285384178161621, + "learning_rate": 3.359011550216184e-05, + "loss": 0.3661, + "step": 432200 + }, + { + "epoch": 19.426722976008627, + "grad_norm": 8.617288589477539, + "learning_rate": 3.3576858997817936e-05, + "loss": 0.3613, + "step": 432400 + }, + { + "epoch": 19.435708509300028, + "grad_norm": 3.534817934036255, + "learning_rate": 3.3563599759593007e-05, + "loss": 0.3901, + "step": 432600 + }, + { + "epoch": 19.44469404259143, + "grad_norm": 0.19126541912555695, + "learning_rate": 3.3550337791713426e-05, + "loss": 0.3549, + "step": 432800 + }, + { + "epoch": 19.453679575882827, + "grad_norm": 10.775198936462402, + "learning_rate": 3.353707309840646e-05, + "loss": 0.3864, + "step": 433000 + }, + { + "epoch": 19.453679575882827, + "eval_loss": 3.870607376098633, + "eval_runtime": 1102.0643, + "eval_samples_per_second": 8.987, + "eval_steps_per_second": 0.035, + "step": 433000 + }, + { + "epoch": 19.462665109174228, + "grad_norm": 10.87759780883789, + "learning_rate": 
3.352380568390024e-05, + "loss": 0.3797, + "step": 433200 + }, + { + "epoch": 19.47165064246563, + "grad_norm": 8.955763816833496, + "learning_rate": 3.351053555242376e-05, + "loss": 0.3572, + "step": 433400 + }, + { + "epoch": 19.48063617575703, + "grad_norm": 11.83018684387207, + "learning_rate": 3.349726270820691e-05, + "loss": 0.3859, + "step": 433600 + }, + { + "epoch": 19.489621709048432, + "grad_norm": 29.993505477905273, + "learning_rate": 3.3483987155480396e-05, + "loss": 0.4068, + "step": 433800 + }, + { + "epoch": 19.498607242339833, + "grad_norm": 7.300692081451416, + "learning_rate": 3.347070889847582e-05, + "loss": 0.3916, + "step": 434000 + }, + { + "epoch": 19.498607242339833, + "eval_loss": 3.8529105186462402, + "eval_runtime": 1098.1299, + "eval_samples_per_second": 9.019, + "eval_steps_per_second": 0.036, + "step": 434000 + }, + { + "epoch": 19.507592775631235, + "grad_norm": 21.306541442871094, + "learning_rate": 3.345742794142564e-05, + "loss": 0.3635, + "step": 434200 + }, + { + "epoch": 19.516578308922636, + "grad_norm": 0.5357521772384644, + "learning_rate": 3.3444144288563174e-05, + "loss": 0.3509, + "step": 434400 + }, + { + "epoch": 19.525563842214034, + "grad_norm": 10.118279457092285, + "learning_rate": 3.343085794412258e-05, + "loss": 0.3619, + "step": 434600 + }, + { + "epoch": 19.534549375505435, + "grad_norm": 8.305274963378906, + "learning_rate": 3.341756891233891e-05, + "loss": 0.3737, + "step": 434800 + }, + { + "epoch": 19.543534908796836, + "grad_norm": 0.6471884846687317, + "learning_rate": 3.3404277197448054e-05, + "loss": 0.3445, + "step": 435000 + }, + { + "epoch": 19.543534908796836, + "eval_loss": 3.916043281555176, + "eval_runtime": 1098.0537, + "eval_samples_per_second": 9.02, + "eval_steps_per_second": 0.036, + "step": 435000 + }, + { + "epoch": 19.552520442088237, + "grad_norm": 9.640978813171387, + "learning_rate": 3.339098280368675e-05, + "loss": 0.3829, + "step": 435200 + }, + { + "epoch": 19.56150597537964, + "grad_norm": 28.039609909057617, + "learning_rate": 3.33776857352926e-05, + "loss": 0.403, + "step": 435400 + }, + { + "epoch": 19.57049150867104, + "grad_norm": 1.782164216041565, + "learning_rate": 3.3364385996504055e-05, + "loss": 0.3996, + "step": 435600 + }, + { + "epoch": 19.57947704196244, + "grad_norm": 15.381430625915527, + "learning_rate": 3.335108359156042e-05, + "loss": 0.358, + "step": 435800 + }, + { + "epoch": 19.588462575253843, + "grad_norm": 6.020942211151123, + "learning_rate": 3.3337778524701835e-05, + "loss": 0.3816, + "step": 436000 + }, + { + "epoch": 19.588462575253843, + "eval_loss": 3.842747449874878, + "eval_runtime": 1082.6766, + "eval_samples_per_second": 9.148, + "eval_steps_per_second": 0.036, + "step": 436000 + }, + { + "epoch": 19.597448108545244, + "grad_norm": 15.338593482971191, + "learning_rate": 3.332447080016932e-05, + "loss": 0.3869, + "step": 436200 + }, + { + "epoch": 19.60643364183664, + "grad_norm": 11.474835395812988, + "learning_rate": 3.3311160422204715e-05, + "loss": 0.3966, + "step": 436400 + }, + { + "epoch": 19.615419175128043, + "grad_norm": 2.0930511951446533, + "learning_rate": 3.329784739505072e-05, + "loss": 0.3639, + "step": 436600 + }, + { + "epoch": 19.624404708419444, + "grad_norm": 3.015812635421753, + "learning_rate": 3.3284531722950855e-05, + "loss": 0.3951, + "step": 436800 + }, + { + "epoch": 19.633390241710845, + "grad_norm": 6.570770740509033, + "learning_rate": 3.3271213410149524e-05, + "loss": 0.3735, + "step": 437000 + }, + { + "epoch": 19.633390241710845, + 
"eval_loss": 3.8144209384918213, + "eval_runtime": 1090.0308, + "eval_samples_per_second": 9.086, + "eval_steps_per_second": 0.036, + "step": 437000 + }, + { + "epoch": 19.642375775002247, + "grad_norm": 3.2332072257995605, + "learning_rate": 3.325789246089195e-05, + "loss": 0.3631, + "step": 437200 + }, + { + "epoch": 19.651361308293648, + "grad_norm": 3.6440892219543457, + "learning_rate": 3.324456887942417e-05, + "loss": 0.3675, + "step": 437400 + }, + { + "epoch": 19.66034684158505, + "grad_norm": 11.325727462768555, + "learning_rate": 3.323124266999312e-05, + "loss": 0.3748, + "step": 437600 + }, + { + "epoch": 19.66933237487645, + "grad_norm": 1.8451133966445923, + "learning_rate": 3.3217913836846524e-05, + "loss": 0.3727, + "step": 437800 + }, + { + "epoch": 19.67831790816785, + "grad_norm": 6.25849723815918, + "learning_rate": 3.320458238423295e-05, + "loss": 0.4164, + "step": 438000 + }, + { + "epoch": 19.67831790816785, + "eval_loss": 3.8024802207946777, + "eval_runtime": 1094.4447, + "eval_samples_per_second": 9.049, + "eval_steps_per_second": 0.036, + "step": 438000 + }, + { + "epoch": 19.68730344145925, + "grad_norm": 22.77155113220215, + "learning_rate": 3.319124831640183e-05, + "loss": 0.3534, + "step": 438200 + }, + { + "epoch": 19.69628897475065, + "grad_norm": 9.079693794250488, + "learning_rate": 3.31779116376034e-05, + "loss": 0.3323, + "step": 438400 + }, + { + "epoch": 19.705274508042052, + "grad_norm": 5.9739813804626465, + "learning_rate": 3.316457235208873e-05, + "loss": 0.3551, + "step": 438600 + }, + { + "epoch": 19.714260041333453, + "grad_norm": 7.636072635650635, + "learning_rate": 3.315123046410974e-05, + "loss": 0.3599, + "step": 438800 + }, + { + "epoch": 19.723245574624855, + "grad_norm": 8.846769332885742, + "learning_rate": 3.313788597791917e-05, + "loss": 0.3778, + "step": 439000 + }, + { + "epoch": 19.723245574624855, + "eval_loss": 3.8162496089935303, + "eval_runtime": 1105.6042, + "eval_samples_per_second": 8.958, + "eval_steps_per_second": 0.035, + "step": 439000 + }, + { + "epoch": 19.732231107916256, + "grad_norm": 5.736910343170166, + "learning_rate": 3.312453889777057e-05, + "loss": 0.3947, + "step": 439200 + }, + { + "epoch": 19.741216641207657, + "grad_norm": 13.45654582977295, + "learning_rate": 3.311118922791835e-05, + "loss": 0.3551, + "step": 439400 + }, + { + "epoch": 19.750202174499055, + "grad_norm": 2.0433974266052246, + "learning_rate": 3.309783697261771e-05, + "loss": 0.3922, + "step": 439600 + }, + { + "epoch": 19.759187707790456, + "grad_norm": 7.121521949768066, + "learning_rate": 3.3084482136124716e-05, + "loss": 0.3869, + "step": 439800 + }, + { + "epoch": 19.768173241081858, + "grad_norm": 0.8535615801811218, + "learning_rate": 3.3071124722696224e-05, + "loss": 0.401, + "step": 440000 + }, + { + "epoch": 19.768173241081858, + "eval_loss": 3.806692361831665, + "eval_runtime": 1098.742, + "eval_samples_per_second": 9.014, + "eval_steps_per_second": 0.035, + "step": 440000 + }, + { + "epoch": 19.77715877437326, + "grad_norm": 13.158157348632812, + "learning_rate": 3.305776473658991e-05, + "loss": 0.3573, + "step": 440200 + }, + { + "epoch": 19.78614430766466, + "grad_norm": 10.366994857788086, + "learning_rate": 3.304440218206429e-05, + "loss": 0.3676, + "step": 440400 + }, + { + "epoch": 19.79512984095606, + "grad_norm": 11.056921005249023, + "learning_rate": 3.3031037063378695e-05, + "loss": 0.3905, + "step": 440600 + }, + { + "epoch": 19.804115374247463, + "grad_norm": 3.31510066986084, + "learning_rate": 3.301766938479325e-05, 
+ "loss": 0.3789, + "step": 440800 + }, + { + "epoch": 19.813100907538864, + "grad_norm": 0.25016453862190247, + "learning_rate": 3.300429915056894e-05, + "loss": 0.35, + "step": 441000 + }, + { + "epoch": 19.813100907538864, + "eval_loss": 3.828049421310425, + "eval_runtime": 1104.6838, + "eval_samples_per_second": 8.965, + "eval_steps_per_second": 0.035, + "step": 441000 + }, + { + "epoch": 19.82208644083026, + "grad_norm": 5.278088569641113, + "learning_rate": 3.299092636496751e-05, + "loss": 0.372, + "step": 441200 + }, + { + "epoch": 19.831071974121663, + "grad_norm": 7.003445625305176, + "learning_rate": 3.297755103225157e-05, + "loss": 0.3633, + "step": 441400 + }, + { + "epoch": 19.840057507413064, + "grad_norm": 18.454580307006836, + "learning_rate": 3.296417315668451e-05, + "loss": 0.3645, + "step": 441600 + }, + { + "epoch": 19.849043040704466, + "grad_norm": 6.675582408905029, + "learning_rate": 3.2950792742530536e-05, + "loss": 0.3794, + "step": 441800 + }, + { + "epoch": 19.858028573995867, + "grad_norm": 3.7882144451141357, + "learning_rate": 3.293740979405467e-05, + "loss": 0.3936, + "step": 442000 + }, + { + "epoch": 19.858028573995867, + "eval_loss": 3.856177806854248, + "eval_runtime": 1169.3786, + "eval_samples_per_second": 8.469, + "eval_steps_per_second": 0.033, + "step": 442000 + }, + { + "epoch": 19.867014107287268, + "grad_norm": 2.224478006362915, + "learning_rate": 3.292402431552273e-05, + "loss": 0.3826, + "step": 442200 + }, + { + "epoch": 19.87599964057867, + "grad_norm": 1.1260976791381836, + "learning_rate": 3.291063631120137e-05, + "loss": 0.367, + "step": 442400 + }, + { + "epoch": 19.88498517387007, + "grad_norm": 7.941216468811035, + "learning_rate": 3.2897245785357995e-05, + "loss": 0.4042, + "step": 442600 + }, + { + "epoch": 19.89397070716147, + "grad_norm": 8.846776008605957, + "learning_rate": 3.288385274226088e-05, + "loss": 0.3933, + "step": 442800 + }, + { + "epoch": 19.90295624045287, + "grad_norm": 16.292428970336914, + "learning_rate": 3.287045718617904e-05, + "loss": 0.3749, + "step": 443000 + }, + { + "epoch": 19.90295624045287, + "eval_loss": 3.854950428009033, + "eval_runtime": 1159.3263, + "eval_samples_per_second": 8.543, + "eval_steps_per_second": 0.034, + "step": 443000 + }, + { + "epoch": 19.91194177374427, + "grad_norm": 12.939181327819824, + "learning_rate": 3.285705912138234e-05, + "loss": 0.3701, + "step": 443200 + }, + { + "epoch": 19.920927307035672, + "grad_norm": 3.3179798126220703, + "learning_rate": 3.284365855214141e-05, + "loss": 0.427, + "step": 443400 + }, + { + "epoch": 19.929912840327074, + "grad_norm": 4.160244941711426, + "learning_rate": 3.283025548272771e-05, + "loss": 0.3636, + "step": 443600 + }, + { + "epoch": 19.938898373618475, + "grad_norm": 1.0800896883010864, + "learning_rate": 3.281684991741347e-05, + "loss": 0.4054, + "step": 443800 + }, + { + "epoch": 19.947883906909876, + "grad_norm": 10.361804962158203, + "learning_rate": 3.2803441860471725e-05, + "loss": 0.4003, + "step": 444000 + }, + { + "epoch": 19.947883906909876, + "eval_loss": 3.795114517211914, + "eval_runtime": 1157.2871, + "eval_samples_per_second": 8.558, + "eval_steps_per_second": 0.034, + "step": 444000 + }, + { + "epoch": 19.956869440201277, + "grad_norm": 2.5146071910858154, + "learning_rate": 3.27900313161763e-05, + "loss": 0.3784, + "step": 444200 + }, + { + "epoch": 19.965854973492675, + "grad_norm": 2.567941904067993, + "learning_rate": 3.277661828880182e-05, + "loss": 0.3757, + "step": 444400 + }, + { + "epoch": 19.974840506784076, 
+ "grad_norm": 7.472506046295166, + "learning_rate": 3.276320278262371e-05, + "loss": 0.383, + "step": 444600 + }, + { + "epoch": 19.983826040075478, + "grad_norm": 1.7942224740982056, + "learning_rate": 3.2749784801918155e-05, + "loss": 0.3547, + "step": 444800 + }, + { + "epoch": 19.99281157336688, + "grad_norm": 12.670038223266602, + "learning_rate": 3.273636435096216e-05, + "loss": 0.4145, + "step": 445000 + }, + { + "epoch": 19.99281157336688, + "eval_loss": 3.7545852661132812, + "eval_runtime": 1143.5493, + "eval_samples_per_second": 8.661, + "eval_steps_per_second": 0.034, + "step": 445000 + }, + { + "epoch": 20.00179710665828, + "grad_norm": 0.7427432537078857, + "learning_rate": 3.27229414340335e-05, + "loss": 0.3815, + "step": 445200 + }, + { + "epoch": 20.01078263994968, + "grad_norm": 2.870213270187378, + "learning_rate": 3.270951605541075e-05, + "loss": 0.3358, + "step": 445400 + }, + { + "epoch": 20.019768173241083, + "grad_norm": 7.560419082641602, + "learning_rate": 3.269608821937325e-05, + "loss": 0.3451, + "step": 445600 + }, + { + "epoch": 20.028753706532484, + "grad_norm": 6.4001078605651855, + "learning_rate": 3.268265793020114e-05, + "loss": 0.3516, + "step": 445800 + }, + { + "epoch": 20.037739239823882, + "grad_norm": 21.972902297973633, + "learning_rate": 3.2669225192175334e-05, + "loss": 0.3828, + "step": 446000 + }, + { + "epoch": 20.037739239823882, + "eval_loss": 3.8768162727355957, + "eval_runtime": 1147.0252, + "eval_samples_per_second": 8.635, + "eval_steps_per_second": 0.034, + "step": 446000 + }, + { + "epoch": 20.046724773115283, + "grad_norm": 13.854667663574219, + "learning_rate": 3.265579000957753e-05, + "loss": 0.3745, + "step": 446200 + }, + { + "epoch": 20.055710306406684, + "grad_norm": 1.945226788520813, + "learning_rate": 3.26423523866902e-05, + "loss": 0.3407, + "step": 446400 + }, + { + "epoch": 20.064695839698086, + "grad_norm": 2.497396469116211, + "learning_rate": 3.26289123277966e-05, + "loss": 0.3409, + "step": 446600 + }, + { + "epoch": 20.073681372989487, + "grad_norm": 17.679908752441406, + "learning_rate": 3.261546983718077e-05, + "loss": 0.3555, + "step": 446800 + }, + { + "epoch": 20.08266690628089, + "grad_norm": 12.340278625488281, + "learning_rate": 3.2602024919127495e-05, + "loss": 0.3559, + "step": 447000 + }, + { + "epoch": 20.08266690628089, + "eval_loss": 3.868159532546997, + "eval_runtime": 1144.6119, + "eval_samples_per_second": 8.653, + "eval_steps_per_second": 0.034, + "step": 447000 + }, + { + "epoch": 20.09165243957229, + "grad_norm": 7.965939521789551, + "learning_rate": 3.2588577577922366e-05, + "loss": 0.3499, + "step": 447200 + }, + { + "epoch": 20.10063797286369, + "grad_norm": 1.9072184562683105, + "learning_rate": 3.2575127817851734e-05, + "loss": 0.3428, + "step": 447400 + }, + { + "epoch": 20.10962350615509, + "grad_norm": 6.992972373962402, + "learning_rate": 3.256167564320272e-05, + "loss": 0.3544, + "step": 447600 + }, + { + "epoch": 20.11860903944649, + "grad_norm": 5.526668548583984, + "learning_rate": 3.2548221058263214e-05, + "loss": 0.3596, + "step": 447800 + }, + { + "epoch": 20.12759457273789, + "grad_norm": 8.724543571472168, + "learning_rate": 3.2534764067321874e-05, + "loss": 0.3359, + "step": 448000 + }, + { + "epoch": 20.12759457273789, + "eval_loss": 3.878002882003784, + "eval_runtime": 1143.5931, + "eval_samples_per_second": 8.66, + "eval_steps_per_second": 0.034, + "step": 448000 + }, + { + "epoch": 20.136580106029292, + "grad_norm": 5.3289361000061035, + "learning_rate": 
3.252130467466814e-05, + "loss": 0.3555, + "step": 448200 + }, + { + "epoch": 20.145565639320694, + "grad_norm": 2.90199875831604, + "learning_rate": 3.25078428845922e-05, + "loss": 0.3167, + "step": 448400 + }, + { + "epoch": 20.154551172612095, + "grad_norm": 4.369307041168213, + "learning_rate": 3.2494378701385e-05, + "loss": 0.3423, + "step": 448600 + }, + { + "epoch": 20.163536705903496, + "grad_norm": 6.077184677124023, + "learning_rate": 3.248091212933827e-05, + "loss": 0.3617, + "step": 448800 + }, + { + "epoch": 20.172522239194898, + "grad_norm": 4.385313034057617, + "learning_rate": 3.246744317274449e-05, + "loss": 0.3382, + "step": 449000 + }, + { + "epoch": 20.172522239194898, + "eval_loss": 3.871030807495117, + "eval_runtime": 1143.6866, + "eval_samples_per_second": 8.66, + "eval_steps_per_second": 0.034, + "step": 449000 + }, + { + "epoch": 20.1815077724863, + "grad_norm": 4.845536708831787, + "learning_rate": 3.24539718358969e-05, + "loss": 0.3544, + "step": 449200 + }, + { + "epoch": 20.190493305777697, + "grad_norm": 9.48888111114502, + "learning_rate": 3.2440498123089496e-05, + "loss": 0.3651, + "step": 449400 + }, + { + "epoch": 20.199478839069098, + "grad_norm": 16.708328247070312, + "learning_rate": 3.242702203861704e-05, + "loss": 0.3364, + "step": 449600 + }, + { + "epoch": 20.2084643723605, + "grad_norm": 31.345827102661133, + "learning_rate": 3.241354358677505e-05, + "loss": 0.3687, + "step": 449800 + }, + { + "epoch": 20.2174499056519, + "grad_norm": 6.827626705169678, + "learning_rate": 3.240006277185978e-05, + "loss": 0.3804, + "step": 450000 + }, + { + "epoch": 20.2174499056519, + "eval_loss": 3.9251058101654053, + "eval_runtime": 1154.8423, + "eval_samples_per_second": 8.576, + "eval_steps_per_second": 0.034, + "step": 450000 + }, + { + "epoch": 20.2264354389433, + "grad_norm": 6.233980178833008, + "learning_rate": 3.2386579598168266e-05, + "loss": 0.3687, + "step": 450200 + }, + { + "epoch": 20.235420972234703, + "grad_norm": 6.345924377441406, + "learning_rate": 3.237309406999827e-05, + "loss": 0.3432, + "step": 450400 + }, + { + "epoch": 20.244406505526104, + "grad_norm": 1.4343754053115845, + "learning_rate": 3.235960619164832e-05, + "loss": 0.3801, + "step": 450600 + }, + { + "epoch": 20.253392038817505, + "grad_norm": 17.45358657836914, + "learning_rate": 3.234611596741769e-05, + "loss": 0.365, + "step": 450800 + }, + { + "epoch": 20.262377572108903, + "grad_norm": 16.016883850097656, + "learning_rate": 3.23326234016064e-05, + "loss": 0.3624, + "step": 451000 + }, + { + "epoch": 20.262377572108903, + "eval_loss": 3.8094112873077393, + "eval_runtime": 1142.5451, + "eval_samples_per_second": 8.668, + "eval_steps_per_second": 0.034, + "step": 451000 + }, + { + "epoch": 20.271363105400305, + "grad_norm": 17.484983444213867, + "learning_rate": 3.2319128498515214e-05, + "loss": 0.3379, + "step": 451200 + }, + { + "epoch": 20.280348638691706, + "grad_norm": 17.760513305664062, + "learning_rate": 3.230563126244564e-05, + "loss": 0.371, + "step": 451400 + }, + { + "epoch": 20.289334171983107, + "grad_norm": 6.531546592712402, + "learning_rate": 3.229213169769995e-05, + "loss": 0.3737, + "step": 451600 + }, + { + "epoch": 20.29831970527451, + "grad_norm": 10.28607177734375, + "learning_rate": 3.227862980858112e-05, + "loss": 0.3628, + "step": 451800 + }, + { + "epoch": 20.30730523856591, + "grad_norm": 5.768312454223633, + "learning_rate": 3.22651255993929e-05, + "loss": 0.377, + "step": 452000 + }, + { + "epoch": 20.30730523856591, + "eval_loss": 
3.835094690322876, + "eval_runtime": 1150.0337, + "eval_samples_per_second": 8.612, + "eval_steps_per_second": 0.034, + "step": 452000 + }, + { + "epoch": 20.31629077185731, + "grad_norm": 9.820401191711426, + "learning_rate": 3.2251619074439776e-05, + "loss": 0.3633, + "step": 452200 + }, + { + "epoch": 20.325276305148712, + "grad_norm": 9.445414543151855, + "learning_rate": 3.2238110238026944e-05, + "loss": 0.3547, + "step": 452400 + }, + { + "epoch": 20.33426183844011, + "grad_norm": 5.395224571228027, + "learning_rate": 3.2224599094460376e-05, + "loss": 0.3578, + "step": 452600 + }, + { + "epoch": 20.34324737173151, + "grad_norm": 12.77868938446045, + "learning_rate": 3.221108564804675e-05, + "loss": 0.3832, + "step": 452800 + }, + { + "epoch": 20.352232905022912, + "grad_norm": 5.215237617492676, + "learning_rate": 3.219756990309349e-05, + "loss": 0.3757, + "step": 453000 + }, + { + "epoch": 20.352232905022912, + "eval_loss": 3.832378625869751, + "eval_runtime": 1145.081, + "eval_samples_per_second": 8.649, + "eval_steps_per_second": 0.034, + "step": 453000 + }, + { + "epoch": 20.361218438314314, + "grad_norm": 8.17989730834961, + "learning_rate": 3.2184051863908746e-05, + "loss": 0.3425, + "step": 453200 + }, + { + "epoch": 20.370203971605715, + "grad_norm": 8.778077125549316, + "learning_rate": 3.217053153480142e-05, + "loss": 0.3502, + "step": 453400 + }, + { + "epoch": 20.379189504897116, + "grad_norm": 22.368091583251953, + "learning_rate": 3.2157008920081115e-05, + "loss": 0.373, + "step": 453600 + }, + { + "epoch": 20.388175038188518, + "grad_norm": 2.329055070877075, + "learning_rate": 3.2143484024058186e-05, + "loss": 0.3252, + "step": 453800 + }, + { + "epoch": 20.39716057147992, + "grad_norm": 8.0297269821167, + "learning_rate": 3.212995685104369e-05, + "loss": 0.3704, + "step": 454000 + }, + { + "epoch": 20.39716057147992, + "eval_loss": 3.886225938796997, + "eval_runtime": 1143.4802, + "eval_samples_per_second": 8.661, + "eval_steps_per_second": 0.034, + "step": 454000 + }, + { + "epoch": 20.406146104771317, + "grad_norm": 4.103653430938721, + "learning_rate": 3.2116427405349437e-05, + "loss": 0.3638, + "step": 454200 + }, + { + "epoch": 20.415131638062718, + "grad_norm": 12.913371086120605, + "learning_rate": 3.210289569128795e-05, + "loss": 0.3766, + "step": 454400 + }, + { + "epoch": 20.42411717135412, + "grad_norm": 8.67467975616455, + "learning_rate": 3.208936171317246e-05, + "loss": 0.3515, + "step": 454600 + }, + { + "epoch": 20.43310270464552, + "grad_norm": 14.403546333312988, + "learning_rate": 3.2075825475316954e-05, + "loss": 0.3751, + "step": 454800 + }, + { + "epoch": 20.44208823793692, + "grad_norm": 4.453256607055664, + "learning_rate": 3.20622869820361e-05, + "loss": 0.37, + "step": 455000 + }, + { + "epoch": 20.44208823793692, + "eval_loss": 3.873455762863159, + "eval_runtime": 1125.7815, + "eval_samples_per_second": 8.797, + "eval_steps_per_second": 0.035, + "step": 455000 + }, + { + "epoch": 20.451073771228323, + "grad_norm": 12.016096115112305, + "learning_rate": 3.204874623764532e-05, + "loss": 0.3539, + "step": 455200 + }, + { + "epoch": 20.460059304519724, + "grad_norm": 10.212580680847168, + "learning_rate": 3.2035203246460725e-05, + "loss": 0.3843, + "step": 455400 + }, + { + "epoch": 20.469044837811126, + "grad_norm": 6.088382720947266, + "learning_rate": 3.2021658012799166e-05, + "loss": 0.3938, + "step": 455600 + }, + { + "epoch": 20.478030371102523, + "grad_norm": 11.492984771728516, + "learning_rate": 3.200811054097819e-05, + "loss": 0.372, 
+ "step": 455800 + }, + { + "epoch": 20.487015904393925, + "grad_norm": 12.331425666809082, + "learning_rate": 3.1994560835316073e-05, + "loss": 0.3457, + "step": 456000 + }, + { + "epoch": 20.487015904393925, + "eval_loss": 3.8303720951080322, + "eval_runtime": 1114.4203, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 0.035, + "step": 456000 + }, + { + "epoch": 20.496001437685326, + "grad_norm": 28.88426399230957, + "learning_rate": 3.198100890013178e-05, + "loss": 0.3414, + "step": 456200 + }, + { + "epoch": 20.504986970976727, + "grad_norm": 12.088685989379883, + "learning_rate": 3.196745473974502e-05, + "loss": 0.3848, + "step": 456400 + }, + { + "epoch": 20.51397250426813, + "grad_norm": 15.99104118347168, + "learning_rate": 3.195389835847619e-05, + "loss": 0.3815, + "step": 456600 + }, + { + "epoch": 20.52295803755953, + "grad_norm": 7.567880153656006, + "learning_rate": 3.194033976064637e-05, + "loss": 0.3409, + "step": 456800 + }, + { + "epoch": 20.53194357085093, + "grad_norm": 0.6070024371147156, + "learning_rate": 3.192677895057742e-05, + "loss": 0.3422, + "step": 457000 + }, + { + "epoch": 20.53194357085093, + "eval_loss": 3.879889726638794, + "eval_runtime": 1114.428, + "eval_samples_per_second": 8.887, + "eval_steps_per_second": 0.035, + "step": 457000 + }, + { + "epoch": 20.540929104142332, + "grad_norm": 1.9777508974075317, + "learning_rate": 3.1913215932591826e-05, + "loss": 0.3976, + "step": 457200 + }, + { + "epoch": 20.54991463743373, + "grad_norm": 2.3788673877716064, + "learning_rate": 3.189965071101282e-05, + "loss": 0.3776, + "step": 457400 + }, + { + "epoch": 20.55890017072513, + "grad_norm": 10.905414581298828, + "learning_rate": 3.188608329016433e-05, + "loss": 0.374, + "step": 457600 + }, + { + "epoch": 20.567885704016533, + "grad_norm": 9.221813201904297, + "learning_rate": 3.187251367437099e-05, + "loss": 0.3753, + "step": 457800 + }, + { + "epoch": 20.576871237307934, + "grad_norm": 35.775840759277344, + "learning_rate": 3.185894186795811e-05, + "loss": 0.3513, + "step": 458000 + }, + { + "epoch": 20.576871237307934, + "eval_loss": 3.8578977584838867, + "eval_runtime": 1114.7006, + "eval_samples_per_second": 8.885, + "eval_steps_per_second": 0.035, + "step": 458000 + }, + { + "epoch": 20.585856770599335, + "grad_norm": 8.585643768310547, + "learning_rate": 3.184536787525173e-05, + "loss": 0.3549, + "step": 458200 + }, + { + "epoch": 20.594842303890736, + "grad_norm": 7.512677192687988, + "learning_rate": 3.183179170057857e-05, + "loss": 0.3572, + "step": 458400 + }, + { + "epoch": 20.603827837182138, + "grad_norm": 11.871265411376953, + "learning_rate": 3.1818213348266035e-05, + "loss": 0.3588, + "step": 458600 + }, + { + "epoch": 20.61281337047354, + "grad_norm": 4.45906925201416, + "learning_rate": 3.180463282264225e-05, + "loss": 0.3437, + "step": 458800 + }, + { + "epoch": 20.621798903764937, + "grad_norm": 3.7630507946014404, + "learning_rate": 3.179105012803601e-05, + "loss": 0.3904, + "step": 459000 + }, + { + "epoch": 20.621798903764937, + "eval_loss": 3.8454971313476562, + "eval_runtime": 1116.6233, + "eval_samples_per_second": 8.87, + "eval_steps_per_second": 0.035, + "step": 459000 + }, + { + "epoch": 20.630784437056338, + "grad_norm": 9.435053825378418, + "learning_rate": 3.1777465268776805e-05, + "loss": 0.3552, + "step": 459200 + }, + { + "epoch": 20.63976997034774, + "grad_norm": 0.3744598925113678, + "learning_rate": 3.176387824919484e-05, + "loss": 0.3446, + "step": 459400 + }, + { + "epoch": 20.64875550363914, + "grad_norm": 
2.1311497688293457, + "learning_rate": 3.175028907362097e-05, + "loss": 0.3755, + "step": 459600 + }, + { + "epoch": 20.657741036930542, + "grad_norm": 7.7464141845703125, + "learning_rate": 3.173669774638677e-05, + "loss": 0.3599, + "step": 459800 + }, + { + "epoch": 20.666726570221943, + "grad_norm": 18.331575393676758, + "learning_rate": 3.172310427182448e-05, + "loss": 0.3311, + "step": 460000 + }, + { + "epoch": 20.666726570221943, + "eval_loss": 3.899061918258667, + "eval_runtime": 1122.1771, + "eval_samples_per_second": 8.826, + "eval_steps_per_second": 0.035, + "step": 460000 + }, + { + "epoch": 20.675712103513344, + "grad_norm": 4.977959156036377, + "learning_rate": 3.1709508654267026e-05, + "loss": 0.3996, + "step": 460200 + }, + { + "epoch": 20.684697636804746, + "grad_norm": 6.856226921081543, + "learning_rate": 3.169591089804804e-05, + "loss": 0.3761, + "step": 460400 + }, + { + "epoch": 20.693683170096143, + "grad_norm": 8.389673233032227, + "learning_rate": 3.1682311007501795e-05, + "loss": 0.3726, + "step": 460600 + }, + { + "epoch": 20.702668703387545, + "grad_norm": 3.833249807357788, + "learning_rate": 3.1668708986963284e-05, + "loss": 0.3422, + "step": 460800 + }, + { + "epoch": 20.711654236678946, + "grad_norm": 7.320929527282715, + "learning_rate": 3.165510484076816e-05, + "loss": 0.3855, + "step": 461000 + }, + { + "epoch": 20.711654236678946, + "eval_loss": 3.8244404792785645, + "eval_runtime": 1128.0561, + "eval_samples_per_second": 8.78, + "eval_steps_per_second": 0.035, + "step": 461000 + }, + { + "epoch": 20.720639769970347, + "grad_norm": 3.787951946258545, + "learning_rate": 3.164149857325276e-05, + "loss": 0.3799, + "step": 461200 + }, + { + "epoch": 20.72962530326175, + "grad_norm": 5.104145526885986, + "learning_rate": 3.162789018875408e-05, + "loss": 0.3677, + "step": 461400 + }, + { + "epoch": 20.73861083655315, + "grad_norm": 6.0579962730407715, + "learning_rate": 3.1614279691609804e-05, + "loss": 0.3492, + "step": 461600 + }, + { + "epoch": 20.74759636984455, + "grad_norm": 5.607633590698242, + "learning_rate": 3.1600667086158315e-05, + "loss": 0.3562, + "step": 461800 + }, + { + "epoch": 20.756581903135952, + "grad_norm": 13.053763389587402, + "learning_rate": 3.158705237673861e-05, + "loss": 0.3833, + "step": 462000 + }, + { + "epoch": 20.756581903135952, + "eval_loss": 3.8414077758789062, + "eval_runtime": 1119.0904, + "eval_samples_per_second": 8.85, + "eval_steps_per_second": 0.035, + "step": 462000 + }, + { + "epoch": 20.765567436427354, + "grad_norm": 8.402251243591309, + "learning_rate": 3.157343556769041e-05, + "loss": 0.412, + "step": 462200 + }, + { + "epoch": 20.77455296971875, + "grad_norm": 21.891206741333008, + "learning_rate": 3.1559816663354076e-05, + "loss": 0.3489, + "step": 462400 + }, + { + "epoch": 20.783538503010153, + "grad_norm": 6.903267860412598, + "learning_rate": 3.1546195668070646e-05, + "loss": 0.389, + "step": 462600 + }, + { + "epoch": 20.792524036301554, + "grad_norm": 5.88771915435791, + "learning_rate": 3.153257258618183e-05, + "loss": 0.3546, + "step": 462800 + }, + { + "epoch": 20.801509569592955, + "grad_norm": 5.859227657318115, + "learning_rate": 3.151894742202999e-05, + "loss": 0.3742, + "step": 463000 + }, + { + "epoch": 20.801509569592955, + "eval_loss": 3.807049512863159, + "eval_runtime": 1121.8109, + "eval_samples_per_second": 8.829, + "eval_steps_per_second": 0.035, + "step": 463000 + }, + { + "epoch": 20.810495102884357, + "grad_norm": 9.092805862426758, + "learning_rate": 3.150532017995816e-05, + "loss": 
0.3714, + "step": 463200 + }, + { + "epoch": 20.819480636175758, + "grad_norm": 32.67975997924805, + "learning_rate": 3.149169086431003e-05, + "loss": 0.4, + "step": 463400 + }, + { + "epoch": 20.82846616946716, + "grad_norm": 8.08678913116455, + "learning_rate": 3.1478059479429966e-05, + "loss": 0.3589, + "step": 463600 + }, + { + "epoch": 20.83745170275856, + "grad_norm": 2.283585548400879, + "learning_rate": 3.146442602966297e-05, + "loss": 0.3339, + "step": 463800 + }, + { + "epoch": 20.846437236049958, + "grad_norm": 8.233623504638672, + "learning_rate": 3.145079051935475e-05, + "loss": 0.3761, + "step": 464000 + }, + { + "epoch": 20.846437236049958, + "eval_loss": 3.8668360710144043, + "eval_runtime": 1173.3335, + "eval_samples_per_second": 8.441, + "eval_steps_per_second": 0.033, + "step": 464000 + }, + { + "epoch": 20.85542276934136, + "grad_norm": 5.021024703979492, + "learning_rate": 3.143715295285158e-05, + "loss": 0.339, + "step": 464200 + }, + { + "epoch": 20.86440830263276, + "grad_norm": 7.741531848907471, + "learning_rate": 3.142351333450049e-05, + "loss": 0.3532, + "step": 464400 + }, + { + "epoch": 20.873393835924162, + "grad_norm": 3.023864984512329, + "learning_rate": 3.140987166864911e-05, + "loss": 0.3614, + "step": 464600 + }, + { + "epoch": 20.882379369215563, + "grad_norm": 5.5194549560546875, + "learning_rate": 3.1396227959645717e-05, + "loss": 0.3642, + "step": 464800 + }, + { + "epoch": 20.891364902506965, + "grad_norm": 0.732132613658905, + "learning_rate": 3.138258221183928e-05, + "loss": 0.3897, + "step": 465000 + }, + { + "epoch": 20.891364902506965, + "eval_loss": 3.830918073654175, + "eval_runtime": 1150.322, + "eval_samples_per_second": 8.61, + "eval_steps_per_second": 0.034, + "step": 465000 + }, + { + "epoch": 20.900350435798366, + "grad_norm": 4.300996780395508, + "learning_rate": 3.1368934429579376e-05, + "loss": 0.302, + "step": 465200 + }, + { + "epoch": 20.909335969089767, + "grad_norm": 5.096749782562256, + "learning_rate": 3.135528461721624e-05, + "loss": 0.3462, + "step": 465400 + }, + { + "epoch": 20.918321502381165, + "grad_norm": 13.806108474731445, + "learning_rate": 3.134163277910078e-05, + "loss": 0.3477, + "step": 465600 + }, + { + "epoch": 20.927307035672566, + "grad_norm": 1.5174065828323364, + "learning_rate": 3.1327978919584526e-05, + "loss": 0.3579, + "step": 465800 + }, + { + "epoch": 20.936292568963967, + "grad_norm": 4.7623395919799805, + "learning_rate": 3.131432304301965e-05, + "loss": 0.3539, + "step": 466000 + }, + { + "epoch": 20.936292568963967, + "eval_loss": 3.8357908725738525, + "eval_runtime": 1154.0612, + "eval_samples_per_second": 8.582, + "eval_steps_per_second": 0.034, + "step": 466000 + }, + { + "epoch": 20.94527810225537, + "grad_norm": 13.757698059082031, + "learning_rate": 3.130066515375897e-05, + "loss": 0.3352, + "step": 466200 + }, + { + "epoch": 20.95426363554677, + "grad_norm": 4.73702335357666, + "learning_rate": 3.1287005256155964e-05, + "loss": 0.3747, + "step": 466400 + }, + { + "epoch": 20.96324916883817, + "grad_norm": 0.19603075087070465, + "learning_rate": 3.1273343354564734e-05, + "loss": 0.382, + "step": 466600 + }, + { + "epoch": 20.972234702129573, + "grad_norm": 2.0142762660980225, + "learning_rate": 3.1259679453340006e-05, + "loss": 0.3544, + "step": 466800 + }, + { + "epoch": 20.981220235420974, + "grad_norm": 13.178425788879395, + "learning_rate": 3.1246013556837184e-05, + "loss": 0.3255, + "step": 467000 + }, + { + "epoch": 20.981220235420974, + "eval_loss": 3.835940361022949, + 
"eval_runtime": 1155.7445, + "eval_samples_per_second": 8.569, + "eval_steps_per_second": 0.034, + "step": 467000 + }, + { + "epoch": 20.99020576871237, + "grad_norm": 9.660638809204102, + "learning_rate": 3.1232345669412265e-05, + "loss": 0.3552, + "step": 467200 + }, + { + "epoch": 20.999191302003773, + "grad_norm": 5.755095958709717, + "learning_rate": 3.121867579542191e-05, + "loss": 0.3652, + "step": 467400 + }, + { + "epoch": 21.008176835295174, + "grad_norm": 23.942413330078125, + "learning_rate": 3.1205003939223395e-05, + "loss": 0.3479, + "step": 467600 + }, + { + "epoch": 21.017162368586575, + "grad_norm": 5.542444229125977, + "learning_rate": 3.119133010517465e-05, + "loss": 0.3158, + "step": 467800 + }, + { + "epoch": 21.026147901877977, + "grad_norm": 3.515453815460205, + "learning_rate": 3.1177654297634203e-05, + "loss": 0.2882, + "step": 468000 + }, + { + "epoch": 21.026147901877977, + "eval_loss": 3.8817296028137207, + "eval_runtime": 1153.4188, + "eval_samples_per_second": 8.587, + "eval_steps_per_second": 0.034, + "step": 468000 + }, + { + "epoch": 21.035133435169378, + "grad_norm": 3.5313735008239746, + "learning_rate": 3.116397652096124e-05, + "loss": 0.3262, + "step": 468200 + }, + { + "epoch": 21.04411896846078, + "grad_norm": 10.718170166015625, + "learning_rate": 3.1150296779515566e-05, + "loss": 0.337, + "step": 468400 + }, + { + "epoch": 21.05310450175218, + "grad_norm": 8.422656059265137, + "learning_rate": 3.11366150776576e-05, + "loss": 0.3319, + "step": 468600 + }, + { + "epoch": 21.06209003504358, + "grad_norm": 7.027642726898193, + "learning_rate": 3.11229314197484e-05, + "loss": 0.3825, + "step": 468800 + }, + { + "epoch": 21.07107556833498, + "grad_norm": 2.228684902191162, + "learning_rate": 3.110924581014964e-05, + "loss": 0.329, + "step": 469000 + }, + { + "epoch": 21.07107556833498, + "eval_loss": 3.8373589515686035, + "eval_runtime": 1150.8556, + "eval_samples_per_second": 8.606, + "eval_steps_per_second": 0.034, + "step": 469000 + }, + { + "epoch": 21.08006110162638, + "grad_norm": 6.492588996887207, + "learning_rate": 3.109555825322364e-05, + "loss": 0.3721, + "step": 469200 + }, + { + "epoch": 21.089046634917782, + "grad_norm": 5.467384338378906, + "learning_rate": 3.1081868753333306e-05, + "loss": 0.3371, + "step": 469400 + }, + { + "epoch": 21.098032168209183, + "grad_norm": 19.02194595336914, + "learning_rate": 3.106817731484216e-05, + "loss": 0.3575, + "step": 469600 + }, + { + "epoch": 21.107017701500585, + "grad_norm": 5.688388347625732, + "learning_rate": 3.105448394211439e-05, + "loss": 0.3323, + "step": 469800 + }, + { + "epoch": 21.116003234791986, + "grad_norm": 6.124304294586182, + "learning_rate": 3.104078863951475e-05, + "loss": 0.3399, + "step": 470000 + }, + { + "epoch": 21.116003234791986, + "eval_loss": 3.8396663665771484, + "eval_runtime": 1148.2714, + "eval_samples_per_second": 8.625, + "eval_steps_per_second": 0.034, + "step": 470000 + }, + { + "epoch": 21.124988768083387, + "grad_norm": 14.203096389770508, + "learning_rate": 3.1027091411408634e-05, + "loss": 0.3087, + "step": 470200 + }, + { + "epoch": 21.133974301374785, + "grad_norm": 10.170199394226074, + "learning_rate": 3.101339226216205e-05, + "loss": 0.3511, + "step": 470400 + }, + { + "epoch": 21.142959834666186, + "grad_norm": 3.682291030883789, + "learning_rate": 3.099969119614161e-05, + "loss": 0.3443, + "step": 470600 + }, + { + "epoch": 21.151945367957588, + "grad_norm": 3.399019718170166, + "learning_rate": 3.098598821771454e-05, + "loss": 0.329, + "step": 470800 
+ }, + { + "epoch": 21.16093090124899, + "grad_norm": 4.879147052764893, + "learning_rate": 3.0972283331248675e-05, + "loss": 0.3404, + "step": 471000 + }, + { + "epoch": 21.16093090124899, + "eval_loss": 3.8527607917785645, + "eval_runtime": 1154.1744, + "eval_samples_per_second": 8.581, + "eval_steps_per_second": 0.034, + "step": 471000 + }, + { + "epoch": 21.16991643454039, + "grad_norm": 14.056867599487305, + "learning_rate": 3.095857654111246e-05, + "loss": 0.367, + "step": 471200 + }, + { + "epoch": 21.17890196783179, + "grad_norm": 2.038222312927246, + "learning_rate": 3.094486785167495e-05, + "loss": 0.3434, + "step": 471400 + }, + { + "epoch": 21.187887501123193, + "grad_norm": 5.393631458282471, + "learning_rate": 3.09311572673058e-05, + "loss": 0.3316, + "step": 471600 + }, + { + "epoch": 21.196873034414594, + "grad_norm": 9.57490348815918, + "learning_rate": 3.091744479237526e-05, + "loss": 0.3618, + "step": 471800 + }, + { + "epoch": 21.20585856770599, + "grad_norm": 6.818603515625, + "learning_rate": 3.090373043125421e-05, + "loss": 0.3651, + "step": 472000 + }, + { + "epoch": 21.20585856770599, + "eval_loss": 3.847317695617676, + "eval_runtime": 1155.725, + "eval_samples_per_second": 8.57, + "eval_steps_per_second": 0.034, + "step": 472000 + }, + { + "epoch": 21.214844100997393, + "grad_norm": 2.522334575653076, + "learning_rate": 3.0890014188314095e-05, + "loss": 0.3264, + "step": 472200 + }, + { + "epoch": 21.223829634288794, + "grad_norm": 25.88078498840332, + "learning_rate": 3.0876296067927e-05, + "loss": 0.3423, + "step": 472400 + }, + { + "epoch": 21.232815167580195, + "grad_norm": 0.09056749939918518, + "learning_rate": 3.0862576074465566e-05, + "loss": 0.3413, + "step": 472600 + }, + { + "epoch": 21.241800700871597, + "grad_norm": 28.01805305480957, + "learning_rate": 3.0848854212303065e-05, + "loss": 0.3273, + "step": 472800 + }, + { + "epoch": 21.250786234162998, + "grad_norm": 6.097854137420654, + "learning_rate": 3.083513048581335e-05, + "loss": 0.3848, + "step": 473000 + }, + { + "epoch": 21.250786234162998, + "eval_loss": 3.879460334777832, + "eval_runtime": 1149.2535, + "eval_samples_per_second": 8.618, + "eval_steps_per_second": 0.034, + "step": 473000 + }, + { + "epoch": 21.2597717674544, + "grad_norm": 0.36335647106170654, + "learning_rate": 3.082140489937088e-05, + "loss": 0.3841, + "step": 473200 + }, + { + "epoch": 21.2687573007458, + "grad_norm": 2.704850435256958, + "learning_rate": 3.080767745735067e-05, + "loss": 0.3488, + "step": 473400 + }, + { + "epoch": 21.2777428340372, + "grad_norm": 0.6730875968933105, + "learning_rate": 3.079394816412839e-05, + "loss": 0.3457, + "step": 473600 + }, + { + "epoch": 21.2867283673286, + "grad_norm": 16.261018753051758, + "learning_rate": 3.078021702408024e-05, + "loss": 0.3444, + "step": 473800 + }, + { + "epoch": 21.29571390062, + "grad_norm": 8.230804443359375, + "learning_rate": 3.076648404158303e-05, + "loss": 0.3606, + "step": 474000 + }, + { + "epoch": 21.29571390062, + "eval_loss": 3.8442225456237793, + "eval_runtime": 1152.5751, + "eval_samples_per_second": 8.593, + "eval_steps_per_second": 0.034, + "step": 474000 + }, + { + "epoch": 21.304699433911402, + "grad_norm": 6.650168418884277, + "learning_rate": 3.075274922101418e-05, + "loss": 0.3307, + "step": 474200 + }, + { + "epoch": 21.313684967202803, + "grad_norm": 9.012650489807129, + "learning_rate": 3.073901256675166e-05, + "loss": 0.3595, + "step": 474400 + }, + { + "epoch": 21.322670500494205, + "grad_norm": 3.0658600330352783, + "learning_rate": 
3.072527408317403e-05, + "loss": 0.365, + "step": 474600 + }, + { + "epoch": 21.331656033785606, + "grad_norm": 8.665407180786133, + "learning_rate": 3.071153377466047e-05, + "loss": 0.3393, + "step": 474800 + }, + { + "epoch": 21.340641567077007, + "grad_norm": 0.1144244521856308, + "learning_rate": 3.0697791645590696e-05, + "loss": 0.3567, + "step": 475000 + }, + { + "epoch": 21.340641567077007, + "eval_loss": 3.848034143447876, + "eval_runtime": 1168.8081, + "eval_samples_per_second": 8.474, + "eval_steps_per_second": 0.033, + "step": 475000 + }, + { + "epoch": 21.34962710036841, + "grad_norm": 9.049808502197266, + "learning_rate": 3.068404770034503e-05, + "loss": 0.3773, + "step": 475200 + }, + { + "epoch": 21.358612633659806, + "grad_norm": 5.73265266418457, + "learning_rate": 3.067030194330437e-05, + "loss": 0.3476, + "step": 475400 + }, + { + "epoch": 21.367598166951208, + "grad_norm": 12.6224365234375, + "learning_rate": 3.065655437885018e-05, + "loss": 0.3389, + "step": 475600 + }, + { + "epoch": 21.37658370024261, + "grad_norm": 19.895153045654297, + "learning_rate": 3.06428050113645e-05, + "loss": 0.3646, + "step": 475800 + }, + { + "epoch": 21.38556923353401, + "grad_norm": 9.202630043029785, + "learning_rate": 3.062905384522998e-05, + "loss": 0.4052, + "step": 476000 + }, + { + "epoch": 21.38556923353401, + "eval_loss": 3.8101115226745605, + "eval_runtime": 1161.6908, + "eval_samples_per_second": 8.526, + "eval_steps_per_second": 0.034, + "step": 476000 + }, + { + "epoch": 21.39455476682541, + "grad_norm": 24.745006561279297, + "learning_rate": 3.0615300884829785e-05, + "loss": 0.3686, + "step": 476200 + }, + { + "epoch": 21.403540300116813, + "grad_norm": 2.2949283123016357, + "learning_rate": 3.060154613454771e-05, + "loss": 0.3118, + "step": 476400 + }, + { + "epoch": 21.412525833408214, + "grad_norm": 1.272202491760254, + "learning_rate": 3.058778959876807e-05, + "loss": 0.3484, + "step": 476600 + }, + { + "epoch": 21.421511366699615, + "grad_norm": 0.6712559461593628, + "learning_rate": 3.057403128187578e-05, + "loss": 0.3196, + "step": 476800 + }, + { + "epoch": 21.430496899991013, + "grad_norm": 4.88563346862793, + "learning_rate": 3.056027118825632e-05, + "loss": 0.3432, + "step": 477000 + }, + { + "epoch": 21.430496899991013, + "eval_loss": 3.836414098739624, + "eval_runtime": 1156.4826, + "eval_samples_per_second": 8.564, + "eval_steps_per_second": 0.034, + "step": 477000 + }, + { + "epoch": 21.439482433282414, + "grad_norm": 5.171449661254883, + "learning_rate": 3.054650932229573e-05, + "loss": 0.3461, + "step": 477200 + }, + { + "epoch": 21.448467966573816, + "grad_norm": 6.105608940124512, + "learning_rate": 3.053274568838061e-05, + "loss": 0.3616, + "step": 477400 + }, + { + "epoch": 21.457453499865217, + "grad_norm": 0.032906968146562576, + "learning_rate": 3.051898029089814e-05, + "loss": 0.3433, + "step": 477600 + }, + { + "epoch": 21.466439033156618, + "grad_norm": 15.590333938598633, + "learning_rate": 3.0505213134236043e-05, + "loss": 0.3356, + "step": 477800 + }, + { + "epoch": 21.47542456644802, + "grad_norm": 4.688640117645264, + "learning_rate": 3.0491444222782616e-05, + "loss": 0.3906, + "step": 478000 + }, + { + "epoch": 21.47542456644802, + "eval_loss": 3.85675048828125, + "eval_runtime": 1155.4131, + "eval_samples_per_second": 8.572, + "eval_steps_per_second": 0.034, + "step": 478000 + }, + { + "epoch": 21.48441009973942, + "grad_norm": 10.541050910949707, + "learning_rate": 3.0477673560926723e-05, + "loss": 0.3419, + "step": 478200 + }, + { + 
"epoch": 21.493395633030822, + "grad_norm": 2.6476938724517822, + "learning_rate": 3.046390115305775e-05, + "loss": 0.3415, + "step": 478400 + }, + { + "epoch": 21.50238116632222, + "grad_norm": 14.356165885925293, + "learning_rate": 3.0450127003565676e-05, + "loss": 0.3367, + "step": 478600 + }, + { + "epoch": 21.51136669961362, + "grad_norm": 16.879222869873047, + "learning_rate": 3.043635111684102e-05, + "loss": 0.3584, + "step": 478800 + }, + { + "epoch": 21.520352232905022, + "grad_norm": 7.5179009437561035, + "learning_rate": 3.0422573497274865e-05, + "loss": 0.3594, + "step": 479000 + }, + { + "epoch": 21.520352232905022, + "eval_loss": 3.820604085922241, + "eval_runtime": 1154.9865, + "eval_samples_per_second": 8.575, + "eval_steps_per_second": 0.034, + "step": 479000 + }, + { + "epoch": 21.529337766196424, + "grad_norm": 14.661418914794922, + "learning_rate": 3.040879414925883e-05, + "loss": 0.3627, + "step": 479200 + }, + { + "epoch": 21.538323299487825, + "grad_norm": 38.703025817871094, + "learning_rate": 3.0395013077185103e-05, + "loss": 0.3574, + "step": 479400 + }, + { + "epoch": 21.547308832779226, + "grad_norm": 4.57069730758667, + "learning_rate": 3.0381230285446395e-05, + "loss": 0.2861, + "step": 479600 + }, + { + "epoch": 21.556294366070627, + "grad_norm": 15.500905990600586, + "learning_rate": 3.036744577843601e-05, + "loss": 0.3579, + "step": 479800 + }, + { + "epoch": 21.56527989936203, + "grad_norm": 5.1388959884643555, + "learning_rate": 3.0353659560547748e-05, + "loss": 0.3689, + "step": 480000 + }, + { + "epoch": 21.56527989936203, + "eval_loss": 3.8755042552948, + "eval_runtime": 1153.7667, + "eval_samples_per_second": 8.584, + "eval_steps_per_second": 0.034, + "step": 480000 + }, + { + "epoch": 21.574265432653426, + "grad_norm": 0.9813115000724792, + "learning_rate": 3.0339871636175982e-05, + "loss": 0.3489, + "step": 480200 + }, + { + "epoch": 21.583250965944828, + "grad_norm": 10.196927070617676, + "learning_rate": 3.0326082009715636e-05, + "loss": 0.3901, + "step": 480400 + }, + { + "epoch": 21.59223649923623, + "grad_norm": 14.794051170349121, + "learning_rate": 3.031229068556215e-05, + "loss": 0.3294, + "step": 480600 + }, + { + "epoch": 21.60122203252763, + "grad_norm": 14.24916934967041, + "learning_rate": 3.029849766811153e-05, + "loss": 0.387, + "step": 480800 + }, + { + "epoch": 21.61020756581903, + "grad_norm": 15.70306396484375, + "learning_rate": 3.0284702961760304e-05, + "loss": 0.3595, + "step": 481000 + }, + { + "epoch": 21.61020756581903, + "eval_loss": 3.8320348262786865, + "eval_runtime": 1154.7214, + "eval_samples_per_second": 8.577, + "eval_steps_per_second": 0.034, + "step": 481000 + }, + { + "epoch": 21.619193099110433, + "grad_norm": 16.37736701965332, + "learning_rate": 3.027090657090556e-05, + "loss": 0.3717, + "step": 481200 + }, + { + "epoch": 21.628178632401834, + "grad_norm": 3.5008671283721924, + "learning_rate": 3.025710849994489e-05, + "loss": 0.3668, + "step": 481400 + }, + { + "epoch": 21.637164165693235, + "grad_norm": 9.52043628692627, + "learning_rate": 3.024330875327646e-05, + "loss": 0.3244, + "step": 481600 + }, + { + "epoch": 21.646149698984633, + "grad_norm": 8.85307502746582, + "learning_rate": 3.022950733529894e-05, + "loss": 0.3817, + "step": 481800 + }, + { + "epoch": 21.655135232276034, + "grad_norm": 18.641752243041992, + "learning_rate": 3.0215704250411542e-05, + "loss": 0.3254, + "step": 482000 + }, + { + "epoch": 21.655135232276034, + "eval_loss": 3.8365020751953125, + "eval_runtime": 1155.1846, + 
"eval_samples_per_second": 8.574, + "eval_steps_per_second": 0.034, + "step": 482000 + }, + { + "epoch": 21.664120765567436, + "grad_norm": 11.407354354858398, + "learning_rate": 3.0201899503014013e-05, + "loss": 0.3427, + "step": 482200 + }, + { + "epoch": 21.673106298858837, + "grad_norm": 20.381561279296875, + "learning_rate": 3.0188093097506642e-05, + "loss": 0.3127, + "step": 482400 + }, + { + "epoch": 21.68209183215024, + "grad_norm": 11.307368278503418, + "learning_rate": 3.0174285038290208e-05, + "loss": 0.356, + "step": 482600 + }, + { + "epoch": 21.69107736544164, + "grad_norm": 4.448453903198242, + "learning_rate": 3.016047532976606e-05, + "loss": 0.3319, + "step": 482800 + }, + { + "epoch": 21.70006289873304, + "grad_norm": 14.862668991088867, + "learning_rate": 3.0146663976336036e-05, + "loss": 0.3684, + "step": 483000 + }, + { + "epoch": 21.70006289873304, + "eval_loss": 3.879840135574341, + "eval_runtime": 1155.6614, + "eval_samples_per_second": 8.57, + "eval_steps_per_second": 0.034, + "step": 483000 + }, + { + "epoch": 21.709048432024442, + "grad_norm": 7.227370738983154, + "learning_rate": 3.0132850982402538e-05, + "loss": 0.3515, + "step": 483200 + }, + { + "epoch": 21.71803396531584, + "grad_norm": 1.9134999513626099, + "learning_rate": 3.0119036352368463e-05, + "loss": 0.3544, + "step": 483400 + }, + { + "epoch": 21.72701949860724, + "grad_norm": 5.353797912597656, + "learning_rate": 3.010522009063722e-05, + "loss": 0.325, + "step": 483600 + }, + { + "epoch": 21.736005031898642, + "grad_norm": 3.9726414680480957, + "learning_rate": 3.0091402201612785e-05, + "loss": 0.3743, + "step": 483800 + }, + { + "epoch": 21.744990565190044, + "grad_norm": 7.579124927520752, + "learning_rate": 3.007758268969959e-05, + "loss": 0.3347, + "step": 484000 + }, + { + "epoch": 21.744990565190044, + "eval_loss": 3.8592593669891357, + "eval_runtime": 1154.5705, + "eval_samples_per_second": 8.578, + "eval_steps_per_second": 0.034, + "step": 484000 + }, + { + "epoch": 21.753976098481445, + "grad_norm": 2.528778076171875, + "learning_rate": 3.0063761559302626e-05, + "loss": 0.3497, + "step": 484200 + }, + { + "epoch": 21.762961631772846, + "grad_norm": 7.943315029144287, + "learning_rate": 3.0049938814827405e-05, + "loss": 0.3666, + "step": 484400 + }, + { + "epoch": 21.771947165064248, + "grad_norm": 33.58492660522461, + "learning_rate": 3.0036114460679926e-05, + "loss": 0.3457, + "step": 484600 + }, + { + "epoch": 21.78093269835565, + "grad_norm": 1.3153636455535889, + "learning_rate": 3.002228850126671e-05, + "loss": 0.3493, + "step": 484800 + }, + { + "epoch": 21.789918231647047, + "grad_norm": 8.177019119262695, + "learning_rate": 3.00084609409948e-05, + "loss": 0.3624, + "step": 485000 + }, + { + "epoch": 21.789918231647047, + "eval_loss": 3.820582389831543, + "eval_runtime": 1154.2343, + "eval_samples_per_second": 8.581, + "eval_steps_per_second": 0.034, + "step": 485000 + }, + { + "epoch": 21.798903764938448, + "grad_norm": 3.7506697177886963, + "learning_rate": 2.9994631784271743e-05, + "loss": 0.3678, + "step": 485200 + }, + { + "epoch": 21.80788929822985, + "grad_norm": 14.741352081298828, + "learning_rate": 2.998080103550558e-05, + "loss": 0.3489, + "step": 485400 + }, + { + "epoch": 21.81687483152125, + "grad_norm": 9.07077693939209, + "learning_rate": 2.9966968699104896e-05, + "loss": 0.325, + "step": 485600 + }, + { + "epoch": 21.82586036481265, + "grad_norm": 56.59426498413086, + "learning_rate": 2.995313477947875e-05, + "loss": 0.3738, + "step": 485800 + }, + { + "epoch": 
21.834845898104053, + "grad_norm": 16.987424850463867, + "learning_rate": 2.993929928103671e-05, + "loss": 0.3698, + "step": 486000 + }, + { + "epoch": 21.834845898104053, + "eval_loss": 3.7959418296813965, + "eval_runtime": 1183.6378, + "eval_samples_per_second": 8.367, + "eval_steps_per_second": 0.033, + "step": 486000 + }, + { + "epoch": 21.843831431395454, + "grad_norm": 23.582782745361328, + "learning_rate": 2.992546220818886e-05, + "loss": 0.3545, + "step": 486200 + }, + { + "epoch": 21.852816964686856, + "grad_norm": 8.88424301147461, + "learning_rate": 2.991162356534577e-05, + "loss": 0.3428, + "step": 486400 + }, + { + "epoch": 21.861802497978253, + "grad_norm": 9.823083877563477, + "learning_rate": 2.9897783356918536e-05, + "loss": 0.3352, + "step": 486600 + }, + { + "epoch": 21.870788031269655, + "grad_norm": 1.0258564949035645, + "learning_rate": 2.988394158731872e-05, + "loss": 0.3661, + "step": 486800 + }, + { + "epoch": 21.879773564561056, + "grad_norm": 2.3258697986602783, + "learning_rate": 2.98700982609584e-05, + "loss": 0.3484, + "step": 487000 + }, + { + "epoch": 21.879773564561056, + "eval_loss": 3.8458335399627686, + "eval_runtime": 1171.3081, + "eval_samples_per_second": 8.456, + "eval_steps_per_second": 0.033, + "step": 487000 + }, + { + "epoch": 21.888759097852457, + "grad_norm": 16.876636505126953, + "learning_rate": 2.985625338225016e-05, + "loss": 0.356, + "step": 487200 + }, + { + "epoch": 21.89774463114386, + "grad_norm": 1.0593225955963135, + "learning_rate": 2.9842406955607054e-05, + "loss": 0.3426, + "step": 487400 + }, + { + "epoch": 21.90673016443526, + "grad_norm": 0.3930041491985321, + "learning_rate": 2.9828558985442647e-05, + "loss": 0.3712, + "step": 487600 + }, + { + "epoch": 21.91571569772666, + "grad_norm": 47.871334075927734, + "learning_rate": 2.9814709476170988e-05, + "loss": 0.3656, + "step": 487800 + }, + { + "epoch": 21.924701231018062, + "grad_norm": 7.659090042114258, + "learning_rate": 2.9800858432206625e-05, + "loss": 0.3934, + "step": 488000 + }, + { + "epoch": 21.924701231018062, + "eval_loss": 3.867889881134033, + "eval_runtime": 1172.2377, + "eval_samples_per_second": 8.449, + "eval_steps_per_second": 0.033, + "step": 488000 + }, + { + "epoch": 21.933686764309464, + "grad_norm": 11.335125923156738, + "learning_rate": 2.9787005857964583e-05, + "loss": 0.3697, + "step": 488200 + }, + { + "epoch": 21.94267229760086, + "grad_norm": 5.224600791931152, + "learning_rate": 2.977315175786039e-05, + "loss": 0.3876, + "step": 488400 + }, + { + "epoch": 21.951657830892263, + "grad_norm": 0.7447425723075867, + "learning_rate": 2.9759296136310048e-05, + "loss": 0.3723, + "step": 488600 + }, + { + "epoch": 21.960643364183664, + "grad_norm": 13.654375076293945, + "learning_rate": 2.9745438997730045e-05, + "loss": 0.3389, + "step": 488800 + }, + { + "epoch": 21.969628897475065, + "grad_norm": 3.7496023178100586, + "learning_rate": 2.9731580346537357e-05, + "loss": 0.3349, + "step": 489000 + }, + { + "epoch": 21.969628897475065, + "eval_loss": 3.8698184490203857, + "eval_runtime": 1168.8312, + "eval_samples_per_second": 8.473, + "eval_steps_per_second": 0.033, + "step": 489000 + }, + { + "epoch": 21.978614430766466, + "grad_norm": 1.3468828201293945, + "learning_rate": 2.971772018714945e-05, + "loss": 0.3456, + "step": 489200 + }, + { + "epoch": 21.987599964057868, + "grad_norm": 6.780975341796875, + "learning_rate": 2.9703858523984245e-05, + "loss": 0.3457, + "step": 489400 + }, + { + "epoch": 21.99658549734927, + "grad_norm": 5.41343355178833, + 
"learning_rate": 2.9689995361460175e-05, + "loss": 0.3758, + "step": 489600 + }, + { + "epoch": 22.00557103064067, + "grad_norm": 4.552206993103027, + "learning_rate": 2.9676130703996124e-05, + "loss": 0.3399, + "step": 489800 + }, + { + "epoch": 22.014556563932068, + "grad_norm": 9.643780708312988, + "learning_rate": 2.9662264556011465e-05, + "loss": 0.3381, + "step": 490000 + }, + { + "epoch": 22.014556563932068, + "eval_loss": 3.8691928386688232, + "eval_runtime": 1170.7785, + "eval_samples_per_second": 8.459, + "eval_steps_per_second": 0.033, + "step": 490000 + }, + { + "epoch": 22.02354209722347, + "grad_norm": 7.726506233215332, + "learning_rate": 2.9648396921926047e-05, + "loss": 0.3159, + "step": 490200 + }, + { + "epoch": 22.03252763051487, + "grad_norm": 4.900279521942139, + "learning_rate": 2.963452780616019e-05, + "loss": 0.3327, + "step": 490400 + }, + { + "epoch": 22.041513163806272, + "grad_norm": 6.858339786529541, + "learning_rate": 2.9620657213134684e-05, + "loss": 0.3054, + "step": 490600 + }, + { + "epoch": 22.050498697097673, + "grad_norm": 1.6258982419967651, + "learning_rate": 2.9606785147270798e-05, + "loss": 0.3267, + "step": 490800 + }, + { + "epoch": 22.059484230389074, + "grad_norm": 0.9190937876701355, + "learning_rate": 2.959291161299026e-05, + "loss": 0.3167, + "step": 491000 + }, + { + "epoch": 22.059484230389074, + "eval_loss": 3.9671905040740967, + "eval_runtime": 1171.7463, + "eval_samples_per_second": 8.452, + "eval_steps_per_second": 0.033, + "step": 491000 + }, + { + "epoch": 22.068469763680476, + "grad_norm": 10.989773750305176, + "learning_rate": 2.9579036614715267e-05, + "loss": 0.3332, + "step": 491200 + }, + { + "epoch": 22.077455296971877, + "grad_norm": 10.96854305267334, + "learning_rate": 2.95651601568685e-05, + "loss": 0.3212, + "step": 491400 + }, + { + "epoch": 22.086440830263275, + "grad_norm": 5.382962703704834, + "learning_rate": 2.9551282243873068e-05, + "loss": 0.3327, + "step": 491600 + }, + { + "epoch": 22.095426363554676, + "grad_norm": 13.09936237335205, + "learning_rate": 2.953740288015259e-05, + "loss": 0.3301, + "step": 491800 + }, + { + "epoch": 22.104411896846077, + "grad_norm": 2.1858365535736084, + "learning_rate": 2.9523522070131116e-05, + "loss": 0.3324, + "step": 492000 + }, + { + "epoch": 22.104411896846077, + "eval_loss": 3.9012913703918457, + "eval_runtime": 1170.9398, + "eval_samples_per_second": 8.458, + "eval_steps_per_second": 0.033, + "step": 492000 + }, + { + "epoch": 22.11339743013748, + "grad_norm": 2.50134015083313, + "learning_rate": 2.9509639818233166e-05, + "loss": 0.2969, + "step": 492200 + }, + { + "epoch": 22.12238296342888, + "grad_norm": 1.286801815032959, + "learning_rate": 2.9495756128883716e-05, + "loss": 0.2918, + "step": 492400 + }, + { + "epoch": 22.13136849672028, + "grad_norm": 2.6734347343444824, + "learning_rate": 2.9481871006508215e-05, + "loss": 0.3323, + "step": 492600 + }, + { + "epoch": 22.140354030011682, + "grad_norm": 6.276237487792969, + "learning_rate": 2.946798445553254e-05, + "loss": 0.323, + "step": 492800 + }, + { + "epoch": 22.149339563303084, + "grad_norm": 1.7359256744384766, + "learning_rate": 2.945409648038306e-05, + "loss": 0.3305, + "step": 493000 + }, + { + "epoch": 22.149339563303084, + "eval_loss": 3.8641602993011475, + "eval_runtime": 1172.5282, + "eval_samples_per_second": 8.447, + "eval_steps_per_second": 0.033, + "step": 493000 + }, + { + "epoch": 22.15832509659448, + "grad_norm": 17.382686614990234, + "learning_rate": 2.9440207085486565e-05, + "loss": 0.3097, + 
"step": 493200 + }, + { + "epoch": 22.167310629885883, + "grad_norm": 5.912476062774658, + "learning_rate": 2.9426316275270316e-05, + "loss": 0.3329, + "step": 493400 + }, + { + "epoch": 22.176296163177284, + "grad_norm": 9.099150657653809, + "learning_rate": 2.941242405416203e-05, + "loss": 0.3517, + "step": 493600 + }, + { + "epoch": 22.185281696468685, + "grad_norm": 1.9675058126449585, + "learning_rate": 2.9398530426589843e-05, + "loss": 0.3251, + "step": 493800 + }, + { + "epoch": 22.194267229760086, + "grad_norm": 3.559220552444458, + "learning_rate": 2.9384635396982373e-05, + "loss": 0.3182, + "step": 494000 + }, + { + "epoch": 22.194267229760086, + "eval_loss": 3.8617329597473145, + "eval_runtime": 1172.2551, + "eval_samples_per_second": 8.449, + "eval_steps_per_second": 0.033, + "step": 494000 + }, + { + "epoch": 22.203252763051488, + "grad_norm": 1.4313397407531738, + "learning_rate": 2.937073896976868e-05, + "loss": 0.3291, + "step": 494200 + }, + { + "epoch": 22.21223829634289, + "grad_norm": 10.649069786071777, + "learning_rate": 2.9356841149378243e-05, + "loss": 0.3143, + "step": 494400 + }, + { + "epoch": 22.22122382963429, + "grad_norm": 2.5395827293395996, + "learning_rate": 2.934294194024102e-05, + "loss": 0.3239, + "step": 494600 + }, + { + "epoch": 22.230209362925688, + "grad_norm": 16.162391662597656, + "learning_rate": 2.9329041346787393e-05, + "loss": 0.3264, + "step": 494800 + }, + { + "epoch": 22.23919489621709, + "grad_norm": 4.001119136810303, + "learning_rate": 2.9315139373448187e-05, + "loss": 0.3633, + "step": 495000 + }, + { + "epoch": 22.23919489621709, + "eval_loss": 3.887908935546875, + "eval_runtime": 1171.3046, + "eval_samples_per_second": 8.456, + "eval_steps_per_second": 0.033, + "step": 495000 + }, + { + "epoch": 22.24818042950849, + "grad_norm": 3.224276065826416, + "learning_rate": 2.930123602465466e-05, + "loss": 0.3412, + "step": 495200 + }, + { + "epoch": 22.257165962799892, + "grad_norm": 8.406235694885254, + "learning_rate": 2.9287331304838526e-05, + "loss": 0.3101, + "step": 495400 + }, + { + "epoch": 22.266151496091293, + "grad_norm": 0.37792113423347473, + "learning_rate": 2.927342521843191e-05, + "loss": 0.313, + "step": 495600 + }, + { + "epoch": 22.275137029382694, + "grad_norm": 7.6752119064331055, + "learning_rate": 2.925951776986742e-05, + "loss": 0.3194, + "step": 495800 + }, + { + "epoch": 22.284122562674096, + "grad_norm": 8.115521430969238, + "learning_rate": 2.9245608963578035e-05, + "loss": 0.3282, + "step": 496000 + }, + { + "epoch": 22.284122562674096, + "eval_loss": 3.8440310955047607, + "eval_runtime": 1171.3815, + "eval_samples_per_second": 8.455, + "eval_steps_per_second": 0.033, + "step": 496000 + }, + { + "epoch": 22.293108095965497, + "grad_norm": 6.122352123260498, + "learning_rate": 2.9231698803997214e-05, + "loss": 0.3584, + "step": 496200 + }, + { + "epoch": 29.735234215885946, + "grad_norm": 6.727758884429932, + "learning_rate": 1.76713460327016e-05, + "loss": 0.4305, + "step": 496400 + }, + { + "epoch": 29.7472145681083, + "grad_norm": 27.16288185119629, + "learning_rate": 1.7653356059332797e-05, + "loss": 0.4504, + "step": 496600 + }, + { + "epoch": 29.759194920330657, + "grad_norm": 20.496925354003906, + "learning_rate": 1.7635370248836235e-05, + "loss": 0.4269, + "step": 496800 + }, + { + "epoch": 29.771175272553013, + "grad_norm": 11.9760160446167, + "learning_rate": 1.7617388611403342e-05, + "loss": 0.4121, + "step": 497000 + }, + { + "epoch": 29.771175272553013, + "eval_loss": 1.3001623153686523, + 
"eval_runtime": 1179.5019, + "eval_samples_per_second": 8.397, + "eval_steps_per_second": 0.525, + "step": 497000 + }, + { + "epoch": 29.783155624775368, + "grad_norm": 18.339258193969727, + "learning_rate": 1.7599411157223162e-05, + "loss": 0.3986, + "step": 497200 + }, + { + "epoch": 29.795135976997724, + "grad_norm": 13.581840515136719, + "learning_rate": 1.758143789648235e-05, + "loss": 0.4327, + "step": 497400 + }, + { + "epoch": 29.80711632922008, + "grad_norm": 7.681920528411865, + "learning_rate": 1.7563468839365203e-05, + "loss": 0.4123, + "step": 497600 + }, + { + "epoch": 29.819096681442435, + "grad_norm": 9.169760704040527, + "learning_rate": 1.7545503996053654e-05, + "loss": 0.414, + "step": 497800 + }, + { + "epoch": 29.83107703366479, + "grad_norm": 14.092098236083984, + "learning_rate": 1.7527543376727206e-05, + "loss": 0.4185, + "step": 498000 + }, + { + "epoch": 29.83107703366479, + "eval_loss": 1.3006553649902344, + "eval_runtime": 1179.444, + "eval_samples_per_second": 8.397, + "eval_steps_per_second": 0.525, + "step": 498000 + }, + { + "epoch": 29.843057385887146, + "grad_norm": 5.654545783996582, + "learning_rate": 1.7509586991563e-05, + "loss": 0.4006, + "step": 498200 + }, + { + "epoch": 29.855037738109502, + "grad_norm": 13.537749290466309, + "learning_rate": 1.7491634850735765e-05, + "loss": 0.4088, + "step": 498400 + }, + { + "epoch": 29.867018090331857, + "grad_norm": 24.24238395690918, + "learning_rate": 1.7473686964417836e-05, + "loss": 0.432, + "step": 498600 + }, + { + "epoch": 29.87899844255421, + "grad_norm": 9.747505187988281, + "learning_rate": 1.745574334277912e-05, + "loss": 0.4162, + "step": 498800 + }, + { + "epoch": 29.890978794776565, + "grad_norm": 17.57337188720703, + "learning_rate": 1.743780399598713e-05, + "loss": 0.4, + "step": 499000 + }, + { + "epoch": 29.890978794776565, + "eval_loss": 1.2909830808639526, + "eval_runtime": 1174.8573, + "eval_samples_per_second": 8.43, + "eval_steps_per_second": 0.527, + "step": 499000 + }, + { + "epoch": 29.90295914699892, + "grad_norm": 20.43497657775879, + "learning_rate": 1.7419868934206927e-05, + "loss": 0.3781, + "step": 499200 + }, + { + "epoch": 29.914939499221276, + "grad_norm": 6.868372917175293, + "learning_rate": 1.7401938167601173e-05, + "loss": 0.3713, + "step": 499400 + }, + { + "epoch": 29.926919851443632, + "grad_norm": 3.9050910472869873, + "learning_rate": 1.7384011706330083e-05, + "loss": 0.3943, + "step": 499600 + }, + { + "epoch": 29.938900203665987, + "grad_norm": 4.61909294128418, + "learning_rate": 1.7366089560551432e-05, + "loss": 0.4047, + "step": 499800 + }, + { + "epoch": 29.950880555888343, + "grad_norm": 14.102638244628906, + "learning_rate": 1.7348171740420547e-05, + "loss": 0.4009, + "step": 500000 + }, + { + "epoch": 29.950880555888343, + "eval_loss": 1.2899349927902222, + "eval_runtime": 1176.47, + "eval_samples_per_second": 8.418, + "eval_steps_per_second": 0.526, + "step": 500000 + }, + { + "epoch": 29.9628609081107, + "grad_norm": 16.03158187866211, + "learning_rate": 1.7330258256090326e-05, + "loss": 0.3929, + "step": 500200 + }, + { + "epoch": 29.974841260333054, + "grad_norm": 12.243492126464844, + "learning_rate": 1.731234911771117e-05, + "loss": 0.423, + "step": 500400 + }, + { + "epoch": 29.98682161255541, + "grad_norm": 17.75141143798828, + "learning_rate": 1.7294444335431046e-05, + "loss": 0.3905, + "step": 500600 + }, + { + "epoch": 29.998801964777766, + "grad_norm": 14.251209259033203, + "learning_rate": 1.7276543919395454e-05, + "loss": 0.4274, + "step": 
500800 + }, + { + "epoch": 30.01078231700012, + "grad_norm": 5.90828275680542, + "learning_rate": 1.725864787974741e-05, + "loss": 0.3744, + "step": 501000 + }, + { + "epoch": 30.01078231700012, + "eval_loss": 1.2981280088424683, + "eval_runtime": 1177.1036, + "eval_samples_per_second": 8.414, + "eval_steps_per_second": 0.526, + "step": 501000 + }, + { + "epoch": 30.022762669222477, + "grad_norm": 7.459860324859619, + "learning_rate": 1.724075622662745e-05, + "loss": 0.3641, + "step": 501200 + }, + { + "epoch": 30.03474302144483, + "grad_norm": 6.359617710113525, + "learning_rate": 1.7222868970173625e-05, + "loss": 0.3961, + "step": 501400 + }, + { + "epoch": 30.046723373667184, + "grad_norm": 8.468971252441406, + "learning_rate": 1.72049861205215e-05, + "loss": 0.3861, + "step": 501600 + }, + { + "epoch": 30.05870372588954, + "grad_norm": 9.226763725280762, + "learning_rate": 1.718710768780414e-05, + "loss": 0.3803, + "step": 501800 + }, + { + "epoch": 30.070684078111896, + "grad_norm": 6.459045886993408, + "learning_rate": 1.7169233682152108e-05, + "loss": 0.3691, + "step": 502000 + }, + { + "epoch": 30.070684078111896, + "eval_loss": 1.2914437055587769, + "eval_runtime": 1176.221, + "eval_samples_per_second": 8.42, + "eval_steps_per_second": 0.526, + "step": 502000 + }, + { + "epoch": 30.08266443033425, + "grad_norm": 0.5821087956428528, + "learning_rate": 1.7151364113693456e-05, + "loss": 0.3721, + "step": 502200 + }, + { + "epoch": 30.094644782556607, + "grad_norm": 0.9501954317092896, + "learning_rate": 1.713349899255372e-05, + "loss": 0.4402, + "step": 502400 + }, + { + "epoch": 30.106625134778962, + "grad_norm": 4.453815460205078, + "learning_rate": 1.7115638328855927e-05, + "loss": 0.4195, + "step": 502600 + }, + { + "epoch": 30.118605487001318, + "grad_norm": 5.928565502166748, + "learning_rate": 1.709778213272056e-05, + "loss": 0.4023, + "step": 502800 + }, + { + "epoch": 30.130585839223674, + "grad_norm": 12.186752319335938, + "learning_rate": 1.7079930414265587e-05, + "loss": 0.3775, + "step": 503000 + }, + { + "epoch": 30.130585839223674, + "eval_loss": 1.2876982688903809, + "eval_runtime": 1177.2126, + "eval_samples_per_second": 8.413, + "eval_steps_per_second": 0.526, + "step": 503000 + }, + { + "epoch": 30.14256619144603, + "grad_norm": 6.3686017990112305, + "learning_rate": 1.706208318360644e-05, + "loss": 0.3965, + "step": 503200 + }, + { + "epoch": 30.154546543668385, + "grad_norm": 5.7197089195251465, + "learning_rate": 1.7044240450855985e-05, + "loss": 0.3283, + "step": 503400 + }, + { + "epoch": 30.16652689589074, + "grad_norm": 9.594609260559082, + "learning_rate": 1.7026402226124558e-05, + "loss": 0.4004, + "step": 503600 + }, + { + "epoch": 30.178507248113096, + "grad_norm": 4.027350425720215, + "learning_rate": 1.7008568519519958e-05, + "loss": 0.4013, + "step": 503800 + }, + { + "epoch": 30.19048760033545, + "grad_norm": 5.989893913269043, + "learning_rate": 1.6990739341147378e-05, + "loss": 0.3604, + "step": 504000 + }, + { + "epoch": 30.19048760033545, + "eval_loss": 1.2966716289520264, + "eval_runtime": 1178.6668, + "eval_samples_per_second": 8.403, + "eval_steps_per_second": 0.525, + "step": 504000 + }, + { + "epoch": 30.202467952557804, + "grad_norm": 3.6295764446258545, + "learning_rate": 1.6972914701109475e-05, + "loss": 0.4039, + "step": 504200 + }, + { + "epoch": 30.21444830478016, + "grad_norm": 22.197795867919922, + "learning_rate": 1.6955094609506355e-05, + "loss": 0.3813, + "step": 504400 + }, + { + "epoch": 30.226428657002515, + "grad_norm": 
16.731632232666016, + "learning_rate": 1.6937279076435488e-05, + "loss": 0.4041, + "step": 504600 + }, + { + "epoch": 30.23840900922487, + "grad_norm": 9.170949935913086, + "learning_rate": 1.6919468111991805e-05, + "loss": 0.3707, + "step": 504800 + }, + { + "epoch": 30.250389361447226, + "grad_norm": 10.209980010986328, + "learning_rate": 1.690166172626766e-05, + "loss": 0.3934, + "step": 505000 + }, + { + "epoch": 30.250389361447226, + "eval_loss": 1.289827585220337, + "eval_runtime": 1172.8257, + "eval_samples_per_second": 8.445, + "eval_steps_per_second": 0.528, + "step": 505000 + }, + { + "epoch": 30.26236971366958, + "grad_norm": 4.348522663116455, + "learning_rate": 1.6883859929352756e-05, + "loss": 0.3851, + "step": 505200 + }, + { + "epoch": 30.274350065891937, + "grad_norm": 4.488011360168457, + "learning_rate": 1.6866062731334254e-05, + "loss": 0.402, + "step": 505400 + }, + { + "epoch": 30.286330418114293, + "grad_norm": 9.877191543579102, + "learning_rate": 1.6848270142296684e-05, + "loss": 0.4081, + "step": 505600 + }, + { + "epoch": 30.29831077033665, + "grad_norm": 8.008275032043457, + "learning_rate": 1.683048217232195e-05, + "loss": 0.3914, + "step": 505800 + }, + { + "epoch": 37.8884312991389, + "grad_norm": 9.622276306152344, + "learning_rate": 3.428469915162767e-05, + "loss": 0.4741, + "step": 506000 + }, + { + "epoch": 37.8884312991389, + "eval_loss": 1.4929417371749878, + "eval_runtime": 1176.5683, + "eval_samples_per_second": 8.418, + "eval_steps_per_second": 0.422, + "step": 506000 + }, + { + "epoch": 37.90340696368401, + "grad_norm": 9.39040756225586, + "learning_rate": 3.4273777458497844e-05, + "loss": 0.5071, + "step": 506200 + }, + { + "epoch": 37.918382628229125, + "grad_norm": 5.257010459899902, + "learning_rate": 3.426285371263784e-05, + "loss": 0.5165, + "step": 506400 + }, + { + "epoch": 37.933358292774244, + "grad_norm": 11.673007011413574, + "learning_rate": 3.425192791646561e-05, + "loss": 0.4934, + "step": 506600 + }, + { + "epoch": 37.948333957319356, + "grad_norm": 8.285956382751465, + "learning_rate": 3.424100007239956e-05, + "loss": 0.4938, + "step": 506800 + }, + { + "epoch": 37.96330962186447, + "grad_norm": 12.166071891784668, + "learning_rate": 3.423007018285853e-05, + "loss": 0.5294, + "step": 507000 + }, + { + "epoch": 37.96330962186447, + "eval_loss": 1.4884788990020752, + "eval_runtime": 1175.8977, + "eval_samples_per_second": 8.423, + "eval_steps_per_second": 0.422, + "step": 507000 + }, + { + "epoch": 37.97828528640959, + "grad_norm": 8.446301460266113, + "learning_rate": 3.4219138250261844e-05, + "loss": 0.4972, + "step": 507200 + }, + { + "epoch": 37.9932609509547, + "grad_norm": 11.271819114685059, + "learning_rate": 3.4208204277029254e-05, + "loss": 0.4955, + "step": 507400 + }, + { + "epoch": 38.00823661549981, + "grad_norm": 3.9000062942504883, + "learning_rate": 3.419726826558097e-05, + "loss": 0.5022, + "step": 507600 + }, + { + "epoch": 38.02321228004493, + "grad_norm": 5.071476936340332, + "learning_rate": 3.418633021833766e-05, + "loss": 0.5001, + "step": 507800 + }, + { + "epoch": 38.03818794459004, + "grad_norm": 6.885400295257568, + "learning_rate": 3.4175390137720426e-05, + "loss": 0.5128, + "step": 508000 + }, + { + "epoch": 38.03818794459004, + "eval_loss": 1.5250493288040161, + "eval_runtime": 1200.985, + "eval_samples_per_second": 8.247, + "eval_steps_per_second": 0.413, + "step": 508000 + }, + { + "epoch": 38.053163609135154, + "grad_norm": 10.252553939819336, + "learning_rate": 3.4164448026150833e-05, + "loss": 
0.5061, + "step": 508200 + }, + { + "epoch": 38.068139273680266, + "grad_norm": 13.395514488220215, + "learning_rate": 3.41535038860509e-05, + "loss": 0.4998, + "step": 508400 + }, + { + "epoch": 38.083114938225386, + "grad_norm": 8.786001205444336, + "learning_rate": 3.414255771984308e-05, + "loss": 0.4754, + "step": 508600 + }, + { + "epoch": 38.0980906027705, + "grad_norm": 4.015108108520508, + "learning_rate": 3.413160952995029e-05, + "loss": 0.5136, + "step": 508800 + }, + { + "epoch": 38.11306626731561, + "grad_norm": 12.217897415161133, + "learning_rate": 3.412065931879589e-05, + "loss": 0.5201, + "step": 509000 + }, + { + "epoch": 38.11306626731561, + "eval_loss": 1.5008865594863892, + "eval_runtime": 1179.3365, + "eval_samples_per_second": 8.398, + "eval_steps_per_second": 0.421, + "step": 509000 + }, + { + "epoch": 38.12804193186073, + "grad_norm": 6.388108730316162, + "learning_rate": 3.4109707088803675e-05, + "loss": 0.4798, + "step": 509200 + }, + { + "epoch": 38.14301759640584, + "grad_norm": 11.298495292663574, + "learning_rate": 3.40987528423979e-05, + "loss": 0.5075, + "step": 509400 + }, + { + "epoch": 38.15799326095095, + "grad_norm": 10.524277687072754, + "learning_rate": 3.4087796582003275e-05, + "loss": 0.5069, + "step": 509600 + }, + { + "epoch": 38.17296892549607, + "grad_norm": 4.701446533203125, + "learning_rate": 3.407683831004492e-05, + "loss": 0.5348, + "step": 509800 + }, + { + "epoch": 38.187944590041184, + "grad_norm": 6.099429130554199, + "learning_rate": 3.4065878028948444e-05, + "loss": 0.5197, + "step": 510000 + }, + { + "epoch": 38.187944590041184, + "eval_loss": 1.4903627634048462, + "eval_runtime": 1176.9696, + "eval_samples_per_second": 8.415, + "eval_steps_per_second": 0.421, + "step": 510000 + }, + { + "epoch": 38.202920254586296, + "grad_norm": 2.2168385982513428, + "learning_rate": 3.405491574113988e-05, + "loss": 0.5274, + "step": 510200 + }, + { + "epoch": 38.217895919131415, + "grad_norm": 5.571523666381836, + "learning_rate": 3.4043951449045695e-05, + "loss": 0.508, + "step": 510400 + }, + { + "epoch": 38.23287158367653, + "grad_norm": 6.537966728210449, + "learning_rate": 3.403298515509283e-05, + "loss": 0.4882, + "step": 510600 + }, + { + "epoch": 38.24784724822164, + "grad_norm": 28.721155166625977, + "learning_rate": 3.4022016861708624e-05, + "loss": 0.5249, + "step": 510800 + }, + { + "epoch": 38.26282291276675, + "grad_norm": 42.41488265991211, + "learning_rate": 3.401104657132091e-05, + "loss": 0.5189, + "step": 511000 + }, + { + "epoch": 38.26282291276675, + "eval_loss": 1.4894089698791504, + "eval_runtime": 1177.1322, + "eval_samples_per_second": 8.414, + "eval_steps_per_second": 0.421, + "step": 511000 + }, + { + "epoch": 38.27779857731187, + "grad_norm": 4.228972434997559, + "learning_rate": 3.4000074286357915e-05, + "loss": 0.5163, + "step": 511200 + }, + { + "epoch": 38.29277424185698, + "grad_norm": 15.363574028015137, + "learning_rate": 3.398910000924834e-05, + "loss": 0.5183, + "step": 511400 + }, + { + "epoch": 38.307749906402094, + "grad_norm": 9.935384750366211, + "learning_rate": 3.3978123742421324e-05, + "loss": 0.5591, + "step": 511600 + }, + { + "epoch": 38.32272557094721, + "grad_norm": 4.404662609100342, + "learning_rate": 3.396714548830643e-05, + "loss": 0.5022, + "step": 511800 + }, + { + "epoch": 38.337701235492325, + "grad_norm": 11.139054298400879, + "learning_rate": 3.395616524933368e-05, + "loss": 0.5119, + "step": 512000 + }, + { + "epoch": 38.337701235492325, + "eval_loss": 1.499504804611206, + 
"eval_runtime": 1176.9511, + "eval_samples_per_second": 8.415, + "eval_steps_per_second": 0.421, + "step": 512000 + }, + { + "epoch": 38.35267690003744, + "grad_norm": 6.452362537384033, + "learning_rate": 3.39451830279335e-05, + "loss": 0.5166, + "step": 512200 + }, + { + "epoch": 38.367652564582556, + "grad_norm": 18.25565528869629, + "learning_rate": 3.3934198826536816e-05, + "loss": 0.5368, + "step": 512400 + }, + { + "epoch": 38.38262822912767, + "grad_norm": 23.89322280883789, + "learning_rate": 3.3923212647574944e-05, + "loss": 0.5608, + "step": 512600 + }, + { + "epoch": 38.39760389367278, + "grad_norm": 31.064790725708008, + "learning_rate": 3.3912224493479636e-05, + "loss": 0.5323, + "step": 512800 + }, + { + "epoch": 38.4125795582179, + "grad_norm": 5.670620918273926, + "learning_rate": 3.390123436668312e-05, + "loss": 0.5253, + "step": 513000 + }, + { + "epoch": 38.4125795582179, + "eval_loss": 1.4939533472061157, + "eval_runtime": 1175.7739, + "eval_samples_per_second": 8.423, + "eval_steps_per_second": 0.422, + "step": 513000 + }, + { + "epoch": 38.42755522276301, + "grad_norm": 13.84334945678711, + "learning_rate": 3.389024226961801e-05, + "loss": 0.5102, + "step": 513200 + }, + { + "epoch": 38.44253088730812, + "grad_norm": 6.627730369567871, + "learning_rate": 3.38792482047174e-05, + "loss": 0.5247, + "step": 513400 + }, + { + "epoch": 38.457506551853236, + "grad_norm": 6.677562236785889, + "learning_rate": 3.3868252174414795e-05, + "loss": 0.5242, + "step": 513600 + }, + { + "epoch": 38.472482216398355, + "grad_norm": 12.594396591186523, + "learning_rate": 3.385725418114414e-05, + "loss": 0.5081, + "step": 513800 + }, + { + "epoch": 38.48745788094347, + "grad_norm": 11.885519981384277, + "learning_rate": 3.384625422733983e-05, + "loss": 0.5153, + "step": 514000 + }, + { + "epoch": 38.48745788094347, + "eval_loss": 1.4921842813491821, + "eval_runtime": 1175.5948, + "eval_samples_per_second": 8.425, + "eval_steps_per_second": 0.422, + "step": 514000 + }, + { + "epoch": 38.50243354548858, + "grad_norm": 11.020857810974121, + "learning_rate": 3.383525231543667e-05, + "loss": 0.5568, + "step": 514200 + }, + { + "epoch": 38.5174092100337, + "grad_norm": 8.571372985839844, + "learning_rate": 3.38242484478699e-05, + "loss": 0.5276, + "step": 514400 + }, + { + "epoch": 38.53238487457881, + "grad_norm": 9.76460075378418, + "learning_rate": 3.3813242627075227e-05, + "loss": 0.5185, + "step": 514600 + }, + { + "epoch": 38.54736053912392, + "grad_norm": 12.717774391174316, + "learning_rate": 3.380223485548874e-05, + "loss": 0.5122, + "step": 514800 + }, + { + "epoch": 38.56233620366904, + "grad_norm": 16.786684036254883, + "learning_rate": 3.3791225135547e-05, + "loss": 0.5573, + "step": 515000 + }, + { + "epoch": 38.56233620366904, + "eval_loss": 1.4709687232971191, + "eval_runtime": 1214.3398, + "eval_samples_per_second": 8.156, + "eval_steps_per_second": 0.408, + "step": 515000 + }, + { + "epoch": 38.57731186821415, + "grad_norm": 9.267097473144531, + "learning_rate": 3.3780213469686974e-05, + "loss": 0.532, + "step": 515200 + }, + { + "epoch": 38.592287532759265, + "grad_norm": 10.737237930297852, + "learning_rate": 3.376919986034608e-05, + "loss": 0.527, + "step": 515400 + }, + { + "epoch": 38.60726319730438, + "grad_norm": 5.553459167480469, + "learning_rate": 3.375818430996214e-05, + "loss": 0.5508, + "step": 515600 + }, + { + "epoch": 38.622238861849496, + "grad_norm": 14.4932279586792, + "learning_rate": 3.374716682097345e-05, + "loss": 0.4968, + "step": 515800 + }, + { + 
"epoch": 38.63721452639461, + "grad_norm": 14.62238597869873, + "learning_rate": 3.373614739581868e-05, + "loss": 0.5395, + "step": 516000 + }, + { + "epoch": 38.63721452639461, + "eval_loss": 1.4602041244506836, + "eval_runtime": 1211.7704, + "eval_samples_per_second": 8.173, + "eval_steps_per_second": 0.409, + "step": 516000 + }, + { + "epoch": 38.65219019093972, + "grad_norm": 6.761282920837402, + "learning_rate": 3.372512603693697e-05, + "loss": 0.5142, + "step": 516200 + }, + { + "epoch": 38.66716585548484, + "grad_norm": 9.413261413574219, + "learning_rate": 3.371410274676787e-05, + "loss": 0.5155, + "step": 516400 + }, + { + "epoch": 38.68214152002995, + "grad_norm": 11.109018325805664, + "learning_rate": 3.3703077527751355e-05, + "loss": 0.559, + "step": 516600 + }, + { + "epoch": 38.69711718457506, + "grad_norm": 8.78099536895752, + "learning_rate": 3.369205038232784e-05, + "loss": 0.5409, + "step": 516800 + }, + { + "epoch": 38.71209284912018, + "grad_norm": 25.103275299072266, + "learning_rate": 3.3681021312938154e-05, + "loss": 0.5416, + "step": 517000 + }, + { + "epoch": 38.71209284912018, + "eval_loss": 1.4606013298034668, + "eval_runtime": 1208.7353, + "eval_samples_per_second": 8.194, + "eval_steps_per_second": 0.41, + "step": 517000 + }, + { + "epoch": 38.727068513665294, + "grad_norm": 9.382966995239258, + "learning_rate": 3.366999032202356e-05, + "loss": 0.5401, + "step": 517200 + }, + { + "epoch": 38.742044178210406, + "grad_norm": 17.933349609375, + "learning_rate": 3.3658957412025746e-05, + "loss": 0.533, + "step": 517400 + }, + { + "epoch": 38.757019842755525, + "grad_norm": 9.837800979614258, + "learning_rate": 3.364792258538683e-05, + "loss": 0.5357, + "step": 517600 + }, + { + "epoch": 38.77199550730064, + "grad_norm": 15.193432807922363, + "learning_rate": 3.363688584454932e-05, + "loss": 0.5173, + "step": 517800 + }, + { + "epoch": 38.78697117184575, + "grad_norm": 8.285181045532227, + "learning_rate": 3.36258471919562e-05, + "loss": 0.5606, + "step": 518000 + }, + { + "epoch": 38.78697117184575, + "eval_loss": 1.470923662185669, + "eval_runtime": 1194.9156, + "eval_samples_per_second": 8.288, + "eval_steps_per_second": 0.415, + "step": 518000 + }, + { + "epoch": 38.80194683639086, + "grad_norm": 5.0072784423828125, + "learning_rate": 3.361480663005084e-05, + "loss": 0.5307, + "step": 518200 + }, + { + "epoch": 38.81692250093598, + "grad_norm": 3.3750789165496826, + "learning_rate": 3.3603764161277045e-05, + "loss": 0.5357, + "step": 518400 + }, + { + "epoch": 38.83189816548109, + "grad_norm": 6.130032062530518, + "learning_rate": 3.359271978807904e-05, + "loss": 0.5223, + "step": 518600 + }, + { + "epoch": 38.846873830026205, + "grad_norm": 8.677162170410156, + "learning_rate": 3.358167351290148e-05, + "loss": 0.5397, + "step": 518800 + }, + { + "epoch": 38.861849494571324, + "grad_norm": 11.169751167297363, + "learning_rate": 3.357062533818942e-05, + "loss": 0.5717, + "step": 519000 + }, + { + "epoch": 38.861849494571324, + "eval_loss": 1.4725736379623413, + "eval_runtime": 1194.2386, + "eval_samples_per_second": 8.293, + "eval_steps_per_second": 0.415, + "step": 519000 + }, + { + "epoch": 38.876825159116436, + "grad_norm": 11.185416221618652, + "learning_rate": 3.3559575266388373e-05, + "loss": 0.5311, + "step": 519200 + }, + { + "epoch": 38.89180082366155, + "grad_norm": 19.092422485351562, + "learning_rate": 3.3548523299944215e-05, + "loss": 0.5407, + "step": 519400 + }, + { + "epoch": 38.90677648820667, + "grad_norm": 12.053357124328613, + "learning_rate": 
3.353746944130331e-05, + "loss": 0.5422, + "step": 519600 + }, + { + "epoch": 38.92175215275178, + "grad_norm": 20.237478256225586, + "learning_rate": 3.3526413692912375e-05, + "loss": 0.5055, + "step": 519800 + }, + { + "epoch": 38.93672781729689, + "grad_norm": 5.862433910369873, + "learning_rate": 3.351535605721859e-05, + "loss": 0.5506, + "step": 520000 + }, + { + "epoch": 38.93672781729689, + "eval_loss": 1.456874966621399, + "eval_runtime": 1194.9365, + "eval_samples_per_second": 8.288, + "eval_steps_per_second": 0.415, + "step": 520000 + }, + { + "epoch": 38.95170348184201, + "grad_norm": 4.471254825592041, + "learning_rate": 3.350429653666953e-05, + "loss": 0.551, + "step": 520200 + }, + { + "epoch": 38.96667914638712, + "grad_norm": 12.976078987121582, + "learning_rate": 3.349323513371321e-05, + "loss": 0.5434, + "step": 520400 + }, + { + "epoch": 38.981654810932234, + "grad_norm": 3.8949899673461914, + "learning_rate": 3.348217185079804e-05, + "loss": 0.5403, + "step": 520600 + }, + { + "epoch": 38.996630475477346, + "grad_norm": 10.096271514892578, + "learning_rate": 3.3471106690372844e-05, + "loss": 0.5327, + "step": 520800 + }, + { + "epoch": 39.011606140022465, + "grad_norm": 15.142107009887695, + "learning_rate": 3.3460039654886886e-05, + "loss": 0.5073, + "step": 521000 + }, + { + "epoch": 39.011606140022465, + "eval_loss": 1.4990595579147339, + "eval_runtime": 1194.3065, + "eval_samples_per_second": 8.293, + "eval_steps_per_second": 0.415, + "step": 521000 + }, + { + "epoch": 39.02658180456758, + "grad_norm": 14.253155708312988, + "learning_rate": 3.344897074678981e-05, + "loss": 0.4797, + "step": 521200 + }, + { + "epoch": 39.04155746911269, + "grad_norm": 9.489974975585938, + "learning_rate": 3.343789996853173e-05, + "loss": 0.4967, + "step": 521400 + }, + { + "epoch": 39.05653313365781, + "grad_norm": 5.21506404876709, + "learning_rate": 3.342682732256309e-05, + "loss": 0.4869, + "step": 521600 + }, + { + "epoch": 39.07150879820292, + "grad_norm": 8.484586715698242, + "learning_rate": 3.341575281133483e-05, + "loss": 0.467, + "step": 521800 + }, + { + "epoch": 39.08648446274803, + "grad_norm": 5.683114528656006, + "learning_rate": 3.3404676437298265e-05, + "loss": 0.4985, + "step": 522000 + }, + { + "epoch": 39.08648446274803, + "eval_loss": 1.4869056940078735, + "eval_runtime": 1217.0785, + "eval_samples_per_second": 8.138, + "eval_steps_per_second": 0.408, + "step": 522000 + }, + { + "epoch": 39.10146012729315, + "grad_norm": 6.743536949157715, + "learning_rate": 3.3393598202905116e-05, + "loss": 0.4841, + "step": 522200 + } + ], + "logging_steps": 200, + "max_steps": 1335500, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.688898869047296e+18, + "train_batch_size": 10, + "trial_name": null, + "trial_params": null +}
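
The state above is easiest to inspect programmatically rather than by eye. Below is a minimal sketch (not part of the checkpoint itself) that loads this trainer_state.json, reports the lowest recorded eval_loss, and flags the large epoch jumps visible in the log (for example around steps 496200 to 496400 and 505800 to 506000), which suggest the state was resumed under a different schedule. It assumes only the fields shown in the diff ("step", "epoch", "loss", "eval_loss") and the path from the diff header; adjust the path for your own layout.

#!/usr/bin/env python3
# Minimal sketch: summarize the log_history in this trainer_state.json.
# Uses only fields visible in the diff above; the path is the one from the
# diff header and is an assumption about your checkpoint layout.
import json

STATE_PATH = "last-checkpoint/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

history = state["log_history"]

# Evaluation entries carry "eval_loss"; plain training entries carry "loss".
evals = [e for e in history if "eval_loss" in e]
best = min(evals, key=lambda e: e["eval_loss"])
print(f"evaluations logged : {len(evals)}")
print(f"lowest eval_loss   : {best['eval_loss']:.4f} at step {best['step']} "
      f"(epoch {best['epoch']:.2f})")

# Heuristic resume check: training entries are logged every 200 steps, so an
# epoch jump much larger than one logging interval between consecutive entries
# indicates the run was resumed with a different epoch/step mapping.
train = [e for e in history if "loss" in e and "eval_loss" not in e]
for prev, cur in zip(train, train[1:]):
    if cur["epoch"] - prev["epoch"] > 1.0:
        print(f"epoch jump {prev['epoch']:.2f} -> {cur['epoch']:.2f} "
              f"between steps {prev['step']} and {cur['step']}")

Running this against the checkpoint directory prints one line per evaluation summary plus one line per detected discontinuity; the 1.0-epoch threshold is arbitrary and can be tightened if the logging interval changes.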