diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.24999189937074312, + "epoch": 0.9999675974829725, "eval_steps": 2411, - "global_step": 2411, + "global_step": 9644, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -16900,6 +16900,50661 @@ "eval_samples_per_second": 339.196, "eval_steps_per_second": 15.935, "step": 2411 + }, + { + "epoch": 0.2500955874252312, + "grad_norm": 0.578125, + "learning_rate": 0.00019661550413017746, + "loss": 4.5607, + "step": 2412 + }, + { + "epoch": 0.25019927547971926, + "grad_norm": 0.55078125, + "learning_rate": 0.00019661270150023348, + "loss": 4.5456, + "step": 2413 + }, + { + "epoch": 0.25030296353420733, + "grad_norm": 0.6171875, + "learning_rate": 0.00019660989773036174, + "loss": 4.5299, + "step": 2414 + }, + { + "epoch": 0.2504066515886954, + "grad_norm": 0.5234375, + "learning_rate": 0.00019660709282059532, + "loss": 4.4934, + "step": 2415 + }, + { + "epoch": 0.2505103396431835, + "grad_norm": 0.65234375, + "learning_rate": 0.00019660428677096734, + "loss": 4.5475, + "step": 2416 + }, + { + "epoch": 0.25061402769767155, + "grad_norm": 0.494140625, + "learning_rate": 0.0001966014795815109, + "loss": 4.5659, + "step": 2417 + }, + { + "epoch": 0.2507177157521596, + "grad_norm": 0.609375, + "learning_rate": 0.0001965986712522591, + "loss": 4.5103, + "step": 2418 + }, + { + "epoch": 0.2508214038066477, + "grad_norm": 0.625, + "learning_rate": 0.00019659586178324506, + "loss": 4.5531, + "step": 2419 + }, + { + "epoch": 0.25092509186113576, + "grad_norm": 0.515625, + "learning_rate": 0.000196593051174502, + "loss": 4.5444, + "step": 2420 + }, + { + "epoch": 0.25102877991562383, + "grad_norm": 0.57421875, + "learning_rate": 0.00019659023942606303, + "loss": 4.554, + "step": 2421 + }, + { + "epoch": 0.2511324679701119, + "grad_norm": 0.53515625, + "learning_rate": 0.00019658742653796133, + "loss": 4.5388, + "step": 2422 + }, + { + "epoch": 0.2512361560246, + "grad_norm": 0.6015625, + "learning_rate": 0.00019658461251023012, + "loss": 4.554, + "step": 2423 + }, + { + "epoch": 0.25133984407908805, + "grad_norm": 0.62109375, + "learning_rate": 0.00019658179734290254, + "loss": 4.5397, + "step": 2424 + }, + { + "epoch": 0.2514435321335761, + "grad_norm": 0.55859375, + "learning_rate": 0.00019657898103601185, + "loss": 4.5455, + "step": 2425 + }, + { + "epoch": 0.2515472201880642, + "grad_norm": 0.6953125, + "learning_rate": 0.00019657616358959126, + "loss": 4.5079, + "step": 2426 + }, + { + "epoch": 0.25165090824255226, + "grad_norm": 0.62109375, + "learning_rate": 0.00019657334500367406, + "loss": 4.5894, + "step": 2427 + }, + { + "epoch": 0.25175459629704033, + "grad_norm": 0.5625, + "learning_rate": 0.00019657052527829346, + "loss": 4.4915, + "step": 2428 + }, + { + "epoch": 0.2518582843515284, + "grad_norm": 0.546875, + "learning_rate": 0.00019656770441348273, + "loss": 4.5516, + "step": 2429 + }, + { + "epoch": 0.2519619724060165, + "grad_norm": 0.6171875, + "learning_rate": 0.00019656488240927516, + "loss": 4.5341, + "step": 2430 + }, + { + "epoch": 0.25206566046050455, + "grad_norm": 0.6015625, + "learning_rate": 0.00019656205926570406, + "loss": 4.5455, + "step": 2431 + }, + { + "epoch": 0.2521693485149926, + "grad_norm": 0.5390625, + "learning_rate": 0.00019655923498280273, + "loss": 4.5265, + "step": 2432 + }, + { + "epoch": 0.2522730365694807, + "grad_norm": 0.49609375, + "learning_rate": 0.0001965564095606045, + "loss": 4.5306, + "step": 2433 + }, + { + "epoch": 0.25237672462396876, + "grad_norm": 0.5546875, + "learning_rate": 0.0001965535829991427, + "loss": 4.5759, + "step": 2434 + }, + { + "epoch": 0.2524804126784569, + "grad_norm": 0.5078125, + "learning_rate": 0.0001965507552984507, + "loss": 4.5264, + "step": 2435 + }, + { + "epoch": 0.25258410073294496, + "grad_norm": 0.5859375, + "learning_rate": 0.0001965479264585618, + "loss": 4.5116, + "step": 2436 + }, + { + "epoch": 0.25268778878743303, + "grad_norm": 0.494140625, + "learning_rate": 0.00019654509647950945, + "loss": 4.4965, + "step": 2437 + }, + { + "epoch": 0.2527914768419211, + "grad_norm": 0.5625, + "learning_rate": 0.000196542265361327, + "loss": 4.5255, + "step": 2438 + }, + { + "epoch": 0.2528951648964092, + "grad_norm": 0.55078125, + "learning_rate": 0.0001965394331040479, + "loss": 4.5525, + "step": 2439 + }, + { + "epoch": 0.25299885295089725, + "grad_norm": 0.546875, + "learning_rate": 0.0001965365997077055, + "loss": 4.5402, + "step": 2440 + }, + { + "epoch": 0.2531025410053853, + "grad_norm": 0.51171875, + "learning_rate": 0.00019653376517233327, + "loss": 4.5721, + "step": 2441 + }, + { + "epoch": 0.2532062290598734, + "grad_norm": 0.5234375, + "learning_rate": 0.00019653092949796467, + "loss": 4.538, + "step": 2442 + }, + { + "epoch": 0.25330991711436146, + "grad_norm": 0.50390625, + "learning_rate": 0.00019652809268463315, + "loss": 4.5701, + "step": 2443 + }, + { + "epoch": 0.25341360516884953, + "grad_norm": 0.51953125, + "learning_rate": 0.00019652525473237215, + "loss": 4.5366, + "step": 2444 + }, + { + "epoch": 0.2535172932233376, + "grad_norm": 0.486328125, + "learning_rate": 0.00019652241564121518, + "loss": 4.5249, + "step": 2445 + }, + { + "epoch": 0.2536209812778257, + "grad_norm": 0.5625, + "learning_rate": 0.00019651957541119575, + "loss": 4.5277, + "step": 2446 + }, + { + "epoch": 0.25372466933231375, + "grad_norm": 0.5390625, + "learning_rate": 0.00019651673404234732, + "loss": 4.5749, + "step": 2447 + }, + { + "epoch": 0.2538283573868018, + "grad_norm": 0.56640625, + "learning_rate": 0.00019651389153470348, + "loss": 4.5396, + "step": 2448 + }, + { + "epoch": 0.2539320454412899, + "grad_norm": 0.5078125, + "learning_rate": 0.00019651104788829775, + "loss": 4.5754, + "step": 2449 + }, + { + "epoch": 0.25403573349577796, + "grad_norm": 0.546875, + "learning_rate": 0.00019650820310316362, + "loss": 4.5094, + "step": 2450 + }, + { + "epoch": 0.25413942155026603, + "grad_norm": 0.46875, + "learning_rate": 0.00019650535717933475, + "loss": 4.5006, + "step": 2451 + }, + { + "epoch": 0.2542431096047541, + "grad_norm": 0.5390625, + "learning_rate": 0.0001965025101168447, + "loss": 4.5539, + "step": 2452 + }, + { + "epoch": 0.2543467976592422, + "grad_norm": 0.46875, + "learning_rate": 0.00019649966191572697, + "loss": 4.4919, + "step": 2453 + }, + { + "epoch": 0.25445048571373025, + "grad_norm": 0.58203125, + "learning_rate": 0.0001964968125760153, + "loss": 4.5391, + "step": 2454 + }, + { + "epoch": 0.2545541737682183, + "grad_norm": 0.58203125, + "learning_rate": 0.0001964939620977432, + "loss": 4.5305, + "step": 2455 + }, + { + "epoch": 0.2546578618227064, + "grad_norm": 0.5703125, + "learning_rate": 0.00019649111048094435, + "loss": 4.5178, + "step": 2456 + }, + { + "epoch": 0.25476154987719446, + "grad_norm": 0.6171875, + "learning_rate": 0.00019648825772565238, + "loss": 4.545, + "step": 2457 + }, + { + "epoch": 0.25486523793168253, + "grad_norm": 0.6015625, + "learning_rate": 0.00019648540383190098, + "loss": 4.5473, + "step": 2458 + }, + { + "epoch": 0.2549689259861706, + "grad_norm": 0.58203125, + "learning_rate": 0.0001964825487997238, + "loss": 4.5182, + "step": 2459 + }, + { + "epoch": 0.2550726140406587, + "grad_norm": 0.59765625, + "learning_rate": 0.00019647969262915455, + "loss": 4.517, + "step": 2460 + }, + { + "epoch": 0.25517630209514675, + "grad_norm": 0.6640625, + "learning_rate": 0.00019647683532022692, + "loss": 4.5324, + "step": 2461 + }, + { + "epoch": 0.2552799901496348, + "grad_norm": 0.57421875, + "learning_rate": 0.00019647397687297456, + "loss": 4.5349, + "step": 2462 + }, + { + "epoch": 0.2553836782041229, + "grad_norm": 0.66015625, + "learning_rate": 0.0001964711172874313, + "loss": 4.5047, + "step": 2463 + }, + { + "epoch": 0.25548736625861096, + "grad_norm": 0.5703125, + "learning_rate": 0.0001964682565636308, + "loss": 4.5019, + "step": 2464 + }, + { + "epoch": 0.25559105431309903, + "grad_norm": 0.59375, + "learning_rate": 0.00019646539470160684, + "loss": 4.5242, + "step": 2465 + }, + { + "epoch": 0.2556947423675871, + "grad_norm": 0.58203125, + "learning_rate": 0.0001964625317013932, + "loss": 4.5476, + "step": 2466 + }, + { + "epoch": 0.2557984304220752, + "grad_norm": 0.5390625, + "learning_rate": 0.00019645966756302367, + "loss": 4.5218, + "step": 2467 + }, + { + "epoch": 0.25590211847656325, + "grad_norm": 0.53125, + "learning_rate": 0.00019645680228653204, + "loss": 4.5236, + "step": 2468 + }, + { + "epoch": 0.2560058065310513, + "grad_norm": 0.60546875, + "learning_rate": 0.00019645393587195204, + "loss": 4.5162, + "step": 2469 + }, + { + "epoch": 0.2561094945855394, + "grad_norm": 0.55078125, + "learning_rate": 0.0001964510683193176, + "loss": 4.5434, + "step": 2470 + }, + { + "epoch": 0.25621318264002746, + "grad_norm": 0.578125, + "learning_rate": 0.0001964481996286625, + "loss": 4.4977, + "step": 2471 + }, + { + "epoch": 0.25631687069451553, + "grad_norm": 0.57421875, + "learning_rate": 0.00019644532980002058, + "loss": 4.5071, + "step": 2472 + }, + { + "epoch": 0.2564205587490036, + "grad_norm": 0.5859375, + "learning_rate": 0.00019644245883342572, + "loss": 4.4715, + "step": 2473 + }, + { + "epoch": 0.2565242468034917, + "grad_norm": 0.5703125, + "learning_rate": 0.00019643958672891181, + "loss": 4.5657, + "step": 2474 + }, + { + "epoch": 0.25662793485797974, + "grad_norm": 0.58984375, + "learning_rate": 0.00019643671348651268, + "loss": 4.518, + "step": 2475 + }, + { + "epoch": 0.2567316229124678, + "grad_norm": 0.5703125, + "learning_rate": 0.0001964338391062623, + "loss": 4.5594, + "step": 2476 + }, + { + "epoch": 0.2568353109669559, + "grad_norm": 0.546875, + "learning_rate": 0.00019643096358819455, + "loss": 4.5379, + "step": 2477 + }, + { + "epoch": 0.25693899902144396, + "grad_norm": 0.6171875, + "learning_rate": 0.00019642808693234333, + "loss": 4.4973, + "step": 2478 + }, + { + "epoch": 0.2570426870759321, + "grad_norm": 0.5625, + "learning_rate": 0.00019642520913874264, + "loss": 4.5271, + "step": 2479 + }, + { + "epoch": 0.25714637513042016, + "grad_norm": 0.68359375, + "learning_rate": 0.0001964223302074264, + "loss": 4.4797, + "step": 2480 + }, + { + "epoch": 0.25725006318490823, + "grad_norm": 0.5703125, + "learning_rate": 0.00019641945013842862, + "loss": 4.5094, + "step": 2481 + }, + { + "epoch": 0.2573537512393963, + "grad_norm": 0.65234375, + "learning_rate": 0.0001964165689317832, + "loss": 4.5076, + "step": 2482 + }, + { + "epoch": 0.25745743929388437, + "grad_norm": 0.64453125, + "learning_rate": 0.0001964136865875242, + "loss": 4.5485, + "step": 2483 + }, + { + "epoch": 0.25756112734837244, + "grad_norm": 0.58203125, + "learning_rate": 0.00019641080310568563, + "loss": 4.55, + "step": 2484 + }, + { + "epoch": 0.2576648154028605, + "grad_norm": 0.61328125, + "learning_rate": 0.00019640791848630148, + "loss": 4.4889, + "step": 2485 + }, + { + "epoch": 0.2577685034573486, + "grad_norm": 0.64453125, + "learning_rate": 0.0001964050327294058, + "loss": 4.5108, + "step": 2486 + }, + { + "epoch": 0.25787219151183666, + "grad_norm": 0.58984375, + "learning_rate": 0.00019640214583503264, + "loss": 4.5616, + "step": 2487 + }, + { + "epoch": 0.25797587956632473, + "grad_norm": 0.609375, + "learning_rate": 0.00019639925780321607, + "loss": 4.5208, + "step": 2488 + }, + { + "epoch": 0.2580795676208128, + "grad_norm": 0.640625, + "learning_rate": 0.0001963963686339901, + "loss": 4.5417, + "step": 2489 + }, + { + "epoch": 0.25818325567530087, + "grad_norm": 0.56640625, + "learning_rate": 0.00019639347832738896, + "loss": 4.5336, + "step": 2490 + }, + { + "epoch": 0.25828694372978894, + "grad_norm": 0.65625, + "learning_rate": 0.00019639058688344663, + "loss": 4.5381, + "step": 2491 + }, + { + "epoch": 0.258390631784277, + "grad_norm": 0.63671875, + "learning_rate": 0.00019638769430219727, + "loss": 4.5333, + "step": 2492 + }, + { + "epoch": 0.2584943198387651, + "grad_norm": 0.59765625, + "learning_rate": 0.00019638480058367498, + "loss": 4.5055, + "step": 2493 + }, + { + "epoch": 0.25859800789325316, + "grad_norm": 0.5546875, + "learning_rate": 0.00019638190572791392, + "loss": 4.5351, + "step": 2494 + }, + { + "epoch": 0.25870169594774123, + "grad_norm": 0.6484375, + "learning_rate": 0.00019637900973494828, + "loss": 4.5316, + "step": 2495 + }, + { + "epoch": 0.2588053840022293, + "grad_norm": 0.625, + "learning_rate": 0.00019637611260481223, + "loss": 4.545, + "step": 2496 + }, + { + "epoch": 0.25890907205671737, + "grad_norm": 0.63671875, + "learning_rate": 0.00019637321433753986, + "loss": 4.5428, + "step": 2497 + }, + { + "epoch": 0.25901276011120544, + "grad_norm": 0.69921875, + "learning_rate": 0.00019637031493316548, + "loss": 4.5547, + "step": 2498 + }, + { + "epoch": 0.2591164481656935, + "grad_norm": 0.625, + "learning_rate": 0.00019636741439172322, + "loss": 4.5579, + "step": 2499 + }, + { + "epoch": 0.2592201362201816, + "grad_norm": 0.58984375, + "learning_rate": 0.00019636451271324735, + "loss": 4.5162, + "step": 2500 + }, + { + "epoch": 0.25932382427466966, + "grad_norm": 0.73828125, + "learning_rate": 0.00019636160989777208, + "loss": 4.5121, + "step": 2501 + }, + { + "epoch": 0.25942751232915773, + "grad_norm": 0.63671875, + "learning_rate": 0.0001963587059453317, + "loss": 4.5258, + "step": 2502 + }, + { + "epoch": 0.2595312003836458, + "grad_norm": 0.63671875, + "learning_rate": 0.0001963558008559604, + "loss": 4.5086, + "step": 2503 + }, + { + "epoch": 0.25963488843813387, + "grad_norm": 0.609375, + "learning_rate": 0.00019635289462969255, + "loss": 4.5275, + "step": 2504 + }, + { + "epoch": 0.25973857649262194, + "grad_norm": 0.5703125, + "learning_rate": 0.00019634998726656238, + "loss": 4.5625, + "step": 2505 + }, + { + "epoch": 0.25984226454711, + "grad_norm": 0.64453125, + "learning_rate": 0.0001963470787666042, + "loss": 4.5497, + "step": 2506 + }, + { + "epoch": 0.2599459526015981, + "grad_norm": 0.578125, + "learning_rate": 0.00019634416912985234, + "loss": 4.4894, + "step": 2507 + }, + { + "epoch": 0.26004964065608616, + "grad_norm": 0.65234375, + "learning_rate": 0.0001963412583563411, + "loss": 4.4685, + "step": 2508 + }, + { + "epoch": 0.2601533287105742, + "grad_norm": 0.5625, + "learning_rate": 0.00019633834644610488, + "loss": 4.5355, + "step": 2509 + }, + { + "epoch": 0.2602570167650623, + "grad_norm": 0.6171875, + "learning_rate": 0.000196335433399178, + "loss": 4.5377, + "step": 2510 + }, + { + "epoch": 0.26036070481955037, + "grad_norm": 0.53515625, + "learning_rate": 0.00019633251921559482, + "loss": 4.5568, + "step": 2511 + }, + { + "epoch": 0.26046439287403844, + "grad_norm": 0.62109375, + "learning_rate": 0.00019632960389538975, + "loss": 4.5713, + "step": 2512 + }, + { + "epoch": 0.2605680809285265, + "grad_norm": 0.625, + "learning_rate": 0.00019632668743859718, + "loss": 4.5268, + "step": 2513 + }, + { + "epoch": 0.2606717689830146, + "grad_norm": 0.62890625, + "learning_rate": 0.00019632376984525155, + "loss": 4.5509, + "step": 2514 + }, + { + "epoch": 0.26077545703750266, + "grad_norm": 0.625, + "learning_rate": 0.0001963208511153872, + "loss": 4.5079, + "step": 2515 + }, + { + "epoch": 0.2608791450919907, + "grad_norm": 0.640625, + "learning_rate": 0.00019631793124903863, + "loss": 4.551, + "step": 2516 + }, + { + "epoch": 0.2609828331464788, + "grad_norm": 0.58984375, + "learning_rate": 0.00019631501024624032, + "loss": 4.515, + "step": 2517 + }, + { + "epoch": 0.26108652120096687, + "grad_norm": 0.61328125, + "learning_rate": 0.00019631208810702667, + "loss": 4.498, + "step": 2518 + }, + { + "epoch": 0.26119020925545494, + "grad_norm": 0.6796875, + "learning_rate": 0.0001963091648314322, + "loss": 4.5372, + "step": 2519 + }, + { + "epoch": 0.261293897309943, + "grad_norm": 0.61328125, + "learning_rate": 0.0001963062404194914, + "loss": 4.513, + "step": 2520 + }, + { + "epoch": 0.2613975853644311, + "grad_norm": 0.69921875, + "learning_rate": 0.00019630331487123872, + "loss": 4.497, + "step": 2521 + }, + { + "epoch": 0.26150127341891916, + "grad_norm": 0.65625, + "learning_rate": 0.00019630038818670874, + "loss": 4.4994, + "step": 2522 + }, + { + "epoch": 0.2616049614734072, + "grad_norm": 0.703125, + "learning_rate": 0.000196297460365936, + "loss": 4.4981, + "step": 2523 + }, + { + "epoch": 0.26170864952789535, + "grad_norm": 0.65625, + "learning_rate": 0.000196294531408955, + "loss": 4.5548, + "step": 2524 + }, + { + "epoch": 0.2618123375823834, + "grad_norm": 0.6640625, + "learning_rate": 0.00019629160131580032, + "loss": 4.5412, + "step": 2525 + }, + { + "epoch": 0.2619160256368715, + "grad_norm": 0.70703125, + "learning_rate": 0.00019628867008650652, + "loss": 4.4908, + "step": 2526 + }, + { + "epoch": 0.26201971369135957, + "grad_norm": 0.6796875, + "learning_rate": 0.00019628573772110822, + "loss": 4.4642, + "step": 2527 + }, + { + "epoch": 0.26212340174584764, + "grad_norm": 0.71875, + "learning_rate": 0.00019628280421963995, + "loss": 4.4949, + "step": 2528 + }, + { + "epoch": 0.2622270898003357, + "grad_norm": 0.6328125, + "learning_rate": 0.0001962798695821364, + "loss": 4.4916, + "step": 2529 + }, + { + "epoch": 0.2623307778548238, + "grad_norm": 0.63671875, + "learning_rate": 0.00019627693380863215, + "loss": 4.5176, + "step": 2530 + }, + { + "epoch": 0.26243446590931185, + "grad_norm": 0.69140625, + "learning_rate": 0.00019627399689916186, + "loss": 4.555, + "step": 2531 + }, + { + "epoch": 0.2625381539637999, + "grad_norm": 0.578125, + "learning_rate": 0.00019627105885376017, + "loss": 4.5436, + "step": 2532 + }, + { + "epoch": 0.262641842018288, + "grad_norm": 0.7734375, + "learning_rate": 0.00019626811967246173, + "loss": 4.509, + "step": 2533 + }, + { + "epoch": 0.26274553007277607, + "grad_norm": 0.640625, + "learning_rate": 0.00019626517935530125, + "loss": 4.5512, + "step": 2534 + }, + { + "epoch": 0.26284921812726414, + "grad_norm": 0.71875, + "learning_rate": 0.0001962622379023134, + "loss": 4.5567, + "step": 2535 + }, + { + "epoch": 0.2629529061817522, + "grad_norm": 0.70703125, + "learning_rate": 0.0001962592953135329, + "loss": 4.5253, + "step": 2536 + }, + { + "epoch": 0.2630565942362403, + "grad_norm": 0.68359375, + "learning_rate": 0.0001962563515889945, + "loss": 4.5262, + "step": 2537 + }, + { + "epoch": 0.26316028229072835, + "grad_norm": 0.6796875, + "learning_rate": 0.00019625340672873285, + "loss": 4.5034, + "step": 2538 + }, + { + "epoch": 0.2632639703452164, + "grad_norm": 0.76953125, + "learning_rate": 0.00019625046073278276, + "loss": 4.5212, + "step": 2539 + }, + { + "epoch": 0.2633676583997045, + "grad_norm": 0.58984375, + "learning_rate": 0.00019624751360117898, + "loss": 4.521, + "step": 2540 + }, + { + "epoch": 0.26347134645419257, + "grad_norm": 0.8359375, + "learning_rate": 0.00019624456533395628, + "loss": 4.5292, + "step": 2541 + }, + { + "epoch": 0.26357503450868064, + "grad_norm": 0.75390625, + "learning_rate": 0.00019624161593114945, + "loss": 4.5043, + "step": 2542 + }, + { + "epoch": 0.2636787225631687, + "grad_norm": 0.6640625, + "learning_rate": 0.00019623866539279327, + "loss": 4.4988, + "step": 2543 + }, + { + "epoch": 0.2637824106176568, + "grad_norm": 0.75, + "learning_rate": 0.00019623571371892257, + "loss": 4.5159, + "step": 2544 + }, + { + "epoch": 0.26388609867214485, + "grad_norm": 0.671875, + "learning_rate": 0.00019623276090957218, + "loss": 4.5345, + "step": 2545 + }, + { + "epoch": 0.2639897867266329, + "grad_norm": 0.671875, + "learning_rate": 0.00019622980696477692, + "loss": 4.5314, + "step": 2546 + }, + { + "epoch": 0.264093474781121, + "grad_norm": 0.765625, + "learning_rate": 0.00019622685188457167, + "loss": 4.5287, + "step": 2547 + }, + { + "epoch": 0.26419716283560907, + "grad_norm": 0.7734375, + "learning_rate": 0.0001962238956689913, + "loss": 4.4609, + "step": 2548 + }, + { + "epoch": 0.26430085089009714, + "grad_norm": 0.6484375, + "learning_rate": 0.00019622093831807064, + "loss": 4.5249, + "step": 2549 + }, + { + "epoch": 0.2644045389445852, + "grad_norm": 0.890625, + "learning_rate": 0.00019621797983184464, + "loss": 4.5609, + "step": 2550 + }, + { + "epoch": 0.2645082269990733, + "grad_norm": 0.83203125, + "learning_rate": 0.0001962150202103482, + "loss": 4.526, + "step": 2551 + }, + { + "epoch": 0.26461191505356135, + "grad_norm": 0.58203125, + "learning_rate": 0.00019621205945361618, + "loss": 4.4839, + "step": 2552 + }, + { + "epoch": 0.2647156031080494, + "grad_norm": 0.8828125, + "learning_rate": 0.00019620909756168356, + "loss": 4.487, + "step": 2553 + }, + { + "epoch": 0.2648192911625375, + "grad_norm": 0.90234375, + "learning_rate": 0.0001962061345345853, + "loss": 4.5395, + "step": 2554 + }, + { + "epoch": 0.26492297921702557, + "grad_norm": 0.765625, + "learning_rate": 0.00019620317037235638, + "loss": 4.5355, + "step": 2555 + }, + { + "epoch": 0.26502666727151364, + "grad_norm": 0.671875, + "learning_rate": 0.00019620020507503174, + "loss": 4.4756, + "step": 2556 + }, + { + "epoch": 0.2651303553260017, + "grad_norm": 0.68359375, + "learning_rate": 0.0001961972386426463, + "loss": 4.5216, + "step": 2557 + }, + { + "epoch": 0.2652340433804898, + "grad_norm": 0.7421875, + "learning_rate": 0.00019619427107523521, + "loss": 4.4505, + "step": 2558 + }, + { + "epoch": 0.26533773143497785, + "grad_norm": 0.640625, + "learning_rate": 0.00019619130237283336, + "loss": 4.5255, + "step": 2559 + }, + { + "epoch": 0.2654414194894659, + "grad_norm": 0.62109375, + "learning_rate": 0.00019618833253547583, + "loss": 4.52, + "step": 2560 + }, + { + "epoch": 0.265545107543954, + "grad_norm": 0.6328125, + "learning_rate": 0.00019618536156319766, + "loss": 4.5316, + "step": 2561 + }, + { + "epoch": 0.26564879559844207, + "grad_norm": 0.6328125, + "learning_rate": 0.00019618238945603387, + "loss": 4.5438, + "step": 2562 + }, + { + "epoch": 0.26575248365293014, + "grad_norm": 0.546875, + "learning_rate": 0.00019617941621401957, + "loss": 4.4668, + "step": 2563 + }, + { + "epoch": 0.2658561717074182, + "grad_norm": 0.6015625, + "learning_rate": 0.00019617644183718983, + "loss": 4.5624, + "step": 2564 + }, + { + "epoch": 0.2659598597619063, + "grad_norm": 0.62109375, + "learning_rate": 0.00019617346632557972, + "loss": 4.4884, + "step": 2565 + }, + { + "epoch": 0.26606354781639435, + "grad_norm": 0.63671875, + "learning_rate": 0.00019617048967922438, + "loss": 4.5099, + "step": 2566 + }, + { + "epoch": 0.2661672358708824, + "grad_norm": 0.5703125, + "learning_rate": 0.00019616751189815892, + "loss": 4.5287, + "step": 2567 + }, + { + "epoch": 0.2662709239253705, + "grad_norm": 0.6015625, + "learning_rate": 0.00019616453298241845, + "loss": 4.5201, + "step": 2568 + }, + { + "epoch": 0.2663746119798586, + "grad_norm": 0.5859375, + "learning_rate": 0.00019616155293203816, + "loss": 4.5195, + "step": 2569 + }, + { + "epoch": 0.2664783000343467, + "grad_norm": 0.6171875, + "learning_rate": 0.0001961585717470532, + "loss": 4.5095, + "step": 2570 + }, + { + "epoch": 0.26658198808883476, + "grad_norm": 0.6484375, + "learning_rate": 0.00019615558942749872, + "loss": 4.4663, + "step": 2571 + }, + { + "epoch": 0.26668567614332284, + "grad_norm": 0.6953125, + "learning_rate": 0.00019615260597340996, + "loss": 4.5458, + "step": 2572 + }, + { + "epoch": 0.2667893641978109, + "grad_norm": 0.6171875, + "learning_rate": 0.00019614962138482205, + "loss": 4.5008, + "step": 2573 + }, + { + "epoch": 0.266893052252299, + "grad_norm": 0.6953125, + "learning_rate": 0.00019614663566177028, + "loss": 4.4962, + "step": 2574 + }, + { + "epoch": 0.26699674030678705, + "grad_norm": 0.703125, + "learning_rate": 0.0001961436488042898, + "loss": 4.5274, + "step": 2575 + }, + { + "epoch": 0.2671004283612751, + "grad_norm": 0.66796875, + "learning_rate": 0.0001961406608124159, + "loss": 4.5169, + "step": 2576 + }, + { + "epoch": 0.2672041164157632, + "grad_norm": 0.6640625, + "learning_rate": 0.00019613767168618384, + "loss": 4.498, + "step": 2577 + }, + { + "epoch": 0.26730780447025126, + "grad_norm": 0.75390625, + "learning_rate": 0.00019613468142562888, + "loss": 4.5055, + "step": 2578 + }, + { + "epoch": 0.26741149252473934, + "grad_norm": 0.70703125, + "learning_rate": 0.00019613169003078628, + "loss": 4.4896, + "step": 2579 + }, + { + "epoch": 0.2675151805792274, + "grad_norm": 0.671875, + "learning_rate": 0.0001961286975016914, + "loss": 4.5204, + "step": 2580 + }, + { + "epoch": 0.2676188686337155, + "grad_norm": 0.63671875, + "learning_rate": 0.00019612570383837943, + "loss": 4.4656, + "step": 2581 + }, + { + "epoch": 0.26772255668820355, + "grad_norm": 0.75, + "learning_rate": 0.00019612270904088578, + "loss": 4.5293, + "step": 2582 + }, + { + "epoch": 0.2678262447426916, + "grad_norm": 0.6171875, + "learning_rate": 0.0001961197131092458, + "loss": 4.5334, + "step": 2583 + }, + { + "epoch": 0.2679299327971797, + "grad_norm": 0.734375, + "learning_rate": 0.00019611671604349482, + "loss": 4.5137, + "step": 2584 + }, + { + "epoch": 0.26803362085166776, + "grad_norm": 0.7421875, + "learning_rate": 0.00019611371784366815, + "loss": 4.5113, + "step": 2585 + }, + { + "epoch": 0.26813730890615584, + "grad_norm": 0.6796875, + "learning_rate": 0.00019611071850980123, + "loss": 4.5013, + "step": 2586 + }, + { + "epoch": 0.2682409969606439, + "grad_norm": 0.67578125, + "learning_rate": 0.00019610771804192943, + "loss": 4.498, + "step": 2587 + }, + { + "epoch": 0.268344685015132, + "grad_norm": 0.625, + "learning_rate": 0.00019610471644008811, + "loss": 4.5094, + "step": 2588 + }, + { + "epoch": 0.26844837306962005, + "grad_norm": 0.71484375, + "learning_rate": 0.00019610171370431276, + "loss": 4.5432, + "step": 2589 + }, + { + "epoch": 0.2685520611241081, + "grad_norm": 0.58984375, + "learning_rate": 0.00019609870983463876, + "loss": 4.5388, + "step": 2590 + }, + { + "epoch": 0.2686557491785962, + "grad_norm": 0.65234375, + "learning_rate": 0.00019609570483110158, + "loss": 4.5287, + "step": 2591 + }, + { + "epoch": 0.26875943723308426, + "grad_norm": 0.63671875, + "learning_rate": 0.00019609269869373663, + "loss": 4.4932, + "step": 2592 + }, + { + "epoch": 0.26886312528757234, + "grad_norm": 0.6640625, + "learning_rate": 0.00019608969142257941, + "loss": 4.4847, + "step": 2593 + }, + { + "epoch": 0.2689668133420604, + "grad_norm": 0.66015625, + "learning_rate": 0.00019608668301766545, + "loss": 4.5315, + "step": 2594 + }, + { + "epoch": 0.2690705013965485, + "grad_norm": 0.59375, + "learning_rate": 0.00019608367347903017, + "loss": 4.5097, + "step": 2595 + }, + { + "epoch": 0.26917418945103655, + "grad_norm": 0.64453125, + "learning_rate": 0.0001960806628067091, + "loss": 4.501, + "step": 2596 + }, + { + "epoch": 0.2692778775055246, + "grad_norm": 0.640625, + "learning_rate": 0.0001960776510007378, + "loss": 4.4971, + "step": 2597 + }, + { + "epoch": 0.2693815655600127, + "grad_norm": 0.5859375, + "learning_rate": 0.0001960746380611517, + "loss": 4.5445, + "step": 2598 + }, + { + "epoch": 0.26948525361450076, + "grad_norm": 0.75, + "learning_rate": 0.0001960716239879865, + "loss": 4.511, + "step": 2599 + }, + { + "epoch": 0.26958894166898884, + "grad_norm": 0.703125, + "learning_rate": 0.00019606860878127767, + "loss": 4.5239, + "step": 2600 + }, + { + "epoch": 0.2696926297234769, + "grad_norm": 0.66796875, + "learning_rate": 0.0001960655924410608, + "loss": 4.5162, + "step": 2601 + }, + { + "epoch": 0.269796317777965, + "grad_norm": 0.62109375, + "learning_rate": 0.00019606257496737148, + "loss": 4.511, + "step": 2602 + }, + { + "epoch": 0.26990000583245305, + "grad_norm": 0.66015625, + "learning_rate": 0.00019605955636024534, + "loss": 4.4955, + "step": 2603 + }, + { + "epoch": 0.2700036938869411, + "grad_norm": 0.6328125, + "learning_rate": 0.00019605653661971796, + "loss": 4.5352, + "step": 2604 + }, + { + "epoch": 0.2701073819414292, + "grad_norm": 0.67578125, + "learning_rate": 0.00019605351574582497, + "loss": 4.5061, + "step": 2605 + }, + { + "epoch": 0.27021106999591726, + "grad_norm": 0.58984375, + "learning_rate": 0.00019605049373860205, + "loss": 4.4833, + "step": 2606 + }, + { + "epoch": 0.27031475805040533, + "grad_norm": 0.55078125, + "learning_rate": 0.0001960474705980848, + "loss": 4.5036, + "step": 2607 + }, + { + "epoch": 0.2704184461048934, + "grad_norm": 0.6953125, + "learning_rate": 0.00019604444632430895, + "loss": 4.4686, + "step": 2608 + }, + { + "epoch": 0.2705221341593815, + "grad_norm": 0.546875, + "learning_rate": 0.00019604142091731016, + "loss": 4.5039, + "step": 2609 + }, + { + "epoch": 0.27062582221386955, + "grad_norm": 0.6328125, + "learning_rate": 0.00019603839437712413, + "loss": 4.4556, + "step": 2610 + }, + { + "epoch": 0.2707295102683576, + "grad_norm": 0.6328125, + "learning_rate": 0.00019603536670378656, + "loss": 4.4963, + "step": 2611 + }, + { + "epoch": 0.2708331983228457, + "grad_norm": 0.54296875, + "learning_rate": 0.00019603233789733317, + "loss": 4.4963, + "step": 2612 + }, + { + "epoch": 0.2709368863773338, + "grad_norm": 0.64453125, + "learning_rate": 0.00019602930795779969, + "loss": 4.5181, + "step": 2613 + }, + { + "epoch": 0.2710405744318219, + "grad_norm": 0.625, + "learning_rate": 0.00019602627688522191, + "loss": 4.5117, + "step": 2614 + }, + { + "epoch": 0.27114426248630996, + "grad_norm": 0.61328125, + "learning_rate": 0.00019602324467963555, + "loss": 4.5349, + "step": 2615 + }, + { + "epoch": 0.27124795054079803, + "grad_norm": 0.66796875, + "learning_rate": 0.0001960202113410764, + "loss": 4.532, + "step": 2616 + }, + { + "epoch": 0.2713516385952861, + "grad_norm": 0.6171875, + "learning_rate": 0.00019601717686958025, + "loss": 4.4892, + "step": 2617 + }, + { + "epoch": 0.2714553266497742, + "grad_norm": 0.60546875, + "learning_rate": 0.00019601414126518293, + "loss": 4.468, + "step": 2618 + }, + { + "epoch": 0.27155901470426225, + "grad_norm": 0.640625, + "learning_rate": 0.00019601110452792023, + "loss": 4.4757, + "step": 2619 + }, + { + "epoch": 0.2716627027587503, + "grad_norm": 0.5625, + "learning_rate": 0.00019600806665782795, + "loss": 4.5217, + "step": 2620 + }, + { + "epoch": 0.2717663908132384, + "grad_norm": 0.64453125, + "learning_rate": 0.00019600502765494202, + "loss": 4.4974, + "step": 2621 + }, + { + "epoch": 0.27187007886772646, + "grad_norm": 0.5625, + "learning_rate": 0.00019600198751929822, + "loss": 4.4916, + "step": 2622 + }, + { + "epoch": 0.27197376692221453, + "grad_norm": 0.59765625, + "learning_rate": 0.00019599894625093244, + "loss": 4.5144, + "step": 2623 + }, + { + "epoch": 0.2720774549767026, + "grad_norm": 0.6328125, + "learning_rate": 0.00019599590384988057, + "loss": 4.5205, + "step": 2624 + }, + { + "epoch": 0.2721811430311907, + "grad_norm": 0.59765625, + "learning_rate": 0.00019599286031617852, + "loss": 4.5329, + "step": 2625 + }, + { + "epoch": 0.27228483108567875, + "grad_norm": 0.703125, + "learning_rate": 0.00019598981564986217, + "loss": 4.5093, + "step": 2626 + }, + { + "epoch": 0.2723885191401668, + "grad_norm": 0.66796875, + "learning_rate": 0.00019598676985096747, + "loss": 4.4862, + "step": 2627 + }, + { + "epoch": 0.2724922071946549, + "grad_norm": 0.61328125, + "learning_rate": 0.00019598372291953036, + "loss": 4.5171, + "step": 2628 + }, + { + "epoch": 0.27259589524914296, + "grad_norm": 0.703125, + "learning_rate": 0.00019598067485558675, + "loss": 4.5127, + "step": 2629 + }, + { + "epoch": 0.27269958330363103, + "grad_norm": 0.625, + "learning_rate": 0.00019597762565917265, + "loss": 4.4947, + "step": 2630 + }, + { + "epoch": 0.2728032713581191, + "grad_norm": 0.71875, + "learning_rate": 0.000195974575330324, + "loss": 4.4842, + "step": 2631 + }, + { + "epoch": 0.2729069594126072, + "grad_norm": 0.73828125, + "learning_rate": 0.00019597152386907683, + "loss": 4.5063, + "step": 2632 + }, + { + "epoch": 0.27301064746709525, + "grad_norm": 0.57421875, + "learning_rate": 0.00019596847127546717, + "loss": 4.4661, + "step": 2633 + }, + { + "epoch": 0.2731143355215833, + "grad_norm": 0.7109375, + "learning_rate": 0.0001959654175495309, + "loss": 4.5253, + "step": 2634 + }, + { + "epoch": 0.2732180235760714, + "grad_norm": 0.8359375, + "learning_rate": 0.0001959623626913042, + "loss": 4.5194, + "step": 2635 + }, + { + "epoch": 0.27332171163055946, + "grad_norm": 0.85546875, + "learning_rate": 0.00019595930670082305, + "loss": 4.5654, + "step": 2636 + }, + { + "epoch": 0.27342539968504753, + "grad_norm": 0.7265625, + "learning_rate": 0.00019595624957812353, + "loss": 4.5322, + "step": 2637 + }, + { + "epoch": 0.2735290877395356, + "grad_norm": 0.61328125, + "learning_rate": 0.00019595319132324166, + "loss": 4.5138, + "step": 2638 + }, + { + "epoch": 0.2736327757940237, + "grad_norm": 0.8359375, + "learning_rate": 0.00019595013193621357, + "loss": 4.4681, + "step": 2639 + }, + { + "epoch": 0.27373646384851175, + "grad_norm": 0.703125, + "learning_rate": 0.00019594707141707535, + "loss": 4.5471, + "step": 2640 + }, + { + "epoch": 0.2738401519029998, + "grad_norm": 0.6953125, + "learning_rate": 0.00019594400976586315, + "loss": 4.5668, + "step": 2641 + }, + { + "epoch": 0.2739438399574879, + "grad_norm": 0.72265625, + "learning_rate": 0.00019594094698261298, + "loss": 4.5293, + "step": 2642 + }, + { + "epoch": 0.27404752801197596, + "grad_norm": 0.63671875, + "learning_rate": 0.0001959378830673611, + "loss": 4.5093, + "step": 2643 + }, + { + "epoch": 0.27415121606646403, + "grad_norm": 0.6796875, + "learning_rate": 0.00019593481802014358, + "loss": 4.5032, + "step": 2644 + }, + { + "epoch": 0.2742549041209521, + "grad_norm": 0.5625, + "learning_rate": 0.00019593175184099662, + "loss": 4.5133, + "step": 2645 + }, + { + "epoch": 0.2743585921754402, + "grad_norm": 0.66796875, + "learning_rate": 0.0001959286845299564, + "loss": 4.5058, + "step": 2646 + }, + { + "epoch": 0.27446228022992825, + "grad_norm": 0.71875, + "learning_rate": 0.00019592561608705911, + "loss": 4.5501, + "step": 2647 + }, + { + "epoch": 0.2745659682844163, + "grad_norm": 0.71484375, + "learning_rate": 0.00019592254651234093, + "loss": 4.5248, + "step": 2648 + }, + { + "epoch": 0.2746696563389044, + "grad_norm": 0.81640625, + "learning_rate": 0.0001959194758058381, + "loss": 4.4747, + "step": 2649 + }, + { + "epoch": 0.27477334439339246, + "grad_norm": 0.77734375, + "learning_rate": 0.00019591640396758687, + "loss": 4.5561, + "step": 2650 + }, + { + "epoch": 0.27487703244788053, + "grad_norm": 0.6640625, + "learning_rate": 0.00019591333099762344, + "loss": 4.5369, + "step": 2651 + }, + { + "epoch": 0.2749807205023686, + "grad_norm": 0.7265625, + "learning_rate": 0.00019591025689598407, + "loss": 4.5263, + "step": 2652 + }, + { + "epoch": 0.2750844085568567, + "grad_norm": 0.72265625, + "learning_rate": 0.0001959071816627051, + "loss": 4.5292, + "step": 2653 + }, + { + "epoch": 0.27518809661134475, + "grad_norm": 0.8125, + "learning_rate": 0.00019590410529782273, + "loss": 4.5482, + "step": 2654 + }, + { + "epoch": 0.2752917846658328, + "grad_norm": 0.67578125, + "learning_rate": 0.00019590102780137333, + "loss": 4.5152, + "step": 2655 + }, + { + "epoch": 0.2753954727203209, + "grad_norm": 0.59375, + "learning_rate": 0.00019589794917339312, + "loss": 4.5202, + "step": 2656 + }, + { + "epoch": 0.27549916077480896, + "grad_norm": 0.671875, + "learning_rate": 0.0001958948694139185, + "loss": 4.5526, + "step": 2657 + }, + { + "epoch": 0.2756028488292971, + "grad_norm": 0.6015625, + "learning_rate": 0.0001958917885229858, + "loss": 4.4628, + "step": 2658 + }, + { + "epoch": 0.27570653688378516, + "grad_norm": 0.6171875, + "learning_rate": 0.00019588870650063135, + "loss": 4.4477, + "step": 2659 + }, + { + "epoch": 0.27581022493827323, + "grad_norm": 0.78125, + "learning_rate": 0.0001958856233468915, + "loss": 4.4959, + "step": 2660 + }, + { + "epoch": 0.2759139129927613, + "grad_norm": 0.66015625, + "learning_rate": 0.00019588253906180266, + "loss": 4.4946, + "step": 2661 + }, + { + "epoch": 0.2760176010472494, + "grad_norm": 0.70703125, + "learning_rate": 0.0001958794536454012, + "loss": 4.4696, + "step": 2662 + }, + { + "epoch": 0.27612128910173744, + "grad_norm": 0.8046875, + "learning_rate": 0.00019587636709772358, + "loss": 4.522, + "step": 2663 + }, + { + "epoch": 0.2762249771562255, + "grad_norm": 0.671875, + "learning_rate": 0.00019587327941880615, + "loss": 4.483, + "step": 2664 + }, + { + "epoch": 0.2763286652107136, + "grad_norm": 0.71484375, + "learning_rate": 0.00019587019060868536, + "loss": 4.5078, + "step": 2665 + }, + { + "epoch": 0.27643235326520166, + "grad_norm": 0.9140625, + "learning_rate": 0.0001958671006673977, + "loss": 4.4836, + "step": 2666 + }, + { + "epoch": 0.27653604131968973, + "grad_norm": 0.69140625, + "learning_rate": 0.00019586400959497954, + "loss": 4.4048, + "step": 2667 + }, + { + "epoch": 0.2766397293741778, + "grad_norm": 0.76171875, + "learning_rate": 0.00019586091739146738, + "loss": 4.5028, + "step": 2668 + }, + { + "epoch": 0.27674341742866587, + "grad_norm": 0.77734375, + "learning_rate": 0.0001958578240568978, + "loss": 4.4601, + "step": 2669 + }, + { + "epoch": 0.27684710548315394, + "grad_norm": 0.76171875, + "learning_rate": 0.00019585472959130715, + "loss": 4.4928, + "step": 2670 + }, + { + "epoch": 0.276950793537642, + "grad_norm": 0.65625, + "learning_rate": 0.00019585163399473204, + "loss": 4.4804, + "step": 2671 + }, + { + "epoch": 0.2770544815921301, + "grad_norm": 0.6328125, + "learning_rate": 0.00019584853726720898, + "loss": 4.4653, + "step": 2672 + }, + { + "epoch": 0.27715816964661816, + "grad_norm": 0.6875, + "learning_rate": 0.0001958454394087745, + "loss": 4.5314, + "step": 2673 + }, + { + "epoch": 0.27726185770110623, + "grad_norm": 0.65625, + "learning_rate": 0.0001958423404194651, + "loss": 4.4906, + "step": 2674 + }, + { + "epoch": 0.2773655457555943, + "grad_norm": 0.71484375, + "learning_rate": 0.00019583924029931745, + "loss": 4.4875, + "step": 2675 + }, + { + "epoch": 0.27746923381008237, + "grad_norm": 0.6953125, + "learning_rate": 0.00019583613904836805, + "loss": 4.4854, + "step": 2676 + }, + { + "epoch": 0.27757292186457044, + "grad_norm": 0.5859375, + "learning_rate": 0.0001958330366666535, + "loss": 4.5304, + "step": 2677 + }, + { + "epoch": 0.2776766099190585, + "grad_norm": 0.84375, + "learning_rate": 0.00019582993315421044, + "loss": 4.5095, + "step": 2678 + }, + { + "epoch": 0.2777802979735466, + "grad_norm": 0.65234375, + "learning_rate": 0.00019582682851107547, + "loss": 4.4651, + "step": 2679 + }, + { + "epoch": 0.27788398602803466, + "grad_norm": 0.67578125, + "learning_rate": 0.00019582372273728517, + "loss": 4.5261, + "step": 2680 + }, + { + "epoch": 0.27798767408252273, + "grad_norm": 0.71484375, + "learning_rate": 0.00019582061583287626, + "loss": 4.5165, + "step": 2681 + }, + { + "epoch": 0.2780913621370108, + "grad_norm": 0.6796875, + "learning_rate": 0.00019581750779788538, + "loss": 4.511, + "step": 2682 + }, + { + "epoch": 0.27819505019149887, + "grad_norm": 0.67578125, + "learning_rate": 0.00019581439863234918, + "loss": 4.5259, + "step": 2683 + }, + { + "epoch": 0.27829873824598694, + "grad_norm": 0.74609375, + "learning_rate": 0.00019581128833630435, + "loss": 4.4464, + "step": 2684 + }, + { + "epoch": 0.278402426300475, + "grad_norm": 0.79296875, + "learning_rate": 0.0001958081769097876, + "loss": 4.5156, + "step": 2685 + }, + { + "epoch": 0.2785061143549631, + "grad_norm": 0.64453125, + "learning_rate": 0.00019580506435283566, + "loss": 4.4865, + "step": 2686 + }, + { + "epoch": 0.27860980240945116, + "grad_norm": 0.6484375, + "learning_rate": 0.00019580195066548522, + "loss": 4.4432, + "step": 2687 + }, + { + "epoch": 0.27871349046393923, + "grad_norm": 0.65625, + "learning_rate": 0.000195798835847773, + "loss": 4.476, + "step": 2688 + }, + { + "epoch": 0.2788171785184273, + "grad_norm": 0.68359375, + "learning_rate": 0.00019579571989973582, + "loss": 4.5375, + "step": 2689 + }, + { + "epoch": 0.27892086657291537, + "grad_norm": 0.77734375, + "learning_rate": 0.00019579260282141038, + "loss": 4.5139, + "step": 2690 + }, + { + "epoch": 0.27902455462740344, + "grad_norm": 0.7578125, + "learning_rate": 0.0001957894846128335, + "loss": 4.5164, + "step": 2691 + }, + { + "epoch": 0.2791282426818915, + "grad_norm": 0.5703125, + "learning_rate": 0.00019578636527404193, + "loss": 4.4868, + "step": 2692 + }, + { + "epoch": 0.2792319307363796, + "grad_norm": 0.75390625, + "learning_rate": 0.00019578324480507253, + "loss": 4.4798, + "step": 2693 + }, + { + "epoch": 0.27933561879086766, + "grad_norm": 0.70703125, + "learning_rate": 0.0001957801232059621, + "loss": 4.4708, + "step": 2694 + }, + { + "epoch": 0.27943930684535573, + "grad_norm": 0.69921875, + "learning_rate": 0.00019577700047674742, + "loss": 4.5079, + "step": 2695 + }, + { + "epoch": 0.2795429948998438, + "grad_norm": 0.6953125, + "learning_rate": 0.00019577387661746538, + "loss": 4.508, + "step": 2696 + }, + { + "epoch": 0.27964668295433187, + "grad_norm": 0.72265625, + "learning_rate": 0.00019577075162815284, + "loss": 4.5315, + "step": 2697 + }, + { + "epoch": 0.27975037100881994, + "grad_norm": 0.66796875, + "learning_rate": 0.00019576762550884666, + "loss": 4.4953, + "step": 2698 + }, + { + "epoch": 0.279854059063308, + "grad_norm": 0.796875, + "learning_rate": 0.00019576449825958373, + "loss": 4.4543, + "step": 2699 + }, + { + "epoch": 0.2799577471177961, + "grad_norm": 0.77734375, + "learning_rate": 0.00019576136988040096, + "loss": 4.4965, + "step": 2700 + }, + { + "epoch": 0.28006143517228416, + "grad_norm": 0.77734375, + "learning_rate": 0.00019575824037133525, + "loss": 4.4783, + "step": 2701 + }, + { + "epoch": 0.28016512322677223, + "grad_norm": 0.7578125, + "learning_rate": 0.00019575510973242355, + "loss": 4.5024, + "step": 2702 + }, + { + "epoch": 0.28026881128126035, + "grad_norm": 0.65234375, + "learning_rate": 0.00019575197796370273, + "loss": 4.5101, + "step": 2703 + }, + { + "epoch": 0.2803724993357484, + "grad_norm": 0.6796875, + "learning_rate": 0.0001957488450652098, + "loss": 4.469, + "step": 2704 + }, + { + "epoch": 0.2804761873902365, + "grad_norm": 0.67578125, + "learning_rate": 0.00019574571103698172, + "loss": 4.533, + "step": 2705 + }, + { + "epoch": 0.28057987544472457, + "grad_norm": 0.6640625, + "learning_rate": 0.00019574257587905543, + "loss": 4.4898, + "step": 2706 + }, + { + "epoch": 0.28068356349921264, + "grad_norm": 0.58203125, + "learning_rate": 0.00019573943959146797, + "loss": 4.5083, + "step": 2707 + }, + { + "epoch": 0.2807872515537007, + "grad_norm": 0.57421875, + "learning_rate": 0.0001957363021742563, + "loss": 4.5129, + "step": 2708 + }, + { + "epoch": 0.2808909396081888, + "grad_norm": 0.52734375, + "learning_rate": 0.0001957331636274575, + "loss": 4.4601, + "step": 2709 + }, + { + "epoch": 0.28099462766267685, + "grad_norm": 0.58203125, + "learning_rate": 0.00019573002395110854, + "loss": 4.4549, + "step": 2710 + }, + { + "epoch": 0.2810983157171649, + "grad_norm": 0.6015625, + "learning_rate": 0.00019572688314524648, + "loss": 4.4636, + "step": 2711 + }, + { + "epoch": 0.281202003771653, + "grad_norm": 0.58203125, + "learning_rate": 0.0001957237412099084, + "loss": 4.5096, + "step": 2712 + }, + { + "epoch": 0.28130569182614107, + "grad_norm": 0.71484375, + "learning_rate": 0.00019572059814513136, + "loss": 4.4603, + "step": 2713 + }, + { + "epoch": 0.28140937988062914, + "grad_norm": 0.6796875, + "learning_rate": 0.00019571745395095244, + "loss": 4.4586, + "step": 2714 + }, + { + "epoch": 0.2815130679351172, + "grad_norm": 0.67578125, + "learning_rate": 0.00019571430862740875, + "loss": 4.4631, + "step": 2715 + }, + { + "epoch": 0.2816167559896053, + "grad_norm": 0.65234375, + "learning_rate": 0.00019571116217453736, + "loss": 4.528, + "step": 2716 + }, + { + "epoch": 0.28172044404409335, + "grad_norm": 0.81640625, + "learning_rate": 0.00019570801459237543, + "loss": 4.5021, + "step": 2717 + }, + { + "epoch": 0.2818241320985814, + "grad_norm": 0.71875, + "learning_rate": 0.00019570486588096014, + "loss": 4.4842, + "step": 2718 + }, + { + "epoch": 0.2819278201530695, + "grad_norm": 0.5703125, + "learning_rate": 0.00019570171604032857, + "loss": 4.4669, + "step": 2719 + }, + { + "epoch": 0.28203150820755757, + "grad_norm": 0.7109375, + "learning_rate": 0.00019569856507051792, + "loss": 4.5322, + "step": 2720 + }, + { + "epoch": 0.28213519626204564, + "grad_norm": 0.6953125, + "learning_rate": 0.00019569541297156535, + "loss": 4.4856, + "step": 2721 + }, + { + "epoch": 0.2822388843165337, + "grad_norm": 0.71875, + "learning_rate": 0.00019569225974350806, + "loss": 4.4897, + "step": 2722 + }, + { + "epoch": 0.2823425723710218, + "grad_norm": 0.77734375, + "learning_rate": 0.00019568910538638327, + "loss": 4.5146, + "step": 2723 + }, + { + "epoch": 0.28244626042550985, + "grad_norm": 0.66796875, + "learning_rate": 0.00019568594990022816, + "loss": 4.5252, + "step": 2724 + }, + { + "epoch": 0.2825499484799979, + "grad_norm": 0.75, + "learning_rate": 0.00019568279328508, + "loss": 4.5108, + "step": 2725 + }, + { + "epoch": 0.282653636534486, + "grad_norm": 0.72265625, + "learning_rate": 0.000195679635540976, + "loss": 4.4586, + "step": 2726 + }, + { + "epoch": 0.28275732458897407, + "grad_norm": 0.69140625, + "learning_rate": 0.00019567647666795347, + "loss": 4.4596, + "step": 2727 + }, + { + "epoch": 0.28286101264346214, + "grad_norm": 0.734375, + "learning_rate": 0.00019567331666604963, + "loss": 4.506, + "step": 2728 + }, + { + "epoch": 0.2829647006979502, + "grad_norm": 0.95703125, + "learning_rate": 0.00019567015553530182, + "loss": 4.4883, + "step": 2729 + }, + { + "epoch": 0.2830683887524383, + "grad_norm": 1.015625, + "learning_rate": 0.0001956669932757473, + "loss": 4.4837, + "step": 2730 + }, + { + "epoch": 0.28317207680692635, + "grad_norm": 0.8828125, + "learning_rate": 0.00019566382988742332, + "loss": 4.4863, + "step": 2731 + }, + { + "epoch": 0.2832757648614144, + "grad_norm": 0.640625, + "learning_rate": 0.00019566066537036734, + "loss": 4.5071, + "step": 2732 + }, + { + "epoch": 0.2833794529159025, + "grad_norm": 0.73046875, + "learning_rate": 0.00019565749972461657, + "loss": 4.5131, + "step": 2733 + }, + { + "epoch": 0.28348314097039057, + "grad_norm": 0.90625, + "learning_rate": 0.00019565433295020844, + "loss": 4.4927, + "step": 2734 + }, + { + "epoch": 0.28358682902487864, + "grad_norm": 0.82421875, + "learning_rate": 0.0001956511650471803, + "loss": 4.4971, + "step": 2735 + }, + { + "epoch": 0.2836905170793667, + "grad_norm": 0.72265625, + "learning_rate": 0.00019564799601556955, + "loss": 4.4516, + "step": 2736 + }, + { + "epoch": 0.2837942051338548, + "grad_norm": 0.62890625, + "learning_rate": 0.00019564482585541351, + "loss": 4.4599, + "step": 2737 + }, + { + "epoch": 0.28389789318834285, + "grad_norm": 0.6875, + "learning_rate": 0.0001956416545667496, + "loss": 4.4894, + "step": 2738 + }, + { + "epoch": 0.2840015812428309, + "grad_norm": 0.69921875, + "learning_rate": 0.0001956384821496153, + "loss": 4.5148, + "step": 2739 + }, + { + "epoch": 0.284105269297319, + "grad_norm": 0.5625, + "learning_rate": 0.00019563530860404802, + "loss": 4.5061, + "step": 2740 + }, + { + "epoch": 0.28420895735180707, + "grad_norm": 0.6796875, + "learning_rate": 0.00019563213393008515, + "loss": 4.4999, + "step": 2741 + }, + { + "epoch": 0.28431264540629514, + "grad_norm": 0.72265625, + "learning_rate": 0.0001956289581277642, + "loss": 4.5199, + "step": 2742 + }, + { + "epoch": 0.2844163334607832, + "grad_norm": 0.69921875, + "learning_rate": 0.00019562578119712264, + "loss": 4.5523, + "step": 2743 + }, + { + "epoch": 0.2845200215152713, + "grad_norm": 0.68359375, + "learning_rate": 0.00019562260313819795, + "loss": 4.4767, + "step": 2744 + }, + { + "epoch": 0.28462370956975935, + "grad_norm": 0.77734375, + "learning_rate": 0.00019561942395102762, + "loss": 4.5022, + "step": 2745 + }, + { + "epoch": 0.2847273976242474, + "grad_norm": 0.859375, + "learning_rate": 0.00019561624363564914, + "loss": 4.4705, + "step": 2746 + }, + { + "epoch": 0.28483108567873555, + "grad_norm": 0.9609375, + "learning_rate": 0.00019561306219210005, + "loss": 4.5097, + "step": 2747 + }, + { + "epoch": 0.2849347737332236, + "grad_norm": 0.88671875, + "learning_rate": 0.00019560987962041792, + "loss": 4.4899, + "step": 2748 + }, + { + "epoch": 0.2850384617877117, + "grad_norm": 0.62109375, + "learning_rate": 0.00019560669592064026, + "loss": 4.4619, + "step": 2749 + }, + { + "epoch": 0.28514214984219977, + "grad_norm": 0.7734375, + "learning_rate": 0.00019560351109280467, + "loss": 4.4725, + "step": 2750 + }, + { + "epoch": 0.28524583789668784, + "grad_norm": 0.75, + "learning_rate": 0.00019560032513694865, + "loss": 4.4604, + "step": 2751 + }, + { + "epoch": 0.2853495259511759, + "grad_norm": 0.6875, + "learning_rate": 0.0001955971380531099, + "loss": 4.5025, + "step": 2752 + }, + { + "epoch": 0.285453214005664, + "grad_norm": 0.76171875, + "learning_rate": 0.00019559394984132596, + "loss": 4.5058, + "step": 2753 + }, + { + "epoch": 0.28555690206015205, + "grad_norm": 0.64453125, + "learning_rate": 0.00019559076050163445, + "loss": 4.4853, + "step": 2754 + }, + { + "epoch": 0.2856605901146401, + "grad_norm": 0.71484375, + "learning_rate": 0.00019558757003407303, + "loss": 4.4615, + "step": 2755 + }, + { + "epoch": 0.2857642781691282, + "grad_norm": 0.75, + "learning_rate": 0.00019558437843867932, + "loss": 4.4716, + "step": 2756 + }, + { + "epoch": 0.28586796622361627, + "grad_norm": 0.71484375, + "learning_rate": 0.00019558118571549096, + "loss": 4.4484, + "step": 2757 + }, + { + "epoch": 0.28597165427810434, + "grad_norm": 0.66796875, + "learning_rate": 0.00019557799186454566, + "loss": 4.5193, + "step": 2758 + }, + { + "epoch": 0.2860753423325924, + "grad_norm": 0.81640625, + "learning_rate": 0.00019557479688588108, + "loss": 4.4887, + "step": 2759 + }, + { + "epoch": 0.2861790303870805, + "grad_norm": 0.68359375, + "learning_rate": 0.00019557160077953491, + "loss": 4.4622, + "step": 2760 + }, + { + "epoch": 0.28628271844156855, + "grad_norm": 0.7109375, + "learning_rate": 0.0001955684035455449, + "loss": 4.4998, + "step": 2761 + }, + { + "epoch": 0.2863864064960566, + "grad_norm": 0.71484375, + "learning_rate": 0.00019556520518394875, + "loss": 4.5327, + "step": 2762 + }, + { + "epoch": 0.2864900945505447, + "grad_norm": 0.8046875, + "learning_rate": 0.00019556200569478417, + "loss": 4.4883, + "step": 2763 + }, + { + "epoch": 0.28659378260503277, + "grad_norm": 0.7421875, + "learning_rate": 0.00019555880507808894, + "loss": 4.5244, + "step": 2764 + }, + { + "epoch": 0.28669747065952084, + "grad_norm": 0.7265625, + "learning_rate": 0.00019555560333390083, + "loss": 4.4726, + "step": 2765 + }, + { + "epoch": 0.2868011587140089, + "grad_norm": 0.89453125, + "learning_rate": 0.00019555240046225763, + "loss": 4.457, + "step": 2766 + }, + { + "epoch": 0.286904846768497, + "grad_norm": 1.1015625, + "learning_rate": 0.00019554919646319708, + "loss": 4.4616, + "step": 2767 + }, + { + "epoch": 0.28700853482298505, + "grad_norm": 0.9453125, + "learning_rate": 0.000195545991336757, + "loss": 4.4876, + "step": 2768 + }, + { + "epoch": 0.2871122228774731, + "grad_norm": 0.890625, + "learning_rate": 0.00019554278508297524, + "loss": 4.4903, + "step": 2769 + }, + { + "epoch": 0.2872159109319612, + "grad_norm": 0.72265625, + "learning_rate": 0.0001955395777018896, + "loss": 4.5038, + "step": 2770 + }, + { + "epoch": 0.28731959898644926, + "grad_norm": 0.95703125, + "learning_rate": 0.00019553636919353794, + "loss": 4.4924, + "step": 2771 + }, + { + "epoch": 0.28742328704093734, + "grad_norm": 1.0703125, + "learning_rate": 0.0001955331595579581, + "loss": 4.4793, + "step": 2772 + }, + { + "epoch": 0.2875269750954254, + "grad_norm": 0.73046875, + "learning_rate": 0.00019552994879518798, + "loss": 4.524, + "step": 2773 + }, + { + "epoch": 0.2876306631499135, + "grad_norm": 0.78125, + "learning_rate": 0.00019552673690526544, + "loss": 4.5274, + "step": 2774 + }, + { + "epoch": 0.28773435120440155, + "grad_norm": 0.83984375, + "learning_rate": 0.00019552352388822834, + "loss": 4.4656, + "step": 2775 + }, + { + "epoch": 0.2878380392588896, + "grad_norm": 0.84375, + "learning_rate": 0.0001955203097441147, + "loss": 4.5221, + "step": 2776 + }, + { + "epoch": 0.2879417273133777, + "grad_norm": 0.69140625, + "learning_rate": 0.0001955170944729623, + "loss": 4.4798, + "step": 2777 + }, + { + "epoch": 0.28804541536786576, + "grad_norm": 0.7421875, + "learning_rate": 0.00019551387807480918, + "loss": 4.425, + "step": 2778 + }, + { + "epoch": 0.28814910342235384, + "grad_norm": 0.8203125, + "learning_rate": 0.0001955106605496933, + "loss": 4.4931, + "step": 2779 + }, + { + "epoch": 0.2882527914768419, + "grad_norm": 0.80078125, + "learning_rate": 0.00019550744189765254, + "loss": 4.5125, + "step": 2780 + }, + { + "epoch": 0.28835647953133, + "grad_norm": 0.8515625, + "learning_rate": 0.00019550422211872493, + "loss": 4.4673, + "step": 2781 + }, + { + "epoch": 0.28846016758581805, + "grad_norm": 0.7265625, + "learning_rate": 0.00019550100121294844, + "loss": 4.4965, + "step": 2782 + }, + { + "epoch": 0.2885638556403061, + "grad_norm": 0.8359375, + "learning_rate": 0.00019549777918036112, + "loss": 4.5221, + "step": 2783 + }, + { + "epoch": 0.2886675436947942, + "grad_norm": 0.85546875, + "learning_rate": 0.00019549455602100094, + "loss": 4.5056, + "step": 2784 + }, + { + "epoch": 0.28877123174928226, + "grad_norm": 0.84375, + "learning_rate": 0.00019549133173490593, + "loss": 4.4957, + "step": 2785 + }, + { + "epoch": 0.28887491980377034, + "grad_norm": 0.67578125, + "learning_rate": 0.00019548810632211413, + "loss": 4.5048, + "step": 2786 + }, + { + "epoch": 0.2889786078582584, + "grad_norm": 0.796875, + "learning_rate": 0.00019548487978266363, + "loss": 4.462, + "step": 2787 + }, + { + "epoch": 0.2890822959127465, + "grad_norm": 0.73046875, + "learning_rate": 0.00019548165211659248, + "loss": 4.4877, + "step": 2788 + }, + { + "epoch": 0.28918598396723455, + "grad_norm": 0.75390625, + "learning_rate": 0.00019547842332393874, + "loss": 4.513, + "step": 2789 + }, + { + "epoch": 0.2892896720217226, + "grad_norm": 0.8359375, + "learning_rate": 0.00019547519340474057, + "loss": 4.4637, + "step": 2790 + }, + { + "epoch": 0.2893933600762107, + "grad_norm": 0.79296875, + "learning_rate": 0.00019547196235903603, + "loss": 4.4883, + "step": 2791 + }, + { + "epoch": 0.2894970481306988, + "grad_norm": 0.73046875, + "learning_rate": 0.00019546873018686322, + "loss": 4.4691, + "step": 2792 + }, + { + "epoch": 0.2896007361851869, + "grad_norm": 0.71875, + "learning_rate": 0.00019546549688826038, + "loss": 4.4894, + "step": 2793 + }, + { + "epoch": 0.28970442423967496, + "grad_norm": 0.67578125, + "learning_rate": 0.0001954622624632655, + "loss": 4.5193, + "step": 2794 + }, + { + "epoch": 0.28980811229416303, + "grad_norm": 0.90625, + "learning_rate": 0.00019545902691191688, + "loss": 4.5056, + "step": 2795 + }, + { + "epoch": 0.2899118003486511, + "grad_norm": 0.8515625, + "learning_rate": 0.00019545579023425263, + "loss": 4.4285, + "step": 2796 + }, + { + "epoch": 0.2900154884031392, + "grad_norm": 0.65234375, + "learning_rate": 0.00019545255243031098, + "loss": 4.4977, + "step": 2797 + }, + { + "epoch": 0.29011917645762725, + "grad_norm": 0.93359375, + "learning_rate": 0.0001954493135001301, + "loss": 4.442, + "step": 2798 + }, + { + "epoch": 0.2902228645121153, + "grad_norm": 0.93359375, + "learning_rate": 0.0001954460734437482, + "loss": 4.4989, + "step": 2799 + }, + { + "epoch": 0.2903265525666034, + "grad_norm": 1.078125, + "learning_rate": 0.00019544283226120352, + "loss": 4.5369, + "step": 2800 + }, + { + "epoch": 0.29043024062109146, + "grad_norm": 0.80859375, + "learning_rate": 0.00019543958995253433, + "loss": 4.4906, + "step": 2801 + }, + { + "epoch": 0.29053392867557953, + "grad_norm": 0.76953125, + "learning_rate": 0.00019543634651777886, + "loss": 4.489, + "step": 2802 + }, + { + "epoch": 0.2906376167300676, + "grad_norm": 1.0625, + "learning_rate": 0.00019543310195697537, + "loss": 4.472, + "step": 2803 + }, + { + "epoch": 0.2907413047845557, + "grad_norm": 0.8828125, + "learning_rate": 0.00019542985627016214, + "loss": 4.464, + "step": 2804 + }, + { + "epoch": 0.29084499283904375, + "grad_norm": 0.74609375, + "learning_rate": 0.00019542660945737753, + "loss": 4.4578, + "step": 2805 + }, + { + "epoch": 0.2909486808935318, + "grad_norm": 0.77734375, + "learning_rate": 0.00019542336151865975, + "loss": 4.5206, + "step": 2806 + }, + { + "epoch": 0.2910523689480199, + "grad_norm": 0.8515625, + "learning_rate": 0.00019542011245404716, + "loss": 4.4985, + "step": 2807 + }, + { + "epoch": 0.29115605700250796, + "grad_norm": 0.8984375, + "learning_rate": 0.00019541686226357813, + "loss": 4.4695, + "step": 2808 + }, + { + "epoch": 0.29125974505699603, + "grad_norm": 0.76171875, + "learning_rate": 0.00019541361094729102, + "loss": 4.4816, + "step": 2809 + }, + { + "epoch": 0.2913634331114841, + "grad_norm": 0.7578125, + "learning_rate": 0.00019541035850522412, + "loss": 4.4805, + "step": 2810 + }, + { + "epoch": 0.2914671211659722, + "grad_norm": 1.046875, + "learning_rate": 0.00019540710493741586, + "loss": 4.5198, + "step": 2811 + }, + { + "epoch": 0.29157080922046025, + "grad_norm": 0.9921875, + "learning_rate": 0.00019540385024390458, + "loss": 4.4809, + "step": 2812 + }, + { + "epoch": 0.2916744972749483, + "grad_norm": 0.87890625, + "learning_rate": 0.00019540059442472874, + "loss": 4.4642, + "step": 2813 + }, + { + "epoch": 0.2917781853294364, + "grad_norm": 0.98828125, + "learning_rate": 0.00019539733747992672, + "loss": 4.5016, + "step": 2814 + }, + { + "epoch": 0.29188187338392446, + "grad_norm": 0.99609375, + "learning_rate": 0.00019539407940953696, + "loss": 4.4307, + "step": 2815 + }, + { + "epoch": 0.29198556143841253, + "grad_norm": 0.80078125, + "learning_rate": 0.0001953908202135979, + "loss": 4.4877, + "step": 2816 + }, + { + "epoch": 0.2920892494929006, + "grad_norm": 0.72265625, + "learning_rate": 0.000195387559892148, + "loss": 4.4839, + "step": 2817 + }, + { + "epoch": 0.2921929375473887, + "grad_norm": 0.7265625, + "learning_rate": 0.0001953842984452257, + "loss": 4.4767, + "step": 2818 + }, + { + "epoch": 0.29229662560187675, + "grad_norm": 0.71484375, + "learning_rate": 0.0001953810358728695, + "loss": 4.4613, + "step": 2819 + }, + { + "epoch": 0.2924003136563648, + "grad_norm": 0.6640625, + "learning_rate": 0.00019537777217511795, + "loss": 4.4937, + "step": 2820 + }, + { + "epoch": 0.2925040017108529, + "grad_norm": 0.6640625, + "learning_rate": 0.00019537450735200947, + "loss": 4.4661, + "step": 2821 + }, + { + "epoch": 0.29260768976534096, + "grad_norm": 0.85546875, + "learning_rate": 0.0001953712414035826, + "loss": 4.5173, + "step": 2822 + }, + { + "epoch": 0.29271137781982903, + "grad_norm": 0.74609375, + "learning_rate": 0.00019536797432987592, + "loss": 4.5015, + "step": 2823 + }, + { + "epoch": 0.2928150658743171, + "grad_norm": 0.63671875, + "learning_rate": 0.00019536470613092796, + "loss": 4.4875, + "step": 2824 + }, + { + "epoch": 0.2929187539288052, + "grad_norm": 0.6328125, + "learning_rate": 0.00019536143680677726, + "loss": 4.464, + "step": 2825 + }, + { + "epoch": 0.29302244198329325, + "grad_norm": 0.71875, + "learning_rate": 0.00019535816635746241, + "loss": 4.4916, + "step": 2826 + }, + { + "epoch": 0.2931261300377813, + "grad_norm": 0.7109375, + "learning_rate": 0.00019535489478302197, + "loss": 4.491, + "step": 2827 + }, + { + "epoch": 0.2932298180922694, + "grad_norm": 0.72265625, + "learning_rate": 0.0001953516220834946, + "loss": 4.5309, + "step": 2828 + }, + { + "epoch": 0.29333350614675746, + "grad_norm": 0.70703125, + "learning_rate": 0.00019534834825891886, + "loss": 4.4935, + "step": 2829 + }, + { + "epoch": 0.29343719420124553, + "grad_norm": 0.80859375, + "learning_rate": 0.00019534507330933344, + "loss": 4.447, + "step": 2830 + }, + { + "epoch": 0.2935408822557336, + "grad_norm": 0.73046875, + "learning_rate": 0.0001953417972347769, + "loss": 4.4797, + "step": 2831 + }, + { + "epoch": 0.2936445703102217, + "grad_norm": 0.66015625, + "learning_rate": 0.00019533852003528792, + "loss": 4.4689, + "step": 2832 + }, + { + "epoch": 0.29374825836470975, + "grad_norm": 0.71484375, + "learning_rate": 0.0001953352417109052, + "loss": 4.5026, + "step": 2833 + }, + { + "epoch": 0.2938519464191978, + "grad_norm": 0.6875, + "learning_rate": 0.00019533196226166738, + "loss": 4.4699, + "step": 2834 + }, + { + "epoch": 0.2939556344736859, + "grad_norm": 0.69921875, + "learning_rate": 0.00019532868168761322, + "loss": 4.5083, + "step": 2835 + }, + { + "epoch": 0.29405932252817396, + "grad_norm": 0.69921875, + "learning_rate": 0.00019532539998878137, + "loss": 4.4928, + "step": 2836 + }, + { + "epoch": 0.2941630105826621, + "grad_norm": 0.69140625, + "learning_rate": 0.00019532211716521053, + "loss": 4.492, + "step": 2837 + }, + { + "epoch": 0.29426669863715016, + "grad_norm": 0.6796875, + "learning_rate": 0.0001953188332169395, + "loss": 4.4951, + "step": 2838 + }, + { + "epoch": 0.29437038669163823, + "grad_norm": 0.68359375, + "learning_rate": 0.000195315548144007, + "loss": 4.4724, + "step": 2839 + }, + { + "epoch": 0.2944740747461263, + "grad_norm": 0.67578125, + "learning_rate": 0.00019531226194645176, + "loss": 4.4568, + "step": 2840 + }, + { + "epoch": 0.2945777628006144, + "grad_norm": 0.67578125, + "learning_rate": 0.00019530897462431256, + "loss": 4.4589, + "step": 2841 + }, + { + "epoch": 0.29468145085510244, + "grad_norm": 0.6875, + "learning_rate": 0.00019530568617762825, + "loss": 4.4916, + "step": 2842 + }, + { + "epoch": 0.2947851389095905, + "grad_norm": 0.6875, + "learning_rate": 0.00019530239660643756, + "loss": 4.4933, + "step": 2843 + }, + { + "epoch": 0.2948888269640786, + "grad_norm": 0.64453125, + "learning_rate": 0.00019529910591077932, + "loss": 4.4446, + "step": 2844 + }, + { + "epoch": 0.29499251501856666, + "grad_norm": 0.64453125, + "learning_rate": 0.0001952958140906924, + "loss": 4.4681, + "step": 2845 + }, + { + "epoch": 0.29509620307305473, + "grad_norm": 0.82421875, + "learning_rate": 0.00019529252114621558, + "loss": 4.4875, + "step": 2846 + }, + { + "epoch": 0.2951998911275428, + "grad_norm": 0.76953125, + "learning_rate": 0.00019528922707738776, + "loss": 4.4859, + "step": 2847 + }, + { + "epoch": 0.2953035791820309, + "grad_norm": 0.62109375, + "learning_rate": 0.00019528593188424779, + "loss": 4.4818, + "step": 2848 + }, + { + "epoch": 0.29540726723651894, + "grad_norm": 0.69140625, + "learning_rate": 0.0001952826355668345, + "loss": 4.4778, + "step": 2849 + }, + { + "epoch": 0.295510955291007, + "grad_norm": 0.703125, + "learning_rate": 0.00019527933812518686, + "loss": 4.4353, + "step": 2850 + }, + { + "epoch": 0.2956146433454951, + "grad_norm": 0.62109375, + "learning_rate": 0.00019527603955934376, + "loss": 4.5151, + "step": 2851 + }, + { + "epoch": 0.29571833139998316, + "grad_norm": 0.73828125, + "learning_rate": 0.0001952727398693441, + "loss": 4.508, + "step": 2852 + }, + { + "epoch": 0.29582201945447123, + "grad_norm": 0.73828125, + "learning_rate": 0.00019526943905522678, + "loss": 4.5373, + "step": 2853 + }, + { + "epoch": 0.2959257075089593, + "grad_norm": 0.55078125, + "learning_rate": 0.00019526613711703082, + "loss": 4.5151, + "step": 2854 + }, + { + "epoch": 0.2960293955634474, + "grad_norm": 0.73046875, + "learning_rate": 0.00019526283405479512, + "loss": 4.49, + "step": 2855 + }, + { + "epoch": 0.29613308361793544, + "grad_norm": 0.68359375, + "learning_rate": 0.00019525952986855868, + "loss": 4.4924, + "step": 2856 + }, + { + "epoch": 0.2962367716724235, + "grad_norm": 0.609375, + "learning_rate": 0.0001952562245583605, + "loss": 4.4681, + "step": 2857 + }, + { + "epoch": 0.2963404597269116, + "grad_norm": 0.68359375, + "learning_rate": 0.00019525291812423954, + "loss": 4.4637, + "step": 2858 + }, + { + "epoch": 0.29644414778139966, + "grad_norm": 0.671875, + "learning_rate": 0.00019524961056623482, + "loss": 4.4826, + "step": 2859 + }, + { + "epoch": 0.29654783583588773, + "grad_norm": 0.62109375, + "learning_rate": 0.0001952463018843854, + "loss": 4.4995, + "step": 2860 + }, + { + "epoch": 0.2966515238903758, + "grad_norm": 0.76171875, + "learning_rate": 0.0001952429920787303, + "loss": 4.4589, + "step": 2861 + }, + { + "epoch": 0.2967552119448639, + "grad_norm": 0.80078125, + "learning_rate": 0.00019523968114930858, + "loss": 4.49, + "step": 2862 + }, + { + "epoch": 0.29685889999935194, + "grad_norm": 0.88671875, + "learning_rate": 0.00019523636909615926, + "loss": 4.4875, + "step": 2863 + }, + { + "epoch": 0.29696258805384, + "grad_norm": 0.7890625, + "learning_rate": 0.00019523305591932148, + "loss": 4.48, + "step": 2864 + }, + { + "epoch": 0.2970662761083281, + "grad_norm": 0.625, + "learning_rate": 0.00019522974161883432, + "loss": 4.5249, + "step": 2865 + }, + { + "epoch": 0.29716996416281616, + "grad_norm": 0.78125, + "learning_rate": 0.00019522642619473684, + "loss": 4.4829, + "step": 2866 + }, + { + "epoch": 0.29727365221730423, + "grad_norm": 0.76171875, + "learning_rate": 0.0001952231096470682, + "loss": 4.4608, + "step": 2867 + }, + { + "epoch": 0.2973773402717923, + "grad_norm": 0.8828125, + "learning_rate": 0.0001952197919758675, + "loss": 4.4793, + "step": 2868 + }, + { + "epoch": 0.2974810283262804, + "grad_norm": 0.7421875, + "learning_rate": 0.00019521647318117394, + "loss": 4.4322, + "step": 2869 + }, + { + "epoch": 0.29758471638076844, + "grad_norm": 0.64453125, + "learning_rate": 0.00019521315326302664, + "loss": 4.4672, + "step": 2870 + }, + { + "epoch": 0.2976884044352565, + "grad_norm": 0.8203125, + "learning_rate": 0.00019520983222146476, + "loss": 4.4932, + "step": 2871 + }, + { + "epoch": 0.2977920924897446, + "grad_norm": 0.7265625, + "learning_rate": 0.0001952065100565275, + "loss": 4.4657, + "step": 2872 + }, + { + "epoch": 0.29789578054423266, + "grad_norm": 0.6484375, + "learning_rate": 0.0001952031867682541, + "loss": 4.4955, + "step": 2873 + }, + { + "epoch": 0.29799946859872073, + "grad_norm": 0.94921875, + "learning_rate": 0.0001951998623566837, + "loss": 4.487, + "step": 2874 + }, + { + "epoch": 0.2981031566532088, + "grad_norm": 1.078125, + "learning_rate": 0.00019519653682185552, + "loss": 4.4835, + "step": 2875 + }, + { + "epoch": 0.29820684470769687, + "grad_norm": 0.81640625, + "learning_rate": 0.00019519321016380888, + "loss": 4.4324, + "step": 2876 + }, + { + "epoch": 0.29831053276218494, + "grad_norm": 0.74609375, + "learning_rate": 0.00019518988238258298, + "loss": 4.5012, + "step": 2877 + }, + { + "epoch": 0.298414220816673, + "grad_norm": 0.8515625, + "learning_rate": 0.00019518655347821709, + "loss": 4.4762, + "step": 2878 + }, + { + "epoch": 0.2985179088711611, + "grad_norm": 0.96875, + "learning_rate": 0.00019518322345075047, + "loss": 4.4632, + "step": 2879 + }, + { + "epoch": 0.29862159692564916, + "grad_norm": 0.8671875, + "learning_rate": 0.00019517989230022242, + "loss": 4.437, + "step": 2880 + }, + { + "epoch": 0.2987252849801373, + "grad_norm": 0.828125, + "learning_rate": 0.00019517656002667226, + "loss": 4.4664, + "step": 2881 + }, + { + "epoch": 0.29882897303462536, + "grad_norm": 0.765625, + "learning_rate": 0.00019517322663013928, + "loss": 4.496, + "step": 2882 + }, + { + "epoch": 0.2989326610891134, + "grad_norm": 0.6953125, + "learning_rate": 0.00019516989211066285, + "loss": 4.4569, + "step": 2883 + }, + { + "epoch": 0.2990363491436015, + "grad_norm": 1.046875, + "learning_rate": 0.0001951665564682823, + "loss": 4.4752, + "step": 2884 + }, + { + "epoch": 0.29914003719808957, + "grad_norm": 0.828125, + "learning_rate": 0.00019516321970303695, + "loss": 4.5071, + "step": 2885 + }, + { + "epoch": 0.29924372525257764, + "grad_norm": 0.73046875, + "learning_rate": 0.00019515988181496624, + "loss": 4.4836, + "step": 2886 + }, + { + "epoch": 0.2993474133070657, + "grad_norm": 0.92578125, + "learning_rate": 0.00019515654280410945, + "loss": 4.4671, + "step": 2887 + }, + { + "epoch": 0.2994511013615538, + "grad_norm": 1.1484375, + "learning_rate": 0.00019515320267050613, + "loss": 4.4976, + "step": 2888 + }, + { + "epoch": 0.29955478941604186, + "grad_norm": 0.6640625, + "learning_rate": 0.00019514986141419552, + "loss": 4.4499, + "step": 2889 + }, + { + "epoch": 0.2996584774705299, + "grad_norm": 0.8515625, + "learning_rate": 0.00019514651903521717, + "loss": 4.5257, + "step": 2890 + }, + { + "epoch": 0.299762165525018, + "grad_norm": 0.9921875, + "learning_rate": 0.00019514317553361043, + "loss": 4.4424, + "step": 2891 + }, + { + "epoch": 0.29986585357950607, + "grad_norm": 0.84375, + "learning_rate": 0.00019513983090941483, + "loss": 4.4534, + "step": 2892 + }, + { + "epoch": 0.29996954163399414, + "grad_norm": 0.79296875, + "learning_rate": 0.00019513648516266975, + "loss": 4.4862, + "step": 2893 + }, + { + "epoch": 0.3000732296884822, + "grad_norm": 1.0625, + "learning_rate": 0.00019513313829341473, + "loss": 4.5139, + "step": 2894 + }, + { + "epoch": 0.3001769177429703, + "grad_norm": 0.63671875, + "learning_rate": 0.00019512979030168927, + "loss": 4.4577, + "step": 2895 + }, + { + "epoch": 0.30028060579745836, + "grad_norm": 1.046875, + "learning_rate": 0.00019512644118753277, + "loss": 4.4544, + "step": 2896 + }, + { + "epoch": 0.3003842938519464, + "grad_norm": 0.94140625, + "learning_rate": 0.0001951230909509849, + "loss": 4.4591, + "step": 2897 + }, + { + "epoch": 0.3004879819064345, + "grad_norm": 0.85546875, + "learning_rate": 0.00019511973959208506, + "loss": 4.5211, + "step": 2898 + }, + { + "epoch": 0.30059166996092257, + "grad_norm": 1.0625, + "learning_rate": 0.00019511638711087282, + "loss": 4.4828, + "step": 2899 + }, + { + "epoch": 0.30069535801541064, + "grad_norm": 1.1875, + "learning_rate": 0.0001951130335073878, + "loss": 4.4544, + "step": 2900 + }, + { + "epoch": 0.3007990460698987, + "grad_norm": 0.7734375, + "learning_rate": 0.00019510967878166947, + "loss": 4.4558, + "step": 2901 + }, + { + "epoch": 0.3009027341243868, + "grad_norm": 1.1484375, + "learning_rate": 0.00019510632293375752, + "loss": 4.4773, + "step": 2902 + }, + { + "epoch": 0.30100642217887486, + "grad_norm": 0.9296875, + "learning_rate": 0.00019510296596369147, + "loss": 4.4952, + "step": 2903 + }, + { + "epoch": 0.3011101102333629, + "grad_norm": 1.0625, + "learning_rate": 0.00019509960787151095, + "loss": 4.5134, + "step": 2904 + }, + { + "epoch": 0.301213798287851, + "grad_norm": 0.84375, + "learning_rate": 0.00019509624865725558, + "loss": 4.4672, + "step": 2905 + }, + { + "epoch": 0.30131748634233907, + "grad_norm": 1.0703125, + "learning_rate": 0.00019509288832096497, + "loss": 4.5035, + "step": 2906 + }, + { + "epoch": 0.30142117439682714, + "grad_norm": 0.83203125, + "learning_rate": 0.00019508952686267885, + "loss": 4.42, + "step": 2907 + }, + { + "epoch": 0.3015248624513152, + "grad_norm": 0.90234375, + "learning_rate": 0.00019508616428243677, + "loss": 4.465, + "step": 2908 + }, + { + "epoch": 0.3016285505058033, + "grad_norm": 0.76953125, + "learning_rate": 0.0001950828005802785, + "loss": 4.4657, + "step": 2909 + }, + { + "epoch": 0.30173223856029135, + "grad_norm": 0.65234375, + "learning_rate": 0.0001950794357562437, + "loss": 4.484, + "step": 2910 + }, + { + "epoch": 0.3018359266147794, + "grad_norm": 0.81640625, + "learning_rate": 0.00019507606981037203, + "loss": 4.4433, + "step": 2911 + }, + { + "epoch": 0.3019396146692675, + "grad_norm": 0.83984375, + "learning_rate": 0.00019507270274270324, + "loss": 4.4762, + "step": 2912 + }, + { + "epoch": 0.30204330272375557, + "grad_norm": 0.7109375, + "learning_rate": 0.00019506933455327706, + "loss": 4.5061, + "step": 2913 + }, + { + "epoch": 0.30214699077824364, + "grad_norm": 0.76953125, + "learning_rate": 0.00019506596524213325, + "loss": 4.4683, + "step": 2914 + }, + { + "epoch": 0.3022506788327317, + "grad_norm": 0.73828125, + "learning_rate": 0.0001950625948093115, + "loss": 4.4668, + "step": 2915 + }, + { + "epoch": 0.3023543668872198, + "grad_norm": 0.71875, + "learning_rate": 0.00019505922325485165, + "loss": 4.4326, + "step": 2916 + }, + { + "epoch": 0.30245805494170785, + "grad_norm": 0.64453125, + "learning_rate": 0.00019505585057879343, + "loss": 4.4932, + "step": 2917 + }, + { + "epoch": 0.3025617429961959, + "grad_norm": 0.7890625, + "learning_rate": 0.00019505247678117663, + "loss": 4.4537, + "step": 2918 + }, + { + "epoch": 0.302665431050684, + "grad_norm": 0.6875, + "learning_rate": 0.0001950491018620411, + "loss": 4.4999, + "step": 2919 + }, + { + "epoch": 0.30276911910517207, + "grad_norm": 0.82421875, + "learning_rate": 0.00019504572582142667, + "loss": 4.4697, + "step": 2920 + }, + { + "epoch": 0.30287280715966014, + "grad_norm": 0.60546875, + "learning_rate": 0.0001950423486593731, + "loss": 4.4829, + "step": 2921 + }, + { + "epoch": 0.3029764952141482, + "grad_norm": 0.7890625, + "learning_rate": 0.0001950389703759203, + "loss": 4.4401, + "step": 2922 + }, + { + "epoch": 0.3030801832686363, + "grad_norm": 0.640625, + "learning_rate": 0.00019503559097110807, + "loss": 4.4518, + "step": 2923 + }, + { + "epoch": 0.30318387132312435, + "grad_norm": 0.74609375, + "learning_rate": 0.00019503221044497637, + "loss": 4.4695, + "step": 2924 + }, + { + "epoch": 0.3032875593776124, + "grad_norm": 0.7578125, + "learning_rate": 0.00019502882879756503, + "loss": 4.4743, + "step": 2925 + }, + { + "epoch": 0.30339124743210055, + "grad_norm": 0.6953125, + "learning_rate": 0.00019502544602891395, + "loss": 4.5007, + "step": 2926 + }, + { + "epoch": 0.3034949354865886, + "grad_norm": 0.7265625, + "learning_rate": 0.00019502206213906306, + "loss": 4.4697, + "step": 2927 + }, + { + "epoch": 0.3035986235410767, + "grad_norm": 0.6484375, + "learning_rate": 0.0001950186771280523, + "loss": 4.4931, + "step": 2928 + }, + { + "epoch": 0.30370231159556477, + "grad_norm": 0.75, + "learning_rate": 0.00019501529099592155, + "loss": 4.4786, + "step": 2929 + }, + { + "epoch": 0.30380599965005284, + "grad_norm": 0.82421875, + "learning_rate": 0.0001950119037427108, + "loss": 4.4402, + "step": 2930 + }, + { + "epoch": 0.3039096877045409, + "grad_norm": 0.6640625, + "learning_rate": 0.00019500851536846008, + "loss": 4.4227, + "step": 2931 + }, + { + "epoch": 0.304013375759029, + "grad_norm": 0.609375, + "learning_rate": 0.00019500512587320926, + "loss": 4.4456, + "step": 2932 + }, + { + "epoch": 0.30411706381351705, + "grad_norm": 0.640625, + "learning_rate": 0.0001950017352569984, + "loss": 4.4523, + "step": 2933 + }, + { + "epoch": 0.3042207518680051, + "grad_norm": 0.66796875, + "learning_rate": 0.00019499834351986746, + "loss": 4.453, + "step": 2934 + }, + { + "epoch": 0.3043244399224932, + "grad_norm": 0.67578125, + "learning_rate": 0.00019499495066185646, + "loss": 4.4963, + "step": 2935 + }, + { + "epoch": 0.30442812797698127, + "grad_norm": 0.7109375, + "learning_rate": 0.0001949915566830055, + "loss": 4.4715, + "step": 2936 + }, + { + "epoch": 0.30453181603146934, + "grad_norm": 0.703125, + "learning_rate": 0.00019498816158335458, + "loss": 4.4423, + "step": 2937 + }, + { + "epoch": 0.3046355040859574, + "grad_norm": 0.640625, + "learning_rate": 0.00019498476536294375, + "loss": 4.4536, + "step": 2938 + }, + { + "epoch": 0.3047391921404455, + "grad_norm": 0.65625, + "learning_rate": 0.0001949813680218131, + "loss": 4.4166, + "step": 2939 + }, + { + "epoch": 0.30484288019493355, + "grad_norm": 0.6875, + "learning_rate": 0.0001949779695600027, + "loss": 4.4335, + "step": 2940 + }, + { + "epoch": 0.3049465682494216, + "grad_norm": 0.6484375, + "learning_rate": 0.00019497456997755264, + "loss": 4.4783, + "step": 2941 + }, + { + "epoch": 0.3050502563039097, + "grad_norm": 0.73046875, + "learning_rate": 0.00019497116927450305, + "loss": 4.474, + "step": 2942 + }, + { + "epoch": 0.30515394435839777, + "grad_norm": 0.62890625, + "learning_rate": 0.00019496776745089406, + "loss": 4.4656, + "step": 2943 + }, + { + "epoch": 0.30525763241288584, + "grad_norm": 0.69140625, + "learning_rate": 0.0001949643645067658, + "loss": 4.4925, + "step": 2944 + }, + { + "epoch": 0.3053613204673739, + "grad_norm": 0.7265625, + "learning_rate": 0.00019496096044215847, + "loss": 4.4422, + "step": 2945 + }, + { + "epoch": 0.305465008521862, + "grad_norm": 0.64453125, + "learning_rate": 0.00019495755525711212, + "loss": 4.4618, + "step": 2946 + }, + { + "epoch": 0.30556869657635005, + "grad_norm": 0.8125, + "learning_rate": 0.000194954148951667, + "loss": 4.4711, + "step": 2947 + }, + { + "epoch": 0.3056723846308381, + "grad_norm": 0.578125, + "learning_rate": 0.0001949507415258633, + "loss": 4.45, + "step": 2948 + }, + { + "epoch": 0.3057760726853262, + "grad_norm": 0.7265625, + "learning_rate": 0.00019494733297974125, + "loss": 4.428, + "step": 2949 + }, + { + "epoch": 0.30587976073981427, + "grad_norm": 0.71484375, + "learning_rate": 0.000194943923313341, + "loss": 4.451, + "step": 2950 + }, + { + "epoch": 0.30598344879430234, + "grad_norm": 0.6328125, + "learning_rate": 0.0001949405125267028, + "loss": 4.4983, + "step": 2951 + }, + { + "epoch": 0.3060871368487904, + "grad_norm": 0.8984375, + "learning_rate": 0.00019493710061986694, + "loss": 4.4957, + "step": 2952 + }, + { + "epoch": 0.3061908249032785, + "grad_norm": 0.90625, + "learning_rate": 0.00019493368759287361, + "loss": 4.4796, + "step": 2953 + }, + { + "epoch": 0.30629451295776655, + "grad_norm": 0.76171875, + "learning_rate": 0.00019493027344576316, + "loss": 4.4595, + "step": 2954 + }, + { + "epoch": 0.3063982010122546, + "grad_norm": 0.8359375, + "learning_rate": 0.0001949268581785758, + "loss": 4.4666, + "step": 2955 + }, + { + "epoch": 0.3065018890667427, + "grad_norm": 0.84765625, + "learning_rate": 0.00019492344179135188, + "loss": 4.498, + "step": 2956 + }, + { + "epoch": 0.30660557712123077, + "grad_norm": 1.0078125, + "learning_rate": 0.00019492002428413168, + "loss": 4.4927, + "step": 2957 + }, + { + "epoch": 0.30670926517571884, + "grad_norm": 1.0078125, + "learning_rate": 0.0001949166056569555, + "loss": 4.4527, + "step": 2958 + }, + { + "epoch": 0.3068129532302069, + "grad_norm": 0.921875, + "learning_rate": 0.0001949131859098637, + "loss": 4.4533, + "step": 2959 + }, + { + "epoch": 0.306916641284695, + "grad_norm": 0.984375, + "learning_rate": 0.00019490976504289668, + "loss": 4.4634, + "step": 2960 + }, + { + "epoch": 0.30702032933918305, + "grad_norm": 0.8984375, + "learning_rate": 0.00019490634305609471, + "loss": 4.5015, + "step": 2961 + }, + { + "epoch": 0.3071240173936711, + "grad_norm": 0.9140625, + "learning_rate": 0.00019490291994949828, + "loss": 4.4517, + "step": 2962 + }, + { + "epoch": 0.3072277054481592, + "grad_norm": 0.8828125, + "learning_rate": 0.00019489949572314765, + "loss": 4.4849, + "step": 2963 + }, + { + "epoch": 0.30733139350264727, + "grad_norm": 0.6796875, + "learning_rate": 0.0001948960703770833, + "loss": 4.4918, + "step": 2964 + }, + { + "epoch": 0.30743508155713534, + "grad_norm": 0.9609375, + "learning_rate": 0.00019489264391134563, + "loss": 4.4788, + "step": 2965 + }, + { + "epoch": 0.3075387696116234, + "grad_norm": 0.84375, + "learning_rate": 0.00019488921632597505, + "loss": 4.5025, + "step": 2966 + }, + { + "epoch": 0.3076424576661115, + "grad_norm": 0.83984375, + "learning_rate": 0.00019488578762101203, + "loss": 4.467, + "step": 2967 + }, + { + "epoch": 0.30774614572059955, + "grad_norm": 0.8125, + "learning_rate": 0.00019488235779649703, + "loss": 4.4541, + "step": 2968 + }, + { + "epoch": 0.3078498337750876, + "grad_norm": 0.78125, + "learning_rate": 0.0001948789268524705, + "loss": 4.4652, + "step": 2969 + }, + { + "epoch": 0.3079535218295757, + "grad_norm": 0.7890625, + "learning_rate": 0.0001948754947889729, + "loss": 4.4786, + "step": 2970 + }, + { + "epoch": 0.3080572098840638, + "grad_norm": 0.859375, + "learning_rate": 0.0001948720616060448, + "loss": 4.4873, + "step": 2971 + }, + { + "epoch": 0.3081608979385519, + "grad_norm": 0.75, + "learning_rate": 0.0001948686273037266, + "loss": 4.4651, + "step": 2972 + }, + { + "epoch": 0.30826458599303996, + "grad_norm": 0.78515625, + "learning_rate": 0.00019486519188205892, + "loss": 4.5188, + "step": 2973 + }, + { + "epoch": 0.30836827404752803, + "grad_norm": 0.86328125, + "learning_rate": 0.00019486175534108225, + "loss": 4.5097, + "step": 2974 + }, + { + "epoch": 0.3084719621020161, + "grad_norm": 0.8046875, + "learning_rate": 0.00019485831768083713, + "loss": 4.4577, + "step": 2975 + }, + { + "epoch": 0.3085756501565042, + "grad_norm": 0.73828125, + "learning_rate": 0.00019485487890136412, + "loss": 4.4674, + "step": 2976 + }, + { + "epoch": 0.30867933821099225, + "grad_norm": 0.875, + "learning_rate": 0.00019485143900270383, + "loss": 4.4575, + "step": 2977 + }, + { + "epoch": 0.3087830262654803, + "grad_norm": 0.88671875, + "learning_rate": 0.00019484799798489676, + "loss": 4.5353, + "step": 2978 + }, + { + "epoch": 0.3088867143199684, + "grad_norm": 0.97265625, + "learning_rate": 0.00019484455584798361, + "loss": 4.4304, + "step": 2979 + }, + { + "epoch": 0.30899040237445646, + "grad_norm": 0.95703125, + "learning_rate": 0.000194841112592005, + "loss": 4.4927, + "step": 2980 + }, + { + "epoch": 0.30909409042894453, + "grad_norm": 0.87109375, + "learning_rate": 0.00019483766821700146, + "loss": 4.483, + "step": 2981 + }, + { + "epoch": 0.3091977784834326, + "grad_norm": 0.66015625, + "learning_rate": 0.0001948342227230137, + "loss": 4.4858, + "step": 2982 + }, + { + "epoch": 0.3093014665379207, + "grad_norm": 0.9453125, + "learning_rate": 0.00019483077611008235, + "loss": 4.4839, + "step": 2983 + }, + { + "epoch": 0.30940515459240875, + "grad_norm": 0.9609375, + "learning_rate": 0.0001948273283782481, + "loss": 4.4471, + "step": 2984 + }, + { + "epoch": 0.3095088426468968, + "grad_norm": 0.625, + "learning_rate": 0.0001948238795275516, + "loss": 4.4844, + "step": 2985 + }, + { + "epoch": 0.3096125307013849, + "grad_norm": 0.78125, + "learning_rate": 0.00019482042955803355, + "loss": 4.4448, + "step": 2986 + }, + { + "epoch": 0.30971621875587296, + "grad_norm": 0.890625, + "learning_rate": 0.00019481697846973465, + "loss": 4.4542, + "step": 2987 + }, + { + "epoch": 0.30981990681036103, + "grad_norm": 0.73828125, + "learning_rate": 0.00019481352626269565, + "loss": 4.4617, + "step": 2988 + }, + { + "epoch": 0.3099235948648491, + "grad_norm": 0.875, + "learning_rate": 0.00019481007293695727, + "loss": 4.458, + "step": 2989 + }, + { + "epoch": 0.3100272829193372, + "grad_norm": 0.8203125, + "learning_rate": 0.00019480661849256023, + "loss": 4.4468, + "step": 2990 + }, + { + "epoch": 0.31013097097382525, + "grad_norm": 0.89453125, + "learning_rate": 0.0001948031629295453, + "loss": 4.4601, + "step": 2991 + }, + { + "epoch": 0.3102346590283133, + "grad_norm": 0.83984375, + "learning_rate": 0.00019479970624795327, + "loss": 4.4764, + "step": 2992 + }, + { + "epoch": 0.3103383470828014, + "grad_norm": 0.69921875, + "learning_rate": 0.0001947962484478249, + "loss": 4.4696, + "step": 2993 + }, + { + "epoch": 0.31044203513728946, + "grad_norm": 0.87109375, + "learning_rate": 0.00019479278952920102, + "loss": 4.477, + "step": 2994 + }, + { + "epoch": 0.31054572319177753, + "grad_norm": 0.94140625, + "learning_rate": 0.00019478932949212245, + "loss": 4.4776, + "step": 2995 + }, + { + "epoch": 0.3106494112462656, + "grad_norm": 0.68359375, + "learning_rate": 0.00019478586833662995, + "loss": 4.4574, + "step": 2996 + }, + { + "epoch": 0.3107530993007537, + "grad_norm": 0.75390625, + "learning_rate": 0.00019478240606276442, + "loss": 4.4864, + "step": 2997 + }, + { + "epoch": 0.31085678735524175, + "grad_norm": 0.91796875, + "learning_rate": 0.00019477894267056666, + "loss": 4.368, + "step": 2998 + }, + { + "epoch": 0.3109604754097298, + "grad_norm": 0.84765625, + "learning_rate": 0.00019477547816007756, + "loss": 4.4526, + "step": 2999 + }, + { + "epoch": 0.3110641634642179, + "grad_norm": 0.7421875, + "learning_rate": 0.000194772012531338, + "loss": 4.4717, + "step": 3000 + }, + { + "epoch": 0.31116785151870596, + "grad_norm": 0.77734375, + "learning_rate": 0.0001947685457843889, + "loss": 4.4854, + "step": 3001 + }, + { + "epoch": 0.31127153957319403, + "grad_norm": 0.76171875, + "learning_rate": 0.00019476507791927112, + "loss": 4.4169, + "step": 3002 + }, + { + "epoch": 0.3113752276276821, + "grad_norm": 0.89453125, + "learning_rate": 0.0001947616089360256, + "loss": 4.4557, + "step": 3003 + }, + { + "epoch": 0.3114789156821702, + "grad_norm": 0.86328125, + "learning_rate": 0.00019475813883469326, + "loss": 4.4646, + "step": 3004 + }, + { + "epoch": 0.31158260373665825, + "grad_norm": 0.8515625, + "learning_rate": 0.00019475466761531505, + "loss": 4.4866, + "step": 3005 + }, + { + "epoch": 0.3116862917911463, + "grad_norm": 0.87890625, + "learning_rate": 0.0001947511952779319, + "loss": 4.4938, + "step": 3006 + }, + { + "epoch": 0.3117899798456344, + "grad_norm": 0.81640625, + "learning_rate": 0.0001947477218225848, + "loss": 4.4514, + "step": 3007 + }, + { + "epoch": 0.31189366790012246, + "grad_norm": 1.1015625, + "learning_rate": 0.00019474424724931475, + "loss": 4.4562, + "step": 3008 + }, + { + "epoch": 0.31199735595461053, + "grad_norm": 0.9609375, + "learning_rate": 0.00019474077155816276, + "loss": 4.4845, + "step": 3009 + }, + { + "epoch": 0.3121010440090986, + "grad_norm": 0.80859375, + "learning_rate": 0.00019473729474916976, + "loss": 4.4958, + "step": 3010 + }, + { + "epoch": 0.3122047320635867, + "grad_norm": 0.80859375, + "learning_rate": 0.00019473381682237685, + "loss": 4.5092, + "step": 3011 + }, + { + "epoch": 0.31230842011807475, + "grad_norm": 0.7421875, + "learning_rate": 0.00019473033777782503, + "loss": 4.4751, + "step": 3012 + }, + { + "epoch": 0.3124121081725628, + "grad_norm": 0.88671875, + "learning_rate": 0.00019472685761555536, + "loss": 4.4743, + "step": 3013 + }, + { + "epoch": 0.3125157962270509, + "grad_norm": 0.9375, + "learning_rate": 0.0001947233763356089, + "loss": 4.491, + "step": 3014 + }, + { + "epoch": 0.312619484281539, + "grad_norm": 0.82421875, + "learning_rate": 0.00019471989393802673, + "loss": 4.4783, + "step": 3015 + }, + { + "epoch": 0.3127231723360271, + "grad_norm": 0.6484375, + "learning_rate": 0.00019471641042284992, + "loss": 4.4875, + "step": 3016 + }, + { + "epoch": 0.31282686039051516, + "grad_norm": 0.765625, + "learning_rate": 0.0001947129257901196, + "loss": 4.4885, + "step": 3017 + }, + { + "epoch": 0.31293054844500323, + "grad_norm": 0.74609375, + "learning_rate": 0.00019470944003987687, + "loss": 4.4643, + "step": 3018 + }, + { + "epoch": 0.3130342364994913, + "grad_norm": 0.70703125, + "learning_rate": 0.00019470595317216288, + "loss": 4.4748, + "step": 3019 + }, + { + "epoch": 0.3131379245539794, + "grad_norm": 0.78125, + "learning_rate": 0.0001947024651870187, + "loss": 4.4483, + "step": 3020 + }, + { + "epoch": 0.31324161260846745, + "grad_norm": 0.671875, + "learning_rate": 0.0001946989760844856, + "loss": 4.4574, + "step": 3021 + }, + { + "epoch": 0.3133453006629555, + "grad_norm": 0.59375, + "learning_rate": 0.00019469548586460464, + "loss": 4.4674, + "step": 3022 + }, + { + "epoch": 0.3134489887174436, + "grad_norm": 0.7578125, + "learning_rate": 0.00019469199452741705, + "loss": 4.5057, + "step": 3023 + }, + { + "epoch": 0.31355267677193166, + "grad_norm": 0.71875, + "learning_rate": 0.00019468850207296403, + "loss": 4.4566, + "step": 3024 + }, + { + "epoch": 0.31365636482641973, + "grad_norm": 0.7734375, + "learning_rate": 0.0001946850085012868, + "loss": 4.4284, + "step": 3025 + }, + { + "epoch": 0.3137600528809078, + "grad_norm": 0.8203125, + "learning_rate": 0.00019468151381242649, + "loss": 4.4086, + "step": 3026 + }, + { + "epoch": 0.3138637409353959, + "grad_norm": 0.70703125, + "learning_rate": 0.00019467801800642444, + "loss": 4.4613, + "step": 3027 + }, + { + "epoch": 0.31396742898988395, + "grad_norm": 0.8203125, + "learning_rate": 0.00019467452108332185, + "loss": 4.438, + "step": 3028 + }, + { + "epoch": 0.314071117044372, + "grad_norm": 0.75, + "learning_rate": 0.00019467102304316, + "loss": 4.4498, + "step": 3029 + }, + { + "epoch": 0.3141748050988601, + "grad_norm": 0.7421875, + "learning_rate": 0.00019466752388598013, + "loss": 4.5043, + "step": 3030 + }, + { + "epoch": 0.31427849315334816, + "grad_norm": 0.81640625, + "learning_rate": 0.00019466402361182356, + "loss": 4.4541, + "step": 3031 + }, + { + "epoch": 0.31438218120783623, + "grad_norm": 0.796875, + "learning_rate": 0.00019466052222073157, + "loss": 4.4769, + "step": 3032 + }, + { + "epoch": 0.3144858692623243, + "grad_norm": 0.7734375, + "learning_rate": 0.00019465701971274548, + "loss": 4.489, + "step": 3033 + }, + { + "epoch": 0.3145895573168124, + "grad_norm": 1.0390625, + "learning_rate": 0.0001946535160879066, + "loss": 4.4899, + "step": 3034 + }, + { + "epoch": 0.31469324537130045, + "grad_norm": 0.8125, + "learning_rate": 0.0001946500113462563, + "loss": 4.4684, + "step": 3035 + }, + { + "epoch": 0.3147969334257885, + "grad_norm": 0.84375, + "learning_rate": 0.00019464650548783592, + "loss": 4.4623, + "step": 3036 + }, + { + "epoch": 0.3149006214802766, + "grad_norm": 0.8203125, + "learning_rate": 0.0001946429985126868, + "loss": 4.4772, + "step": 3037 + }, + { + "epoch": 0.31500430953476466, + "grad_norm": 0.81640625, + "learning_rate": 0.00019463949042085036, + "loss": 4.4399, + "step": 3038 + }, + { + "epoch": 0.31510799758925273, + "grad_norm": 0.90625, + "learning_rate": 0.00019463598121236797, + "loss": 4.3842, + "step": 3039 + }, + { + "epoch": 0.3152116856437408, + "grad_norm": 0.76953125, + "learning_rate": 0.00019463247088728102, + "loss": 4.462, + "step": 3040 + }, + { + "epoch": 0.3153153736982289, + "grad_norm": 0.8125, + "learning_rate": 0.00019462895944563098, + "loss": 4.4954, + "step": 3041 + }, + { + "epoch": 0.31541906175271694, + "grad_norm": 0.734375, + "learning_rate": 0.0001946254468874592, + "loss": 4.4741, + "step": 3042 + }, + { + "epoch": 0.315522749807205, + "grad_norm": 0.78515625, + "learning_rate": 0.0001946219332128072, + "loss": 4.4557, + "step": 3043 + }, + { + "epoch": 0.3156264378616931, + "grad_norm": 0.78515625, + "learning_rate": 0.0001946184184217164, + "loss": 4.4779, + "step": 3044 + }, + { + "epoch": 0.31573012591618116, + "grad_norm": 0.77734375, + "learning_rate": 0.00019461490251422827, + "loss": 4.4799, + "step": 3045 + }, + { + "epoch": 0.31583381397066923, + "grad_norm": 0.6875, + "learning_rate": 0.0001946113854903843, + "loss": 4.476, + "step": 3046 + }, + { + "epoch": 0.3159375020251573, + "grad_norm": 0.7265625, + "learning_rate": 0.000194607867350226, + "loss": 4.4456, + "step": 3047 + }, + { + "epoch": 0.3160411900796454, + "grad_norm": 0.80859375, + "learning_rate": 0.00019460434809379486, + "loss": 4.4778, + "step": 3048 + }, + { + "epoch": 0.31614487813413344, + "grad_norm": 0.8046875, + "learning_rate": 0.00019460082772113245, + "loss": 4.4231, + "step": 3049 + }, + { + "epoch": 0.3162485661886215, + "grad_norm": 0.78515625, + "learning_rate": 0.00019459730623228022, + "loss": 4.4713, + "step": 3050 + }, + { + "epoch": 0.3163522542431096, + "grad_norm": 0.90234375, + "learning_rate": 0.0001945937836272798, + "loss": 4.4625, + "step": 3051 + }, + { + "epoch": 0.31645594229759766, + "grad_norm": 0.91796875, + "learning_rate": 0.00019459025990617272, + "loss": 4.4777, + "step": 3052 + }, + { + "epoch": 0.31655963035208573, + "grad_norm": 0.875, + "learning_rate": 0.00019458673506900052, + "loss": 4.4678, + "step": 3053 + }, + { + "epoch": 0.3166633184065738, + "grad_norm": 0.953125, + "learning_rate": 0.0001945832091158049, + "loss": 4.4341, + "step": 3054 + }, + { + "epoch": 0.3167670064610619, + "grad_norm": 0.98046875, + "learning_rate": 0.00019457968204662733, + "loss": 4.4406, + "step": 3055 + }, + { + "epoch": 0.31687069451554994, + "grad_norm": 0.890625, + "learning_rate": 0.00019457615386150954, + "loss": 4.4484, + "step": 3056 + }, + { + "epoch": 0.316974382570038, + "grad_norm": 0.87109375, + "learning_rate": 0.00019457262456049307, + "loss": 4.4671, + "step": 3057 + }, + { + "epoch": 0.3170780706245261, + "grad_norm": 1.046875, + "learning_rate": 0.00019456909414361962, + "loss": 4.4402, + "step": 3058 + }, + { + "epoch": 0.31718175867901416, + "grad_norm": 0.63671875, + "learning_rate": 0.0001945655626109308, + "loss": 4.4295, + "step": 3059 + }, + { + "epoch": 0.3172854467335023, + "grad_norm": 0.8125, + "learning_rate": 0.0001945620299624683, + "loss": 4.414, + "step": 3060 + }, + { + "epoch": 0.31738913478799036, + "grad_norm": 0.87890625, + "learning_rate": 0.00019455849619827382, + "loss": 4.4506, + "step": 3061 + }, + { + "epoch": 0.31749282284247843, + "grad_norm": 0.76171875, + "learning_rate": 0.000194554961318389, + "loss": 4.4642, + "step": 3062 + }, + { + "epoch": 0.3175965108969665, + "grad_norm": 0.87890625, + "learning_rate": 0.00019455142532285563, + "loss": 4.4586, + "step": 3063 + }, + { + "epoch": 0.31770019895145457, + "grad_norm": 0.76953125, + "learning_rate": 0.00019454788821171538, + "loss": 4.4649, + "step": 3064 + }, + { + "epoch": 0.31780388700594264, + "grad_norm": 0.77734375, + "learning_rate": 0.00019454434998501, + "loss": 4.4466, + "step": 3065 + }, + { + "epoch": 0.3179075750604307, + "grad_norm": 0.88671875, + "learning_rate": 0.0001945408106427812, + "loss": 4.4542, + "step": 3066 + }, + { + "epoch": 0.3180112631149188, + "grad_norm": 0.75390625, + "learning_rate": 0.00019453727018507077, + "loss": 4.5246, + "step": 3067 + }, + { + "epoch": 0.31811495116940686, + "grad_norm": 0.7421875, + "learning_rate": 0.0001945337286119205, + "loss": 4.5043, + "step": 3068 + }, + { + "epoch": 0.31821863922389493, + "grad_norm": 0.73828125, + "learning_rate": 0.00019453018592337213, + "loss": 4.4265, + "step": 3069 + }, + { + "epoch": 0.318322327278383, + "grad_norm": 0.828125, + "learning_rate": 0.00019452664211946753, + "loss": 4.5128, + "step": 3070 + }, + { + "epoch": 0.31842601533287107, + "grad_norm": 0.83203125, + "learning_rate": 0.00019452309720024844, + "loss": 4.4348, + "step": 3071 + }, + { + "epoch": 0.31852970338735914, + "grad_norm": 0.796875, + "learning_rate": 0.00019451955116575674, + "loss": 4.4831, + "step": 3072 + }, + { + "epoch": 0.3186333914418472, + "grad_norm": 0.66015625, + "learning_rate": 0.00019451600401603422, + "loss": 4.4866, + "step": 3073 + }, + { + "epoch": 0.3187370794963353, + "grad_norm": 0.7421875, + "learning_rate": 0.00019451245575112278, + "loss": 4.3868, + "step": 3074 + }, + { + "epoch": 0.31884076755082336, + "grad_norm": 0.70703125, + "learning_rate": 0.00019450890637106428, + "loss": 4.4373, + "step": 3075 + }, + { + "epoch": 0.3189444556053114, + "grad_norm": 0.76171875, + "learning_rate": 0.00019450535587590056, + "loss": 4.4538, + "step": 3076 + }, + { + "epoch": 0.3190481436597995, + "grad_norm": 0.6953125, + "learning_rate": 0.00019450180426567354, + "loss": 4.4422, + "step": 3077 + }, + { + "epoch": 0.31915183171428757, + "grad_norm": 0.7578125, + "learning_rate": 0.00019449825154042513, + "loss": 4.4561, + "step": 3078 + }, + { + "epoch": 0.31925551976877564, + "grad_norm": 0.7109375, + "learning_rate": 0.00019449469770019723, + "loss": 4.5059, + "step": 3079 + }, + { + "epoch": 0.3193592078232637, + "grad_norm": 0.6640625, + "learning_rate": 0.00019449114274503178, + "loss": 4.4227, + "step": 3080 + }, + { + "epoch": 0.3194628958777518, + "grad_norm": 0.6796875, + "learning_rate": 0.00019448758667497075, + "loss": 4.4908, + "step": 3081 + }, + { + "epoch": 0.31956658393223986, + "grad_norm": 0.82421875, + "learning_rate": 0.00019448402949005607, + "loss": 4.4663, + "step": 3082 + }, + { + "epoch": 0.3196702719867279, + "grad_norm": 0.67578125, + "learning_rate": 0.0001944804711903297, + "loss": 4.4608, + "step": 3083 + }, + { + "epoch": 0.319773960041216, + "grad_norm": 0.6796875, + "learning_rate": 0.00019447691177583364, + "loss": 4.4182, + "step": 3084 + }, + { + "epoch": 0.31987764809570407, + "grad_norm": 0.6875, + "learning_rate": 0.00019447335124660992, + "loss": 4.4701, + "step": 3085 + }, + { + "epoch": 0.31998133615019214, + "grad_norm": 0.703125, + "learning_rate": 0.00019446978960270048, + "loss": 4.4572, + "step": 3086 + }, + { + "epoch": 0.3200850242046802, + "grad_norm": 0.72265625, + "learning_rate": 0.00019446622684414738, + "loss": 4.4828, + "step": 3087 + }, + { + "epoch": 0.3201887122591683, + "grad_norm": 0.88671875, + "learning_rate": 0.0001944626629709927, + "loss": 4.4391, + "step": 3088 + }, + { + "epoch": 0.32029240031365636, + "grad_norm": 0.84765625, + "learning_rate": 0.0001944590979832784, + "loss": 4.4336, + "step": 3089 + }, + { + "epoch": 0.3203960883681444, + "grad_norm": 0.68359375, + "learning_rate": 0.00019445553188104665, + "loss": 4.4397, + "step": 3090 + }, + { + "epoch": 0.3204997764226325, + "grad_norm": 0.80859375, + "learning_rate": 0.0001944519646643394, + "loss": 4.4129, + "step": 3091 + }, + { + "epoch": 0.32060346447712057, + "grad_norm": 0.83203125, + "learning_rate": 0.00019444839633319885, + "loss": 4.4109, + "step": 3092 + }, + { + "epoch": 0.32070715253160864, + "grad_norm": 0.90625, + "learning_rate": 0.00019444482688766703, + "loss": 4.4595, + "step": 3093 + }, + { + "epoch": 0.3208108405860967, + "grad_norm": 0.87890625, + "learning_rate": 0.00019444125632778612, + "loss": 4.4086, + "step": 3094 + }, + { + "epoch": 0.3209145286405848, + "grad_norm": 0.77734375, + "learning_rate": 0.0001944376846535982, + "loss": 4.4644, + "step": 3095 + }, + { + "epoch": 0.32101821669507286, + "grad_norm": 0.80859375, + "learning_rate": 0.00019443411186514543, + "loss": 4.4432, + "step": 3096 + }, + { + "epoch": 0.3211219047495609, + "grad_norm": 1.0234375, + "learning_rate": 0.00019443053796246992, + "loss": 4.4119, + "step": 3097 + }, + { + "epoch": 0.321225592804049, + "grad_norm": 1.0625, + "learning_rate": 0.00019442696294561394, + "loss": 4.4644, + "step": 3098 + }, + { + "epoch": 0.32132928085853707, + "grad_norm": 1.046875, + "learning_rate": 0.00019442338681461958, + "loss": 4.4355, + "step": 3099 + }, + { + "epoch": 0.32143296891302514, + "grad_norm": 0.85546875, + "learning_rate": 0.00019441980956952905, + "loss": 4.4222, + "step": 3100 + }, + { + "epoch": 0.3215366569675132, + "grad_norm": 0.81640625, + "learning_rate": 0.00019441623121038462, + "loss": 4.4843, + "step": 3101 + }, + { + "epoch": 0.3216403450220013, + "grad_norm": 1.1796875, + "learning_rate": 0.00019441265173722843, + "loss": 4.4442, + "step": 3102 + }, + { + "epoch": 0.32174403307648936, + "grad_norm": 0.89453125, + "learning_rate": 0.00019440907115010275, + "loss": 4.503, + "step": 3103 + }, + { + "epoch": 0.3218477211309774, + "grad_norm": 1.0625, + "learning_rate": 0.00019440548944904985, + "loss": 4.475, + "step": 3104 + }, + { + "epoch": 0.32195140918546555, + "grad_norm": 1.0859375, + "learning_rate": 0.00019440190663411194, + "loss": 4.4357, + "step": 3105 + }, + { + "epoch": 0.3220550972399536, + "grad_norm": 0.953125, + "learning_rate": 0.00019439832270533132, + "loss": 4.4239, + "step": 3106 + }, + { + "epoch": 0.3221587852944417, + "grad_norm": 1.1015625, + "learning_rate": 0.00019439473766275027, + "loss": 4.4629, + "step": 3107 + }, + { + "epoch": 0.32226247334892977, + "grad_norm": 0.921875, + "learning_rate": 0.0001943911515064111, + "loss": 4.4815, + "step": 3108 + }, + { + "epoch": 0.32236616140341784, + "grad_norm": 1.2421875, + "learning_rate": 0.00019438756423635615, + "loss": 4.4407, + "step": 3109 + }, + { + "epoch": 0.3224698494579059, + "grad_norm": 0.84765625, + "learning_rate": 0.00019438397585262767, + "loss": 4.467, + "step": 3110 + }, + { + "epoch": 0.322573537512394, + "grad_norm": 1.0625, + "learning_rate": 0.00019438038635526806, + "loss": 4.4749, + "step": 3111 + }, + { + "epoch": 0.32267722556688205, + "grad_norm": 0.8671875, + "learning_rate": 0.00019437679574431965, + "loss": 4.4781, + "step": 3112 + }, + { + "epoch": 0.3227809136213701, + "grad_norm": 0.94921875, + "learning_rate": 0.00019437320401982481, + "loss": 4.4409, + "step": 3113 + }, + { + "epoch": 0.3228846016758582, + "grad_norm": 1.15625, + "learning_rate": 0.00019436961118182592, + "loss": 4.4956, + "step": 3114 + }, + { + "epoch": 0.32298828973034627, + "grad_norm": 0.7265625, + "learning_rate": 0.0001943660172303654, + "loss": 4.4362, + "step": 3115 + }, + { + "epoch": 0.32309197778483434, + "grad_norm": 1.1484375, + "learning_rate": 0.0001943624221654856, + "loss": 4.4844, + "step": 3116 + }, + { + "epoch": 0.3231956658393224, + "grad_norm": 0.92578125, + "learning_rate": 0.00019435882598722897, + "loss": 4.4239, + "step": 3117 + }, + { + "epoch": 0.3232993538938105, + "grad_norm": 0.87109375, + "learning_rate": 0.00019435522869563793, + "loss": 4.4157, + "step": 3118 + }, + { + "epoch": 0.32340304194829855, + "grad_norm": 0.9765625, + "learning_rate": 0.00019435163029075491, + "loss": 4.4409, + "step": 3119 + }, + { + "epoch": 0.3235067300027866, + "grad_norm": 0.83984375, + "learning_rate": 0.00019434803077262244, + "loss": 4.4128, + "step": 3120 + }, + { + "epoch": 0.3236104180572747, + "grad_norm": 0.83984375, + "learning_rate": 0.00019434443014128288, + "loss": 4.4665, + "step": 3121 + }, + { + "epoch": 0.32371410611176277, + "grad_norm": 0.98828125, + "learning_rate": 0.00019434082839677879, + "loss": 4.4656, + "step": 3122 + }, + { + "epoch": 0.32381779416625084, + "grad_norm": 1.234375, + "learning_rate": 0.00019433722553915267, + "loss": 4.4474, + "step": 3123 + }, + { + "epoch": 0.3239214822207389, + "grad_norm": 0.8125, + "learning_rate": 0.00019433362156844698, + "loss": 4.4509, + "step": 3124 + }, + { + "epoch": 0.324025170275227, + "grad_norm": 1.1015625, + "learning_rate": 0.00019433001648470427, + "loss": 4.415, + "step": 3125 + }, + { + "epoch": 0.32412885832971505, + "grad_norm": 1.1796875, + "learning_rate": 0.0001943264102879671, + "loss": 4.4765, + "step": 3126 + }, + { + "epoch": 0.3242325463842031, + "grad_norm": 0.86328125, + "learning_rate": 0.00019432280297827797, + "loss": 4.4392, + "step": 3127 + }, + { + "epoch": 0.3243362344386912, + "grad_norm": 1.1015625, + "learning_rate": 0.0001943191945556795, + "loss": 4.4432, + "step": 3128 + }, + { + "epoch": 0.32443992249317927, + "grad_norm": 0.7734375, + "learning_rate": 0.0001943155850202142, + "loss": 4.4573, + "step": 3129 + }, + { + "epoch": 0.32454361054766734, + "grad_norm": 0.9765625, + "learning_rate": 0.00019431197437192471, + "loss": 4.444, + "step": 3130 + }, + { + "epoch": 0.3246472986021554, + "grad_norm": 0.99609375, + "learning_rate": 0.00019430836261085364, + "loss": 4.4296, + "step": 3131 + }, + { + "epoch": 0.3247509866566435, + "grad_norm": 0.83203125, + "learning_rate": 0.00019430474973704354, + "loss": 4.458, + "step": 3132 + }, + { + "epoch": 0.32485467471113155, + "grad_norm": 0.6796875, + "learning_rate": 0.00019430113575053708, + "loss": 4.4591, + "step": 3133 + }, + { + "epoch": 0.3249583627656196, + "grad_norm": 0.78515625, + "learning_rate": 0.0001942975206513769, + "loss": 4.4305, + "step": 3134 + }, + { + "epoch": 0.3250620508201077, + "grad_norm": 0.7421875, + "learning_rate": 0.00019429390443960568, + "loss": 4.4696, + "step": 3135 + }, + { + "epoch": 0.32516573887459577, + "grad_norm": 0.69921875, + "learning_rate": 0.00019429028711526604, + "loss": 4.4581, + "step": 3136 + }, + { + "epoch": 0.32526942692908384, + "grad_norm": 0.73828125, + "learning_rate": 0.00019428666867840066, + "loss": 4.4581, + "step": 3137 + }, + { + "epoch": 0.3253731149835719, + "grad_norm": 0.75390625, + "learning_rate": 0.0001942830491290523, + "loss": 4.4528, + "step": 3138 + }, + { + "epoch": 0.32547680303806, + "grad_norm": 0.734375, + "learning_rate": 0.0001942794284672636, + "loss": 4.4092, + "step": 3139 + }, + { + "epoch": 0.32558049109254805, + "grad_norm": 0.65625, + "learning_rate": 0.0001942758066930773, + "loss": 4.4827, + "step": 3140 + }, + { + "epoch": 0.3256841791470361, + "grad_norm": 0.6875, + "learning_rate": 0.00019427218380653613, + "loss": 4.5186, + "step": 3141 + }, + { + "epoch": 0.3257878672015242, + "grad_norm": 0.63671875, + "learning_rate": 0.00019426855980768287, + "loss": 4.4642, + "step": 3142 + }, + { + "epoch": 0.32589155525601227, + "grad_norm": 0.6796875, + "learning_rate": 0.0001942649346965602, + "loss": 4.4665, + "step": 3143 + }, + { + "epoch": 0.32599524331050034, + "grad_norm": 0.65234375, + "learning_rate": 0.00019426130847321097, + "loss": 4.4525, + "step": 3144 + }, + { + "epoch": 0.3260989313649884, + "grad_norm": 0.6953125, + "learning_rate": 0.00019425768113767795, + "loss": 4.4475, + "step": 3145 + }, + { + "epoch": 0.3262026194194765, + "grad_norm": 0.72265625, + "learning_rate": 0.0001942540526900039, + "loss": 4.4347, + "step": 3146 + }, + { + "epoch": 0.32630630747396455, + "grad_norm": 0.6171875, + "learning_rate": 0.0001942504231302317, + "loss": 4.4461, + "step": 3147 + }, + { + "epoch": 0.3264099955284526, + "grad_norm": 0.671875, + "learning_rate": 0.0001942467924584041, + "loss": 4.4618, + "step": 3148 + }, + { + "epoch": 0.32651368358294075, + "grad_norm": 0.75390625, + "learning_rate": 0.00019424316067456396, + "loss": 4.4542, + "step": 3149 + }, + { + "epoch": 0.3266173716374288, + "grad_norm": 0.6796875, + "learning_rate": 0.00019423952777875418, + "loss": 4.3964, + "step": 3150 + }, + { + "epoch": 0.3267210596919169, + "grad_norm": 0.5703125, + "learning_rate": 0.00019423589377101758, + "loss": 4.4189, + "step": 3151 + }, + { + "epoch": 0.32682474774640496, + "grad_norm": 0.75, + "learning_rate": 0.00019423225865139703, + "loss": 4.503, + "step": 3152 + }, + { + "epoch": 0.32692843580089304, + "grad_norm": 0.76953125, + "learning_rate": 0.00019422862241993545, + "loss": 4.4763, + "step": 3153 + }, + { + "epoch": 0.3270321238553811, + "grad_norm": 0.734375, + "learning_rate": 0.00019422498507667572, + "loss": 4.4683, + "step": 3154 + }, + { + "epoch": 0.3271358119098692, + "grad_norm": 0.70703125, + "learning_rate": 0.00019422134662166077, + "loss": 4.4992, + "step": 3155 + }, + { + "epoch": 0.32723949996435725, + "grad_norm": 0.7109375, + "learning_rate": 0.00019421770705493354, + "loss": 4.4608, + "step": 3156 + }, + { + "epoch": 0.3273431880188453, + "grad_norm": 0.75, + "learning_rate": 0.00019421406637653692, + "loss": 4.482, + "step": 3157 + }, + { + "epoch": 0.3274468760733334, + "grad_norm": 0.73828125, + "learning_rate": 0.00019421042458651395, + "loss": 4.4546, + "step": 3158 + }, + { + "epoch": 0.32755056412782146, + "grad_norm": 0.74609375, + "learning_rate": 0.00019420678168490755, + "loss": 4.4352, + "step": 3159 + }, + { + "epoch": 0.32765425218230954, + "grad_norm": 0.796875, + "learning_rate": 0.00019420313767176065, + "loss": 4.4403, + "step": 3160 + }, + { + "epoch": 0.3277579402367976, + "grad_norm": 0.7265625, + "learning_rate": 0.00019419949254711636, + "loss": 4.4726, + "step": 3161 + }, + { + "epoch": 0.3278616282912857, + "grad_norm": 0.71875, + "learning_rate": 0.0001941958463110176, + "loss": 4.4651, + "step": 3162 + }, + { + "epoch": 0.32796531634577375, + "grad_norm": 0.91796875, + "learning_rate": 0.00019419219896350747, + "loss": 4.515, + "step": 3163 + }, + { + "epoch": 0.3280690044002618, + "grad_norm": 1.0, + "learning_rate": 0.0001941885505046289, + "loss": 4.4578, + "step": 3164 + }, + { + "epoch": 0.3281726924547499, + "grad_norm": 0.74609375, + "learning_rate": 0.00019418490093442504, + "loss": 4.4244, + "step": 3165 + }, + { + "epoch": 0.32827638050923796, + "grad_norm": 0.81640625, + "learning_rate": 0.00019418125025293887, + "loss": 4.4859, + "step": 3166 + }, + { + "epoch": 0.32838006856372604, + "grad_norm": 1.015625, + "learning_rate": 0.00019417759846021354, + "loss": 4.4438, + "step": 3167 + }, + { + "epoch": 0.3284837566182141, + "grad_norm": 0.984375, + "learning_rate": 0.00019417394555629208, + "loss": 4.4728, + "step": 3168 + }, + { + "epoch": 0.3285874446727022, + "grad_norm": 0.87890625, + "learning_rate": 0.00019417029154121757, + "loss": 4.4602, + "step": 3169 + }, + { + "epoch": 0.32869113272719025, + "grad_norm": 0.83984375, + "learning_rate": 0.00019416663641503323, + "loss": 4.423, + "step": 3170 + }, + { + "epoch": 0.3287948207816783, + "grad_norm": 0.87890625, + "learning_rate": 0.00019416298017778207, + "loss": 4.4466, + "step": 3171 + }, + { + "epoch": 0.3288985088361664, + "grad_norm": 0.98828125, + "learning_rate": 0.00019415932282950728, + "loss": 4.4669, + "step": 3172 + }, + { + "epoch": 0.32900219689065446, + "grad_norm": 1.0390625, + "learning_rate": 0.00019415566437025206, + "loss": 4.4427, + "step": 3173 + }, + { + "epoch": 0.32910588494514253, + "grad_norm": 0.79296875, + "learning_rate": 0.00019415200480005947, + "loss": 4.4346, + "step": 3174 + }, + { + "epoch": 0.3292095729996306, + "grad_norm": 0.99609375, + "learning_rate": 0.0001941483441189728, + "loss": 4.4512, + "step": 3175 + }, + { + "epoch": 0.3293132610541187, + "grad_norm": 1.140625, + "learning_rate": 0.00019414468232703517, + "loss": 4.4743, + "step": 3176 + }, + { + "epoch": 0.32941694910860675, + "grad_norm": 0.7265625, + "learning_rate": 0.00019414101942428978, + "loss": 4.4539, + "step": 3177 + }, + { + "epoch": 0.3295206371630948, + "grad_norm": 1.0234375, + "learning_rate": 0.00019413735541077987, + "loss": 4.4291, + "step": 3178 + }, + { + "epoch": 0.3296243252175829, + "grad_norm": 0.79296875, + "learning_rate": 0.00019413369028654868, + "loss": 4.4172, + "step": 3179 + }, + { + "epoch": 0.32972801327207096, + "grad_norm": 0.859375, + "learning_rate": 0.00019413002405163944, + "loss": 4.4088, + "step": 3180 + }, + { + "epoch": 0.32983170132655903, + "grad_norm": 0.97265625, + "learning_rate": 0.00019412635670609544, + "loss": 4.4669, + "step": 3181 + }, + { + "epoch": 0.3299353893810471, + "grad_norm": 0.85546875, + "learning_rate": 0.00019412268824995992, + "loss": 4.4327, + "step": 3182 + }, + { + "epoch": 0.3300390774355352, + "grad_norm": 0.73828125, + "learning_rate": 0.00019411901868327617, + "loss": 4.4497, + "step": 3183 + }, + { + "epoch": 0.33014276549002325, + "grad_norm": 0.90625, + "learning_rate": 0.0001941153480060875, + "loss": 4.4268, + "step": 3184 + }, + { + "epoch": 0.3302464535445113, + "grad_norm": 0.765625, + "learning_rate": 0.0001941116762184372, + "loss": 4.4297, + "step": 3185 + }, + { + "epoch": 0.3303501415989994, + "grad_norm": 0.73828125, + "learning_rate": 0.0001941080033203686, + "loss": 4.4364, + "step": 3186 + }, + { + "epoch": 0.33045382965348746, + "grad_norm": 0.87109375, + "learning_rate": 0.00019410432931192504, + "loss": 4.4927, + "step": 3187 + }, + { + "epoch": 0.33055751770797553, + "grad_norm": 0.87109375, + "learning_rate": 0.00019410065419314985, + "loss": 4.4977, + "step": 3188 + }, + { + "epoch": 0.3306612057624636, + "grad_norm": 0.8828125, + "learning_rate": 0.00019409697796408641, + "loss": 4.4403, + "step": 3189 + }, + { + "epoch": 0.3307648938169517, + "grad_norm": 1.09375, + "learning_rate": 0.0001940933006247781, + "loss": 4.467, + "step": 3190 + }, + { + "epoch": 0.33086858187143975, + "grad_norm": 0.9375, + "learning_rate": 0.00019408962217526833, + "loss": 4.4594, + "step": 3191 + }, + { + "epoch": 0.3309722699259278, + "grad_norm": 0.94921875, + "learning_rate": 0.00019408594261560044, + "loss": 4.4412, + "step": 3192 + }, + { + "epoch": 0.3310759579804159, + "grad_norm": 0.93359375, + "learning_rate": 0.0001940822619458179, + "loss": 4.4959, + "step": 3193 + }, + { + "epoch": 0.331179646034904, + "grad_norm": 0.8125, + "learning_rate": 0.0001940785801659641, + "loss": 4.4427, + "step": 3194 + }, + { + "epoch": 0.3312833340893921, + "grad_norm": 0.9921875, + "learning_rate": 0.0001940748972760825, + "loss": 4.4779, + "step": 3195 + }, + { + "epoch": 0.33138702214388016, + "grad_norm": 1.15625, + "learning_rate": 0.00019407121327621657, + "loss": 4.4233, + "step": 3196 + }, + { + "epoch": 0.33149071019836823, + "grad_norm": 0.640625, + "learning_rate": 0.00019406752816640977, + "loss": 4.443, + "step": 3197 + }, + { + "epoch": 0.3315943982528563, + "grad_norm": 1.1640625, + "learning_rate": 0.00019406384194670554, + "loss": 4.4023, + "step": 3198 + }, + { + "epoch": 0.3316980863073444, + "grad_norm": 1.0078125, + "learning_rate": 0.00019406015461714745, + "loss": 4.4119, + "step": 3199 + }, + { + "epoch": 0.33180177436183245, + "grad_norm": 0.90625, + "learning_rate": 0.00019405646617777893, + "loss": 4.4525, + "step": 3200 + }, + { + "epoch": 0.3319054624163205, + "grad_norm": 0.796875, + "learning_rate": 0.0001940527766286435, + "loss": 4.4334, + "step": 3201 + }, + { + "epoch": 0.3320091504708086, + "grad_norm": 0.98828125, + "learning_rate": 0.00019404908596978477, + "loss": 4.4435, + "step": 3202 + }, + { + "epoch": 0.33211283852529666, + "grad_norm": 0.87890625, + "learning_rate": 0.00019404539420124622, + "loss": 4.4833, + "step": 3203 + }, + { + "epoch": 0.33221652657978473, + "grad_norm": 0.91796875, + "learning_rate": 0.00019404170132307144, + "loss": 4.4724, + "step": 3204 + }, + { + "epoch": 0.3323202146342728, + "grad_norm": 0.6640625, + "learning_rate": 0.000194038007335304, + "loss": 4.4751, + "step": 3205 + }, + { + "epoch": 0.3324239026887609, + "grad_norm": 0.83203125, + "learning_rate": 0.00019403431223798747, + "loss": 4.4767, + "step": 3206 + }, + { + "epoch": 0.33252759074324895, + "grad_norm": 0.84375, + "learning_rate": 0.00019403061603116543, + "loss": 4.4561, + "step": 3207 + }, + { + "epoch": 0.332631278797737, + "grad_norm": 0.7265625, + "learning_rate": 0.00019402691871488154, + "loss": 4.4188, + "step": 3208 + }, + { + "epoch": 0.3327349668522251, + "grad_norm": 0.80859375, + "learning_rate": 0.0001940232202891794, + "loss": 4.4851, + "step": 3209 + }, + { + "epoch": 0.33283865490671316, + "grad_norm": 0.7265625, + "learning_rate": 0.00019401952075410263, + "loss": 4.4548, + "step": 3210 + }, + { + "epoch": 0.33294234296120123, + "grad_norm": 0.68359375, + "learning_rate": 0.00019401582010969494, + "loss": 4.405, + "step": 3211 + }, + { + "epoch": 0.3330460310156893, + "grad_norm": 0.703125, + "learning_rate": 0.00019401211835599989, + "loss": 4.439, + "step": 3212 + }, + { + "epoch": 0.3331497190701774, + "grad_norm": 0.84375, + "learning_rate": 0.00019400841549306125, + "loss": 4.4627, + "step": 3213 + }, + { + "epoch": 0.33325340712466545, + "grad_norm": 0.8046875, + "learning_rate": 0.00019400471152092267, + "loss": 4.4395, + "step": 3214 + }, + { + "epoch": 0.3333570951791535, + "grad_norm": 0.74609375, + "learning_rate": 0.00019400100643962787, + "loss": 4.4621, + "step": 3215 + }, + { + "epoch": 0.3334607832336416, + "grad_norm": 0.88671875, + "learning_rate": 0.00019399730024922057, + "loss": 4.4411, + "step": 3216 + }, + { + "epoch": 0.33356447128812966, + "grad_norm": 0.96875, + "learning_rate": 0.00019399359294974445, + "loss": 4.4311, + "step": 3217 + }, + { + "epoch": 0.33366815934261773, + "grad_norm": 1.0546875, + "learning_rate": 0.00019398988454124333, + "loss": 4.4267, + "step": 3218 + }, + { + "epoch": 0.3337718473971058, + "grad_norm": 0.8359375, + "learning_rate": 0.00019398617502376092, + "loss": 4.4588, + "step": 3219 + }, + { + "epoch": 0.3338755354515939, + "grad_norm": 0.796875, + "learning_rate": 0.00019398246439734096, + "loss": 4.4264, + "step": 3220 + }, + { + "epoch": 0.33397922350608195, + "grad_norm": 0.94921875, + "learning_rate": 0.00019397875266202727, + "loss": 4.4731, + "step": 3221 + }, + { + "epoch": 0.33408291156057, + "grad_norm": 1.109375, + "learning_rate": 0.00019397503981786365, + "loss": 4.4846, + "step": 3222 + }, + { + "epoch": 0.3341865996150581, + "grad_norm": 0.69921875, + "learning_rate": 0.0001939713258648939, + "loss": 4.3899, + "step": 3223 + }, + { + "epoch": 0.33429028766954616, + "grad_norm": 0.88671875, + "learning_rate": 0.00019396761080316184, + "loss": 4.4881, + "step": 3224 + }, + { + "epoch": 0.33439397572403423, + "grad_norm": 1.2421875, + "learning_rate": 0.00019396389463271127, + "loss": 4.4176, + "step": 3225 + }, + { + "epoch": 0.3344976637785223, + "grad_norm": 0.73046875, + "learning_rate": 0.0001939601773535861, + "loss": 4.409, + "step": 3226 + }, + { + "epoch": 0.3346013518330104, + "grad_norm": 0.91796875, + "learning_rate": 0.00019395645896583017, + "loss": 4.4441, + "step": 3227 + }, + { + "epoch": 0.33470503988749845, + "grad_norm": 1.3125, + "learning_rate": 0.0001939527394694873, + "loss": 4.4747, + "step": 3228 + }, + { + "epoch": 0.3348087279419865, + "grad_norm": 0.84375, + "learning_rate": 0.00019394901886460143, + "loss": 4.4516, + "step": 3229 + }, + { + "epoch": 0.3349124159964746, + "grad_norm": 1.015625, + "learning_rate": 0.00019394529715121645, + "loss": 4.4232, + "step": 3230 + }, + { + "epoch": 0.33501610405096266, + "grad_norm": 1.21875, + "learning_rate": 0.00019394157432937629, + "loss": 4.4171, + "step": 3231 + }, + { + "epoch": 0.33511979210545073, + "grad_norm": 0.8125, + "learning_rate": 0.00019393785039912483, + "loss": 4.4399, + "step": 3232 + }, + { + "epoch": 0.3352234801599388, + "grad_norm": 1.359375, + "learning_rate": 0.00019393412536050604, + "loss": 4.4708, + "step": 3233 + }, + { + "epoch": 0.3353271682144269, + "grad_norm": 0.80078125, + "learning_rate": 0.00019393039921356385, + "loss": 4.4745, + "step": 3234 + }, + { + "epoch": 0.33543085626891495, + "grad_norm": 1.3125, + "learning_rate": 0.0001939266719583422, + "loss": 4.4292, + "step": 3235 + }, + { + "epoch": 0.335534544323403, + "grad_norm": 0.7265625, + "learning_rate": 0.0001939229435948852, + "loss": 4.4479, + "step": 3236 + }, + { + "epoch": 0.3356382323778911, + "grad_norm": 1.421875, + "learning_rate": 0.0001939192141232367, + "loss": 4.4192, + "step": 3237 + }, + { + "epoch": 0.33574192043237916, + "grad_norm": 0.88671875, + "learning_rate": 0.00019391548354344074, + "loss": 4.4338, + "step": 3238 + }, + { + "epoch": 0.3358456084868673, + "grad_norm": 1.6796875, + "learning_rate": 0.00019391175185554135, + "loss": 4.4788, + "step": 3239 + }, + { + "epoch": 0.33594929654135536, + "grad_norm": 1.0703125, + "learning_rate": 0.00019390801905958257, + "loss": 4.4474, + "step": 3240 + }, + { + "epoch": 0.33605298459584343, + "grad_norm": 2.5, + "learning_rate": 0.00019390428515560841, + "loss": 4.4906, + "step": 3241 + }, + { + "epoch": 0.3361566726503315, + "grad_norm": 2.203125, + "learning_rate": 0.00019390055014366296, + "loss": 4.4659, + "step": 3242 + }, + { + "epoch": 0.33626036070481957, + "grad_norm": 1.53125, + "learning_rate": 0.00019389681402379028, + "loss": 4.4293, + "step": 3243 + }, + { + "epoch": 0.33636404875930764, + "grad_norm": 1.640625, + "learning_rate": 0.00019389307679603442, + "loss": 4.4542, + "step": 3244 + }, + { + "epoch": 0.3364677368137957, + "grad_norm": 1.375, + "learning_rate": 0.00019388933846043954, + "loss": 4.4636, + "step": 3245 + }, + { + "epoch": 0.3365714248682838, + "grad_norm": 1.3671875, + "learning_rate": 0.0001938855990170497, + "loss": 4.4272, + "step": 3246 + }, + { + "epoch": 0.33667511292277186, + "grad_norm": 1.1796875, + "learning_rate": 0.00019388185846590903, + "loss": 4.4256, + "step": 3247 + }, + { + "epoch": 0.33677880097725993, + "grad_norm": 1.546875, + "learning_rate": 0.00019387811680706167, + "loss": 4.4195, + "step": 3248 + }, + { + "epoch": 0.336882489031748, + "grad_norm": 1.1953125, + "learning_rate": 0.00019387437404055175, + "loss": 4.4388, + "step": 3249 + }, + { + "epoch": 0.33698617708623607, + "grad_norm": 2.046875, + "learning_rate": 0.00019387063016642345, + "loss": 4.4516, + "step": 3250 + }, + { + "epoch": 0.33708986514072414, + "grad_norm": 1.734375, + "learning_rate": 0.00019386688518472096, + "loss": 4.4562, + "step": 3251 + }, + { + "epoch": 0.3371935531952122, + "grad_norm": 1.9453125, + "learning_rate": 0.00019386313909548842, + "loss": 4.4315, + "step": 3252 + }, + { + "epoch": 0.3372972412497003, + "grad_norm": 1.625, + "learning_rate": 0.00019385939189877008, + "loss": 4.4577, + "step": 3253 + }, + { + "epoch": 0.33740092930418836, + "grad_norm": 2.328125, + "learning_rate": 0.00019385564359461013, + "loss": 4.4885, + "step": 3254 + }, + { + "epoch": 0.33750461735867643, + "grad_norm": 1.8828125, + "learning_rate": 0.00019385189418305275, + "loss": 4.4272, + "step": 3255 + }, + { + "epoch": 0.3376083054131645, + "grad_norm": 2.3125, + "learning_rate": 0.00019384814366414231, + "loss": 4.4444, + "step": 3256 + }, + { + "epoch": 0.33771199346765257, + "grad_norm": 2.28125, + "learning_rate": 0.00019384439203792291, + "loss": 4.4779, + "step": 3257 + }, + { + "epoch": 0.33781568152214064, + "grad_norm": 1.1796875, + "learning_rate": 0.00019384063930443887, + "loss": 4.4524, + "step": 3258 + }, + { + "epoch": 0.3379193695766287, + "grad_norm": 1.3359375, + "learning_rate": 0.00019383688546373453, + "loss": 4.4602, + "step": 3259 + }, + { + "epoch": 0.3380230576311168, + "grad_norm": 1.2890625, + "learning_rate": 0.0001938331305158541, + "loss": 4.4119, + "step": 3260 + }, + { + "epoch": 0.33812674568560486, + "grad_norm": 1.1796875, + "learning_rate": 0.00019382937446084194, + "loss": 4.4518, + "step": 3261 + }, + { + "epoch": 0.33823043374009293, + "grad_norm": 1.25, + "learning_rate": 0.00019382561729874232, + "loss": 4.4152, + "step": 3262 + }, + { + "epoch": 0.338334121794581, + "grad_norm": 1.0078125, + "learning_rate": 0.00019382185902959962, + "loss": 4.4262, + "step": 3263 + }, + { + "epoch": 0.33843780984906907, + "grad_norm": 1.453125, + "learning_rate": 0.00019381809965345813, + "loss": 4.4535, + "step": 3264 + }, + { + "epoch": 0.33854149790355714, + "grad_norm": 1.046875, + "learning_rate": 0.00019381433917036223, + "loss": 4.4597, + "step": 3265 + }, + { + "epoch": 0.3386451859580452, + "grad_norm": 2.453125, + "learning_rate": 0.0001938105775803563, + "loss": 4.4852, + "step": 3266 + }, + { + "epoch": 0.3387488740125333, + "grad_norm": 2.234375, + "learning_rate": 0.00019380681488348473, + "loss": 4.4376, + "step": 3267 + }, + { + "epoch": 0.33885256206702136, + "grad_norm": 1.6875, + "learning_rate": 0.00019380305107979191, + "loss": 4.4622, + "step": 3268 + }, + { + "epoch": 0.33895625012150943, + "grad_norm": 1.6484375, + "learning_rate": 0.0001937992861693222, + "loss": 4.4553, + "step": 3269 + }, + { + "epoch": 0.3390599381759975, + "grad_norm": 1.640625, + "learning_rate": 0.0001937955201521201, + "loss": 4.4485, + "step": 3270 + }, + { + "epoch": 0.33916362623048557, + "grad_norm": 1.4609375, + "learning_rate": 0.00019379175302823, + "loss": 4.4232, + "step": 3271 + }, + { + "epoch": 0.33926731428497364, + "grad_norm": 1.7890625, + "learning_rate": 0.00019378798479769636, + "loss": 4.4852, + "step": 3272 + }, + { + "epoch": 0.3393710023394617, + "grad_norm": 1.609375, + "learning_rate": 0.00019378421546056363, + "loss": 4.4148, + "step": 3273 + }, + { + "epoch": 0.3394746903939498, + "grad_norm": 1.828125, + "learning_rate": 0.0001937804450168763, + "loss": 4.4718, + "step": 3274 + }, + { + "epoch": 0.33957837844843786, + "grad_norm": 1.5546875, + "learning_rate": 0.00019377667346667885, + "loss": 4.4646, + "step": 3275 + }, + { + "epoch": 0.33968206650292593, + "grad_norm": 1.9453125, + "learning_rate": 0.00019377290081001576, + "loss": 4.4659, + "step": 3276 + }, + { + "epoch": 0.339785754557414, + "grad_norm": 1.7265625, + "learning_rate": 0.0001937691270469316, + "loss": 4.4167, + "step": 3277 + }, + { + "epoch": 0.33988944261190207, + "grad_norm": 1.71875, + "learning_rate": 0.0001937653521774708, + "loss": 4.4329, + "step": 3278 + }, + { + "epoch": 0.33999313066639014, + "grad_norm": 1.4921875, + "learning_rate": 0.000193761576201678, + "loss": 4.4596, + "step": 3279 + }, + { + "epoch": 0.3400968187208782, + "grad_norm": 1.8984375, + "learning_rate": 0.0001937577991195977, + "loss": 4.4565, + "step": 3280 + }, + { + "epoch": 0.3402005067753663, + "grad_norm": 1.703125, + "learning_rate": 0.0001937540209312745, + "loss": 4.4684, + "step": 3281 + }, + { + "epoch": 0.34030419482985436, + "grad_norm": 2.0, + "learning_rate": 0.00019375024163675292, + "loss": 4.4353, + "step": 3282 + }, + { + "epoch": 0.3404078828843425, + "grad_norm": 1.8828125, + "learning_rate": 0.00019374646123607764, + "loss": 4.444, + "step": 3283 + }, + { + "epoch": 0.34051157093883055, + "grad_norm": 1.65625, + "learning_rate": 0.00019374267972929317, + "loss": 4.4671, + "step": 3284 + }, + { + "epoch": 0.3406152589933186, + "grad_norm": 1.5234375, + "learning_rate": 0.00019373889711644417, + "loss": 4.4257, + "step": 3285 + }, + { + "epoch": 0.3407189470478067, + "grad_norm": 1.7265625, + "learning_rate": 0.0001937351133975753, + "loss": 4.4502, + "step": 3286 + }, + { + "epoch": 0.34082263510229477, + "grad_norm": 1.4921875, + "learning_rate": 0.00019373132857273114, + "loss": 4.4666, + "step": 3287 + }, + { + "epoch": 0.34092632315678284, + "grad_norm": 2.046875, + "learning_rate": 0.0001937275426419564, + "loss": 4.4717, + "step": 3288 + }, + { + "epoch": 0.3410300112112709, + "grad_norm": 1.859375, + "learning_rate": 0.00019372375560529567, + "loss": 4.4035, + "step": 3289 + }, + { + "epoch": 0.341133699265759, + "grad_norm": 1.890625, + "learning_rate": 0.00019371996746279376, + "loss": 4.4463, + "step": 3290 + }, + { + "epoch": 0.34123738732024705, + "grad_norm": 1.71875, + "learning_rate": 0.0001937161782144953, + "loss": 4.4601, + "step": 3291 + }, + { + "epoch": 0.3413410753747351, + "grad_norm": 1.890625, + "learning_rate": 0.00019371238786044498, + "loss": 4.4237, + "step": 3292 + }, + { + "epoch": 0.3414447634292232, + "grad_norm": 1.7578125, + "learning_rate": 0.00019370859640068755, + "loss": 4.4393, + "step": 3293 + }, + { + "epoch": 0.34154845148371127, + "grad_norm": 1.65625, + "learning_rate": 0.00019370480383526774, + "loss": 4.461, + "step": 3294 + }, + { + "epoch": 0.34165213953819934, + "grad_norm": 1.4921875, + "learning_rate": 0.00019370101016423028, + "loss": 4.4328, + "step": 3295 + }, + { + "epoch": 0.3417558275926874, + "grad_norm": 1.8046875, + "learning_rate": 0.00019369721538761996, + "loss": 4.4228, + "step": 3296 + }, + { + "epoch": 0.3418595156471755, + "grad_norm": 1.6015625, + "learning_rate": 0.00019369341950548153, + "loss": 4.4922, + "step": 3297 + }, + { + "epoch": 0.34196320370166355, + "grad_norm": 1.78125, + "learning_rate": 0.0001936896225178598, + "loss": 4.443, + "step": 3298 + }, + { + "epoch": 0.3420668917561516, + "grad_norm": 1.625, + "learning_rate": 0.00019368582442479953, + "loss": 4.4383, + "step": 3299 + }, + { + "epoch": 0.3421705798106397, + "grad_norm": 1.90625, + "learning_rate": 0.0001936820252263456, + "loss": 4.4737, + "step": 3300 + }, + { + "epoch": 0.34227426786512777, + "grad_norm": 1.828125, + "learning_rate": 0.0001936782249225428, + "loss": 4.4784, + "step": 3301 + }, + { + "epoch": 0.34237795591961584, + "grad_norm": 1.71875, + "learning_rate": 0.00019367442351343593, + "loss": 4.4926, + "step": 3302 + }, + { + "epoch": 0.3424816439741039, + "grad_norm": 1.5625, + "learning_rate": 0.0001936706209990699, + "loss": 4.4373, + "step": 3303 + }, + { + "epoch": 0.342585332028592, + "grad_norm": 1.6328125, + "learning_rate": 0.00019366681737948956, + "loss": 4.4738, + "step": 3304 + }, + { + "epoch": 0.34268902008308005, + "grad_norm": 1.453125, + "learning_rate": 0.00019366301265473978, + "loss": 4.4515, + "step": 3305 + }, + { + "epoch": 0.3427927081375681, + "grad_norm": 1.84375, + "learning_rate": 0.00019365920682486547, + "loss": 4.4219, + "step": 3306 + }, + { + "epoch": 0.3428963961920562, + "grad_norm": 1.6015625, + "learning_rate": 0.0001936553998899115, + "loss": 4.4172, + "step": 3307 + }, + { + "epoch": 0.34300008424654427, + "grad_norm": 2.09375, + "learning_rate": 0.00019365159184992284, + "loss": 4.4943, + "step": 3308 + }, + { + "epoch": 0.34310377230103234, + "grad_norm": 1.890625, + "learning_rate": 0.00019364778270494437, + "loss": 4.4547, + "step": 3309 + }, + { + "epoch": 0.3432074603555204, + "grad_norm": 1.7109375, + "learning_rate": 0.00019364397245502107, + "loss": 4.4844, + "step": 3310 + }, + { + "epoch": 0.3433111484100085, + "grad_norm": 1.515625, + "learning_rate": 0.00019364016110019785, + "loss": 4.4454, + "step": 3311 + }, + { + "epoch": 0.34341483646449655, + "grad_norm": 1.796875, + "learning_rate": 0.0001936363486405197, + "loss": 4.4257, + "step": 3312 + }, + { + "epoch": 0.3435185245189846, + "grad_norm": 1.53125, + "learning_rate": 0.00019363253507603164, + "loss": 4.4496, + "step": 3313 + }, + { + "epoch": 0.3436222125734727, + "grad_norm": 2.125, + "learning_rate": 0.00019362872040677866, + "loss": 4.4632, + "step": 3314 + }, + { + "epoch": 0.34372590062796077, + "grad_norm": 1.9921875, + "learning_rate": 0.00019362490463280572, + "loss": 4.4439, + "step": 3315 + }, + { + "epoch": 0.34382958868244884, + "grad_norm": 1.5078125, + "learning_rate": 0.0001936210877541579, + "loss": 4.5136, + "step": 3316 + }, + { + "epoch": 0.3439332767369369, + "grad_norm": 1.4921875, + "learning_rate": 0.00019361726977088018, + "loss": 4.435, + "step": 3317 + }, + { + "epoch": 0.344036964791425, + "grad_norm": 1.5625, + "learning_rate": 0.00019361345068301763, + "loss": 4.4213, + "step": 3318 + }, + { + "epoch": 0.34414065284591305, + "grad_norm": 1.3359375, + "learning_rate": 0.00019360963049061533, + "loss": 4.4375, + "step": 3319 + }, + { + "epoch": 0.3442443409004011, + "grad_norm": 1.796875, + "learning_rate": 0.00019360580919371834, + "loss": 4.449, + "step": 3320 + }, + { + "epoch": 0.3443480289548892, + "grad_norm": 1.6015625, + "learning_rate": 0.00019360198679237172, + "loss": 4.4631, + "step": 3321 + }, + { + "epoch": 0.34445171700937727, + "grad_norm": 1.625, + "learning_rate": 0.00019359816328662065, + "loss": 4.4259, + "step": 3322 + }, + { + "epoch": 0.34455540506386534, + "grad_norm": 1.5234375, + "learning_rate": 0.00019359433867651018, + "loss": 4.4103, + "step": 3323 + }, + { + "epoch": 0.3446590931183534, + "grad_norm": 1.640625, + "learning_rate": 0.0001935905129620854, + "loss": 4.4288, + "step": 3324 + }, + { + "epoch": 0.3447627811728415, + "grad_norm": 1.4765625, + "learning_rate": 0.00019358668614339152, + "loss": 4.47, + "step": 3325 + }, + { + "epoch": 0.34486646922732955, + "grad_norm": 1.8125, + "learning_rate": 0.0001935828582204737, + "loss": 4.4248, + "step": 3326 + }, + { + "epoch": 0.3449701572818176, + "grad_norm": 1.6640625, + "learning_rate": 0.00019357902919337706, + "loss": 4.3995, + "step": 3327 + }, + { + "epoch": 0.34507384533630575, + "grad_norm": 1.5234375, + "learning_rate": 0.00019357519906214676, + "loss": 4.4162, + "step": 3328 + }, + { + "epoch": 0.3451775333907938, + "grad_norm": 1.375, + "learning_rate": 0.00019357136782682804, + "loss": 4.4012, + "step": 3329 + }, + { + "epoch": 0.3452812214452819, + "grad_norm": 1.6953125, + "learning_rate": 0.00019356753548746612, + "loss": 4.4357, + "step": 3330 + }, + { + "epoch": 0.34538490949976997, + "grad_norm": 1.546875, + "learning_rate": 0.00019356370204410615, + "loss": 4.441, + "step": 3331 + }, + { + "epoch": 0.34548859755425804, + "grad_norm": 1.6953125, + "learning_rate": 0.00019355986749679342, + "loss": 4.4196, + "step": 3332 + }, + { + "epoch": 0.3455922856087461, + "grad_norm": 1.5078125, + "learning_rate": 0.00019355603184557314, + "loss": 4.4693, + "step": 3333 + }, + { + "epoch": 0.3456959736632342, + "grad_norm": 1.671875, + "learning_rate": 0.00019355219509049058, + "loss": 4.481, + "step": 3334 + }, + { + "epoch": 0.34579966171772225, + "grad_norm": 1.546875, + "learning_rate": 0.000193548357231591, + "loss": 4.4777, + "step": 3335 + }, + { + "epoch": 0.3459033497722103, + "grad_norm": 1.7890625, + "learning_rate": 0.00019354451826891967, + "loss": 4.4111, + "step": 3336 + }, + { + "epoch": 0.3460070378266984, + "grad_norm": 1.6171875, + "learning_rate": 0.00019354067820252194, + "loss": 4.4399, + "step": 3337 + }, + { + "epoch": 0.34611072588118647, + "grad_norm": 1.671875, + "learning_rate": 0.00019353683703244307, + "loss": 4.4452, + "step": 3338 + }, + { + "epoch": 0.34621441393567454, + "grad_norm": 1.609375, + "learning_rate": 0.0001935329947587284, + "loss": 4.4372, + "step": 3339 + }, + { + "epoch": 0.3463181019901626, + "grad_norm": 1.6953125, + "learning_rate": 0.00019352915138142325, + "loss": 4.4171, + "step": 3340 + }, + { + "epoch": 0.3464217900446507, + "grad_norm": 1.5, + "learning_rate": 0.000193525306900573, + "loss": 4.4561, + "step": 3341 + }, + { + "epoch": 0.34652547809913875, + "grad_norm": 1.5546875, + "learning_rate": 0.00019352146131622298, + "loss": 4.4123, + "step": 3342 + }, + { + "epoch": 0.3466291661536268, + "grad_norm": 1.4375, + "learning_rate": 0.00019351761462841857, + "loss": 4.4771, + "step": 3343 + }, + { + "epoch": 0.3467328542081149, + "grad_norm": 1.609375, + "learning_rate": 0.00019351376683720515, + "loss": 4.4315, + "step": 3344 + }, + { + "epoch": 0.34683654226260296, + "grad_norm": 1.4609375, + "learning_rate": 0.00019350991794262813, + "loss": 4.4284, + "step": 3345 + }, + { + "epoch": 0.34694023031709104, + "grad_norm": 1.703125, + "learning_rate": 0.00019350606794473293, + "loss": 4.4584, + "step": 3346 + }, + { + "epoch": 0.3470439183715791, + "grad_norm": 1.5625, + "learning_rate": 0.000193502216843565, + "loss": 4.3793, + "step": 3347 + }, + { + "epoch": 0.3471476064260672, + "grad_norm": 1.53125, + "learning_rate": 0.0001934983646391697, + "loss": 4.4539, + "step": 3348 + }, + { + "epoch": 0.34725129448055525, + "grad_norm": 1.4140625, + "learning_rate": 0.00019349451133159255, + "loss": 4.458, + "step": 3349 + }, + { + "epoch": 0.3473549825350433, + "grad_norm": 1.4375, + "learning_rate": 0.000193490656920879, + "loss": 4.4281, + "step": 3350 + }, + { + "epoch": 0.3474586705895314, + "grad_norm": 1.328125, + "learning_rate": 0.0001934868014070745, + "loss": 4.4403, + "step": 3351 + }, + { + "epoch": 0.34756235864401946, + "grad_norm": 1.5, + "learning_rate": 0.00019348294479022457, + "loss": 4.4345, + "step": 3352 + }, + { + "epoch": 0.34766604669850754, + "grad_norm": 1.3828125, + "learning_rate": 0.0001934790870703747, + "loss": 4.4486, + "step": 3353 + }, + { + "epoch": 0.3477697347529956, + "grad_norm": 1.59375, + "learning_rate": 0.00019347522824757042, + "loss": 4.4779, + "step": 3354 + }, + { + "epoch": 0.3478734228074837, + "grad_norm": 1.4375, + "learning_rate": 0.00019347136832185727, + "loss": 4.4556, + "step": 3355 + }, + { + "epoch": 0.34797711086197175, + "grad_norm": 1.5234375, + "learning_rate": 0.00019346750729328077, + "loss": 4.4484, + "step": 3356 + }, + { + "epoch": 0.3480807989164598, + "grad_norm": 1.3125, + "learning_rate": 0.00019346364516188648, + "loss": 4.4024, + "step": 3357 + }, + { + "epoch": 0.3481844869709479, + "grad_norm": 1.859375, + "learning_rate": 0.00019345978192772, + "loss": 4.4108, + "step": 3358 + }, + { + "epoch": 0.34828817502543596, + "grad_norm": 1.7109375, + "learning_rate": 0.00019345591759082684, + "loss": 4.4214, + "step": 3359 + }, + { + "epoch": 0.34839186307992404, + "grad_norm": 1.7109375, + "learning_rate": 0.00019345205215125265, + "loss": 4.4519, + "step": 3360 + }, + { + "epoch": 0.3484955511344121, + "grad_norm": 1.5546875, + "learning_rate": 0.00019344818560904306, + "loss": 4.4294, + "step": 3361 + }, + { + "epoch": 0.3485992391889002, + "grad_norm": 1.625, + "learning_rate": 0.00019344431796424364, + "loss": 4.4023, + "step": 3362 + }, + { + "epoch": 0.34870292724338825, + "grad_norm": 1.4453125, + "learning_rate": 0.0001934404492169, + "loss": 4.4062, + "step": 3363 + }, + { + "epoch": 0.3488066152978763, + "grad_norm": 1.7734375, + "learning_rate": 0.0001934365793670579, + "loss": 4.4425, + "step": 3364 + }, + { + "epoch": 0.3489103033523644, + "grad_norm": 1.6484375, + "learning_rate": 0.0001934327084147629, + "loss": 4.4259, + "step": 3365 + }, + { + "epoch": 0.34901399140685246, + "grad_norm": 1.453125, + "learning_rate": 0.0001934288363600607, + "loss": 4.4137, + "step": 3366 + }, + { + "epoch": 0.34911767946134054, + "grad_norm": 1.40625, + "learning_rate": 0.000193424963202997, + "loss": 4.4226, + "step": 3367 + }, + { + "epoch": 0.3492213675158286, + "grad_norm": 1.3984375, + "learning_rate": 0.0001934210889436175, + "loss": 4.4478, + "step": 3368 + }, + { + "epoch": 0.3493250555703167, + "grad_norm": 1.2265625, + "learning_rate": 0.00019341721358196785, + "loss": 4.4312, + "step": 3369 + }, + { + "epoch": 0.34942874362480475, + "grad_norm": 1.6875, + "learning_rate": 0.00019341333711809386, + "loss": 4.4476, + "step": 3370 + }, + { + "epoch": 0.3495324316792928, + "grad_norm": 1.3828125, + "learning_rate": 0.00019340945955204121, + "loss": 4.4335, + "step": 3371 + }, + { + "epoch": 0.3496361197337809, + "grad_norm": 2.0, + "learning_rate": 0.0001934055808838557, + "loss": 4.4462, + "step": 3372 + }, + { + "epoch": 0.349739807788269, + "grad_norm": 1.9375, + "learning_rate": 0.0001934017011135831, + "loss": 4.4577, + "step": 3373 + }, + { + "epoch": 0.3498434958427571, + "grad_norm": 1.28125, + "learning_rate": 0.00019339782024126908, + "loss": 4.3599, + "step": 3374 + }, + { + "epoch": 0.34994718389724516, + "grad_norm": 1.359375, + "learning_rate": 0.00019339393826695958, + "loss": 4.4257, + "step": 3375 + }, + { + "epoch": 0.35005087195173323, + "grad_norm": 1.4296875, + "learning_rate": 0.00019339005519070028, + "loss": 4.443, + "step": 3376 + }, + { + "epoch": 0.3501545600062213, + "grad_norm": 1.078125, + "learning_rate": 0.0001933861710125371, + "loss": 4.4858, + "step": 3377 + }, + { + "epoch": 0.3502582480607094, + "grad_norm": 1.9921875, + "learning_rate": 0.00019338228573251575, + "loss": 4.4624, + "step": 3378 + }, + { + "epoch": 0.35036193611519745, + "grad_norm": 1.8671875, + "learning_rate": 0.00019337839935068218, + "loss": 4.4479, + "step": 3379 + }, + { + "epoch": 0.3504656241696855, + "grad_norm": 1.453125, + "learning_rate": 0.00019337451186708218, + "loss": 4.4735, + "step": 3380 + }, + { + "epoch": 0.3505693122241736, + "grad_norm": 1.359375, + "learning_rate": 0.00019337062328176165, + "loss": 4.4579, + "step": 3381 + }, + { + "epoch": 0.35067300027866166, + "grad_norm": 1.609375, + "learning_rate": 0.00019336673359476647, + "loss": 4.4533, + "step": 3382 + }, + { + "epoch": 0.35077668833314973, + "grad_norm": 1.3671875, + "learning_rate": 0.0001933628428061425, + "loss": 4.4427, + "step": 3383 + }, + { + "epoch": 0.3508803763876378, + "grad_norm": 1.96875, + "learning_rate": 0.00019335895091593573, + "loss": 4.4462, + "step": 3384 + }, + { + "epoch": 0.3509840644421259, + "grad_norm": 1.7734375, + "learning_rate": 0.00019335505792419198, + "loss": 4.4379, + "step": 3385 + }, + { + "epoch": 0.35108775249661395, + "grad_norm": 1.515625, + "learning_rate": 0.00019335116383095724, + "loss": 4.4643, + "step": 3386 + }, + { + "epoch": 0.351191440551102, + "grad_norm": 1.484375, + "learning_rate": 0.00019334726863627744, + "loss": 4.4121, + "step": 3387 + }, + { + "epoch": 0.3512951286055901, + "grad_norm": 1.5234375, + "learning_rate": 0.00019334337234019856, + "loss": 4.4721, + "step": 3388 + }, + { + "epoch": 0.35139881666007816, + "grad_norm": 1.3046875, + "learning_rate": 0.0001933394749427665, + "loss": 4.4425, + "step": 3389 + }, + { + "epoch": 0.35150250471456623, + "grad_norm": 1.796875, + "learning_rate": 0.00019333557644402735, + "loss": 4.4795, + "step": 3390 + }, + { + "epoch": 0.3516061927690543, + "grad_norm": 1.59375, + "learning_rate": 0.00019333167684402704, + "loss": 4.4531, + "step": 3391 + }, + { + "epoch": 0.3517098808235424, + "grad_norm": 1.9140625, + "learning_rate": 0.00019332777614281162, + "loss": 4.4808, + "step": 3392 + }, + { + "epoch": 0.35181356887803045, + "grad_norm": 1.828125, + "learning_rate": 0.00019332387434042706, + "loss": 4.4205, + "step": 3393 + }, + { + "epoch": 0.3519172569325185, + "grad_norm": 1.640625, + "learning_rate": 0.00019331997143691947, + "loss": 4.429, + "step": 3394 + }, + { + "epoch": 0.3520209449870066, + "grad_norm": 1.46875, + "learning_rate": 0.00019331606743233483, + "loss": 4.4505, + "step": 3395 + }, + { + "epoch": 0.35212463304149466, + "grad_norm": 1.546875, + "learning_rate": 0.00019331216232671924, + "loss": 4.4677, + "step": 3396 + }, + { + "epoch": 0.35222832109598273, + "grad_norm": 1.4296875, + "learning_rate": 0.00019330825612011877, + "loss": 4.428, + "step": 3397 + }, + { + "epoch": 0.3523320091504708, + "grad_norm": 1.59375, + "learning_rate": 0.0001933043488125795, + "loss": 4.4564, + "step": 3398 + }, + { + "epoch": 0.3524356972049589, + "grad_norm": 1.3125, + "learning_rate": 0.00019330044040414754, + "loss": 4.4115, + "step": 3399 + }, + { + "epoch": 0.35253938525944695, + "grad_norm": 1.4921875, + "learning_rate": 0.00019329653089486903, + "loss": 4.4736, + "step": 3400 + }, + { + "epoch": 0.352643073313935, + "grad_norm": 1.2265625, + "learning_rate": 0.00019329262028479005, + "loss": 4.4048, + "step": 3401 + }, + { + "epoch": 0.3527467613684231, + "grad_norm": 1.7265625, + "learning_rate": 0.0001932887085739568, + "loss": 4.4026, + "step": 3402 + }, + { + "epoch": 0.35285044942291116, + "grad_norm": 1.4609375, + "learning_rate": 0.00019328479576241535, + "loss": 4.4926, + "step": 3403 + }, + { + "epoch": 0.35295413747739923, + "grad_norm": 1.6484375, + "learning_rate": 0.00019328088185021196, + "loss": 4.4422, + "step": 3404 + }, + { + "epoch": 0.3530578255318873, + "grad_norm": 1.53125, + "learning_rate": 0.00019327696683739274, + "loss": 4.4698, + "step": 3405 + }, + { + "epoch": 0.3531615135863754, + "grad_norm": 1.4375, + "learning_rate": 0.00019327305072400393, + "loss": 4.4035, + "step": 3406 + }, + { + "epoch": 0.35326520164086345, + "grad_norm": 1.28125, + "learning_rate": 0.00019326913351009172, + "loss": 4.4461, + "step": 3407 + }, + { + "epoch": 0.3533688896953515, + "grad_norm": 1.515625, + "learning_rate": 0.0001932652151957023, + "loss": 4.4329, + "step": 3408 + }, + { + "epoch": 0.3534725777498396, + "grad_norm": 1.2578125, + "learning_rate": 0.00019326129578088193, + "loss": 4.4534, + "step": 3409 + }, + { + "epoch": 0.35357626580432766, + "grad_norm": 1.8671875, + "learning_rate": 0.00019325737526567683, + "loss": 4.457, + "step": 3410 + }, + { + "epoch": 0.35367995385881573, + "grad_norm": 1.6796875, + "learning_rate": 0.00019325345365013333, + "loss": 4.4506, + "step": 3411 + }, + { + "epoch": 0.3537836419133038, + "grad_norm": 1.59375, + "learning_rate": 0.0001932495309342976, + "loss": 4.4096, + "step": 3412 + }, + { + "epoch": 0.3538873299677919, + "grad_norm": 1.46875, + "learning_rate": 0.000193245607118216, + "loss": 4.4292, + "step": 3413 + }, + { + "epoch": 0.35399101802227995, + "grad_norm": 1.65625, + "learning_rate": 0.0001932416822019348, + "loss": 4.4447, + "step": 3414 + }, + { + "epoch": 0.354094706076768, + "grad_norm": 1.5234375, + "learning_rate": 0.00019323775618550033, + "loss": 4.4365, + "step": 3415 + }, + { + "epoch": 0.3541983941312561, + "grad_norm": 1.6328125, + "learning_rate": 0.00019323382906895887, + "loss": 4.4433, + "step": 3416 + }, + { + "epoch": 0.3543020821857442, + "grad_norm": 1.5078125, + "learning_rate": 0.00019322990085235677, + "loss": 4.3989, + "step": 3417 + }, + { + "epoch": 0.3544057702402323, + "grad_norm": 1.453125, + "learning_rate": 0.00019322597153574041, + "loss": 4.4656, + "step": 3418 + }, + { + "epoch": 0.35450945829472036, + "grad_norm": 1.3515625, + "learning_rate": 0.00019322204111915612, + "loss": 4.4606, + "step": 3419 + }, + { + "epoch": 0.35461314634920843, + "grad_norm": 1.6640625, + "learning_rate": 0.00019321810960265027, + "loss": 4.3985, + "step": 3420 + }, + { + "epoch": 0.3547168344036965, + "grad_norm": 1.3984375, + "learning_rate": 0.0001932141769862693, + "loss": 4.4343, + "step": 3421 + }, + { + "epoch": 0.3548205224581846, + "grad_norm": 1.765625, + "learning_rate": 0.00019321024327005953, + "loss": 4.4955, + "step": 3422 + }, + { + "epoch": 0.35492421051267264, + "grad_norm": 1.6640625, + "learning_rate": 0.00019320630845406743, + "loss": 4.4026, + "step": 3423 + }, + { + "epoch": 0.3550278985671607, + "grad_norm": 1.4453125, + "learning_rate": 0.00019320237253833944, + "loss": 4.4636, + "step": 3424 + }, + { + "epoch": 0.3551315866216488, + "grad_norm": 1.359375, + "learning_rate": 0.00019319843552292193, + "loss": 4.4602, + "step": 3425 + }, + { + "epoch": 0.35523527467613686, + "grad_norm": 1.484375, + "learning_rate": 0.00019319449740786145, + "loss": 4.4292, + "step": 3426 + }, + { + "epoch": 0.35533896273062493, + "grad_norm": 1.3515625, + "learning_rate": 0.00019319055819320437, + "loss": 4.4169, + "step": 3427 + }, + { + "epoch": 0.355442650785113, + "grad_norm": 1.546875, + "learning_rate": 0.00019318661787899724, + "loss": 4.4037, + "step": 3428 + }, + { + "epoch": 0.3555463388396011, + "grad_norm": 1.390625, + "learning_rate": 0.0001931826764652865, + "loss": 4.4441, + "step": 3429 + }, + { + "epoch": 0.35565002689408914, + "grad_norm": 1.5859375, + "learning_rate": 0.00019317873395211868, + "loss": 4.406, + "step": 3430 + }, + { + "epoch": 0.3557537149485772, + "grad_norm": 1.4375, + "learning_rate": 0.0001931747903395403, + "loss": 4.4601, + "step": 3431 + }, + { + "epoch": 0.3558574030030653, + "grad_norm": 1.6328125, + "learning_rate": 0.00019317084562759786, + "loss": 4.4696, + "step": 3432 + }, + { + "epoch": 0.35596109105755336, + "grad_norm": 1.46875, + "learning_rate": 0.00019316689981633796, + "loss": 4.4326, + "step": 3433 + }, + { + "epoch": 0.35606477911204143, + "grad_norm": 1.5625, + "learning_rate": 0.00019316295290580708, + "loss": 4.443, + "step": 3434 + }, + { + "epoch": 0.3561684671665295, + "grad_norm": 1.4375, + "learning_rate": 0.00019315900489605186, + "loss": 4.4226, + "step": 3435 + }, + { + "epoch": 0.3562721552210176, + "grad_norm": 1.4609375, + "learning_rate": 0.00019315505578711888, + "loss": 4.4494, + "step": 3436 + }, + { + "epoch": 0.35637584327550564, + "grad_norm": 1.390625, + "learning_rate": 0.0001931511055790547, + "loss": 4.4322, + "step": 3437 + }, + { + "epoch": 0.3564795313299937, + "grad_norm": 1.5859375, + "learning_rate": 0.00019314715427190592, + "loss": 4.3908, + "step": 3438 + }, + { + "epoch": 0.3565832193844818, + "grad_norm": 1.4296875, + "learning_rate": 0.00019314320186571919, + "loss": 4.4296, + "step": 3439 + }, + { + "epoch": 0.35668690743896986, + "grad_norm": 1.5625, + "learning_rate": 0.0001931392483605411, + "loss": 4.4108, + "step": 3440 + }, + { + "epoch": 0.35679059549345793, + "grad_norm": 1.46875, + "learning_rate": 0.0001931352937564184, + "loss": 4.4312, + "step": 3441 + }, + { + "epoch": 0.356894283547946, + "grad_norm": 1.53125, + "learning_rate": 0.00019313133805339762, + "loss": 4.4593, + "step": 3442 + }, + { + "epoch": 0.35699797160243407, + "grad_norm": 1.3203125, + "learning_rate": 0.00019312738125152554, + "loss": 4.3779, + "step": 3443 + }, + { + "epoch": 0.35710165965692214, + "grad_norm": 1.6171875, + "learning_rate": 0.00019312342335084878, + "loss": 4.3964, + "step": 3444 + }, + { + "epoch": 0.3572053477114102, + "grad_norm": 1.4765625, + "learning_rate": 0.00019311946435141407, + "loss": 4.4928, + "step": 3445 + }, + { + "epoch": 0.3573090357658983, + "grad_norm": 1.4296875, + "learning_rate": 0.0001931155042532681, + "loss": 4.4779, + "step": 3446 + }, + { + "epoch": 0.35741272382038636, + "grad_norm": 1.3671875, + "learning_rate": 0.00019311154305645762, + "loss": 4.4363, + "step": 3447 + }, + { + "epoch": 0.35751641187487443, + "grad_norm": 1.3515625, + "learning_rate": 0.00019310758076102933, + "loss": 4.4306, + "step": 3448 + }, + { + "epoch": 0.3576200999293625, + "grad_norm": 1.2734375, + "learning_rate": 0.00019310361736703003, + "loss": 4.4224, + "step": 3449 + }, + { + "epoch": 0.35772378798385057, + "grad_norm": 1.375, + "learning_rate": 0.00019309965287450645, + "loss": 4.4059, + "step": 3450 + }, + { + "epoch": 0.35782747603833864, + "grad_norm": 1.2734375, + "learning_rate": 0.00019309568728350537, + "loss": 4.506, + "step": 3451 + }, + { + "epoch": 0.3579311640928267, + "grad_norm": 1.46875, + "learning_rate": 0.0001930917205940736, + "loss": 4.4283, + "step": 3452 + }, + { + "epoch": 0.3580348521473148, + "grad_norm": 1.3125, + "learning_rate": 0.00019308775280625794, + "loss": 4.4682, + "step": 3453 + }, + { + "epoch": 0.35813854020180286, + "grad_norm": 1.3515625, + "learning_rate": 0.0001930837839201052, + "loss": 4.4591, + "step": 3454 + }, + { + "epoch": 0.35824222825629093, + "grad_norm": 1.2421875, + "learning_rate": 0.00019307981393566217, + "loss": 4.4646, + "step": 3455 + }, + { + "epoch": 0.358345916310779, + "grad_norm": 1.3515625, + "learning_rate": 0.00019307584285297572, + "loss": 4.3935, + "step": 3456 + }, + { + "epoch": 0.35844960436526707, + "grad_norm": 1.1875, + "learning_rate": 0.00019307187067209273, + "loss": 4.469, + "step": 3457 + }, + { + "epoch": 0.35855329241975514, + "grad_norm": 1.5078125, + "learning_rate": 0.00019306789739306006, + "loss": 4.4591, + "step": 3458 + }, + { + "epoch": 0.3586569804742432, + "grad_norm": 1.3515625, + "learning_rate": 0.00019306392301592456, + "loss": 4.4508, + "step": 3459 + }, + { + "epoch": 0.3587606685287313, + "grad_norm": 1.390625, + "learning_rate": 0.00019305994754073314, + "loss": 4.3816, + "step": 3460 + }, + { + "epoch": 0.35886435658321936, + "grad_norm": 1.2421875, + "learning_rate": 0.00019305597096753272, + "loss": 4.4473, + "step": 3461 + }, + { + "epoch": 0.3589680446377075, + "grad_norm": 1.4453125, + "learning_rate": 0.00019305199329637017, + "loss": 4.4264, + "step": 3462 + }, + { + "epoch": 0.35907173269219556, + "grad_norm": 1.3046875, + "learning_rate": 0.0001930480145272925, + "loss": 4.4503, + "step": 3463 + }, + { + "epoch": 0.3591754207466836, + "grad_norm": 1.484375, + "learning_rate": 0.0001930440346603466, + "loss": 4.4796, + "step": 3464 + }, + { + "epoch": 0.3592791088011717, + "grad_norm": 1.2734375, + "learning_rate": 0.00019304005369557946, + "loss": 4.4133, + "step": 3465 + }, + { + "epoch": 0.35938279685565977, + "grad_norm": 1.5703125, + "learning_rate": 0.00019303607163303804, + "loss": 4.4165, + "step": 3466 + }, + { + "epoch": 0.35948648491014784, + "grad_norm": 1.40625, + "learning_rate": 0.00019303208847276928, + "loss": 4.4471, + "step": 3467 + }, + { + "epoch": 0.3595901729646359, + "grad_norm": 1.5703125, + "learning_rate": 0.00019302810421482021, + "loss": 4.3831, + "step": 3468 + }, + { + "epoch": 0.359693861019124, + "grad_norm": 1.4375, + "learning_rate": 0.00019302411885923788, + "loss": 4.4317, + "step": 3469 + }, + { + "epoch": 0.35979754907361206, + "grad_norm": 1.7890625, + "learning_rate": 0.00019302013240606928, + "loss": 4.4411, + "step": 3470 + }, + { + "epoch": 0.3599012371281001, + "grad_norm": 1.6015625, + "learning_rate": 0.00019301614485536144, + "loss": 4.4209, + "step": 3471 + }, + { + "epoch": 0.3600049251825882, + "grad_norm": 1.5, + "learning_rate": 0.0001930121562071614, + "loss": 4.4295, + "step": 3472 + }, + { + "epoch": 0.36010861323707627, + "grad_norm": 1.3984375, + "learning_rate": 0.00019300816646151622, + "loss": 4.4055, + "step": 3473 + }, + { + "epoch": 0.36021230129156434, + "grad_norm": 1.5703125, + "learning_rate": 0.000193004175618473, + "loss": 4.4217, + "step": 3474 + }, + { + "epoch": 0.3603159893460524, + "grad_norm": 1.3984375, + "learning_rate": 0.00019300018367807885, + "loss": 4.4355, + "step": 3475 + }, + { + "epoch": 0.3604196774005405, + "grad_norm": 1.625, + "learning_rate": 0.0001929961906403808, + "loss": 4.425, + "step": 3476 + }, + { + "epoch": 0.36052336545502855, + "grad_norm": 1.4765625, + "learning_rate": 0.00019299219650542604, + "loss": 4.4662, + "step": 3477 + }, + { + "epoch": 0.3606270535095166, + "grad_norm": 1.5, + "learning_rate": 0.0001929882012732616, + "loss": 4.4452, + "step": 3478 + }, + { + "epoch": 0.3607307415640047, + "grad_norm": 1.453125, + "learning_rate": 0.00019298420494393473, + "loss": 4.4603, + "step": 3479 + }, + { + "epoch": 0.36083442961849277, + "grad_norm": 1.1875, + "learning_rate": 0.00019298020751749251, + "loss": 4.39, + "step": 3480 + }, + { + "epoch": 0.36093811767298084, + "grad_norm": 1.15625, + "learning_rate": 0.00019297620899398212, + "loss": 4.4747, + "step": 3481 + }, + { + "epoch": 0.3610418057274689, + "grad_norm": 1.3671875, + "learning_rate": 0.00019297220937345078, + "loss": 4.4274, + "step": 3482 + }, + { + "epoch": 0.361145493781957, + "grad_norm": 1.1640625, + "learning_rate": 0.00019296820865594562, + "loss": 4.4059, + "step": 3483 + }, + { + "epoch": 0.36124918183644505, + "grad_norm": 1.5859375, + "learning_rate": 0.00019296420684151384, + "loss": 4.3839, + "step": 3484 + }, + { + "epoch": 0.3613528698909331, + "grad_norm": 1.4609375, + "learning_rate": 0.00019296020393020272, + "loss": 4.4561, + "step": 3485 + }, + { + "epoch": 0.3614565579454212, + "grad_norm": 1.40625, + "learning_rate": 0.00019295619992205944, + "loss": 4.45, + "step": 3486 + }, + { + "epoch": 0.36156024599990927, + "grad_norm": 1.34375, + "learning_rate": 0.0001929521948171313, + "loss": 4.4206, + "step": 3487 + }, + { + "epoch": 0.36166393405439734, + "grad_norm": 1.4453125, + "learning_rate": 0.00019294818861546547, + "loss": 4.4488, + "step": 3488 + }, + { + "epoch": 0.3617676221088854, + "grad_norm": 1.296875, + "learning_rate": 0.00019294418131710926, + "loss": 4.4317, + "step": 3489 + }, + { + "epoch": 0.3618713101633735, + "grad_norm": 1.4296875, + "learning_rate": 0.00019294017292211, + "loss": 4.4465, + "step": 3490 + }, + { + "epoch": 0.36197499821786155, + "grad_norm": 1.3359375, + "learning_rate": 0.00019293616343051491, + "loss": 4.4293, + "step": 3491 + }, + { + "epoch": 0.3620786862723496, + "grad_norm": 1.203125, + "learning_rate": 0.00019293215284237134, + "loss": 4.4282, + "step": 3492 + }, + { + "epoch": 0.3621823743268377, + "grad_norm": 1.078125, + "learning_rate": 0.0001929281411577266, + "loss": 4.4317, + "step": 3493 + }, + { + "epoch": 0.36228606238132577, + "grad_norm": 1.46875, + "learning_rate": 0.000192924128376628, + "loss": 4.4273, + "step": 3494 + }, + { + "epoch": 0.36238975043581384, + "grad_norm": 1.1953125, + "learning_rate": 0.00019292011449912295, + "loss": 4.422, + "step": 3495 + }, + { + "epoch": 0.3624934384903019, + "grad_norm": 1.6640625, + "learning_rate": 0.00019291609952525876, + "loss": 4.441, + "step": 3496 + }, + { + "epoch": 0.36259712654479, + "grad_norm": 1.4765625, + "learning_rate": 0.00019291208345508277, + "loss": 4.4409, + "step": 3497 + }, + { + "epoch": 0.36270081459927805, + "grad_norm": 1.40625, + "learning_rate": 0.00019290806628864245, + "loss": 4.4563, + "step": 3498 + }, + { + "epoch": 0.3628045026537661, + "grad_norm": 1.328125, + "learning_rate": 0.00019290404802598516, + "loss": 4.4164, + "step": 3499 + }, + { + "epoch": 0.3629081907082542, + "grad_norm": 1.4609375, + "learning_rate": 0.00019290002866715827, + "loss": 4.4176, + "step": 3500 + }, + { + "epoch": 0.36301187876274227, + "grad_norm": 1.3046875, + "learning_rate": 0.00019289600821220928, + "loss": 4.455, + "step": 3501 + }, + { + "epoch": 0.36311556681723034, + "grad_norm": 1.484375, + "learning_rate": 0.00019289198666118558, + "loss": 4.4429, + "step": 3502 + }, + { + "epoch": 0.3632192548717184, + "grad_norm": 1.390625, + "learning_rate": 0.00019288796401413462, + "loss": 4.4002, + "step": 3503 + }, + { + "epoch": 0.3633229429262065, + "grad_norm": 1.3125, + "learning_rate": 0.00019288394027110387, + "loss": 4.413, + "step": 3504 + }, + { + "epoch": 0.36342663098069455, + "grad_norm": 1.1796875, + "learning_rate": 0.00019287991543214084, + "loss": 4.4612, + "step": 3505 + }, + { + "epoch": 0.3635303190351826, + "grad_norm": 1.4453125, + "learning_rate": 0.00019287588949729295, + "loss": 4.4533, + "step": 3506 + }, + { + "epoch": 0.36363400708967075, + "grad_norm": 1.2734375, + "learning_rate": 0.00019287186246660774, + "loss": 4.4742, + "step": 3507 + }, + { + "epoch": 0.3637376951441588, + "grad_norm": 1.5546875, + "learning_rate": 0.0001928678343401327, + "loss": 4.4463, + "step": 3508 + }, + { + "epoch": 0.3638413831986469, + "grad_norm": 1.3046875, + "learning_rate": 0.0001928638051179154, + "loss": 4.41, + "step": 3509 + }, + { + "epoch": 0.36394507125313497, + "grad_norm": 1.5546875, + "learning_rate": 0.0001928597748000034, + "loss": 4.4391, + "step": 3510 + }, + { + "epoch": 0.36404875930762304, + "grad_norm": 1.328125, + "learning_rate": 0.0001928557433864442, + "loss": 4.4267, + "step": 3511 + }, + { + "epoch": 0.3641524473621111, + "grad_norm": 1.6484375, + "learning_rate": 0.00019285171087728536, + "loss": 4.4649, + "step": 3512 + }, + { + "epoch": 0.3642561354165992, + "grad_norm": 1.4921875, + "learning_rate": 0.00019284767727257445, + "loss": 4.4497, + "step": 3513 + }, + { + "epoch": 0.36435982347108725, + "grad_norm": 1.4375, + "learning_rate": 0.00019284364257235917, + "loss": 4.479, + "step": 3514 + }, + { + "epoch": 0.3644635115255753, + "grad_norm": 1.328125, + "learning_rate": 0.000192839606776687, + "loss": 4.4417, + "step": 3515 + }, + { + "epoch": 0.3645671995800634, + "grad_norm": 1.2890625, + "learning_rate": 0.00019283556988560562, + "loss": 4.3996, + "step": 3516 + }, + { + "epoch": 0.36467088763455147, + "grad_norm": 1.203125, + "learning_rate": 0.0001928315318991626, + "loss": 4.4304, + "step": 3517 + }, + { + "epoch": 0.36477457568903954, + "grad_norm": 1.546875, + "learning_rate": 0.0001928274928174057, + "loss": 4.4209, + "step": 3518 + }, + { + "epoch": 0.3648782637435276, + "grad_norm": 1.4609375, + "learning_rate": 0.00019282345264038246, + "loss": 4.4319, + "step": 3519 + }, + { + "epoch": 0.3649819517980157, + "grad_norm": 1.328125, + "learning_rate": 0.0001928194113681406, + "loss": 4.4293, + "step": 3520 + }, + { + "epoch": 0.36508563985250375, + "grad_norm": 1.15625, + "learning_rate": 0.00019281536900072783, + "loss": 4.4231, + "step": 3521 + }, + { + "epoch": 0.3651893279069918, + "grad_norm": 1.4609375, + "learning_rate": 0.00019281132553819182, + "loss": 4.435, + "step": 3522 + }, + { + "epoch": 0.3652930159614799, + "grad_norm": 1.3125, + "learning_rate": 0.00019280728098058022, + "loss": 4.444, + "step": 3523 + }, + { + "epoch": 0.36539670401596797, + "grad_norm": 1.75, + "learning_rate": 0.00019280323532794085, + "loss": 4.4257, + "step": 3524 + }, + { + "epoch": 0.36550039207045604, + "grad_norm": 1.59375, + "learning_rate": 0.0001927991885803214, + "loss": 4.39, + "step": 3525 + }, + { + "epoch": 0.3656040801249441, + "grad_norm": 1.3984375, + "learning_rate": 0.00019279514073776958, + "loss": 4.4168, + "step": 3526 + }, + { + "epoch": 0.3657077681794322, + "grad_norm": 1.3203125, + "learning_rate": 0.00019279109180033322, + "loss": 4.4334, + "step": 3527 + }, + { + "epoch": 0.36581145623392025, + "grad_norm": 1.390625, + "learning_rate": 0.00019278704176806008, + "loss": 4.4292, + "step": 3528 + }, + { + "epoch": 0.3659151442884083, + "grad_norm": 1.2578125, + "learning_rate": 0.0001927829906409979, + "loss": 4.4186, + "step": 3529 + }, + { + "epoch": 0.3660188323428964, + "grad_norm": 1.6484375, + "learning_rate": 0.00019277893841919451, + "loss": 4.4232, + "step": 3530 + }, + { + "epoch": 0.36612252039738447, + "grad_norm": 1.4140625, + "learning_rate": 0.00019277488510269773, + "loss": 4.4418, + "step": 3531 + }, + { + "epoch": 0.36622620845187254, + "grad_norm": 1.4453125, + "learning_rate": 0.00019277083069155535, + "loss": 4.4193, + "step": 3532 + }, + { + "epoch": 0.3663298965063606, + "grad_norm": 1.3359375, + "learning_rate": 0.00019276677518581523, + "loss": 4.4612, + "step": 3533 + }, + { + "epoch": 0.3664335845608487, + "grad_norm": 1.5390625, + "learning_rate": 0.00019276271858552523, + "loss": 4.4345, + "step": 3534 + }, + { + "epoch": 0.36653727261533675, + "grad_norm": 1.421875, + "learning_rate": 0.0001927586608907332, + "loss": 4.4532, + "step": 3535 + }, + { + "epoch": 0.3666409606698248, + "grad_norm": 1.5390625, + "learning_rate": 0.00019275460210148705, + "loss": 4.456, + "step": 3536 + }, + { + "epoch": 0.3667446487243129, + "grad_norm": 1.4296875, + "learning_rate": 0.00019275054221783462, + "loss": 4.4461, + "step": 3537 + }, + { + "epoch": 0.36684833677880097, + "grad_norm": 1.453125, + "learning_rate": 0.00019274648123982383, + "loss": 4.393, + "step": 3538 + }, + { + "epoch": 0.36695202483328904, + "grad_norm": 1.3203125, + "learning_rate": 0.0001927424191675026, + "loss": 4.4317, + "step": 3539 + }, + { + "epoch": 0.3670557128877771, + "grad_norm": 1.4765625, + "learning_rate": 0.00019273835600091887, + "loss": 4.4409, + "step": 3540 + }, + { + "epoch": 0.3671594009422652, + "grad_norm": 1.3515625, + "learning_rate": 0.00019273429174012057, + "loss": 4.4772, + "step": 3541 + }, + { + "epoch": 0.36726308899675325, + "grad_norm": 1.421875, + "learning_rate": 0.00019273022638515564, + "loss": 4.4135, + "step": 3542 + }, + { + "epoch": 0.3673667770512413, + "grad_norm": 1.234375, + "learning_rate": 0.00019272615993607206, + "loss": 4.5041, + "step": 3543 + }, + { + "epoch": 0.3674704651057294, + "grad_norm": 1.765625, + "learning_rate": 0.0001927220923929178, + "loss": 4.449, + "step": 3544 + }, + { + "epoch": 0.36757415316021746, + "grad_norm": 1.5, + "learning_rate": 0.00019271802375574087, + "loss": 4.4044, + "step": 3545 + }, + { + "epoch": 0.36767784121470554, + "grad_norm": 1.75, + "learning_rate": 0.00019271395402458926, + "loss": 4.5054, + "step": 3546 + }, + { + "epoch": 0.3677815292691936, + "grad_norm": 1.6796875, + "learning_rate": 0.00019270988319951103, + "loss": 4.4233, + "step": 3547 + }, + { + "epoch": 0.3678852173236817, + "grad_norm": 1.3046875, + "learning_rate": 0.00019270581128055412, + "loss": 4.466, + "step": 3548 + }, + { + "epoch": 0.36798890537816975, + "grad_norm": 1.234375, + "learning_rate": 0.00019270173826776667, + "loss": 4.4506, + "step": 3549 + }, + { + "epoch": 0.3680925934326578, + "grad_norm": 1.3984375, + "learning_rate": 0.0001926976641611967, + "loss": 4.4769, + "step": 3550 + }, + { + "epoch": 0.36819628148714595, + "grad_norm": 1.140625, + "learning_rate": 0.00019269358896089226, + "loss": 4.4227, + "step": 3551 + }, + { + "epoch": 0.368299969541634, + "grad_norm": 1.7890625, + "learning_rate": 0.00019268951266690146, + "loss": 4.4437, + "step": 3552 + }, + { + "epoch": 0.3684036575961221, + "grad_norm": 1.625, + "learning_rate": 0.00019268543527927237, + "loss": 4.4569, + "step": 3553 + }, + { + "epoch": 0.36850734565061016, + "grad_norm": 1.5078125, + "learning_rate": 0.00019268135679805312, + "loss": 4.4919, + "step": 3554 + }, + { + "epoch": 0.36861103370509823, + "grad_norm": 1.4609375, + "learning_rate": 0.0001926772772232919, + "loss": 4.4385, + "step": 3555 + }, + { + "epoch": 0.3687147217595863, + "grad_norm": 1.296875, + "learning_rate": 0.00019267319655503665, + "loss": 4.4652, + "step": 3556 + }, + { + "epoch": 0.3688184098140744, + "grad_norm": 1.2109375, + "learning_rate": 0.00019266911479333572, + "loss": 4.3996, + "step": 3557 + }, + { + "epoch": 0.36892209786856245, + "grad_norm": 1.515625, + "learning_rate": 0.00019266503193823717, + "loss": 4.4644, + "step": 3558 + }, + { + "epoch": 0.3690257859230505, + "grad_norm": 1.3046875, + "learning_rate": 0.00019266094798978922, + "loss": 4.4592, + "step": 3559 + }, + { + "epoch": 0.3691294739775386, + "grad_norm": 1.6015625, + "learning_rate": 0.00019265686294804, + "loss": 4.444, + "step": 3560 + }, + { + "epoch": 0.36923316203202666, + "grad_norm": 1.46875, + "learning_rate": 0.00019265277681303775, + "loss": 4.4513, + "step": 3561 + }, + { + "epoch": 0.36933685008651473, + "grad_norm": 1.4296875, + "learning_rate": 0.00019264868958483066, + "loss": 4.435, + "step": 3562 + }, + { + "epoch": 0.3694405381410028, + "grad_norm": 1.2578125, + "learning_rate": 0.00019264460126346697, + "loss": 4.4655, + "step": 3563 + }, + { + "epoch": 0.3695442261954909, + "grad_norm": 1.4453125, + "learning_rate": 0.00019264051184899494, + "loss": 4.4762, + "step": 3564 + }, + { + "epoch": 0.36964791424997895, + "grad_norm": 1.3125, + "learning_rate": 0.00019263642134146277, + "loss": 4.4287, + "step": 3565 + }, + { + "epoch": 0.369751602304467, + "grad_norm": 1.5546875, + "learning_rate": 0.00019263232974091877, + "loss": 4.4286, + "step": 3566 + }, + { + "epoch": 0.3698552903589551, + "grad_norm": 1.421875, + "learning_rate": 0.00019262823704741119, + "loss": 4.4731, + "step": 3567 + }, + { + "epoch": 0.36995897841344316, + "grad_norm": 1.34375, + "learning_rate": 0.00019262414326098832, + "loss": 4.4016, + "step": 3568 + }, + { + "epoch": 0.37006266646793123, + "grad_norm": 1.1640625, + "learning_rate": 0.00019262004838169845, + "loss": 4.3934, + "step": 3569 + }, + { + "epoch": 0.3701663545224193, + "grad_norm": 1.4453125, + "learning_rate": 0.00019261595240958993, + "loss": 4.4368, + "step": 3570 + }, + { + "epoch": 0.3702700425769074, + "grad_norm": 1.2421875, + "learning_rate": 0.00019261185534471108, + "loss": 4.4612, + "step": 3571 + }, + { + "epoch": 0.37037373063139545, + "grad_norm": 1.515625, + "learning_rate": 0.0001926077571871102, + "loss": 4.4292, + "step": 3572 + }, + { + "epoch": 0.3704774186858835, + "grad_norm": 1.3515625, + "learning_rate": 0.00019260365793683572, + "loss": 4.421, + "step": 3573 + }, + { + "epoch": 0.3705811067403716, + "grad_norm": 1.359375, + "learning_rate": 0.00019259955759393593, + "loss": 4.4411, + "step": 3574 + }, + { + "epoch": 0.37068479479485966, + "grad_norm": 1.265625, + "learning_rate": 0.00019259545615845925, + "loss": 4.4327, + "step": 3575 + }, + { + "epoch": 0.37078848284934773, + "grad_norm": 1.4609375, + "learning_rate": 0.00019259135363045406, + "loss": 4.4329, + "step": 3576 + }, + { + "epoch": 0.3708921709038358, + "grad_norm": 1.3046875, + "learning_rate": 0.00019258725000996881, + "loss": 4.4206, + "step": 3577 + }, + { + "epoch": 0.3709958589583239, + "grad_norm": 1.3828125, + "learning_rate": 0.0001925831452970518, + "loss": 4.4502, + "step": 3578 + }, + { + "epoch": 0.37109954701281195, + "grad_norm": 1.3515625, + "learning_rate": 0.0001925790394917516, + "loss": 4.4349, + "step": 3579 + }, + { + "epoch": 0.3712032350673, + "grad_norm": 1.203125, + "learning_rate": 0.00019257493259411659, + "loss": 4.4454, + "step": 3580 + }, + { + "epoch": 0.3713069231217881, + "grad_norm": 1.0859375, + "learning_rate": 0.0001925708246041952, + "loss": 4.4163, + "step": 3581 + }, + { + "epoch": 0.37141061117627616, + "grad_norm": 1.359375, + "learning_rate": 0.00019256671552203596, + "loss": 4.4385, + "step": 3582 + }, + { + "epoch": 0.37151429923076423, + "grad_norm": 1.0703125, + "learning_rate": 0.00019256260534768733, + "loss": 4.4556, + "step": 3583 + }, + { + "epoch": 0.3716179872852523, + "grad_norm": 1.671875, + "learning_rate": 0.00019255849408119778, + "loss": 4.4005, + "step": 3584 + }, + { + "epoch": 0.3717216753397404, + "grad_norm": 1.4453125, + "learning_rate": 0.00019255438172261586, + "loss": 4.4442, + "step": 3585 + }, + { + "epoch": 0.37182536339422845, + "grad_norm": 1.40625, + "learning_rate": 0.00019255026827199006, + "loss": 4.4232, + "step": 3586 + }, + { + "epoch": 0.3719290514487165, + "grad_norm": 1.34375, + "learning_rate": 0.0001925461537293689, + "loss": 4.4578, + "step": 3587 + }, + { + "epoch": 0.3720327395032046, + "grad_norm": 1.3671875, + "learning_rate": 0.00019254203809480097, + "loss": 4.4158, + "step": 3588 + }, + { + "epoch": 0.37213642755769266, + "grad_norm": 1.171875, + "learning_rate": 0.00019253792136833482, + "loss": 4.4179, + "step": 3589 + }, + { + "epoch": 0.37224011561218073, + "grad_norm": 1.328125, + "learning_rate": 0.000192533803550019, + "loss": 4.4066, + "step": 3590 + }, + { + "epoch": 0.3723438036666688, + "grad_norm": 1.234375, + "learning_rate": 0.0001925296846399021, + "loss": 4.4247, + "step": 3591 + }, + { + "epoch": 0.3724474917211569, + "grad_norm": 1.6640625, + "learning_rate": 0.00019252556463803279, + "loss": 4.4401, + "step": 3592 + }, + { + "epoch": 0.37255117977564495, + "grad_norm": 1.5390625, + "learning_rate": 0.00019252144354445957, + "loss": 4.4367, + "step": 3593 + }, + { + "epoch": 0.372654867830133, + "grad_norm": 1.2734375, + "learning_rate": 0.0001925173213592311, + "loss": 4.3991, + "step": 3594 + }, + { + "epoch": 0.3727585558846211, + "grad_norm": 1.1953125, + "learning_rate": 0.00019251319808239609, + "loss": 4.4147, + "step": 3595 + }, + { + "epoch": 0.3728622439391092, + "grad_norm": 1.2109375, + "learning_rate": 0.00019250907371400308, + "loss": 4.4117, + "step": 3596 + }, + { + "epoch": 0.3729659319935973, + "grad_norm": 1.046875, + "learning_rate": 0.0001925049482541008, + "loss": 4.4487, + "step": 3597 + }, + { + "epoch": 0.37306962004808536, + "grad_norm": 1.5234375, + "learning_rate": 0.00019250082170273793, + "loss": 4.4308, + "step": 3598 + }, + { + "epoch": 0.37317330810257343, + "grad_norm": 1.3046875, + "learning_rate": 0.0001924966940599631, + "loss": 4.4148, + "step": 3599 + }, + { + "epoch": 0.3732769961570615, + "grad_norm": 1.609375, + "learning_rate": 0.0001924925653258251, + "loss": 4.4425, + "step": 3600 + }, + { + "epoch": 0.3733806842115496, + "grad_norm": 1.5078125, + "learning_rate": 0.0001924884355003726, + "loss": 4.4362, + "step": 3601 + }, + { + "epoch": 0.37348437226603765, + "grad_norm": 1.140625, + "learning_rate": 0.0001924843045836543, + "loss": 4.4168, + "step": 3602 + }, + { + "epoch": 0.3735880603205257, + "grad_norm": 1.09375, + "learning_rate": 0.000192480172575719, + "loss": 4.4619, + "step": 3603 + }, + { + "epoch": 0.3736917483750138, + "grad_norm": 1.25, + "learning_rate": 0.00019247603947661535, + "loss": 4.424, + "step": 3604 + }, + { + "epoch": 0.37379543642950186, + "grad_norm": 1.1015625, + "learning_rate": 0.00019247190528639223, + "loss": 4.4545, + "step": 3605 + }, + { + "epoch": 0.37389912448398993, + "grad_norm": 1.6171875, + "learning_rate": 0.00019246777000509838, + "loss": 4.4457, + "step": 3606 + }, + { + "epoch": 0.374002812538478, + "grad_norm": 1.4765625, + "learning_rate": 0.00019246363363278259, + "loss": 4.4527, + "step": 3607 + }, + { + "epoch": 0.3741065005929661, + "grad_norm": 1.2421875, + "learning_rate": 0.00019245949616949367, + "loss": 4.4568, + "step": 3608 + }, + { + "epoch": 0.37421018864745415, + "grad_norm": 1.1171875, + "learning_rate": 0.00019245535761528037, + "loss": 4.4389, + "step": 3609 + }, + { + "epoch": 0.3743138767019422, + "grad_norm": 1.3125, + "learning_rate": 0.00019245121797019165, + "loss": 4.411, + "step": 3610 + }, + { + "epoch": 0.3744175647564303, + "grad_norm": 1.0859375, + "learning_rate": 0.00019244707723427623, + "loss": 4.3858, + "step": 3611 + }, + { + "epoch": 0.37452125281091836, + "grad_norm": 1.515625, + "learning_rate": 0.00019244293540758304, + "loss": 4.4078, + "step": 3612 + }, + { + "epoch": 0.37462494086540643, + "grad_norm": 1.2890625, + "learning_rate": 0.00019243879249016094, + "loss": 4.4748, + "step": 3613 + }, + { + "epoch": 0.3747286289198945, + "grad_norm": 1.5234375, + "learning_rate": 0.0001924346484820588, + "loss": 4.439, + "step": 3614 + }, + { + "epoch": 0.3748323169743826, + "grad_norm": 1.4140625, + "learning_rate": 0.0001924305033833255, + "loss": 4.3731, + "step": 3615 + }, + { + "epoch": 0.37493600502887064, + "grad_norm": 1.3359375, + "learning_rate": 0.00019242635719400996, + "loss": 4.4164, + "step": 3616 + }, + { + "epoch": 0.3750396930833587, + "grad_norm": 1.25, + "learning_rate": 0.00019242220991416112, + "loss": 4.4429, + "step": 3617 + }, + { + "epoch": 0.3751433811378468, + "grad_norm": 1.359375, + "learning_rate": 0.00019241806154382792, + "loss": 4.3936, + "step": 3618 + }, + { + "epoch": 0.37524706919233486, + "grad_norm": 1.234375, + "learning_rate": 0.00019241391208305926, + "loss": 4.4168, + "step": 3619 + }, + { + "epoch": 0.37535075724682293, + "grad_norm": 1.359375, + "learning_rate": 0.0001924097615319041, + "loss": 4.4512, + "step": 3620 + }, + { + "epoch": 0.375454445301311, + "grad_norm": 1.1640625, + "learning_rate": 0.00019240560989041146, + "loss": 4.4483, + "step": 3621 + }, + { + "epoch": 0.3755581333557991, + "grad_norm": 1.640625, + "learning_rate": 0.0001924014571586303, + "loss": 4.4152, + "step": 3622 + }, + { + "epoch": 0.37566182141028714, + "grad_norm": 1.4375, + "learning_rate": 0.0001923973033366096, + "loss": 4.4342, + "step": 3623 + }, + { + "epoch": 0.3757655094647752, + "grad_norm": 1.453125, + "learning_rate": 0.0001923931484243984, + "loss": 4.3984, + "step": 3624 + }, + { + "epoch": 0.3758691975192633, + "grad_norm": 1.390625, + "learning_rate": 0.0001923889924220457, + "loss": 4.433, + "step": 3625 + }, + { + "epoch": 0.37597288557375136, + "grad_norm": 1.328125, + "learning_rate": 0.00019238483532960058, + "loss": 4.4543, + "step": 3626 + }, + { + "epoch": 0.37607657362823943, + "grad_norm": 1.25, + "learning_rate": 0.00019238067714711203, + "loss": 4.4161, + "step": 3627 + }, + { + "epoch": 0.3761802616827275, + "grad_norm": 1.5546875, + "learning_rate": 0.00019237651787462916, + "loss": 4.4185, + "step": 3628 + }, + { + "epoch": 0.3762839497372156, + "grad_norm": 1.4375, + "learning_rate": 0.00019237235751220103, + "loss": 4.4455, + "step": 3629 + }, + { + "epoch": 0.37638763779170364, + "grad_norm": 1.3046875, + "learning_rate": 0.0001923681960598767, + "loss": 4.409, + "step": 3630 + }, + { + "epoch": 0.3764913258461917, + "grad_norm": 1.2265625, + "learning_rate": 0.0001923640335177053, + "loss": 4.4611, + "step": 3631 + }, + { + "epoch": 0.3765950139006798, + "grad_norm": 1.296875, + "learning_rate": 0.00019235986988573595, + "loss": 4.4378, + "step": 3632 + }, + { + "epoch": 0.37669870195516786, + "grad_norm": 1.171875, + "learning_rate": 0.00019235570516401776, + "loss": 4.4345, + "step": 3633 + }, + { + "epoch": 0.37680239000965593, + "grad_norm": 1.53125, + "learning_rate": 0.00019235153935259986, + "loss": 4.458, + "step": 3634 + }, + { + "epoch": 0.376906078064144, + "grad_norm": 1.328125, + "learning_rate": 0.00019234737245153142, + "loss": 4.4146, + "step": 3635 + }, + { + "epoch": 0.3770097661186321, + "grad_norm": 1.5546875, + "learning_rate": 0.0001923432044608616, + "loss": 4.4413, + "step": 3636 + }, + { + "epoch": 0.37711345417312014, + "grad_norm": 1.3828125, + "learning_rate": 0.00019233903538063958, + "loss": 4.4489, + "step": 3637 + }, + { + "epoch": 0.3772171422276082, + "grad_norm": 1.5390625, + "learning_rate": 0.00019233486521091458, + "loss": 4.4059, + "step": 3638 + }, + { + "epoch": 0.3773208302820963, + "grad_norm": 1.3984375, + "learning_rate": 0.00019233069395173573, + "loss": 4.4205, + "step": 3639 + }, + { + "epoch": 0.3774245183365844, + "grad_norm": 1.5625, + "learning_rate": 0.0001923265216031523, + "loss": 4.4273, + "step": 3640 + }, + { + "epoch": 0.3775282063910725, + "grad_norm": 1.4375, + "learning_rate": 0.00019232234816521352, + "loss": 4.4103, + "step": 3641 + }, + { + "epoch": 0.37763189444556056, + "grad_norm": 1.625, + "learning_rate": 0.0001923181736379686, + "loss": 4.44, + "step": 3642 + }, + { + "epoch": 0.37773558250004863, + "grad_norm": 1.5, + "learning_rate": 0.00019231399802146685, + "loss": 4.4276, + "step": 3643 + }, + { + "epoch": 0.3778392705545367, + "grad_norm": 1.453125, + "learning_rate": 0.00019230982131575747, + "loss": 4.4077, + "step": 3644 + }, + { + "epoch": 0.37794295860902477, + "grad_norm": 1.3984375, + "learning_rate": 0.00019230564352088977, + "loss": 4.4196, + "step": 3645 + }, + { + "epoch": 0.37804664666351284, + "grad_norm": 1.3203125, + "learning_rate": 0.00019230146463691307, + "loss": 4.4645, + "step": 3646 + }, + { + "epoch": 0.3781503347180009, + "grad_norm": 1.1875, + "learning_rate": 0.00019229728466387664, + "loss": 4.4093, + "step": 3647 + }, + { + "epoch": 0.378254022772489, + "grad_norm": 1.34375, + "learning_rate": 0.00019229310360182984, + "loss": 4.4172, + "step": 3648 + }, + { + "epoch": 0.37835771082697706, + "grad_norm": 1.21875, + "learning_rate": 0.00019228892145082193, + "loss": 4.4356, + "step": 3649 + }, + { + "epoch": 0.3784613988814651, + "grad_norm": 1.5625, + "learning_rate": 0.00019228473821090235, + "loss": 4.4533, + "step": 3650 + }, + { + "epoch": 0.3785650869359532, + "grad_norm": 1.390625, + "learning_rate": 0.0001922805538821204, + "loss": 4.4694, + "step": 3651 + }, + { + "epoch": 0.37866877499044127, + "grad_norm": 1.3046875, + "learning_rate": 0.00019227636846452542, + "loss": 4.4129, + "step": 3652 + }, + { + "epoch": 0.37877246304492934, + "grad_norm": 1.2421875, + "learning_rate": 0.00019227218195816685, + "loss": 4.436, + "step": 3653 + }, + { + "epoch": 0.3788761510994174, + "grad_norm": 1.296875, + "learning_rate": 0.0001922679943630941, + "loss": 4.4356, + "step": 3654 + }, + { + "epoch": 0.3789798391539055, + "grad_norm": 1.1875, + "learning_rate": 0.0001922638056793565, + "loss": 4.4148, + "step": 3655 + }, + { + "epoch": 0.37908352720839356, + "grad_norm": 1.6171875, + "learning_rate": 0.00019225961590700353, + "loss": 4.4184, + "step": 3656 + }, + { + "epoch": 0.3791872152628816, + "grad_norm": 1.3828125, + "learning_rate": 0.00019225542504608465, + "loss": 4.4467, + "step": 3657 + }, + { + "epoch": 0.3792909033173697, + "grad_norm": 1.5, + "learning_rate": 0.00019225123309664924, + "loss": 4.4668, + "step": 3658 + }, + { + "epoch": 0.37939459137185777, + "grad_norm": 1.359375, + "learning_rate": 0.00019224704005874684, + "loss": 4.4578, + "step": 3659 + }, + { + "epoch": 0.37949827942634584, + "grad_norm": 1.3046875, + "learning_rate": 0.0001922428459324268, + "loss": 4.4322, + "step": 3660 + }, + { + "epoch": 0.3796019674808339, + "grad_norm": 1.1875, + "learning_rate": 0.00019223865071773874, + "loss": 4.4463, + "step": 3661 + }, + { + "epoch": 0.379705655535322, + "grad_norm": 1.3984375, + "learning_rate": 0.00019223445441473208, + "loss": 4.425, + "step": 3662 + }, + { + "epoch": 0.37980934358981006, + "grad_norm": 1.2734375, + "learning_rate": 0.00019223025702345639, + "loss": 4.4147, + "step": 3663 + }, + { + "epoch": 0.3799130316442981, + "grad_norm": 1.296875, + "learning_rate": 0.00019222605854396111, + "loss": 4.4286, + "step": 3664 + }, + { + "epoch": 0.3800167196987862, + "grad_norm": 1.203125, + "learning_rate": 0.00019222185897629584, + "loss": 4.4276, + "step": 3665 + }, + { + "epoch": 0.38012040775327427, + "grad_norm": 1.3203125, + "learning_rate": 0.00019221765832051012, + "loss": 4.4228, + "step": 3666 + }, + { + "epoch": 0.38022409580776234, + "grad_norm": 1.1796875, + "learning_rate": 0.00019221345657665352, + "loss": 4.4041, + "step": 3667 + }, + { + "epoch": 0.3803277838622504, + "grad_norm": 1.359375, + "learning_rate": 0.0001922092537447756, + "loss": 4.447, + "step": 3668 + }, + { + "epoch": 0.3804314719167385, + "grad_norm": 1.234375, + "learning_rate": 0.00019220504982492596, + "loss": 4.4779, + "step": 3669 + }, + { + "epoch": 0.38053515997122656, + "grad_norm": 1.2578125, + "learning_rate": 0.00019220084481715418, + "loss": 4.4111, + "step": 3670 + }, + { + "epoch": 0.3806388480257146, + "grad_norm": 1.171875, + "learning_rate": 0.0001921966387215099, + "loss": 4.4375, + "step": 3671 + }, + { + "epoch": 0.3807425360802027, + "grad_norm": 1.2109375, + "learning_rate": 0.00019219243153804273, + "loss": 4.4184, + "step": 3672 + }, + { + "epoch": 0.38084622413469077, + "grad_norm": 1.078125, + "learning_rate": 0.00019218822326680235, + "loss": 4.4274, + "step": 3673 + }, + { + "epoch": 0.38094991218917884, + "grad_norm": 1.3984375, + "learning_rate": 0.00019218401390783834, + "loss": 4.4228, + "step": 3674 + }, + { + "epoch": 0.3810536002436669, + "grad_norm": 1.2734375, + "learning_rate": 0.00019217980346120044, + "loss": 4.4196, + "step": 3675 + }, + { + "epoch": 0.381157288298155, + "grad_norm": 1.265625, + "learning_rate": 0.00019217559192693831, + "loss": 4.4682, + "step": 3676 + }, + { + "epoch": 0.38126097635264306, + "grad_norm": 1.1640625, + "learning_rate": 0.0001921713793051016, + "loss": 4.4386, + "step": 3677 + }, + { + "epoch": 0.3813646644071311, + "grad_norm": 1.3203125, + "learning_rate": 0.00019216716559574003, + "loss": 4.3986, + "step": 3678 + }, + { + "epoch": 0.3814683524616192, + "grad_norm": 1.140625, + "learning_rate": 0.00019216295079890338, + "loss": 4.4526, + "step": 3679 + }, + { + "epoch": 0.38157204051610727, + "grad_norm": 1.4453125, + "learning_rate": 0.0001921587349146413, + "loss": 4.4512, + "step": 3680 + }, + { + "epoch": 0.38167572857059534, + "grad_norm": 1.2734375, + "learning_rate": 0.00019215451794300355, + "loss": 4.456, + "step": 3681 + }, + { + "epoch": 0.3817794166250834, + "grad_norm": 1.46875, + "learning_rate": 0.00019215029988403992, + "loss": 4.4242, + "step": 3682 + }, + { + "epoch": 0.3818831046795715, + "grad_norm": 1.34375, + "learning_rate": 0.00019214608073780015, + "loss": 4.4795, + "step": 3683 + }, + { + "epoch": 0.38198679273405955, + "grad_norm": 1.46875, + "learning_rate": 0.00019214186050433405, + "loss": 4.427, + "step": 3684 + }, + { + "epoch": 0.3820904807885477, + "grad_norm": 1.390625, + "learning_rate": 0.00019213763918369137, + "loss": 4.4174, + "step": 3685 + }, + { + "epoch": 0.38219416884303575, + "grad_norm": 1.2421875, + "learning_rate": 0.00019213341677592197, + "loss": 4.4468, + "step": 3686 + }, + { + "epoch": 0.3822978568975238, + "grad_norm": 1.1484375, + "learning_rate": 0.00019212919328107564, + "loss": 4.4515, + "step": 3687 + }, + { + "epoch": 0.3824015449520119, + "grad_norm": 1.359375, + "learning_rate": 0.00019212496869920218, + "loss": 4.4227, + "step": 3688 + }, + { + "epoch": 0.38250523300649997, + "grad_norm": 1.25, + "learning_rate": 0.0001921207430303515, + "loss": 4.457, + "step": 3689 + }, + { + "epoch": 0.38260892106098804, + "grad_norm": 1.5078125, + "learning_rate": 0.0001921165162745734, + "loss": 4.4166, + "step": 3690 + }, + { + "epoch": 0.3827126091154761, + "grad_norm": 1.359375, + "learning_rate": 0.0001921122884319178, + "loss": 4.3881, + "step": 3691 + }, + { + "epoch": 0.3828162971699642, + "grad_norm": 1.1953125, + "learning_rate": 0.00019210805950243455, + "loss": 4.3417, + "step": 3692 + }, + { + "epoch": 0.38291998522445225, + "grad_norm": 1.125, + "learning_rate": 0.00019210382948617357, + "loss": 4.4258, + "step": 3693 + }, + { + "epoch": 0.3830236732789403, + "grad_norm": 1.3515625, + "learning_rate": 0.00019209959838318476, + "loss": 4.4543, + "step": 3694 + }, + { + "epoch": 0.3831273613334284, + "grad_norm": 1.203125, + "learning_rate": 0.00019209536619351804, + "loss": 4.4249, + "step": 3695 + }, + { + "epoch": 0.38323104938791647, + "grad_norm": 1.5078125, + "learning_rate": 0.00019209113291722334, + "loss": 4.4234, + "step": 3696 + }, + { + "epoch": 0.38333473744240454, + "grad_norm": 1.4140625, + "learning_rate": 0.0001920868985543506, + "loss": 4.4391, + "step": 3697 + }, + { + "epoch": 0.3834384254968926, + "grad_norm": 1.15625, + "learning_rate": 0.00019208266310494985, + "loss": 4.3681, + "step": 3698 + }, + { + "epoch": 0.3835421135513807, + "grad_norm": 1.1953125, + "learning_rate": 0.00019207842656907099, + "loss": 4.4259, + "step": 3699 + }, + { + "epoch": 0.38364580160586875, + "grad_norm": 1.203125, + "learning_rate": 0.000192074188946764, + "loss": 4.4409, + "step": 3700 + }, + { + "epoch": 0.3837494896603568, + "grad_norm": 1.09375, + "learning_rate": 0.00019206995023807893, + "loss": 4.4505, + "step": 3701 + }, + { + "epoch": 0.3838531777148449, + "grad_norm": 1.4375, + "learning_rate": 0.00019206571044306578, + "loss": 4.4179, + "step": 3702 + }, + { + "epoch": 0.38395686576933297, + "grad_norm": 1.28125, + "learning_rate": 0.00019206146956177454, + "loss": 4.4102, + "step": 3703 + }, + { + "epoch": 0.38406055382382104, + "grad_norm": 1.3359375, + "learning_rate": 0.00019205722759425527, + "loss": 4.4051, + "step": 3704 + }, + { + "epoch": 0.3841642418783091, + "grad_norm": 1.1953125, + "learning_rate": 0.00019205298454055806, + "loss": 4.4391, + "step": 3705 + }, + { + "epoch": 0.3842679299327972, + "grad_norm": 1.28125, + "learning_rate": 0.0001920487404007329, + "loss": 4.4612, + "step": 3706 + }, + { + "epoch": 0.38437161798728525, + "grad_norm": 1.125, + "learning_rate": 0.00019204449517482993, + "loss": 4.4197, + "step": 3707 + }, + { + "epoch": 0.3844753060417733, + "grad_norm": 1.515625, + "learning_rate": 0.00019204024886289919, + "loss": 4.4594, + "step": 3708 + }, + { + "epoch": 0.3845789940962614, + "grad_norm": 1.375, + "learning_rate": 0.0001920360014649908, + "loss": 4.4012, + "step": 3709 + }, + { + "epoch": 0.38468268215074947, + "grad_norm": 1.4453125, + "learning_rate": 0.00019203175298115492, + "loss": 4.4191, + "step": 3710 + }, + { + "epoch": 0.38478637020523754, + "grad_norm": 1.34375, + "learning_rate": 0.0001920275034114416, + "loss": 4.3976, + "step": 3711 + }, + { + "epoch": 0.3848900582597256, + "grad_norm": 1.3203125, + "learning_rate": 0.00019202325275590102, + "loss": 4.3799, + "step": 3712 + }, + { + "epoch": 0.3849937463142137, + "grad_norm": 1.203125, + "learning_rate": 0.00019201900101458333, + "loss": 4.4567, + "step": 3713 + }, + { + "epoch": 0.38509743436870175, + "grad_norm": 1.2265625, + "learning_rate": 0.0001920147481875387, + "loss": 4.3867, + "step": 3714 + }, + { + "epoch": 0.3852011224231898, + "grad_norm": 1.125, + "learning_rate": 0.0001920104942748173, + "loss": 4.4149, + "step": 3715 + }, + { + "epoch": 0.3853048104776779, + "grad_norm": 1.3984375, + "learning_rate": 0.00019200623927646934, + "loss": 4.4151, + "step": 3716 + }, + { + "epoch": 0.38540849853216597, + "grad_norm": 1.328125, + "learning_rate": 0.000192001983192545, + "loss": 4.4309, + "step": 3717 + }, + { + "epoch": 0.38551218658665404, + "grad_norm": 1.1171875, + "learning_rate": 0.00019199772602309452, + "loss": 4.37, + "step": 3718 + }, + { + "epoch": 0.3856158746411421, + "grad_norm": 1.1015625, + "learning_rate": 0.00019199346776816812, + "loss": 4.4452, + "step": 3719 + }, + { + "epoch": 0.3857195626956302, + "grad_norm": 1.28125, + "learning_rate": 0.00019198920842781604, + "loss": 4.4276, + "step": 3720 + }, + { + "epoch": 0.38582325075011825, + "grad_norm": 1.1484375, + "learning_rate": 0.00019198494800208853, + "loss": 4.4077, + "step": 3721 + }, + { + "epoch": 0.3859269388046063, + "grad_norm": 1.3203125, + "learning_rate": 0.00019198068649103585, + "loss": 4.3955, + "step": 3722 + }, + { + "epoch": 0.3860306268590944, + "grad_norm": 1.203125, + "learning_rate": 0.00019197642389470832, + "loss": 4.4649, + "step": 3723 + }, + { + "epoch": 0.38613431491358247, + "grad_norm": 1.375, + "learning_rate": 0.00019197216021315622, + "loss": 4.4707, + "step": 3724 + }, + { + "epoch": 0.38623800296807054, + "grad_norm": 1.2890625, + "learning_rate": 0.00019196789544642981, + "loss": 4.4139, + "step": 3725 + }, + { + "epoch": 0.3863416910225586, + "grad_norm": 1.2109375, + "learning_rate": 0.00019196362959457948, + "loss": 4.4305, + "step": 3726 + }, + { + "epoch": 0.3864453790770467, + "grad_norm": 1.1171875, + "learning_rate": 0.00019195936265765552, + "loss": 4.4144, + "step": 3727 + }, + { + "epoch": 0.38654906713153475, + "grad_norm": 1.359375, + "learning_rate": 0.00019195509463570831, + "loss": 4.3813, + "step": 3728 + }, + { + "epoch": 0.3866527551860228, + "grad_norm": 1.21875, + "learning_rate": 0.00019195082552878815, + "loss": 4.4248, + "step": 3729 + }, + { + "epoch": 0.38675644324051095, + "grad_norm": 1.46875, + "learning_rate": 0.00019194655533694545, + "loss": 4.4362, + "step": 3730 + }, + { + "epoch": 0.386860131294999, + "grad_norm": 1.4140625, + "learning_rate": 0.0001919422840602306, + "loss": 4.4018, + "step": 3731 + }, + { + "epoch": 0.3869638193494871, + "grad_norm": 1.25, + "learning_rate": 0.000191938011698694, + "loss": 4.4026, + "step": 3732 + }, + { + "epoch": 0.38706750740397516, + "grad_norm": 1.21875, + "learning_rate": 0.00019193373825238602, + "loss": 4.46, + "step": 3733 + }, + { + "epoch": 0.38717119545846324, + "grad_norm": 1.2734375, + "learning_rate": 0.00019192946372135713, + "loss": 4.3934, + "step": 3734 + }, + { + "epoch": 0.3872748835129513, + "grad_norm": 1.1796875, + "learning_rate": 0.00019192518810565772, + "loss": 4.4224, + "step": 3735 + }, + { + "epoch": 0.3873785715674394, + "grad_norm": 1.3828125, + "learning_rate": 0.00019192091140533824, + "loss": 4.4006, + "step": 3736 + }, + { + "epoch": 0.38748225962192745, + "grad_norm": 1.2890625, + "learning_rate": 0.0001919166336204492, + "loss": 4.4281, + "step": 3737 + }, + { + "epoch": 0.3875859476764155, + "grad_norm": 1.265625, + "learning_rate": 0.00019191235475104104, + "loss": 4.4234, + "step": 3738 + }, + { + "epoch": 0.3876896357309036, + "grad_norm": 1.2421875, + "learning_rate": 0.00019190807479716425, + "loss": 4.4203, + "step": 3739 + }, + { + "epoch": 0.38779332378539166, + "grad_norm": 1.1015625, + "learning_rate": 0.00019190379375886934, + "loss": 4.4245, + "step": 3740 + }, + { + "epoch": 0.38789701183987974, + "grad_norm": 1.0625, + "learning_rate": 0.00019189951163620678, + "loss": 4.4599, + "step": 3741 + }, + { + "epoch": 0.3880006998943678, + "grad_norm": 1.1015625, + "learning_rate": 0.00019189522842922714, + "loss": 4.3865, + "step": 3742 + }, + { + "epoch": 0.3881043879488559, + "grad_norm": 0.9609375, + "learning_rate": 0.00019189094413798094, + "loss": 4.4373, + "step": 3743 + }, + { + "epoch": 0.38820807600334395, + "grad_norm": 1.2578125, + "learning_rate": 0.00019188665876251874, + "loss": 4.4681, + "step": 3744 + }, + { + "epoch": 0.388311764057832, + "grad_norm": 1.0, + "learning_rate": 0.0001918823723028911, + "loss": 4.4097, + "step": 3745 + }, + { + "epoch": 0.3884154521123201, + "grad_norm": 1.59375, + "learning_rate": 0.00019187808475914855, + "loss": 4.4054, + "step": 3746 + }, + { + "epoch": 0.38851914016680816, + "grad_norm": 1.328125, + "learning_rate": 0.00019187379613134174, + "loss": 4.394, + "step": 3747 + }, + { + "epoch": 0.38862282822129623, + "grad_norm": 1.328125, + "learning_rate": 0.00019186950641952128, + "loss": 4.4425, + "step": 3748 + }, + { + "epoch": 0.3887265162757843, + "grad_norm": 1.2109375, + "learning_rate": 0.00019186521562373774, + "loss": 4.4016, + "step": 3749 + }, + { + "epoch": 0.3888302043302724, + "grad_norm": 1.3125, + "learning_rate": 0.00019186092374404176, + "loss": 4.3771, + "step": 3750 + }, + { + "epoch": 0.38893389238476045, + "grad_norm": 1.1171875, + "learning_rate": 0.00019185663078048396, + "loss": 4.4122, + "step": 3751 + }, + { + "epoch": 0.3890375804392485, + "grad_norm": 1.375, + "learning_rate": 0.00019185233673311506, + "loss": 4.4324, + "step": 3752 + }, + { + "epoch": 0.3891412684937366, + "grad_norm": 1.1640625, + "learning_rate": 0.0001918480416019856, + "loss": 4.4282, + "step": 3753 + }, + { + "epoch": 0.38924495654822466, + "grad_norm": 1.453125, + "learning_rate": 0.0001918437453871464, + "loss": 4.4434, + "step": 3754 + }, + { + "epoch": 0.38934864460271273, + "grad_norm": 1.3671875, + "learning_rate": 0.00019183944808864807, + "loss": 4.4197, + "step": 3755 + }, + { + "epoch": 0.3894523326572008, + "grad_norm": 1.265625, + "learning_rate": 0.00019183514970654136, + "loss": 4.3956, + "step": 3756 + }, + { + "epoch": 0.3895560207116889, + "grad_norm": 1.203125, + "learning_rate": 0.0001918308502408769, + "loss": 4.4278, + "step": 3757 + }, + { + "epoch": 0.38965970876617695, + "grad_norm": 1.265625, + "learning_rate": 0.00019182654969170551, + "loss": 4.4023, + "step": 3758 + }, + { + "epoch": 0.389763396820665, + "grad_norm": 1.0859375, + "learning_rate": 0.0001918222480590779, + "loss": 4.4272, + "step": 3759 + }, + { + "epoch": 0.3898670848751531, + "grad_norm": 1.6953125, + "learning_rate": 0.00019181794534304484, + "loss": 4.4145, + "step": 3760 + }, + { + "epoch": 0.38997077292964116, + "grad_norm": 1.5546875, + "learning_rate": 0.00019181364154365705, + "loss": 4.4161, + "step": 3761 + }, + { + "epoch": 0.39007446098412923, + "grad_norm": 1.40625, + "learning_rate": 0.00019180933666096536, + "loss": 4.4161, + "step": 3762 + }, + { + "epoch": 0.3901781490386173, + "grad_norm": 1.328125, + "learning_rate": 0.00019180503069502052, + "loss": 4.4283, + "step": 3763 + }, + { + "epoch": 0.3902818370931054, + "grad_norm": 1.421875, + "learning_rate": 0.0001918007236458734, + "loss": 4.4056, + "step": 3764 + }, + { + "epoch": 0.39038552514759345, + "grad_norm": 1.25, + "learning_rate": 0.00019179641551357474, + "loss": 4.4236, + "step": 3765 + }, + { + "epoch": 0.3904892132020815, + "grad_norm": 1.59375, + "learning_rate": 0.00019179210629817543, + "loss": 4.3897, + "step": 3766 + }, + { + "epoch": 0.3905929012565696, + "grad_norm": 1.4609375, + "learning_rate": 0.0001917877959997263, + "loss": 4.3884, + "step": 3767 + }, + { + "epoch": 0.39069658931105766, + "grad_norm": 1.4453125, + "learning_rate": 0.00019178348461827818, + "loss": 4.4123, + "step": 3768 + }, + { + "epoch": 0.39080027736554573, + "grad_norm": 1.2890625, + "learning_rate": 0.00019177917215388196, + "loss": 4.4079, + "step": 3769 + }, + { + "epoch": 0.3909039654200338, + "grad_norm": 1.640625, + "learning_rate": 0.00019177485860658852, + "loss": 4.4478, + "step": 3770 + }, + { + "epoch": 0.3910076534745219, + "grad_norm": 1.46875, + "learning_rate": 0.00019177054397644878, + "loss": 4.4614, + "step": 3771 + }, + { + "epoch": 0.39111134152900995, + "grad_norm": 1.6171875, + "learning_rate": 0.00019176622826351362, + "loss": 4.416, + "step": 3772 + }, + { + "epoch": 0.391215029583498, + "grad_norm": 1.546875, + "learning_rate": 0.00019176191146783394, + "loss": 4.4209, + "step": 3773 + }, + { + "epoch": 0.39131871763798615, + "grad_norm": 1.15625, + "learning_rate": 0.00019175759358946076, + "loss": 4.4549, + "step": 3774 + }, + { + "epoch": 0.3914224056924742, + "grad_norm": 1.1015625, + "learning_rate": 0.00019175327462844495, + "loss": 4.411, + "step": 3775 + }, + { + "epoch": 0.3915260937469623, + "grad_norm": 1.15625, + "learning_rate": 0.00019174895458483744, + "loss": 4.4674, + "step": 3776 + }, + { + "epoch": 0.39162978180145036, + "grad_norm": 0.96875, + "learning_rate": 0.0001917446334586893, + "loss": 4.4098, + "step": 3777 + }, + { + "epoch": 0.39173346985593843, + "grad_norm": 1.4140625, + "learning_rate": 0.00019174031125005145, + "loss": 4.4235, + "step": 3778 + }, + { + "epoch": 0.3918371579104265, + "grad_norm": 1.2421875, + "learning_rate": 0.0001917359879589749, + "loss": 4.4081, + "step": 3779 + }, + { + "epoch": 0.3919408459649146, + "grad_norm": 1.5390625, + "learning_rate": 0.00019173166358551065, + "loss": 4.4185, + "step": 3780 + }, + { + "epoch": 0.39204453401940265, + "grad_norm": 1.390625, + "learning_rate": 0.00019172733812970974, + "loss": 4.4354, + "step": 3781 + }, + { + "epoch": 0.3921482220738907, + "grad_norm": 1.21875, + "learning_rate": 0.00019172301159162318, + "loss": 4.4443, + "step": 3782 + }, + { + "epoch": 0.3922519101283788, + "grad_norm": 1.0859375, + "learning_rate": 0.00019171868397130205, + "loss": 4.4231, + "step": 3783 + }, + { + "epoch": 0.39235559818286686, + "grad_norm": 1.2734375, + "learning_rate": 0.00019171435526879743, + "loss": 4.452, + "step": 3784 + }, + { + "epoch": 0.39245928623735493, + "grad_norm": 1.15625, + "learning_rate": 0.00019171002548416035, + "loss": 4.3952, + "step": 3785 + }, + { + "epoch": 0.392562974291843, + "grad_norm": 1.2421875, + "learning_rate": 0.0001917056946174419, + "loss": 4.37, + "step": 3786 + }, + { + "epoch": 0.3926666623463311, + "grad_norm": 1.09375, + "learning_rate": 0.00019170136266869319, + "loss": 4.4024, + "step": 3787 + }, + { + "epoch": 0.39277035040081915, + "grad_norm": 1.2578125, + "learning_rate": 0.00019169702963796534, + "loss": 4.3948, + "step": 3788 + }, + { + "epoch": 0.3928740384553072, + "grad_norm": 1.171875, + "learning_rate": 0.00019169269552530947, + "loss": 4.3902, + "step": 3789 + }, + { + "epoch": 0.3929777265097953, + "grad_norm": 1.25, + "learning_rate": 0.00019168836033077672, + "loss": 4.4408, + "step": 3790 + }, + { + "epoch": 0.39308141456428336, + "grad_norm": 1.1328125, + "learning_rate": 0.0001916840240544182, + "loss": 4.4168, + "step": 3791 + }, + { + "epoch": 0.39318510261877143, + "grad_norm": 1.296875, + "learning_rate": 0.00019167968669628517, + "loss": 4.3858, + "step": 3792 + }, + { + "epoch": 0.3932887906732595, + "grad_norm": 1.140625, + "learning_rate": 0.0001916753482564287, + "loss": 4.4098, + "step": 3793 + }, + { + "epoch": 0.3933924787277476, + "grad_norm": 1.4453125, + "learning_rate": 0.00019167100873490004, + "loss": 4.4368, + "step": 3794 + }, + { + "epoch": 0.39349616678223565, + "grad_norm": 1.34375, + "learning_rate": 0.00019166666813175037, + "loss": 4.4083, + "step": 3795 + }, + { + "epoch": 0.3935998548367237, + "grad_norm": 1.234375, + "learning_rate": 0.00019166232644703092, + "loss": 4.3992, + "step": 3796 + }, + { + "epoch": 0.3937035428912118, + "grad_norm": 1.1953125, + "learning_rate": 0.00019165798368079293, + "loss": 4.4001, + "step": 3797 + }, + { + "epoch": 0.39380723094569986, + "grad_norm": 1.2578125, + "learning_rate": 0.0001916536398330876, + "loss": 4.3935, + "step": 3798 + }, + { + "epoch": 0.39391091900018793, + "grad_norm": 1.1328125, + "learning_rate": 0.0001916492949039662, + "loss": 4.4262, + "step": 3799 + }, + { + "epoch": 0.394014607054676, + "grad_norm": 1.4609375, + "learning_rate": 0.00019164494889348, + "loss": 4.4222, + "step": 3800 + }, + { + "epoch": 0.3941182951091641, + "grad_norm": 1.28125, + "learning_rate": 0.00019164060180168027, + "loss": 4.4313, + "step": 3801 + }, + { + "epoch": 0.39422198316365215, + "grad_norm": 1.390625, + "learning_rate": 0.0001916362536286183, + "loss": 4.3749, + "step": 3802 + }, + { + "epoch": 0.3943256712181402, + "grad_norm": 1.203125, + "learning_rate": 0.00019163190437434543, + "loss": 4.4329, + "step": 3803 + }, + { + "epoch": 0.3944293592726283, + "grad_norm": 1.453125, + "learning_rate": 0.00019162755403891293, + "loss": 4.4685, + "step": 3804 + }, + { + "epoch": 0.39453304732711636, + "grad_norm": 1.3125, + "learning_rate": 0.00019162320262237215, + "loss": 4.4298, + "step": 3805 + }, + { + "epoch": 0.39463673538160443, + "grad_norm": 1.296875, + "learning_rate": 0.00019161885012477443, + "loss": 4.4126, + "step": 3806 + }, + { + "epoch": 0.3947404234360925, + "grad_norm": 1.265625, + "learning_rate": 0.00019161449654617112, + "loss": 4.4101, + "step": 3807 + }, + { + "epoch": 0.3948441114905806, + "grad_norm": 1.3046875, + "learning_rate": 0.0001916101418866136, + "loss": 4.3926, + "step": 3808 + }, + { + "epoch": 0.39494779954506865, + "grad_norm": 1.2421875, + "learning_rate": 0.00019160578614615321, + "loss": 4.4267, + "step": 3809 + }, + { + "epoch": 0.3950514875995567, + "grad_norm": 1.2265625, + "learning_rate": 0.00019160142932484142, + "loss": 4.4472, + "step": 3810 + }, + { + "epoch": 0.3951551756540448, + "grad_norm": 1.1484375, + "learning_rate": 0.00019159707142272952, + "loss": 4.4031, + "step": 3811 + }, + { + "epoch": 0.39525886370853286, + "grad_norm": 1.3125, + "learning_rate": 0.0001915927124398691, + "loss": 4.4008, + "step": 3812 + }, + { + "epoch": 0.39536255176302093, + "grad_norm": 1.203125, + "learning_rate": 0.00019158835237631142, + "loss": 4.3912, + "step": 3813 + }, + { + "epoch": 0.395466239817509, + "grad_norm": 1.1796875, + "learning_rate": 0.00019158399123210803, + "loss": 4.3948, + "step": 3814 + }, + { + "epoch": 0.3955699278719971, + "grad_norm": 1.1171875, + "learning_rate": 0.0001915796290073103, + "loss": 4.3967, + "step": 3815 + }, + { + "epoch": 0.39567361592648514, + "grad_norm": 1.3515625, + "learning_rate": 0.00019157526570196982, + "loss": 4.4215, + "step": 3816 + }, + { + "epoch": 0.3957773039809732, + "grad_norm": 1.265625, + "learning_rate": 0.00019157090131613797, + "loss": 4.4453, + "step": 3817 + }, + { + "epoch": 0.3958809920354613, + "grad_norm": 1.265625, + "learning_rate": 0.00019156653584986627, + "loss": 4.4207, + "step": 3818 + }, + { + "epoch": 0.3959846800899494, + "grad_norm": 1.140625, + "learning_rate": 0.00019156216930320624, + "loss": 4.372, + "step": 3819 + }, + { + "epoch": 0.3960883681444375, + "grad_norm": 1.2421875, + "learning_rate": 0.0001915578016762094, + "loss": 4.4139, + "step": 3820 + }, + { + "epoch": 0.39619205619892556, + "grad_norm": 1.15625, + "learning_rate": 0.0001915534329689273, + "loss": 4.4365, + "step": 3821 + }, + { + "epoch": 0.39629574425341363, + "grad_norm": 1.46875, + "learning_rate": 0.00019154906318141147, + "loss": 4.3953, + "step": 3822 + }, + { + "epoch": 0.3963994323079017, + "grad_norm": 1.3671875, + "learning_rate": 0.00019154469231371345, + "loss": 4.4415, + "step": 3823 + }, + { + "epoch": 0.39650312036238977, + "grad_norm": 1.2734375, + "learning_rate": 0.0001915403203658848, + "loss": 4.4293, + "step": 3824 + }, + { + "epoch": 0.39660680841687784, + "grad_norm": 1.203125, + "learning_rate": 0.00019153594733797716, + "loss": 4.4074, + "step": 3825 + }, + { + "epoch": 0.3967104964713659, + "grad_norm": 1.25, + "learning_rate": 0.00019153157323004212, + "loss": 4.4193, + "step": 3826 + }, + { + "epoch": 0.396814184525854, + "grad_norm": 1.140625, + "learning_rate": 0.00019152719804213124, + "loss": 4.4267, + "step": 3827 + }, + { + "epoch": 0.39691787258034206, + "grad_norm": 1.421875, + "learning_rate": 0.0001915228217742962, + "loss": 4.4383, + "step": 3828 + }, + { + "epoch": 0.39702156063483013, + "grad_norm": 1.28125, + "learning_rate": 0.00019151844442658858, + "loss": 4.3961, + "step": 3829 + }, + { + "epoch": 0.3971252486893182, + "grad_norm": 1.2578125, + "learning_rate": 0.00019151406599906005, + "loss": 4.4261, + "step": 3830 + }, + { + "epoch": 0.39722893674380627, + "grad_norm": 1.1796875, + "learning_rate": 0.00019150968649176228, + "loss": 4.4638, + "step": 3831 + }, + { + "epoch": 0.39733262479829434, + "grad_norm": 1.1484375, + "learning_rate": 0.00019150530590474694, + "loss": 4.4539, + "step": 3832 + }, + { + "epoch": 0.3974363128527824, + "grad_norm": 1.046875, + "learning_rate": 0.00019150092423806574, + "loss": 4.4545, + "step": 3833 + }, + { + "epoch": 0.3975400009072705, + "grad_norm": 1.1484375, + "learning_rate": 0.00019149654149177035, + "loss": 4.4372, + "step": 3834 + }, + { + "epoch": 0.39764368896175856, + "grad_norm": 0.97265625, + "learning_rate": 0.00019149215766591247, + "loss": 4.401, + "step": 3835 + }, + { + "epoch": 0.39774737701624663, + "grad_norm": 1.3203125, + "learning_rate": 0.00019148777276054385, + "loss": 4.4252, + "step": 3836 + }, + { + "epoch": 0.3978510650707347, + "grad_norm": 1.140625, + "learning_rate": 0.0001914833867757162, + "loss": 4.417, + "step": 3837 + }, + { + "epoch": 0.39795475312522277, + "grad_norm": 1.390625, + "learning_rate": 0.0001914789997114813, + "loss": 4.428, + "step": 3838 + }, + { + "epoch": 0.39805844117971084, + "grad_norm": 1.1953125, + "learning_rate": 0.00019147461156789088, + "loss": 4.401, + "step": 3839 + }, + { + "epoch": 0.3981621292341989, + "grad_norm": 1.375, + "learning_rate": 0.00019147022234499675, + "loss": 4.4456, + "step": 3840 + }, + { + "epoch": 0.398265817288687, + "grad_norm": 1.1796875, + "learning_rate": 0.00019146583204285073, + "loss": 4.425, + "step": 3841 + }, + { + "epoch": 0.39836950534317506, + "grad_norm": 1.359375, + "learning_rate": 0.00019146144066150452, + "loss": 4.456, + "step": 3842 + }, + { + "epoch": 0.39847319339766313, + "grad_norm": 1.2421875, + "learning_rate": 0.00019145704820101, + "loss": 4.3935, + "step": 3843 + }, + { + "epoch": 0.3985768814521512, + "grad_norm": 1.1953125, + "learning_rate": 0.00019145265466141898, + "loss": 4.4166, + "step": 3844 + }, + { + "epoch": 0.39868056950663927, + "grad_norm": 1.1171875, + "learning_rate": 0.00019144826004278333, + "loss": 4.4599, + "step": 3845 + }, + { + "epoch": 0.39878425756112734, + "grad_norm": 1.3359375, + "learning_rate": 0.00019144386434515487, + "loss": 4.4153, + "step": 3846 + }, + { + "epoch": 0.3988879456156154, + "grad_norm": 1.1328125, + "learning_rate": 0.00019143946756858547, + "loss": 4.3922, + "step": 3847 + }, + { + "epoch": 0.3989916336701035, + "grad_norm": 1.46875, + "learning_rate": 0.00019143506971312698, + "loss": 4.4292, + "step": 3848 + }, + { + "epoch": 0.39909532172459156, + "grad_norm": 1.3125, + "learning_rate": 0.00019143067077883138, + "loss": 4.4366, + "step": 3849 + }, + { + "epoch": 0.3991990097790796, + "grad_norm": 1.2421875, + "learning_rate": 0.00019142627076575047, + "loss": 4.3678, + "step": 3850 + }, + { + "epoch": 0.3993026978335677, + "grad_norm": 1.125, + "learning_rate": 0.0001914218696739362, + "loss": 4.3722, + "step": 3851 + }, + { + "epoch": 0.39940638588805577, + "grad_norm": 1.296875, + "learning_rate": 0.0001914174675034405, + "loss": 4.4099, + "step": 3852 + }, + { + "epoch": 0.39951007394254384, + "grad_norm": 1.109375, + "learning_rate": 0.00019141306425431533, + "loss": 4.4257, + "step": 3853 + }, + { + "epoch": 0.3996137619970319, + "grad_norm": 1.71875, + "learning_rate": 0.00019140865992661263, + "loss": 4.4804, + "step": 3854 + }, + { + "epoch": 0.39971745005152, + "grad_norm": 1.5625, + "learning_rate": 0.00019140425452038437, + "loss": 4.4406, + "step": 3855 + }, + { + "epoch": 0.39982113810600806, + "grad_norm": 1.3125, + "learning_rate": 0.0001913998480356825, + "loss": 4.4184, + "step": 3856 + }, + { + "epoch": 0.3999248261604961, + "grad_norm": 1.2890625, + "learning_rate": 0.00019139544047255908, + "loss": 4.3869, + "step": 3857 + }, + { + "epoch": 0.4000285142149842, + "grad_norm": 1.1953125, + "learning_rate": 0.00019139103183106606, + "loss": 4.3996, + "step": 3858 + }, + { + "epoch": 0.40013220226947227, + "grad_norm": 1.1484375, + "learning_rate": 0.00019138662211125546, + "loss": 4.3957, + "step": 3859 + }, + { + "epoch": 0.40023589032396034, + "grad_norm": 1.28125, + "learning_rate": 0.0001913822113131793, + "loss": 4.4421, + "step": 3860 + }, + { + "epoch": 0.4003395783784484, + "grad_norm": 1.1484375, + "learning_rate": 0.00019137779943688966, + "loss": 4.4283, + "step": 3861 + }, + { + "epoch": 0.4004432664329365, + "grad_norm": 1.3359375, + "learning_rate": 0.00019137338648243855, + "loss": 4.4069, + "step": 3862 + }, + { + "epoch": 0.40054695448742456, + "grad_norm": 1.2265625, + "learning_rate": 0.00019136897244987814, + "loss": 4.4076, + "step": 3863 + }, + { + "epoch": 0.4006506425419127, + "grad_norm": 1.140625, + "learning_rate": 0.00019136455733926035, + "loss": 4.4231, + "step": 3864 + }, + { + "epoch": 0.40075433059640075, + "grad_norm": 1.0, + "learning_rate": 0.0001913601411506374, + "loss": 4.4225, + "step": 3865 + }, + { + "epoch": 0.4008580186508888, + "grad_norm": 1.15625, + "learning_rate": 0.00019135572388406135, + "loss": 4.4317, + "step": 3866 + }, + { + "epoch": 0.4009617067053769, + "grad_norm": 1.015625, + "learning_rate": 0.00019135130553958433, + "loss": 4.4126, + "step": 3867 + }, + { + "epoch": 0.40106539475986497, + "grad_norm": 1.3203125, + "learning_rate": 0.0001913468861172585, + "loss": 4.4147, + "step": 3868 + }, + { + "epoch": 0.40116908281435304, + "grad_norm": 1.21875, + "learning_rate": 0.0001913424656171359, + "loss": 4.407, + "step": 3869 + }, + { + "epoch": 0.4012727708688411, + "grad_norm": 1.3671875, + "learning_rate": 0.00019133804403926883, + "loss": 4.4141, + "step": 3870 + }, + { + "epoch": 0.4013764589233292, + "grad_norm": 1.234375, + "learning_rate": 0.00019133362138370935, + "loss": 4.3927, + "step": 3871 + }, + { + "epoch": 0.40148014697781725, + "grad_norm": 1.328125, + "learning_rate": 0.00019132919765050969, + "loss": 4.3875, + "step": 3872 + }, + { + "epoch": 0.4015838350323053, + "grad_norm": 1.2109375, + "learning_rate": 0.000191324772839722, + "loss": 4.4231, + "step": 3873 + }, + { + "epoch": 0.4016875230867934, + "grad_norm": 1.328125, + "learning_rate": 0.00019132034695139858, + "loss": 4.3975, + "step": 3874 + }, + { + "epoch": 0.40179121114128147, + "grad_norm": 1.234375, + "learning_rate": 0.00019131591998559157, + "loss": 4.3899, + "step": 3875 + }, + { + "epoch": 0.40189489919576954, + "grad_norm": 1.2109375, + "learning_rate": 0.00019131149194235323, + "loss": 4.3924, + "step": 3876 + }, + { + "epoch": 0.4019985872502576, + "grad_norm": 1.1640625, + "learning_rate": 0.0001913070628217358, + "loss": 4.4521, + "step": 3877 + }, + { + "epoch": 0.4021022753047457, + "grad_norm": 1.25, + "learning_rate": 0.0001913026326237916, + "loss": 4.4667, + "step": 3878 + }, + { + "epoch": 0.40220596335923375, + "grad_norm": 1.1484375, + "learning_rate": 0.00019129820134857278, + "loss": 4.4182, + "step": 3879 + }, + { + "epoch": 0.4023096514137218, + "grad_norm": 1.2421875, + "learning_rate": 0.0001912937689961317, + "loss": 4.4307, + "step": 3880 + }, + { + "epoch": 0.4024133394682099, + "grad_norm": 1.1875, + "learning_rate": 0.00019128933556652067, + "loss": 4.4365, + "step": 3881 + }, + { + "epoch": 0.40251702752269797, + "grad_norm": 1.3046875, + "learning_rate": 0.00019128490105979195, + "loss": 4.4601, + "step": 3882 + }, + { + "epoch": 0.40262071557718604, + "grad_norm": 1.203125, + "learning_rate": 0.00019128046547599791, + "loss": 4.4408, + "step": 3883 + }, + { + "epoch": 0.4027244036316741, + "grad_norm": 1.3671875, + "learning_rate": 0.00019127602881519086, + "loss": 4.4029, + "step": 3884 + }, + { + "epoch": 0.4028280916861622, + "grad_norm": 1.265625, + "learning_rate": 0.00019127159107742316, + "loss": 4.4356, + "step": 3885 + }, + { + "epoch": 0.40293177974065025, + "grad_norm": 1.2421875, + "learning_rate": 0.00019126715226274715, + "loss": 4.4414, + "step": 3886 + }, + { + "epoch": 0.4030354677951383, + "grad_norm": 1.15625, + "learning_rate": 0.00019126271237121523, + "loss": 4.4243, + "step": 3887 + }, + { + "epoch": 0.4031391558496264, + "grad_norm": 1.3125, + "learning_rate": 0.00019125827140287975, + "loss": 4.3972, + "step": 3888 + }, + { + "epoch": 0.40324284390411447, + "grad_norm": 1.203125, + "learning_rate": 0.00019125382935779314, + "loss": 4.3953, + "step": 3889 + }, + { + "epoch": 0.40334653195860254, + "grad_norm": 1.4140625, + "learning_rate": 0.0001912493862360078, + "loss": 4.4277, + "step": 3890 + }, + { + "epoch": 0.4034502200130906, + "grad_norm": 1.2578125, + "learning_rate": 0.0001912449420375762, + "loss": 4.4313, + "step": 3891 + }, + { + "epoch": 0.4035539080675787, + "grad_norm": 1.5, + "learning_rate": 0.00019124049676255068, + "loss": 4.4294, + "step": 3892 + }, + { + "epoch": 0.40365759612206675, + "grad_norm": 1.3203125, + "learning_rate": 0.00019123605041098377, + "loss": 4.4509, + "step": 3893 + }, + { + "epoch": 0.4037612841765548, + "grad_norm": 1.328125, + "learning_rate": 0.00019123160298292792, + "loss": 4.4061, + "step": 3894 + }, + { + "epoch": 0.4038649722310429, + "grad_norm": 1.2421875, + "learning_rate": 0.00019122715447843555, + "loss": 4.4158, + "step": 3895 + }, + { + "epoch": 0.40396866028553097, + "grad_norm": 1.2890625, + "learning_rate": 0.00019122270489755922, + "loss": 4.3951, + "step": 3896 + }, + { + "epoch": 0.40407234834001904, + "grad_norm": 1.1484375, + "learning_rate": 0.00019121825424035137, + "loss": 4.3816, + "step": 3897 + }, + { + "epoch": 0.4041760363945071, + "grad_norm": 1.3125, + "learning_rate": 0.0001912138025068646, + "loss": 4.4206, + "step": 3898 + }, + { + "epoch": 0.4042797244489952, + "grad_norm": 1.28125, + "learning_rate": 0.00019120934969715133, + "loss": 4.4204, + "step": 3899 + }, + { + "epoch": 0.40438341250348325, + "grad_norm": 1.265625, + "learning_rate": 0.00019120489581126414, + "loss": 4.4165, + "step": 3900 + }, + { + "epoch": 0.4044871005579713, + "grad_norm": 1.203125, + "learning_rate": 0.0001912004408492556, + "loss": 4.4463, + "step": 3901 + }, + { + "epoch": 0.4045907886124594, + "grad_norm": 1.1328125, + "learning_rate": 0.00019119598481117826, + "loss": 4.4039, + "step": 3902 + }, + { + "epoch": 0.40469447666694747, + "grad_norm": 1.125, + "learning_rate": 0.00019119152769708474, + "loss": 4.4281, + "step": 3903 + }, + { + "epoch": 0.40479816472143554, + "grad_norm": 1.28125, + "learning_rate": 0.00019118706950702752, + "loss": 4.4457, + "step": 3904 + }, + { + "epoch": 0.4049018527759236, + "grad_norm": 1.140625, + "learning_rate": 0.0001911826102410593, + "loss": 4.4312, + "step": 3905 + }, + { + "epoch": 0.4050055408304117, + "grad_norm": 1.3125, + "learning_rate": 0.00019117814989923267, + "loss": 4.4275, + "step": 3906 + }, + { + "epoch": 0.40510922888489975, + "grad_norm": 1.2109375, + "learning_rate": 0.00019117368848160023, + "loss": 4.4088, + "step": 3907 + }, + { + "epoch": 0.4052129169393879, + "grad_norm": 1.1796875, + "learning_rate": 0.00019116922598821464, + "loss": 4.4433, + "step": 3908 + }, + { + "epoch": 0.40531660499387595, + "grad_norm": 1.0546875, + "learning_rate": 0.00019116476241912858, + "loss": 4.4212, + "step": 3909 + }, + { + "epoch": 0.405420293048364, + "grad_norm": 1.1796875, + "learning_rate": 0.00019116029777439467, + "loss": 4.3693, + "step": 3910 + }, + { + "epoch": 0.4055239811028521, + "grad_norm": 1.0234375, + "learning_rate": 0.0001911558320540656, + "loss": 4.416, + "step": 3911 + }, + { + "epoch": 0.40562766915734016, + "grad_norm": 1.265625, + "learning_rate": 0.00019115136525819408, + "loss": 4.4272, + "step": 3912 + }, + { + "epoch": 0.40573135721182824, + "grad_norm": 1.125, + "learning_rate": 0.00019114689738683281, + "loss": 4.3669, + "step": 3913 + }, + { + "epoch": 0.4058350452663163, + "grad_norm": 1.4296875, + "learning_rate": 0.0001911424284400345, + "loss": 4.4228, + "step": 3914 + }, + { + "epoch": 0.4059387333208044, + "grad_norm": 1.265625, + "learning_rate": 0.00019113795841785183, + "loss": 4.415, + "step": 3915 + }, + { + "epoch": 0.40604242137529245, + "grad_norm": 1.3515625, + "learning_rate": 0.00019113348732033764, + "loss": 4.4196, + "step": 3916 + }, + { + "epoch": 0.4061461094297805, + "grad_norm": 1.3203125, + "learning_rate": 0.0001911290151475446, + "loss": 4.3969, + "step": 3917 + }, + { + "epoch": 0.4062497974842686, + "grad_norm": 1.2890625, + "learning_rate": 0.0001911245418995255, + "loss": 4.3875, + "step": 3918 + }, + { + "epoch": 0.40635348553875666, + "grad_norm": 1.1484375, + "learning_rate": 0.00019112006757633317, + "loss": 4.3719, + "step": 3919 + }, + { + "epoch": 0.40645717359324474, + "grad_norm": 1.3515625, + "learning_rate": 0.00019111559217802033, + "loss": 4.3987, + "step": 3920 + }, + { + "epoch": 0.4065608616477328, + "grad_norm": 1.234375, + "learning_rate": 0.00019111111570463982, + "loss": 4.434, + "step": 3921 + }, + { + "epoch": 0.4066645497022209, + "grad_norm": 1.5, + "learning_rate": 0.00019110663815624448, + "loss": 4.4007, + "step": 3922 + }, + { + "epoch": 0.40676823775670895, + "grad_norm": 1.375, + "learning_rate": 0.00019110215953288708, + "loss": 4.3783, + "step": 3923 + }, + { + "epoch": 0.406871925811197, + "grad_norm": 1.3359375, + "learning_rate": 0.0001910976798346205, + "loss": 4.4016, + "step": 3924 + }, + { + "epoch": 0.4069756138656851, + "grad_norm": 1.1875, + "learning_rate": 0.0001910931990614976, + "loss": 4.4789, + "step": 3925 + }, + { + "epoch": 0.40707930192017316, + "grad_norm": 1.3828125, + "learning_rate": 0.00019108871721357126, + "loss": 4.419, + "step": 3926 + }, + { + "epoch": 0.40718298997466124, + "grad_norm": 1.234375, + "learning_rate": 0.00019108423429089433, + "loss": 4.389, + "step": 3927 + }, + { + "epoch": 0.4072866780291493, + "grad_norm": 1.296875, + "learning_rate": 0.0001910797502935197, + "loss": 4.4432, + "step": 3928 + }, + { + "epoch": 0.4073903660836374, + "grad_norm": 1.234375, + "learning_rate": 0.0001910752652215003, + "loss": 4.4265, + "step": 3929 + }, + { + "epoch": 0.40749405413812545, + "grad_norm": 1.203125, + "learning_rate": 0.00019107077907488905, + "loss": 4.3687, + "step": 3930 + }, + { + "epoch": 0.4075977421926135, + "grad_norm": 1.1328125, + "learning_rate": 0.00019106629185373887, + "loss": 4.4174, + "step": 3931 + }, + { + "epoch": 0.4077014302471016, + "grad_norm": 1.1640625, + "learning_rate": 0.0001910618035581027, + "loss": 4.4237, + "step": 3932 + }, + { + "epoch": 0.40780511830158966, + "grad_norm": 1.0390625, + "learning_rate": 0.0001910573141880335, + "loss": 4.4584, + "step": 3933 + }, + { + "epoch": 0.40790880635607774, + "grad_norm": 1.1875, + "learning_rate": 0.00019105282374358424, + "loss": 4.4041, + "step": 3934 + }, + { + "epoch": 0.4080124944105658, + "grad_norm": 1.0, + "learning_rate": 0.00019104833222480792, + "loss": 4.4426, + "step": 3935 + }, + { + "epoch": 0.4081161824650539, + "grad_norm": 1.5546875, + "learning_rate": 0.00019104383963175755, + "loss": 4.4512, + "step": 3936 + }, + { + "epoch": 0.40821987051954195, + "grad_norm": 1.390625, + "learning_rate": 0.00019103934596448606, + "loss": 4.4471, + "step": 3937 + }, + { + "epoch": 0.40832355857403, + "grad_norm": 1.2265625, + "learning_rate": 0.00019103485122304653, + "loss": 4.432, + "step": 3938 + }, + { + "epoch": 0.4084272466285181, + "grad_norm": 1.15625, + "learning_rate": 0.000191030355407492, + "loss": 4.3946, + "step": 3939 + }, + { + "epoch": 0.40853093468300616, + "grad_norm": 1.1953125, + "learning_rate": 0.0001910258585178755, + "loss": 4.4321, + "step": 3940 + }, + { + "epoch": 0.40863462273749424, + "grad_norm": 0.99609375, + "learning_rate": 0.00019102136055425007, + "loss": 4.4022, + "step": 3941 + }, + { + "epoch": 0.4087383107919823, + "grad_norm": 1.359375, + "learning_rate": 0.0001910168615166688, + "loss": 4.4023, + "step": 3942 + }, + { + "epoch": 0.4088419988464704, + "grad_norm": 1.234375, + "learning_rate": 0.0001910123614051848, + "loss": 4.4107, + "step": 3943 + }, + { + "epoch": 0.40894568690095845, + "grad_norm": 1.34375, + "learning_rate": 0.00019100786021985112, + "loss": 4.3875, + "step": 3944 + }, + { + "epoch": 0.4090493749554465, + "grad_norm": 1.265625, + "learning_rate": 0.00019100335796072088, + "loss": 4.4059, + "step": 3945 + }, + { + "epoch": 0.4091530630099346, + "grad_norm": 1.2265625, + "learning_rate": 0.00019099885462784723, + "loss": 4.4487, + "step": 3946 + }, + { + "epoch": 0.40925675106442266, + "grad_norm": 1.046875, + "learning_rate": 0.00019099435022128328, + "loss": 4.3937, + "step": 3947 + }, + { + "epoch": 0.40936043911891073, + "grad_norm": 1.234375, + "learning_rate": 0.0001909898447410822, + "loss": 4.4254, + "step": 3948 + }, + { + "epoch": 0.4094641271733988, + "grad_norm": 1.1171875, + "learning_rate": 0.0001909853381872971, + "loss": 4.4152, + "step": 3949 + }, + { + "epoch": 0.4095678152278869, + "grad_norm": 1.3984375, + "learning_rate": 0.0001909808305599812, + "loss": 4.4128, + "step": 3950 + }, + { + "epoch": 0.40967150328237495, + "grad_norm": 1.296875, + "learning_rate": 0.00019097632185918767, + "loss": 4.3842, + "step": 3951 + }, + { + "epoch": 0.409775191336863, + "grad_norm": 1.15625, + "learning_rate": 0.0001909718120849697, + "loss": 4.4393, + "step": 3952 + }, + { + "epoch": 0.40987887939135115, + "grad_norm": 1.1015625, + "learning_rate": 0.00019096730123738053, + "loss": 4.4265, + "step": 3953 + }, + { + "epoch": 0.4099825674458392, + "grad_norm": 1.203125, + "learning_rate": 0.00019096278931647333, + "loss": 4.4435, + "step": 3954 + }, + { + "epoch": 0.4100862555003273, + "grad_norm": 1.0390625, + "learning_rate": 0.0001909582763223014, + "loss": 4.3978, + "step": 3955 + }, + { + "epoch": 0.41018994355481536, + "grad_norm": 1.2265625, + "learning_rate": 0.00019095376225491793, + "loss": 4.4641, + "step": 3956 + }, + { + "epoch": 0.41029363160930343, + "grad_norm": 1.1328125, + "learning_rate": 0.00019094924711437623, + "loss": 4.4192, + "step": 3957 + }, + { + "epoch": 0.4103973196637915, + "grad_norm": 1.2265625, + "learning_rate": 0.00019094473090072955, + "loss": 4.4066, + "step": 3958 + }, + { + "epoch": 0.4105010077182796, + "grad_norm": 1.15625, + "learning_rate": 0.00019094021361403115, + "loss": 4.4015, + "step": 3959 + }, + { + "epoch": 0.41060469577276765, + "grad_norm": 1.203125, + "learning_rate": 0.00019093569525433437, + "loss": 4.4408, + "step": 3960 + }, + { + "epoch": 0.4107083838272557, + "grad_norm": 1.0546875, + "learning_rate": 0.00019093117582169252, + "loss": 4.4277, + "step": 3961 + }, + { + "epoch": 0.4108120718817438, + "grad_norm": 1.25, + "learning_rate": 0.00019092665531615893, + "loss": 4.3769, + "step": 3962 + }, + { + "epoch": 0.41091575993623186, + "grad_norm": 1.1953125, + "learning_rate": 0.0001909221337377869, + "loss": 4.4123, + "step": 3963 + }, + { + "epoch": 0.41101944799071993, + "grad_norm": 1.2109375, + "learning_rate": 0.00019091761108662979, + "loss": 4.4338, + "step": 3964 + }, + { + "epoch": 0.411123136045208, + "grad_norm": 1.1484375, + "learning_rate": 0.00019091308736274098, + "loss": 4.4592, + "step": 3965 + }, + { + "epoch": 0.4112268240996961, + "grad_norm": 1.09375, + "learning_rate": 0.00019090856256617384, + "loss": 4.449, + "step": 3966 + }, + { + "epoch": 0.41133051215418415, + "grad_norm": 1.0234375, + "learning_rate": 0.00019090403669698178, + "loss": 4.4338, + "step": 3967 + }, + { + "epoch": 0.4114342002086722, + "grad_norm": 1.3125, + "learning_rate": 0.00019089950975521814, + "loss": 4.4226, + "step": 3968 + }, + { + "epoch": 0.4115378882631603, + "grad_norm": 1.125, + "learning_rate": 0.0001908949817409364, + "loss": 4.3943, + "step": 3969 + }, + { + "epoch": 0.41164157631764836, + "grad_norm": 1.4375, + "learning_rate": 0.0001908904526541899, + "loss": 4.3946, + "step": 3970 + }, + { + "epoch": 0.41174526437213643, + "grad_norm": 1.28125, + "learning_rate": 0.0001908859224950322, + "loss": 4.4524, + "step": 3971 + }, + { + "epoch": 0.4118489524266245, + "grad_norm": 1.40625, + "learning_rate": 0.00019088139126351665, + "loss": 4.4208, + "step": 3972 + }, + { + "epoch": 0.4119526404811126, + "grad_norm": 1.25, + "learning_rate": 0.00019087685895969675, + "loss": 4.3971, + "step": 3973 + }, + { + "epoch": 0.41205632853560065, + "grad_norm": 1.5546875, + "learning_rate": 0.000190872325583626, + "loss": 4.389, + "step": 3974 + }, + { + "epoch": 0.4121600165900887, + "grad_norm": 1.2421875, + "learning_rate": 0.00019086779113535785, + "loss": 4.3749, + "step": 3975 + }, + { + "epoch": 0.4122637046445768, + "grad_norm": 1.8203125, + "learning_rate": 0.00019086325561494581, + "loss": 4.4153, + "step": 3976 + }, + { + "epoch": 0.41236739269906486, + "grad_norm": 1.640625, + "learning_rate": 0.00019085871902244344, + "loss": 4.4229, + "step": 3977 + }, + { + "epoch": 0.41247108075355293, + "grad_norm": 1.1328125, + "learning_rate": 0.00019085418135790416, + "loss": 4.4027, + "step": 3978 + }, + { + "epoch": 0.412574768808041, + "grad_norm": 1.171875, + "learning_rate": 0.00019084964262138163, + "loss": 4.387, + "step": 3979 + }, + { + "epoch": 0.4126784568625291, + "grad_norm": 1.03125, + "learning_rate": 0.00019084510281292932, + "loss": 4.4023, + "step": 3980 + }, + { + "epoch": 0.41278214491701715, + "grad_norm": 1.1171875, + "learning_rate": 0.00019084056193260086, + "loss": 4.4215, + "step": 3981 + }, + { + "epoch": 0.4128858329715052, + "grad_norm": 1.0625, + "learning_rate": 0.00019083601998044975, + "loss": 4.4307, + "step": 3982 + }, + { + "epoch": 0.4129895210259933, + "grad_norm": 0.96484375, + "learning_rate": 0.00019083147695652964, + "loss": 4.4602, + "step": 3983 + }, + { + "epoch": 0.41309320908048136, + "grad_norm": 1.0390625, + "learning_rate": 0.0001908269328608941, + "loss": 4.4425, + "step": 3984 + }, + { + "epoch": 0.41319689713496943, + "grad_norm": 0.8671875, + "learning_rate": 0.00019082238769359676, + "loss": 4.3991, + "step": 3985 + }, + { + "epoch": 0.4133005851894575, + "grad_norm": 0.94140625, + "learning_rate": 0.00019081784145469127, + "loss": 4.3821, + "step": 3986 + }, + { + "epoch": 0.4134042732439456, + "grad_norm": 0.8515625, + "learning_rate": 0.00019081329414423123, + "loss": 4.4002, + "step": 3987 + }, + { + "epoch": 0.41350796129843365, + "grad_norm": 0.8203125, + "learning_rate": 0.00019080874576227034, + "loss": 4.3869, + "step": 3988 + }, + { + "epoch": 0.4136116493529217, + "grad_norm": 0.82421875, + "learning_rate": 0.00019080419630886222, + "loss": 4.4355, + "step": 3989 + }, + { + "epoch": 0.4137153374074098, + "grad_norm": 0.75390625, + "learning_rate": 0.00019079964578406057, + "loss": 4.4083, + "step": 3990 + }, + { + "epoch": 0.41381902546189786, + "grad_norm": 0.76953125, + "learning_rate": 0.00019079509418791908, + "loss": 4.4459, + "step": 3991 + }, + { + "epoch": 0.41392271351638593, + "grad_norm": 0.703125, + "learning_rate": 0.00019079054152049146, + "loss": 4.4067, + "step": 3992 + }, + { + "epoch": 0.414026401570874, + "grad_norm": 0.671875, + "learning_rate": 0.00019078598778183138, + "loss": 4.4076, + "step": 3993 + }, + { + "epoch": 0.4141300896253621, + "grad_norm": 0.69140625, + "learning_rate": 0.00019078143297199268, + "loss": 4.4011, + "step": 3994 + }, + { + "epoch": 0.41423377767985015, + "grad_norm": 0.65234375, + "learning_rate": 0.000190776877091029, + "loss": 4.3641, + "step": 3995 + }, + { + "epoch": 0.4143374657343382, + "grad_norm": 0.67578125, + "learning_rate": 0.00019077232013899409, + "loss": 4.3824, + "step": 3996 + }, + { + "epoch": 0.4144411537888263, + "grad_norm": 0.60546875, + "learning_rate": 0.00019076776211594178, + "loss": 4.3664, + "step": 3997 + }, + { + "epoch": 0.4145448418433144, + "grad_norm": 0.6484375, + "learning_rate": 0.00019076320302192584, + "loss": 4.442, + "step": 3998 + }, + { + "epoch": 0.4146485298978025, + "grad_norm": 0.58203125, + "learning_rate": 0.00019075864285700002, + "loss": 4.4118, + "step": 3999 + }, + { + "epoch": 0.41475221795229056, + "grad_norm": 0.59765625, + "learning_rate": 0.00019075408162121815, + "loss": 4.4445, + "step": 4000 + }, + { + "epoch": 0.41485590600677863, + "grad_norm": 0.59375, + "learning_rate": 0.00019074951931463406, + "loss": 4.375, + "step": 4001 + }, + { + "epoch": 0.4149595940612667, + "grad_norm": 0.66015625, + "learning_rate": 0.00019074495593730158, + "loss": 4.4621, + "step": 4002 + }, + { + "epoch": 0.4150632821157548, + "grad_norm": 0.5859375, + "learning_rate": 0.00019074039148927452, + "loss": 4.354, + "step": 4003 + }, + { + "epoch": 0.41516697017024284, + "grad_norm": 0.66015625, + "learning_rate": 0.00019073582597060678, + "loss": 4.3905, + "step": 4004 + }, + { + "epoch": 0.4152706582247309, + "grad_norm": 0.59765625, + "learning_rate": 0.00019073125938135217, + "loss": 4.4419, + "step": 4005 + }, + { + "epoch": 0.415374346279219, + "grad_norm": 0.5859375, + "learning_rate": 0.0001907266917215646, + "loss": 4.4289, + "step": 4006 + }, + { + "epoch": 0.41547803433370706, + "grad_norm": 0.609375, + "learning_rate": 0.00019072212299129802, + "loss": 4.4418, + "step": 4007 + }, + { + "epoch": 0.41558172238819513, + "grad_norm": 0.578125, + "learning_rate": 0.00019071755319060628, + "loss": 4.3701, + "step": 4008 + }, + { + "epoch": 0.4156854104426832, + "grad_norm": 0.578125, + "learning_rate": 0.00019071298231954325, + "loss": 4.3819, + "step": 4009 + }, + { + "epoch": 0.41578909849717127, + "grad_norm": 0.60546875, + "learning_rate": 0.00019070841037816297, + "loss": 4.4376, + "step": 4010 + }, + { + "epoch": 0.41589278655165934, + "grad_norm": 0.5625, + "learning_rate": 0.0001907038373665193, + "loss": 4.4225, + "step": 4011 + }, + { + "epoch": 0.4159964746061474, + "grad_norm": 0.6171875, + "learning_rate": 0.00019069926328466624, + "loss": 4.394, + "step": 4012 + }, + { + "epoch": 0.4161001626606355, + "grad_norm": 0.52734375, + "learning_rate": 0.00019069468813265775, + "loss": 4.4233, + "step": 4013 + }, + { + "epoch": 0.41620385071512356, + "grad_norm": 0.59765625, + "learning_rate": 0.00019069011191054784, + "loss": 4.3972, + "step": 4014 + }, + { + "epoch": 0.41630753876961163, + "grad_norm": 0.53125, + "learning_rate": 0.00019068553461839042, + "loss": 4.4305, + "step": 4015 + }, + { + "epoch": 0.4164112268240997, + "grad_norm": 0.60546875, + "learning_rate": 0.0001906809562562396, + "loss": 4.41, + "step": 4016 + }, + { + "epoch": 0.41651491487858777, + "grad_norm": 0.6171875, + "learning_rate": 0.0001906763768241493, + "loss": 4.4322, + "step": 4017 + }, + { + "epoch": 0.41661860293307584, + "grad_norm": 0.55859375, + "learning_rate": 0.00019067179632217363, + "loss": 4.4129, + "step": 4018 + }, + { + "epoch": 0.4167222909875639, + "grad_norm": 0.53125, + "learning_rate": 0.00019066721475036657, + "loss": 4.4214, + "step": 4019 + }, + { + "epoch": 0.416825979042052, + "grad_norm": 0.54296875, + "learning_rate": 0.00019066263210878225, + "loss": 4.4172, + "step": 4020 + }, + { + "epoch": 0.41692966709654006, + "grad_norm": 0.515625, + "learning_rate": 0.0001906580483974747, + "loss": 4.3925, + "step": 4021 + }, + { + "epoch": 0.41703335515102813, + "grad_norm": 0.5703125, + "learning_rate": 0.000190653463616498, + "loss": 4.4388, + "step": 4022 + }, + { + "epoch": 0.4171370432055162, + "grad_norm": 0.51953125, + "learning_rate": 0.0001906488777659063, + "loss": 4.3655, + "step": 4023 + }, + { + "epoch": 0.41724073126000427, + "grad_norm": 0.5546875, + "learning_rate": 0.0001906442908457536, + "loss": 4.3968, + "step": 4024 + }, + { + "epoch": 0.41734441931449234, + "grad_norm": 0.5, + "learning_rate": 0.00019063970285609412, + "loss": 4.3332, + "step": 4025 + }, + { + "epoch": 0.4174481073689804, + "grad_norm": 0.6015625, + "learning_rate": 0.00019063511379698196, + "loss": 4.4032, + "step": 4026 + }, + { + "epoch": 0.4175517954234685, + "grad_norm": 0.5390625, + "learning_rate": 0.00019063052366847125, + "loss": 4.4094, + "step": 4027 + }, + { + "epoch": 0.41765548347795656, + "grad_norm": 0.5546875, + "learning_rate": 0.00019062593247061614, + "loss": 4.4239, + "step": 4028 + }, + { + "epoch": 0.41775917153244463, + "grad_norm": 0.515625, + "learning_rate": 0.00019062134020347084, + "loss": 4.4453, + "step": 4029 + }, + { + "epoch": 0.4178628595869327, + "grad_norm": 0.5390625, + "learning_rate": 0.00019061674686708953, + "loss": 4.3898, + "step": 4030 + }, + { + "epoch": 0.41796654764142077, + "grad_norm": 0.5, + "learning_rate": 0.00019061215246152637, + "loss": 4.3921, + "step": 4031 + }, + { + "epoch": 0.41807023569590884, + "grad_norm": 0.5078125, + "learning_rate": 0.00019060755698683561, + "loss": 4.4481, + "step": 4032 + }, + { + "epoch": 0.4181739237503969, + "grad_norm": 0.51171875, + "learning_rate": 0.00019060296044307146, + "loss": 4.4255, + "step": 4033 + }, + { + "epoch": 0.418277611804885, + "grad_norm": 0.4765625, + "learning_rate": 0.00019059836283028813, + "loss": 4.4163, + "step": 4034 + }, + { + "epoch": 0.41838129985937306, + "grad_norm": 0.5234375, + "learning_rate": 0.00019059376414853988, + "loss": 4.4293, + "step": 4035 + }, + { + "epoch": 0.41848498791386113, + "grad_norm": 0.51953125, + "learning_rate": 0.00019058916439788098, + "loss": 4.431, + "step": 4036 + }, + { + "epoch": 0.4185886759683492, + "grad_norm": 0.50390625, + "learning_rate": 0.00019058456357836571, + "loss": 4.3368, + "step": 4037 + }, + { + "epoch": 0.41869236402283727, + "grad_norm": 0.5234375, + "learning_rate": 0.00019057996169004833, + "loss": 4.4211, + "step": 4038 + }, + { + "epoch": 0.41879605207732534, + "grad_norm": 0.57421875, + "learning_rate": 0.0001905753587329832, + "loss": 4.4138, + "step": 4039 + }, + { + "epoch": 0.4188997401318134, + "grad_norm": 0.53515625, + "learning_rate": 0.0001905707547072245, + "loss": 4.373, + "step": 4040 + }, + { + "epoch": 0.4190034281863015, + "grad_norm": 0.54296875, + "learning_rate": 0.00019056614961282666, + "loss": 4.3839, + "step": 4041 + }, + { + "epoch": 0.4191071162407896, + "grad_norm": 0.486328125, + "learning_rate": 0.000190561543449844, + "loss": 4.4151, + "step": 4042 + }, + { + "epoch": 0.4192108042952777, + "grad_norm": 0.546875, + "learning_rate": 0.00019055693621833087, + "loss": 4.3428, + "step": 4043 + }, + { + "epoch": 0.41931449234976576, + "grad_norm": 0.53125, + "learning_rate": 0.00019055232791834157, + "loss": 4.3879, + "step": 4044 + }, + { + "epoch": 0.4194181804042538, + "grad_norm": 0.57421875, + "learning_rate": 0.00019054771854993055, + "loss": 4.3823, + "step": 4045 + }, + { + "epoch": 0.4195218684587419, + "grad_norm": 0.5859375, + "learning_rate": 0.00019054310811315213, + "loss": 4.411, + "step": 4046 + }, + { + "epoch": 0.41962555651322997, + "grad_norm": 0.51171875, + "learning_rate": 0.00019053849660806077, + "loss": 4.4158, + "step": 4047 + }, + { + "epoch": 0.41972924456771804, + "grad_norm": 0.52734375, + "learning_rate": 0.00019053388403471086, + "loss": 4.4367, + "step": 4048 + }, + { + "epoch": 0.4198329326222061, + "grad_norm": 0.5546875, + "learning_rate": 0.0001905292703931568, + "loss": 4.4272, + "step": 4049 + }, + { + "epoch": 0.4199366206766942, + "grad_norm": 0.4765625, + "learning_rate": 0.00019052465568345306, + "loss": 4.3869, + "step": 4050 + }, + { + "epoch": 0.42004030873118225, + "grad_norm": 0.54296875, + "learning_rate": 0.00019052003990565407, + "loss": 4.3906, + "step": 4051 + }, + { + "epoch": 0.4201439967856703, + "grad_norm": 0.50390625, + "learning_rate": 0.0001905154230598143, + "loss": 4.4166, + "step": 4052 + }, + { + "epoch": 0.4202476848401584, + "grad_norm": 0.5390625, + "learning_rate": 0.0001905108051459882, + "loss": 4.423, + "step": 4053 + }, + { + "epoch": 0.42035137289464647, + "grad_norm": 0.51171875, + "learning_rate": 0.00019050618616423025, + "loss": 4.4247, + "step": 4054 + }, + { + "epoch": 0.42045506094913454, + "grad_norm": 0.55859375, + "learning_rate": 0.00019050156611459502, + "loss": 4.4092, + "step": 4055 + }, + { + "epoch": 0.4205587490036226, + "grad_norm": 0.53125, + "learning_rate": 0.00019049694499713695, + "loss": 4.4116, + "step": 4056 + }, + { + "epoch": 0.4206624370581107, + "grad_norm": 0.55078125, + "learning_rate": 0.0001904923228119106, + "loss": 4.4161, + "step": 4057 + }, + { + "epoch": 0.42076612511259875, + "grad_norm": 0.49609375, + "learning_rate": 0.00019048769955897047, + "loss": 4.4182, + "step": 4058 + }, + { + "epoch": 0.4208698131670868, + "grad_norm": 0.57421875, + "learning_rate": 0.00019048307523837114, + "loss": 4.3953, + "step": 4059 + }, + { + "epoch": 0.4209735012215749, + "grad_norm": 0.5546875, + "learning_rate": 0.00019047844985016718, + "loss": 4.455, + "step": 4060 + }, + { + "epoch": 0.42107718927606297, + "grad_norm": 0.57421875, + "learning_rate": 0.00019047382339441315, + "loss": 4.4332, + "step": 4061 + }, + { + "epoch": 0.42118087733055104, + "grad_norm": 0.5234375, + "learning_rate": 0.00019046919587116366, + "loss": 4.456, + "step": 4062 + }, + { + "epoch": 0.4212845653850391, + "grad_norm": 0.54296875, + "learning_rate": 0.00019046456728047327, + "loss": 4.421, + "step": 4063 + }, + { + "epoch": 0.4213882534395272, + "grad_norm": 0.5390625, + "learning_rate": 0.0001904599376223966, + "loss": 4.393, + "step": 4064 + }, + { + "epoch": 0.42149194149401525, + "grad_norm": 0.58203125, + "learning_rate": 0.00019045530689698828, + "loss": 4.391, + "step": 4065 + }, + { + "epoch": 0.4215956295485033, + "grad_norm": 0.55859375, + "learning_rate": 0.00019045067510430297, + "loss": 4.3836, + "step": 4066 + }, + { + "epoch": 0.4216993176029914, + "grad_norm": 0.56640625, + "learning_rate": 0.0001904460422443953, + "loss": 4.4405, + "step": 4067 + }, + { + "epoch": 0.42180300565747947, + "grad_norm": 0.546875, + "learning_rate": 0.00019044140831731997, + "loss": 4.418, + "step": 4068 + }, + { + "epoch": 0.42190669371196754, + "grad_norm": 0.5625, + "learning_rate": 0.0001904367733231316, + "loss": 4.4034, + "step": 4069 + }, + { + "epoch": 0.4220103817664556, + "grad_norm": 0.5390625, + "learning_rate": 0.00019043213726188486, + "loss": 4.3926, + "step": 4070 + }, + { + "epoch": 0.4221140698209437, + "grad_norm": 0.55859375, + "learning_rate": 0.00019042750013363452, + "loss": 4.4189, + "step": 4071 + }, + { + "epoch": 0.42221775787543175, + "grad_norm": 0.51171875, + "learning_rate": 0.00019042286193843527, + "loss": 4.4032, + "step": 4072 + }, + { + "epoch": 0.4223214459299198, + "grad_norm": 0.56640625, + "learning_rate": 0.00019041822267634184, + "loss": 4.4009, + "step": 4073 + }, + { + "epoch": 0.4224251339844079, + "grad_norm": 0.55078125, + "learning_rate": 0.00019041358234740896, + "loss": 4.4393, + "step": 4074 + }, + { + "epoch": 0.42252882203889597, + "grad_norm": 0.6015625, + "learning_rate": 0.00019040894095169138, + "loss": 4.3988, + "step": 4075 + }, + { + "epoch": 0.42263251009338404, + "grad_norm": 0.60546875, + "learning_rate": 0.00019040429848924385, + "loss": 4.4107, + "step": 4076 + }, + { + "epoch": 0.4227361981478721, + "grad_norm": 0.60546875, + "learning_rate": 0.00019039965496012116, + "loss": 4.4161, + "step": 4077 + }, + { + "epoch": 0.4228398862023602, + "grad_norm": 0.58203125, + "learning_rate": 0.00019039501036437808, + "loss": 4.3999, + "step": 4078 + }, + { + "epoch": 0.42294357425684825, + "grad_norm": 0.6015625, + "learning_rate": 0.00019039036470206946, + "loss": 4.4145, + "step": 4079 + }, + { + "epoch": 0.4230472623113363, + "grad_norm": 0.58984375, + "learning_rate": 0.00019038571797325005, + "loss": 4.4175, + "step": 4080 + }, + { + "epoch": 0.4231509503658244, + "grad_norm": 0.6328125, + "learning_rate": 0.00019038107017797472, + "loss": 4.3827, + "step": 4081 + }, + { + "epoch": 0.42325463842031247, + "grad_norm": 0.64453125, + "learning_rate": 0.0001903764213162983, + "loss": 4.3996, + "step": 4082 + }, + { + "epoch": 0.42335832647480054, + "grad_norm": 0.640625, + "learning_rate": 0.00019037177138827568, + "loss": 4.4007, + "step": 4083 + }, + { + "epoch": 0.4234620145292886, + "grad_norm": 0.66015625, + "learning_rate": 0.00019036712039396166, + "loss": 4.4087, + "step": 4084 + }, + { + "epoch": 0.4235657025837767, + "grad_norm": 0.63671875, + "learning_rate": 0.00019036246833341113, + "loss": 4.4105, + "step": 4085 + }, + { + "epoch": 0.42366939063826475, + "grad_norm": 0.6328125, + "learning_rate": 0.00019035781520667898, + "loss": 4.4509, + "step": 4086 + }, + { + "epoch": 0.4237730786927529, + "grad_norm": 0.625, + "learning_rate": 0.00019035316101382015, + "loss": 4.3775, + "step": 4087 + }, + { + "epoch": 0.42387676674724095, + "grad_norm": 0.69921875, + "learning_rate": 0.0001903485057548895, + "loss": 4.4293, + "step": 4088 + }, + { + "epoch": 0.423980454801729, + "grad_norm": 0.5625, + "learning_rate": 0.000190343849429942, + "loss": 4.4232, + "step": 4089 + }, + { + "epoch": 0.4240841428562171, + "grad_norm": 0.6328125, + "learning_rate": 0.00019033919203903258, + "loss": 4.3965, + "step": 4090 + }, + { + "epoch": 0.42418783091070517, + "grad_norm": 0.609375, + "learning_rate": 0.0001903345335822162, + "loss": 4.4255, + "step": 4091 + }, + { + "epoch": 0.42429151896519324, + "grad_norm": 0.640625, + "learning_rate": 0.0001903298740595478, + "loss": 4.4579, + "step": 4092 + }, + { + "epoch": 0.4243952070196813, + "grad_norm": 0.58984375, + "learning_rate": 0.00019032521347108237, + "loss": 4.4192, + "step": 4093 + }, + { + "epoch": 0.4244988950741694, + "grad_norm": 0.6875, + "learning_rate": 0.00019032055181687486, + "loss": 4.4378, + "step": 4094 + }, + { + "epoch": 0.42460258312865745, + "grad_norm": 0.58984375, + "learning_rate": 0.00019031588909698037, + "loss": 4.4109, + "step": 4095 + }, + { + "epoch": 0.4247062711831455, + "grad_norm": 0.60546875, + "learning_rate": 0.00019031122531145382, + "loss": 4.3871, + "step": 4096 + }, + { + "epoch": 0.4248099592376336, + "grad_norm": 0.609375, + "learning_rate": 0.00019030656046035028, + "loss": 4.3683, + "step": 4097 + }, + { + "epoch": 0.42491364729212167, + "grad_norm": 0.5625, + "learning_rate": 0.0001903018945437248, + "loss": 4.3856, + "step": 4098 + }, + { + "epoch": 0.42501733534660974, + "grad_norm": 0.5859375, + "learning_rate": 0.00019029722756163243, + "loss": 4.3411, + "step": 4099 + }, + { + "epoch": 0.4251210234010978, + "grad_norm": 0.5703125, + "learning_rate": 0.0001902925595141282, + "loss": 4.3784, + "step": 4100 + }, + { + "epoch": 0.4252247114555859, + "grad_norm": 0.62109375, + "learning_rate": 0.00019028789040126718, + "loss": 4.4135, + "step": 4101 + }, + { + "epoch": 0.42532839951007395, + "grad_norm": 0.57421875, + "learning_rate": 0.00019028322022310453, + "loss": 4.443, + "step": 4102 + }, + { + "epoch": 0.425432087564562, + "grad_norm": 0.6171875, + "learning_rate": 0.0001902785489796953, + "loss": 4.3813, + "step": 4103 + }, + { + "epoch": 0.4255357756190501, + "grad_norm": 0.5859375, + "learning_rate": 0.00019027387667109463, + "loss": 4.4411, + "step": 4104 + }, + { + "epoch": 0.42563946367353817, + "grad_norm": 0.59375, + "learning_rate": 0.00019026920329735764, + "loss": 4.4123, + "step": 4105 + }, + { + "epoch": 0.42574315172802624, + "grad_norm": 0.6015625, + "learning_rate": 0.00019026452885853947, + "loss": 4.4107, + "step": 4106 + }, + { + "epoch": 0.4258468397825143, + "grad_norm": 0.62109375, + "learning_rate": 0.00019025985335469528, + "loss": 4.4329, + "step": 4107 + }, + { + "epoch": 0.4259505278370024, + "grad_norm": 0.55859375, + "learning_rate": 0.00019025517678588022, + "loss": 4.4004, + "step": 4108 + }, + { + "epoch": 0.42605421589149045, + "grad_norm": 0.57421875, + "learning_rate": 0.00019025049915214948, + "loss": 4.356, + "step": 4109 + }, + { + "epoch": 0.4261579039459785, + "grad_norm": 0.53515625, + "learning_rate": 0.00019024582045355825, + "loss": 4.3517, + "step": 4110 + }, + { + "epoch": 0.4262615920004666, + "grad_norm": 0.6171875, + "learning_rate": 0.00019024114069016173, + "loss": 4.4328, + "step": 4111 + }, + { + "epoch": 0.42636528005495467, + "grad_norm": 0.59765625, + "learning_rate": 0.00019023645986201513, + "loss": 4.3555, + "step": 4112 + }, + { + "epoch": 0.42646896810944274, + "grad_norm": 0.62109375, + "learning_rate": 0.0001902317779691737, + "loss": 4.4117, + "step": 4113 + }, + { + "epoch": 0.4265726561639308, + "grad_norm": 0.5390625, + "learning_rate": 0.00019022709501169267, + "loss": 4.4447, + "step": 4114 + }, + { + "epoch": 0.4266763442184189, + "grad_norm": 0.6015625, + "learning_rate": 0.00019022241098962727, + "loss": 4.4215, + "step": 4115 + }, + { + "epoch": 0.42678003227290695, + "grad_norm": 0.53125, + "learning_rate": 0.0001902177259030328, + "loss": 4.4491, + "step": 4116 + }, + { + "epoch": 0.426883720327395, + "grad_norm": 0.66015625, + "learning_rate": 0.00019021303975196453, + "loss": 4.4125, + "step": 4117 + }, + { + "epoch": 0.4269874083818831, + "grad_norm": 0.52734375, + "learning_rate": 0.00019020835253647777, + "loss": 4.433, + "step": 4118 + }, + { + "epoch": 0.42709109643637116, + "grad_norm": 0.67578125, + "learning_rate": 0.0001902036642566278, + "loss": 4.3897, + "step": 4119 + }, + { + "epoch": 0.42719478449085924, + "grad_norm": 0.5703125, + "learning_rate": 0.0001901989749124699, + "loss": 4.4053, + "step": 4120 + }, + { + "epoch": 0.4272984725453473, + "grad_norm": 0.60546875, + "learning_rate": 0.0001901942845040595, + "loss": 4.4107, + "step": 4121 + }, + { + "epoch": 0.4274021605998354, + "grad_norm": 0.5546875, + "learning_rate": 0.00019018959303145186, + "loss": 4.4075, + "step": 4122 + }, + { + "epoch": 0.42750584865432345, + "grad_norm": 0.609375, + "learning_rate": 0.00019018490049470236, + "loss": 4.3824, + "step": 4123 + }, + { + "epoch": 0.4276095367088115, + "grad_norm": 0.62109375, + "learning_rate": 0.00019018020689386632, + "loss": 4.4052, + "step": 4124 + }, + { + "epoch": 0.4277132247632996, + "grad_norm": 0.578125, + "learning_rate": 0.0001901755122289992, + "loss": 4.3915, + "step": 4125 + }, + { + "epoch": 0.42781691281778766, + "grad_norm": 0.6640625, + "learning_rate": 0.0001901708165001564, + "loss": 4.4056, + "step": 4126 + }, + { + "epoch": 0.42792060087227574, + "grad_norm": 0.55859375, + "learning_rate": 0.0001901661197073932, + "loss": 4.424, + "step": 4127 + }, + { + "epoch": 0.4280242889267638, + "grad_norm": 0.625, + "learning_rate": 0.00019016142185076512, + "loss": 4.3931, + "step": 4128 + }, + { + "epoch": 0.4281279769812519, + "grad_norm": 0.53515625, + "learning_rate": 0.0001901567229303276, + "loss": 4.4321, + "step": 4129 + }, + { + "epoch": 0.42823166503573995, + "grad_norm": 0.6328125, + "learning_rate": 0.00019015202294613603, + "loss": 4.4057, + "step": 4130 + }, + { + "epoch": 0.428335353090228, + "grad_norm": 0.56640625, + "learning_rate": 0.00019014732189824587, + "loss": 4.3677, + "step": 4131 + }, + { + "epoch": 0.42843904114471615, + "grad_norm": 0.6171875, + "learning_rate": 0.00019014261978671262, + "loss": 4.4088, + "step": 4132 + }, + { + "epoch": 0.4285427291992042, + "grad_norm": 0.5625, + "learning_rate": 0.00019013791661159172, + "loss": 4.4014, + "step": 4133 + }, + { + "epoch": 0.4286464172536923, + "grad_norm": 0.5859375, + "learning_rate": 0.0001901332123729387, + "loss": 4.4197, + "step": 4134 + }, + { + "epoch": 0.42875010530818036, + "grad_norm": 0.5546875, + "learning_rate": 0.00019012850707080905, + "loss": 4.4187, + "step": 4135 + }, + { + "epoch": 0.42885379336266843, + "grad_norm": 0.54296875, + "learning_rate": 0.00019012380070525828, + "loss": 4.4087, + "step": 4136 + }, + { + "epoch": 0.4289574814171565, + "grad_norm": 0.546875, + "learning_rate": 0.00019011909327634193, + "loss": 4.4242, + "step": 4137 + }, + { + "epoch": 0.4290611694716446, + "grad_norm": 0.55078125, + "learning_rate": 0.00019011438478411553, + "loss": 4.4295, + "step": 4138 + }, + { + "epoch": 0.42916485752613265, + "grad_norm": 0.6171875, + "learning_rate": 0.00019010967522863466, + "loss": 4.3558, + "step": 4139 + }, + { + "epoch": 0.4292685455806207, + "grad_norm": 0.52734375, + "learning_rate": 0.00019010496460995487, + "loss": 4.3369, + "step": 4140 + }, + { + "epoch": 0.4293722336351088, + "grad_norm": 0.5703125, + "learning_rate": 0.00019010025292813174, + "loss": 4.3923, + "step": 4141 + }, + { + "epoch": 0.42947592168959686, + "grad_norm": 0.515625, + "learning_rate": 0.00019009554018322087, + "loss": 4.4044, + "step": 4142 + }, + { + "epoch": 0.42957960974408493, + "grad_norm": 0.58203125, + "learning_rate": 0.00019009082637527785, + "loss": 4.3502, + "step": 4143 + }, + { + "epoch": 0.429683297798573, + "grad_norm": 0.61328125, + "learning_rate": 0.0001900861115043583, + "loss": 4.3683, + "step": 4144 + }, + { + "epoch": 0.4297869858530611, + "grad_norm": 0.55859375, + "learning_rate": 0.0001900813955705179, + "loss": 4.3883, + "step": 4145 + }, + { + "epoch": 0.42989067390754915, + "grad_norm": 0.62890625, + "learning_rate": 0.00019007667857381223, + "loss": 4.4058, + "step": 4146 + }, + { + "epoch": 0.4299943619620372, + "grad_norm": 0.57421875, + "learning_rate": 0.00019007196051429694, + "loss": 4.4143, + "step": 4147 + }, + { + "epoch": 0.4300980500165253, + "grad_norm": 0.6171875, + "learning_rate": 0.00019006724139202774, + "loss": 4.4171, + "step": 4148 + }, + { + "epoch": 0.43020173807101336, + "grad_norm": 0.57421875, + "learning_rate": 0.00019006252120706032, + "loss": 4.4089, + "step": 4149 + }, + { + "epoch": 0.43030542612550143, + "grad_norm": 0.56640625, + "learning_rate": 0.00019005779995945034, + "loss": 4.4398, + "step": 4150 + }, + { + "epoch": 0.4304091141799895, + "grad_norm": 0.5703125, + "learning_rate": 0.00019005307764925351, + "loss": 4.382, + "step": 4151 + }, + { + "epoch": 0.4305128022344776, + "grad_norm": 0.60546875, + "learning_rate": 0.00019004835427652555, + "loss": 4.3662, + "step": 4152 + }, + { + "epoch": 0.43061649028896565, + "grad_norm": 0.5703125, + "learning_rate": 0.0001900436298413222, + "loss": 4.3927, + "step": 4153 + }, + { + "epoch": 0.4307201783434537, + "grad_norm": 0.64453125, + "learning_rate": 0.0001900389043436992, + "loss": 4.4008, + "step": 4154 + }, + { + "epoch": 0.4308238663979418, + "grad_norm": 0.55078125, + "learning_rate": 0.00019003417778371232, + "loss": 4.4127, + "step": 4155 + }, + { + "epoch": 0.43092755445242986, + "grad_norm": 0.60546875, + "learning_rate": 0.00019002945016141726, + "loss": 4.4235, + "step": 4156 + }, + { + "epoch": 0.43103124250691793, + "grad_norm": 0.62109375, + "learning_rate": 0.00019002472147686993, + "loss": 4.3804, + "step": 4157 + }, + { + "epoch": 0.431134930561406, + "grad_norm": 0.66015625, + "learning_rate": 0.00019001999173012596, + "loss": 4.371, + "step": 4158 + }, + { + "epoch": 0.4312386186158941, + "grad_norm": 0.60546875, + "learning_rate": 0.00019001526092124132, + "loss": 4.3823, + "step": 4159 + }, + { + "epoch": 0.43134230667038215, + "grad_norm": 0.66015625, + "learning_rate": 0.0001900105290502717, + "loss": 4.4178, + "step": 4160 + }, + { + "epoch": 0.4314459947248702, + "grad_norm": 0.578125, + "learning_rate": 0.00019000579611727302, + "loss": 4.3976, + "step": 4161 + }, + { + "epoch": 0.4315496827793583, + "grad_norm": 0.59375, + "learning_rate": 0.00019000106212230106, + "loss": 4.3859, + "step": 4162 + }, + { + "epoch": 0.43165337083384636, + "grad_norm": 0.60546875, + "learning_rate": 0.0001899963270654117, + "loss": 4.4141, + "step": 4163 + }, + { + "epoch": 0.43175705888833443, + "grad_norm": 0.6640625, + "learning_rate": 0.00018999159094666083, + "loss": 4.4009, + "step": 4164 + }, + { + "epoch": 0.4318607469428225, + "grad_norm": 0.6015625, + "learning_rate": 0.0001899868537661043, + "loss": 4.3679, + "step": 4165 + }, + { + "epoch": 0.4319644349973106, + "grad_norm": 0.64453125, + "learning_rate": 0.00018998211552379802, + "loss": 4.4297, + "step": 4166 + }, + { + "epoch": 0.43206812305179865, + "grad_norm": 0.65625, + "learning_rate": 0.0001899773762197979, + "loss": 4.4041, + "step": 4167 + }, + { + "epoch": 0.4321718111062867, + "grad_norm": 0.6484375, + "learning_rate": 0.00018997263585415986, + "loss": 4.3974, + "step": 4168 + }, + { + "epoch": 0.4322754991607748, + "grad_norm": 0.6875, + "learning_rate": 0.00018996789442693977, + "loss": 4.4237, + "step": 4169 + }, + { + "epoch": 0.43237918721526286, + "grad_norm": 0.6484375, + "learning_rate": 0.00018996315193819369, + "loss": 4.3515, + "step": 4170 + }, + { + "epoch": 0.43248287526975093, + "grad_norm": 0.64453125, + "learning_rate": 0.00018995840838797746, + "loss": 4.3885, + "step": 4171 + }, + { + "epoch": 0.432586563324239, + "grad_norm": 0.68359375, + "learning_rate": 0.00018995366377634712, + "loss": 4.3844, + "step": 4172 + }, + { + "epoch": 0.4326902513787271, + "grad_norm": 0.6328125, + "learning_rate": 0.00018994891810335864, + "loss": 4.4171, + "step": 4173 + }, + { + "epoch": 0.43279393943321515, + "grad_norm": 0.69140625, + "learning_rate": 0.00018994417136906802, + "loss": 4.4068, + "step": 4174 + }, + { + "epoch": 0.4328976274877032, + "grad_norm": 0.59765625, + "learning_rate": 0.00018993942357353123, + "loss": 4.3834, + "step": 4175 + }, + { + "epoch": 0.43300131554219135, + "grad_norm": 0.6328125, + "learning_rate": 0.0001899346747168043, + "loss": 4.3841, + "step": 4176 + }, + { + "epoch": 0.4331050035966794, + "grad_norm": 0.6640625, + "learning_rate": 0.00018992992479894332, + "loss": 4.3845, + "step": 4177 + }, + { + "epoch": 0.4332086916511675, + "grad_norm": 0.578125, + "learning_rate": 0.00018992517382000426, + "loss": 4.4253, + "step": 4178 + }, + { + "epoch": 0.43331237970565556, + "grad_norm": 0.7421875, + "learning_rate": 0.00018992042178004315, + "loss": 4.3755, + "step": 4179 + }, + { + "epoch": 0.43341606776014363, + "grad_norm": 0.55078125, + "learning_rate": 0.00018991566867911617, + "loss": 4.4059, + "step": 4180 + }, + { + "epoch": 0.4335197558146317, + "grad_norm": 0.66796875, + "learning_rate": 0.00018991091451727937, + "loss": 4.4073, + "step": 4181 + }, + { + "epoch": 0.4336234438691198, + "grad_norm": 0.56640625, + "learning_rate": 0.0001899061592945888, + "loss": 4.3329, + "step": 4182 + }, + { + "epoch": 0.43372713192360784, + "grad_norm": 0.63671875, + "learning_rate": 0.00018990140301110056, + "loss": 4.3915, + "step": 4183 + }, + { + "epoch": 0.4338308199780959, + "grad_norm": 0.5625, + "learning_rate": 0.0001898966456668708, + "loss": 4.4172, + "step": 4184 + }, + { + "epoch": 0.433934508032584, + "grad_norm": 0.62890625, + "learning_rate": 0.00018989188726195567, + "loss": 4.4165, + "step": 4185 + }, + { + "epoch": 0.43403819608707206, + "grad_norm": 0.55078125, + "learning_rate": 0.00018988712779641128, + "loss": 4.402, + "step": 4186 + }, + { + "epoch": 0.43414188414156013, + "grad_norm": 0.63671875, + "learning_rate": 0.0001898823672702938, + "loss": 4.4, + "step": 4187 + }, + { + "epoch": 0.4342455721960482, + "grad_norm": 0.57421875, + "learning_rate": 0.00018987760568365937, + "loss": 4.4032, + "step": 4188 + }, + { + "epoch": 0.4343492602505363, + "grad_norm": 0.625, + "learning_rate": 0.0001898728430365642, + "loss": 4.4163, + "step": 4189 + }, + { + "epoch": 0.43445294830502434, + "grad_norm": 0.6015625, + "learning_rate": 0.0001898680793290645, + "loss": 4.3809, + "step": 4190 + }, + { + "epoch": 0.4345566363595124, + "grad_norm": 0.65625, + "learning_rate": 0.00018986331456121648, + "loss": 4.3944, + "step": 4191 + }, + { + "epoch": 0.4346603244140005, + "grad_norm": 0.609375, + "learning_rate": 0.00018985854873307628, + "loss": 4.3181, + "step": 4192 + }, + { + "epoch": 0.43476401246848856, + "grad_norm": 0.6640625, + "learning_rate": 0.00018985378184470023, + "loss": 4.4519, + "step": 4193 + }, + { + "epoch": 0.43486770052297663, + "grad_norm": 0.5859375, + "learning_rate": 0.00018984901389614453, + "loss": 4.3376, + "step": 4194 + }, + { + "epoch": 0.4349713885774647, + "grad_norm": 0.6484375, + "learning_rate": 0.00018984424488746544, + "loss": 4.3662, + "step": 4195 + }, + { + "epoch": 0.4350750766319528, + "grad_norm": 0.58203125, + "learning_rate": 0.00018983947481871922, + "loss": 4.4071, + "step": 4196 + }, + { + "epoch": 0.43517876468644084, + "grad_norm": 0.64453125, + "learning_rate": 0.0001898347036899622, + "loss": 4.4058, + "step": 4197 + }, + { + "epoch": 0.4352824527409289, + "grad_norm": 0.62109375, + "learning_rate": 0.00018982993150125057, + "loss": 4.4075, + "step": 4198 + }, + { + "epoch": 0.435386140795417, + "grad_norm": 0.6328125, + "learning_rate": 0.00018982515825264072, + "loss": 4.4165, + "step": 4199 + }, + { + "epoch": 0.43548982884990506, + "grad_norm": 0.57421875, + "learning_rate": 0.000189820383944189, + "loss": 4.4017, + "step": 4200 + }, + { + "epoch": 0.43559351690439313, + "grad_norm": 0.6640625, + "learning_rate": 0.00018981560857595167, + "loss": 4.4161, + "step": 4201 + }, + { + "epoch": 0.4356972049588812, + "grad_norm": 0.53125, + "learning_rate": 0.00018981083214798507, + "loss": 4.4522, + "step": 4202 + }, + { + "epoch": 0.4358008930133693, + "grad_norm": 0.73046875, + "learning_rate": 0.0001898060546603456, + "loss": 4.4195, + "step": 4203 + }, + { + "epoch": 0.43590458106785734, + "grad_norm": 0.546875, + "learning_rate": 0.0001898012761130896, + "loss": 4.4096, + "step": 4204 + }, + { + "epoch": 0.4360082691223454, + "grad_norm": 0.69921875, + "learning_rate": 0.0001897964965062735, + "loss": 4.3819, + "step": 4205 + }, + { + "epoch": 0.4361119571768335, + "grad_norm": 0.5546875, + "learning_rate": 0.00018979171583995366, + "loss": 4.3974, + "step": 4206 + }, + { + "epoch": 0.43621564523132156, + "grad_norm": 0.69921875, + "learning_rate": 0.00018978693411418648, + "loss": 4.4042, + "step": 4207 + }, + { + "epoch": 0.43631933328580963, + "grad_norm": 0.515625, + "learning_rate": 0.00018978215132902838, + "loss": 4.3937, + "step": 4208 + }, + { + "epoch": 0.4364230213402977, + "grad_norm": 0.70703125, + "learning_rate": 0.00018977736748453577, + "loss": 4.3937, + "step": 4209 + }, + { + "epoch": 0.4365267093947858, + "grad_norm": 0.53125, + "learning_rate": 0.00018977258258076515, + "loss": 4.4096, + "step": 4210 + }, + { + "epoch": 0.43663039744927384, + "grad_norm": 0.66796875, + "learning_rate": 0.00018976779661777297, + "loss": 4.3821, + "step": 4211 + }, + { + "epoch": 0.4367340855037619, + "grad_norm": 0.60546875, + "learning_rate": 0.00018976300959561564, + "loss": 4.3688, + "step": 4212 + }, + { + "epoch": 0.43683777355825, + "grad_norm": 0.703125, + "learning_rate": 0.00018975822151434972, + "loss": 4.3522, + "step": 4213 + }, + { + "epoch": 0.43694146161273806, + "grad_norm": 0.64453125, + "learning_rate": 0.00018975343237403162, + "loss": 4.3861, + "step": 4214 + }, + { + "epoch": 0.43704514966722613, + "grad_norm": 0.671875, + "learning_rate": 0.00018974864217471792, + "loss": 4.3924, + "step": 4215 + }, + { + "epoch": 0.4371488377217142, + "grad_norm": 0.6015625, + "learning_rate": 0.0001897438509164651, + "loss": 4.3727, + "step": 4216 + }, + { + "epoch": 0.43725252577620227, + "grad_norm": 0.68359375, + "learning_rate": 0.00018973905859932973, + "loss": 4.4095, + "step": 4217 + }, + { + "epoch": 0.43735621383069034, + "grad_norm": 0.6796875, + "learning_rate": 0.00018973426522336827, + "loss": 4.4024, + "step": 4218 + }, + { + "epoch": 0.4374599018851784, + "grad_norm": 0.62890625, + "learning_rate": 0.00018972947078863738, + "loss": 4.399, + "step": 4219 + }, + { + "epoch": 0.4375635899396665, + "grad_norm": 0.63671875, + "learning_rate": 0.00018972467529519357, + "loss": 4.3364, + "step": 4220 + }, + { + "epoch": 0.4376672779941546, + "grad_norm": 0.625, + "learning_rate": 0.00018971987874309343, + "loss": 4.4305, + "step": 4221 + }, + { + "epoch": 0.4377709660486427, + "grad_norm": 0.60546875, + "learning_rate": 0.00018971508113239358, + "loss": 4.3965, + "step": 4222 + }, + { + "epoch": 0.43787465410313076, + "grad_norm": 0.66015625, + "learning_rate": 0.00018971028246315058, + "loss": 4.435, + "step": 4223 + }, + { + "epoch": 0.4379783421576188, + "grad_norm": 0.58203125, + "learning_rate": 0.00018970548273542108, + "loss": 4.3942, + "step": 4224 + }, + { + "epoch": 0.4380820302121069, + "grad_norm": 0.68359375, + "learning_rate": 0.0001897006819492617, + "loss": 4.3815, + "step": 4225 + }, + { + "epoch": 0.43818571826659497, + "grad_norm": 0.6328125, + "learning_rate": 0.00018969588010472915, + "loss": 4.4046, + "step": 4226 + }, + { + "epoch": 0.43828940632108304, + "grad_norm": 0.640625, + "learning_rate": 0.00018969107720187995, + "loss": 4.4266, + "step": 4227 + }, + { + "epoch": 0.4383930943755711, + "grad_norm": 0.671875, + "learning_rate": 0.00018968627324077088, + "loss": 4.3653, + "step": 4228 + }, + { + "epoch": 0.4384967824300592, + "grad_norm": 0.65625, + "learning_rate": 0.00018968146822145857, + "loss": 4.4342, + "step": 4229 + }, + { + "epoch": 0.43860047048454726, + "grad_norm": 0.64453125, + "learning_rate": 0.00018967666214399976, + "loss": 4.4118, + "step": 4230 + }, + { + "epoch": 0.4387041585390353, + "grad_norm": 0.64453125, + "learning_rate": 0.00018967185500845112, + "loss": 4.3927, + "step": 4231 + }, + { + "epoch": 0.4388078465935234, + "grad_norm": 0.65625, + "learning_rate": 0.00018966704681486936, + "loss": 4.3668, + "step": 4232 + }, + { + "epoch": 0.43891153464801147, + "grad_norm": 0.6171875, + "learning_rate": 0.00018966223756331125, + "loss": 4.3601, + "step": 4233 + }, + { + "epoch": 0.43901522270249954, + "grad_norm": 0.63671875, + "learning_rate": 0.0001896574272538335, + "loss": 4.4051, + "step": 4234 + }, + { + "epoch": 0.4391189107569876, + "grad_norm": 0.67578125, + "learning_rate": 0.0001896526158864929, + "loss": 4.3614, + "step": 4235 + }, + { + "epoch": 0.4392225988114757, + "grad_norm": 0.5625, + "learning_rate": 0.00018964780346134618, + "loss": 4.414, + "step": 4236 + }, + { + "epoch": 0.43932628686596376, + "grad_norm": 0.6171875, + "learning_rate": 0.00018964298997845017, + "loss": 4.4161, + "step": 4237 + }, + { + "epoch": 0.4394299749204518, + "grad_norm": 0.6171875, + "learning_rate": 0.0001896381754378616, + "loss": 4.4016, + "step": 4238 + }, + { + "epoch": 0.4395336629749399, + "grad_norm": 0.546875, + "learning_rate": 0.0001896333598396373, + "loss": 4.3883, + "step": 4239 + }, + { + "epoch": 0.43963735102942797, + "grad_norm": 0.640625, + "learning_rate": 0.00018962854318383413, + "loss": 4.4013, + "step": 4240 + }, + { + "epoch": 0.43974103908391604, + "grad_norm": 0.58984375, + "learning_rate": 0.00018962372547050887, + "loss": 4.4009, + "step": 4241 + }, + { + "epoch": 0.4398447271384041, + "grad_norm": 0.60546875, + "learning_rate": 0.0001896189066997184, + "loss": 4.4045, + "step": 4242 + }, + { + "epoch": 0.4399484151928922, + "grad_norm": 0.63671875, + "learning_rate": 0.00018961408687151959, + "loss": 4.3849, + "step": 4243 + }, + { + "epoch": 0.44005210324738026, + "grad_norm": 0.546875, + "learning_rate": 0.00018960926598596926, + "loss": 4.3956, + "step": 4244 + }, + { + "epoch": 0.4401557913018683, + "grad_norm": 0.578125, + "learning_rate": 0.0001896044440431243, + "loss": 4.4034, + "step": 4245 + }, + { + "epoch": 0.4402594793563564, + "grad_norm": 0.5546875, + "learning_rate": 0.00018959962104304163, + "loss": 4.3885, + "step": 4246 + }, + { + "epoch": 0.44036316741084447, + "grad_norm": 0.54296875, + "learning_rate": 0.00018959479698577814, + "loss": 4.3715, + "step": 4247 + }, + { + "epoch": 0.44046685546533254, + "grad_norm": 0.6328125, + "learning_rate": 0.00018958997187139073, + "loss": 4.4171, + "step": 4248 + }, + { + "epoch": 0.4405705435198206, + "grad_norm": 0.6171875, + "learning_rate": 0.00018958514569993636, + "loss": 4.3923, + "step": 4249 + }, + { + "epoch": 0.4406742315743087, + "grad_norm": 0.5859375, + "learning_rate": 0.000189580318471472, + "loss": 4.3919, + "step": 4250 + }, + { + "epoch": 0.44077791962879675, + "grad_norm": 0.58984375, + "learning_rate": 0.00018957549018605455, + "loss": 4.3884, + "step": 4251 + }, + { + "epoch": 0.4408816076832848, + "grad_norm": 0.640625, + "learning_rate": 0.000189570660843741, + "loss": 4.3758, + "step": 4252 + }, + { + "epoch": 0.4409852957377729, + "grad_norm": 0.5703125, + "learning_rate": 0.00018956583044458833, + "loss": 4.3725, + "step": 4253 + }, + { + "epoch": 0.44108898379226097, + "grad_norm": 0.61328125, + "learning_rate": 0.00018956099898865354, + "loss": 4.4041, + "step": 4254 + }, + { + "epoch": 0.44119267184674904, + "grad_norm": 0.63671875, + "learning_rate": 0.00018955616647599365, + "loss": 4.4445, + "step": 4255 + }, + { + "epoch": 0.4412963599012371, + "grad_norm": 0.5859375, + "learning_rate": 0.00018955133290666563, + "loss": 4.376, + "step": 4256 + }, + { + "epoch": 0.4414000479557252, + "grad_norm": 0.640625, + "learning_rate": 0.00018954649828072654, + "loss": 4.4271, + "step": 4257 + }, + { + "epoch": 0.44150373601021325, + "grad_norm": 0.58984375, + "learning_rate": 0.00018954166259823346, + "loss": 4.3593, + "step": 4258 + }, + { + "epoch": 0.4416074240647013, + "grad_norm": 0.75, + "learning_rate": 0.00018953682585924337, + "loss": 4.4018, + "step": 4259 + }, + { + "epoch": 0.4417111121191894, + "grad_norm": 0.6171875, + "learning_rate": 0.0001895319880638134, + "loss": 4.4092, + "step": 4260 + }, + { + "epoch": 0.44181480017367747, + "grad_norm": 0.59375, + "learning_rate": 0.00018952714921200063, + "loss": 4.3735, + "step": 4261 + }, + { + "epoch": 0.44191848822816554, + "grad_norm": 0.640625, + "learning_rate": 0.0001895223093038621, + "loss": 4.4423, + "step": 4262 + }, + { + "epoch": 0.4420221762826536, + "grad_norm": 0.58984375, + "learning_rate": 0.00018951746833945497, + "loss": 4.3934, + "step": 4263 + }, + { + "epoch": 0.4421258643371417, + "grad_norm": 0.64453125, + "learning_rate": 0.0001895126263188363, + "loss": 4.3609, + "step": 4264 + }, + { + "epoch": 0.44222955239162975, + "grad_norm": 0.55078125, + "learning_rate": 0.0001895077832420633, + "loss": 4.3961, + "step": 4265 + }, + { + "epoch": 0.4423332404461179, + "grad_norm": 0.62890625, + "learning_rate": 0.00018950293910919305, + "loss": 4.4182, + "step": 4266 + }, + { + "epoch": 0.44243692850060595, + "grad_norm": 0.53125, + "learning_rate": 0.00018949809392028276, + "loss": 4.3712, + "step": 4267 + }, + { + "epoch": 0.442540616555094, + "grad_norm": 0.56640625, + "learning_rate": 0.00018949324767538955, + "loss": 4.3777, + "step": 4268 + }, + { + "epoch": 0.4426443046095821, + "grad_norm": 0.55859375, + "learning_rate": 0.00018948840037457057, + "loss": 4.3713, + "step": 4269 + }, + { + "epoch": 0.44274799266407017, + "grad_norm": 0.59765625, + "learning_rate": 0.00018948355201788312, + "loss": 4.4353, + "step": 4270 + }, + { + "epoch": 0.44285168071855824, + "grad_norm": 0.609375, + "learning_rate": 0.0001894787026053843, + "loss": 4.4071, + "step": 4271 + }, + { + "epoch": 0.4429553687730463, + "grad_norm": 0.640625, + "learning_rate": 0.0001894738521371314, + "loss": 4.3544, + "step": 4272 + }, + { + "epoch": 0.4430590568275344, + "grad_norm": 0.61328125, + "learning_rate": 0.00018946900061318162, + "loss": 4.3539, + "step": 4273 + }, + { + "epoch": 0.44316274488202245, + "grad_norm": 0.66015625, + "learning_rate": 0.00018946414803359222, + "loss": 4.4077, + "step": 4274 + }, + { + "epoch": 0.4432664329365105, + "grad_norm": 0.6796875, + "learning_rate": 0.0001894592943984204, + "loss": 4.3844, + "step": 4275 + }, + { + "epoch": 0.4433701209909986, + "grad_norm": 0.6484375, + "learning_rate": 0.0001894544397077235, + "loss": 4.3638, + "step": 4276 + }, + { + "epoch": 0.44347380904548667, + "grad_norm": 0.828125, + "learning_rate": 0.00018944958396155878, + "loss": 4.3844, + "step": 4277 + }, + { + "epoch": 0.44357749709997474, + "grad_norm": 0.6484375, + "learning_rate": 0.0001894447271599835, + "loss": 4.4215, + "step": 4278 + }, + { + "epoch": 0.4436811851544628, + "grad_norm": 0.65625, + "learning_rate": 0.00018943986930305498, + "loss": 4.38, + "step": 4279 + }, + { + "epoch": 0.4437848732089509, + "grad_norm": 0.6640625, + "learning_rate": 0.00018943501039083056, + "loss": 4.391, + "step": 4280 + }, + { + "epoch": 0.44388856126343895, + "grad_norm": 0.60546875, + "learning_rate": 0.00018943015042336754, + "loss": 4.4133, + "step": 4281 + }, + { + "epoch": 0.443992249317927, + "grad_norm": 0.63671875, + "learning_rate": 0.00018942528940072329, + "loss": 4.3592, + "step": 4282 + }, + { + "epoch": 0.4440959373724151, + "grad_norm": 0.640625, + "learning_rate": 0.00018942042732295514, + "loss": 4.3841, + "step": 4283 + }, + { + "epoch": 0.44419962542690317, + "grad_norm": 0.70703125, + "learning_rate": 0.00018941556419012047, + "loss": 4.362, + "step": 4284 + }, + { + "epoch": 0.44430331348139124, + "grad_norm": 0.58203125, + "learning_rate": 0.00018941070000227667, + "loss": 4.3791, + "step": 4285 + }, + { + "epoch": 0.4444070015358793, + "grad_norm": 0.71484375, + "learning_rate": 0.0001894058347594811, + "loss": 4.3727, + "step": 4286 + }, + { + "epoch": 0.4445106895903674, + "grad_norm": 0.64453125, + "learning_rate": 0.00018940096846179123, + "loss": 4.3808, + "step": 4287 + }, + { + "epoch": 0.44461437764485545, + "grad_norm": 0.69140625, + "learning_rate": 0.00018939610110926437, + "loss": 4.4176, + "step": 4288 + }, + { + "epoch": 0.4447180656993435, + "grad_norm": 0.640625, + "learning_rate": 0.00018939123270195806, + "loss": 4.3873, + "step": 4289 + }, + { + "epoch": 0.4448217537538316, + "grad_norm": 0.6015625, + "learning_rate": 0.0001893863632399297, + "loss": 4.4036, + "step": 4290 + }, + { + "epoch": 0.44492544180831967, + "grad_norm": 0.69140625, + "learning_rate": 0.00018938149272323672, + "loss": 4.4184, + "step": 4291 + }, + { + "epoch": 0.44502912986280774, + "grad_norm": 0.64453125, + "learning_rate": 0.0001893766211519366, + "loss": 4.3895, + "step": 4292 + }, + { + "epoch": 0.4451328179172958, + "grad_norm": 0.65625, + "learning_rate": 0.00018937174852608682, + "loss": 4.4001, + "step": 4293 + }, + { + "epoch": 0.4452365059717839, + "grad_norm": 0.703125, + "learning_rate": 0.0001893668748457449, + "loss": 4.4087, + "step": 4294 + }, + { + "epoch": 0.44534019402627195, + "grad_norm": 0.6328125, + "learning_rate": 0.0001893620001109683, + "loss": 4.3992, + "step": 4295 + }, + { + "epoch": 0.44544388208076, + "grad_norm": 0.6953125, + "learning_rate": 0.00018935712432181459, + "loss": 4.3913, + "step": 4296 + }, + { + "epoch": 0.4455475701352481, + "grad_norm": 0.671875, + "learning_rate": 0.00018935224747834123, + "loss": 4.4231, + "step": 4297 + }, + { + "epoch": 0.44565125818973617, + "grad_norm": 0.62890625, + "learning_rate": 0.0001893473695806058, + "loss": 4.4124, + "step": 4298 + }, + { + "epoch": 0.44575494624422424, + "grad_norm": 0.67578125, + "learning_rate": 0.0001893424906286659, + "loss": 4.4123, + "step": 4299 + }, + { + "epoch": 0.4458586342987123, + "grad_norm": 0.59765625, + "learning_rate": 0.000189337610622579, + "loss": 4.4489, + "step": 4300 + }, + { + "epoch": 0.4459623223532004, + "grad_norm": 0.73046875, + "learning_rate": 0.00018933272956240277, + "loss": 4.3554, + "step": 4301 + }, + { + "epoch": 0.44606601040768845, + "grad_norm": 0.6796875, + "learning_rate": 0.00018932784744819472, + "loss": 4.4026, + "step": 4302 + }, + { + "epoch": 0.4461696984621765, + "grad_norm": 0.65625, + "learning_rate": 0.00018932296428001252, + "loss": 4.3834, + "step": 4303 + }, + { + "epoch": 0.4462733865166646, + "grad_norm": 0.6640625, + "learning_rate": 0.00018931808005791373, + "loss": 4.4165, + "step": 4304 + }, + { + "epoch": 0.44637707457115267, + "grad_norm": 0.61328125, + "learning_rate": 0.00018931319478195606, + "loss": 4.3488, + "step": 4305 + }, + { + "epoch": 0.44648076262564074, + "grad_norm": 0.63671875, + "learning_rate": 0.00018930830845219706, + "loss": 4.4097, + "step": 4306 + }, + { + "epoch": 0.4465844506801288, + "grad_norm": 0.69140625, + "learning_rate": 0.00018930342106869444, + "loss": 4.3845, + "step": 4307 + }, + { + "epoch": 0.4466881387346169, + "grad_norm": 0.59375, + "learning_rate": 0.00018929853263150584, + "loss": 4.3781, + "step": 4308 + }, + { + "epoch": 0.44679182678910495, + "grad_norm": 0.71875, + "learning_rate": 0.00018929364314068897, + "loss": 4.4109, + "step": 4309 + }, + { + "epoch": 0.4468955148435931, + "grad_norm": 0.640625, + "learning_rate": 0.00018928875259630146, + "loss": 4.3925, + "step": 4310 + }, + { + "epoch": 0.44699920289808115, + "grad_norm": 0.65234375, + "learning_rate": 0.00018928386099840107, + "loss": 4.3674, + "step": 4311 + }, + { + "epoch": 0.4471028909525692, + "grad_norm": 0.6484375, + "learning_rate": 0.00018927896834704548, + "loss": 4.4255, + "step": 4312 + }, + { + "epoch": 0.4472065790070573, + "grad_norm": 0.734375, + "learning_rate": 0.00018927407464229247, + "loss": 4.3883, + "step": 4313 + }, + { + "epoch": 0.44731026706154536, + "grad_norm": 0.72265625, + "learning_rate": 0.00018926917988419973, + "loss": 4.4199, + "step": 4314 + }, + { + "epoch": 0.44741395511603343, + "grad_norm": 0.70703125, + "learning_rate": 0.00018926428407282502, + "loss": 4.4236, + "step": 4315 + }, + { + "epoch": 0.4475176431705215, + "grad_norm": 0.78125, + "learning_rate": 0.00018925938720822612, + "loss": 4.4018, + "step": 4316 + }, + { + "epoch": 0.4476213312250096, + "grad_norm": 0.66796875, + "learning_rate": 0.00018925448929046082, + "loss": 4.3846, + "step": 4317 + }, + { + "epoch": 0.44772501927949765, + "grad_norm": 0.765625, + "learning_rate": 0.00018924959031958686, + "loss": 4.3673, + "step": 4318 + }, + { + "epoch": 0.4478287073339857, + "grad_norm": 0.69140625, + "learning_rate": 0.0001892446902956621, + "loss": 4.4087, + "step": 4319 + }, + { + "epoch": 0.4479323953884738, + "grad_norm": 0.63671875, + "learning_rate": 0.0001892397892187443, + "loss": 4.4178, + "step": 4320 + }, + { + "epoch": 0.44803608344296186, + "grad_norm": 0.6953125, + "learning_rate": 0.00018923488708889137, + "loss": 4.3939, + "step": 4321 + }, + { + "epoch": 0.44813977149744993, + "grad_norm": 0.66796875, + "learning_rate": 0.00018922998390616104, + "loss": 4.4221, + "step": 4322 + }, + { + "epoch": 0.448243459551938, + "grad_norm": 0.63671875, + "learning_rate": 0.00018922507967061126, + "loss": 4.3855, + "step": 4323 + }, + { + "epoch": 0.4483471476064261, + "grad_norm": 0.65625, + "learning_rate": 0.00018922017438229984, + "loss": 4.3899, + "step": 4324 + }, + { + "epoch": 0.44845083566091415, + "grad_norm": 0.74609375, + "learning_rate": 0.00018921526804128468, + "loss": 4.4192, + "step": 4325 + }, + { + "epoch": 0.4485545237154022, + "grad_norm": 0.6328125, + "learning_rate": 0.00018921036064762365, + "loss": 4.4315, + "step": 4326 + }, + { + "epoch": 0.4486582117698903, + "grad_norm": 0.640625, + "learning_rate": 0.00018920545220137467, + "loss": 4.4587, + "step": 4327 + }, + { + "epoch": 0.44876189982437836, + "grad_norm": 0.66796875, + "learning_rate": 0.00018920054270259566, + "loss": 4.3906, + "step": 4328 + }, + { + "epoch": 0.44886558787886643, + "grad_norm": 0.67578125, + "learning_rate": 0.00018919563215134453, + "loss": 4.3824, + "step": 4329 + }, + { + "epoch": 0.4489692759333545, + "grad_norm": 0.625, + "learning_rate": 0.0001891907205476792, + "loss": 4.4012, + "step": 4330 + }, + { + "epoch": 0.4490729639878426, + "grad_norm": 0.76171875, + "learning_rate": 0.00018918580789165765, + "loss": 4.4116, + "step": 4331 + }, + { + "epoch": 0.44917665204233065, + "grad_norm": 0.61328125, + "learning_rate": 0.00018918089418333786, + "loss": 4.4315, + "step": 4332 + }, + { + "epoch": 0.4492803400968187, + "grad_norm": 0.66796875, + "learning_rate": 0.00018917597942277777, + "loss": 4.4087, + "step": 4333 + }, + { + "epoch": 0.4493840281513068, + "grad_norm": 0.6328125, + "learning_rate": 0.0001891710636100354, + "loss": 4.3834, + "step": 4334 + }, + { + "epoch": 0.44948771620579486, + "grad_norm": 0.6796875, + "learning_rate": 0.00018916614674516875, + "loss": 4.4191, + "step": 4335 + }, + { + "epoch": 0.44959140426028293, + "grad_norm": 0.73046875, + "learning_rate": 0.00018916122882823582, + "loss": 4.4078, + "step": 4336 + }, + { + "epoch": 0.449695092314771, + "grad_norm": 0.70703125, + "learning_rate": 0.0001891563098592946, + "loss": 4.3853, + "step": 4337 + }, + { + "epoch": 0.4497987803692591, + "grad_norm": 0.66796875, + "learning_rate": 0.00018915138983840318, + "loss": 4.366, + "step": 4338 + }, + { + "epoch": 0.44990246842374715, + "grad_norm": 0.67578125, + "learning_rate": 0.00018914646876561962, + "loss": 4.3723, + "step": 4339 + }, + { + "epoch": 0.4500061564782352, + "grad_norm": 0.67578125, + "learning_rate": 0.00018914154664100195, + "loss": 4.4083, + "step": 4340 + }, + { + "epoch": 0.4501098445327233, + "grad_norm": 0.69140625, + "learning_rate": 0.00018913662346460824, + "loss": 4.3813, + "step": 4341 + }, + { + "epoch": 0.45021353258721136, + "grad_norm": 0.6484375, + "learning_rate": 0.00018913169923649663, + "loss": 4.3809, + "step": 4342 + }, + { + "epoch": 0.45031722064169943, + "grad_norm": 0.72265625, + "learning_rate": 0.00018912677395672514, + "loss": 4.3945, + "step": 4343 + }, + { + "epoch": 0.4504209086961875, + "grad_norm": 0.6171875, + "learning_rate": 0.00018912184762535195, + "loss": 4.3716, + "step": 4344 + }, + { + "epoch": 0.4505245967506756, + "grad_norm": 0.69140625, + "learning_rate": 0.00018911692024243517, + "loss": 4.3868, + "step": 4345 + }, + { + "epoch": 0.45062828480516365, + "grad_norm": 0.6328125, + "learning_rate": 0.00018911199180803294, + "loss": 4.3969, + "step": 4346 + }, + { + "epoch": 0.4507319728596517, + "grad_norm": 0.70703125, + "learning_rate": 0.00018910706232220338, + "loss": 4.4198, + "step": 4347 + }, + { + "epoch": 0.4508356609141398, + "grad_norm": 0.61328125, + "learning_rate": 0.0001891021317850047, + "loss": 4.4407, + "step": 4348 + }, + { + "epoch": 0.45093934896862786, + "grad_norm": 0.6875, + "learning_rate": 0.000189097200196495, + "loss": 4.3974, + "step": 4349 + }, + { + "epoch": 0.45104303702311593, + "grad_norm": 0.6484375, + "learning_rate": 0.00018909226755673257, + "loss": 4.3658, + "step": 4350 + }, + { + "epoch": 0.451146725077604, + "grad_norm": 0.65234375, + "learning_rate": 0.00018908733386577552, + "loss": 4.3892, + "step": 4351 + }, + { + "epoch": 0.4512504131320921, + "grad_norm": 0.72265625, + "learning_rate": 0.00018908239912368213, + "loss": 4.4091, + "step": 4352 + }, + { + "epoch": 0.45135410118658015, + "grad_norm": 0.546875, + "learning_rate": 0.0001890774633305106, + "loss": 4.3467, + "step": 4353 + }, + { + "epoch": 0.4514577892410682, + "grad_norm": 0.703125, + "learning_rate": 0.00018907252648631912, + "loss": 4.3949, + "step": 4354 + }, + { + "epoch": 0.45156147729555635, + "grad_norm": 0.58203125, + "learning_rate": 0.00018906758859116602, + "loss": 4.371, + "step": 4355 + }, + { + "epoch": 0.4516651653500444, + "grad_norm": 0.74609375, + "learning_rate": 0.0001890626496451095, + "loss": 4.4002, + "step": 4356 + }, + { + "epoch": 0.4517688534045325, + "grad_norm": 0.640625, + "learning_rate": 0.0001890577096482079, + "loss": 4.3438, + "step": 4357 + }, + { + "epoch": 0.45187254145902056, + "grad_norm": 0.7578125, + "learning_rate": 0.0001890527686005194, + "loss": 4.4444, + "step": 4358 + }, + { + "epoch": 0.45197622951350863, + "grad_norm": 0.625, + "learning_rate": 0.00018904782650210243, + "loss": 4.4304, + "step": 4359 + }, + { + "epoch": 0.4520799175679967, + "grad_norm": 0.703125, + "learning_rate": 0.00018904288335301516, + "loss": 4.3794, + "step": 4360 + }, + { + "epoch": 0.4521836056224848, + "grad_norm": 0.765625, + "learning_rate": 0.00018903793915331604, + "loss": 4.4026, + "step": 4361 + }, + { + "epoch": 0.45228729367697285, + "grad_norm": 0.65625, + "learning_rate": 0.00018903299390306334, + "loss": 4.4174, + "step": 4362 + }, + { + "epoch": 0.4523909817314609, + "grad_norm": 0.703125, + "learning_rate": 0.00018902804760231545, + "loss": 4.4262, + "step": 4363 + }, + { + "epoch": 0.452494669785949, + "grad_norm": 0.640625, + "learning_rate": 0.00018902310025113066, + "loss": 4.3765, + "step": 4364 + }, + { + "epoch": 0.45259835784043706, + "grad_norm": 0.64453125, + "learning_rate": 0.00018901815184956742, + "loss": 4.3958, + "step": 4365 + }, + { + "epoch": 0.45270204589492513, + "grad_norm": 0.64453125, + "learning_rate": 0.00018901320239768406, + "loss": 4.3625, + "step": 4366 + }, + { + "epoch": 0.4528057339494132, + "grad_norm": 0.6484375, + "learning_rate": 0.00018900825189553904, + "loss": 4.3915, + "step": 4367 + }, + { + "epoch": 0.4529094220039013, + "grad_norm": 0.62890625, + "learning_rate": 0.00018900330034319073, + "loss": 4.3817, + "step": 4368 + }, + { + "epoch": 0.45301311005838935, + "grad_norm": 0.6171875, + "learning_rate": 0.00018899834774069752, + "loss": 4.4392, + "step": 4369 + }, + { + "epoch": 0.4531167981128774, + "grad_norm": 0.62890625, + "learning_rate": 0.0001889933940881179, + "loss": 4.3706, + "step": 4370 + }, + { + "epoch": 0.4532204861673655, + "grad_norm": 0.5859375, + "learning_rate": 0.00018898843938551028, + "loss": 4.4122, + "step": 4371 + }, + { + "epoch": 0.45332417422185356, + "grad_norm": 0.55078125, + "learning_rate": 0.00018898348363293317, + "loss": 4.3939, + "step": 4372 + }, + { + "epoch": 0.45342786227634163, + "grad_norm": 0.5703125, + "learning_rate": 0.00018897852683044498, + "loss": 4.3972, + "step": 4373 + }, + { + "epoch": 0.4535315503308297, + "grad_norm": 0.5625, + "learning_rate": 0.00018897356897810426, + "loss": 4.4002, + "step": 4374 + }, + { + "epoch": 0.4536352383853178, + "grad_norm": 0.5390625, + "learning_rate": 0.00018896861007596948, + "loss": 4.3926, + "step": 4375 + }, + { + "epoch": 0.45373892643980585, + "grad_norm": 0.55078125, + "learning_rate": 0.00018896365012409914, + "loss": 4.4, + "step": 4376 + }, + { + "epoch": 0.4538426144942939, + "grad_norm": 0.61328125, + "learning_rate": 0.00018895868912255175, + "loss": 4.3701, + "step": 4377 + }, + { + "epoch": 0.453946302548782, + "grad_norm": 0.5859375, + "learning_rate": 0.00018895372707138585, + "loss": 4.3929, + "step": 4378 + }, + { + "epoch": 0.45404999060327006, + "grad_norm": 0.59765625, + "learning_rate": 0.00018894876397066002, + "loss": 4.4144, + "step": 4379 + }, + { + "epoch": 0.45415367865775813, + "grad_norm": 0.578125, + "learning_rate": 0.0001889437998204328, + "loss": 4.4537, + "step": 4380 + }, + { + "epoch": 0.4542573667122462, + "grad_norm": 0.69921875, + "learning_rate": 0.00018893883462076273, + "loss": 4.3803, + "step": 4381 + }, + { + "epoch": 0.4543610547667343, + "grad_norm": 0.66796875, + "learning_rate": 0.00018893386837170849, + "loss": 4.3838, + "step": 4382 + }, + { + "epoch": 0.45446474282122235, + "grad_norm": 0.62890625, + "learning_rate": 0.00018892890107332857, + "loss": 4.382, + "step": 4383 + }, + { + "epoch": 0.4545684308757104, + "grad_norm": 0.6796875, + "learning_rate": 0.0001889239327256816, + "loss": 4.3865, + "step": 4384 + }, + { + "epoch": 0.4546721189301985, + "grad_norm": 0.59375, + "learning_rate": 0.00018891896332882624, + "loss": 4.3959, + "step": 4385 + }, + { + "epoch": 0.45477580698468656, + "grad_norm": 0.66796875, + "learning_rate": 0.0001889139928828211, + "loss": 4.3629, + "step": 4386 + }, + { + "epoch": 0.45487949503917463, + "grad_norm": 0.66015625, + "learning_rate": 0.00018890902138772483, + "loss": 4.3834, + "step": 4387 + }, + { + "epoch": 0.4549831830936627, + "grad_norm": 0.64453125, + "learning_rate": 0.00018890404884359613, + "loss": 4.3842, + "step": 4388 + }, + { + "epoch": 0.4550868711481508, + "grad_norm": 0.77734375, + "learning_rate": 0.00018889907525049355, + "loss": 4.4223, + "step": 4389 + }, + { + "epoch": 0.45519055920263884, + "grad_norm": 0.68359375, + "learning_rate": 0.0001888941006084759, + "loss": 4.3973, + "step": 4390 + }, + { + "epoch": 0.4552942472571269, + "grad_norm": 0.734375, + "learning_rate": 0.00018888912491760182, + "loss": 4.3702, + "step": 4391 + }, + { + "epoch": 0.455397935311615, + "grad_norm": 0.62890625, + "learning_rate": 0.00018888414817793003, + "loss": 4.413, + "step": 4392 + }, + { + "epoch": 0.45550162336610306, + "grad_norm": 0.734375, + "learning_rate": 0.00018887917038951926, + "loss": 4.3807, + "step": 4393 + }, + { + "epoch": 0.45560531142059113, + "grad_norm": 0.62109375, + "learning_rate": 0.0001888741915524282, + "loss": 4.4024, + "step": 4394 + }, + { + "epoch": 0.4557089994750792, + "grad_norm": 0.6875, + "learning_rate": 0.00018886921166671563, + "loss": 4.3569, + "step": 4395 + }, + { + "epoch": 0.4558126875295673, + "grad_norm": 0.65234375, + "learning_rate": 0.00018886423073244033, + "loss": 4.3702, + "step": 4396 + }, + { + "epoch": 0.45591637558405534, + "grad_norm": 0.6796875, + "learning_rate": 0.00018885924874966102, + "loss": 4.3712, + "step": 4397 + }, + { + "epoch": 0.4560200636385434, + "grad_norm": 0.71484375, + "learning_rate": 0.0001888542657184365, + "loss": 4.4314, + "step": 4398 + }, + { + "epoch": 0.4561237516930315, + "grad_norm": 0.63671875, + "learning_rate": 0.00018884928163882556, + "loss": 4.3574, + "step": 4399 + }, + { + "epoch": 0.4562274397475196, + "grad_norm": 0.73046875, + "learning_rate": 0.00018884429651088702, + "loss": 4.3811, + "step": 4400 + }, + { + "epoch": 0.4563311278020077, + "grad_norm": 0.66015625, + "learning_rate": 0.0001888393103346797, + "loss": 4.3547, + "step": 4401 + }, + { + "epoch": 0.45643481585649576, + "grad_norm": 0.82421875, + "learning_rate": 0.00018883432311026242, + "loss": 4.4052, + "step": 4402 + }, + { + "epoch": 0.45653850391098383, + "grad_norm": 0.62890625, + "learning_rate": 0.00018882933483769403, + "loss": 4.3587, + "step": 4403 + }, + { + "epoch": 0.4566421919654719, + "grad_norm": 0.79296875, + "learning_rate": 0.0001888243455170334, + "loss": 4.3803, + "step": 4404 + }, + { + "epoch": 0.45674588001995997, + "grad_norm": 0.7421875, + "learning_rate": 0.00018881935514833935, + "loss": 4.3837, + "step": 4405 + }, + { + "epoch": 0.45684956807444804, + "grad_norm": 0.75, + "learning_rate": 0.00018881436373167083, + "loss": 4.3864, + "step": 4406 + }, + { + "epoch": 0.4569532561289361, + "grad_norm": 0.7109375, + "learning_rate": 0.0001888093712670867, + "loss": 4.3537, + "step": 4407 + }, + { + "epoch": 0.4570569441834242, + "grad_norm": 0.734375, + "learning_rate": 0.00018880437775464583, + "loss": 4.3655, + "step": 4408 + }, + { + "epoch": 0.45716063223791226, + "grad_norm": 0.82421875, + "learning_rate": 0.0001887993831944072, + "loss": 4.4028, + "step": 4409 + }, + { + "epoch": 0.45726432029240033, + "grad_norm": 0.609375, + "learning_rate": 0.00018879438758642972, + "loss": 4.3589, + "step": 4410 + }, + { + "epoch": 0.4573680083468884, + "grad_norm": 0.69140625, + "learning_rate": 0.00018878939093077232, + "loss": 4.3918, + "step": 4411 + }, + { + "epoch": 0.45747169640137647, + "grad_norm": 0.671875, + "learning_rate": 0.00018878439322749392, + "loss": 4.3798, + "step": 4412 + }, + { + "epoch": 0.45757538445586454, + "grad_norm": 0.69140625, + "learning_rate": 0.00018877939447665358, + "loss": 4.3469, + "step": 4413 + }, + { + "epoch": 0.4576790725103526, + "grad_norm": 0.63671875, + "learning_rate": 0.00018877439467831022, + "loss": 4.3823, + "step": 4414 + }, + { + "epoch": 0.4577827605648407, + "grad_norm": 0.7421875, + "learning_rate": 0.00018876939383252284, + "loss": 4.3694, + "step": 4415 + }, + { + "epoch": 0.45788644861932876, + "grad_norm": 0.6484375, + "learning_rate": 0.00018876439193935042, + "loss": 4.3747, + "step": 4416 + }, + { + "epoch": 0.45799013667381683, + "grad_norm": 0.703125, + "learning_rate": 0.00018875938899885202, + "loss": 4.4072, + "step": 4417 + }, + { + "epoch": 0.4580938247283049, + "grad_norm": 0.66796875, + "learning_rate": 0.00018875438501108667, + "loss": 4.395, + "step": 4418 + }, + { + "epoch": 0.45819751278279297, + "grad_norm": 0.64453125, + "learning_rate": 0.00018874937997611336, + "loss": 4.3889, + "step": 4419 + }, + { + "epoch": 0.45830120083728104, + "grad_norm": 0.7265625, + "learning_rate": 0.00018874437389399123, + "loss": 4.372, + "step": 4420 + }, + { + "epoch": 0.4584048888917691, + "grad_norm": 0.671875, + "learning_rate": 0.00018873936676477927, + "loss": 4.3597, + "step": 4421 + }, + { + "epoch": 0.4585085769462572, + "grad_norm": 0.75390625, + "learning_rate": 0.00018873435858853655, + "loss": 4.3904, + "step": 4422 + }, + { + "epoch": 0.45861226500074526, + "grad_norm": 0.671875, + "learning_rate": 0.0001887293493653222, + "loss": 4.3842, + "step": 4423 + }, + { + "epoch": 0.4587159530552333, + "grad_norm": 0.625, + "learning_rate": 0.00018872433909519537, + "loss": 4.3949, + "step": 4424 + }, + { + "epoch": 0.4588196411097214, + "grad_norm": 0.62109375, + "learning_rate": 0.00018871932777821509, + "loss": 4.4121, + "step": 4425 + }, + { + "epoch": 0.45892332916420947, + "grad_norm": 0.64453125, + "learning_rate": 0.00018871431541444053, + "loss": 4.3992, + "step": 4426 + }, + { + "epoch": 0.45902701721869754, + "grad_norm": 0.6484375, + "learning_rate": 0.00018870930200393083, + "loss": 4.3586, + "step": 4427 + }, + { + "epoch": 0.4591307052731856, + "grad_norm": 0.66796875, + "learning_rate": 0.0001887042875467451, + "loss": 4.3472, + "step": 4428 + }, + { + "epoch": 0.4592343933276737, + "grad_norm": 0.6015625, + "learning_rate": 0.00018869927204294258, + "loss": 4.3902, + "step": 4429 + }, + { + "epoch": 0.45933808138216176, + "grad_norm": 0.5546875, + "learning_rate": 0.0001886942554925824, + "loss": 4.4539, + "step": 4430 + }, + { + "epoch": 0.4594417694366498, + "grad_norm": 0.625, + "learning_rate": 0.00018868923789572376, + "loss": 4.3847, + "step": 4431 + }, + { + "epoch": 0.4595454574911379, + "grad_norm": 0.57421875, + "learning_rate": 0.00018868421925242586, + "loss": 4.3835, + "step": 4432 + }, + { + "epoch": 0.45964914554562597, + "grad_norm": 0.640625, + "learning_rate": 0.0001886791995627479, + "loss": 4.3715, + "step": 4433 + }, + { + "epoch": 0.45975283360011404, + "grad_norm": 0.58203125, + "learning_rate": 0.00018867417882674915, + "loss": 4.3309, + "step": 4434 + }, + { + "epoch": 0.4598565216546021, + "grad_norm": 0.62109375, + "learning_rate": 0.0001886691570444888, + "loss": 4.3963, + "step": 4435 + }, + { + "epoch": 0.4599602097090902, + "grad_norm": 0.59375, + "learning_rate": 0.00018866413421602613, + "loss": 4.4098, + "step": 4436 + }, + { + "epoch": 0.46006389776357826, + "grad_norm": 0.6015625, + "learning_rate": 0.00018865911034142042, + "loss": 4.3952, + "step": 4437 + }, + { + "epoch": 0.4601675858180663, + "grad_norm": 0.58203125, + "learning_rate": 0.00018865408542073089, + "loss": 4.4235, + "step": 4438 + }, + { + "epoch": 0.4602712738725544, + "grad_norm": 0.640625, + "learning_rate": 0.00018864905945401687, + "loss": 4.3633, + "step": 4439 + }, + { + "epoch": 0.46037496192704247, + "grad_norm": 0.58984375, + "learning_rate": 0.00018864403244133767, + "loss": 4.395, + "step": 4440 + }, + { + "epoch": 0.46047864998153054, + "grad_norm": 0.65625, + "learning_rate": 0.00018863900438275256, + "loss": 4.3403, + "step": 4441 + }, + { + "epoch": 0.4605823380360186, + "grad_norm": 0.6328125, + "learning_rate": 0.00018863397527832095, + "loss": 4.3693, + "step": 4442 + }, + { + "epoch": 0.4606860260905067, + "grad_norm": 0.6328125, + "learning_rate": 0.00018862894512810207, + "loss": 4.4441, + "step": 4443 + }, + { + "epoch": 0.4607897141449948, + "grad_norm": 0.68359375, + "learning_rate": 0.00018862391393215534, + "loss": 4.3215, + "step": 4444 + }, + { + "epoch": 0.4608934021994829, + "grad_norm": 0.5859375, + "learning_rate": 0.00018861888169054012, + "loss": 4.3838, + "step": 4445 + }, + { + "epoch": 0.46099709025397095, + "grad_norm": 0.68359375, + "learning_rate": 0.00018861384840331575, + "loss": 4.3675, + "step": 4446 + }, + { + "epoch": 0.461100778308459, + "grad_norm": 0.60546875, + "learning_rate": 0.00018860881407054163, + "loss": 4.3918, + "step": 4447 + }, + { + "epoch": 0.4612044663629471, + "grad_norm": 0.6328125, + "learning_rate": 0.0001886037786922772, + "loss": 4.3753, + "step": 4448 + }, + { + "epoch": 0.46130815441743517, + "grad_norm": 0.63671875, + "learning_rate": 0.0001885987422685818, + "loss": 4.4139, + "step": 4449 + }, + { + "epoch": 0.46141184247192324, + "grad_norm": 0.65234375, + "learning_rate": 0.00018859370479951492, + "loss": 4.4179, + "step": 4450 + }, + { + "epoch": 0.4615155305264113, + "grad_norm": 0.625, + "learning_rate": 0.00018858866628513598, + "loss": 4.32, + "step": 4451 + }, + { + "epoch": 0.4616192185808994, + "grad_norm": 0.546875, + "learning_rate": 0.0001885836267255044, + "loss": 4.4127, + "step": 4452 + }, + { + "epoch": 0.46172290663538745, + "grad_norm": 0.67578125, + "learning_rate": 0.00018857858612067967, + "loss": 4.3886, + "step": 4453 + }, + { + "epoch": 0.4618265946898755, + "grad_norm": 0.57421875, + "learning_rate": 0.00018857354447072123, + "loss": 4.3949, + "step": 4454 + }, + { + "epoch": 0.4619302827443636, + "grad_norm": 0.6171875, + "learning_rate": 0.00018856850177568864, + "loss": 4.3957, + "step": 4455 + }, + { + "epoch": 0.46203397079885167, + "grad_norm": 0.66015625, + "learning_rate": 0.00018856345803564133, + "loss": 4.3744, + "step": 4456 + }, + { + "epoch": 0.46213765885333974, + "grad_norm": 0.62109375, + "learning_rate": 0.00018855841325063883, + "loss": 4.3692, + "step": 4457 + }, + { + "epoch": 0.4622413469078278, + "grad_norm": 0.59375, + "learning_rate": 0.00018855336742074066, + "loss": 4.3896, + "step": 4458 + }, + { + "epoch": 0.4623450349623159, + "grad_norm": 0.61328125, + "learning_rate": 0.00018854832054600635, + "loss": 4.3986, + "step": 4459 + }, + { + "epoch": 0.46244872301680395, + "grad_norm": 0.6015625, + "learning_rate": 0.00018854327262649546, + "loss": 4.377, + "step": 4460 + }, + { + "epoch": 0.462552411071292, + "grad_norm": 0.6484375, + "learning_rate": 0.00018853822366226756, + "loss": 4.3758, + "step": 4461 + }, + { + "epoch": 0.4626560991257801, + "grad_norm": 0.56640625, + "learning_rate": 0.00018853317365338218, + "loss": 4.4131, + "step": 4462 + }, + { + "epoch": 0.46275978718026817, + "grad_norm": 0.625, + "learning_rate": 0.000188528122599899, + "loss": 4.3954, + "step": 4463 + }, + { + "epoch": 0.46286347523475624, + "grad_norm": 0.60546875, + "learning_rate": 0.00018852307050187749, + "loss": 4.3832, + "step": 4464 + }, + { + "epoch": 0.4629671632892443, + "grad_norm": 0.6640625, + "learning_rate": 0.00018851801735937732, + "loss": 4.3479, + "step": 4465 + }, + { + "epoch": 0.4630708513437324, + "grad_norm": 0.66796875, + "learning_rate": 0.00018851296317245816, + "loss": 4.3886, + "step": 4466 + }, + { + "epoch": 0.46317453939822045, + "grad_norm": 0.58203125, + "learning_rate": 0.00018850790794117957, + "loss": 4.3542, + "step": 4467 + }, + { + "epoch": 0.4632782274527085, + "grad_norm": 0.6875, + "learning_rate": 0.0001885028516656012, + "loss": 4.3936, + "step": 4468 + }, + { + "epoch": 0.4633819155071966, + "grad_norm": 0.66015625, + "learning_rate": 0.00018849779434578276, + "loss": 4.4084, + "step": 4469 + }, + { + "epoch": 0.46348560356168467, + "grad_norm": 0.65234375, + "learning_rate": 0.0001884927359817839, + "loss": 4.4057, + "step": 4470 + }, + { + "epoch": 0.46358929161617274, + "grad_norm": 0.6484375, + "learning_rate": 0.0001884876765736643, + "loss": 4.3766, + "step": 4471 + }, + { + "epoch": 0.4636929796706608, + "grad_norm": 0.69140625, + "learning_rate": 0.0001884826161214836, + "loss": 4.3469, + "step": 4472 + }, + { + "epoch": 0.4637966677251489, + "grad_norm": 0.69140625, + "learning_rate": 0.00018847755462530162, + "loss": 4.4156, + "step": 4473 + }, + { + "epoch": 0.46390035577963695, + "grad_norm": 0.765625, + "learning_rate": 0.000188472492085178, + "loss": 4.4327, + "step": 4474 + }, + { + "epoch": 0.464004043834125, + "grad_norm": 0.6640625, + "learning_rate": 0.00018846742850117248, + "loss": 4.4199, + "step": 4475 + }, + { + "epoch": 0.4641077318886131, + "grad_norm": 0.63671875, + "learning_rate": 0.0001884623638733448, + "loss": 4.3494, + "step": 4476 + }, + { + "epoch": 0.46421141994310117, + "grad_norm": 0.68359375, + "learning_rate": 0.00018845729820175477, + "loss": 4.3995, + "step": 4477 + }, + { + "epoch": 0.46431510799758924, + "grad_norm": 0.67578125, + "learning_rate": 0.0001884522314864621, + "loss": 4.329, + "step": 4478 + }, + { + "epoch": 0.4644187960520773, + "grad_norm": 0.74609375, + "learning_rate": 0.00018844716372752663, + "loss": 4.4106, + "step": 4479 + }, + { + "epoch": 0.4645224841065654, + "grad_norm": 0.70703125, + "learning_rate": 0.00018844209492500813, + "loss": 4.4032, + "step": 4480 + }, + { + "epoch": 0.46462617216105345, + "grad_norm": 0.69921875, + "learning_rate": 0.00018843702507896634, + "loss": 4.4141, + "step": 4481 + }, + { + "epoch": 0.4647298602155415, + "grad_norm": 0.6484375, + "learning_rate": 0.00018843195418946117, + "loss": 4.4069, + "step": 4482 + }, + { + "epoch": 0.4648335482700296, + "grad_norm": 0.69140625, + "learning_rate": 0.00018842688225655243, + "loss": 4.3916, + "step": 4483 + }, + { + "epoch": 0.46493723632451767, + "grad_norm": 0.70703125, + "learning_rate": 0.00018842180928029992, + "loss": 4.3536, + "step": 4484 + }, + { + "epoch": 0.46504092437900574, + "grad_norm": 0.71875, + "learning_rate": 0.00018841673526076355, + "loss": 4.3959, + "step": 4485 + }, + { + "epoch": 0.4651446124334938, + "grad_norm": 0.6484375, + "learning_rate": 0.00018841166019800315, + "loss": 4.4139, + "step": 4486 + }, + { + "epoch": 0.4652483004879819, + "grad_norm": 0.70703125, + "learning_rate": 0.00018840658409207862, + "loss": 4.31, + "step": 4487 + }, + { + "epoch": 0.46535198854246995, + "grad_norm": 0.640625, + "learning_rate": 0.00018840150694304986, + "loss": 4.3708, + "step": 4488 + }, + { + "epoch": 0.4654556765969581, + "grad_norm": 0.671875, + "learning_rate": 0.00018839642875097674, + "loss": 4.3896, + "step": 4489 + }, + { + "epoch": 0.46555936465144615, + "grad_norm": 0.6875, + "learning_rate": 0.0001883913495159192, + "loss": 4.374, + "step": 4490 + }, + { + "epoch": 0.4656630527059342, + "grad_norm": 0.5859375, + "learning_rate": 0.0001883862692379372, + "loss": 4.3423, + "step": 4491 + }, + { + "epoch": 0.4657667407604223, + "grad_norm": 0.63671875, + "learning_rate": 0.00018838118791709063, + "loss": 4.4045, + "step": 4492 + }, + { + "epoch": 0.46587042881491036, + "grad_norm": 0.65625, + "learning_rate": 0.00018837610555343947, + "loss": 4.3357, + "step": 4493 + }, + { + "epoch": 0.46597411686939844, + "grad_norm": 0.6875, + "learning_rate": 0.00018837102214704367, + "loss": 4.3608, + "step": 4494 + }, + { + "epoch": 0.4660778049238865, + "grad_norm": 0.62890625, + "learning_rate": 0.0001883659376979632, + "loss": 4.3956, + "step": 4495 + }, + { + "epoch": 0.4661814929783746, + "grad_norm": 0.6875, + "learning_rate": 0.00018836085220625814, + "loss": 4.4046, + "step": 4496 + }, + { + "epoch": 0.46628518103286265, + "grad_norm": 0.58984375, + "learning_rate": 0.00018835576567198837, + "loss": 4.3886, + "step": 4497 + }, + { + "epoch": 0.4663888690873507, + "grad_norm": 0.7421875, + "learning_rate": 0.000188350678095214, + "loss": 4.363, + "step": 4498 + }, + { + "epoch": 0.4664925571418388, + "grad_norm": 0.59765625, + "learning_rate": 0.00018834558947599498, + "loss": 4.321, + "step": 4499 + }, + { + "epoch": 0.46659624519632686, + "grad_norm": 0.68359375, + "learning_rate": 0.0001883404998143914, + "loss": 4.3768, + "step": 4500 + }, + { + "epoch": 0.46669993325081494, + "grad_norm": 0.73046875, + "learning_rate": 0.0001883354091104633, + "loss": 4.369, + "step": 4501 + }, + { + "epoch": 0.466803621305303, + "grad_norm": 0.6171875, + "learning_rate": 0.00018833031736427075, + "loss": 4.3661, + "step": 4502 + }, + { + "epoch": 0.4669073093597911, + "grad_norm": 0.73046875, + "learning_rate": 0.00018832522457587386, + "loss": 4.3625, + "step": 4503 + }, + { + "epoch": 0.46701099741427915, + "grad_norm": 0.6640625, + "learning_rate": 0.00018832013074533265, + "loss": 4.3675, + "step": 4504 + }, + { + "epoch": 0.4671146854687672, + "grad_norm": 0.6796875, + "learning_rate": 0.00018831503587270727, + "loss": 4.3922, + "step": 4505 + }, + { + "epoch": 0.4672183735232553, + "grad_norm": 0.67578125, + "learning_rate": 0.00018830993995805782, + "loss": 4.385, + "step": 4506 + }, + { + "epoch": 0.46732206157774336, + "grad_norm": 0.66796875, + "learning_rate": 0.0001883048430014444, + "loss": 4.3942, + "step": 4507 + }, + { + "epoch": 0.46742574963223144, + "grad_norm": 0.6875, + "learning_rate": 0.00018829974500292717, + "loss": 4.3906, + "step": 4508 + }, + { + "epoch": 0.4675294376867195, + "grad_norm": 0.76171875, + "learning_rate": 0.00018829464596256632, + "loss": 4.3429, + "step": 4509 + }, + { + "epoch": 0.4676331257412076, + "grad_norm": 0.66015625, + "learning_rate": 0.00018828954588042196, + "loss": 4.4135, + "step": 4510 + }, + { + "epoch": 0.46773681379569565, + "grad_norm": 0.82421875, + "learning_rate": 0.0001882844447565543, + "loss": 4.3824, + "step": 4511 + }, + { + "epoch": 0.4678405018501837, + "grad_norm": 0.61328125, + "learning_rate": 0.00018827934259102352, + "loss": 4.3978, + "step": 4512 + }, + { + "epoch": 0.4679441899046718, + "grad_norm": 0.78515625, + "learning_rate": 0.00018827423938388977, + "loss": 4.3683, + "step": 4513 + }, + { + "epoch": 0.46804787795915986, + "grad_norm": 0.69921875, + "learning_rate": 0.00018826913513521335, + "loss": 4.4184, + "step": 4514 + }, + { + "epoch": 0.46815156601364794, + "grad_norm": 0.62109375, + "learning_rate": 0.0001882640298450544, + "loss": 4.3855, + "step": 4515 + }, + { + "epoch": 0.468255254068136, + "grad_norm": 0.64453125, + "learning_rate": 0.0001882589235134732, + "loss": 4.3802, + "step": 4516 + }, + { + "epoch": 0.4683589421226241, + "grad_norm": 0.62109375, + "learning_rate": 0.00018825381614053004, + "loss": 4.3976, + "step": 4517 + }, + { + "epoch": 0.46846263017711215, + "grad_norm": 0.5859375, + "learning_rate": 0.00018824870772628512, + "loss": 4.3788, + "step": 4518 + }, + { + "epoch": 0.4685663182316002, + "grad_norm": 0.62109375, + "learning_rate": 0.00018824359827079873, + "loss": 4.391, + "step": 4519 + }, + { + "epoch": 0.4686700062860883, + "grad_norm": 0.56640625, + "learning_rate": 0.00018823848777413114, + "loss": 4.3887, + "step": 4520 + }, + { + "epoch": 0.46877369434057636, + "grad_norm": 0.62890625, + "learning_rate": 0.00018823337623634267, + "loss": 4.407, + "step": 4521 + }, + { + "epoch": 0.46887738239506443, + "grad_norm": 0.58203125, + "learning_rate": 0.00018822826365749365, + "loss": 4.4155, + "step": 4522 + }, + { + "epoch": 0.4689810704495525, + "grad_norm": 0.640625, + "learning_rate": 0.00018822315003764434, + "loss": 4.4231, + "step": 4523 + }, + { + "epoch": 0.4690847585040406, + "grad_norm": 0.58984375, + "learning_rate": 0.00018821803537685515, + "loss": 4.3997, + "step": 4524 + }, + { + "epoch": 0.46918844655852865, + "grad_norm": 0.7421875, + "learning_rate": 0.00018821291967518637, + "loss": 4.3025, + "step": 4525 + }, + { + "epoch": 0.4692921346130167, + "grad_norm": 0.640625, + "learning_rate": 0.0001882078029326984, + "loss": 4.4274, + "step": 4526 + }, + { + "epoch": 0.4693958226675048, + "grad_norm": 0.66796875, + "learning_rate": 0.0001882026851494516, + "loss": 4.3967, + "step": 4527 + }, + { + "epoch": 0.46949951072199286, + "grad_norm": 0.74609375, + "learning_rate": 0.00018819756632550635, + "loss": 4.3833, + "step": 4528 + }, + { + "epoch": 0.46960319877648093, + "grad_norm": 0.6640625, + "learning_rate": 0.00018819244646092303, + "loss": 4.3692, + "step": 4529 + }, + { + "epoch": 0.469706886830969, + "grad_norm": 0.6875, + "learning_rate": 0.00018818732555576207, + "loss": 4.3743, + "step": 4530 + }, + { + "epoch": 0.4698105748854571, + "grad_norm": 0.72265625, + "learning_rate": 0.0001881822036100839, + "loss": 4.3747, + "step": 4531 + }, + { + "epoch": 0.46991426293994515, + "grad_norm": 0.59765625, + "learning_rate": 0.0001881770806239489, + "loss": 4.3656, + "step": 4532 + }, + { + "epoch": 0.4700179509944332, + "grad_norm": 0.66796875, + "learning_rate": 0.0001881719565974176, + "loss": 4.3915, + "step": 4533 + }, + { + "epoch": 0.47012163904892135, + "grad_norm": 0.60546875, + "learning_rate": 0.0001881668315305504, + "loss": 4.3953, + "step": 4534 + }, + { + "epoch": 0.4702253271034094, + "grad_norm": 0.6953125, + "learning_rate": 0.0001881617054234078, + "loss": 4.3876, + "step": 4535 + }, + { + "epoch": 0.4703290151578975, + "grad_norm": 0.609375, + "learning_rate": 0.00018815657827605023, + "loss": 4.3467, + "step": 4536 + }, + { + "epoch": 0.47043270321238556, + "grad_norm": 0.6640625, + "learning_rate": 0.00018815145008853823, + "loss": 4.3883, + "step": 4537 + }, + { + "epoch": 0.47053639126687363, + "grad_norm": 0.6328125, + "learning_rate": 0.00018814632086093234, + "loss": 4.3748, + "step": 4538 + }, + { + "epoch": 0.4706400793213617, + "grad_norm": 0.640625, + "learning_rate": 0.000188141190593293, + "loss": 4.3926, + "step": 4539 + }, + { + "epoch": 0.4707437673758498, + "grad_norm": 0.66796875, + "learning_rate": 0.0001881360592856808, + "loss": 4.3796, + "step": 4540 + }, + { + "epoch": 0.47084745543033785, + "grad_norm": 0.6171875, + "learning_rate": 0.00018813092693815623, + "loss": 4.357, + "step": 4541 + }, + { + "epoch": 0.4709511434848259, + "grad_norm": 0.68359375, + "learning_rate": 0.00018812579355077992, + "loss": 4.3667, + "step": 4542 + }, + { + "epoch": 0.471054831539314, + "grad_norm": 0.66015625, + "learning_rate": 0.0001881206591236124, + "loss": 4.3696, + "step": 4543 + }, + { + "epoch": 0.47115851959380206, + "grad_norm": 0.6953125, + "learning_rate": 0.00018811552365671422, + "loss": 4.4268, + "step": 4544 + }, + { + "epoch": 0.47126220764829013, + "grad_norm": 0.59375, + "learning_rate": 0.000188110387150146, + "loss": 4.372, + "step": 4545 + }, + { + "epoch": 0.4713658957027782, + "grad_norm": 0.6328125, + "learning_rate": 0.00018810524960396837, + "loss": 4.3459, + "step": 4546 + }, + { + "epoch": 0.4714695837572663, + "grad_norm": 0.6171875, + "learning_rate": 0.00018810011101824193, + "loss": 4.3908, + "step": 4547 + }, + { + "epoch": 0.47157327181175435, + "grad_norm": 0.640625, + "learning_rate": 0.0001880949713930273, + "loss": 4.4018, + "step": 4548 + }, + { + "epoch": 0.4716769598662424, + "grad_norm": 0.609375, + "learning_rate": 0.0001880898307283851, + "loss": 4.3963, + "step": 4549 + }, + { + "epoch": 0.4717806479207305, + "grad_norm": 0.65625, + "learning_rate": 0.00018808468902437606, + "loss": 4.4004, + "step": 4550 + }, + { + "epoch": 0.47188433597521856, + "grad_norm": 0.58984375, + "learning_rate": 0.00018807954628106076, + "loss": 4.3346, + "step": 4551 + }, + { + "epoch": 0.47198802402970663, + "grad_norm": 0.70703125, + "learning_rate": 0.00018807440249849996, + "loss": 4.3979, + "step": 4552 + }, + { + "epoch": 0.4720917120841947, + "grad_norm": 0.5859375, + "learning_rate": 0.00018806925767675425, + "loss": 4.3794, + "step": 4553 + }, + { + "epoch": 0.4721954001386828, + "grad_norm": 0.69921875, + "learning_rate": 0.00018806411181588443, + "loss": 4.3476, + "step": 4554 + }, + { + "epoch": 0.47229908819317085, + "grad_norm": 0.671875, + "learning_rate": 0.00018805896491595116, + "loss": 4.3423, + "step": 4555 + }, + { + "epoch": 0.4724027762476589, + "grad_norm": 0.64453125, + "learning_rate": 0.00018805381697701524, + "loss": 4.3548, + "step": 4556 + }, + { + "epoch": 0.472506464302147, + "grad_norm": 0.69140625, + "learning_rate": 0.0001880486679991373, + "loss": 4.3531, + "step": 4557 + }, + { + "epoch": 0.47261015235663506, + "grad_norm": 0.58203125, + "learning_rate": 0.00018804351798237817, + "loss": 4.3784, + "step": 4558 + }, + { + "epoch": 0.47271384041112313, + "grad_norm": 0.6640625, + "learning_rate": 0.00018803836692679856, + "loss": 4.3495, + "step": 4559 + }, + { + "epoch": 0.4728175284656112, + "grad_norm": 0.63671875, + "learning_rate": 0.00018803321483245932, + "loss": 4.3597, + "step": 4560 + }, + { + "epoch": 0.4729212165200993, + "grad_norm": 0.62890625, + "learning_rate": 0.00018802806169942123, + "loss": 4.3687, + "step": 4561 + }, + { + "epoch": 0.47302490457458735, + "grad_norm": 0.64453125, + "learning_rate": 0.000188022907527745, + "loss": 4.3541, + "step": 4562 + }, + { + "epoch": 0.4731285926290754, + "grad_norm": 0.6015625, + "learning_rate": 0.00018801775231749152, + "loss": 4.412, + "step": 4563 + }, + { + "epoch": 0.4732322806835635, + "grad_norm": 0.71875, + "learning_rate": 0.0001880125960687216, + "loss": 4.359, + "step": 4564 + }, + { + "epoch": 0.47333596873805156, + "grad_norm": 0.60546875, + "learning_rate": 0.00018800743878149613, + "loss": 4.3814, + "step": 4565 + }, + { + "epoch": 0.47343965679253963, + "grad_norm": 0.703125, + "learning_rate": 0.00018800228045587586, + "loss": 4.3545, + "step": 4566 + }, + { + "epoch": 0.4735433448470277, + "grad_norm": 0.58984375, + "learning_rate": 0.0001879971210919217, + "loss": 4.377, + "step": 4567 + }, + { + "epoch": 0.4736470329015158, + "grad_norm": 0.69140625, + "learning_rate": 0.00018799196068969453, + "loss": 4.3649, + "step": 4568 + }, + { + "epoch": 0.47375072095600385, + "grad_norm": 0.640625, + "learning_rate": 0.00018798679924925525, + "loss": 4.3682, + "step": 4569 + }, + { + "epoch": 0.4738544090104919, + "grad_norm": 0.62890625, + "learning_rate": 0.00018798163677066475, + "loss": 4.3693, + "step": 4570 + }, + { + "epoch": 0.47395809706498, + "grad_norm": 0.66015625, + "learning_rate": 0.00018797647325398392, + "loss": 4.3592, + "step": 4571 + }, + { + "epoch": 0.47406178511946806, + "grad_norm": 0.625, + "learning_rate": 0.0001879713086992737, + "loss": 4.3882, + "step": 4572 + }, + { + "epoch": 0.47416547317395613, + "grad_norm": 0.703125, + "learning_rate": 0.00018796614310659506, + "loss": 4.3977, + "step": 4573 + }, + { + "epoch": 0.4742691612284442, + "grad_norm": 0.6953125, + "learning_rate": 0.00018796097647600887, + "loss": 4.3923, + "step": 4574 + }, + { + "epoch": 0.4743728492829323, + "grad_norm": 0.70703125, + "learning_rate": 0.00018795580880757618, + "loss": 4.3809, + "step": 4575 + }, + { + "epoch": 0.47447653733742035, + "grad_norm": 0.79296875, + "learning_rate": 0.00018795064010135787, + "loss": 4.403, + "step": 4576 + }, + { + "epoch": 0.4745802253919084, + "grad_norm": 0.67578125, + "learning_rate": 0.000187945470357415, + "loss": 4.3187, + "step": 4577 + }, + { + "epoch": 0.47468391344639654, + "grad_norm": 0.68359375, + "learning_rate": 0.00018794029957580857, + "loss": 4.3486, + "step": 4578 + }, + { + "epoch": 0.4747876015008846, + "grad_norm": 0.77734375, + "learning_rate": 0.0001879351277565995, + "loss": 4.3955, + "step": 4579 + }, + { + "epoch": 0.4748912895553727, + "grad_norm": 0.59765625, + "learning_rate": 0.00018792995489984893, + "loss": 4.3646, + "step": 4580 + }, + { + "epoch": 0.47499497760986076, + "grad_norm": 0.80859375, + "learning_rate": 0.0001879247810056178, + "loss": 4.3669, + "step": 4581 + }, + { + "epoch": 0.47509866566434883, + "grad_norm": 0.68359375, + "learning_rate": 0.00018791960607396723, + "loss": 4.3681, + "step": 4582 + }, + { + "epoch": 0.4752023537188369, + "grad_norm": 0.60546875, + "learning_rate": 0.0001879144301049582, + "loss": 4.3619, + "step": 4583 + }, + { + "epoch": 0.47530604177332497, + "grad_norm": 0.62109375, + "learning_rate": 0.0001879092530986519, + "loss": 4.3535, + "step": 4584 + }, + { + "epoch": 0.47540972982781304, + "grad_norm": 0.6015625, + "learning_rate": 0.00018790407505510929, + "loss": 4.33, + "step": 4585 + }, + { + "epoch": 0.4755134178823011, + "grad_norm": 0.6015625, + "learning_rate": 0.00018789889597439148, + "loss": 4.4277, + "step": 4586 + }, + { + "epoch": 0.4756171059367892, + "grad_norm": 0.65625, + "learning_rate": 0.00018789371585655964, + "loss": 4.3586, + "step": 4587 + }, + { + "epoch": 0.47572079399127726, + "grad_norm": 0.73046875, + "learning_rate": 0.00018788853470167488, + "loss": 4.3724, + "step": 4588 + }, + { + "epoch": 0.47582448204576533, + "grad_norm": 0.6796875, + "learning_rate": 0.00018788335250979828, + "loss": 4.3605, + "step": 4589 + }, + { + "epoch": 0.4759281701002534, + "grad_norm": 0.671875, + "learning_rate": 0.00018787816928099102, + "loss": 4.3503, + "step": 4590 + }, + { + "epoch": 0.47603185815474147, + "grad_norm": 0.671875, + "learning_rate": 0.00018787298501531428, + "loss": 4.4126, + "step": 4591 + }, + { + "epoch": 0.47613554620922954, + "grad_norm": 0.76953125, + "learning_rate": 0.00018786779971282917, + "loss": 4.3549, + "step": 4592 + }, + { + "epoch": 0.4762392342637176, + "grad_norm": 0.60546875, + "learning_rate": 0.0001878626133735969, + "loss": 4.3635, + "step": 4593 + }, + { + "epoch": 0.4763429223182057, + "grad_norm": 0.80078125, + "learning_rate": 0.00018785742599767872, + "loss": 4.3495, + "step": 4594 + }, + { + "epoch": 0.47644661037269376, + "grad_norm": 0.6953125, + "learning_rate": 0.00018785223758513575, + "loss": 4.3772, + "step": 4595 + }, + { + "epoch": 0.47655029842718183, + "grad_norm": 0.6875, + "learning_rate": 0.00018784704813602925, + "loss": 4.3902, + "step": 4596 + }, + { + "epoch": 0.4766539864816699, + "grad_norm": 0.73046875, + "learning_rate": 0.00018784185765042042, + "loss": 4.3868, + "step": 4597 + }, + { + "epoch": 0.47675767453615797, + "grad_norm": 0.6796875, + "learning_rate": 0.00018783666612837054, + "loss": 4.3831, + "step": 4598 + }, + { + "epoch": 0.47686136259064604, + "grad_norm": 0.6953125, + "learning_rate": 0.00018783147356994084, + "loss": 4.3242, + "step": 4599 + }, + { + "epoch": 0.4769650506451341, + "grad_norm": 0.66796875, + "learning_rate": 0.0001878262799751926, + "loss": 4.3894, + "step": 4600 + }, + { + "epoch": 0.4770687386996222, + "grad_norm": 0.8125, + "learning_rate": 0.00018782108534418708, + "loss": 4.3856, + "step": 4601 + }, + { + "epoch": 0.47717242675411026, + "grad_norm": 0.66796875, + "learning_rate": 0.00018781588967698557, + "loss": 4.3688, + "step": 4602 + }, + { + "epoch": 0.47727611480859833, + "grad_norm": 0.73828125, + "learning_rate": 0.00018781069297364944, + "loss": 4.3715, + "step": 4603 + }, + { + "epoch": 0.4773798028630864, + "grad_norm": 0.65234375, + "learning_rate": 0.0001878054952342399, + "loss": 4.3521, + "step": 4604 + }, + { + "epoch": 0.47748349091757447, + "grad_norm": 0.67578125, + "learning_rate": 0.00018780029645881836, + "loss": 4.3793, + "step": 4605 + }, + { + "epoch": 0.47758717897206254, + "grad_norm": 0.71484375, + "learning_rate": 0.0001877950966474461, + "loss": 4.3731, + "step": 4606 + }, + { + "epoch": 0.4776908670265506, + "grad_norm": 0.66015625, + "learning_rate": 0.00018778989580018455, + "loss": 4.3969, + "step": 4607 + }, + { + "epoch": 0.4777945550810387, + "grad_norm": 0.75390625, + "learning_rate": 0.000187784693917095, + "loss": 4.3838, + "step": 4608 + }, + { + "epoch": 0.47789824313552676, + "grad_norm": 0.70703125, + "learning_rate": 0.00018777949099823887, + "loss": 4.3661, + "step": 4609 + }, + { + "epoch": 0.47800193119001483, + "grad_norm": 0.734375, + "learning_rate": 0.00018777428704367752, + "loss": 4.3812, + "step": 4610 + }, + { + "epoch": 0.4781056192445029, + "grad_norm": 0.70703125, + "learning_rate": 0.00018776908205347237, + "loss": 4.3726, + "step": 4611 + }, + { + "epoch": 0.47820930729899097, + "grad_norm": 0.84765625, + "learning_rate": 0.00018776387602768483, + "loss": 4.3957, + "step": 4612 + }, + { + "epoch": 0.47831299535347904, + "grad_norm": 0.640625, + "learning_rate": 0.0001877586689663763, + "loss": 4.3695, + "step": 4613 + }, + { + "epoch": 0.4784166834079671, + "grad_norm": 0.69921875, + "learning_rate": 0.00018775346086960827, + "loss": 4.3326, + "step": 4614 + }, + { + "epoch": 0.4785203714624552, + "grad_norm": 0.68359375, + "learning_rate": 0.00018774825173744212, + "loss": 4.394, + "step": 4615 + }, + { + "epoch": 0.47862405951694326, + "grad_norm": 0.64453125, + "learning_rate": 0.0001877430415699394, + "loss": 4.3301, + "step": 4616 + }, + { + "epoch": 0.47872774757143133, + "grad_norm": 0.62109375, + "learning_rate": 0.00018773783036716153, + "loss": 4.3479, + "step": 4617 + }, + { + "epoch": 0.4788314356259194, + "grad_norm": 0.67578125, + "learning_rate": 0.00018773261812916997, + "loss": 4.3828, + "step": 4618 + }, + { + "epoch": 0.47893512368040747, + "grad_norm": 0.625, + "learning_rate": 0.00018772740485602628, + "loss": 4.3921, + "step": 4619 + }, + { + "epoch": 0.47903881173489554, + "grad_norm": 0.671875, + "learning_rate": 0.00018772219054779193, + "loss": 4.3895, + "step": 4620 + }, + { + "epoch": 0.4791424997893836, + "grad_norm": 0.6484375, + "learning_rate": 0.0001877169752045285, + "loss": 4.3958, + "step": 4621 + }, + { + "epoch": 0.4792461878438717, + "grad_norm": 0.671875, + "learning_rate": 0.00018771175882629744, + "loss": 4.3306, + "step": 4622 + }, + { + "epoch": 0.4793498758983598, + "grad_norm": 0.68359375, + "learning_rate": 0.00018770654141316037, + "loss": 4.3752, + "step": 4623 + }, + { + "epoch": 0.4794535639528479, + "grad_norm": 0.62890625, + "learning_rate": 0.0001877013229651788, + "loss": 4.3535, + "step": 4624 + }, + { + "epoch": 0.47955725200733595, + "grad_norm": 0.6953125, + "learning_rate": 0.00018769610348241434, + "loss": 4.3461, + "step": 4625 + }, + { + "epoch": 0.479660940061824, + "grad_norm": 0.62890625, + "learning_rate": 0.00018769088296492854, + "loss": 4.3541, + "step": 4626 + }, + { + "epoch": 0.4797646281163121, + "grad_norm": 0.6640625, + "learning_rate": 0.000187685661412783, + "loss": 4.3587, + "step": 4627 + }, + { + "epoch": 0.47986831617080017, + "grad_norm": 0.73828125, + "learning_rate": 0.00018768043882603935, + "loss": 4.3785, + "step": 4628 + }, + { + "epoch": 0.47997200422528824, + "grad_norm": 0.6875, + "learning_rate": 0.00018767521520475925, + "loss": 4.3602, + "step": 4629 + }, + { + "epoch": 0.4800756922797763, + "grad_norm": 0.80859375, + "learning_rate": 0.00018766999054900424, + "loss": 4.403, + "step": 4630 + }, + { + "epoch": 0.4801793803342644, + "grad_norm": 0.77734375, + "learning_rate": 0.00018766476485883603, + "loss": 4.3446, + "step": 4631 + }, + { + "epoch": 0.48028306838875245, + "grad_norm": 0.6953125, + "learning_rate": 0.00018765953813431628, + "loss": 4.3514, + "step": 4632 + }, + { + "epoch": 0.4803867564432405, + "grad_norm": 0.80078125, + "learning_rate": 0.00018765431037550662, + "loss": 4.3735, + "step": 4633 + }, + { + "epoch": 0.4804904444977286, + "grad_norm": 0.7265625, + "learning_rate": 0.00018764908158246875, + "loss": 4.3809, + "step": 4634 + }, + { + "epoch": 0.48059413255221667, + "grad_norm": 0.76953125, + "learning_rate": 0.00018764385175526436, + "loss": 4.3671, + "step": 4635 + }, + { + "epoch": 0.48069782060670474, + "grad_norm": 0.67578125, + "learning_rate": 0.00018763862089395515, + "loss": 4.3401, + "step": 4636 + }, + { + "epoch": 0.4808015086611928, + "grad_norm": 0.71875, + "learning_rate": 0.00018763338899860287, + "loss": 4.3297, + "step": 4637 + }, + { + "epoch": 0.4809051967156809, + "grad_norm": 0.73828125, + "learning_rate": 0.00018762815606926926, + "loss": 4.3903, + "step": 4638 + }, + { + "epoch": 0.48100888477016895, + "grad_norm": 0.7578125, + "learning_rate": 0.000187622922106016, + "loss": 4.3983, + "step": 4639 + }, + { + "epoch": 0.481112572824657, + "grad_norm": 0.66796875, + "learning_rate": 0.0001876176871089049, + "loss": 4.3711, + "step": 4640 + }, + { + "epoch": 0.4812162608791451, + "grad_norm": 0.78515625, + "learning_rate": 0.00018761245107799769, + "loss": 4.3311, + "step": 4641 + }, + { + "epoch": 0.48131994893363317, + "grad_norm": 0.6328125, + "learning_rate": 0.0001876072140133562, + "loss": 4.3567, + "step": 4642 + }, + { + "epoch": 0.48142363698812124, + "grad_norm": 0.75390625, + "learning_rate": 0.00018760197591504213, + "loss": 4.3462, + "step": 4643 + }, + { + "epoch": 0.4815273250426093, + "grad_norm": 0.75, + "learning_rate": 0.0001875967367831174, + "loss": 4.3802, + "step": 4644 + }, + { + "epoch": 0.4816310130970974, + "grad_norm": 0.69140625, + "learning_rate": 0.00018759149661764374, + "loss": 4.3746, + "step": 4645 + }, + { + "epoch": 0.48173470115158545, + "grad_norm": 0.7578125, + "learning_rate": 0.00018758625541868303, + "loss": 4.3654, + "step": 4646 + }, + { + "epoch": 0.4818383892060735, + "grad_norm": 0.67578125, + "learning_rate": 0.00018758101318629706, + "loss": 4.3534, + "step": 4647 + }, + { + "epoch": 0.4819420772605616, + "grad_norm": 0.70703125, + "learning_rate": 0.00018757576992054772, + "loss": 4.388, + "step": 4648 + }, + { + "epoch": 0.48204576531504967, + "grad_norm": 0.796875, + "learning_rate": 0.0001875705256214969, + "loss": 4.3819, + "step": 4649 + }, + { + "epoch": 0.48214945336953774, + "grad_norm": 0.69921875, + "learning_rate": 0.00018756528028920642, + "loss": 4.3552, + "step": 4650 + }, + { + "epoch": 0.4822531414240258, + "grad_norm": 0.640625, + "learning_rate": 0.00018756003392373817, + "loss": 4.394, + "step": 4651 + }, + { + "epoch": 0.4823568294785139, + "grad_norm": 0.7421875, + "learning_rate": 0.00018755478652515407, + "loss": 4.3123, + "step": 4652 + }, + { + "epoch": 0.48246051753300195, + "grad_norm": 0.6015625, + "learning_rate": 0.00018754953809351608, + "loss": 4.3751, + "step": 4653 + }, + { + "epoch": 0.48256420558749, + "grad_norm": 0.70703125, + "learning_rate": 0.00018754428862888606, + "loss": 4.3575, + "step": 4654 + }, + { + "epoch": 0.4826678936419781, + "grad_norm": 0.65625, + "learning_rate": 0.000187539038131326, + "loss": 4.3291, + "step": 4655 + }, + { + "epoch": 0.48277158169646617, + "grad_norm": 0.64453125, + "learning_rate": 0.0001875337866008978, + "loss": 4.3593, + "step": 4656 + }, + { + "epoch": 0.48287526975095424, + "grad_norm": 0.69140625, + "learning_rate": 0.00018752853403766344, + "loss": 4.3647, + "step": 4657 + }, + { + "epoch": 0.4829789578054423, + "grad_norm": 0.63671875, + "learning_rate": 0.00018752328044168492, + "loss": 4.3949, + "step": 4658 + }, + { + "epoch": 0.4830826458599304, + "grad_norm": 0.66796875, + "learning_rate": 0.0001875180258130242, + "loss": 4.3514, + "step": 4659 + }, + { + "epoch": 0.48318633391441845, + "grad_norm": 0.68359375, + "learning_rate": 0.00018751277015174327, + "loss": 4.3909, + "step": 4660 + }, + { + "epoch": 0.4832900219689065, + "grad_norm": 0.6796875, + "learning_rate": 0.00018750751345790416, + "loss": 4.3897, + "step": 4661 + }, + { + "epoch": 0.4833937100233946, + "grad_norm": 0.6796875, + "learning_rate": 0.00018750225573156893, + "loss": 4.3775, + "step": 4662 + }, + { + "epoch": 0.48349739807788267, + "grad_norm": 0.62109375, + "learning_rate": 0.00018749699697279953, + "loss": 4.3591, + "step": 4663 + }, + { + "epoch": 0.48360108613237074, + "grad_norm": 0.69921875, + "learning_rate": 0.00018749173718165805, + "loss": 4.3801, + "step": 4664 + }, + { + "epoch": 0.4837047741868588, + "grad_norm": 0.625, + "learning_rate": 0.00018748647635820657, + "loss": 4.3897, + "step": 4665 + }, + { + "epoch": 0.4838084622413469, + "grad_norm": 0.6640625, + "learning_rate": 0.00018748121450250715, + "loss": 4.3256, + "step": 4666 + }, + { + "epoch": 0.48391215029583495, + "grad_norm": 0.63671875, + "learning_rate": 0.0001874759516146219, + "loss": 4.3901, + "step": 4667 + }, + { + "epoch": 0.4840158383503231, + "grad_norm": 0.59375, + "learning_rate": 0.00018747068769461284, + "loss": 4.3546, + "step": 4668 + }, + { + "epoch": 0.48411952640481115, + "grad_norm": 0.65234375, + "learning_rate": 0.00018746542274254214, + "loss": 4.3735, + "step": 4669 + }, + { + "epoch": 0.4842232144592992, + "grad_norm": 0.609375, + "learning_rate": 0.0001874601567584719, + "loss": 4.3351, + "step": 4670 + }, + { + "epoch": 0.4843269025137873, + "grad_norm": 0.6484375, + "learning_rate": 0.00018745488974246431, + "loss": 4.3622, + "step": 4671 + }, + { + "epoch": 0.48443059056827537, + "grad_norm": 0.62109375, + "learning_rate": 0.0001874496216945814, + "loss": 4.3652, + "step": 4672 + }, + { + "epoch": 0.48453427862276344, + "grad_norm": 0.5703125, + "learning_rate": 0.00018744435261488541, + "loss": 4.3184, + "step": 4673 + }, + { + "epoch": 0.4846379666772515, + "grad_norm": 0.6953125, + "learning_rate": 0.00018743908250343848, + "loss": 4.3506, + "step": 4674 + }, + { + "epoch": 0.4847416547317396, + "grad_norm": 0.5703125, + "learning_rate": 0.00018743381136030284, + "loss": 4.372, + "step": 4675 + }, + { + "epoch": 0.48484534278622765, + "grad_norm": 0.59765625, + "learning_rate": 0.00018742853918554065, + "loss": 4.3609, + "step": 4676 + }, + { + "epoch": 0.4849490308407157, + "grad_norm": 0.55859375, + "learning_rate": 0.00018742326597921406, + "loss": 4.3369, + "step": 4677 + }, + { + "epoch": 0.4850527188952038, + "grad_norm": 0.6484375, + "learning_rate": 0.00018741799174138538, + "loss": 4.425, + "step": 4678 + }, + { + "epoch": 0.48515640694969187, + "grad_norm": 0.69140625, + "learning_rate": 0.0001874127164721168, + "loss": 4.3741, + "step": 4679 + }, + { + "epoch": 0.48526009500417994, + "grad_norm": 0.59765625, + "learning_rate": 0.00018740744017147056, + "loss": 4.3473, + "step": 4680 + }, + { + "epoch": 0.485363783058668, + "grad_norm": 0.71484375, + "learning_rate": 0.00018740216283950895, + "loss": 4.3419, + "step": 4681 + }, + { + "epoch": 0.4854674711131561, + "grad_norm": 0.58203125, + "learning_rate": 0.00018739688447629415, + "loss": 4.3533, + "step": 4682 + }, + { + "epoch": 0.48557115916764415, + "grad_norm": 0.73828125, + "learning_rate": 0.00018739160508188853, + "loss": 4.3423, + "step": 4683 + }, + { + "epoch": 0.4856748472221322, + "grad_norm": 0.65625, + "learning_rate": 0.00018738632465635434, + "loss": 4.2913, + "step": 4684 + }, + { + "epoch": 0.4857785352766203, + "grad_norm": 0.71484375, + "learning_rate": 0.0001873810431997539, + "loss": 4.3524, + "step": 4685 + }, + { + "epoch": 0.48588222333110836, + "grad_norm": 0.796875, + "learning_rate": 0.00018737576071214948, + "loss": 4.377, + "step": 4686 + }, + { + "epoch": 0.48598591138559644, + "grad_norm": 0.76171875, + "learning_rate": 0.00018737047719360347, + "loss": 4.3887, + "step": 4687 + }, + { + "epoch": 0.4860895994400845, + "grad_norm": 0.75, + "learning_rate": 0.00018736519264417822, + "loss": 4.4016, + "step": 4688 + }, + { + "epoch": 0.4861932874945726, + "grad_norm": 0.8359375, + "learning_rate": 0.00018735990706393599, + "loss": 4.3848, + "step": 4689 + }, + { + "epoch": 0.48629697554906065, + "grad_norm": 0.75390625, + "learning_rate": 0.00018735462045293923, + "loss": 4.3491, + "step": 4690 + }, + { + "epoch": 0.4864006636035487, + "grad_norm": 0.7421875, + "learning_rate": 0.00018734933281125028, + "loss": 4.3585, + "step": 4691 + }, + { + "epoch": 0.4865043516580368, + "grad_norm": 0.64453125, + "learning_rate": 0.0001873440441389315, + "loss": 4.3719, + "step": 4692 + }, + { + "epoch": 0.48660803971252486, + "grad_norm": 0.7734375, + "learning_rate": 0.00018733875443604538, + "loss": 4.3889, + "step": 4693 + }, + { + "epoch": 0.48671172776701294, + "grad_norm": 0.66796875, + "learning_rate": 0.00018733346370265427, + "loss": 4.382, + "step": 4694 + }, + { + "epoch": 0.486815415821501, + "grad_norm": 0.67578125, + "learning_rate": 0.00018732817193882058, + "loss": 4.3575, + "step": 4695 + }, + { + "epoch": 0.4869191038759891, + "grad_norm": 0.73046875, + "learning_rate": 0.00018732287914460677, + "loss": 4.3879, + "step": 4696 + }, + { + "epoch": 0.48702279193047715, + "grad_norm": 0.6796875, + "learning_rate": 0.0001873175853200753, + "loss": 4.3682, + "step": 4697 + }, + { + "epoch": 0.4871264799849652, + "grad_norm": 0.69140625, + "learning_rate": 0.00018731229046528865, + "loss": 4.3852, + "step": 4698 + }, + { + "epoch": 0.4872301680394533, + "grad_norm": 0.72265625, + "learning_rate": 0.0001873069945803092, + "loss": 4.3816, + "step": 4699 + }, + { + "epoch": 0.48733385609394136, + "grad_norm": 0.63671875, + "learning_rate": 0.00018730169766519955, + "loss": 4.3754, + "step": 4700 + }, + { + "epoch": 0.48743754414842944, + "grad_norm": 0.76171875, + "learning_rate": 0.00018729639972002215, + "loss": 4.3726, + "step": 4701 + }, + { + "epoch": 0.4875412322029175, + "grad_norm": 0.67578125, + "learning_rate": 0.00018729110074483952, + "loss": 4.3491, + "step": 4702 + }, + { + "epoch": 0.4876449202574056, + "grad_norm": 0.7421875, + "learning_rate": 0.00018728580073971413, + "loss": 4.4031, + "step": 4703 + }, + { + "epoch": 0.48774860831189365, + "grad_norm": 0.62890625, + "learning_rate": 0.00018728049970470858, + "loss": 4.3821, + "step": 4704 + }, + { + "epoch": 0.4878522963663817, + "grad_norm": 0.6796875, + "learning_rate": 0.0001872751976398854, + "loss": 4.3892, + "step": 4705 + }, + { + "epoch": 0.4879559844208698, + "grad_norm": 0.69140625, + "learning_rate": 0.00018726989454530713, + "loss": 4.3552, + "step": 4706 + }, + { + "epoch": 0.48805967247535786, + "grad_norm": 0.625, + "learning_rate": 0.00018726459042103634, + "loss": 4.3492, + "step": 4707 + }, + { + "epoch": 0.48816336052984594, + "grad_norm": 0.73046875, + "learning_rate": 0.00018725928526713566, + "loss": 4.3485, + "step": 4708 + }, + { + "epoch": 0.488267048584334, + "grad_norm": 0.58984375, + "learning_rate": 0.00018725397908366762, + "loss": 4.3352, + "step": 4709 + }, + { + "epoch": 0.4883707366388221, + "grad_norm": 0.71484375, + "learning_rate": 0.00018724867187069487, + "loss": 4.3679, + "step": 4710 + }, + { + "epoch": 0.48847442469331015, + "grad_norm": 0.62890625, + "learning_rate": 0.00018724336362828002, + "loss": 4.3396, + "step": 4711 + }, + { + "epoch": 0.4885781127477983, + "grad_norm": 0.64453125, + "learning_rate": 0.0001872380543564857, + "loss": 4.3776, + "step": 4712 + }, + { + "epoch": 0.48868180080228635, + "grad_norm": 0.87109375, + "learning_rate": 0.00018723274405537454, + "loss": 4.3836, + "step": 4713 + }, + { + "epoch": 0.4887854888567744, + "grad_norm": 0.66796875, + "learning_rate": 0.00018722743272500921, + "loss": 4.352, + "step": 4714 + }, + { + "epoch": 0.4888891769112625, + "grad_norm": 0.70703125, + "learning_rate": 0.0001872221203654524, + "loss": 4.3951, + "step": 4715 + }, + { + "epoch": 0.48899286496575056, + "grad_norm": 0.6953125, + "learning_rate": 0.00018721680697676675, + "loss": 4.4221, + "step": 4716 + }, + { + "epoch": 0.48909655302023863, + "grad_norm": 0.6171875, + "learning_rate": 0.00018721149255901499, + "loss": 4.3905, + "step": 4717 + }, + { + "epoch": 0.4892002410747267, + "grad_norm": 0.7109375, + "learning_rate": 0.0001872061771122598, + "loss": 4.3255, + "step": 4718 + }, + { + "epoch": 0.4893039291292148, + "grad_norm": 0.6484375, + "learning_rate": 0.00018720086063656388, + "loss": 4.371, + "step": 4719 + }, + { + "epoch": 0.48940761718370285, + "grad_norm": 0.62109375, + "learning_rate": 0.00018719554313198996, + "loss": 4.3986, + "step": 4720 + }, + { + "epoch": 0.4895113052381909, + "grad_norm": 0.64453125, + "learning_rate": 0.00018719022459860084, + "loss": 4.4115, + "step": 4721 + }, + { + "epoch": 0.489614993292679, + "grad_norm": 0.640625, + "learning_rate": 0.00018718490503645923, + "loss": 4.3245, + "step": 4722 + }, + { + "epoch": 0.48971868134716706, + "grad_norm": 0.63671875, + "learning_rate": 0.0001871795844456279, + "loss": 4.4211, + "step": 4723 + }, + { + "epoch": 0.48982236940165513, + "grad_norm": 0.671875, + "learning_rate": 0.0001871742628261696, + "loss": 4.407, + "step": 4724 + }, + { + "epoch": 0.4899260574561432, + "grad_norm": 0.609375, + "learning_rate": 0.00018716894017814718, + "loss": 4.3849, + "step": 4725 + }, + { + "epoch": 0.4900297455106313, + "grad_norm": 0.6796875, + "learning_rate": 0.00018716361650162336, + "loss": 4.3758, + "step": 4726 + }, + { + "epoch": 0.49013343356511935, + "grad_norm": 0.703125, + "learning_rate": 0.00018715829179666104, + "loss": 4.3135, + "step": 4727 + }, + { + "epoch": 0.4902371216196074, + "grad_norm": 0.6640625, + "learning_rate": 0.00018715296606332303, + "loss": 4.427, + "step": 4728 + }, + { + "epoch": 0.4903408096740955, + "grad_norm": 0.7578125, + "learning_rate": 0.0001871476393016721, + "loss": 4.3769, + "step": 4729 + }, + { + "epoch": 0.49044449772858356, + "grad_norm": 0.63671875, + "learning_rate": 0.00018714231151177116, + "loss": 4.3326, + "step": 4730 + }, + { + "epoch": 0.49054818578307163, + "grad_norm": 0.7734375, + "learning_rate": 0.00018713698269368306, + "loss": 4.3383, + "step": 4731 + }, + { + "epoch": 0.4906518738375597, + "grad_norm": 0.71875, + "learning_rate": 0.00018713165284747069, + "loss": 4.39, + "step": 4732 + }, + { + "epoch": 0.4907555618920478, + "grad_norm": 0.7265625, + "learning_rate": 0.00018712632197319688, + "loss": 4.38, + "step": 4733 + }, + { + "epoch": 0.49085924994653585, + "grad_norm": 0.72265625, + "learning_rate": 0.0001871209900709246, + "loss": 4.359, + "step": 4734 + }, + { + "epoch": 0.4909629380010239, + "grad_norm": 0.81640625, + "learning_rate": 0.0001871156571407167, + "loss": 4.3808, + "step": 4735 + }, + { + "epoch": 0.491066626055512, + "grad_norm": 0.765625, + "learning_rate": 0.00018711032318263616, + "loss": 4.3714, + "step": 4736 + }, + { + "epoch": 0.49117031411000006, + "grad_norm": 0.80078125, + "learning_rate": 0.00018710498819674584, + "loss": 4.3578, + "step": 4737 + }, + { + "epoch": 0.49127400216448813, + "grad_norm": 0.7734375, + "learning_rate": 0.0001870996521831088, + "loss": 4.3482, + "step": 4738 + }, + { + "epoch": 0.4913776902189762, + "grad_norm": 0.78515625, + "learning_rate": 0.0001870943151417879, + "loss": 4.4213, + "step": 4739 + }, + { + "epoch": 0.4914813782734643, + "grad_norm": 0.77734375, + "learning_rate": 0.00018708897707284613, + "loss": 4.3695, + "step": 4740 + }, + { + "epoch": 0.49158506632795235, + "grad_norm": 0.75, + "learning_rate": 0.0001870836379763465, + "loss": 4.3712, + "step": 4741 + }, + { + "epoch": 0.4916887543824404, + "grad_norm": 0.73828125, + "learning_rate": 0.00018707829785235197, + "loss": 4.3614, + "step": 4742 + }, + { + "epoch": 0.4917924424369285, + "grad_norm": 0.80078125, + "learning_rate": 0.0001870729567009256, + "loss": 4.3272, + "step": 4743 + }, + { + "epoch": 0.49189613049141656, + "grad_norm": 0.75, + "learning_rate": 0.00018706761452213038, + "loss": 4.3603, + "step": 4744 + }, + { + "epoch": 0.49199981854590463, + "grad_norm": 0.74609375, + "learning_rate": 0.00018706227131602934, + "loss": 4.384, + "step": 4745 + }, + { + "epoch": 0.4921035066003927, + "grad_norm": 0.76171875, + "learning_rate": 0.00018705692708268553, + "loss": 4.3522, + "step": 4746 + }, + { + "epoch": 0.4922071946548808, + "grad_norm": 0.7421875, + "learning_rate": 0.000187051581822162, + "loss": 4.3933, + "step": 4747 + }, + { + "epoch": 0.49231088270936885, + "grad_norm": 0.81640625, + "learning_rate": 0.0001870462355345218, + "loss": 4.3596, + "step": 4748 + }, + { + "epoch": 0.4924145707638569, + "grad_norm": 0.8125, + "learning_rate": 0.00018704088821982806, + "loss": 4.3722, + "step": 4749 + }, + { + "epoch": 0.492518258818345, + "grad_norm": 0.87890625, + "learning_rate": 0.00018703553987814382, + "loss": 4.388, + "step": 4750 + }, + { + "epoch": 0.49262194687283306, + "grad_norm": 0.80859375, + "learning_rate": 0.00018703019050953223, + "loss": 4.3509, + "step": 4751 + }, + { + "epoch": 0.49272563492732113, + "grad_norm": 0.84375, + "learning_rate": 0.0001870248401140564, + "loss": 4.3761, + "step": 4752 + }, + { + "epoch": 0.4928293229818092, + "grad_norm": 0.84765625, + "learning_rate": 0.00018701948869177942, + "loss": 4.3907, + "step": 4753 + }, + { + "epoch": 0.4929330110362973, + "grad_norm": 0.86328125, + "learning_rate": 0.00018701413624276446, + "loss": 4.3387, + "step": 4754 + }, + { + "epoch": 0.49303669909078535, + "grad_norm": 0.83984375, + "learning_rate": 0.0001870087827670747, + "loss": 4.3632, + "step": 4755 + }, + { + "epoch": 0.4931403871452734, + "grad_norm": 0.72265625, + "learning_rate": 0.00018700342826477324, + "loss": 4.3249, + "step": 4756 + }, + { + "epoch": 0.49324407519976154, + "grad_norm": 0.84765625, + "learning_rate": 0.0001869980727359233, + "loss": 4.3865, + "step": 4757 + }, + { + "epoch": 0.4933477632542496, + "grad_norm": 0.6796875, + "learning_rate": 0.0001869927161805881, + "loss": 4.3384, + "step": 4758 + }, + { + "epoch": 0.4934514513087377, + "grad_norm": 0.86328125, + "learning_rate": 0.00018698735859883076, + "loss": 4.3589, + "step": 4759 + }, + { + "epoch": 0.49355513936322576, + "grad_norm": 0.8046875, + "learning_rate": 0.00018698199999071455, + "loss": 4.388, + "step": 4760 + }, + { + "epoch": 0.49365882741771383, + "grad_norm": 0.81640625, + "learning_rate": 0.0001869766403563027, + "loss": 4.3556, + "step": 4761 + }, + { + "epoch": 0.4937625154722019, + "grad_norm": 0.8828125, + "learning_rate": 0.0001869712796956584, + "loss": 4.3647, + "step": 4762 + }, + { + "epoch": 0.49386620352669, + "grad_norm": 0.8125, + "learning_rate": 0.00018696591800884495, + "loss": 4.3689, + "step": 4763 + }, + { + "epoch": 0.49396989158117804, + "grad_norm": 0.80859375, + "learning_rate": 0.00018696055529592557, + "loss": 4.3974, + "step": 4764 + }, + { + "epoch": 0.4940735796356661, + "grad_norm": 0.75390625, + "learning_rate": 0.0001869551915569636, + "loss": 4.359, + "step": 4765 + }, + { + "epoch": 0.4941772676901542, + "grad_norm": 0.75, + "learning_rate": 0.00018694982679202227, + "loss": 4.3794, + "step": 4766 + }, + { + "epoch": 0.49428095574464226, + "grad_norm": 0.74609375, + "learning_rate": 0.0001869444610011649, + "loss": 4.3763, + "step": 4767 + }, + { + "epoch": 0.49438464379913033, + "grad_norm": 0.79296875, + "learning_rate": 0.00018693909418445478, + "loss": 4.3788, + "step": 4768 + }, + { + "epoch": 0.4944883318536184, + "grad_norm": 0.62890625, + "learning_rate": 0.00018693372634195527, + "loss": 4.3416, + "step": 4769 + }, + { + "epoch": 0.4945920199081065, + "grad_norm": 0.78125, + "learning_rate": 0.00018692835747372965, + "loss": 4.3486, + "step": 4770 + }, + { + "epoch": 0.49469570796259454, + "grad_norm": 0.640625, + "learning_rate": 0.00018692298757984132, + "loss": 4.3836, + "step": 4771 + }, + { + "epoch": 0.4947993960170826, + "grad_norm": 0.89453125, + "learning_rate": 0.0001869176166603536, + "loss": 4.3463, + "step": 4772 + }, + { + "epoch": 0.4949030840715707, + "grad_norm": 0.66796875, + "learning_rate": 0.00018691224471532988, + "loss": 4.3993, + "step": 4773 + }, + { + "epoch": 0.49500677212605876, + "grad_norm": 0.78125, + "learning_rate": 0.0001869068717448336, + "loss": 4.35, + "step": 4774 + }, + { + "epoch": 0.49511046018054683, + "grad_norm": 0.77734375, + "learning_rate": 0.00018690149774892802, + "loss": 4.3844, + "step": 4775 + }, + { + "epoch": 0.4952141482350349, + "grad_norm": 0.75, + "learning_rate": 0.00018689612272767666, + "loss": 4.3631, + "step": 4776 + }, + { + "epoch": 0.495317836289523, + "grad_norm": 0.67578125, + "learning_rate": 0.00018689074668114294, + "loss": 4.3669, + "step": 4777 + }, + { + "epoch": 0.49542152434401104, + "grad_norm": 0.80078125, + "learning_rate": 0.00018688536960939018, + "loss": 4.3282, + "step": 4778 + }, + { + "epoch": 0.4955252123984991, + "grad_norm": 0.61328125, + "learning_rate": 0.00018687999151248197, + "loss": 4.3554, + "step": 4779 + }, + { + "epoch": 0.4956289004529872, + "grad_norm": 0.7890625, + "learning_rate": 0.00018687461239048165, + "loss": 4.3386, + "step": 4780 + }, + { + "epoch": 0.49573258850747526, + "grad_norm": 0.69921875, + "learning_rate": 0.00018686923224345273, + "loss": 4.3446, + "step": 4781 + }, + { + "epoch": 0.49583627656196333, + "grad_norm": 0.6796875, + "learning_rate": 0.00018686385107145875, + "loss": 4.3566, + "step": 4782 + }, + { + "epoch": 0.4959399646164514, + "grad_norm": 0.640625, + "learning_rate": 0.0001868584688745631, + "loss": 4.3457, + "step": 4783 + }, + { + "epoch": 0.49604365267093947, + "grad_norm": 0.640625, + "learning_rate": 0.00018685308565282935, + "loss": 4.3423, + "step": 4784 + }, + { + "epoch": 0.49614734072542754, + "grad_norm": 0.78515625, + "learning_rate": 0.00018684770140632099, + "loss": 4.3752, + "step": 4785 + }, + { + "epoch": 0.4962510287799156, + "grad_norm": 0.78515625, + "learning_rate": 0.00018684231613510152, + "loss": 4.4074, + "step": 4786 + }, + { + "epoch": 0.4963547168344037, + "grad_norm": 0.7109375, + "learning_rate": 0.00018683692983923456, + "loss": 4.3825, + "step": 4787 + }, + { + "epoch": 0.49645840488889176, + "grad_norm": 0.71875, + "learning_rate": 0.0001868315425187836, + "loss": 4.3474, + "step": 4788 + }, + { + "epoch": 0.49656209294337983, + "grad_norm": 0.734375, + "learning_rate": 0.00018682615417381224, + "loss": 4.3531, + "step": 4789 + }, + { + "epoch": 0.4966657809978679, + "grad_norm": 0.796875, + "learning_rate": 0.00018682076480438405, + "loss": 4.3531, + "step": 4790 + }, + { + "epoch": 0.49676946905235597, + "grad_norm": 0.6796875, + "learning_rate": 0.0001868153744105626, + "loss": 4.3875, + "step": 4791 + }, + { + "epoch": 0.49687315710684404, + "grad_norm": 0.75, + "learning_rate": 0.00018680998299241147, + "loss": 4.3805, + "step": 4792 + }, + { + "epoch": 0.4969768451613321, + "grad_norm": 0.74609375, + "learning_rate": 0.00018680459054999432, + "loss": 4.3138, + "step": 4793 + }, + { + "epoch": 0.4970805332158202, + "grad_norm": 0.68359375, + "learning_rate": 0.00018679919708337476, + "loss": 4.3827, + "step": 4794 + }, + { + "epoch": 0.49718422127030826, + "grad_norm": 0.765625, + "learning_rate": 0.00018679380259261646, + "loss": 4.3639, + "step": 4795 + }, + { + "epoch": 0.49728790932479633, + "grad_norm": 0.7890625, + "learning_rate": 0.00018678840707778298, + "loss": 4.391, + "step": 4796 + }, + { + "epoch": 0.4973915973792844, + "grad_norm": 0.66796875, + "learning_rate": 0.00018678301053893804, + "loss": 4.3497, + "step": 4797 + }, + { + "epoch": 0.49749528543377247, + "grad_norm": 0.83203125, + "learning_rate": 0.00018677761297614532, + "loss": 4.4161, + "step": 4798 + }, + { + "epoch": 0.49759897348826054, + "grad_norm": 0.671875, + "learning_rate": 0.00018677221438946853, + "loss": 4.3588, + "step": 4799 + }, + { + "epoch": 0.4977026615427486, + "grad_norm": 0.79296875, + "learning_rate": 0.0001867668147789713, + "loss": 4.3636, + "step": 4800 + }, + { + "epoch": 0.4978063495972367, + "grad_norm": 0.69921875, + "learning_rate": 0.00018676141414471738, + "loss": 4.3342, + "step": 4801 + }, + { + "epoch": 0.4979100376517248, + "grad_norm": 0.8515625, + "learning_rate": 0.0001867560124867705, + "loss": 4.3903, + "step": 4802 + }, + { + "epoch": 0.4980137257062129, + "grad_norm": 0.67578125, + "learning_rate": 0.0001867506098051943, + "loss": 4.3743, + "step": 4803 + }, + { + "epoch": 0.49811741376070096, + "grad_norm": 0.84765625, + "learning_rate": 0.0001867452061000527, + "loss": 4.3445, + "step": 4804 + }, + { + "epoch": 0.498221101815189, + "grad_norm": 0.6953125, + "learning_rate": 0.0001867398013714093, + "loss": 4.3474, + "step": 4805 + }, + { + "epoch": 0.4983247898696771, + "grad_norm": 0.84765625, + "learning_rate": 0.00018673439561932797, + "loss": 4.3979, + "step": 4806 + }, + { + "epoch": 0.49842847792416517, + "grad_norm": 0.6328125, + "learning_rate": 0.00018672898884387244, + "loss": 4.3501, + "step": 4807 + }, + { + "epoch": 0.49853216597865324, + "grad_norm": 0.94140625, + "learning_rate": 0.0001867235810451065, + "loss": 4.4264, + "step": 4808 + }, + { + "epoch": 0.4986358540331413, + "grad_norm": 0.71875, + "learning_rate": 0.000186718172223094, + "loss": 4.4047, + "step": 4809 + }, + { + "epoch": 0.4987395420876294, + "grad_norm": 0.734375, + "learning_rate": 0.00018671276237789872, + "loss": 4.3913, + "step": 4810 + }, + { + "epoch": 0.49884323014211746, + "grad_norm": 0.85546875, + "learning_rate": 0.00018670735150958453, + "loss": 4.3599, + "step": 4811 + }, + { + "epoch": 0.4989469181966055, + "grad_norm": 0.84765625, + "learning_rate": 0.0001867019396182152, + "loss": 4.3785, + "step": 4812 + }, + { + "epoch": 0.4990506062510936, + "grad_norm": 0.859375, + "learning_rate": 0.00018669652670385463, + "loss": 4.3586, + "step": 4813 + }, + { + "epoch": 0.49915429430558167, + "grad_norm": 0.77734375, + "learning_rate": 0.00018669111276656673, + "loss": 4.3503, + "step": 4814 + }, + { + "epoch": 0.49925798236006974, + "grad_norm": 0.97265625, + "learning_rate": 0.00018668569780641532, + "loss": 4.3362, + "step": 4815 + }, + { + "epoch": 0.4993616704145578, + "grad_norm": 0.7578125, + "learning_rate": 0.0001866802818234643, + "loss": 4.3846, + "step": 4816 + }, + { + "epoch": 0.4994653584690459, + "grad_norm": 0.88671875, + "learning_rate": 0.0001866748648177776, + "loss": 4.3617, + "step": 4817 + }, + { + "epoch": 0.49956904652353396, + "grad_norm": 0.81640625, + "learning_rate": 0.00018666944678941909, + "loss": 4.3697, + "step": 4818 + }, + { + "epoch": 0.499672734578022, + "grad_norm": 0.8125, + "learning_rate": 0.00018666402773845272, + "loss": 4.3483, + "step": 4819 + }, + { + "epoch": 0.4997764226325101, + "grad_norm": 0.78515625, + "learning_rate": 0.00018665860766494245, + "loss": 4.399, + "step": 4820 + }, + { + "epoch": 0.49988011068699817, + "grad_norm": 0.92578125, + "learning_rate": 0.00018665318656895219, + "loss": 4.4077, + "step": 4821 + }, + { + "epoch": 0.49998379874148624, + "grad_norm": 0.984375, + "learning_rate": 0.00018664776445054596, + "loss": 4.4045, + "step": 4822 + }, + { + "epoch": 0.49998379874148624, + "eval_loss": 4.3779425621032715, + "eval_runtime": 0.4398, + "eval_samples_per_second": 338.822, + "eval_steps_per_second": 15.918, + "step": 4822 + }, + { + "epoch": 0.5000874867959743, + "grad_norm": 0.73828125, + "learning_rate": 0.00018664234130978766, + "loss": 4.3838, + "step": 4823 + }, + { + "epoch": 0.5001911748504624, + "grad_norm": 1.0, + "learning_rate": 0.00018663691714674132, + "loss": 4.3242, + "step": 4824 + }, + { + "epoch": 0.5002948629049505, + "grad_norm": 0.82421875, + "learning_rate": 0.00018663149196147095, + "loss": 4.3391, + "step": 4825 + }, + { + "epoch": 0.5003985509594385, + "grad_norm": 0.984375, + "learning_rate": 0.00018662606575404054, + "loss": 4.3059, + "step": 4826 + }, + { + "epoch": 0.5005022390139267, + "grad_norm": 0.9765625, + "learning_rate": 0.00018662063852451413, + "loss": 4.3825, + "step": 4827 + }, + { + "epoch": 0.5006059270684147, + "grad_norm": 0.77734375, + "learning_rate": 0.00018661521027295573, + "loss": 4.3923, + "step": 4828 + }, + { + "epoch": 0.5007096151229028, + "grad_norm": 0.9921875, + "learning_rate": 0.00018660978099942945, + "loss": 4.3616, + "step": 4829 + }, + { + "epoch": 0.5008133031773908, + "grad_norm": 0.71875, + "learning_rate": 0.00018660435070399925, + "loss": 4.3542, + "step": 4830 + }, + { + "epoch": 0.5009169912318789, + "grad_norm": 0.94140625, + "learning_rate": 0.0001865989193867293, + "loss": 4.4224, + "step": 4831 + }, + { + "epoch": 0.501020679286367, + "grad_norm": 0.73046875, + "learning_rate": 0.00018659348704768363, + "loss": 4.3345, + "step": 4832 + }, + { + "epoch": 0.5011243673408551, + "grad_norm": 0.8515625, + "learning_rate": 0.00018658805368692636, + "loss": 4.3521, + "step": 4833 + }, + { + "epoch": 0.5012280553953431, + "grad_norm": 0.7421875, + "learning_rate": 0.00018658261930452153, + "loss": 4.4144, + "step": 4834 + }, + { + "epoch": 0.5013317434498312, + "grad_norm": 0.79296875, + "learning_rate": 0.00018657718390053336, + "loss": 4.378, + "step": 4835 + }, + { + "epoch": 0.5014354315043192, + "grad_norm": 0.703125, + "learning_rate": 0.00018657174747502593, + "loss": 4.318, + "step": 4836 + }, + { + "epoch": 0.5015391195588074, + "grad_norm": 0.69921875, + "learning_rate": 0.00018656631002806338, + "loss": 4.3585, + "step": 4837 + }, + { + "epoch": 0.5016428076132954, + "grad_norm": 0.7734375, + "learning_rate": 0.0001865608715597099, + "loss": 4.3919, + "step": 4838 + }, + { + "epoch": 0.5017464956677835, + "grad_norm": 0.69921875, + "learning_rate": 0.00018655543207002958, + "loss": 4.3366, + "step": 4839 + }, + { + "epoch": 0.5018501837222715, + "grad_norm": 0.69140625, + "learning_rate": 0.0001865499915590867, + "loss": 4.3427, + "step": 4840 + }, + { + "epoch": 0.5019538717767597, + "grad_norm": 0.6796875, + "learning_rate": 0.0001865445500269454, + "loss": 4.3433, + "step": 4841 + }, + { + "epoch": 0.5020575598312477, + "grad_norm": 0.69140625, + "learning_rate": 0.00018653910747366988, + "loss": 4.393, + "step": 4842 + }, + { + "epoch": 0.5021612478857358, + "grad_norm": 0.66015625, + "learning_rate": 0.0001865336638993244, + "loss": 4.3742, + "step": 4843 + }, + { + "epoch": 0.5022649359402238, + "grad_norm": 0.64453125, + "learning_rate": 0.0001865282193039731, + "loss": 4.3722, + "step": 4844 + }, + { + "epoch": 0.5023686239947119, + "grad_norm": 0.65234375, + "learning_rate": 0.00018652277368768033, + "loss": 4.3399, + "step": 4845 + }, + { + "epoch": 0.5024723120492, + "grad_norm": 0.6171875, + "learning_rate": 0.00018651732705051024, + "loss": 4.3586, + "step": 4846 + }, + { + "epoch": 0.5025760001036881, + "grad_norm": 0.62890625, + "learning_rate": 0.00018651187939252716, + "loss": 4.37, + "step": 4847 + }, + { + "epoch": 0.5026796881581761, + "grad_norm": 0.6171875, + "learning_rate": 0.00018650643071379538, + "loss": 4.3741, + "step": 4848 + }, + { + "epoch": 0.5027833762126642, + "grad_norm": 0.6328125, + "learning_rate": 0.00018650098101437914, + "loss": 4.3927, + "step": 4849 + }, + { + "epoch": 0.5028870642671522, + "grad_norm": 0.640625, + "learning_rate": 0.00018649553029434274, + "loss": 4.3703, + "step": 4850 + }, + { + "epoch": 0.5029907523216404, + "grad_norm": 0.68359375, + "learning_rate": 0.00018649007855375054, + "loss": 4.3834, + "step": 4851 + }, + { + "epoch": 0.5030944403761284, + "grad_norm": 0.76171875, + "learning_rate": 0.00018648462579266684, + "loss": 4.3607, + "step": 4852 + }, + { + "epoch": 0.5031981284306165, + "grad_norm": 0.6484375, + "learning_rate": 0.00018647917201115597, + "loss": 4.3626, + "step": 4853 + }, + { + "epoch": 0.5033018164851045, + "grad_norm": 0.69921875, + "learning_rate": 0.00018647371720928227, + "loss": 4.3501, + "step": 4854 + }, + { + "epoch": 0.5034055045395927, + "grad_norm": 0.8203125, + "learning_rate": 0.0001864682613871101, + "loss": 4.2801, + "step": 4855 + }, + { + "epoch": 0.5035091925940807, + "grad_norm": 0.671875, + "learning_rate": 0.0001864628045447039, + "loss": 4.3684, + "step": 4856 + }, + { + "epoch": 0.5036128806485688, + "grad_norm": 0.81640625, + "learning_rate": 0.00018645734668212795, + "loss": 4.3729, + "step": 4857 + }, + { + "epoch": 0.5037165687030568, + "grad_norm": 0.73046875, + "learning_rate": 0.00018645188779944672, + "loss": 4.3496, + "step": 4858 + }, + { + "epoch": 0.5038202567575449, + "grad_norm": 0.91796875, + "learning_rate": 0.00018644642789672456, + "loss": 4.3412, + "step": 4859 + }, + { + "epoch": 0.503923944812033, + "grad_norm": 0.703125, + "learning_rate": 0.00018644096697402598, + "loss": 4.3277, + "step": 4860 + }, + { + "epoch": 0.5040276328665211, + "grad_norm": 1.109375, + "learning_rate": 0.00018643550503141533, + "loss": 4.412, + "step": 4861 + }, + { + "epoch": 0.5041313209210091, + "grad_norm": 0.71484375, + "learning_rate": 0.0001864300420689571, + "loss": 4.3538, + "step": 4862 + }, + { + "epoch": 0.5042350089754972, + "grad_norm": 0.8359375, + "learning_rate": 0.00018642457808671573, + "loss": 4.3333, + "step": 4863 + }, + { + "epoch": 0.5043386970299852, + "grad_norm": 0.9765625, + "learning_rate": 0.00018641911308475573, + "loss": 4.3947, + "step": 4864 + }, + { + "epoch": 0.5044423850844734, + "grad_norm": 0.7265625, + "learning_rate": 0.00018641364706314148, + "loss": 4.3542, + "step": 4865 + }, + { + "epoch": 0.5045460731389614, + "grad_norm": 0.91015625, + "learning_rate": 0.0001864081800219376, + "loss": 4.3132, + "step": 4866 + }, + { + "epoch": 0.5046497611934495, + "grad_norm": 0.80859375, + "learning_rate": 0.00018640271196120848, + "loss": 4.3881, + "step": 4867 + }, + { + "epoch": 0.5047534492479375, + "grad_norm": 0.69140625, + "learning_rate": 0.0001863972428810187, + "loss": 4.3479, + "step": 4868 + }, + { + "epoch": 0.5048571373024257, + "grad_norm": 0.91796875, + "learning_rate": 0.0001863917727814328, + "loss": 4.3538, + "step": 4869 + }, + { + "epoch": 0.5049608253569138, + "grad_norm": 0.66015625, + "learning_rate": 0.0001863863016625153, + "loss": 4.4002, + "step": 4870 + }, + { + "epoch": 0.5050645134114018, + "grad_norm": 0.9609375, + "learning_rate": 0.00018638082952433072, + "loss": 4.3441, + "step": 4871 + }, + { + "epoch": 0.5051682014658899, + "grad_norm": 0.68359375, + "learning_rate": 0.0001863753563669437, + "loss": 4.3473, + "step": 4872 + }, + { + "epoch": 0.5052718895203779, + "grad_norm": 1.0234375, + "learning_rate": 0.00018636988219041878, + "loss": 4.3742, + "step": 4873 + }, + { + "epoch": 0.5053755775748661, + "grad_norm": 0.91796875, + "learning_rate": 0.00018636440699482053, + "loss": 4.3611, + "step": 4874 + }, + { + "epoch": 0.5054792656293541, + "grad_norm": 0.796875, + "learning_rate": 0.00018635893078021356, + "loss": 4.3787, + "step": 4875 + }, + { + "epoch": 0.5055829536838422, + "grad_norm": 0.90625, + "learning_rate": 0.00018635345354666252, + "loss": 4.3462, + "step": 4876 + }, + { + "epoch": 0.5056866417383302, + "grad_norm": 0.7734375, + "learning_rate": 0.000186347975294232, + "loss": 4.3807, + "step": 4877 + }, + { + "epoch": 0.5057903297928183, + "grad_norm": 0.8515625, + "learning_rate": 0.0001863424960229866, + "loss": 4.3645, + "step": 4878 + }, + { + "epoch": 0.5058940178473064, + "grad_norm": 0.85546875, + "learning_rate": 0.00018633701573299107, + "loss": 4.3672, + "step": 4879 + }, + { + "epoch": 0.5059977059017945, + "grad_norm": 0.765625, + "learning_rate": 0.00018633153442430998, + "loss": 4.3876, + "step": 4880 + }, + { + "epoch": 0.5061013939562825, + "grad_norm": 0.67578125, + "learning_rate": 0.00018632605209700808, + "loss": 4.3838, + "step": 4881 + }, + { + "epoch": 0.5062050820107706, + "grad_norm": 0.8359375, + "learning_rate": 0.00018632056875115, + "loss": 4.3444, + "step": 4882 + }, + { + "epoch": 0.5063087700652587, + "grad_norm": 0.6875, + "learning_rate": 0.00018631508438680047, + "loss": 4.3192, + "step": 4883 + }, + { + "epoch": 0.5064124581197468, + "grad_norm": 0.77734375, + "learning_rate": 0.00018630959900402412, + "loss": 4.3604, + "step": 4884 + }, + { + "epoch": 0.5065161461742348, + "grad_norm": 0.77734375, + "learning_rate": 0.0001863041126028858, + "loss": 4.3894, + "step": 4885 + }, + { + "epoch": 0.5066198342287229, + "grad_norm": 0.67578125, + "learning_rate": 0.00018629862518345017, + "loss": 4.3822, + "step": 4886 + }, + { + "epoch": 0.5067235222832109, + "grad_norm": 0.77734375, + "learning_rate": 0.00018629313674578196, + "loss": 4.3275, + "step": 4887 + }, + { + "epoch": 0.5068272103376991, + "grad_norm": 0.6640625, + "learning_rate": 0.00018628764728994594, + "loss": 4.3346, + "step": 4888 + }, + { + "epoch": 0.5069308983921871, + "grad_norm": 0.6796875, + "learning_rate": 0.00018628215681600692, + "loss": 4.3965, + "step": 4889 + }, + { + "epoch": 0.5070345864466752, + "grad_norm": 0.74609375, + "learning_rate": 0.00018627666532402962, + "loss": 4.3549, + "step": 4890 + }, + { + "epoch": 0.5071382745011632, + "grad_norm": 0.68359375, + "learning_rate": 0.00018627117281407892, + "loss": 4.3535, + "step": 4891 + }, + { + "epoch": 0.5072419625556513, + "grad_norm": 0.6875, + "learning_rate": 0.00018626567928621955, + "loss": 4.3435, + "step": 4892 + }, + { + "epoch": 0.5073456506101394, + "grad_norm": 0.7109375, + "learning_rate": 0.00018626018474051634, + "loss": 4.295, + "step": 4893 + }, + { + "epoch": 0.5074493386646275, + "grad_norm": 0.7578125, + "learning_rate": 0.00018625468917703414, + "loss": 4.3831, + "step": 4894 + }, + { + "epoch": 0.5075530267191155, + "grad_norm": 0.71875, + "learning_rate": 0.0001862491925958378, + "loss": 4.3633, + "step": 4895 + }, + { + "epoch": 0.5076567147736036, + "grad_norm": 0.79296875, + "learning_rate": 0.0001862436949969921, + "loss": 4.3539, + "step": 4896 + }, + { + "epoch": 0.5077604028280917, + "grad_norm": 0.7734375, + "learning_rate": 0.00018623819638056204, + "loss": 4.3631, + "step": 4897 + }, + { + "epoch": 0.5078640908825798, + "grad_norm": 0.75, + "learning_rate": 0.00018623269674661238, + "loss": 4.3494, + "step": 4898 + }, + { + "epoch": 0.5079677789370678, + "grad_norm": 0.9140625, + "learning_rate": 0.00018622719609520804, + "loss": 4.3768, + "step": 4899 + }, + { + "epoch": 0.5080714669915559, + "grad_norm": 0.8203125, + "learning_rate": 0.00018622169442641395, + "loss": 4.3804, + "step": 4900 + }, + { + "epoch": 0.5081751550460439, + "grad_norm": 0.7109375, + "learning_rate": 0.000186216191740295, + "loss": 4.377, + "step": 4901 + }, + { + "epoch": 0.5082788431005321, + "grad_norm": 0.7421875, + "learning_rate": 0.00018621068803691612, + "loss": 4.3685, + "step": 4902 + }, + { + "epoch": 0.5083825311550201, + "grad_norm": 0.6640625, + "learning_rate": 0.00018620518331634225, + "loss": 4.3558, + "step": 4903 + }, + { + "epoch": 0.5084862192095082, + "grad_norm": 0.73828125, + "learning_rate": 0.00018619967757863836, + "loss": 4.3696, + "step": 4904 + }, + { + "epoch": 0.5085899072639962, + "grad_norm": 0.625, + "learning_rate": 0.00018619417082386936, + "loss": 4.3299, + "step": 4905 + }, + { + "epoch": 0.5086935953184843, + "grad_norm": 0.67578125, + "learning_rate": 0.00018618866305210029, + "loss": 4.3453, + "step": 4906 + }, + { + "epoch": 0.5087972833729724, + "grad_norm": 0.63671875, + "learning_rate": 0.00018618315426339606, + "loss": 4.373, + "step": 4907 + }, + { + "epoch": 0.5089009714274605, + "grad_norm": 0.7109375, + "learning_rate": 0.00018617764445782173, + "loss": 4.374, + "step": 4908 + }, + { + "epoch": 0.5090046594819485, + "grad_norm": 0.640625, + "learning_rate": 0.00018617213363544228, + "loss": 4.3881, + "step": 4909 + }, + { + "epoch": 0.5091083475364366, + "grad_norm": 0.6796875, + "learning_rate": 0.00018616662179632277, + "loss": 4.3841, + "step": 4910 + }, + { + "epoch": 0.5092120355909246, + "grad_norm": 0.640625, + "learning_rate": 0.00018616110894052818, + "loss": 4.3558, + "step": 4911 + }, + { + "epoch": 0.5093157236454128, + "grad_norm": 0.71484375, + "learning_rate": 0.0001861555950681236, + "loss": 4.2965, + "step": 4912 + }, + { + "epoch": 0.5094194116999009, + "grad_norm": 0.71875, + "learning_rate": 0.00018615008017917406, + "loss": 4.3539, + "step": 4913 + }, + { + "epoch": 0.5095230997543889, + "grad_norm": 0.640625, + "learning_rate": 0.00018614456427374463, + "loss": 4.3157, + "step": 4914 + }, + { + "epoch": 0.509626787808877, + "grad_norm": 0.72265625, + "learning_rate": 0.00018613904735190038, + "loss": 4.3303, + "step": 4915 + }, + { + "epoch": 0.5097304758633651, + "grad_norm": 0.69140625, + "learning_rate": 0.00018613352941370645, + "loss": 4.3523, + "step": 4916 + }, + { + "epoch": 0.5098341639178532, + "grad_norm": 0.890625, + "learning_rate": 0.00018612801045922792, + "loss": 4.3532, + "step": 4917 + }, + { + "epoch": 0.5099378519723412, + "grad_norm": 0.82421875, + "learning_rate": 0.0001861224904885299, + "loss": 4.3548, + "step": 4918 + }, + { + "epoch": 0.5100415400268293, + "grad_norm": 0.8828125, + "learning_rate": 0.00018611696950167754, + "loss": 4.3741, + "step": 4919 + }, + { + "epoch": 0.5101452280813173, + "grad_norm": 0.87109375, + "learning_rate": 0.00018611144749873594, + "loss": 4.3321, + "step": 4920 + }, + { + "epoch": 0.5102489161358055, + "grad_norm": 0.85546875, + "learning_rate": 0.00018610592447977035, + "loss": 4.387, + "step": 4921 + }, + { + "epoch": 0.5103526041902935, + "grad_norm": 0.91796875, + "learning_rate": 0.0001861004004448458, + "loss": 4.364, + "step": 4922 + }, + { + "epoch": 0.5104562922447816, + "grad_norm": 0.69921875, + "learning_rate": 0.00018609487539402757, + "loss": 4.3479, + "step": 4923 + }, + { + "epoch": 0.5105599802992696, + "grad_norm": 0.9765625, + "learning_rate": 0.0001860893493273808, + "loss": 4.3506, + "step": 4924 + }, + { + "epoch": 0.5106636683537578, + "grad_norm": 0.953125, + "learning_rate": 0.0001860838222449707, + "loss": 4.3584, + "step": 4925 + }, + { + "epoch": 0.5107673564082458, + "grad_norm": 0.76953125, + "learning_rate": 0.00018607829414686252, + "loss": 4.3778, + "step": 4926 + }, + { + "epoch": 0.5108710444627339, + "grad_norm": 0.94140625, + "learning_rate": 0.00018607276503312145, + "loss": 4.3566, + "step": 4927 + }, + { + "epoch": 0.5109747325172219, + "grad_norm": 0.94140625, + "learning_rate": 0.00018606723490381273, + "loss": 4.3141, + "step": 4928 + }, + { + "epoch": 0.51107842057171, + "grad_norm": 0.6796875, + "learning_rate": 0.0001860617037590016, + "loss": 4.3806, + "step": 4929 + }, + { + "epoch": 0.5111821086261981, + "grad_norm": 0.984375, + "learning_rate": 0.00018605617159875336, + "loss": 4.3295, + "step": 4930 + }, + { + "epoch": 0.5112857966806862, + "grad_norm": 0.96875, + "learning_rate": 0.00018605063842313326, + "loss": 4.3544, + "step": 4931 + }, + { + "epoch": 0.5113894847351742, + "grad_norm": 0.90625, + "learning_rate": 0.00018604510423220658, + "loss": 4.3995, + "step": 4932 + }, + { + "epoch": 0.5114931727896623, + "grad_norm": 0.78125, + "learning_rate": 0.00018603956902603862, + "loss": 4.353, + "step": 4933 + }, + { + "epoch": 0.5115968608441503, + "grad_norm": 0.859375, + "learning_rate": 0.00018603403280469469, + "loss": 4.3788, + "step": 4934 + }, + { + "epoch": 0.5117005488986385, + "grad_norm": 0.80859375, + "learning_rate": 0.00018602849556824015, + "loss": 4.322, + "step": 4935 + }, + { + "epoch": 0.5118042369531265, + "grad_norm": 0.7421875, + "learning_rate": 0.00018602295731674025, + "loss": 4.3518, + "step": 4936 + }, + { + "epoch": 0.5119079250076146, + "grad_norm": 0.94921875, + "learning_rate": 0.00018601741805026042, + "loss": 4.3241, + "step": 4937 + }, + { + "epoch": 0.5120116130621026, + "grad_norm": 0.765625, + "learning_rate": 0.00018601187776886595, + "loss": 4.3811, + "step": 4938 + }, + { + "epoch": 0.5121153011165908, + "grad_norm": 0.7734375, + "learning_rate": 0.00018600633647262226, + "loss": 4.347, + "step": 4939 + }, + { + "epoch": 0.5122189891710788, + "grad_norm": 0.86328125, + "learning_rate": 0.00018600079416159472, + "loss": 4.3929, + "step": 4940 + }, + { + "epoch": 0.5123226772255669, + "grad_norm": 0.74609375, + "learning_rate": 0.0001859952508358487, + "loss": 4.3574, + "step": 4941 + }, + { + "epoch": 0.5124263652800549, + "grad_norm": 0.8046875, + "learning_rate": 0.0001859897064954496, + "loss": 4.3482, + "step": 4942 + }, + { + "epoch": 0.512530053334543, + "grad_norm": 0.94921875, + "learning_rate": 0.0001859841611404629, + "loss": 4.3844, + "step": 4943 + }, + { + "epoch": 0.5126337413890311, + "grad_norm": 0.83203125, + "learning_rate": 0.00018597861477095398, + "loss": 4.3551, + "step": 4944 + }, + { + "epoch": 0.5127374294435192, + "grad_norm": 0.76953125, + "learning_rate": 0.00018597306738698827, + "loss": 4.3954, + "step": 4945 + }, + { + "epoch": 0.5128411174980072, + "grad_norm": 1.09375, + "learning_rate": 0.00018596751898863128, + "loss": 4.3693, + "step": 4946 + }, + { + "epoch": 0.5129448055524953, + "grad_norm": 0.73046875, + "learning_rate": 0.00018596196957594837, + "loss": 4.4051, + "step": 4947 + }, + { + "epoch": 0.5130484936069833, + "grad_norm": 0.85546875, + "learning_rate": 0.00018595641914900514, + "loss": 4.3488, + "step": 4948 + }, + { + "epoch": 0.5131521816614715, + "grad_norm": 0.93359375, + "learning_rate": 0.000185950867707867, + "loss": 4.3103, + "step": 4949 + }, + { + "epoch": 0.5132558697159595, + "grad_norm": 0.94140625, + "learning_rate": 0.0001859453152525995, + "loss": 4.3381, + "step": 4950 + }, + { + "epoch": 0.5133595577704476, + "grad_norm": 0.80078125, + "learning_rate": 0.0001859397617832681, + "loss": 4.3591, + "step": 4951 + }, + { + "epoch": 0.5134632458249356, + "grad_norm": 0.8359375, + "learning_rate": 0.00018593420729993838, + "loss": 4.3626, + "step": 4952 + }, + { + "epoch": 0.5135669338794238, + "grad_norm": 0.83203125, + "learning_rate": 0.00018592865180267582, + "loss": 4.3803, + "step": 4953 + }, + { + "epoch": 0.5136706219339118, + "grad_norm": 0.828125, + "learning_rate": 0.000185923095291546, + "loss": 4.3524, + "step": 4954 + }, + { + "epoch": 0.5137743099883999, + "grad_norm": 0.79296875, + "learning_rate": 0.0001859175377666145, + "loss": 4.3688, + "step": 4955 + }, + { + "epoch": 0.5138779980428879, + "grad_norm": 0.765625, + "learning_rate": 0.0001859119792279469, + "loss": 4.346, + "step": 4956 + }, + { + "epoch": 0.513981686097376, + "grad_norm": 0.8359375, + "learning_rate": 0.00018590641967560872, + "loss": 4.4135, + "step": 4957 + }, + { + "epoch": 0.5140853741518642, + "grad_norm": 0.8359375, + "learning_rate": 0.0001859008591096656, + "loss": 4.3789, + "step": 4958 + }, + { + "epoch": 0.5141890622063522, + "grad_norm": 0.90625, + "learning_rate": 0.00018589529753018313, + "loss": 4.3745, + "step": 4959 + }, + { + "epoch": 0.5142927502608403, + "grad_norm": 0.87890625, + "learning_rate": 0.00018588973493722697, + "loss": 4.3575, + "step": 4960 + }, + { + "epoch": 0.5143964383153283, + "grad_norm": 0.88671875, + "learning_rate": 0.0001858841713308627, + "loss": 4.3291, + "step": 4961 + }, + { + "epoch": 0.5145001263698165, + "grad_norm": 0.87109375, + "learning_rate": 0.00018587860671115603, + "loss": 4.3675, + "step": 4962 + }, + { + "epoch": 0.5146038144243045, + "grad_norm": 0.796875, + "learning_rate": 0.00018587304107817255, + "loss": 4.3248, + "step": 4963 + }, + { + "epoch": 0.5147075024787926, + "grad_norm": 0.734375, + "learning_rate": 0.00018586747443197796, + "loss": 4.3748, + "step": 4964 + }, + { + "epoch": 0.5148111905332806, + "grad_norm": 0.89453125, + "learning_rate": 0.00018586190677263793, + "loss": 4.3217, + "step": 4965 + }, + { + "epoch": 0.5149148785877687, + "grad_norm": 0.66015625, + "learning_rate": 0.00018585633810021818, + "loss": 4.3518, + "step": 4966 + }, + { + "epoch": 0.5150185666422568, + "grad_norm": 0.86328125, + "learning_rate": 0.00018585076841478438, + "loss": 4.3605, + "step": 4967 + }, + { + "epoch": 0.5151222546967449, + "grad_norm": 0.8203125, + "learning_rate": 0.00018584519771640227, + "loss": 4.3512, + "step": 4968 + }, + { + "epoch": 0.5152259427512329, + "grad_norm": 0.84765625, + "learning_rate": 0.00018583962600513755, + "loss": 4.3504, + "step": 4969 + }, + { + "epoch": 0.515329630805721, + "grad_norm": 0.80859375, + "learning_rate": 0.000185834053281056, + "loss": 4.3637, + "step": 4970 + }, + { + "epoch": 0.515433318860209, + "grad_norm": 0.8359375, + "learning_rate": 0.00018582847954422337, + "loss": 4.4037, + "step": 4971 + }, + { + "epoch": 0.5155370069146972, + "grad_norm": 0.8203125, + "learning_rate": 0.00018582290479470537, + "loss": 4.3579, + "step": 4972 + }, + { + "epoch": 0.5156406949691852, + "grad_norm": 0.80078125, + "learning_rate": 0.0001858173290325678, + "loss": 4.3613, + "step": 4973 + }, + { + "epoch": 0.5157443830236733, + "grad_norm": 0.8828125, + "learning_rate": 0.00018581175225787652, + "loss": 4.3607, + "step": 4974 + }, + { + "epoch": 0.5158480710781613, + "grad_norm": 0.71484375, + "learning_rate": 0.00018580617447069722, + "loss": 4.3722, + "step": 4975 + }, + { + "epoch": 0.5159517591326495, + "grad_norm": 0.78515625, + "learning_rate": 0.00018580059567109577, + "loss": 4.4164, + "step": 4976 + }, + { + "epoch": 0.5160554471871375, + "grad_norm": 0.70703125, + "learning_rate": 0.00018579501585913802, + "loss": 4.3352, + "step": 4977 + }, + { + "epoch": 0.5161591352416256, + "grad_norm": 0.7265625, + "learning_rate": 0.00018578943503488974, + "loss": 4.3437, + "step": 4978 + }, + { + "epoch": 0.5162628232961136, + "grad_norm": 0.8203125, + "learning_rate": 0.0001857838531984168, + "loss": 4.3684, + "step": 4979 + }, + { + "epoch": 0.5163665113506017, + "grad_norm": 0.7578125, + "learning_rate": 0.0001857782703497851, + "loss": 4.3773, + "step": 4980 + }, + { + "epoch": 0.5164701994050898, + "grad_norm": 0.875, + "learning_rate": 0.00018577268648906046, + "loss": 4.3865, + "step": 4981 + }, + { + "epoch": 0.5165738874595779, + "grad_norm": 0.81640625, + "learning_rate": 0.0001857671016163088, + "loss": 4.3972, + "step": 4982 + }, + { + "epoch": 0.5166775755140659, + "grad_norm": 0.7578125, + "learning_rate": 0.000185761515731596, + "loss": 4.3545, + "step": 4983 + }, + { + "epoch": 0.516781263568554, + "grad_norm": 0.7734375, + "learning_rate": 0.00018575592883498794, + "loss": 4.3675, + "step": 4984 + }, + { + "epoch": 0.516884951623042, + "grad_norm": 0.73828125, + "learning_rate": 0.0001857503409265506, + "loss": 4.3399, + "step": 4985 + }, + { + "epoch": 0.5169886396775302, + "grad_norm": 0.69921875, + "learning_rate": 0.00018574475200634985, + "loss": 4.3809, + "step": 4986 + }, + { + "epoch": 0.5170923277320182, + "grad_norm": 0.78125, + "learning_rate": 0.0001857391620744517, + "loss": 4.3294, + "step": 4987 + }, + { + "epoch": 0.5171960157865063, + "grad_norm": 0.70703125, + "learning_rate": 0.00018573357113092203, + "loss": 4.3773, + "step": 4988 + }, + { + "epoch": 0.5172997038409943, + "grad_norm": 0.73046875, + "learning_rate": 0.0001857279791758269, + "loss": 4.3379, + "step": 4989 + }, + { + "epoch": 0.5174033918954825, + "grad_norm": 0.69140625, + "learning_rate": 0.00018572238620923217, + "loss": 4.3642, + "step": 4990 + }, + { + "epoch": 0.5175070799499705, + "grad_norm": 0.73046875, + "learning_rate": 0.00018571679223120392, + "loss": 4.3593, + "step": 4991 + }, + { + "epoch": 0.5176107680044586, + "grad_norm": 0.8125, + "learning_rate": 0.00018571119724180814, + "loss": 4.3882, + "step": 4992 + }, + { + "epoch": 0.5177144560589466, + "grad_norm": 0.765625, + "learning_rate": 0.00018570560124111084, + "loss": 4.3503, + "step": 4993 + }, + { + "epoch": 0.5178181441134347, + "grad_norm": 0.75, + "learning_rate": 0.000185700004229178, + "loss": 4.3396, + "step": 4994 + }, + { + "epoch": 0.5179218321679228, + "grad_norm": 0.73828125, + "learning_rate": 0.00018569440620607572, + "loss": 4.3554, + "step": 4995 + }, + { + "epoch": 0.5180255202224109, + "grad_norm": 0.76171875, + "learning_rate": 0.00018568880717187005, + "loss": 4.3116, + "step": 4996 + }, + { + "epoch": 0.5181292082768989, + "grad_norm": 0.703125, + "learning_rate": 0.000185683207126627, + "loss": 4.3576, + "step": 4997 + }, + { + "epoch": 0.518232896331387, + "grad_norm": 0.71875, + "learning_rate": 0.0001856776060704127, + "loss": 4.375, + "step": 4998 + }, + { + "epoch": 0.518336584385875, + "grad_norm": 0.7265625, + "learning_rate": 0.00018567200400329317, + "loss": 4.3738, + "step": 4999 + }, + { + "epoch": 0.5184402724403632, + "grad_norm": 0.76953125, + "learning_rate": 0.0001856664009253346, + "loss": 4.3946, + "step": 5000 + }, + { + "epoch": 0.5185439604948512, + "grad_norm": 0.76953125, + "learning_rate": 0.00018566079683660306, + "loss": 4.3456, + "step": 5001 + }, + { + "epoch": 0.5186476485493393, + "grad_norm": 0.7109375, + "learning_rate": 0.00018565519173716465, + "loss": 4.3619, + "step": 5002 + }, + { + "epoch": 0.5187513366038274, + "grad_norm": 0.71875, + "learning_rate": 0.0001856495856270855, + "loss": 4.3406, + "step": 5003 + }, + { + "epoch": 0.5188550246583155, + "grad_norm": 0.73046875, + "learning_rate": 0.00018564397850643182, + "loss": 4.37, + "step": 5004 + }, + { + "epoch": 0.5189587127128036, + "grad_norm": 0.7421875, + "learning_rate": 0.00018563837037526967, + "loss": 4.367, + "step": 5005 + }, + { + "epoch": 0.5190624007672916, + "grad_norm": 0.70703125, + "learning_rate": 0.0001856327612336653, + "loss": 4.3462, + "step": 5006 + }, + { + "epoch": 0.5191660888217797, + "grad_norm": 0.79296875, + "learning_rate": 0.00018562715108168485, + "loss": 4.3543, + "step": 5007 + }, + { + "epoch": 0.5192697768762677, + "grad_norm": 0.74609375, + "learning_rate": 0.00018562153991939453, + "loss": 4.3434, + "step": 5008 + }, + { + "epoch": 0.5193734649307559, + "grad_norm": 0.83203125, + "learning_rate": 0.00018561592774686057, + "loss": 4.3605, + "step": 5009 + }, + { + "epoch": 0.5194771529852439, + "grad_norm": 0.86328125, + "learning_rate": 0.00018561031456414915, + "loss": 4.3769, + "step": 5010 + }, + { + "epoch": 0.519580841039732, + "grad_norm": 0.80078125, + "learning_rate": 0.00018560470037132652, + "loss": 4.3688, + "step": 5011 + }, + { + "epoch": 0.51968452909422, + "grad_norm": 0.71875, + "learning_rate": 0.0001855990851684589, + "loss": 4.3712, + "step": 5012 + }, + { + "epoch": 0.5197882171487082, + "grad_norm": 0.828125, + "learning_rate": 0.00018559346895561253, + "loss": 4.3923, + "step": 5013 + }, + { + "epoch": 0.5198919052031962, + "grad_norm": 0.6875, + "learning_rate": 0.00018558785173285376, + "loss": 4.3656, + "step": 5014 + }, + { + "epoch": 0.5199955932576843, + "grad_norm": 0.70703125, + "learning_rate": 0.00018558223350024876, + "loss": 4.3326, + "step": 5015 + }, + { + "epoch": 0.5200992813121723, + "grad_norm": 0.7109375, + "learning_rate": 0.0001855766142578639, + "loss": 4.3647, + "step": 5016 + }, + { + "epoch": 0.5202029693666604, + "grad_norm": 0.71875, + "learning_rate": 0.00018557099400576545, + "loss": 4.392, + "step": 5017 + }, + { + "epoch": 0.5203066574211485, + "grad_norm": 0.84765625, + "learning_rate": 0.0001855653727440197, + "loss": 4.3832, + "step": 5018 + }, + { + "epoch": 0.5204103454756366, + "grad_norm": 0.8125, + "learning_rate": 0.000185559750472693, + "loss": 4.3736, + "step": 5019 + }, + { + "epoch": 0.5205140335301246, + "grad_norm": 0.6953125, + "learning_rate": 0.00018555412719185172, + "loss": 4.2913, + "step": 5020 + }, + { + "epoch": 0.5206177215846127, + "grad_norm": 0.97265625, + "learning_rate": 0.00018554850290156214, + "loss": 4.3555, + "step": 5021 + }, + { + "epoch": 0.5207214096391007, + "grad_norm": 0.92578125, + "learning_rate": 0.00018554287760189067, + "loss": 4.3643, + "step": 5022 + }, + { + "epoch": 0.5208250976935889, + "grad_norm": 0.70703125, + "learning_rate": 0.00018553725129290366, + "loss": 4.37, + "step": 5023 + }, + { + "epoch": 0.5209287857480769, + "grad_norm": 0.85546875, + "learning_rate": 0.0001855316239746675, + "loss": 4.3919, + "step": 5024 + }, + { + "epoch": 0.521032473802565, + "grad_norm": 0.8359375, + "learning_rate": 0.0001855259956472486, + "loss": 4.3377, + "step": 5025 + }, + { + "epoch": 0.521136161857053, + "grad_norm": 0.69921875, + "learning_rate": 0.00018552036631071334, + "loss": 4.3495, + "step": 5026 + }, + { + "epoch": 0.5212398499115412, + "grad_norm": 0.83984375, + "learning_rate": 0.00018551473596512818, + "loss": 4.3484, + "step": 5027 + }, + { + "epoch": 0.5213435379660292, + "grad_norm": 0.86328125, + "learning_rate": 0.00018550910461055952, + "loss": 4.3541, + "step": 5028 + }, + { + "epoch": 0.5214472260205173, + "grad_norm": 0.68359375, + "learning_rate": 0.00018550347224707378, + "loss": 4.3259, + "step": 5029 + }, + { + "epoch": 0.5215509140750053, + "grad_norm": 0.7890625, + "learning_rate": 0.00018549783887473749, + "loss": 4.3419, + "step": 5030 + }, + { + "epoch": 0.5216546021294934, + "grad_norm": 0.8125, + "learning_rate": 0.00018549220449361707, + "loss": 4.3365, + "step": 5031 + }, + { + "epoch": 0.5217582901839815, + "grad_norm": 0.59765625, + "learning_rate": 0.000185486569103779, + "loss": 4.3261, + "step": 5032 + }, + { + "epoch": 0.5218619782384696, + "grad_norm": 0.8046875, + "learning_rate": 0.00018548093270528976, + "loss": 4.3539, + "step": 5033 + }, + { + "epoch": 0.5219656662929576, + "grad_norm": 0.7265625, + "learning_rate": 0.0001854752952982159, + "loss": 4.3635, + "step": 5034 + }, + { + "epoch": 0.5220693543474457, + "grad_norm": 0.734375, + "learning_rate": 0.00018546965688262388, + "loss": 4.3569, + "step": 5035 + }, + { + "epoch": 0.5221730424019337, + "grad_norm": 0.71484375, + "learning_rate": 0.00018546401745858027, + "loss": 4.3369, + "step": 5036 + }, + { + "epoch": 0.5222767304564219, + "grad_norm": 0.70703125, + "learning_rate": 0.0001854583770261516, + "loss": 4.3832, + "step": 5037 + }, + { + "epoch": 0.5223804185109099, + "grad_norm": 0.73046875, + "learning_rate": 0.0001854527355854044, + "loss": 4.3824, + "step": 5038 + }, + { + "epoch": 0.522484106565398, + "grad_norm": 0.7421875, + "learning_rate": 0.00018544709313640526, + "loss": 4.4, + "step": 5039 + }, + { + "epoch": 0.522587794619886, + "grad_norm": 0.73828125, + "learning_rate": 0.00018544144967922073, + "loss": 4.3677, + "step": 5040 + }, + { + "epoch": 0.5226914826743742, + "grad_norm": 0.921875, + "learning_rate": 0.0001854358052139174, + "loss": 4.327, + "step": 5041 + }, + { + "epoch": 0.5227951707288622, + "grad_norm": 0.76953125, + "learning_rate": 0.0001854301597405619, + "loss": 4.4058, + "step": 5042 + }, + { + "epoch": 0.5228988587833503, + "grad_norm": 0.7890625, + "learning_rate": 0.0001854245132592208, + "loss": 4.3692, + "step": 5043 + }, + { + "epoch": 0.5230025468378383, + "grad_norm": 0.79296875, + "learning_rate": 0.00018541886576996076, + "loss": 4.4042, + "step": 5044 + }, + { + "epoch": 0.5231062348923264, + "grad_norm": 0.73828125, + "learning_rate": 0.0001854132172728484, + "loss": 4.3645, + "step": 5045 + }, + { + "epoch": 0.5232099229468145, + "grad_norm": 0.859375, + "learning_rate": 0.00018540756776795034, + "loss": 4.3312, + "step": 5046 + }, + { + "epoch": 0.5233136110013026, + "grad_norm": 0.76953125, + "learning_rate": 0.00018540191725533326, + "loss": 4.3549, + "step": 5047 + }, + { + "epoch": 0.5234172990557907, + "grad_norm": 0.72265625, + "learning_rate": 0.00018539626573506386, + "loss": 4.3363, + "step": 5048 + }, + { + "epoch": 0.5235209871102787, + "grad_norm": 0.83984375, + "learning_rate": 0.00018539061320720874, + "loss": 4.3412, + "step": 5049 + }, + { + "epoch": 0.5236246751647669, + "grad_norm": 0.71875, + "learning_rate": 0.0001853849596718347, + "loss": 4.3746, + "step": 5050 + }, + { + "epoch": 0.5237283632192549, + "grad_norm": 0.73046875, + "learning_rate": 0.00018537930512900835, + "loss": 4.343, + "step": 5051 + }, + { + "epoch": 0.523832051273743, + "grad_norm": 0.8359375, + "learning_rate": 0.00018537364957879648, + "loss": 4.3576, + "step": 5052 + }, + { + "epoch": 0.523935739328231, + "grad_norm": 0.6171875, + "learning_rate": 0.00018536799302126578, + "loss": 4.3926, + "step": 5053 + }, + { + "epoch": 0.5240394273827191, + "grad_norm": 0.85546875, + "learning_rate": 0.00018536233545648302, + "loss": 4.3628, + "step": 5054 + }, + { + "epoch": 0.5241431154372072, + "grad_norm": 0.671875, + "learning_rate": 0.00018535667688451487, + "loss": 4.3296, + "step": 5055 + }, + { + "epoch": 0.5242468034916953, + "grad_norm": 0.765625, + "learning_rate": 0.00018535101730542822, + "loss": 4.3907, + "step": 5056 + }, + { + "epoch": 0.5243504915461833, + "grad_norm": 0.7421875, + "learning_rate": 0.00018534535671928978, + "loss": 4.3648, + "step": 5057 + }, + { + "epoch": 0.5244541796006714, + "grad_norm": 0.7578125, + "learning_rate": 0.00018533969512616634, + "loss": 4.3272, + "step": 5058 + }, + { + "epoch": 0.5245578676551594, + "grad_norm": 0.7578125, + "learning_rate": 0.00018533403252612467, + "loss": 4.3601, + "step": 5059 + }, + { + "epoch": 0.5246615557096476, + "grad_norm": 0.703125, + "learning_rate": 0.00018532836891923166, + "loss": 4.3732, + "step": 5060 + }, + { + "epoch": 0.5247652437641356, + "grad_norm": 0.71484375, + "learning_rate": 0.00018532270430555406, + "loss": 4.372, + "step": 5061 + }, + { + "epoch": 0.5248689318186237, + "grad_norm": 0.98828125, + "learning_rate": 0.00018531703868515878, + "loss": 4.3324, + "step": 5062 + }, + { + "epoch": 0.5249726198731117, + "grad_norm": 0.9375, + "learning_rate": 0.00018531137205811258, + "loss": 4.3788, + "step": 5063 + }, + { + "epoch": 0.5250763079275999, + "grad_norm": 0.83984375, + "learning_rate": 0.0001853057044244824, + "loss": 4.3657, + "step": 5064 + }, + { + "epoch": 0.5251799959820879, + "grad_norm": 0.94921875, + "learning_rate": 0.00018530003578433508, + "loss": 4.331, + "step": 5065 + }, + { + "epoch": 0.525283684036576, + "grad_norm": 0.9609375, + "learning_rate": 0.0001852943661377375, + "loss": 4.4065, + "step": 5066 + }, + { + "epoch": 0.525387372091064, + "grad_norm": 0.93359375, + "learning_rate": 0.00018528869548475654, + "loss": 4.3171, + "step": 5067 + }, + { + "epoch": 0.5254910601455521, + "grad_norm": 0.75390625, + "learning_rate": 0.0001852830238254591, + "loss": 4.35, + "step": 5068 + }, + { + "epoch": 0.5255947482000402, + "grad_norm": 0.94921875, + "learning_rate": 0.0001852773511599122, + "loss": 4.3805, + "step": 5069 + }, + { + "epoch": 0.5256984362545283, + "grad_norm": 0.90234375, + "learning_rate": 0.00018527167748818267, + "loss": 4.3788, + "step": 5070 + }, + { + "epoch": 0.5258021243090163, + "grad_norm": 0.71484375, + "learning_rate": 0.00018526600281033748, + "loss": 4.3321, + "step": 5071 + }, + { + "epoch": 0.5259058123635044, + "grad_norm": 0.86328125, + "learning_rate": 0.00018526032712644358, + "loss": 4.4017, + "step": 5072 + }, + { + "epoch": 0.5260095004179924, + "grad_norm": 0.69921875, + "learning_rate": 0.00018525465043656795, + "loss": 4.335, + "step": 5073 + }, + { + "epoch": 0.5261131884724806, + "grad_norm": 0.85546875, + "learning_rate": 0.00018524897274077757, + "loss": 4.3608, + "step": 5074 + }, + { + "epoch": 0.5262168765269686, + "grad_norm": 0.7421875, + "learning_rate": 0.0001852432940391394, + "loss": 4.2927, + "step": 5075 + }, + { + "epoch": 0.5263205645814567, + "grad_norm": 0.67578125, + "learning_rate": 0.0001852376143317205, + "loss": 4.366, + "step": 5076 + }, + { + "epoch": 0.5264242526359447, + "grad_norm": 0.859375, + "learning_rate": 0.00018523193361858785, + "loss": 4.3713, + "step": 5077 + }, + { + "epoch": 0.5265279406904328, + "grad_norm": 0.78515625, + "learning_rate": 0.00018522625189980844, + "loss": 4.3462, + "step": 5078 + }, + { + "epoch": 0.5266316287449209, + "grad_norm": 0.76171875, + "learning_rate": 0.00018522056917544939, + "loss": 4.3247, + "step": 5079 + }, + { + "epoch": 0.526735316799409, + "grad_norm": 0.8359375, + "learning_rate": 0.00018521488544557765, + "loss": 4.3743, + "step": 5080 + }, + { + "epoch": 0.526839004853897, + "grad_norm": 0.8125, + "learning_rate": 0.00018520920071026038, + "loss": 4.4079, + "step": 5081 + }, + { + "epoch": 0.5269426929083851, + "grad_norm": 0.75, + "learning_rate": 0.0001852035149695646, + "loss": 4.3398, + "step": 5082 + }, + { + "epoch": 0.5270463809628732, + "grad_norm": 0.78125, + "learning_rate": 0.00018519782822355743, + "loss": 4.3662, + "step": 5083 + }, + { + "epoch": 0.5271500690173613, + "grad_norm": 0.890625, + "learning_rate": 0.00018519214047230592, + "loss": 4.3139, + "step": 5084 + }, + { + "epoch": 0.5272537570718493, + "grad_norm": 0.63671875, + "learning_rate": 0.00018518645171587719, + "loss": 4.3445, + "step": 5085 + }, + { + "epoch": 0.5273574451263374, + "grad_norm": 0.890625, + "learning_rate": 0.0001851807619543384, + "loss": 4.3736, + "step": 5086 + }, + { + "epoch": 0.5274611331808254, + "grad_norm": 0.79296875, + "learning_rate": 0.00018517507118775666, + "loss": 4.3472, + "step": 5087 + }, + { + "epoch": 0.5275648212353136, + "grad_norm": 0.82421875, + "learning_rate": 0.0001851693794161991, + "loss": 4.3841, + "step": 5088 + }, + { + "epoch": 0.5276685092898016, + "grad_norm": 0.77734375, + "learning_rate": 0.0001851636866397329, + "loss": 4.3624, + "step": 5089 + }, + { + "epoch": 0.5277721973442897, + "grad_norm": 0.875, + "learning_rate": 0.00018515799285842522, + "loss": 4.3591, + "step": 5090 + }, + { + "epoch": 0.5278758853987777, + "grad_norm": 0.953125, + "learning_rate": 0.00018515229807234323, + "loss": 4.3731, + "step": 5091 + }, + { + "epoch": 0.5279795734532658, + "grad_norm": 0.7890625, + "learning_rate": 0.00018514660228155413, + "loss": 4.3403, + "step": 5092 + }, + { + "epoch": 0.528083261507754, + "grad_norm": 0.8515625, + "learning_rate": 0.00018514090548612516, + "loss": 4.3769, + "step": 5093 + }, + { + "epoch": 0.528186949562242, + "grad_norm": 0.91796875, + "learning_rate": 0.00018513520768612346, + "loss": 4.315, + "step": 5094 + }, + { + "epoch": 0.5282906376167301, + "grad_norm": 0.859375, + "learning_rate": 0.00018512950888161634, + "loss": 4.3687, + "step": 5095 + }, + { + "epoch": 0.5283943256712181, + "grad_norm": 0.96484375, + "learning_rate": 0.00018512380907267098, + "loss": 4.3596, + "step": 5096 + }, + { + "epoch": 0.5284980137257063, + "grad_norm": 0.9921875, + "learning_rate": 0.00018511810825935467, + "loss": 4.3683, + "step": 5097 + }, + { + "epoch": 0.5286017017801943, + "grad_norm": 0.75, + "learning_rate": 0.00018511240644173462, + "loss": 4.3502, + "step": 5098 + }, + { + "epoch": 0.5287053898346824, + "grad_norm": 0.9921875, + "learning_rate": 0.00018510670361987817, + "loss": 4.3894, + "step": 5099 + }, + { + "epoch": 0.5288090778891704, + "grad_norm": 0.8046875, + "learning_rate": 0.00018510099979385255, + "loss": 4.4088, + "step": 5100 + }, + { + "epoch": 0.5289127659436585, + "grad_norm": 1.0859375, + "learning_rate": 0.00018509529496372513, + "loss": 4.3636, + "step": 5101 + }, + { + "epoch": 0.5290164539981466, + "grad_norm": 0.8359375, + "learning_rate": 0.00018508958912956316, + "loss": 4.3658, + "step": 5102 + }, + { + "epoch": 0.5291201420526347, + "grad_norm": 0.86328125, + "learning_rate": 0.00018508388229143398, + "loss": 4.3711, + "step": 5103 + }, + { + "epoch": 0.5292238301071227, + "grad_norm": 1.0078125, + "learning_rate": 0.0001850781744494049, + "loss": 4.3279, + "step": 5104 + }, + { + "epoch": 0.5293275181616108, + "grad_norm": 0.66796875, + "learning_rate": 0.00018507246560354334, + "loss": 4.3678, + "step": 5105 + }, + { + "epoch": 0.5294312062160988, + "grad_norm": 0.96484375, + "learning_rate": 0.00018506675575391662, + "loss": 4.3717, + "step": 5106 + }, + { + "epoch": 0.529534894270587, + "grad_norm": 0.734375, + "learning_rate": 0.00018506104490059207, + "loss": 4.3604, + "step": 5107 + }, + { + "epoch": 0.529638582325075, + "grad_norm": 0.90234375, + "learning_rate": 0.0001850553330436371, + "loss": 4.3514, + "step": 5108 + }, + { + "epoch": 0.5297422703795631, + "grad_norm": 0.9296875, + "learning_rate": 0.00018504962018311912, + "loss": 4.3815, + "step": 5109 + }, + { + "epoch": 0.5298459584340511, + "grad_norm": 1.046875, + "learning_rate": 0.0001850439063191055, + "loss": 4.3414, + "step": 5110 + }, + { + "epoch": 0.5299496464885393, + "grad_norm": 0.7578125, + "learning_rate": 0.0001850381914516637, + "loss": 4.3528, + "step": 5111 + }, + { + "epoch": 0.5300533345430273, + "grad_norm": 0.87890625, + "learning_rate": 0.00018503247558086114, + "loss": 4.3303, + "step": 5112 + }, + { + "epoch": 0.5301570225975154, + "grad_norm": 0.9375, + "learning_rate": 0.00018502675870676522, + "loss": 4.3694, + "step": 5113 + }, + { + "epoch": 0.5302607106520034, + "grad_norm": 0.83984375, + "learning_rate": 0.00018502104082944345, + "loss": 4.339, + "step": 5114 + }, + { + "epoch": 0.5303643987064915, + "grad_norm": 0.734375, + "learning_rate": 0.00018501532194896327, + "loss": 4.3463, + "step": 5115 + }, + { + "epoch": 0.5304680867609796, + "grad_norm": 0.8515625, + "learning_rate": 0.00018500960206539215, + "loss": 4.331, + "step": 5116 + }, + { + "epoch": 0.5305717748154677, + "grad_norm": 0.84765625, + "learning_rate": 0.00018500388117879756, + "loss": 4.3701, + "step": 5117 + }, + { + "epoch": 0.5306754628699557, + "grad_norm": 0.765625, + "learning_rate": 0.0001849981592892471, + "loss": 4.3665, + "step": 5118 + }, + { + "epoch": 0.5307791509244438, + "grad_norm": 0.82421875, + "learning_rate": 0.00018499243639680813, + "loss": 4.3249, + "step": 5119 + }, + { + "epoch": 0.5308828389789318, + "grad_norm": 0.86328125, + "learning_rate": 0.00018498671250154826, + "loss": 4.3709, + "step": 5120 + }, + { + "epoch": 0.53098652703342, + "grad_norm": 0.6484375, + "learning_rate": 0.00018498098760353507, + "loss": 4.3713, + "step": 5121 + }, + { + "epoch": 0.531090215087908, + "grad_norm": 0.7890625, + "learning_rate": 0.000184975261702836, + "loss": 4.3346, + "step": 5122 + }, + { + "epoch": 0.5311939031423961, + "grad_norm": 0.70703125, + "learning_rate": 0.00018496953479951869, + "loss": 4.3565, + "step": 5123 + }, + { + "epoch": 0.5312975911968841, + "grad_norm": 0.734375, + "learning_rate": 0.0001849638068936507, + "loss": 4.3247, + "step": 5124 + }, + { + "epoch": 0.5314012792513723, + "grad_norm": 0.8046875, + "learning_rate": 0.00018495807798529957, + "loss": 4.3497, + "step": 5125 + }, + { + "epoch": 0.5315049673058603, + "grad_norm": 0.6796875, + "learning_rate": 0.00018495234807453293, + "loss": 4.4022, + "step": 5126 + }, + { + "epoch": 0.5316086553603484, + "grad_norm": 0.78515625, + "learning_rate": 0.0001849466171614184, + "loss": 4.3508, + "step": 5127 + }, + { + "epoch": 0.5317123434148364, + "grad_norm": 0.66796875, + "learning_rate": 0.00018494088524602354, + "loss": 4.3337, + "step": 5128 + }, + { + "epoch": 0.5318160314693245, + "grad_norm": 0.68359375, + "learning_rate": 0.00018493515232841605, + "loss": 4.37, + "step": 5129 + }, + { + "epoch": 0.5319197195238126, + "grad_norm": 0.6171875, + "learning_rate": 0.00018492941840866353, + "loss": 4.3041, + "step": 5130 + }, + { + "epoch": 0.5320234075783007, + "grad_norm": 0.7109375, + "learning_rate": 0.00018492368348683368, + "loss": 4.3602, + "step": 5131 + }, + { + "epoch": 0.5321270956327887, + "grad_norm": 0.66015625, + "learning_rate": 0.0001849179475629941, + "loss": 4.349, + "step": 5132 + }, + { + "epoch": 0.5322307836872768, + "grad_norm": 0.7578125, + "learning_rate": 0.0001849122106372125, + "loss": 4.3643, + "step": 5133 + }, + { + "epoch": 0.5323344717417648, + "grad_norm": 0.765625, + "learning_rate": 0.0001849064727095566, + "loss": 4.3504, + "step": 5134 + }, + { + "epoch": 0.532438159796253, + "grad_norm": 0.65234375, + "learning_rate": 0.00018490073378009405, + "loss": 4.4057, + "step": 5135 + }, + { + "epoch": 0.532541847850741, + "grad_norm": 0.7734375, + "learning_rate": 0.0001848949938488926, + "loss": 4.3717, + "step": 5136 + }, + { + "epoch": 0.5326455359052291, + "grad_norm": 0.875, + "learning_rate": 0.00018488925291601995, + "loss": 4.3567, + "step": 5137 + }, + { + "epoch": 0.5327492239597172, + "grad_norm": 0.74609375, + "learning_rate": 0.00018488351098154383, + "loss": 4.3428, + "step": 5138 + }, + { + "epoch": 0.5328529120142053, + "grad_norm": 0.76953125, + "learning_rate": 0.00018487776804553203, + "loss": 4.3578, + "step": 5139 + }, + { + "epoch": 0.5329566000686934, + "grad_norm": 0.73046875, + "learning_rate": 0.0001848720241080523, + "loss": 4.3299, + "step": 5140 + }, + { + "epoch": 0.5330602881231814, + "grad_norm": 0.76171875, + "learning_rate": 0.00018486627916917235, + "loss": 4.334, + "step": 5141 + }, + { + "epoch": 0.5331639761776695, + "grad_norm": 0.74609375, + "learning_rate": 0.00018486053322896005, + "loss": 4.3096, + "step": 5142 + }, + { + "epoch": 0.5332676642321575, + "grad_norm": 0.72265625, + "learning_rate": 0.00018485478628748315, + "loss": 4.3155, + "step": 5143 + }, + { + "epoch": 0.5333713522866457, + "grad_norm": 0.78515625, + "learning_rate": 0.00018484903834480946, + "loss": 4.3713, + "step": 5144 + }, + { + "epoch": 0.5334750403411337, + "grad_norm": 0.7734375, + "learning_rate": 0.0001848432894010068, + "loss": 4.3835, + "step": 5145 + }, + { + "epoch": 0.5335787283956218, + "grad_norm": 0.75, + "learning_rate": 0.00018483753945614304, + "loss": 4.415, + "step": 5146 + }, + { + "epoch": 0.5336824164501098, + "grad_norm": 0.765625, + "learning_rate": 0.00018483178851028597, + "loss": 4.3042, + "step": 5147 + }, + { + "epoch": 0.533786104504598, + "grad_norm": 0.73046875, + "learning_rate": 0.00018482603656350347, + "loss": 4.3036, + "step": 5148 + }, + { + "epoch": 0.533889792559086, + "grad_norm": 0.6328125, + "learning_rate": 0.00018482028361586342, + "loss": 4.3833, + "step": 5149 + }, + { + "epoch": 0.5339934806135741, + "grad_norm": 0.7890625, + "learning_rate": 0.00018481452966743363, + "loss": 4.338, + "step": 5150 + }, + { + "epoch": 0.5340971686680621, + "grad_norm": 0.6875, + "learning_rate": 0.0001848087747182821, + "loss": 4.3212, + "step": 5151 + }, + { + "epoch": 0.5342008567225502, + "grad_norm": 0.78125, + "learning_rate": 0.00018480301876847667, + "loss": 4.3137, + "step": 5152 + }, + { + "epoch": 0.5343045447770383, + "grad_norm": 0.76171875, + "learning_rate": 0.00018479726181808522, + "loss": 4.354, + "step": 5153 + }, + { + "epoch": 0.5344082328315264, + "grad_norm": 0.765625, + "learning_rate": 0.00018479150386717575, + "loss": 4.3564, + "step": 5154 + }, + { + "epoch": 0.5345119208860144, + "grad_norm": 0.8125, + "learning_rate": 0.00018478574491581618, + "loss": 4.3444, + "step": 5155 + }, + { + "epoch": 0.5346156089405025, + "grad_norm": 0.83203125, + "learning_rate": 0.00018477998496407437, + "loss": 4.381, + "step": 5156 + }, + { + "epoch": 0.5347192969949905, + "grad_norm": 0.859375, + "learning_rate": 0.00018477422401201842, + "loss": 4.3283, + "step": 5157 + }, + { + "epoch": 0.5348229850494787, + "grad_norm": 0.7734375, + "learning_rate": 0.00018476846205971623, + "loss": 4.3069, + "step": 5158 + }, + { + "epoch": 0.5349266731039667, + "grad_norm": 0.81640625, + "learning_rate": 0.00018476269910723577, + "loss": 4.3874, + "step": 5159 + }, + { + "epoch": 0.5350303611584548, + "grad_norm": 0.7890625, + "learning_rate": 0.00018475693515464508, + "loss": 4.3665, + "step": 5160 + }, + { + "epoch": 0.5351340492129428, + "grad_norm": 0.75390625, + "learning_rate": 0.0001847511702020121, + "loss": 4.3524, + "step": 5161 + }, + { + "epoch": 0.535237737267431, + "grad_norm": 0.890625, + "learning_rate": 0.00018474540424940497, + "loss": 4.4009, + "step": 5162 + }, + { + "epoch": 0.535341425321919, + "grad_norm": 0.66796875, + "learning_rate": 0.0001847396372968916, + "loss": 4.3042, + "step": 5163 + }, + { + "epoch": 0.5354451133764071, + "grad_norm": 0.78515625, + "learning_rate": 0.0001847338693445401, + "loss": 4.3491, + "step": 5164 + }, + { + "epoch": 0.5355488014308951, + "grad_norm": 0.72265625, + "learning_rate": 0.00018472810039241847, + "loss": 4.3576, + "step": 5165 + }, + { + "epoch": 0.5356524894853832, + "grad_norm": 0.81640625, + "learning_rate": 0.00018472233044059485, + "loss": 4.3091, + "step": 5166 + }, + { + "epoch": 0.5357561775398713, + "grad_norm": 0.8359375, + "learning_rate": 0.0001847165594891373, + "loss": 4.368, + "step": 5167 + }, + { + "epoch": 0.5358598655943594, + "grad_norm": 0.7890625, + "learning_rate": 0.00018471078753811392, + "loss": 4.3373, + "step": 5168 + }, + { + "epoch": 0.5359635536488474, + "grad_norm": 0.8125, + "learning_rate": 0.00018470501458759273, + "loss": 4.3089, + "step": 5169 + }, + { + "epoch": 0.5360672417033355, + "grad_norm": 0.8671875, + "learning_rate": 0.00018469924063764192, + "loss": 4.3848, + "step": 5170 + }, + { + "epoch": 0.5361709297578235, + "grad_norm": 0.71484375, + "learning_rate": 0.0001846934656883296, + "loss": 4.3728, + "step": 5171 + }, + { + "epoch": 0.5362746178123117, + "grad_norm": 0.81640625, + "learning_rate": 0.00018468768973972392, + "loss": 4.3487, + "step": 5172 + }, + { + "epoch": 0.5363783058667997, + "grad_norm": 0.70703125, + "learning_rate": 0.00018468191279189303, + "loss": 4.3687, + "step": 5173 + }, + { + "epoch": 0.5364819939212878, + "grad_norm": 0.79296875, + "learning_rate": 0.00018467613484490506, + "loss": 4.3389, + "step": 5174 + }, + { + "epoch": 0.5365856819757758, + "grad_norm": 0.8515625, + "learning_rate": 0.0001846703558988282, + "loss": 4.4002, + "step": 5175 + }, + { + "epoch": 0.536689370030264, + "grad_norm": 0.75390625, + "learning_rate": 0.0001846645759537307, + "loss": 4.3487, + "step": 5176 + }, + { + "epoch": 0.536793058084752, + "grad_norm": 0.7578125, + "learning_rate": 0.00018465879500968065, + "loss": 4.3517, + "step": 5177 + }, + { + "epoch": 0.5368967461392401, + "grad_norm": 0.7734375, + "learning_rate": 0.00018465301306674632, + "loss": 4.3955, + "step": 5178 + }, + { + "epoch": 0.5370004341937281, + "grad_norm": 0.7734375, + "learning_rate": 0.0001846472301249959, + "loss": 4.3555, + "step": 5179 + }, + { + "epoch": 0.5371041222482162, + "grad_norm": 0.6953125, + "learning_rate": 0.00018464144618449764, + "loss": 4.2937, + "step": 5180 + }, + { + "epoch": 0.5372078103027044, + "grad_norm": 0.67578125, + "learning_rate": 0.0001846356612453198, + "loss": 4.3372, + "step": 5181 + }, + { + "epoch": 0.5373114983571924, + "grad_norm": 0.7734375, + "learning_rate": 0.00018462987530753062, + "loss": 4.3874, + "step": 5182 + }, + { + "epoch": 0.5374151864116805, + "grad_norm": 0.7421875, + "learning_rate": 0.00018462408837119837, + "loss": 4.3692, + "step": 5183 + }, + { + "epoch": 0.5375188744661685, + "grad_norm": 0.859375, + "learning_rate": 0.00018461830043639131, + "loss": 4.3388, + "step": 5184 + }, + { + "epoch": 0.5376225625206567, + "grad_norm": 0.7265625, + "learning_rate": 0.00018461251150317777, + "loss": 4.3781, + "step": 5185 + }, + { + "epoch": 0.5377262505751447, + "grad_norm": 0.84375, + "learning_rate": 0.00018460672157162602, + "loss": 4.3331, + "step": 5186 + }, + { + "epoch": 0.5378299386296328, + "grad_norm": 0.76171875, + "learning_rate": 0.0001846009306418044, + "loss": 4.3527, + "step": 5187 + }, + { + "epoch": 0.5379336266841208, + "grad_norm": 0.828125, + "learning_rate": 0.00018459513871378126, + "loss": 4.3519, + "step": 5188 + }, + { + "epoch": 0.5380373147386089, + "grad_norm": 0.8359375, + "learning_rate": 0.00018458934578762483, + "loss": 4.3211, + "step": 5189 + }, + { + "epoch": 0.538141002793097, + "grad_norm": 0.68359375, + "learning_rate": 0.0001845835518634036, + "loss": 4.3603, + "step": 5190 + }, + { + "epoch": 0.5382446908475851, + "grad_norm": 0.84375, + "learning_rate": 0.00018457775694118582, + "loss": 4.368, + "step": 5191 + }, + { + "epoch": 0.5383483789020731, + "grad_norm": 0.72265625, + "learning_rate": 0.00018457196102103992, + "loss": 4.366, + "step": 5192 + }, + { + "epoch": 0.5384520669565612, + "grad_norm": 0.90234375, + "learning_rate": 0.0001845661641030343, + "loss": 4.3691, + "step": 5193 + }, + { + "epoch": 0.5385557550110492, + "grad_norm": 0.78515625, + "learning_rate": 0.0001845603661872373, + "loss": 4.348, + "step": 5194 + }, + { + "epoch": 0.5386594430655374, + "grad_norm": 0.7265625, + "learning_rate": 0.00018455456727371742, + "loss": 4.3388, + "step": 5195 + }, + { + "epoch": 0.5387631311200254, + "grad_norm": 0.8203125, + "learning_rate": 0.00018454876736254296, + "loss": 4.3334, + "step": 5196 + }, + { + "epoch": 0.5388668191745135, + "grad_norm": 0.71875, + "learning_rate": 0.00018454296645378245, + "loss": 4.3293, + "step": 5197 + }, + { + "epoch": 0.5389705072290015, + "grad_norm": 0.73046875, + "learning_rate": 0.00018453716454750428, + "loss": 4.3296, + "step": 5198 + }, + { + "epoch": 0.5390741952834897, + "grad_norm": 0.9375, + "learning_rate": 0.00018453136164377695, + "loss": 4.3375, + "step": 5199 + }, + { + "epoch": 0.5391778833379777, + "grad_norm": 0.71875, + "learning_rate": 0.00018452555774266892, + "loss": 4.3723, + "step": 5200 + }, + { + "epoch": 0.5392815713924658, + "grad_norm": 0.91796875, + "learning_rate": 0.00018451975284424862, + "loss": 4.3693, + "step": 5201 + }, + { + "epoch": 0.5393852594469538, + "grad_norm": 0.7265625, + "learning_rate": 0.0001845139469485846, + "loss": 4.298, + "step": 5202 + }, + { + "epoch": 0.5394889475014419, + "grad_norm": 0.91796875, + "learning_rate": 0.00018450814005574532, + "loss": 4.3173, + "step": 5203 + }, + { + "epoch": 0.53959263555593, + "grad_norm": 0.84375, + "learning_rate": 0.00018450233216579936, + "loss": 4.288, + "step": 5204 + }, + { + "epoch": 0.5396963236104181, + "grad_norm": 0.8046875, + "learning_rate": 0.00018449652327881514, + "loss": 4.3271, + "step": 5205 + }, + { + "epoch": 0.5398000116649061, + "grad_norm": 0.859375, + "learning_rate": 0.0001844907133948613, + "loss": 4.417, + "step": 5206 + }, + { + "epoch": 0.5399036997193942, + "grad_norm": 0.81640625, + "learning_rate": 0.00018448490251400635, + "loss": 4.3407, + "step": 5207 + }, + { + "epoch": 0.5400073877738822, + "grad_norm": 0.75, + "learning_rate": 0.00018447909063631888, + "loss": 4.3654, + "step": 5208 + }, + { + "epoch": 0.5401110758283704, + "grad_norm": 0.79296875, + "learning_rate": 0.0001844732777618674, + "loss": 4.3053, + "step": 5209 + }, + { + "epoch": 0.5402147638828584, + "grad_norm": 0.66015625, + "learning_rate": 0.00018446746389072055, + "loss": 4.3725, + "step": 5210 + }, + { + "epoch": 0.5403184519373465, + "grad_norm": 0.8828125, + "learning_rate": 0.0001844616490229469, + "loss": 4.3676, + "step": 5211 + }, + { + "epoch": 0.5404221399918345, + "grad_norm": 0.765625, + "learning_rate": 0.00018445583315861508, + "loss": 4.3485, + "step": 5212 + }, + { + "epoch": 0.5405258280463227, + "grad_norm": 0.84765625, + "learning_rate": 0.0001844500162977937, + "loss": 4.345, + "step": 5213 + }, + { + "epoch": 0.5406295161008107, + "grad_norm": 0.82421875, + "learning_rate": 0.0001844441984405514, + "loss": 4.3473, + "step": 5214 + }, + { + "epoch": 0.5407332041552988, + "grad_norm": 0.82421875, + "learning_rate": 0.00018443837958695682, + "loss": 4.3451, + "step": 5215 + }, + { + "epoch": 0.5408368922097868, + "grad_norm": 0.84765625, + "learning_rate": 0.00018443255973707863, + "loss": 4.3028, + "step": 5216 + }, + { + "epoch": 0.5409405802642749, + "grad_norm": 0.78515625, + "learning_rate": 0.00018442673889098546, + "loss": 4.3392, + "step": 5217 + }, + { + "epoch": 0.541044268318763, + "grad_norm": 0.9140625, + "learning_rate": 0.000184420917048746, + "loss": 4.3519, + "step": 5218 + }, + { + "epoch": 0.5411479563732511, + "grad_norm": 0.79296875, + "learning_rate": 0.00018441509421042898, + "loss": 4.3588, + "step": 5219 + }, + { + "epoch": 0.5412516444277391, + "grad_norm": 0.7421875, + "learning_rate": 0.00018440927037610306, + "loss": 4.3698, + "step": 5220 + }, + { + "epoch": 0.5413553324822272, + "grad_norm": 0.90625, + "learning_rate": 0.000184403445545837, + "loss": 4.3532, + "step": 5221 + }, + { + "epoch": 0.5414590205367152, + "grad_norm": 0.6875, + "learning_rate": 0.00018439761971969948, + "loss": 4.3166, + "step": 5222 + }, + { + "epoch": 0.5415627085912034, + "grad_norm": 0.84765625, + "learning_rate": 0.0001843917928977593, + "loss": 4.3754, + "step": 5223 + }, + { + "epoch": 0.5416663966456914, + "grad_norm": 0.91015625, + "learning_rate": 0.00018438596508008512, + "loss": 4.3262, + "step": 5224 + }, + { + "epoch": 0.5417700847001795, + "grad_norm": 0.69921875, + "learning_rate": 0.00018438013626674576, + "loss": 4.378, + "step": 5225 + }, + { + "epoch": 0.5418737727546676, + "grad_norm": 0.9140625, + "learning_rate": 0.00018437430645781, + "loss": 4.3435, + "step": 5226 + }, + { + "epoch": 0.5419774608091557, + "grad_norm": 0.90234375, + "learning_rate": 0.00018436847565334662, + "loss": 4.368, + "step": 5227 + }, + { + "epoch": 0.5420811488636438, + "grad_norm": 0.86328125, + "learning_rate": 0.00018436264385342441, + "loss": 4.3494, + "step": 5228 + }, + { + "epoch": 0.5421848369181318, + "grad_norm": 0.75, + "learning_rate": 0.00018435681105811216, + "loss": 4.3826, + "step": 5229 + }, + { + "epoch": 0.5422885249726199, + "grad_norm": 1.140625, + "learning_rate": 0.0001843509772674787, + "loss": 4.3497, + "step": 5230 + }, + { + "epoch": 0.5423922130271079, + "grad_norm": 0.74609375, + "learning_rate": 0.0001843451424815929, + "loss": 4.3584, + "step": 5231 + }, + { + "epoch": 0.5424959010815961, + "grad_norm": 0.8203125, + "learning_rate": 0.00018433930670052354, + "loss": 4.3105, + "step": 5232 + }, + { + "epoch": 0.5425995891360841, + "grad_norm": 0.84375, + "learning_rate": 0.00018433346992433955, + "loss": 4.3521, + "step": 5233 + }, + { + "epoch": 0.5427032771905722, + "grad_norm": 0.80859375, + "learning_rate": 0.00018432763215310972, + "loss": 4.3291, + "step": 5234 + }, + { + "epoch": 0.5428069652450602, + "grad_norm": 0.796875, + "learning_rate": 0.000184321793386903, + "loss": 4.3466, + "step": 5235 + }, + { + "epoch": 0.5429106532995484, + "grad_norm": 0.89453125, + "learning_rate": 0.00018431595362578825, + "loss": 4.337, + "step": 5236 + }, + { + "epoch": 0.5430143413540364, + "grad_norm": 0.703125, + "learning_rate": 0.00018431011286983436, + "loss": 4.3597, + "step": 5237 + }, + { + "epoch": 0.5431180294085245, + "grad_norm": 0.90625, + "learning_rate": 0.00018430427111911025, + "loss": 4.3259, + "step": 5238 + }, + { + "epoch": 0.5432217174630125, + "grad_norm": 0.80859375, + "learning_rate": 0.00018429842837368486, + "loss": 4.357, + "step": 5239 + }, + { + "epoch": 0.5433254055175006, + "grad_norm": 0.796875, + "learning_rate": 0.00018429258463362714, + "loss": 4.4306, + "step": 5240 + }, + { + "epoch": 0.5434290935719887, + "grad_norm": 0.85546875, + "learning_rate": 0.00018428673989900603, + "loss": 4.3846, + "step": 5241 + }, + { + "epoch": 0.5435327816264768, + "grad_norm": 0.84765625, + "learning_rate": 0.00018428089416989044, + "loss": 4.3188, + "step": 5242 + }, + { + "epoch": 0.5436364696809648, + "grad_norm": 0.6875, + "learning_rate": 0.00018427504744634941, + "loss": 4.3628, + "step": 5243 + }, + { + "epoch": 0.5437401577354529, + "grad_norm": 0.79296875, + "learning_rate": 0.0001842691997284519, + "loss": 4.338, + "step": 5244 + }, + { + "epoch": 0.5438438457899409, + "grad_norm": 0.7578125, + "learning_rate": 0.00018426335101626694, + "loss": 4.3442, + "step": 5245 + }, + { + "epoch": 0.5439475338444291, + "grad_norm": 0.84765625, + "learning_rate": 0.00018425750130986347, + "loss": 4.3254, + "step": 5246 + }, + { + "epoch": 0.5440512218989171, + "grad_norm": 0.76953125, + "learning_rate": 0.00018425165060931056, + "loss": 4.3504, + "step": 5247 + }, + { + "epoch": 0.5441549099534052, + "grad_norm": 0.7421875, + "learning_rate": 0.0001842457989146772, + "loss": 4.3441, + "step": 5248 + }, + { + "epoch": 0.5442585980078932, + "grad_norm": 0.765625, + "learning_rate": 0.0001842399462260325, + "loss": 4.3201, + "step": 5249 + }, + { + "epoch": 0.5443622860623814, + "grad_norm": 0.6640625, + "learning_rate": 0.0001842340925434455, + "loss": 4.311, + "step": 5250 + }, + { + "epoch": 0.5444659741168694, + "grad_norm": 0.77734375, + "learning_rate": 0.0001842282378669852, + "loss": 4.4093, + "step": 5251 + }, + { + "epoch": 0.5445696621713575, + "grad_norm": 0.65625, + "learning_rate": 0.00018422238219672078, + "loss": 4.372, + "step": 5252 + }, + { + "epoch": 0.5446733502258455, + "grad_norm": 0.73046875, + "learning_rate": 0.00018421652553272125, + "loss": 4.3411, + "step": 5253 + }, + { + "epoch": 0.5447770382803336, + "grad_norm": 0.73046875, + "learning_rate": 0.00018421066787505574, + "loss": 4.3217, + "step": 5254 + }, + { + "epoch": 0.5448807263348217, + "grad_norm": 0.75, + "learning_rate": 0.00018420480922379335, + "loss": 4.352, + "step": 5255 + }, + { + "epoch": 0.5449844143893098, + "grad_norm": 0.734375, + "learning_rate": 0.0001841989495790032, + "loss": 4.3746, + "step": 5256 + }, + { + "epoch": 0.5450881024437978, + "grad_norm": 0.75, + "learning_rate": 0.0001841930889407545, + "loss": 4.3087, + "step": 5257 + }, + { + "epoch": 0.5451917904982859, + "grad_norm": 0.88671875, + "learning_rate": 0.00018418722730911632, + "loss": 4.3734, + "step": 5258 + }, + { + "epoch": 0.5452954785527739, + "grad_norm": 0.73046875, + "learning_rate": 0.00018418136468415785, + "loss": 4.3285, + "step": 5259 + }, + { + "epoch": 0.5453991666072621, + "grad_norm": 0.7578125, + "learning_rate": 0.00018417550106594827, + "loss": 4.3549, + "step": 5260 + }, + { + "epoch": 0.5455028546617501, + "grad_norm": 0.75, + "learning_rate": 0.00018416963645455673, + "loss": 4.3472, + "step": 5261 + }, + { + "epoch": 0.5456065427162382, + "grad_norm": 0.7890625, + "learning_rate": 0.00018416377085005248, + "loss": 4.3737, + "step": 5262 + }, + { + "epoch": 0.5457102307707262, + "grad_norm": 0.79296875, + "learning_rate": 0.00018415790425250467, + "loss": 4.327, + "step": 5263 + }, + { + "epoch": 0.5458139188252144, + "grad_norm": 0.796875, + "learning_rate": 0.00018415203666198257, + "loss": 4.3347, + "step": 5264 + }, + { + "epoch": 0.5459176068797024, + "grad_norm": 0.73828125, + "learning_rate": 0.00018414616807855538, + "loss": 4.2823, + "step": 5265 + }, + { + "epoch": 0.5460212949341905, + "grad_norm": 0.79296875, + "learning_rate": 0.00018414029850229236, + "loss": 4.3409, + "step": 5266 + }, + { + "epoch": 0.5461249829886785, + "grad_norm": 0.76171875, + "learning_rate": 0.00018413442793326277, + "loss": 4.3065, + "step": 5267 + }, + { + "epoch": 0.5462286710431666, + "grad_norm": 0.81640625, + "learning_rate": 0.00018412855637153582, + "loss": 4.3402, + "step": 5268 + }, + { + "epoch": 0.5463323590976547, + "grad_norm": 0.81640625, + "learning_rate": 0.00018412268381718088, + "loss": 4.3167, + "step": 5269 + }, + { + "epoch": 0.5464360471521428, + "grad_norm": 0.78515625, + "learning_rate": 0.00018411681027026715, + "loss": 4.3548, + "step": 5270 + }, + { + "epoch": 0.5465397352066309, + "grad_norm": 0.71484375, + "learning_rate": 0.000184110935730864, + "loss": 4.34, + "step": 5271 + }, + { + "epoch": 0.5466434232611189, + "grad_norm": 0.74609375, + "learning_rate": 0.00018410506019904074, + "loss": 4.3171, + "step": 5272 + }, + { + "epoch": 0.546747111315607, + "grad_norm": 0.7265625, + "learning_rate": 0.00018409918367486662, + "loss": 4.3388, + "step": 5273 + }, + { + "epoch": 0.5468507993700951, + "grad_norm": 0.9453125, + "learning_rate": 0.00018409330615841107, + "loss": 4.3309, + "step": 5274 + }, + { + "epoch": 0.5469544874245832, + "grad_norm": 0.84375, + "learning_rate": 0.00018408742764974338, + "loss": 4.3119, + "step": 5275 + }, + { + "epoch": 0.5470581754790712, + "grad_norm": 0.87890625, + "learning_rate": 0.00018408154814893296, + "loss": 4.3519, + "step": 5276 + }, + { + "epoch": 0.5471618635335593, + "grad_norm": 0.79296875, + "learning_rate": 0.00018407566765604911, + "loss": 4.3591, + "step": 5277 + }, + { + "epoch": 0.5472655515880473, + "grad_norm": 1.0078125, + "learning_rate": 0.00018406978617116124, + "loss": 4.3208, + "step": 5278 + }, + { + "epoch": 0.5473692396425355, + "grad_norm": 0.80859375, + "learning_rate": 0.00018406390369433878, + "loss": 4.3438, + "step": 5279 + }, + { + "epoch": 0.5474729276970235, + "grad_norm": 0.74609375, + "learning_rate": 0.00018405802022565114, + "loss": 4.3529, + "step": 5280 + }, + { + "epoch": 0.5475766157515116, + "grad_norm": 0.984375, + "learning_rate": 0.00018405213576516772, + "loss": 4.3476, + "step": 5281 + }, + { + "epoch": 0.5476803038059996, + "grad_norm": 0.90234375, + "learning_rate": 0.0001840462503129579, + "loss": 4.329, + "step": 5282 + }, + { + "epoch": 0.5477839918604878, + "grad_norm": 0.8203125, + "learning_rate": 0.00018404036386909117, + "loss": 4.3186, + "step": 5283 + }, + { + "epoch": 0.5478876799149758, + "grad_norm": 0.95703125, + "learning_rate": 0.00018403447643363703, + "loss": 4.3454, + "step": 5284 + }, + { + "epoch": 0.5479913679694639, + "grad_norm": 0.82421875, + "learning_rate": 0.00018402858800666485, + "loss": 4.3464, + "step": 5285 + }, + { + "epoch": 0.5480950560239519, + "grad_norm": 0.75390625, + "learning_rate": 0.00018402269858824416, + "loss": 4.3919, + "step": 5286 + }, + { + "epoch": 0.54819874407844, + "grad_norm": 1.0078125, + "learning_rate": 0.00018401680817844445, + "loss": 4.3347, + "step": 5287 + }, + { + "epoch": 0.5483024321329281, + "grad_norm": 0.87109375, + "learning_rate": 0.00018401091677733522, + "loss": 4.3367, + "step": 5288 + }, + { + "epoch": 0.5484061201874162, + "grad_norm": 0.6796875, + "learning_rate": 0.00018400502438498597, + "loss": 4.3219, + "step": 5289 + }, + { + "epoch": 0.5485098082419042, + "grad_norm": 1.0390625, + "learning_rate": 0.00018399913100146625, + "loss": 4.3454, + "step": 5290 + }, + { + "epoch": 0.5486134962963923, + "grad_norm": 0.84375, + "learning_rate": 0.00018399323662684553, + "loss": 4.3655, + "step": 5291 + }, + { + "epoch": 0.5487171843508803, + "grad_norm": 0.75, + "learning_rate": 0.00018398734126119344, + "loss": 4.385, + "step": 5292 + }, + { + "epoch": 0.5488208724053685, + "grad_norm": 0.7421875, + "learning_rate": 0.00018398144490457946, + "loss": 4.3444, + "step": 5293 + }, + { + "epoch": 0.5489245604598565, + "grad_norm": 0.7734375, + "learning_rate": 0.00018397554755707324, + "loss": 4.3266, + "step": 5294 + }, + { + "epoch": 0.5490282485143446, + "grad_norm": 0.56640625, + "learning_rate": 0.00018396964921874433, + "loss": 4.3489, + "step": 5295 + }, + { + "epoch": 0.5491319365688326, + "grad_norm": 0.7421875, + "learning_rate": 0.0001839637498896623, + "loss": 4.2763, + "step": 5296 + }, + { + "epoch": 0.5492356246233208, + "grad_norm": 0.6171875, + "learning_rate": 0.00018395784956989677, + "loss": 4.3416, + "step": 5297 + }, + { + "epoch": 0.5493393126778088, + "grad_norm": 0.68359375, + "learning_rate": 0.0001839519482595174, + "loss": 4.3483, + "step": 5298 + }, + { + "epoch": 0.5494430007322969, + "grad_norm": 0.65625, + "learning_rate": 0.00018394604595859372, + "loss": 4.3213, + "step": 5299 + }, + { + "epoch": 0.5495466887867849, + "grad_norm": 0.671875, + "learning_rate": 0.00018394014266719547, + "loss": 4.2955, + "step": 5300 + }, + { + "epoch": 0.549650376841273, + "grad_norm": 0.703125, + "learning_rate": 0.00018393423838539225, + "loss": 4.3245, + "step": 5301 + }, + { + "epoch": 0.5497540648957611, + "grad_norm": 0.71484375, + "learning_rate": 0.00018392833311325376, + "loss": 4.3408, + "step": 5302 + }, + { + "epoch": 0.5498577529502492, + "grad_norm": 0.73828125, + "learning_rate": 0.00018392242685084963, + "loss": 4.3355, + "step": 5303 + }, + { + "epoch": 0.5499614410047372, + "grad_norm": 0.6875, + "learning_rate": 0.0001839165195982496, + "loss": 4.3566, + "step": 5304 + }, + { + "epoch": 0.5500651290592253, + "grad_norm": 0.69140625, + "learning_rate": 0.00018391061135552335, + "loss": 4.3398, + "step": 5305 + }, + { + "epoch": 0.5501688171137133, + "grad_norm": 0.6953125, + "learning_rate": 0.00018390470212274057, + "loss": 4.3165, + "step": 5306 + }, + { + "epoch": 0.5502725051682015, + "grad_norm": 0.71484375, + "learning_rate": 0.00018389879189997097, + "loss": 4.353, + "step": 5307 + }, + { + "epoch": 0.5503761932226895, + "grad_norm": 0.69140625, + "learning_rate": 0.0001838928806872843, + "loss": 4.3768, + "step": 5308 + }, + { + "epoch": 0.5504798812771776, + "grad_norm": 0.7421875, + "learning_rate": 0.00018388696848475034, + "loss": 4.3385, + "step": 5309 + }, + { + "epoch": 0.5505835693316656, + "grad_norm": 0.69921875, + "learning_rate": 0.00018388105529243886, + "loss": 4.3456, + "step": 5310 + }, + { + "epoch": 0.5506872573861538, + "grad_norm": 0.80078125, + "learning_rate": 0.00018387514111041956, + "loss": 4.3464, + "step": 5311 + }, + { + "epoch": 0.5507909454406418, + "grad_norm": 0.64453125, + "learning_rate": 0.00018386922593876224, + "loss": 4.3536, + "step": 5312 + }, + { + "epoch": 0.5508946334951299, + "grad_norm": 0.8515625, + "learning_rate": 0.00018386330977753675, + "loss": 4.3569, + "step": 5313 + }, + { + "epoch": 0.5509983215496179, + "grad_norm": 0.7578125, + "learning_rate": 0.00018385739262681284, + "loss": 4.3348, + "step": 5314 + }, + { + "epoch": 0.551102009604106, + "grad_norm": 0.7578125, + "learning_rate": 0.00018385147448666032, + "loss": 4.3592, + "step": 5315 + }, + { + "epoch": 0.5512056976585942, + "grad_norm": 0.7421875, + "learning_rate": 0.00018384555535714902, + "loss": 4.3617, + "step": 5316 + }, + { + "epoch": 0.5513093857130822, + "grad_norm": 0.796875, + "learning_rate": 0.0001838396352383488, + "loss": 4.3339, + "step": 5317 + }, + { + "epoch": 0.5514130737675703, + "grad_norm": 0.87890625, + "learning_rate": 0.00018383371413032955, + "loss": 4.3743, + "step": 5318 + }, + { + "epoch": 0.5515167618220583, + "grad_norm": 0.7578125, + "learning_rate": 0.00018382779203316108, + "loss": 4.3506, + "step": 5319 + }, + { + "epoch": 0.5516204498765465, + "grad_norm": 0.87890625, + "learning_rate": 0.00018382186894691326, + "loss": 4.3465, + "step": 5320 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 0.84765625, + "learning_rate": 0.00018381594487165597, + "loss": 4.3386, + "step": 5321 + }, + { + "epoch": 0.5518278259855226, + "grad_norm": 0.890625, + "learning_rate": 0.00018381001980745916, + "loss": 4.3716, + "step": 5322 + }, + { + "epoch": 0.5519315140400106, + "grad_norm": 0.89453125, + "learning_rate": 0.00018380409375439268, + "loss": 4.3494, + "step": 5323 + }, + { + "epoch": 0.5520352020944987, + "grad_norm": 0.9140625, + "learning_rate": 0.0001837981667125265, + "loss": 4.3477, + "step": 5324 + }, + { + "epoch": 0.5521388901489868, + "grad_norm": 0.7890625, + "learning_rate": 0.0001837922386819305, + "loss": 4.3209, + "step": 5325 + }, + { + "epoch": 0.5522425782034749, + "grad_norm": 0.94140625, + "learning_rate": 0.0001837863096626747, + "loss": 4.3661, + "step": 5326 + }, + { + "epoch": 0.5523462662579629, + "grad_norm": 0.828125, + "learning_rate": 0.00018378037965482898, + "loss": 4.3566, + "step": 5327 + }, + { + "epoch": 0.552449954312451, + "grad_norm": 0.68359375, + "learning_rate": 0.00018377444865846337, + "loss": 4.3516, + "step": 5328 + }, + { + "epoch": 0.552553642366939, + "grad_norm": 0.9296875, + "learning_rate": 0.00018376851667364777, + "loss": 4.3606, + "step": 5329 + }, + { + "epoch": 0.5526573304214272, + "grad_norm": 0.94921875, + "learning_rate": 0.00018376258370045228, + "loss": 4.3479, + "step": 5330 + }, + { + "epoch": 0.5527610184759152, + "grad_norm": 0.66796875, + "learning_rate": 0.0001837566497389468, + "loss": 4.3693, + "step": 5331 + }, + { + "epoch": 0.5528647065304033, + "grad_norm": 0.9296875, + "learning_rate": 0.0001837507147892014, + "loss": 4.3271, + "step": 5332 + }, + { + "epoch": 0.5529683945848913, + "grad_norm": 0.8359375, + "learning_rate": 0.0001837447788512861, + "loss": 4.3483, + "step": 5333 + }, + { + "epoch": 0.5530720826393795, + "grad_norm": 0.76953125, + "learning_rate": 0.00018373884192527092, + "loss": 4.3756, + "step": 5334 + }, + { + "epoch": 0.5531757706938675, + "grad_norm": 0.8984375, + "learning_rate": 0.00018373290401122594, + "loss": 4.3084, + "step": 5335 + }, + { + "epoch": 0.5532794587483556, + "grad_norm": 0.8515625, + "learning_rate": 0.0001837269651092212, + "loss": 4.3144, + "step": 5336 + }, + { + "epoch": 0.5533831468028436, + "grad_norm": 0.8359375, + "learning_rate": 0.00018372102521932676, + "loss": 4.3555, + "step": 5337 + }, + { + "epoch": 0.5534868348573317, + "grad_norm": 0.96875, + "learning_rate": 0.0001837150843416127, + "loss": 4.3477, + "step": 5338 + }, + { + "epoch": 0.5535905229118198, + "grad_norm": 0.9296875, + "learning_rate": 0.00018370914247614918, + "loss": 4.3296, + "step": 5339 + }, + { + "epoch": 0.5536942109663079, + "grad_norm": 1.0078125, + "learning_rate": 0.0001837031996230062, + "loss": 4.3393, + "step": 5340 + }, + { + "epoch": 0.5537978990207959, + "grad_norm": 0.94140625, + "learning_rate": 0.00018369725578225402, + "loss": 4.3316, + "step": 5341 + }, + { + "epoch": 0.553901587075284, + "grad_norm": 1.0703125, + "learning_rate": 0.00018369131095396263, + "loss": 4.3757, + "step": 5342 + }, + { + "epoch": 0.554005275129772, + "grad_norm": 0.9296875, + "learning_rate": 0.00018368536513820228, + "loss": 4.3632, + "step": 5343 + }, + { + "epoch": 0.5541089631842602, + "grad_norm": 1.015625, + "learning_rate": 0.00018367941833504308, + "loss": 4.354, + "step": 5344 + }, + { + "epoch": 0.5542126512387482, + "grad_norm": 0.80859375, + "learning_rate": 0.00018367347054455517, + "loss": 4.3277, + "step": 5345 + }, + { + "epoch": 0.5543163392932363, + "grad_norm": 1.3046875, + "learning_rate": 0.00018366752176680878, + "loss": 4.3754, + "step": 5346 + }, + { + "epoch": 0.5544200273477243, + "grad_norm": 0.828125, + "learning_rate": 0.00018366157200187404, + "loss": 4.3197, + "step": 5347 + }, + { + "epoch": 0.5545237154022125, + "grad_norm": 1.359375, + "learning_rate": 0.00018365562124982122, + "loss": 4.3296, + "step": 5348 + }, + { + "epoch": 0.5546274034567005, + "grad_norm": 0.90625, + "learning_rate": 0.00018364966951072047, + "loss": 4.3743, + "step": 5349 + }, + { + "epoch": 0.5547310915111886, + "grad_norm": 1.7890625, + "learning_rate": 0.00018364371678464205, + "loss": 4.3345, + "step": 5350 + }, + { + "epoch": 0.5548347795656766, + "grad_norm": 1.4765625, + "learning_rate": 0.00018363776307165619, + "loss": 4.3296, + "step": 5351 + }, + { + "epoch": 0.5549384676201647, + "grad_norm": 1.859375, + "learning_rate": 0.00018363180837183314, + "loss": 4.3787, + "step": 5352 + }, + { + "epoch": 0.5550421556746528, + "grad_norm": 1.71875, + "learning_rate": 0.00018362585268524314, + "loss": 4.2736, + "step": 5353 + }, + { + "epoch": 0.5551458437291409, + "grad_norm": 1.4921875, + "learning_rate": 0.0001836198960119565, + "loss": 4.3756, + "step": 5354 + }, + { + "epoch": 0.5552495317836289, + "grad_norm": 1.34375, + "learning_rate": 0.00018361393835204342, + "loss": 4.3685, + "step": 5355 + }, + { + "epoch": 0.555353219838117, + "grad_norm": 1.71875, + "learning_rate": 0.0001836079797055743, + "loss": 4.3604, + "step": 5356 + }, + { + "epoch": 0.555456907892605, + "grad_norm": 1.3359375, + "learning_rate": 0.0001836020200726194, + "loss": 4.3392, + "step": 5357 + }, + { + "epoch": 0.5555605959470932, + "grad_norm": 2.3125, + "learning_rate": 0.00018359605945324903, + "loss": 4.3684, + "step": 5358 + }, + { + "epoch": 0.5556642840015812, + "grad_norm": 2.1875, + "learning_rate": 0.0001835900978475335, + "loss": 4.3533, + "step": 5359 + }, + { + "epoch": 0.5557679720560693, + "grad_norm": 1.15625, + "learning_rate": 0.0001835841352555432, + "loss": 4.3828, + "step": 5360 + }, + { + "epoch": 0.5558716601105574, + "grad_norm": 1.3984375, + "learning_rate": 0.00018357817167734844, + "loss": 4.3377, + "step": 5361 + }, + { + "epoch": 0.5559753481650455, + "grad_norm": 1.1015625, + "learning_rate": 0.00018357220711301962, + "loss": 4.3375, + "step": 5362 + }, + { + "epoch": 0.5560790362195336, + "grad_norm": 1.203125, + "learning_rate": 0.00018356624156262708, + "loss": 4.3466, + "step": 5363 + }, + { + "epoch": 0.5561827242740216, + "grad_norm": 0.96484375, + "learning_rate": 0.0001835602750262412, + "loss": 4.3279, + "step": 5364 + }, + { + "epoch": 0.5562864123285097, + "grad_norm": 1.3203125, + "learning_rate": 0.00018355430750393244, + "loss": 4.3026, + "step": 5365 + }, + { + "epoch": 0.5563901003829977, + "grad_norm": 0.92578125, + "learning_rate": 0.00018354833899577116, + "loss": 4.3357, + "step": 5366 + }, + { + "epoch": 0.5564937884374859, + "grad_norm": 1.4609375, + "learning_rate": 0.00018354236950182782, + "loss": 4.3593, + "step": 5367 + }, + { + "epoch": 0.5565974764919739, + "grad_norm": 1.09375, + "learning_rate": 0.0001835363990221728, + "loss": 4.3321, + "step": 5368 + }, + { + "epoch": 0.556701164546462, + "grad_norm": 1.984375, + "learning_rate": 0.00018353042755687654, + "loss": 4.3385, + "step": 5369 + }, + { + "epoch": 0.55680485260095, + "grad_norm": 1.8671875, + "learning_rate": 0.0001835244551060096, + "loss": 4.3639, + "step": 5370 + }, + { + "epoch": 0.5569085406554382, + "grad_norm": 1.4140625, + "learning_rate": 0.00018351848166964232, + "loss": 4.3301, + "step": 5371 + }, + { + "epoch": 0.5570122287099262, + "grad_norm": 1.3984375, + "learning_rate": 0.00018351250724784528, + "loss": 4.3551, + "step": 5372 + }, + { + "epoch": 0.5571159167644143, + "grad_norm": 1.2421875, + "learning_rate": 0.00018350653184068892, + "loss": 4.3526, + "step": 5373 + }, + { + "epoch": 0.5572196048189023, + "grad_norm": 1.1328125, + "learning_rate": 0.0001835005554482437, + "loss": 4.3247, + "step": 5374 + }, + { + "epoch": 0.5573232928733904, + "grad_norm": 1.3671875, + "learning_rate": 0.00018349457807058026, + "loss": 4.366, + "step": 5375 + }, + { + "epoch": 0.5574269809278785, + "grad_norm": 1.140625, + "learning_rate": 0.00018348859970776903, + "loss": 4.3393, + "step": 5376 + }, + { + "epoch": 0.5575306689823666, + "grad_norm": 1.6875, + "learning_rate": 0.0001834826203598806, + "loss": 4.3672, + "step": 5377 + }, + { + "epoch": 0.5576343570368546, + "grad_norm": 1.359375, + "learning_rate": 0.00018347664002698545, + "loss": 4.3426, + "step": 5378 + }, + { + "epoch": 0.5577380450913427, + "grad_norm": 1.984375, + "learning_rate": 0.00018347065870915423, + "loss": 4.3356, + "step": 5379 + }, + { + "epoch": 0.5578417331458307, + "grad_norm": 1.7890625, + "learning_rate": 0.00018346467640645745, + "loss": 4.3529, + "step": 5380 + }, + { + "epoch": 0.5579454212003189, + "grad_norm": 1.8203125, + "learning_rate": 0.0001834586931189657, + "loss": 4.3789, + "step": 5381 + }, + { + "epoch": 0.5580491092548069, + "grad_norm": 1.609375, + "learning_rate": 0.0001834527088467496, + "loss": 4.3493, + "step": 5382 + }, + { + "epoch": 0.558152797309295, + "grad_norm": 1.765625, + "learning_rate": 0.00018344672358987975, + "loss": 4.2857, + "step": 5383 + }, + { + "epoch": 0.558256485363783, + "grad_norm": 1.5234375, + "learning_rate": 0.00018344073734842677, + "loss": 4.3451, + "step": 5384 + }, + { + "epoch": 0.5583601734182712, + "grad_norm": 1.9609375, + "learning_rate": 0.00018343475012246133, + "loss": 4.3336, + "step": 5385 + }, + { + "epoch": 0.5584638614727592, + "grad_norm": 1.796875, + "learning_rate": 0.00018342876191205395, + "loss": 4.2956, + "step": 5386 + }, + { + "epoch": 0.5585675495272473, + "grad_norm": 1.5625, + "learning_rate": 0.00018342277271727543, + "loss": 4.2989, + "step": 5387 + }, + { + "epoch": 0.5586712375817353, + "grad_norm": 1.4765625, + "learning_rate": 0.00018341678253819637, + "loss": 4.3567, + "step": 5388 + }, + { + "epoch": 0.5587749256362234, + "grad_norm": 1.5078125, + "learning_rate": 0.00018341079137488745, + "loss": 4.3328, + "step": 5389 + }, + { + "epoch": 0.5588786136907115, + "grad_norm": 1.3359375, + "learning_rate": 0.00018340479922741932, + "loss": 4.3779, + "step": 5390 + }, + { + "epoch": 0.5589823017451996, + "grad_norm": 1.6640625, + "learning_rate": 0.00018339880609586276, + "loss": 4.2602, + "step": 5391 + }, + { + "epoch": 0.5590859897996876, + "grad_norm": 1.4921875, + "learning_rate": 0.00018339281198028845, + "loss": 4.3444, + "step": 5392 + }, + { + "epoch": 0.5591896778541757, + "grad_norm": 1.7734375, + "learning_rate": 0.00018338681688076709, + "loss": 4.3121, + "step": 5393 + }, + { + "epoch": 0.5592933659086637, + "grad_norm": 1.6875, + "learning_rate": 0.0001833808207973694, + "loss": 4.3673, + "step": 5394 + }, + { + "epoch": 0.5593970539631519, + "grad_norm": 1.578125, + "learning_rate": 0.00018337482373016623, + "loss": 4.3323, + "step": 5395 + }, + { + "epoch": 0.5595007420176399, + "grad_norm": 1.4609375, + "learning_rate": 0.00018336882567922822, + "loss": 4.3443, + "step": 5396 + }, + { + "epoch": 0.559604430072128, + "grad_norm": 1.7890625, + "learning_rate": 0.00018336282664462622, + "loss": 4.3561, + "step": 5397 + }, + { + "epoch": 0.559708118126616, + "grad_norm": 1.578125, + "learning_rate": 0.00018335682662643099, + "loss": 4.3319, + "step": 5398 + }, + { + "epoch": 0.5598118061811042, + "grad_norm": 1.7109375, + "learning_rate": 0.0001833508256247133, + "loss": 4.3546, + "step": 5399 + }, + { + "epoch": 0.5599154942355922, + "grad_norm": 1.671875, + "learning_rate": 0.00018334482363954396, + "loss": 4.3268, + "step": 5400 + }, + { + "epoch": 0.5600191822900803, + "grad_norm": 1.4375, + "learning_rate": 0.00018333882067099385, + "loss": 4.3379, + "step": 5401 + }, + { + "epoch": 0.5601228703445683, + "grad_norm": 1.390625, + "learning_rate": 0.0001833328167191337, + "loss": 4.3464, + "step": 5402 + }, + { + "epoch": 0.5602265583990564, + "grad_norm": 1.5546875, + "learning_rate": 0.0001833268117840344, + "loss": 4.3033, + "step": 5403 + }, + { + "epoch": 0.5603302464535445, + "grad_norm": 1.3359375, + "learning_rate": 0.0001833208058657668, + "loss": 4.3792, + "step": 5404 + }, + { + "epoch": 0.5604339345080326, + "grad_norm": 1.6171875, + "learning_rate": 0.0001833147989644018, + "loss": 4.3486, + "step": 5405 + }, + { + "epoch": 0.5605376225625207, + "grad_norm": 1.4609375, + "learning_rate": 0.00018330879108001023, + "loss": 4.3596, + "step": 5406 + }, + { + "epoch": 0.5606413106170087, + "grad_norm": 1.7421875, + "learning_rate": 0.00018330278221266298, + "loss": 4.3626, + "step": 5407 + }, + { + "epoch": 0.5607449986714969, + "grad_norm": 1.609375, + "learning_rate": 0.00018329677236243096, + "loss": 4.3301, + "step": 5408 + }, + { + "epoch": 0.5608486867259849, + "grad_norm": 1.578125, + "learning_rate": 0.00018329076152938506, + "loss": 4.3098, + "step": 5409 + }, + { + "epoch": 0.560952374780473, + "grad_norm": 1.4140625, + "learning_rate": 0.00018328474971359622, + "loss": 4.3482, + "step": 5410 + }, + { + "epoch": 0.561056062834961, + "grad_norm": 1.6953125, + "learning_rate": 0.00018327873691513537, + "loss": 4.3548, + "step": 5411 + }, + { + "epoch": 0.5611597508894491, + "grad_norm": 1.6484375, + "learning_rate": 0.00018327272313407343, + "loss": 4.368, + "step": 5412 + }, + { + "epoch": 0.5612634389439372, + "grad_norm": 1.53125, + "learning_rate": 0.00018326670837048143, + "loss": 4.3198, + "step": 5413 + }, + { + "epoch": 0.5613671269984253, + "grad_norm": 1.484375, + "learning_rate": 0.00018326069262443025, + "loss": 4.3299, + "step": 5414 + }, + { + "epoch": 0.5614708150529133, + "grad_norm": 1.4609375, + "learning_rate": 0.00018325467589599094, + "loss": 4.3464, + "step": 5415 + }, + { + "epoch": 0.5615745031074014, + "grad_norm": 1.3828125, + "learning_rate": 0.00018324865818523445, + "loss": 4.3521, + "step": 5416 + }, + { + "epoch": 0.5616781911618894, + "grad_norm": 1.5390625, + "learning_rate": 0.00018324263949223178, + "loss": 4.3357, + "step": 5417 + }, + { + "epoch": 0.5617818792163776, + "grad_norm": 1.5078125, + "learning_rate": 0.00018323661981705394, + "loss": 4.3585, + "step": 5418 + }, + { + "epoch": 0.5618855672708656, + "grad_norm": 1.5, + "learning_rate": 0.000183230599159772, + "loss": 4.3358, + "step": 5419 + }, + { + "epoch": 0.5619892553253537, + "grad_norm": 1.359375, + "learning_rate": 0.00018322457752045695, + "loss": 4.2999, + "step": 5420 + }, + { + "epoch": 0.5620929433798417, + "grad_norm": 1.5859375, + "learning_rate": 0.00018321855489917988, + "loss": 4.3694, + "step": 5421 + }, + { + "epoch": 0.5621966314343299, + "grad_norm": 1.453125, + "learning_rate": 0.0001832125312960118, + "loss": 4.3505, + "step": 5422 + }, + { + "epoch": 0.5623003194888179, + "grad_norm": 1.8203125, + "learning_rate": 0.00018320650671102383, + "loss": 4.3697, + "step": 5423 + }, + { + "epoch": 0.562404007543306, + "grad_norm": 1.671875, + "learning_rate": 0.00018320048114428702, + "loss": 4.3777, + "step": 5424 + }, + { + "epoch": 0.562507695597794, + "grad_norm": 1.4453125, + "learning_rate": 0.00018319445459587247, + "loss": 4.3195, + "step": 5425 + }, + { + "epoch": 0.5626113836522821, + "grad_norm": 1.3828125, + "learning_rate": 0.0001831884270658513, + "loss": 4.3189, + "step": 5426 + }, + { + "epoch": 0.5627150717067702, + "grad_norm": 1.75, + "learning_rate": 0.00018318239855429465, + "loss": 4.3681, + "step": 5427 + }, + { + "epoch": 0.5628187597612583, + "grad_norm": 1.59375, + "learning_rate": 0.0001831763690612736, + "loss": 4.3512, + "step": 5428 + }, + { + "epoch": 0.5629224478157463, + "grad_norm": 1.6796875, + "learning_rate": 0.00018317033858685934, + "loss": 4.3531, + "step": 5429 + }, + { + "epoch": 0.5630261358702344, + "grad_norm": 1.671875, + "learning_rate": 0.000183164307131123, + "loss": 4.3556, + "step": 5430 + }, + { + "epoch": 0.5631298239247224, + "grad_norm": 1.515625, + "learning_rate": 0.0001831582746941357, + "loss": 4.3025, + "step": 5431 + }, + { + "epoch": 0.5632335119792106, + "grad_norm": 1.4375, + "learning_rate": 0.0001831522412759687, + "loss": 4.3599, + "step": 5432 + }, + { + "epoch": 0.5633372000336986, + "grad_norm": 1.6015625, + "learning_rate": 0.00018314620687669316, + "loss": 4.3102, + "step": 5433 + }, + { + "epoch": 0.5634408880881867, + "grad_norm": 1.46875, + "learning_rate": 0.00018314017149638026, + "loss": 4.3832, + "step": 5434 + }, + { + "epoch": 0.5635445761426747, + "grad_norm": 1.390625, + "learning_rate": 0.0001831341351351012, + "loss": 4.3497, + "step": 5435 + }, + { + "epoch": 0.5636482641971629, + "grad_norm": 1.296875, + "learning_rate": 0.00018312809779292722, + "loss": 4.3164, + "step": 5436 + }, + { + "epoch": 0.5637519522516509, + "grad_norm": 1.4453125, + "learning_rate": 0.00018312205946992958, + "loss": 4.3224, + "step": 5437 + }, + { + "epoch": 0.563855640306139, + "grad_norm": 1.3359375, + "learning_rate": 0.00018311602016617945, + "loss": 4.3528, + "step": 5438 + }, + { + "epoch": 0.563959328360627, + "grad_norm": 1.546875, + "learning_rate": 0.0001831099798817482, + "loss": 4.339, + "step": 5439 + }, + { + "epoch": 0.5640630164151151, + "grad_norm": 1.390625, + "learning_rate": 0.000183103938616707, + "loss": 4.3813, + "step": 5440 + }, + { + "epoch": 0.5641667044696032, + "grad_norm": 1.4921875, + "learning_rate": 0.0001830978963711272, + "loss": 4.3508, + "step": 5441 + }, + { + "epoch": 0.5642703925240913, + "grad_norm": 1.359375, + "learning_rate": 0.00018309185314508003, + "loss": 4.3509, + "step": 5442 + }, + { + "epoch": 0.5643740805785793, + "grad_norm": 1.640625, + "learning_rate": 0.00018308580893863685, + "loss": 4.3256, + "step": 5443 + }, + { + "epoch": 0.5644777686330674, + "grad_norm": 1.4296875, + "learning_rate": 0.00018307976375186896, + "loss": 4.3288, + "step": 5444 + }, + { + "epoch": 0.5645814566875554, + "grad_norm": 1.7109375, + "learning_rate": 0.00018307371758484765, + "loss": 4.3302, + "step": 5445 + }, + { + "epoch": 0.5646851447420436, + "grad_norm": 1.5859375, + "learning_rate": 0.00018306767043764429, + "loss": 4.3248, + "step": 5446 + }, + { + "epoch": 0.5647888327965316, + "grad_norm": 1.4609375, + "learning_rate": 0.0001830616223103302, + "loss": 4.3706, + "step": 5447 + }, + { + "epoch": 0.5648925208510197, + "grad_norm": 1.3359375, + "learning_rate": 0.00018305557320297678, + "loss": 4.3251, + "step": 5448 + }, + { + "epoch": 0.5649962089055078, + "grad_norm": 1.453125, + "learning_rate": 0.00018304952311565544, + "loss": 4.3554, + "step": 5449 + }, + { + "epoch": 0.5650998969599959, + "grad_norm": 1.28125, + "learning_rate": 0.00018304347204843748, + "loss": 4.3653, + "step": 5450 + }, + { + "epoch": 0.565203585014484, + "grad_norm": 1.7734375, + "learning_rate": 0.00018303742000139433, + "loss": 4.321, + "step": 5451 + }, + { + "epoch": 0.565307273068972, + "grad_norm": 1.703125, + "learning_rate": 0.00018303136697459736, + "loss": 4.3021, + "step": 5452 + }, + { + "epoch": 0.5654109611234601, + "grad_norm": 1.546875, + "learning_rate": 0.0001830253129681181, + "loss": 4.3297, + "step": 5453 + }, + { + "epoch": 0.5655146491779481, + "grad_norm": 1.4609375, + "learning_rate": 0.0001830192579820279, + "loss": 4.3235, + "step": 5454 + }, + { + "epoch": 0.5656183372324363, + "grad_norm": 1.4609375, + "learning_rate": 0.00018301320201639815, + "loss": 4.3163, + "step": 5455 + }, + { + "epoch": 0.5657220252869243, + "grad_norm": 1.359375, + "learning_rate": 0.0001830071450713004, + "loss": 4.3826, + "step": 5456 + }, + { + "epoch": 0.5658257133414124, + "grad_norm": 1.609375, + "learning_rate": 0.0001830010871468061, + "loss": 4.3021, + "step": 5457 + }, + { + "epoch": 0.5659294013959004, + "grad_norm": 1.5546875, + "learning_rate": 0.00018299502824298668, + "loss": 4.2943, + "step": 5458 + }, + { + "epoch": 0.5660330894503885, + "grad_norm": 1.359375, + "learning_rate": 0.00018298896835991366, + "loss": 4.3351, + "step": 5459 + }, + { + "epoch": 0.5661367775048766, + "grad_norm": 1.2734375, + "learning_rate": 0.00018298290749765853, + "loss": 4.3269, + "step": 5460 + }, + { + "epoch": 0.5662404655593647, + "grad_norm": 1.515625, + "learning_rate": 0.00018297684565629283, + "loss": 4.3372, + "step": 5461 + }, + { + "epoch": 0.5663441536138527, + "grad_norm": 1.3671875, + "learning_rate": 0.00018297078283588804, + "loss": 4.3356, + "step": 5462 + }, + { + "epoch": 0.5664478416683408, + "grad_norm": 1.5625, + "learning_rate": 0.00018296471903651572, + "loss": 4.3262, + "step": 5463 + }, + { + "epoch": 0.5665515297228289, + "grad_norm": 1.4609375, + "learning_rate": 0.0001829586542582474, + "loss": 4.3409, + "step": 5464 + }, + { + "epoch": 0.566655217777317, + "grad_norm": 1.5390625, + "learning_rate": 0.0001829525885011547, + "loss": 4.2715, + "step": 5465 + }, + { + "epoch": 0.566758905831805, + "grad_norm": 1.40625, + "learning_rate": 0.00018294652176530912, + "loss": 4.325, + "step": 5466 + }, + { + "epoch": 0.5668625938862931, + "grad_norm": 1.5625, + "learning_rate": 0.00018294045405078225, + "loss": 4.309, + "step": 5467 + }, + { + "epoch": 0.5669662819407811, + "grad_norm": 1.421875, + "learning_rate": 0.0001829343853576457, + "loss": 4.347, + "step": 5468 + }, + { + "epoch": 0.5670699699952693, + "grad_norm": 1.484375, + "learning_rate": 0.00018292831568597105, + "loss": 4.3978, + "step": 5469 + }, + { + "epoch": 0.5671736580497573, + "grad_norm": 1.4140625, + "learning_rate": 0.00018292224503582992, + "loss": 4.2942, + "step": 5470 + }, + { + "epoch": 0.5672773461042454, + "grad_norm": 1.578125, + "learning_rate": 0.00018291617340729399, + "loss": 4.3313, + "step": 5471 + }, + { + "epoch": 0.5673810341587334, + "grad_norm": 1.390625, + "learning_rate": 0.00018291010080043483, + "loss": 4.344, + "step": 5472 + }, + { + "epoch": 0.5674847222132215, + "grad_norm": 1.71875, + "learning_rate": 0.0001829040272153242, + "loss": 4.3467, + "step": 5473 + }, + { + "epoch": 0.5675884102677096, + "grad_norm": 1.6015625, + "learning_rate": 0.0001828979526520336, + "loss": 4.3406, + "step": 5474 + }, + { + "epoch": 0.5676920983221977, + "grad_norm": 1.640625, + "learning_rate": 0.0001828918771106348, + "loss": 4.3034, + "step": 5475 + }, + { + "epoch": 0.5677957863766857, + "grad_norm": 1.53125, + "learning_rate": 0.0001828858005911995, + "loss": 4.3252, + "step": 5476 + }, + { + "epoch": 0.5678994744311738, + "grad_norm": 1.3203125, + "learning_rate": 0.00018287972309379932, + "loss": 4.3511, + "step": 5477 + }, + { + "epoch": 0.5680031624856618, + "grad_norm": 1.28125, + "learning_rate": 0.00018287364461850604, + "loss": 4.33, + "step": 5478 + }, + { + "epoch": 0.56810685054015, + "grad_norm": 1.5625, + "learning_rate": 0.00018286756516539137, + "loss": 4.3105, + "step": 5479 + }, + { + "epoch": 0.568210538594638, + "grad_norm": 1.4140625, + "learning_rate": 0.000182861484734527, + "loss": 4.2994, + "step": 5480 + }, + { + "epoch": 0.5683142266491261, + "grad_norm": 1.65625, + "learning_rate": 0.0001828554033259847, + "loss": 4.3377, + "step": 5481 + }, + { + "epoch": 0.5684179147036141, + "grad_norm": 1.5078125, + "learning_rate": 0.00018284932093983624, + "loss": 4.3631, + "step": 5482 + }, + { + "epoch": 0.5685216027581023, + "grad_norm": 1.4296875, + "learning_rate": 0.00018284323757615335, + "loss": 4.3617, + "step": 5483 + }, + { + "epoch": 0.5686252908125903, + "grad_norm": 1.34375, + "learning_rate": 0.00018283715323500786, + "loss": 4.3284, + "step": 5484 + }, + { + "epoch": 0.5687289788670784, + "grad_norm": 1.3984375, + "learning_rate": 0.0001828310679164715, + "loss": 4.353, + "step": 5485 + }, + { + "epoch": 0.5688326669215664, + "grad_norm": 1.203125, + "learning_rate": 0.0001828249816206161, + "loss": 4.36, + "step": 5486 + }, + { + "epoch": 0.5689363549760545, + "grad_norm": 1.7890625, + "learning_rate": 0.00018281889434751346, + "loss": 4.372, + "step": 5487 + }, + { + "epoch": 0.5690400430305426, + "grad_norm": 1.734375, + "learning_rate": 0.0001828128060972354, + "loss": 4.3062, + "step": 5488 + }, + { + "epoch": 0.5691437310850307, + "grad_norm": 1.390625, + "learning_rate": 0.00018280671686985377, + "loss": 4.3524, + "step": 5489 + }, + { + "epoch": 0.5692474191395187, + "grad_norm": 1.3359375, + "learning_rate": 0.00018280062666544043, + "loss": 4.3408, + "step": 5490 + }, + { + "epoch": 0.5693511071940068, + "grad_norm": 1.2265625, + "learning_rate": 0.00018279453548406723, + "loss": 4.315, + "step": 5491 + }, + { + "epoch": 0.5694547952484948, + "grad_norm": 1.1640625, + "learning_rate": 0.00018278844332580597, + "loss": 4.364, + "step": 5492 + }, + { + "epoch": 0.569558483302983, + "grad_norm": 1.5859375, + "learning_rate": 0.00018278235019072864, + "loss": 4.2915, + "step": 5493 + }, + { + "epoch": 0.5696621713574711, + "grad_norm": 1.3828125, + "learning_rate": 0.00018277625607890708, + "loss": 4.2776, + "step": 5494 + }, + { + "epoch": 0.5697658594119591, + "grad_norm": 1.546875, + "learning_rate": 0.00018277016099041318, + "loss": 4.3762, + "step": 5495 + }, + { + "epoch": 0.5698695474664472, + "grad_norm": 1.4453125, + "learning_rate": 0.00018276406492531887, + "loss": 4.3403, + "step": 5496 + }, + { + "epoch": 0.5699732355209353, + "grad_norm": 1.2421875, + "learning_rate": 0.0001827579678836961, + "loss": 4.4015, + "step": 5497 + }, + { + "epoch": 0.5700769235754234, + "grad_norm": 1.1171875, + "learning_rate": 0.00018275186986561675, + "loss": 4.3357, + "step": 5498 + }, + { + "epoch": 0.5701806116299114, + "grad_norm": 1.6171875, + "learning_rate": 0.00018274577087115285, + "loss": 4.3691, + "step": 5499 + }, + { + "epoch": 0.5702842996843995, + "grad_norm": 1.390625, + "learning_rate": 0.0001827396709003763, + "loss": 4.3375, + "step": 5500 + }, + { + "epoch": 0.5703879877388875, + "grad_norm": 1.7109375, + "learning_rate": 0.00018273356995335909, + "loss": 4.3429, + "step": 5501 + }, + { + "epoch": 0.5704916757933757, + "grad_norm": 1.546875, + "learning_rate": 0.00018272746803017323, + "loss": 4.3488, + "step": 5502 + }, + { + "epoch": 0.5705953638478637, + "grad_norm": 1.3984375, + "learning_rate": 0.0001827213651308907, + "loss": 4.3569, + "step": 5503 + }, + { + "epoch": 0.5706990519023518, + "grad_norm": 1.28125, + "learning_rate": 0.00018271526125558345, + "loss": 4.3455, + "step": 5504 + }, + { + "epoch": 0.5708027399568398, + "grad_norm": 1.4765625, + "learning_rate": 0.0001827091564043236, + "loss": 4.3558, + "step": 5505 + }, + { + "epoch": 0.570906428011328, + "grad_norm": 1.3203125, + "learning_rate": 0.00018270305057718308, + "loss": 4.3108, + "step": 5506 + }, + { + "epoch": 0.571010116065816, + "grad_norm": 1.71875, + "learning_rate": 0.00018269694377423404, + "loss": 4.3547, + "step": 5507 + }, + { + "epoch": 0.5711138041203041, + "grad_norm": 1.578125, + "learning_rate": 0.00018269083599554845, + "loss": 4.3332, + "step": 5508 + }, + { + "epoch": 0.5712174921747921, + "grad_norm": 1.5078125, + "learning_rate": 0.0001826847272411984, + "loss": 4.3429, + "step": 5509 + }, + { + "epoch": 0.5713211802292802, + "grad_norm": 1.421875, + "learning_rate": 0.000182678617511256, + "loss": 4.3294, + "step": 5510 + }, + { + "epoch": 0.5714248682837683, + "grad_norm": 1.3125, + "learning_rate": 0.00018267250680579328, + "loss": 4.3342, + "step": 5511 + }, + { + "epoch": 0.5715285563382564, + "grad_norm": 1.28125, + "learning_rate": 0.00018266639512488236, + "loss": 4.3545, + "step": 5512 + }, + { + "epoch": 0.5716322443927444, + "grad_norm": 1.6796875, + "learning_rate": 0.00018266028246859538, + "loss": 4.3752, + "step": 5513 + }, + { + "epoch": 0.5717359324472325, + "grad_norm": 1.515625, + "learning_rate": 0.00018265416883700444, + "loss": 4.3253, + "step": 5514 + }, + { + "epoch": 0.5718396205017205, + "grad_norm": 1.3828125, + "learning_rate": 0.00018264805423018164, + "loss": 4.3538, + "step": 5515 + }, + { + "epoch": 0.5719433085562087, + "grad_norm": 1.3359375, + "learning_rate": 0.00018264193864819922, + "loss": 4.3428, + "step": 5516 + }, + { + "epoch": 0.5720469966106967, + "grad_norm": 1.3671875, + "learning_rate": 0.00018263582209112925, + "loss": 4.3088, + "step": 5517 + }, + { + "epoch": 0.5721506846651848, + "grad_norm": 1.21875, + "learning_rate": 0.00018262970455904394, + "loss": 4.3636, + "step": 5518 + }, + { + "epoch": 0.5722543727196728, + "grad_norm": 1.46875, + "learning_rate": 0.00018262358605201546, + "loss": 4.3314, + "step": 5519 + }, + { + "epoch": 0.572358060774161, + "grad_norm": 1.2890625, + "learning_rate": 0.000182617466570116, + "loss": 4.3425, + "step": 5520 + }, + { + "epoch": 0.572461748828649, + "grad_norm": 1.546875, + "learning_rate": 0.00018261134611341774, + "loss": 4.3332, + "step": 5521 + }, + { + "epoch": 0.5725654368831371, + "grad_norm": 1.4375, + "learning_rate": 0.00018260522468199297, + "loss": 4.3129, + "step": 5522 + }, + { + "epoch": 0.5726691249376251, + "grad_norm": 1.3515625, + "learning_rate": 0.00018259910227591384, + "loss": 4.3197, + "step": 5523 + }, + { + "epoch": 0.5727728129921132, + "grad_norm": 1.296875, + "learning_rate": 0.0001825929788952526, + "loss": 4.3431, + "step": 5524 + }, + { + "epoch": 0.5728765010466013, + "grad_norm": 1.6015625, + "learning_rate": 0.00018258685454008154, + "loss": 4.3203, + "step": 5525 + }, + { + "epoch": 0.5729801891010894, + "grad_norm": 1.4296875, + "learning_rate": 0.00018258072921047288, + "loss": 4.3404, + "step": 5526 + }, + { + "epoch": 0.5730838771555774, + "grad_norm": 1.59375, + "learning_rate": 0.00018257460290649889, + "loss": 4.3576, + "step": 5527 + }, + { + "epoch": 0.5731875652100655, + "grad_norm": 1.5625, + "learning_rate": 0.0001825684756282319, + "loss": 4.413, + "step": 5528 + }, + { + "epoch": 0.5732912532645535, + "grad_norm": 1.2421875, + "learning_rate": 0.00018256234737574415, + "loss": 4.3611, + "step": 5529 + }, + { + "epoch": 0.5733949413190417, + "grad_norm": 1.171875, + "learning_rate": 0.000182556218149108, + "loss": 4.377, + "step": 5530 + }, + { + "epoch": 0.5734986293735297, + "grad_norm": 1.546875, + "learning_rate": 0.0001825500879483957, + "loss": 4.336, + "step": 5531 + }, + { + "epoch": 0.5736023174280178, + "grad_norm": 1.421875, + "learning_rate": 0.00018254395677367967, + "loss": 4.3242, + "step": 5532 + }, + { + "epoch": 0.5737060054825058, + "grad_norm": 1.5, + "learning_rate": 0.00018253782462503216, + "loss": 4.3437, + "step": 5533 + }, + { + "epoch": 0.573809693536994, + "grad_norm": 1.421875, + "learning_rate": 0.00018253169150252561, + "loss": 4.3471, + "step": 5534 + }, + { + "epoch": 0.573913381591482, + "grad_norm": 1.296875, + "learning_rate": 0.00018252555740623234, + "loss": 4.358, + "step": 5535 + }, + { + "epoch": 0.5740170696459701, + "grad_norm": 1.25, + "learning_rate": 0.0001825194223362247, + "loss": 4.3566, + "step": 5536 + }, + { + "epoch": 0.5741207577004581, + "grad_norm": 1.4609375, + "learning_rate": 0.0001825132862925751, + "loss": 4.3145, + "step": 5537 + }, + { + "epoch": 0.5742244457549462, + "grad_norm": 1.359375, + "learning_rate": 0.00018250714927535596, + "loss": 4.381, + "step": 5538 + }, + { + "epoch": 0.5743281338094344, + "grad_norm": 1.4453125, + "learning_rate": 0.00018250101128463965, + "loss": 4.3207, + "step": 5539 + }, + { + "epoch": 0.5744318218639224, + "grad_norm": 1.296875, + "learning_rate": 0.0001824948723204986, + "loss": 4.3251, + "step": 5540 + }, + { + "epoch": 0.5745355099184105, + "grad_norm": 1.3671875, + "learning_rate": 0.0001824887323830053, + "loss": 4.3404, + "step": 5541 + }, + { + "epoch": 0.5746391979728985, + "grad_norm": 1.2109375, + "learning_rate": 0.00018248259147223215, + "loss": 4.3781, + "step": 5542 + }, + { + "epoch": 0.5747428860273867, + "grad_norm": 1.5234375, + "learning_rate": 0.0001824764495882516, + "loss": 4.3385, + "step": 5543 + }, + { + "epoch": 0.5748465740818747, + "grad_norm": 1.40625, + "learning_rate": 0.0001824703067311361, + "loss": 4.339, + "step": 5544 + }, + { + "epoch": 0.5749502621363628, + "grad_norm": 1.46875, + "learning_rate": 0.0001824641629009582, + "loss": 4.3373, + "step": 5545 + }, + { + "epoch": 0.5750539501908508, + "grad_norm": 1.3671875, + "learning_rate": 0.0001824580180977903, + "loss": 4.2672, + "step": 5546 + }, + { + "epoch": 0.5751576382453389, + "grad_norm": 1.515625, + "learning_rate": 0.00018245187232170497, + "loss": 4.3089, + "step": 5547 + }, + { + "epoch": 0.575261326299827, + "grad_norm": 1.390625, + "learning_rate": 0.00018244572557277473, + "loss": 4.3049, + "step": 5548 + }, + { + "epoch": 0.5753650143543151, + "grad_norm": 1.515625, + "learning_rate": 0.00018243957785107204, + "loss": 4.3171, + "step": 5549 + }, + { + "epoch": 0.5754687024088031, + "grad_norm": 1.4375, + "learning_rate": 0.00018243342915666948, + "loss": 4.3574, + "step": 5550 + }, + { + "epoch": 0.5755723904632912, + "grad_norm": 1.4296875, + "learning_rate": 0.0001824272794896396, + "loss": 4.3325, + "step": 5551 + }, + { + "epoch": 0.5756760785177792, + "grad_norm": 1.328125, + "learning_rate": 0.00018242112885005494, + "loss": 4.3621, + "step": 5552 + }, + { + "epoch": 0.5757797665722674, + "grad_norm": 1.3828125, + "learning_rate": 0.0001824149772379881, + "loss": 4.3562, + "step": 5553 + }, + { + "epoch": 0.5758834546267554, + "grad_norm": 1.3203125, + "learning_rate": 0.00018240882465351163, + "loss": 4.3579, + "step": 5554 + }, + { + "epoch": 0.5759871426812435, + "grad_norm": 1.3359375, + "learning_rate": 0.00018240267109669814, + "loss": 4.3047, + "step": 5555 + }, + { + "epoch": 0.5760908307357315, + "grad_norm": 1.234375, + "learning_rate": 0.00018239651656762026, + "loss": 4.3361, + "step": 5556 + }, + { + "epoch": 0.5761945187902197, + "grad_norm": 1.4375, + "learning_rate": 0.00018239036106635056, + "loss": 4.3471, + "step": 5557 + }, + { + "epoch": 0.5762982068447077, + "grad_norm": 1.34375, + "learning_rate": 0.00018238420459296167, + "loss": 4.3176, + "step": 5558 + }, + { + "epoch": 0.5764018948991958, + "grad_norm": 1.578125, + "learning_rate": 0.00018237804714752627, + "loss": 4.3554, + "step": 5559 + }, + { + "epoch": 0.5765055829536838, + "grad_norm": 1.40625, + "learning_rate": 0.000182371888730117, + "loss": 4.3582, + "step": 5560 + }, + { + "epoch": 0.5766092710081719, + "grad_norm": 1.46875, + "learning_rate": 0.00018236572934080648, + "loss": 4.3058, + "step": 5561 + }, + { + "epoch": 0.57671295906266, + "grad_norm": 1.3515625, + "learning_rate": 0.00018235956897966747, + "loss": 4.3419, + "step": 5562 + }, + { + "epoch": 0.5768166471171481, + "grad_norm": 1.5546875, + "learning_rate": 0.00018235340764677255, + "loss": 4.3409, + "step": 5563 + }, + { + "epoch": 0.5769203351716361, + "grad_norm": 1.484375, + "learning_rate": 0.00018234724534219453, + "loss": 4.3695, + "step": 5564 + }, + { + "epoch": 0.5770240232261242, + "grad_norm": 1.375, + "learning_rate": 0.000182341082066006, + "loss": 4.3446, + "step": 5565 + }, + { + "epoch": 0.5771277112806122, + "grad_norm": 1.3125, + "learning_rate": 0.00018233491781827977, + "loss": 4.3477, + "step": 5566 + }, + { + "epoch": 0.5772313993351004, + "grad_norm": 1.3125, + "learning_rate": 0.00018232875259908854, + "loss": 4.3555, + "step": 5567 + }, + { + "epoch": 0.5773350873895884, + "grad_norm": 1.1953125, + "learning_rate": 0.00018232258640850507, + "loss": 4.3217, + "step": 5568 + }, + { + "epoch": 0.5774387754440765, + "grad_norm": 1.40625, + "learning_rate": 0.00018231641924660208, + "loss": 4.2945, + "step": 5569 + }, + { + "epoch": 0.5775424634985645, + "grad_norm": 1.296875, + "learning_rate": 0.00018231025111345233, + "loss": 4.3787, + "step": 5570 + }, + { + "epoch": 0.5776461515530527, + "grad_norm": 1.328125, + "learning_rate": 0.00018230408200912868, + "loss": 4.3239, + "step": 5571 + }, + { + "epoch": 0.5777498396075407, + "grad_norm": 1.25, + "learning_rate": 0.00018229791193370384, + "loss": 4.3548, + "step": 5572 + }, + { + "epoch": 0.5778535276620288, + "grad_norm": 1.375, + "learning_rate": 0.00018229174088725062, + "loss": 4.3668, + "step": 5573 + }, + { + "epoch": 0.5779572157165168, + "grad_norm": 1.203125, + "learning_rate": 0.00018228556886984182, + "loss": 4.3516, + "step": 5574 + }, + { + "epoch": 0.5780609037710049, + "grad_norm": 1.578125, + "learning_rate": 0.00018227939588155031, + "loss": 4.344, + "step": 5575 + }, + { + "epoch": 0.578164591825493, + "grad_norm": 1.4375, + "learning_rate": 0.00018227322192244892, + "loss": 4.3754, + "step": 5576 + }, + { + "epoch": 0.5782682798799811, + "grad_norm": 1.4453125, + "learning_rate": 0.00018226704699261047, + "loss": 4.3347, + "step": 5577 + }, + { + "epoch": 0.5783719679344691, + "grad_norm": 1.34375, + "learning_rate": 0.0001822608710921078, + "loss": 4.3517, + "step": 5578 + }, + { + "epoch": 0.5784756559889572, + "grad_norm": 1.4609375, + "learning_rate": 0.00018225469422101384, + "loss": 4.3267, + "step": 5579 + }, + { + "epoch": 0.5785793440434452, + "grad_norm": 1.28125, + "learning_rate": 0.0001822485163794014, + "loss": 4.3415, + "step": 5580 + }, + { + "epoch": 0.5786830320979334, + "grad_norm": 1.609375, + "learning_rate": 0.00018224233756734343, + "loss": 4.3439, + "step": 5581 + }, + { + "epoch": 0.5787867201524214, + "grad_norm": 1.5, + "learning_rate": 0.0001822361577849128, + "loss": 4.3165, + "step": 5582 + }, + { + "epoch": 0.5788904082069095, + "grad_norm": 1.203125, + "learning_rate": 0.00018222997703218246, + "loss": 4.3216, + "step": 5583 + }, + { + "epoch": 0.5789940962613976, + "grad_norm": 1.125, + "learning_rate": 0.0001822237953092253, + "loss": 4.3286, + "step": 5584 + }, + { + "epoch": 0.5790977843158857, + "grad_norm": 1.4609375, + "learning_rate": 0.00018221761261611423, + "loss": 4.3354, + "step": 5585 + }, + { + "epoch": 0.5792014723703738, + "grad_norm": 1.3125, + "learning_rate": 0.0001822114289529223, + "loss": 4.3254, + "step": 5586 + }, + { + "epoch": 0.5793051604248618, + "grad_norm": 1.453125, + "learning_rate": 0.00018220524431972237, + "loss": 4.3203, + "step": 5587 + }, + { + "epoch": 0.5794088484793499, + "grad_norm": 1.359375, + "learning_rate": 0.00018219905871658747, + "loss": 4.3531, + "step": 5588 + }, + { + "epoch": 0.5795125365338379, + "grad_norm": 1.203125, + "learning_rate": 0.00018219287214359055, + "loss": 4.3582, + "step": 5589 + }, + { + "epoch": 0.5796162245883261, + "grad_norm": 1.1015625, + "learning_rate": 0.00018218668460080463, + "loss": 4.3153, + "step": 5590 + }, + { + "epoch": 0.5797199126428141, + "grad_norm": 1.40625, + "learning_rate": 0.00018218049608830273, + "loss": 4.3506, + "step": 5591 + }, + { + "epoch": 0.5798236006973022, + "grad_norm": 1.2109375, + "learning_rate": 0.0001821743066061578, + "loss": 4.3519, + "step": 5592 + }, + { + "epoch": 0.5799272887517902, + "grad_norm": 1.625, + "learning_rate": 0.00018216811615444294, + "loss": 4.3325, + "step": 5593 + }, + { + "epoch": 0.5800309768062784, + "grad_norm": 1.4921875, + "learning_rate": 0.00018216192473323114, + "loss": 4.373, + "step": 5594 + }, + { + "epoch": 0.5801346648607664, + "grad_norm": 1.3984375, + "learning_rate": 0.0001821557323425955, + "loss": 4.3325, + "step": 5595 + }, + { + "epoch": 0.5802383529152545, + "grad_norm": 1.359375, + "learning_rate": 0.00018214953898260908, + "loss": 4.2523, + "step": 5596 + }, + { + "epoch": 0.5803420409697425, + "grad_norm": 1.3515625, + "learning_rate": 0.00018214334465334488, + "loss": 4.3368, + "step": 5597 + }, + { + "epoch": 0.5804457290242306, + "grad_norm": 1.25, + "learning_rate": 0.0001821371493548761, + "loss": 4.3503, + "step": 5598 + }, + { + "epoch": 0.5805494170787187, + "grad_norm": 1.5703125, + "learning_rate": 0.00018213095308727576, + "loss": 4.3115, + "step": 5599 + }, + { + "epoch": 0.5806531051332068, + "grad_norm": 1.5078125, + "learning_rate": 0.000182124755850617, + "loss": 4.3373, + "step": 5600 + }, + { + "epoch": 0.5807567931876948, + "grad_norm": 1.4609375, + "learning_rate": 0.0001821185576449729, + "loss": 4.3589, + "step": 5601 + }, + { + "epoch": 0.5808604812421829, + "grad_norm": 1.3515625, + "learning_rate": 0.00018211235847041663, + "loss": 4.3404, + "step": 5602 + }, + { + "epoch": 0.5809641692966709, + "grad_norm": 1.4453125, + "learning_rate": 0.00018210615832702133, + "loss": 4.3841, + "step": 5603 + }, + { + "epoch": 0.5810678573511591, + "grad_norm": 1.28125, + "learning_rate": 0.00018209995721486016, + "loss": 4.3259, + "step": 5604 + }, + { + "epoch": 0.5811715454056471, + "grad_norm": 1.609375, + "learning_rate": 0.00018209375513400628, + "loss": 4.3524, + "step": 5605 + }, + { + "epoch": 0.5812752334601352, + "grad_norm": 1.5, + "learning_rate": 0.00018208755208453287, + "loss": 4.3828, + "step": 5606 + }, + { + "epoch": 0.5813789215146232, + "grad_norm": 1.390625, + "learning_rate": 0.00018208134806651312, + "loss": 4.3642, + "step": 5607 + }, + { + "epoch": 0.5814826095691114, + "grad_norm": 1.3828125, + "learning_rate": 0.00018207514308002018, + "loss": 4.3149, + "step": 5608 + }, + { + "epoch": 0.5815862976235994, + "grad_norm": 1.375, + "learning_rate": 0.00018206893712512735, + "loss": 4.3453, + "step": 5609 + }, + { + "epoch": 0.5816899856780875, + "grad_norm": 1.203125, + "learning_rate": 0.00018206273020190782, + "loss": 4.343, + "step": 5610 + }, + { + "epoch": 0.5817936737325755, + "grad_norm": 1.546875, + "learning_rate": 0.0001820565223104348, + "loss": 4.3722, + "step": 5611 + }, + { + "epoch": 0.5818973617870636, + "grad_norm": 1.3984375, + "learning_rate": 0.00018205031345078156, + "loss": 4.3135, + "step": 5612 + }, + { + "epoch": 0.5820010498415517, + "grad_norm": 1.4609375, + "learning_rate": 0.00018204410362302134, + "loss": 4.2871, + "step": 5613 + }, + { + "epoch": 0.5821047378960398, + "grad_norm": 1.46875, + "learning_rate": 0.00018203789282722743, + "loss": 4.3021, + "step": 5614 + }, + { + "epoch": 0.5822084259505278, + "grad_norm": 1.0703125, + "learning_rate": 0.0001820316810634731, + "loss": 4.338, + "step": 5615 + }, + { + "epoch": 0.5823121140050159, + "grad_norm": 1.0234375, + "learning_rate": 0.00018202546833183164, + "loss": 4.2786, + "step": 5616 + }, + { + "epoch": 0.5824158020595039, + "grad_norm": 1.375, + "learning_rate": 0.00018201925463237636, + "loss": 4.2837, + "step": 5617 + }, + { + "epoch": 0.5825194901139921, + "grad_norm": 1.1875, + "learning_rate": 0.0001820130399651806, + "loss": 4.2989, + "step": 5618 + }, + { + "epoch": 0.5826231781684801, + "grad_norm": 1.6484375, + "learning_rate": 0.00018200682433031765, + "loss": 4.2944, + "step": 5619 + }, + { + "epoch": 0.5827268662229682, + "grad_norm": 1.5078125, + "learning_rate": 0.00018200060772786083, + "loss": 4.3102, + "step": 5620 + }, + { + "epoch": 0.5828305542774562, + "grad_norm": 1.1875, + "learning_rate": 0.00018199439015788356, + "loss": 4.3416, + "step": 5621 + }, + { + "epoch": 0.5829342423319444, + "grad_norm": 1.15625, + "learning_rate": 0.00018198817162045912, + "loss": 4.3264, + "step": 5622 + }, + { + "epoch": 0.5830379303864324, + "grad_norm": 1.328125, + "learning_rate": 0.00018198195211566095, + "loss": 4.3834, + "step": 5623 + }, + { + "epoch": 0.5831416184409205, + "grad_norm": 1.125, + "learning_rate": 0.00018197573164356238, + "loss": 4.3006, + "step": 5624 + }, + { + "epoch": 0.5832453064954085, + "grad_norm": 1.6796875, + "learning_rate": 0.00018196951020423683, + "loss": 4.3308, + "step": 5625 + }, + { + "epoch": 0.5833489945498966, + "grad_norm": 1.4609375, + "learning_rate": 0.00018196328779775768, + "loss": 4.3448, + "step": 5626 + }, + { + "epoch": 0.5834526826043847, + "grad_norm": 1.515625, + "learning_rate": 0.00018195706442419843, + "loss": 4.3255, + "step": 5627 + }, + { + "epoch": 0.5835563706588728, + "grad_norm": 1.375, + "learning_rate": 0.0001819508400836324, + "loss": 4.3572, + "step": 5628 + }, + { + "epoch": 0.5836600587133609, + "grad_norm": 1.5703125, + "learning_rate": 0.00018194461477613315, + "loss": 4.3235, + "step": 5629 + }, + { + "epoch": 0.5837637467678489, + "grad_norm": 1.4296875, + "learning_rate": 0.000181938388501774, + "loss": 4.3347, + "step": 5630 + }, + { + "epoch": 0.583867434822337, + "grad_norm": 1.46875, + "learning_rate": 0.00018193216126062851, + "loss": 4.322, + "step": 5631 + }, + { + "epoch": 0.5839711228768251, + "grad_norm": 1.4140625, + "learning_rate": 0.0001819259330527701, + "loss": 4.363, + "step": 5632 + }, + { + "epoch": 0.5840748109313132, + "grad_norm": 1.3046875, + "learning_rate": 0.0001819197038782723, + "loss": 4.3823, + "step": 5633 + }, + { + "epoch": 0.5841784989858012, + "grad_norm": 1.1875, + "learning_rate": 0.00018191347373720858, + "loss": 4.3072, + "step": 5634 + }, + { + "epoch": 0.5842821870402893, + "grad_norm": 1.4765625, + "learning_rate": 0.00018190724262965246, + "loss": 4.3529, + "step": 5635 + }, + { + "epoch": 0.5843858750947774, + "grad_norm": 1.328125, + "learning_rate": 0.00018190101055567744, + "loss": 4.3431, + "step": 5636 + }, + { + "epoch": 0.5844895631492655, + "grad_norm": 1.4453125, + "learning_rate": 0.0001818947775153571, + "loss": 4.3289, + "step": 5637 + }, + { + "epoch": 0.5845932512037535, + "grad_norm": 1.296875, + "learning_rate": 0.00018188854350876494, + "loss": 4.3134, + "step": 5638 + }, + { + "epoch": 0.5846969392582416, + "grad_norm": 1.2421875, + "learning_rate": 0.0001818823085359745, + "loss": 4.3689, + "step": 5639 + }, + { + "epoch": 0.5848006273127296, + "grad_norm": 1.1484375, + "learning_rate": 0.00018187607259705942, + "loss": 4.3616, + "step": 5640 + }, + { + "epoch": 0.5849043153672178, + "grad_norm": 1.6171875, + "learning_rate": 0.0001818698356920932, + "loss": 4.3581, + "step": 5641 + }, + { + "epoch": 0.5850080034217058, + "grad_norm": 1.421875, + "learning_rate": 0.00018186359782114945, + "loss": 4.3455, + "step": 5642 + }, + { + "epoch": 0.5851116914761939, + "grad_norm": 1.40625, + "learning_rate": 0.00018185735898430182, + "loss": 4.3584, + "step": 5643 + }, + { + "epoch": 0.5852153795306819, + "grad_norm": 1.3515625, + "learning_rate": 0.00018185111918162384, + "loss": 4.3607, + "step": 5644 + }, + { + "epoch": 0.58531906758517, + "grad_norm": 1.265625, + "learning_rate": 0.00018184487841318918, + "loss": 4.3458, + "step": 5645 + }, + { + "epoch": 0.5854227556396581, + "grad_norm": 1.15625, + "learning_rate": 0.00018183863667907147, + "loss": 4.3571, + "step": 5646 + }, + { + "epoch": 0.5855264436941462, + "grad_norm": 1.4140625, + "learning_rate": 0.00018183239397934436, + "loss": 4.3665, + "step": 5647 + }, + { + "epoch": 0.5856301317486342, + "grad_norm": 1.34375, + "learning_rate": 0.0001818261503140815, + "loss": 4.3623, + "step": 5648 + }, + { + "epoch": 0.5857338198031223, + "grad_norm": 1.3828125, + "learning_rate": 0.00018181990568335657, + "loss": 4.3244, + "step": 5649 + }, + { + "epoch": 0.5858375078576104, + "grad_norm": 1.2578125, + "learning_rate": 0.00018181366008724324, + "loss": 4.2745, + "step": 5650 + }, + { + "epoch": 0.5859411959120985, + "grad_norm": 1.3984375, + "learning_rate": 0.00018180741352581518, + "loss": 4.3148, + "step": 5651 + }, + { + "epoch": 0.5860448839665865, + "grad_norm": 1.296875, + "learning_rate": 0.00018180116599914614, + "loss": 4.3239, + "step": 5652 + }, + { + "epoch": 0.5861485720210746, + "grad_norm": 1.53125, + "learning_rate": 0.00018179491750730978, + "loss": 4.3469, + "step": 5653 + }, + { + "epoch": 0.5862522600755626, + "grad_norm": 1.3671875, + "learning_rate": 0.00018178866805037988, + "loss": 4.3679, + "step": 5654 + }, + { + "epoch": 0.5863559481300508, + "grad_norm": 1.4609375, + "learning_rate": 0.00018178241762843014, + "loss": 4.3314, + "step": 5655 + }, + { + "epoch": 0.5864596361845388, + "grad_norm": 1.328125, + "learning_rate": 0.00018177616624153432, + "loss": 4.3647, + "step": 5656 + }, + { + "epoch": 0.5865633242390269, + "grad_norm": 1.421875, + "learning_rate": 0.00018176991388976622, + "loss": 4.3769, + "step": 5657 + }, + { + "epoch": 0.5866670122935149, + "grad_norm": 1.2890625, + "learning_rate": 0.0001817636605731995, + "loss": 4.3395, + "step": 5658 + }, + { + "epoch": 0.586770700348003, + "grad_norm": 1.453125, + "learning_rate": 0.00018175740629190805, + "loss": 4.3378, + "step": 5659 + }, + { + "epoch": 0.5868743884024911, + "grad_norm": 1.3359375, + "learning_rate": 0.0001817511510459656, + "loss": 4.3157, + "step": 5660 + }, + { + "epoch": 0.5869780764569792, + "grad_norm": 1.3828125, + "learning_rate": 0.00018174489483544604, + "loss": 4.3298, + "step": 5661 + }, + { + "epoch": 0.5870817645114672, + "grad_norm": 1.2734375, + "learning_rate": 0.00018173863766042308, + "loss": 4.3223, + "step": 5662 + }, + { + "epoch": 0.5871854525659553, + "grad_norm": 1.359375, + "learning_rate": 0.00018173237952097063, + "loss": 4.3173, + "step": 5663 + }, + { + "epoch": 0.5872891406204434, + "grad_norm": 1.28125, + "learning_rate": 0.00018172612041716246, + "loss": 4.3169, + "step": 5664 + }, + { + "epoch": 0.5873928286749315, + "grad_norm": 1.46875, + "learning_rate": 0.0001817198603490725, + "loss": 4.3136, + "step": 5665 + }, + { + "epoch": 0.5874965167294195, + "grad_norm": 1.3125, + "learning_rate": 0.00018171359931677453, + "loss": 4.3398, + "step": 5666 + }, + { + "epoch": 0.5876002047839076, + "grad_norm": 1.7109375, + "learning_rate": 0.00018170733732034248, + "loss": 4.3699, + "step": 5667 + }, + { + "epoch": 0.5877038928383956, + "grad_norm": 1.4765625, + "learning_rate": 0.00018170107435985021, + "loss": 4.3358, + "step": 5668 + }, + { + "epoch": 0.5878075808928838, + "grad_norm": 1.5, + "learning_rate": 0.00018169481043537166, + "loss": 4.3863, + "step": 5669 + }, + { + "epoch": 0.5879112689473718, + "grad_norm": 1.375, + "learning_rate": 0.00018168854554698064, + "loss": 4.3141, + "step": 5670 + }, + { + "epoch": 0.5880149570018599, + "grad_norm": 1.40625, + "learning_rate": 0.00018168227969475118, + "loss": 4.3231, + "step": 5671 + }, + { + "epoch": 0.5881186450563479, + "grad_norm": 1.359375, + "learning_rate": 0.00018167601287875712, + "loss": 4.3552, + "step": 5672 + }, + { + "epoch": 0.588222333110836, + "grad_norm": 1.4296875, + "learning_rate": 0.00018166974509907247, + "loss": 4.3648, + "step": 5673 + }, + { + "epoch": 0.5883260211653242, + "grad_norm": 1.2734375, + "learning_rate": 0.00018166347635577117, + "loss": 4.3125, + "step": 5674 + }, + { + "epoch": 0.5884297092198122, + "grad_norm": 1.453125, + "learning_rate": 0.00018165720664892714, + "loss": 4.2986, + "step": 5675 + }, + { + "epoch": 0.5885333972743003, + "grad_norm": 1.3984375, + "learning_rate": 0.0001816509359786144, + "loss": 4.3168, + "step": 5676 + }, + { + "epoch": 0.5886370853287883, + "grad_norm": 1.46875, + "learning_rate": 0.00018164466434490692, + "loss": 4.3265, + "step": 5677 + }, + { + "epoch": 0.5887407733832765, + "grad_norm": 1.4140625, + "learning_rate": 0.00018163839174787874, + "loss": 4.3073, + "step": 5678 + }, + { + "epoch": 0.5888444614377645, + "grad_norm": 1.4375, + "learning_rate": 0.00018163211818760379, + "loss": 4.3157, + "step": 5679 + }, + { + "epoch": 0.5889481494922526, + "grad_norm": 1.375, + "learning_rate": 0.00018162584366415615, + "loss": 4.3241, + "step": 5680 + }, + { + "epoch": 0.5890518375467406, + "grad_norm": 1.5234375, + "learning_rate": 0.00018161956817760983, + "loss": 4.3372, + "step": 5681 + }, + { + "epoch": 0.5891555256012287, + "grad_norm": 1.4375, + "learning_rate": 0.0001816132917280389, + "loss": 4.3576, + "step": 5682 + }, + { + "epoch": 0.5892592136557168, + "grad_norm": 1.609375, + "learning_rate": 0.00018160701431551736, + "loss": 4.3108, + "step": 5683 + }, + { + "epoch": 0.5893629017102049, + "grad_norm": 1.515625, + "learning_rate": 0.00018160073594011936, + "loss": 4.3198, + "step": 5684 + }, + { + "epoch": 0.5894665897646929, + "grad_norm": 1.390625, + "learning_rate": 0.00018159445660191888, + "loss": 4.3091, + "step": 5685 + }, + { + "epoch": 0.589570277819181, + "grad_norm": 1.28125, + "learning_rate": 0.0001815881763009901, + "loss": 4.3254, + "step": 5686 + }, + { + "epoch": 0.589673965873669, + "grad_norm": 1.46875, + "learning_rate": 0.00018158189503740709, + "loss": 4.3508, + "step": 5687 + }, + { + "epoch": 0.5897776539281572, + "grad_norm": 1.3984375, + "learning_rate": 0.00018157561281124392, + "loss": 4.3215, + "step": 5688 + }, + { + "epoch": 0.5898813419826452, + "grad_norm": 1.4140625, + "learning_rate": 0.00018156932962257475, + "loss": 4.2878, + "step": 5689 + }, + { + "epoch": 0.5899850300371333, + "grad_norm": 1.3984375, + "learning_rate": 0.00018156304547147374, + "loss": 4.3404, + "step": 5690 + }, + { + "epoch": 0.5900887180916213, + "grad_norm": 1.21875, + "learning_rate": 0.00018155676035801498, + "loss": 4.3673, + "step": 5691 + }, + { + "epoch": 0.5901924061461095, + "grad_norm": 1.171875, + "learning_rate": 0.00018155047428227268, + "loss": 4.3134, + "step": 5692 + }, + { + "epoch": 0.5902960942005975, + "grad_norm": 1.3828125, + "learning_rate": 0.000181544187244321, + "loss": 4.3343, + "step": 5693 + }, + { + "epoch": 0.5903997822550856, + "grad_norm": 1.265625, + "learning_rate": 0.00018153789924423407, + "loss": 4.329, + "step": 5694 + }, + { + "epoch": 0.5905034703095736, + "grad_norm": 1.421875, + "learning_rate": 0.00018153161028208614, + "loss": 4.3077, + "step": 5695 + }, + { + "epoch": 0.5906071583640617, + "grad_norm": 1.3203125, + "learning_rate": 0.00018152532035795136, + "loss": 4.308, + "step": 5696 + }, + { + "epoch": 0.5907108464185498, + "grad_norm": 1.3984375, + "learning_rate": 0.00018151902947190402, + "loss": 4.3066, + "step": 5697 + }, + { + "epoch": 0.5908145344730379, + "grad_norm": 1.3359375, + "learning_rate": 0.00018151273762401825, + "loss": 4.3872, + "step": 5698 + }, + { + "epoch": 0.5909182225275259, + "grad_norm": 1.3359375, + "learning_rate": 0.0001815064448143684, + "loss": 4.3014, + "step": 5699 + }, + { + "epoch": 0.591021910582014, + "grad_norm": 1.21875, + "learning_rate": 0.0001815001510430286, + "loss": 4.3571, + "step": 5700 + }, + { + "epoch": 0.591125598636502, + "grad_norm": 1.375, + "learning_rate": 0.00018149385631007322, + "loss": 4.3611, + "step": 5701 + }, + { + "epoch": 0.5912292866909902, + "grad_norm": 1.265625, + "learning_rate": 0.00018148756061557646, + "loss": 4.3576, + "step": 5702 + }, + { + "epoch": 0.5913329747454782, + "grad_norm": 1.2421875, + "learning_rate": 0.0001814812639596126, + "loss": 4.3244, + "step": 5703 + }, + { + "epoch": 0.5914366627999663, + "grad_norm": 1.1953125, + "learning_rate": 0.00018147496634225596, + "loss": 4.3498, + "step": 5704 + }, + { + "epoch": 0.5915403508544543, + "grad_norm": 1.2578125, + "learning_rate": 0.00018146866776358084, + "loss": 4.3045, + "step": 5705 + }, + { + "epoch": 0.5916440389089425, + "grad_norm": 1.140625, + "learning_rate": 0.0001814623682236616, + "loss": 4.398, + "step": 5706 + }, + { + "epoch": 0.5917477269634305, + "grad_norm": 1.2890625, + "learning_rate": 0.00018145606772257246, + "loss": 4.36, + "step": 5707 + }, + { + "epoch": 0.5918514150179186, + "grad_norm": 1.1875, + "learning_rate": 0.00018144976626038785, + "loss": 4.3212, + "step": 5708 + }, + { + "epoch": 0.5919551030724066, + "grad_norm": 1.3125, + "learning_rate": 0.00018144346383718211, + "loss": 4.3165, + "step": 5709 + }, + { + "epoch": 0.5920587911268947, + "grad_norm": 1.140625, + "learning_rate": 0.00018143716045302956, + "loss": 4.3158, + "step": 5710 + }, + { + "epoch": 0.5921624791813828, + "grad_norm": 1.34375, + "learning_rate": 0.0001814308561080046, + "loss": 4.3757, + "step": 5711 + }, + { + "epoch": 0.5922661672358709, + "grad_norm": 1.2421875, + "learning_rate": 0.00018142455080218163, + "loss": 4.3474, + "step": 5712 + }, + { + "epoch": 0.5923698552903589, + "grad_norm": 1.390625, + "learning_rate": 0.00018141824453563504, + "loss": 4.3471, + "step": 5713 + }, + { + "epoch": 0.592473543344847, + "grad_norm": 1.3203125, + "learning_rate": 0.00018141193730843923, + "loss": 4.3295, + "step": 5714 + }, + { + "epoch": 0.592577231399335, + "grad_norm": 1.3984375, + "learning_rate": 0.00018140562912066858, + "loss": 4.2494, + "step": 5715 + }, + { + "epoch": 0.5926809194538232, + "grad_norm": 1.296875, + "learning_rate": 0.00018139931997239757, + "loss": 4.3036, + "step": 5716 + }, + { + "epoch": 0.5927846075083113, + "grad_norm": 1.3359375, + "learning_rate": 0.00018139300986370064, + "loss": 4.3459, + "step": 5717 + }, + { + "epoch": 0.5928882955627993, + "grad_norm": 1.2578125, + "learning_rate": 0.00018138669879465223, + "loss": 4.3585, + "step": 5718 + }, + { + "epoch": 0.5929919836172874, + "grad_norm": 1.4296875, + "learning_rate": 0.0001813803867653268, + "loss": 4.3598, + "step": 5719 + }, + { + "epoch": 0.5930956716717755, + "grad_norm": 1.2890625, + "learning_rate": 0.00018137407377579883, + "loss": 4.3545, + "step": 5720 + }, + { + "epoch": 0.5931993597262636, + "grad_norm": 1.34375, + "learning_rate": 0.00018136775982614277, + "loss": 4.334, + "step": 5721 + }, + { + "epoch": 0.5933030477807516, + "grad_norm": 1.2109375, + "learning_rate": 0.00018136144491643318, + "loss": 4.3667, + "step": 5722 + }, + { + "epoch": 0.5934067358352397, + "grad_norm": 1.453125, + "learning_rate": 0.00018135512904674458, + "loss": 4.3182, + "step": 5723 + }, + { + "epoch": 0.5935104238897277, + "grad_norm": 1.34375, + "learning_rate": 0.0001813488122171514, + "loss": 4.3276, + "step": 5724 + }, + { + "epoch": 0.5936141119442159, + "grad_norm": 1.3046875, + "learning_rate": 0.00018134249442772825, + "loss": 4.2957, + "step": 5725 + }, + { + "epoch": 0.5937177999987039, + "grad_norm": 1.3125, + "learning_rate": 0.00018133617567854966, + "loss": 4.3233, + "step": 5726 + }, + { + "epoch": 0.593821488053192, + "grad_norm": 1.234375, + "learning_rate": 0.00018132985596969013, + "loss": 4.3506, + "step": 5727 + }, + { + "epoch": 0.59392517610768, + "grad_norm": 1.1953125, + "learning_rate": 0.00018132353530122433, + "loss": 4.3018, + "step": 5728 + }, + { + "epoch": 0.5940288641621682, + "grad_norm": 1.21875, + "learning_rate": 0.00018131721367322672, + "loss": 4.3681, + "step": 5729 + }, + { + "epoch": 0.5941325522166562, + "grad_norm": 1.0859375, + "learning_rate": 0.00018131089108577197, + "loss": 4.3353, + "step": 5730 + }, + { + "epoch": 0.5942362402711443, + "grad_norm": 1.4140625, + "learning_rate": 0.00018130456753893466, + "loss": 4.336, + "step": 5731 + }, + { + "epoch": 0.5943399283256323, + "grad_norm": 1.234375, + "learning_rate": 0.00018129824303278938, + "loss": 4.3253, + "step": 5732 + }, + { + "epoch": 0.5944436163801204, + "grad_norm": 1.421875, + "learning_rate": 0.00018129191756741076, + "loss": 4.3457, + "step": 5733 + }, + { + "epoch": 0.5945473044346085, + "grad_norm": 1.359375, + "learning_rate": 0.00018128559114287347, + "loss": 4.3569, + "step": 5734 + }, + { + "epoch": 0.5946509924890966, + "grad_norm": 1.1953125, + "learning_rate": 0.0001812792637592521, + "loss": 4.3368, + "step": 5735 + }, + { + "epoch": 0.5947546805435846, + "grad_norm": 1.0546875, + "learning_rate": 0.00018127293541662137, + "loss": 4.3151, + "step": 5736 + }, + { + "epoch": 0.5948583685980727, + "grad_norm": 1.40625, + "learning_rate": 0.00018126660611505587, + "loss": 4.3404, + "step": 5737 + }, + { + "epoch": 0.5949620566525607, + "grad_norm": 1.234375, + "learning_rate": 0.00018126027585463038, + "loss": 4.3155, + "step": 5738 + }, + { + "epoch": 0.5950657447070489, + "grad_norm": 1.5234375, + "learning_rate": 0.00018125394463541948, + "loss": 4.3546, + "step": 5739 + }, + { + "epoch": 0.5951694327615369, + "grad_norm": 1.4609375, + "learning_rate": 0.0001812476124574979, + "loss": 4.2847, + "step": 5740 + }, + { + "epoch": 0.595273120816025, + "grad_norm": 1.2109375, + "learning_rate": 0.0001812412793209404, + "loss": 4.338, + "step": 5741 + }, + { + "epoch": 0.595376808870513, + "grad_norm": 1.2265625, + "learning_rate": 0.0001812349452258217, + "loss": 4.3296, + "step": 5742 + }, + { + "epoch": 0.5954804969250012, + "grad_norm": 1.3515625, + "learning_rate": 0.00018122861017221654, + "loss": 4.3893, + "step": 5743 + }, + { + "epoch": 0.5955841849794892, + "grad_norm": 1.2109375, + "learning_rate": 0.00018122227416019957, + "loss": 4.3603, + "step": 5744 + }, + { + "epoch": 0.5956878730339773, + "grad_norm": 1.421875, + "learning_rate": 0.00018121593718984567, + "loss": 4.2856, + "step": 5745 + }, + { + "epoch": 0.5957915610884653, + "grad_norm": 1.3359375, + "learning_rate": 0.00018120959926122953, + "loss": 4.3316, + "step": 5746 + }, + { + "epoch": 0.5958952491429534, + "grad_norm": 1.25, + "learning_rate": 0.000181203260374426, + "loss": 4.3399, + "step": 5747 + }, + { + "epoch": 0.5959989371974415, + "grad_norm": 1.1953125, + "learning_rate": 0.00018119692052950977, + "loss": 4.349, + "step": 5748 + }, + { + "epoch": 0.5961026252519296, + "grad_norm": 1.3671875, + "learning_rate": 0.0001811905797265558, + "loss": 4.345, + "step": 5749 + }, + { + "epoch": 0.5962063133064176, + "grad_norm": 1.2109375, + "learning_rate": 0.00018118423796563874, + "loss": 4.3052, + "step": 5750 + }, + { + "epoch": 0.5963100013609057, + "grad_norm": 1.40625, + "learning_rate": 0.00018117789524683348, + "loss": 4.2979, + "step": 5751 + }, + { + "epoch": 0.5964136894153937, + "grad_norm": 1.2890625, + "learning_rate": 0.0001811715515702149, + "loss": 4.3457, + "step": 5752 + }, + { + "epoch": 0.5965173774698819, + "grad_norm": 1.34375, + "learning_rate": 0.0001811652069358578, + "loss": 4.3463, + "step": 5753 + }, + { + "epoch": 0.5966210655243699, + "grad_norm": 1.2421875, + "learning_rate": 0.00018115886134383705, + "loss": 4.299, + "step": 5754 + }, + { + "epoch": 0.596724753578858, + "grad_norm": 1.4375, + "learning_rate": 0.00018115251479422755, + "loss": 4.324, + "step": 5755 + }, + { + "epoch": 0.596828441633346, + "grad_norm": 1.2890625, + "learning_rate": 0.00018114616728710415, + "loss": 4.3441, + "step": 5756 + }, + { + "epoch": 0.5969321296878342, + "grad_norm": 1.484375, + "learning_rate": 0.00018113981882254173, + "loss": 4.3491, + "step": 5757 + }, + { + "epoch": 0.5970358177423222, + "grad_norm": 1.3671875, + "learning_rate": 0.0001811334694006152, + "loss": 4.3078, + "step": 5758 + }, + { + "epoch": 0.5971395057968103, + "grad_norm": 1.4296875, + "learning_rate": 0.00018112711902139954, + "loss": 4.3105, + "step": 5759 + }, + { + "epoch": 0.5972431938512983, + "grad_norm": 1.34375, + "learning_rate": 0.0001811207676849696, + "loss": 4.3357, + "step": 5760 + }, + { + "epoch": 0.5973468819057864, + "grad_norm": 1.5390625, + "learning_rate": 0.00018111441539140038, + "loss": 4.3783, + "step": 5761 + }, + { + "epoch": 0.5974505699602746, + "grad_norm": 1.5, + "learning_rate": 0.00018110806214076676, + "loss": 4.3518, + "step": 5762 + }, + { + "epoch": 0.5975542580147626, + "grad_norm": 1.21875, + "learning_rate": 0.00018110170793314377, + "loss": 4.2929, + "step": 5763 + }, + { + "epoch": 0.5976579460692507, + "grad_norm": 1.1875, + "learning_rate": 0.00018109535276860633, + "loss": 4.3141, + "step": 5764 + }, + { + "epoch": 0.5977616341237387, + "grad_norm": 1.4609375, + "learning_rate": 0.0001810889966472295, + "loss": 4.3106, + "step": 5765 + }, + { + "epoch": 0.5978653221782269, + "grad_norm": 1.328125, + "learning_rate": 0.0001810826395690882, + "loss": 4.33, + "step": 5766 + }, + { + "epoch": 0.5979690102327149, + "grad_norm": 1.40625, + "learning_rate": 0.00018107628153425745, + "loss": 4.3587, + "step": 5767 + }, + { + "epoch": 0.598072698287203, + "grad_norm": 1.3359375, + "learning_rate": 0.00018106992254281225, + "loss": 4.3539, + "step": 5768 + }, + { + "epoch": 0.598176386341691, + "grad_norm": 1.3671875, + "learning_rate": 0.0001810635625948277, + "loss": 4.3211, + "step": 5769 + }, + { + "epoch": 0.5982800743961791, + "grad_norm": 1.3203125, + "learning_rate": 0.0001810572016903788, + "loss": 4.3466, + "step": 5770 + }, + { + "epoch": 0.5983837624506672, + "grad_norm": 1.265625, + "learning_rate": 0.00018105083982954058, + "loss": 4.3471, + "step": 5771 + }, + { + "epoch": 0.5984874505051553, + "grad_norm": 1.140625, + "learning_rate": 0.00018104447701238814, + "loss": 4.3135, + "step": 5772 + }, + { + "epoch": 0.5985911385596433, + "grad_norm": 1.4375, + "learning_rate": 0.00018103811323899653, + "loss": 4.3606, + "step": 5773 + }, + { + "epoch": 0.5986948266141314, + "grad_norm": 1.390625, + "learning_rate": 0.00018103174850944085, + "loss": 4.3562, + "step": 5774 + }, + { + "epoch": 0.5987985146686194, + "grad_norm": 1.1875, + "learning_rate": 0.00018102538282379618, + "loss": 4.2699, + "step": 5775 + }, + { + "epoch": 0.5989022027231076, + "grad_norm": 1.1875, + "learning_rate": 0.00018101901618213767, + "loss": 4.3522, + "step": 5776 + }, + { + "epoch": 0.5990058907775956, + "grad_norm": 1.21875, + "learning_rate": 0.00018101264858454036, + "loss": 4.3327, + "step": 5777 + }, + { + "epoch": 0.5991095788320837, + "grad_norm": 1.1015625, + "learning_rate": 0.00018100628003107948, + "loss": 4.3317, + "step": 5778 + }, + { + "epoch": 0.5992132668865717, + "grad_norm": 1.390625, + "learning_rate": 0.0001809999105218301, + "loss": 4.3271, + "step": 5779 + }, + { + "epoch": 0.5993169549410599, + "grad_norm": 1.2578125, + "learning_rate": 0.0001809935400568674, + "loss": 4.3283, + "step": 5780 + }, + { + "epoch": 0.5994206429955479, + "grad_norm": 1.171875, + "learning_rate": 0.0001809871686362665, + "loss": 4.3304, + "step": 5781 + }, + { + "epoch": 0.599524331050036, + "grad_norm": 1.1015625, + "learning_rate": 0.0001809807962601027, + "loss": 4.3292, + "step": 5782 + }, + { + "epoch": 0.599628019104524, + "grad_norm": 1.25, + "learning_rate": 0.00018097442292845106, + "loss": 4.344, + "step": 5783 + }, + { + "epoch": 0.5997317071590121, + "grad_norm": 1.0703125, + "learning_rate": 0.0001809680486413868, + "loss": 4.3426, + "step": 5784 + }, + { + "epoch": 0.5998353952135002, + "grad_norm": 1.484375, + "learning_rate": 0.0001809616733989852, + "loss": 4.3199, + "step": 5785 + }, + { + "epoch": 0.5999390832679883, + "grad_norm": 1.40625, + "learning_rate": 0.0001809552972013214, + "loss": 4.3017, + "step": 5786 + }, + { + "epoch": 0.6000427713224763, + "grad_norm": 1.265625, + "learning_rate": 0.00018094892004847068, + "loss": 4.289, + "step": 5787 + }, + { + "epoch": 0.6001464593769644, + "grad_norm": 1.171875, + "learning_rate": 0.00018094254194050827, + "loss": 4.3317, + "step": 5788 + }, + { + "epoch": 0.6002501474314524, + "grad_norm": 1.3046875, + "learning_rate": 0.00018093616287750942, + "loss": 4.3347, + "step": 5789 + }, + { + "epoch": 0.6003538354859406, + "grad_norm": 1.15625, + "learning_rate": 0.00018092978285954943, + "loss": 4.3655, + "step": 5790 + }, + { + "epoch": 0.6004575235404286, + "grad_norm": 1.4140625, + "learning_rate": 0.0001809234018867035, + "loss": 4.3226, + "step": 5791 + }, + { + "epoch": 0.6005612115949167, + "grad_norm": 1.359375, + "learning_rate": 0.000180917019959047, + "loss": 4.3014, + "step": 5792 + }, + { + "epoch": 0.6006648996494047, + "grad_norm": 1.375, + "learning_rate": 0.0001809106370766552, + "loss": 4.3476, + "step": 5793 + }, + { + "epoch": 0.6007685877038929, + "grad_norm": 1.375, + "learning_rate": 0.0001809042532396034, + "loss": 4.3053, + "step": 5794 + }, + { + "epoch": 0.6008722757583809, + "grad_norm": 1.1953125, + "learning_rate": 0.00018089786844796693, + "loss": 4.3442, + "step": 5795 + }, + { + "epoch": 0.600975963812869, + "grad_norm": 1.1015625, + "learning_rate": 0.00018089148270182111, + "loss": 4.3391, + "step": 5796 + }, + { + "epoch": 0.601079651867357, + "grad_norm": 1.28125, + "learning_rate": 0.00018088509600124134, + "loss": 4.3439, + "step": 5797 + }, + { + "epoch": 0.6011833399218451, + "grad_norm": 1.140625, + "learning_rate": 0.0001808787083463029, + "loss": 4.3031, + "step": 5798 + }, + { + "epoch": 0.6012870279763332, + "grad_norm": 1.40625, + "learning_rate": 0.0001808723197370812, + "loss": 4.3131, + "step": 5799 + }, + { + "epoch": 0.6013907160308213, + "grad_norm": 1.3125, + "learning_rate": 0.00018086593017365164, + "loss": 4.3422, + "step": 5800 + }, + { + "epoch": 0.6014944040853093, + "grad_norm": 1.125, + "learning_rate": 0.00018085953965608952, + "loss": 4.3209, + "step": 5801 + }, + { + "epoch": 0.6015980921397974, + "grad_norm": 1.125, + "learning_rate": 0.00018085314818447036, + "loss": 4.3095, + "step": 5802 + }, + { + "epoch": 0.6017017801942854, + "grad_norm": 1.234375, + "learning_rate": 0.00018084675575886952, + "loss": 4.3618, + "step": 5803 + }, + { + "epoch": 0.6018054682487736, + "grad_norm": 1.0703125, + "learning_rate": 0.00018084036237936237, + "loss": 4.3659, + "step": 5804 + }, + { + "epoch": 0.6019091563032616, + "grad_norm": 1.65625, + "learning_rate": 0.00018083396804602443, + "loss": 4.3087, + "step": 5805 + }, + { + "epoch": 0.6020128443577497, + "grad_norm": 1.5546875, + "learning_rate": 0.00018082757275893113, + "loss": 4.3818, + "step": 5806 + }, + { + "epoch": 0.6021165324122378, + "grad_norm": 1.265625, + "learning_rate": 0.0001808211765181579, + "loss": 4.3672, + "step": 5807 + }, + { + "epoch": 0.6022202204667259, + "grad_norm": 1.265625, + "learning_rate": 0.0001808147793237802, + "loss": 4.3323, + "step": 5808 + }, + { + "epoch": 0.602323908521214, + "grad_norm": 1.21875, + "learning_rate": 0.00018080838117587352, + "loss": 4.3476, + "step": 5809 + }, + { + "epoch": 0.602427596575702, + "grad_norm": 1.0859375, + "learning_rate": 0.0001808019820745134, + "loss": 4.3119, + "step": 5810 + }, + { + "epoch": 0.6025312846301901, + "grad_norm": 1.4609375, + "learning_rate": 0.00018079558201977526, + "loss": 4.3303, + "step": 5811 + }, + { + "epoch": 0.6026349726846781, + "grad_norm": 1.265625, + "learning_rate": 0.0001807891810117347, + "loss": 4.3118, + "step": 5812 + }, + { + "epoch": 0.6027386607391663, + "grad_norm": 1.4140625, + "learning_rate": 0.0001807827790504672, + "loss": 4.3289, + "step": 5813 + }, + { + "epoch": 0.6028423487936543, + "grad_norm": 1.3515625, + "learning_rate": 0.00018077637613604826, + "loss": 4.2843, + "step": 5814 + }, + { + "epoch": 0.6029460368481424, + "grad_norm": 1.203125, + "learning_rate": 0.0001807699722685535, + "loss": 4.3415, + "step": 5815 + }, + { + "epoch": 0.6030497249026304, + "grad_norm": 1.1640625, + "learning_rate": 0.00018076356744805842, + "loss": 4.2981, + "step": 5816 + }, + { + "epoch": 0.6031534129571186, + "grad_norm": 1.265625, + "learning_rate": 0.00018075716167463863, + "loss": 4.2897, + "step": 5817 + }, + { + "epoch": 0.6032571010116066, + "grad_norm": 1.171875, + "learning_rate": 0.0001807507549483697, + "loss": 4.3263, + "step": 5818 + }, + { + "epoch": 0.6033607890660947, + "grad_norm": 1.4765625, + "learning_rate": 0.0001807443472693272, + "loss": 4.3334, + "step": 5819 + }, + { + "epoch": 0.6034644771205827, + "grad_norm": 1.3828125, + "learning_rate": 0.00018073793863758675, + "loss": 4.3286, + "step": 5820 + }, + { + "epoch": 0.6035681651750708, + "grad_norm": 1.296875, + "learning_rate": 0.00018073152905322397, + "loss": 4.3683, + "step": 5821 + }, + { + "epoch": 0.6036718532295589, + "grad_norm": 1.234375, + "learning_rate": 0.00018072511851631448, + "loss": 4.3507, + "step": 5822 + }, + { + "epoch": 0.603775541284047, + "grad_norm": 1.2734375, + "learning_rate": 0.00018071870702693397, + "loss": 4.2915, + "step": 5823 + }, + { + "epoch": 0.603879229338535, + "grad_norm": 1.1484375, + "learning_rate": 0.000180712294585158, + "loss": 4.3375, + "step": 5824 + }, + { + "epoch": 0.6039829173930231, + "grad_norm": 1.375, + "learning_rate": 0.00018070588119106228, + "loss": 4.3278, + "step": 5825 + }, + { + "epoch": 0.6040866054475111, + "grad_norm": 1.28125, + "learning_rate": 0.00018069946684472248, + "loss": 4.3463, + "step": 5826 + }, + { + "epoch": 0.6041902935019993, + "grad_norm": 1.2578125, + "learning_rate": 0.00018069305154621424, + "loss": 4.3182, + "step": 5827 + }, + { + "epoch": 0.6042939815564873, + "grad_norm": 1.25, + "learning_rate": 0.00018068663529561331, + "loss": 4.3293, + "step": 5828 + }, + { + "epoch": 0.6043976696109754, + "grad_norm": 1.1875, + "learning_rate": 0.00018068021809299536, + "loss": 4.3106, + "step": 5829 + }, + { + "epoch": 0.6045013576654634, + "grad_norm": 1.109375, + "learning_rate": 0.00018067379993843617, + "loss": 4.3078, + "step": 5830 + }, + { + "epoch": 0.6046050457199516, + "grad_norm": 1.375, + "learning_rate": 0.00018066738083201135, + "loss": 4.3269, + "step": 5831 + }, + { + "epoch": 0.6047087337744396, + "grad_norm": 1.2578125, + "learning_rate": 0.00018066096077379675, + "loss": 4.321, + "step": 5832 + }, + { + "epoch": 0.6048124218289277, + "grad_norm": 1.28125, + "learning_rate": 0.00018065453976386805, + "loss": 4.3406, + "step": 5833 + }, + { + "epoch": 0.6049161098834157, + "grad_norm": 1.234375, + "learning_rate": 0.00018064811780230103, + "loss": 4.3474, + "step": 5834 + }, + { + "epoch": 0.6050197979379038, + "grad_norm": 1.28125, + "learning_rate": 0.0001806416948891715, + "loss": 4.3533, + "step": 5835 + }, + { + "epoch": 0.6051234859923919, + "grad_norm": 1.171875, + "learning_rate": 0.0001806352710245552, + "loss": 4.3238, + "step": 5836 + }, + { + "epoch": 0.60522717404688, + "grad_norm": 1.3203125, + "learning_rate": 0.00018062884620852792, + "loss": 4.3164, + "step": 5837 + }, + { + "epoch": 0.605330862101368, + "grad_norm": 1.2421875, + "learning_rate": 0.00018062242044116552, + "loss": 4.3437, + "step": 5838 + }, + { + "epoch": 0.6054345501558561, + "grad_norm": 1.4609375, + "learning_rate": 0.00018061599372254375, + "loss": 4.2953, + "step": 5839 + }, + { + "epoch": 0.6055382382103441, + "grad_norm": 1.3515625, + "learning_rate": 0.0001806095660527385, + "loss": 4.3134, + "step": 5840 + }, + { + "epoch": 0.6056419262648323, + "grad_norm": 1.1171875, + "learning_rate": 0.00018060313743182554, + "loss": 4.3519, + "step": 5841 + }, + { + "epoch": 0.6057456143193203, + "grad_norm": 1.203125, + "learning_rate": 0.00018059670785988075, + "loss": 4.355, + "step": 5842 + }, + { + "epoch": 0.6058493023738084, + "grad_norm": 1.1640625, + "learning_rate": 0.00018059027733698005, + "loss": 4.3502, + "step": 5843 + }, + { + "epoch": 0.6059529904282964, + "grad_norm": 1.0703125, + "learning_rate": 0.00018058384586319926, + "loss": 4.3536, + "step": 5844 + }, + { + "epoch": 0.6060566784827845, + "grad_norm": 1.3203125, + "learning_rate": 0.00018057741343861423, + "loss": 4.3558, + "step": 5845 + }, + { + "epoch": 0.6061603665372726, + "grad_norm": 1.1328125, + "learning_rate": 0.0001805709800633009, + "loss": 4.3174, + "step": 5846 + }, + { + "epoch": 0.6062640545917607, + "grad_norm": 1.4609375, + "learning_rate": 0.00018056454573733518, + "loss": 4.364, + "step": 5847 + }, + { + "epoch": 0.6063677426462487, + "grad_norm": 1.390625, + "learning_rate": 0.000180558110460793, + "loss": 4.3445, + "step": 5848 + }, + { + "epoch": 0.6064714307007368, + "grad_norm": 1.171875, + "learning_rate": 0.00018055167423375025, + "loss": 4.2715, + "step": 5849 + }, + { + "epoch": 0.6065751187552249, + "grad_norm": 1.1015625, + "learning_rate": 0.00018054523705628292, + "loss": 4.305, + "step": 5850 + }, + { + "epoch": 0.606678806809713, + "grad_norm": 1.1875, + "learning_rate": 0.0001805387989284669, + "loss": 4.3408, + "step": 5851 + }, + { + "epoch": 0.6067824948642011, + "grad_norm": 1.0390625, + "learning_rate": 0.0001805323598503782, + "loss": 4.34, + "step": 5852 + }, + { + "epoch": 0.6068861829186891, + "grad_norm": 1.4140625, + "learning_rate": 0.0001805259198220928, + "loss": 4.3203, + "step": 5853 + }, + { + "epoch": 0.6069898709731772, + "grad_norm": 1.1640625, + "learning_rate": 0.00018051947884368662, + "loss": 4.3286, + "step": 5854 + }, + { + "epoch": 0.6070935590276653, + "grad_norm": 1.453125, + "learning_rate": 0.00018051303691523575, + "loss": 4.3236, + "step": 5855 + }, + { + "epoch": 0.6071972470821534, + "grad_norm": 1.359375, + "learning_rate": 0.0001805065940368161, + "loss": 4.335, + "step": 5856 + }, + { + "epoch": 0.6073009351366414, + "grad_norm": 1.234375, + "learning_rate": 0.00018050015020850378, + "loss": 4.3283, + "step": 5857 + }, + { + "epoch": 0.6074046231911295, + "grad_norm": 1.0625, + "learning_rate": 0.00018049370543037475, + "loss": 4.3239, + "step": 5858 + }, + { + "epoch": 0.6075083112456175, + "grad_norm": 1.4609375, + "learning_rate": 0.00018048725970250514, + "loss": 4.3298, + "step": 5859 + }, + { + "epoch": 0.6076119993001057, + "grad_norm": 1.2109375, + "learning_rate": 0.0001804808130249709, + "loss": 4.3457, + "step": 5860 + }, + { + "epoch": 0.6077156873545937, + "grad_norm": 1.4765625, + "learning_rate": 0.00018047436539784812, + "loss": 4.3558, + "step": 5861 + }, + { + "epoch": 0.6078193754090818, + "grad_norm": 1.3828125, + "learning_rate": 0.00018046791682121293, + "loss": 4.3571, + "step": 5862 + }, + { + "epoch": 0.6079230634635698, + "grad_norm": 1.3125, + "learning_rate": 0.00018046146729514136, + "loss": 4.295, + "step": 5863 + }, + { + "epoch": 0.608026751518058, + "grad_norm": 1.2890625, + "learning_rate": 0.00018045501681970954, + "loss": 4.3485, + "step": 5864 + }, + { + "epoch": 0.608130439572546, + "grad_norm": 1.296875, + "learning_rate": 0.00018044856539499354, + "loss": 4.3523, + "step": 5865 + }, + { + "epoch": 0.6082341276270341, + "grad_norm": 1.125, + "learning_rate": 0.00018044211302106953, + "loss": 4.3211, + "step": 5866 + }, + { + "epoch": 0.6083378156815221, + "grad_norm": 1.4453125, + "learning_rate": 0.00018043565969801359, + "loss": 4.3512, + "step": 5867 + }, + { + "epoch": 0.6084415037360102, + "grad_norm": 1.21875, + "learning_rate": 0.00018042920542590195, + "loss": 4.3781, + "step": 5868 + }, + { + "epoch": 0.6085451917904983, + "grad_norm": 1.578125, + "learning_rate": 0.00018042275020481064, + "loss": 4.3541, + "step": 5869 + }, + { + "epoch": 0.6086488798449864, + "grad_norm": 1.390625, + "learning_rate": 0.00018041629403481592, + "loss": 4.3422, + "step": 5870 + }, + { + "epoch": 0.6087525678994744, + "grad_norm": 1.4921875, + "learning_rate": 0.00018040983691599395, + "loss": 4.339, + "step": 5871 + }, + { + "epoch": 0.6088562559539625, + "grad_norm": 1.375, + "learning_rate": 0.00018040337884842086, + "loss": 4.2955, + "step": 5872 + }, + { + "epoch": 0.6089599440084505, + "grad_norm": 1.2578125, + "learning_rate": 0.00018039691983217287, + "loss": 4.3055, + "step": 5873 + }, + { + "epoch": 0.6090636320629387, + "grad_norm": 1.21875, + "learning_rate": 0.00018039045986732627, + "loss": 4.352, + "step": 5874 + }, + { + "epoch": 0.6091673201174267, + "grad_norm": 1.3828125, + "learning_rate": 0.0001803839989539572, + "loss": 4.3229, + "step": 5875 + }, + { + "epoch": 0.6092710081719148, + "grad_norm": 1.2734375, + "learning_rate": 0.0001803775370921419, + "loss": 4.313, + "step": 5876 + }, + { + "epoch": 0.6093746962264028, + "grad_norm": 1.421875, + "learning_rate": 0.00018037107428195664, + "loss": 4.3309, + "step": 5877 + }, + { + "epoch": 0.609478384280891, + "grad_norm": 1.3515625, + "learning_rate": 0.00018036461052347766, + "loss": 4.3201, + "step": 5878 + }, + { + "epoch": 0.609582072335379, + "grad_norm": 1.2734375, + "learning_rate": 0.0001803581458167812, + "loss": 4.3666, + "step": 5879 + }, + { + "epoch": 0.6096857603898671, + "grad_norm": 1.25, + "learning_rate": 0.0001803516801619436, + "loss": 4.3787, + "step": 5880 + }, + { + "epoch": 0.6097894484443551, + "grad_norm": 1.2734375, + "learning_rate": 0.00018034521355904108, + "loss": 4.3374, + "step": 5881 + }, + { + "epoch": 0.6098931364988432, + "grad_norm": 1.1640625, + "learning_rate": 0.00018033874600815, + "loss": 4.3281, + "step": 5882 + }, + { + "epoch": 0.6099968245533313, + "grad_norm": 1.4296875, + "learning_rate": 0.0001803322775093466, + "loss": 4.3247, + "step": 5883 + }, + { + "epoch": 0.6101005126078194, + "grad_norm": 1.203125, + "learning_rate": 0.00018032580806270725, + "loss": 4.3396, + "step": 5884 + }, + { + "epoch": 0.6102042006623074, + "grad_norm": 1.515625, + "learning_rate": 0.0001803193376683083, + "loss": 4.3337, + "step": 5885 + }, + { + "epoch": 0.6103078887167955, + "grad_norm": 1.4375, + "learning_rate": 0.00018031286632622603, + "loss": 4.347, + "step": 5886 + }, + { + "epoch": 0.6104115767712835, + "grad_norm": 1.2578125, + "learning_rate": 0.00018030639403653683, + "loss": 4.3089, + "step": 5887 + }, + { + "epoch": 0.6105152648257717, + "grad_norm": 1.21875, + "learning_rate": 0.00018029992079931711, + "loss": 4.3356, + "step": 5888 + }, + { + "epoch": 0.6106189528802597, + "grad_norm": 1.3203125, + "learning_rate": 0.00018029344661464318, + "loss": 4.2903, + "step": 5889 + }, + { + "epoch": 0.6107226409347478, + "grad_norm": 1.1796875, + "learning_rate": 0.00018028697148259144, + "loss": 4.3385, + "step": 5890 + }, + { + "epoch": 0.6108263289892358, + "grad_norm": 1.6484375, + "learning_rate": 0.00018028049540323832, + "loss": 4.3115, + "step": 5891 + }, + { + "epoch": 0.610930017043724, + "grad_norm": 1.484375, + "learning_rate": 0.0001802740183766602, + "loss": 4.3394, + "step": 5892 + }, + { + "epoch": 0.611033705098212, + "grad_norm": 1.6328125, + "learning_rate": 0.00018026754040293354, + "loss": 4.3097, + "step": 5893 + }, + { + "epoch": 0.6111373931527001, + "grad_norm": 1.578125, + "learning_rate": 0.00018026106148213476, + "loss": 4.3807, + "step": 5894 + }, + { + "epoch": 0.6112410812071881, + "grad_norm": 1.1796875, + "learning_rate": 0.00018025458161434027, + "loss": 4.3459, + "step": 5895 + }, + { + "epoch": 0.6113447692616762, + "grad_norm": 1.15625, + "learning_rate": 0.00018024810079962653, + "loss": 4.3674, + "step": 5896 + }, + { + "epoch": 0.6114484573161644, + "grad_norm": 1.1953125, + "learning_rate": 0.00018024161903807006, + "loss": 4.3333, + "step": 5897 + }, + { + "epoch": 0.6115521453706524, + "grad_norm": 1.0078125, + "learning_rate": 0.0001802351363297473, + "loss": 4.3329, + "step": 5898 + }, + { + "epoch": 0.6116558334251405, + "grad_norm": 1.421875, + "learning_rate": 0.00018022865267473473, + "loss": 4.3412, + "step": 5899 + }, + { + "epoch": 0.6117595214796285, + "grad_norm": 1.328125, + "learning_rate": 0.00018022216807310888, + "loss": 4.3796, + "step": 5900 + }, + { + "epoch": 0.6118632095341167, + "grad_norm": 1.1875, + "learning_rate": 0.00018021568252494624, + "loss": 4.3685, + "step": 5901 + }, + { + "epoch": 0.6119668975886047, + "grad_norm": 1.1484375, + "learning_rate": 0.00018020919603032334, + "loss": 4.3144, + "step": 5902 + }, + { + "epoch": 0.6120705856430928, + "grad_norm": 1.125, + "learning_rate": 0.00018020270858931666, + "loss": 4.3126, + "step": 5903 + }, + { + "epoch": 0.6121742736975808, + "grad_norm": 1.0, + "learning_rate": 0.00018019622020200285, + "loss": 4.3694, + "step": 5904 + }, + { + "epoch": 0.6122779617520689, + "grad_norm": 1.3203125, + "learning_rate": 0.0001801897308684584, + "loss": 4.3326, + "step": 5905 + }, + { + "epoch": 0.612381649806557, + "grad_norm": 1.25, + "learning_rate": 0.00018018324058875993, + "loss": 4.3275, + "step": 5906 + }, + { + "epoch": 0.6124853378610451, + "grad_norm": 1.359375, + "learning_rate": 0.00018017674936298393, + "loss": 4.3632, + "step": 5907 + }, + { + "epoch": 0.6125890259155331, + "grad_norm": 1.1875, + "learning_rate": 0.00018017025719120703, + "loss": 4.3562, + "step": 5908 + }, + { + "epoch": 0.6126927139700212, + "grad_norm": 1.3671875, + "learning_rate": 0.00018016376407350588, + "loss": 4.3318, + "step": 5909 + }, + { + "epoch": 0.6127964020245092, + "grad_norm": 1.3046875, + "learning_rate": 0.000180157270009957, + "loss": 4.3231, + "step": 5910 + }, + { + "epoch": 0.6129000900789974, + "grad_norm": 1.296875, + "learning_rate": 0.00018015077500063714, + "loss": 4.3291, + "step": 5911 + }, + { + "epoch": 0.6130037781334854, + "grad_norm": 1.1328125, + "learning_rate": 0.0001801442790456228, + "loss": 4.3329, + "step": 5912 + }, + { + "epoch": 0.6131074661879735, + "grad_norm": 1.3046875, + "learning_rate": 0.00018013778214499067, + "loss": 4.3003, + "step": 5913 + }, + { + "epoch": 0.6132111542424615, + "grad_norm": 1.203125, + "learning_rate": 0.00018013128429881747, + "loss": 4.3611, + "step": 5914 + }, + { + "epoch": 0.6133148422969497, + "grad_norm": 1.40625, + "learning_rate": 0.0001801247855071798, + "loss": 4.3372, + "step": 5915 + }, + { + "epoch": 0.6134185303514377, + "grad_norm": 1.359375, + "learning_rate": 0.00018011828577015434, + "loss": 4.3413, + "step": 5916 + }, + { + "epoch": 0.6135222184059258, + "grad_norm": 1.171875, + "learning_rate": 0.0001801117850878178, + "loss": 4.3753, + "step": 5917 + }, + { + "epoch": 0.6136259064604138, + "grad_norm": 1.0859375, + "learning_rate": 0.00018010528346024688, + "loss": 4.3338, + "step": 5918 + }, + { + "epoch": 0.6137295945149019, + "grad_norm": 1.25, + "learning_rate": 0.0001800987808875183, + "loss": 4.3024, + "step": 5919 + }, + { + "epoch": 0.61383328256939, + "grad_norm": 1.109375, + "learning_rate": 0.00018009227736970877, + "loss": 4.3136, + "step": 5920 + }, + { + "epoch": 0.6139369706238781, + "grad_norm": 1.2421875, + "learning_rate": 0.00018008577290689503, + "loss": 4.318, + "step": 5921 + }, + { + "epoch": 0.6140406586783661, + "grad_norm": 1.125, + "learning_rate": 0.00018007926749915383, + "loss": 4.324, + "step": 5922 + }, + { + "epoch": 0.6141443467328542, + "grad_norm": 1.296875, + "learning_rate": 0.0001800727611465619, + "loss": 4.3304, + "step": 5923 + }, + { + "epoch": 0.6142480347873422, + "grad_norm": 1.171875, + "learning_rate": 0.00018006625384919605, + "loss": 4.3218, + "step": 5924 + }, + { + "epoch": 0.6143517228418304, + "grad_norm": 1.359375, + "learning_rate": 0.00018005974560713305, + "loss": 4.3835, + "step": 5925 + }, + { + "epoch": 0.6144554108963184, + "grad_norm": 1.2578125, + "learning_rate": 0.00018005323642044966, + "loss": 4.314, + "step": 5926 + }, + { + "epoch": 0.6145590989508065, + "grad_norm": 1.4140625, + "learning_rate": 0.00018004672628922267, + "loss": 4.3165, + "step": 5927 + }, + { + "epoch": 0.6146627870052945, + "grad_norm": 1.3125, + "learning_rate": 0.00018004021521352893, + "loss": 4.3353, + "step": 5928 + }, + { + "epoch": 0.6147664750597827, + "grad_norm": 1.40625, + "learning_rate": 0.0001800337031934453, + "loss": 4.2861, + "step": 5929 + }, + { + "epoch": 0.6148701631142707, + "grad_norm": 1.3046875, + "learning_rate": 0.00018002719022904855, + "loss": 4.3535, + "step": 5930 + }, + { + "epoch": 0.6149738511687588, + "grad_norm": 1.3671875, + "learning_rate": 0.00018002067632041555, + "loss": 4.3158, + "step": 5931 + }, + { + "epoch": 0.6150775392232468, + "grad_norm": 1.28125, + "learning_rate": 0.00018001416146762314, + "loss": 4.3222, + "step": 5932 + }, + { + "epoch": 0.6151812272777349, + "grad_norm": 1.421875, + "learning_rate": 0.00018000764567074822, + "loss": 4.3179, + "step": 5933 + }, + { + "epoch": 0.615284915332223, + "grad_norm": 1.34375, + "learning_rate": 0.00018000112892986765, + "loss": 4.3686, + "step": 5934 + }, + { + "epoch": 0.6153886033867111, + "grad_norm": 1.265625, + "learning_rate": 0.0001799946112450583, + "loss": 4.3321, + "step": 5935 + }, + { + "epoch": 0.6154922914411991, + "grad_norm": 1.125, + "learning_rate": 0.00017998809261639712, + "loss": 4.3088, + "step": 5936 + }, + { + "epoch": 0.6155959794956872, + "grad_norm": 1.2421875, + "learning_rate": 0.000179981573043961, + "loss": 4.3076, + "step": 5937 + }, + { + "epoch": 0.6156996675501752, + "grad_norm": 1.1171875, + "learning_rate": 0.00017997505252782687, + "loss": 4.3049, + "step": 5938 + }, + { + "epoch": 0.6158033556046634, + "grad_norm": 1.390625, + "learning_rate": 0.00017996853106807165, + "loss": 4.357, + "step": 5939 + }, + { + "epoch": 0.6159070436591514, + "grad_norm": 1.359375, + "learning_rate": 0.00017996200866477228, + "loss": 4.3391, + "step": 5940 + }, + { + "epoch": 0.6160107317136395, + "grad_norm": 1.2734375, + "learning_rate": 0.00017995548531800573, + "loss": 4.372, + "step": 5941 + }, + { + "epoch": 0.6161144197681276, + "grad_norm": 1.1171875, + "learning_rate": 0.000179948961027849, + "loss": 4.2656, + "step": 5942 + }, + { + "epoch": 0.6162181078226157, + "grad_norm": 1.2734375, + "learning_rate": 0.00017994243579437898, + "loss": 4.3352, + "step": 5943 + }, + { + "epoch": 0.6163217958771038, + "grad_norm": 1.1171875, + "learning_rate": 0.0001799359096176728, + "loss": 4.3499, + "step": 5944 + }, + { + "epoch": 0.6164254839315918, + "grad_norm": 1.4921875, + "learning_rate": 0.00017992938249780733, + "loss": 4.3459, + "step": 5945 + }, + { + "epoch": 0.6165291719860799, + "grad_norm": 1.3828125, + "learning_rate": 0.00017992285443485965, + "loss": 4.3025, + "step": 5946 + }, + { + "epoch": 0.6166328600405679, + "grad_norm": 1.15625, + "learning_rate": 0.00017991632542890677, + "loss": 4.3394, + "step": 5947 + }, + { + "epoch": 0.6167365480950561, + "grad_norm": 1.1171875, + "learning_rate": 0.00017990979548002572, + "loss": 4.3118, + "step": 5948 + }, + { + "epoch": 0.6168402361495441, + "grad_norm": 1.2734375, + "learning_rate": 0.00017990326458829355, + "loss": 4.2628, + "step": 5949 + }, + { + "epoch": 0.6169439242040322, + "grad_norm": 1.109375, + "learning_rate": 0.0001798967327537873, + "loss": 4.3008, + "step": 5950 + }, + { + "epoch": 0.6170476122585202, + "grad_norm": 1.4609375, + "learning_rate": 0.0001798901999765841, + "loss": 4.3193, + "step": 5951 + }, + { + "epoch": 0.6171513003130084, + "grad_norm": 1.3515625, + "learning_rate": 0.00017988366625676098, + "loss": 4.3051, + "step": 5952 + }, + { + "epoch": 0.6172549883674964, + "grad_norm": 1.1484375, + "learning_rate": 0.00017987713159439502, + "loss": 4.3361, + "step": 5953 + }, + { + "epoch": 0.6173586764219845, + "grad_norm": 1.125, + "learning_rate": 0.00017987059598956336, + "loss": 4.3399, + "step": 5954 + }, + { + "epoch": 0.6174623644764725, + "grad_norm": 1.3203125, + "learning_rate": 0.00017986405944234307, + "loss": 4.3643, + "step": 5955 + }, + { + "epoch": 0.6175660525309606, + "grad_norm": 1.0859375, + "learning_rate": 0.0001798575219528113, + "loss": 4.3694, + "step": 5956 + }, + { + "epoch": 0.6176697405854487, + "grad_norm": 1.40625, + "learning_rate": 0.0001798509835210452, + "loss": 4.3734, + "step": 5957 + }, + { + "epoch": 0.6177734286399368, + "grad_norm": 1.328125, + "learning_rate": 0.0001798444441471219, + "loss": 4.3513, + "step": 5958 + }, + { + "epoch": 0.6178771166944248, + "grad_norm": 1.265625, + "learning_rate": 0.00017983790383111856, + "loss": 4.3163, + "step": 5959 + }, + { + "epoch": 0.6179808047489129, + "grad_norm": 1.1484375, + "learning_rate": 0.00017983136257311233, + "loss": 4.2875, + "step": 5960 + }, + { + "epoch": 0.6180844928034009, + "grad_norm": 1.203125, + "learning_rate": 0.00017982482037318042, + "loss": 4.3255, + "step": 5961 + }, + { + "epoch": 0.6181881808578891, + "grad_norm": 1.09375, + "learning_rate": 0.00017981827723140002, + "loss": 4.3877, + "step": 5962 + }, + { + "epoch": 0.6182918689123771, + "grad_norm": 1.2734375, + "learning_rate": 0.0001798117331478483, + "loss": 4.3281, + "step": 5963 + }, + { + "epoch": 0.6183955569668652, + "grad_norm": 1.1484375, + "learning_rate": 0.0001798051881226025, + "loss": 4.3363, + "step": 5964 + }, + { + "epoch": 0.6184992450213532, + "grad_norm": 1.34375, + "learning_rate": 0.00017979864215573983, + "loss": 4.3321, + "step": 5965 + }, + { + "epoch": 0.6186029330758414, + "grad_norm": 1.203125, + "learning_rate": 0.00017979209524733754, + "loss": 4.2988, + "step": 5966 + }, + { + "epoch": 0.6187066211303294, + "grad_norm": 1.2265625, + "learning_rate": 0.00017978554739747288, + "loss": 4.3493, + "step": 5967 + }, + { + "epoch": 0.6188103091848175, + "grad_norm": 1.15625, + "learning_rate": 0.0001797789986062231, + "loss": 4.2884, + "step": 5968 + }, + { + "epoch": 0.6189139972393055, + "grad_norm": 1.2578125, + "learning_rate": 0.00017977244887366545, + "loss": 4.3441, + "step": 5969 + }, + { + "epoch": 0.6190176852937936, + "grad_norm": 1.15625, + "learning_rate": 0.00017976589819987724, + "loss": 4.2903, + "step": 5970 + }, + { + "epoch": 0.6191213733482817, + "grad_norm": 1.375, + "learning_rate": 0.00017975934658493573, + "loss": 4.3436, + "step": 5971 + }, + { + "epoch": 0.6192250614027698, + "grad_norm": 1.2890625, + "learning_rate": 0.00017975279402891826, + "loss": 4.3744, + "step": 5972 + }, + { + "epoch": 0.6193287494572578, + "grad_norm": 1.2109375, + "learning_rate": 0.0001797462405319021, + "loss": 4.3048, + "step": 5973 + }, + { + "epoch": 0.6194324375117459, + "grad_norm": 1.0859375, + "learning_rate": 0.0001797396860939646, + "loss": 4.3472, + "step": 5974 + }, + { + "epoch": 0.6195361255662339, + "grad_norm": 1.171875, + "learning_rate": 0.0001797331307151831, + "loss": 4.3844, + "step": 5975 + }, + { + "epoch": 0.6196398136207221, + "grad_norm": 1.0625, + "learning_rate": 0.00017972657439563493, + "loss": 4.3364, + "step": 5976 + }, + { + "epoch": 0.6197435016752101, + "grad_norm": 1.4296875, + "learning_rate": 0.00017972001713539748, + "loss": 4.3088, + "step": 5977 + }, + { + "epoch": 0.6198471897296982, + "grad_norm": 1.3046875, + "learning_rate": 0.00017971345893454807, + "loss": 4.3343, + "step": 5978 + }, + { + "epoch": 0.6199508777841862, + "grad_norm": 1.328125, + "learning_rate": 0.00017970689979316412, + "loss": 4.3309, + "step": 5979 + }, + { + "epoch": 0.6200545658386744, + "grad_norm": 1.1875, + "learning_rate": 0.00017970033971132301, + "loss": 4.3378, + "step": 5980 + }, + { + "epoch": 0.6201582538931624, + "grad_norm": 1.203125, + "learning_rate": 0.00017969377868910216, + "loss": 4.2783, + "step": 5981 + }, + { + "epoch": 0.6202619419476505, + "grad_norm": 1.140625, + "learning_rate": 0.00017968721672657892, + "loss": 4.3228, + "step": 5982 + }, + { + "epoch": 0.6203656300021385, + "grad_norm": 1.5078125, + "learning_rate": 0.00017968065382383076, + "loss": 4.3054, + "step": 5983 + }, + { + "epoch": 0.6204693180566266, + "grad_norm": 1.390625, + "learning_rate": 0.00017967408998093514, + "loss": 4.336, + "step": 5984 + }, + { + "epoch": 0.6205730061111148, + "grad_norm": 1.2421875, + "learning_rate": 0.00017966752519796945, + "loss": 4.3644, + "step": 5985 + }, + { + "epoch": 0.6206766941656028, + "grad_norm": 1.25, + "learning_rate": 0.00017966095947501119, + "loss": 4.3458, + "step": 5986 + }, + { + "epoch": 0.6207803822200909, + "grad_norm": 1.1171875, + "learning_rate": 0.00017965439281213778, + "loss": 4.3109, + "step": 5987 + }, + { + "epoch": 0.6208840702745789, + "grad_norm": 1.0625, + "learning_rate": 0.0001796478252094268, + "loss": 4.3486, + "step": 5988 + }, + { + "epoch": 0.620987758329067, + "grad_norm": 1.2890625, + "learning_rate": 0.00017964125666695562, + "loss": 4.3259, + "step": 5989 + }, + { + "epoch": 0.6210914463835551, + "grad_norm": 1.1640625, + "learning_rate": 0.00017963468718480181, + "loss": 4.2994, + "step": 5990 + }, + { + "epoch": 0.6211951344380432, + "grad_norm": 1.40625, + "learning_rate": 0.00017962811676304285, + "loss": 4.3223, + "step": 5991 + }, + { + "epoch": 0.6212988224925312, + "grad_norm": 1.359375, + "learning_rate": 0.00017962154540175632, + "loss": 4.3949, + "step": 5992 + }, + { + "epoch": 0.6214025105470193, + "grad_norm": 0.98046875, + "learning_rate": 0.0001796149731010197, + "loss": 4.2671, + "step": 5993 + }, + { + "epoch": 0.6215061986015074, + "grad_norm": 1.0625, + "learning_rate": 0.00017960839986091057, + "loss": 4.3448, + "step": 5994 + }, + { + "epoch": 0.6216098866559955, + "grad_norm": 1.0234375, + "learning_rate": 0.00017960182568150642, + "loss": 4.3655, + "step": 5995 + }, + { + "epoch": 0.6217135747104835, + "grad_norm": 1.0, + "learning_rate": 0.0001795952505628849, + "loss": 4.284, + "step": 5996 + }, + { + "epoch": 0.6218172627649716, + "grad_norm": 1.046875, + "learning_rate": 0.00017958867450512358, + "loss": 4.3335, + "step": 5997 + }, + { + "epoch": 0.6219209508194596, + "grad_norm": 0.890625, + "learning_rate": 0.0001795820975083, + "loss": 4.3414, + "step": 5998 + }, + { + "epoch": 0.6220246388739478, + "grad_norm": 1.03125, + "learning_rate": 0.00017957551957249182, + "loss": 4.2983, + "step": 5999 + }, + { + "epoch": 0.6221283269284358, + "grad_norm": 0.875, + "learning_rate": 0.0001795689406977766, + "loss": 4.3232, + "step": 6000 + }, + { + "epoch": 0.6222320149829239, + "grad_norm": 1.109375, + "learning_rate": 0.000179562360884232, + "loss": 4.3616, + "step": 6001 + }, + { + "epoch": 0.6223357030374119, + "grad_norm": 0.92578125, + "learning_rate": 0.00017955578013193564, + "loss": 4.3173, + "step": 6002 + }, + { + "epoch": 0.6224393910919, + "grad_norm": 1.265625, + "learning_rate": 0.00017954919844096517, + "loss": 4.3492, + "step": 6003 + }, + { + "epoch": 0.6225430791463881, + "grad_norm": 1.078125, + "learning_rate": 0.00017954261581139825, + "loss": 4.3315, + "step": 6004 + }, + { + "epoch": 0.6226467672008762, + "grad_norm": 1.390625, + "learning_rate": 0.00017953603224331254, + "loss": 4.3123, + "step": 6005 + }, + { + "epoch": 0.6227504552553642, + "grad_norm": 1.2734375, + "learning_rate": 0.0001795294477367857, + "loss": 4.332, + "step": 6006 + }, + { + "epoch": 0.6228541433098523, + "grad_norm": 1.234375, + "learning_rate": 0.00017952286229189546, + "loss": 4.3646, + "step": 6007 + }, + { + "epoch": 0.6229578313643404, + "grad_norm": 1.1953125, + "learning_rate": 0.00017951627590871952, + "loss": 4.31, + "step": 6008 + }, + { + "epoch": 0.6230615194188285, + "grad_norm": 1.1796875, + "learning_rate": 0.00017950968858733557, + "loss": 4.3498, + "step": 6009 + }, + { + "epoch": 0.6231652074733165, + "grad_norm": 1.046875, + "learning_rate": 0.00017950310032782132, + "loss": 4.3115, + "step": 6010 + }, + { + "epoch": 0.6232688955278046, + "grad_norm": 1.40625, + "learning_rate": 0.00017949651113025454, + "loss": 4.3336, + "step": 6011 + }, + { + "epoch": 0.6233725835822926, + "grad_norm": 1.265625, + "learning_rate": 0.00017948992099471296, + "loss": 4.36, + "step": 6012 + }, + { + "epoch": 0.6234762716367808, + "grad_norm": 1.3828125, + "learning_rate": 0.00017948332992127433, + "loss": 4.3652, + "step": 6013 + }, + { + "epoch": 0.6235799596912688, + "grad_norm": 1.3203125, + "learning_rate": 0.00017947673791001643, + "loss": 4.3192, + "step": 6014 + }, + { + "epoch": 0.6236836477457569, + "grad_norm": 1.203125, + "learning_rate": 0.00017947014496101703, + "loss": 4.3289, + "step": 6015 + }, + { + "epoch": 0.6237873358002449, + "grad_norm": 1.125, + "learning_rate": 0.00017946355107435391, + "loss": 4.3483, + "step": 6016 + }, + { + "epoch": 0.623891023854733, + "grad_norm": 1.40625, + "learning_rate": 0.0001794569562501049, + "loss": 4.3262, + "step": 6017 + }, + { + "epoch": 0.6239947119092211, + "grad_norm": 1.234375, + "learning_rate": 0.0001794503604883478, + "loss": 4.3498, + "step": 6018 + }, + { + "epoch": 0.6240983999637092, + "grad_norm": 1.3515625, + "learning_rate": 0.00017944376378916044, + "loss": 4.3238, + "step": 6019 + }, + { + "epoch": 0.6242020880181972, + "grad_norm": 1.28125, + "learning_rate": 0.00017943716615262062, + "loss": 4.3432, + "step": 6020 + }, + { + "epoch": 0.6243057760726853, + "grad_norm": 1.0859375, + "learning_rate": 0.0001794305675788062, + "loss": 4.3482, + "step": 6021 + }, + { + "epoch": 0.6244094641271734, + "grad_norm": 1.0703125, + "learning_rate": 0.00017942396806779507, + "loss": 4.3173, + "step": 6022 + }, + { + "epoch": 0.6245131521816615, + "grad_norm": 1.1484375, + "learning_rate": 0.00017941736761966506, + "loss": 4.304, + "step": 6023 + }, + { + "epoch": 0.6246168402361495, + "grad_norm": 1.0625, + "learning_rate": 0.00017941076623449406, + "loss": 4.3422, + "step": 6024 + }, + { + "epoch": 0.6247205282906376, + "grad_norm": 1.390625, + "learning_rate": 0.00017940416391235995, + "loss": 4.3239, + "step": 6025 + }, + { + "epoch": 0.6248242163451256, + "grad_norm": 1.328125, + "learning_rate": 0.00017939756065334068, + "loss": 4.3214, + "step": 6026 + }, + { + "epoch": 0.6249279043996138, + "grad_norm": 1.109375, + "learning_rate": 0.00017939095645751408, + "loss": 4.3207, + "step": 6027 + }, + { + "epoch": 0.6250315924541018, + "grad_norm": 1.0625, + "learning_rate": 0.0001793843513249581, + "loss": 4.3462, + "step": 6028 + }, + { + "epoch": 0.6251352805085899, + "grad_norm": 1.2421875, + "learning_rate": 0.00017937774525575073, + "loss": 4.3265, + "step": 6029 + }, + { + "epoch": 0.625238968563078, + "grad_norm": 1.1328125, + "learning_rate": 0.00017937113824996985, + "loss": 4.3543, + "step": 6030 + }, + { + "epoch": 0.625342656617566, + "grad_norm": 1.3515625, + "learning_rate": 0.00017936453030769346, + "loss": 4.3666, + "step": 6031 + }, + { + "epoch": 0.6254463446720542, + "grad_norm": 1.1796875, + "learning_rate": 0.00017935792142899948, + "loss": 4.3258, + "step": 6032 + }, + { + "epoch": 0.6255500327265422, + "grad_norm": 1.28125, + "learning_rate": 0.00017935131161396592, + "loss": 4.3247, + "step": 6033 + }, + { + "epoch": 0.6256537207810303, + "grad_norm": 1.1875, + "learning_rate": 0.00017934470086267075, + "loss": 4.3449, + "step": 6034 + }, + { + "epoch": 0.6257574088355183, + "grad_norm": 1.3046875, + "learning_rate": 0.000179338089175192, + "loss": 4.3614, + "step": 6035 + }, + { + "epoch": 0.6258610968900065, + "grad_norm": 1.2109375, + "learning_rate": 0.00017933147655160766, + "loss": 4.3474, + "step": 6036 + }, + { + "epoch": 0.6259647849444945, + "grad_norm": 1.2578125, + "learning_rate": 0.00017932486299199573, + "loss": 4.3562, + "step": 6037 + }, + { + "epoch": 0.6260684729989826, + "grad_norm": 1.1640625, + "learning_rate": 0.0001793182484964343, + "loss": 4.3562, + "step": 6038 + }, + { + "epoch": 0.6261721610534706, + "grad_norm": 1.3125, + "learning_rate": 0.0001793116330650013, + "loss": 4.3347, + "step": 6039 + }, + { + "epoch": 0.6262758491079587, + "grad_norm": 1.1484375, + "learning_rate": 0.00017930501669777496, + "loss": 4.3492, + "step": 6040 + }, + { + "epoch": 0.6263795371624468, + "grad_norm": 1.359375, + "learning_rate": 0.00017929839939483322, + "loss": 4.3075, + "step": 6041 + }, + { + "epoch": 0.6264832252169349, + "grad_norm": 1.2578125, + "learning_rate": 0.00017929178115625417, + "loss": 4.3827, + "step": 6042 + }, + { + "epoch": 0.6265869132714229, + "grad_norm": 1.3828125, + "learning_rate": 0.00017928516198211595, + "loss": 4.3436, + "step": 6043 + }, + { + "epoch": 0.626690601325911, + "grad_norm": 1.265625, + "learning_rate": 0.0001792785418724966, + "loss": 4.3256, + "step": 6044 + }, + { + "epoch": 0.626794289380399, + "grad_norm": 1.3984375, + "learning_rate": 0.00017927192082747427, + "loss": 4.3112, + "step": 6045 + }, + { + "epoch": 0.6268979774348872, + "grad_norm": 1.2578125, + "learning_rate": 0.0001792652988471271, + "loss": 4.3048, + "step": 6046 + }, + { + "epoch": 0.6270016654893752, + "grad_norm": 1.546875, + "learning_rate": 0.00017925867593153317, + "loss": 4.3716, + "step": 6047 + }, + { + "epoch": 0.6271053535438633, + "grad_norm": 1.4296875, + "learning_rate": 0.0001792520520807706, + "loss": 4.2846, + "step": 6048 + }, + { + "epoch": 0.6272090415983513, + "grad_norm": 1.2265625, + "learning_rate": 0.00017924542729491765, + "loss": 4.3488, + "step": 6049 + }, + { + "epoch": 0.6273127296528395, + "grad_norm": 1.25, + "learning_rate": 0.00017923880157405238, + "loss": 4.318, + "step": 6050 + }, + { + "epoch": 0.6274164177073275, + "grad_norm": 1.234375, + "learning_rate": 0.000179232174918253, + "loss": 4.3829, + "step": 6051 + }, + { + "epoch": 0.6275201057618156, + "grad_norm": 1.1484375, + "learning_rate": 0.00017922554732759775, + "loss": 4.3599, + "step": 6052 + }, + { + "epoch": 0.6276237938163036, + "grad_norm": 1.3359375, + "learning_rate": 0.00017921891880216478, + "loss": 4.3358, + "step": 6053 + }, + { + "epoch": 0.6277274818707917, + "grad_norm": 1.2109375, + "learning_rate": 0.0001792122893420323, + "loss": 4.3376, + "step": 6054 + }, + { + "epoch": 0.6278311699252798, + "grad_norm": 1.4296875, + "learning_rate": 0.00017920565894727854, + "loss": 4.3467, + "step": 6055 + }, + { + "epoch": 0.6279348579797679, + "grad_norm": 1.2890625, + "learning_rate": 0.00017919902761798172, + "loss": 4.3125, + "step": 6056 + }, + { + "epoch": 0.6280385460342559, + "grad_norm": 1.3671875, + "learning_rate": 0.0001791923953542201, + "loss": 4.292, + "step": 6057 + }, + { + "epoch": 0.628142234088744, + "grad_norm": 1.3046875, + "learning_rate": 0.00017918576215607192, + "loss": 4.3364, + "step": 6058 + }, + { + "epoch": 0.628245922143232, + "grad_norm": 1.140625, + "learning_rate": 0.00017917912802361543, + "loss": 4.3216, + "step": 6059 + }, + { + "epoch": 0.6283496101977202, + "grad_norm": 1.109375, + "learning_rate": 0.00017917249295692895, + "loss": 4.3368, + "step": 6060 + }, + { + "epoch": 0.6284532982522082, + "grad_norm": 1.2421875, + "learning_rate": 0.00017916585695609073, + "loss": 4.2879, + "step": 6061 + }, + { + "epoch": 0.6285569863066963, + "grad_norm": 1.109375, + "learning_rate": 0.0001791592200211791, + "loss": 4.348, + "step": 6062 + }, + { + "epoch": 0.6286606743611843, + "grad_norm": 1.421875, + "learning_rate": 0.00017915258215227232, + "loss": 4.3623, + "step": 6063 + }, + { + "epoch": 0.6287643624156725, + "grad_norm": 1.3671875, + "learning_rate": 0.00017914594334944873, + "loss": 4.3399, + "step": 6064 + }, + { + "epoch": 0.6288680504701605, + "grad_norm": 1.1171875, + "learning_rate": 0.00017913930361278672, + "loss": 4.325, + "step": 6065 + }, + { + "epoch": 0.6289717385246486, + "grad_norm": 1.03125, + "learning_rate": 0.00017913266294236456, + "loss": 4.31, + "step": 6066 + }, + { + "epoch": 0.6290754265791366, + "grad_norm": 1.328125, + "learning_rate": 0.0001791260213382606, + "loss": 4.3083, + "step": 6067 + }, + { + "epoch": 0.6291791146336247, + "grad_norm": 1.1640625, + "learning_rate": 0.00017911937880055323, + "loss": 4.3409, + "step": 6068 + }, + { + "epoch": 0.6292828026881128, + "grad_norm": 1.4609375, + "learning_rate": 0.00017911273532932086, + "loss": 4.3099, + "step": 6069 + }, + { + "epoch": 0.6293864907426009, + "grad_norm": 1.359375, + "learning_rate": 0.00017910609092464181, + "loss": 4.3098, + "step": 6070 + }, + { + "epoch": 0.6294901787970889, + "grad_norm": 1.1171875, + "learning_rate": 0.0001790994455865945, + "loss": 4.2889, + "step": 6071 + }, + { + "epoch": 0.629593866851577, + "grad_norm": 1.1015625, + "learning_rate": 0.00017909279931525735, + "loss": 4.3456, + "step": 6072 + }, + { + "epoch": 0.629697554906065, + "grad_norm": 1.234375, + "learning_rate": 0.00017908615211070878, + "loss": 4.2922, + "step": 6073 + }, + { + "epoch": 0.6298012429605532, + "grad_norm": 1.0703125, + "learning_rate": 0.00017907950397302722, + "loss": 4.3142, + "step": 6074 + }, + { + "epoch": 0.6299049310150413, + "grad_norm": 1.5703125, + "learning_rate": 0.00017907285490229109, + "loss": 4.3715, + "step": 6075 + }, + { + "epoch": 0.6300086190695293, + "grad_norm": 1.3671875, + "learning_rate": 0.00017906620489857887, + "loss": 4.3564, + "step": 6076 + }, + { + "epoch": 0.6301123071240174, + "grad_norm": 1.3046875, + "learning_rate": 0.000179059553961969, + "loss": 4.3453, + "step": 6077 + }, + { + "epoch": 0.6302159951785055, + "grad_norm": 1.1640625, + "learning_rate": 0.00017905290209253996, + "loss": 4.2826, + "step": 6078 + }, + { + "epoch": 0.6303196832329936, + "grad_norm": 1.3046875, + "learning_rate": 0.00017904624929037025, + "loss": 4.3096, + "step": 6079 + }, + { + "epoch": 0.6304233712874816, + "grad_norm": 1.234375, + "learning_rate": 0.00017903959555553832, + "loss": 4.3031, + "step": 6080 + }, + { + "epoch": 0.6305270593419697, + "grad_norm": 1.4453125, + "learning_rate": 0.00017903294088812273, + "loss": 4.2875, + "step": 6081 + }, + { + "epoch": 0.6306307473964577, + "grad_norm": 1.34375, + "learning_rate": 0.000179026285288202, + "loss": 4.3089, + "step": 6082 + }, + { + "epoch": 0.6307344354509459, + "grad_norm": 1.1328125, + "learning_rate": 0.00017901962875585463, + "loss": 4.3298, + "step": 6083 + }, + { + "epoch": 0.6308381235054339, + "grad_norm": 1.0625, + "learning_rate": 0.00017901297129115914, + "loss": 4.2848, + "step": 6084 + }, + { + "epoch": 0.630941811559922, + "grad_norm": 1.140625, + "learning_rate": 0.00017900631289419417, + "loss": 4.2869, + "step": 6085 + }, + { + "epoch": 0.63104549961441, + "grad_norm": 1.0546875, + "learning_rate": 0.00017899965356503816, + "loss": 4.3072, + "step": 6086 + }, + { + "epoch": 0.6311491876688982, + "grad_norm": 1.6015625, + "learning_rate": 0.00017899299330376977, + "loss": 4.3425, + "step": 6087 + }, + { + "epoch": 0.6312528757233862, + "grad_norm": 1.3984375, + "learning_rate": 0.00017898633211046753, + "loss": 4.3198, + "step": 6088 + }, + { + "epoch": 0.6313565637778743, + "grad_norm": 1.28125, + "learning_rate": 0.00017897966998521011, + "loss": 4.3226, + "step": 6089 + }, + { + "epoch": 0.6314602518323623, + "grad_norm": 1.3046875, + "learning_rate": 0.00017897300692807603, + "loss": 4.3609, + "step": 6090 + }, + { + "epoch": 0.6315639398868504, + "grad_norm": 1.0234375, + "learning_rate": 0.00017896634293914398, + "loss": 4.2785, + "step": 6091 + }, + { + "epoch": 0.6316676279413385, + "grad_norm": 0.95703125, + "learning_rate": 0.00017895967801849253, + "loss": 4.2822, + "step": 6092 + }, + { + "epoch": 0.6317713159958266, + "grad_norm": 0.98828125, + "learning_rate": 0.00017895301216620032, + "loss": 4.2794, + "step": 6093 + }, + { + "epoch": 0.6318750040503146, + "grad_norm": 0.85546875, + "learning_rate": 0.00017894634538234607, + "loss": 4.3705, + "step": 6094 + }, + { + "epoch": 0.6319786921048027, + "grad_norm": 0.94140625, + "learning_rate": 0.0001789396776670084, + "loss": 4.3103, + "step": 6095 + }, + { + "epoch": 0.6320823801592907, + "grad_norm": 0.796875, + "learning_rate": 0.00017893300902026594, + "loss": 4.2928, + "step": 6096 + }, + { + "epoch": 0.6321860682137789, + "grad_norm": 0.9375, + "learning_rate": 0.00017892633944219743, + "loss": 4.3577, + "step": 6097 + }, + { + "epoch": 0.6322897562682669, + "grad_norm": 0.76953125, + "learning_rate": 0.00017891966893288154, + "loss": 4.3371, + "step": 6098 + }, + { + "epoch": 0.632393444322755, + "grad_norm": 0.91015625, + "learning_rate": 0.00017891299749239696, + "loss": 4.336, + "step": 6099 + }, + { + "epoch": 0.632497132377243, + "grad_norm": 0.75, + "learning_rate": 0.00017890632512082243, + "loss": 4.2812, + "step": 6100 + }, + { + "epoch": 0.6326008204317312, + "grad_norm": 0.83984375, + "learning_rate": 0.0001788996518182367, + "loss": 4.2998, + "step": 6101 + }, + { + "epoch": 0.6327045084862192, + "grad_norm": 0.7421875, + "learning_rate": 0.00017889297758471846, + "loss": 4.3527, + "step": 6102 + }, + { + "epoch": 0.6328081965407073, + "grad_norm": 0.84765625, + "learning_rate": 0.00017888630242034648, + "loss": 4.3369, + "step": 6103 + }, + { + "epoch": 0.6329118845951953, + "grad_norm": 0.76171875, + "learning_rate": 0.0001788796263251995, + "loss": 4.3144, + "step": 6104 + }, + { + "epoch": 0.6330155726496834, + "grad_norm": 0.84765625, + "learning_rate": 0.00017887294929935633, + "loss": 4.2951, + "step": 6105 + }, + { + "epoch": 0.6331192607041715, + "grad_norm": 0.76171875, + "learning_rate": 0.00017886627134289573, + "loss": 4.3436, + "step": 6106 + }, + { + "epoch": 0.6332229487586596, + "grad_norm": 0.87890625, + "learning_rate": 0.0001788595924558965, + "loss": 4.3389, + "step": 6107 + }, + { + "epoch": 0.6333266368131476, + "grad_norm": 0.734375, + "learning_rate": 0.0001788529126384374, + "loss": 4.3352, + "step": 6108 + }, + { + "epoch": 0.6334303248676357, + "grad_norm": 0.86328125, + "learning_rate": 0.00017884623189059733, + "loss": 4.3023, + "step": 6109 + }, + { + "epoch": 0.6335340129221237, + "grad_norm": 0.80078125, + "learning_rate": 0.00017883955021245505, + "loss": 4.2569, + "step": 6110 + }, + { + "epoch": 0.6336377009766119, + "grad_norm": 0.81640625, + "learning_rate": 0.0001788328676040894, + "loss": 4.3254, + "step": 6111 + }, + { + "epoch": 0.6337413890310999, + "grad_norm": 0.9140625, + "learning_rate": 0.00017882618406557922, + "loss": 4.2926, + "step": 6112 + }, + { + "epoch": 0.633845077085588, + "grad_norm": 0.7421875, + "learning_rate": 0.00017881949959700343, + "loss": 4.3386, + "step": 6113 + }, + { + "epoch": 0.633948765140076, + "grad_norm": 0.765625, + "learning_rate": 0.00017881281419844088, + "loss": 4.3414, + "step": 6114 + }, + { + "epoch": 0.6340524531945642, + "grad_norm": 0.7265625, + "learning_rate": 0.00017880612786997039, + "loss": 4.2778, + "step": 6115 + }, + { + "epoch": 0.6341561412490522, + "grad_norm": 0.828125, + "learning_rate": 0.00017879944061167092, + "loss": 4.3278, + "step": 6116 + }, + { + "epoch": 0.6342598293035403, + "grad_norm": 0.65234375, + "learning_rate": 0.0001787927524236213, + "loss": 4.3591, + "step": 6117 + }, + { + "epoch": 0.6343635173580283, + "grad_norm": 0.78125, + "learning_rate": 0.00017878606330590054, + "loss": 4.3536, + "step": 6118 + }, + { + "epoch": 0.6344672054125164, + "grad_norm": 0.66015625, + "learning_rate": 0.00017877937325858748, + "loss": 4.3563, + "step": 6119 + }, + { + "epoch": 0.6345708934670046, + "grad_norm": 0.7421875, + "learning_rate": 0.00017877268228176112, + "loss": 4.3259, + "step": 6120 + }, + { + "epoch": 0.6346745815214926, + "grad_norm": 0.59765625, + "learning_rate": 0.00017876599037550036, + "loss": 4.317, + "step": 6121 + }, + { + "epoch": 0.6347782695759807, + "grad_norm": 0.77734375, + "learning_rate": 0.00017875929753988416, + "loss": 4.3243, + "step": 6122 + }, + { + "epoch": 0.6348819576304687, + "grad_norm": 0.62109375, + "learning_rate": 0.00017875260377499152, + "loss": 4.2985, + "step": 6123 + }, + { + "epoch": 0.6349856456849569, + "grad_norm": 0.68359375, + "learning_rate": 0.0001787459090809014, + "loss": 4.3338, + "step": 6124 + }, + { + "epoch": 0.6350893337394449, + "grad_norm": 0.6171875, + "learning_rate": 0.0001787392134576928, + "loss": 4.3434, + "step": 6125 + }, + { + "epoch": 0.635193021793933, + "grad_norm": 0.7890625, + "learning_rate": 0.00017873251690544469, + "loss": 4.3351, + "step": 6126 + }, + { + "epoch": 0.635296709848421, + "grad_norm": 0.66015625, + "learning_rate": 0.0001787258194242361, + "loss": 4.3537, + "step": 6127 + }, + { + "epoch": 0.6354003979029091, + "grad_norm": 0.69140625, + "learning_rate": 0.00017871912101414609, + "loss": 4.293, + "step": 6128 + }, + { + "epoch": 0.6355040859573972, + "grad_norm": 0.671875, + "learning_rate": 0.0001787124216752536, + "loss": 4.3296, + "step": 6129 + }, + { + "epoch": 0.6356077740118853, + "grad_norm": 0.69140625, + "learning_rate": 0.0001787057214076378, + "loss": 4.2974, + "step": 6130 + }, + { + "epoch": 0.6357114620663733, + "grad_norm": 0.61328125, + "learning_rate": 0.00017869902021137765, + "loss": 4.3578, + "step": 6131 + }, + { + "epoch": 0.6358151501208614, + "grad_norm": 0.625, + "learning_rate": 0.00017869231808655226, + "loss": 4.3119, + "step": 6132 + }, + { + "epoch": 0.6359188381753494, + "grad_norm": 0.66796875, + "learning_rate": 0.00017868561503324071, + "loss": 4.3112, + "step": 6133 + }, + { + "epoch": 0.6360225262298376, + "grad_norm": 0.62109375, + "learning_rate": 0.00017867891105152205, + "loss": 4.3483, + "step": 6134 + }, + { + "epoch": 0.6361262142843256, + "grad_norm": 0.69140625, + "learning_rate": 0.00017867220614147544, + "loss": 4.2971, + "step": 6135 + }, + { + "epoch": 0.6362299023388137, + "grad_norm": 0.66015625, + "learning_rate": 0.00017866550030317993, + "loss": 4.2936, + "step": 6136 + }, + { + "epoch": 0.6363335903933017, + "grad_norm": 0.6953125, + "learning_rate": 0.0001786587935367147, + "loss": 4.294, + "step": 6137 + }, + { + "epoch": 0.6364372784477899, + "grad_norm": 0.71875, + "learning_rate": 0.0001786520858421588, + "loss": 4.2954, + "step": 6138 + }, + { + "epoch": 0.6365409665022779, + "grad_norm": 0.73828125, + "learning_rate": 0.00017864537721959148, + "loss": 4.3349, + "step": 6139 + }, + { + "epoch": 0.636644654556766, + "grad_norm": 0.68359375, + "learning_rate": 0.00017863866766909181, + "loss": 4.3433, + "step": 6140 + }, + { + "epoch": 0.636748342611254, + "grad_norm": 0.72265625, + "learning_rate": 0.00017863195719073897, + "loss": 4.3523, + "step": 6141 + }, + { + "epoch": 0.6368520306657421, + "grad_norm": 0.69140625, + "learning_rate": 0.0001786252457846122, + "loss": 4.3301, + "step": 6142 + }, + { + "epoch": 0.6369557187202302, + "grad_norm": 0.71875, + "learning_rate": 0.0001786185334507906, + "loss": 4.3225, + "step": 6143 + }, + { + "epoch": 0.6370594067747183, + "grad_norm": 0.73828125, + "learning_rate": 0.00017861182018935343, + "loss": 4.3684, + "step": 6144 + }, + { + "epoch": 0.6371630948292063, + "grad_norm": 0.6640625, + "learning_rate": 0.0001786051060003799, + "loss": 4.3492, + "step": 6145 + }, + { + "epoch": 0.6372667828836944, + "grad_norm": 0.7265625, + "learning_rate": 0.00017859839088394915, + "loss": 4.3687, + "step": 6146 + }, + { + "epoch": 0.6373704709381824, + "grad_norm": 0.671875, + "learning_rate": 0.00017859167484014053, + "loss": 4.2824, + "step": 6147 + }, + { + "epoch": 0.6374741589926706, + "grad_norm": 0.69140625, + "learning_rate": 0.00017858495786903317, + "loss": 4.3049, + "step": 6148 + }, + { + "epoch": 0.6375778470471586, + "grad_norm": 0.640625, + "learning_rate": 0.00017857823997070643, + "loss": 4.2993, + "step": 6149 + }, + { + "epoch": 0.6376815351016467, + "grad_norm": 0.671875, + "learning_rate": 0.00017857152114523944, + "loss": 4.3058, + "step": 6150 + }, + { + "epoch": 0.6377852231561347, + "grad_norm": 0.65625, + "learning_rate": 0.0001785648013927116, + "loss": 4.346, + "step": 6151 + }, + { + "epoch": 0.6378889112106229, + "grad_norm": 0.64453125, + "learning_rate": 0.00017855808071320217, + "loss": 4.3001, + "step": 6152 + }, + { + "epoch": 0.6379925992651109, + "grad_norm": 0.65625, + "learning_rate": 0.0001785513591067904, + "loss": 4.3162, + "step": 6153 + }, + { + "epoch": 0.638096287319599, + "grad_norm": 0.65625, + "learning_rate": 0.00017854463657355566, + "loss": 4.3288, + "step": 6154 + }, + { + "epoch": 0.638199975374087, + "grad_norm": 0.609375, + "learning_rate": 0.0001785379131135772, + "loss": 4.3129, + "step": 6155 + }, + { + "epoch": 0.6383036634285751, + "grad_norm": 0.6796875, + "learning_rate": 0.0001785311887269344, + "loss": 4.3475, + "step": 6156 + }, + { + "epoch": 0.6384073514830632, + "grad_norm": 0.65234375, + "learning_rate": 0.00017852446341370658, + "loss": 4.3083, + "step": 6157 + }, + { + "epoch": 0.6385110395375513, + "grad_norm": 0.734375, + "learning_rate": 0.00017851773717397307, + "loss": 4.3457, + "step": 6158 + }, + { + "epoch": 0.6386147275920393, + "grad_norm": 0.671875, + "learning_rate": 0.0001785110100078133, + "loss": 4.318, + "step": 6159 + }, + { + "epoch": 0.6387184156465274, + "grad_norm": 0.6484375, + "learning_rate": 0.00017850428191530657, + "loss": 4.365, + "step": 6160 + }, + { + "epoch": 0.6388221037010154, + "grad_norm": 0.63671875, + "learning_rate": 0.0001784975528965323, + "loss": 4.3537, + "step": 6161 + }, + { + "epoch": 0.6389257917555036, + "grad_norm": 0.609375, + "learning_rate": 0.00017849082295156988, + "loss": 4.369, + "step": 6162 + }, + { + "epoch": 0.6390294798099916, + "grad_norm": 0.671875, + "learning_rate": 0.00017848409208049874, + "loss": 4.2885, + "step": 6163 + }, + { + "epoch": 0.6391331678644797, + "grad_norm": 0.65234375, + "learning_rate": 0.00017847736028339824, + "loss": 4.3276, + "step": 6164 + }, + { + "epoch": 0.6392368559189678, + "grad_norm": 0.67578125, + "learning_rate": 0.00017847062756034786, + "loss": 4.2959, + "step": 6165 + }, + { + "epoch": 0.6393405439734559, + "grad_norm": 0.62890625, + "learning_rate": 0.00017846389391142705, + "loss": 4.3502, + "step": 6166 + }, + { + "epoch": 0.639444232027944, + "grad_norm": 0.67578125, + "learning_rate": 0.0001784571593367152, + "loss": 4.3225, + "step": 6167 + }, + { + "epoch": 0.639547920082432, + "grad_norm": 0.64453125, + "learning_rate": 0.0001784504238362918, + "loss": 4.3102, + "step": 6168 + }, + { + "epoch": 0.6396516081369201, + "grad_norm": 0.7109375, + "learning_rate": 0.00017844368741023634, + "loss": 4.3177, + "step": 6169 + }, + { + "epoch": 0.6397552961914081, + "grad_norm": 0.73828125, + "learning_rate": 0.00017843695005862828, + "loss": 4.3462, + "step": 6170 + }, + { + "epoch": 0.6398589842458963, + "grad_norm": 0.6484375, + "learning_rate": 0.00017843021178154712, + "loss": 4.338, + "step": 6171 + }, + { + "epoch": 0.6399626723003843, + "grad_norm": 0.65625, + "learning_rate": 0.00017842347257907237, + "loss": 4.2944, + "step": 6172 + }, + { + "epoch": 0.6400663603548724, + "grad_norm": 0.734375, + "learning_rate": 0.00017841673245128355, + "loss": 4.3055, + "step": 6173 + }, + { + "epoch": 0.6401700484093604, + "grad_norm": 0.6484375, + "learning_rate": 0.00017840999139826015, + "loss": 4.3246, + "step": 6174 + }, + { + "epoch": 0.6402737364638486, + "grad_norm": 0.72265625, + "learning_rate": 0.00017840324942008175, + "loss": 4.2996, + "step": 6175 + }, + { + "epoch": 0.6403774245183366, + "grad_norm": 0.640625, + "learning_rate": 0.0001783965065168279, + "loss": 4.3144, + "step": 6176 + }, + { + "epoch": 0.6404811125728247, + "grad_norm": 0.7109375, + "learning_rate": 0.00017838976268857813, + "loss": 4.3124, + "step": 6177 + }, + { + "epoch": 0.6405848006273127, + "grad_norm": 0.65625, + "learning_rate": 0.000178383017935412, + "loss": 4.3134, + "step": 6178 + }, + { + "epoch": 0.6406884886818008, + "grad_norm": 0.671875, + "learning_rate": 0.0001783762722574091, + "loss": 4.3734, + "step": 6179 + }, + { + "epoch": 0.6407921767362889, + "grad_norm": 0.66015625, + "learning_rate": 0.0001783695256546491, + "loss": 4.359, + "step": 6180 + }, + { + "epoch": 0.640895864790777, + "grad_norm": 0.640625, + "learning_rate": 0.00017836277812721148, + "loss": 4.3412, + "step": 6181 + }, + { + "epoch": 0.640999552845265, + "grad_norm": 0.6796875, + "learning_rate": 0.00017835602967517593, + "loss": 4.3363, + "step": 6182 + }, + { + "epoch": 0.6411032408997531, + "grad_norm": 0.640625, + "learning_rate": 0.00017834928029862205, + "loss": 4.3228, + "step": 6183 + }, + { + "epoch": 0.6412069289542411, + "grad_norm": 0.70703125, + "learning_rate": 0.0001783425299976295, + "loss": 4.3238, + "step": 6184 + }, + { + "epoch": 0.6413106170087293, + "grad_norm": 0.70703125, + "learning_rate": 0.00017833577877227793, + "loss": 4.2995, + "step": 6185 + }, + { + "epoch": 0.6414143050632173, + "grad_norm": 0.6640625, + "learning_rate": 0.0001783290266226469, + "loss": 4.3071, + "step": 6186 + }, + { + "epoch": 0.6415179931177054, + "grad_norm": 0.6953125, + "learning_rate": 0.0001783222735488162, + "loss": 4.3645, + "step": 6187 + }, + { + "epoch": 0.6416216811721934, + "grad_norm": 0.68359375, + "learning_rate": 0.00017831551955086545, + "loss": 4.345, + "step": 6188 + }, + { + "epoch": 0.6417253692266816, + "grad_norm": 0.72265625, + "learning_rate": 0.0001783087646288744, + "loss": 4.3403, + "step": 6189 + }, + { + "epoch": 0.6418290572811696, + "grad_norm": 0.71875, + "learning_rate": 0.00017830200878292263, + "loss": 4.3157, + "step": 6190 + }, + { + "epoch": 0.6419327453356577, + "grad_norm": 0.76171875, + "learning_rate": 0.00017829525201308998, + "loss": 4.3469, + "step": 6191 + }, + { + "epoch": 0.6420364333901457, + "grad_norm": 0.73828125, + "learning_rate": 0.00017828849431945608, + "loss": 4.3377, + "step": 6192 + }, + { + "epoch": 0.6421401214446338, + "grad_norm": 0.75, + "learning_rate": 0.00017828173570210072, + "loss": 4.3062, + "step": 6193 + }, + { + "epoch": 0.6422438094991219, + "grad_norm": 0.6875, + "learning_rate": 0.0001782749761611036, + "loss": 4.3202, + "step": 6194 + }, + { + "epoch": 0.64234749755361, + "grad_norm": 0.703125, + "learning_rate": 0.00017826821569654454, + "loss": 4.301, + "step": 6195 + }, + { + "epoch": 0.642451185608098, + "grad_norm": 0.66015625, + "learning_rate": 0.00017826145430850329, + "loss": 4.2708, + "step": 6196 + }, + { + "epoch": 0.6425548736625861, + "grad_norm": 0.70703125, + "learning_rate": 0.00017825469199705954, + "loss": 4.3532, + "step": 6197 + }, + { + "epoch": 0.6426585617170741, + "grad_norm": 0.6640625, + "learning_rate": 0.00017824792876229318, + "loss": 4.2841, + "step": 6198 + }, + { + "epoch": 0.6427622497715623, + "grad_norm": 0.65234375, + "learning_rate": 0.00017824116460428394, + "loss": 4.3245, + "step": 6199 + }, + { + "epoch": 0.6428659378260503, + "grad_norm": 0.70703125, + "learning_rate": 0.00017823439952311168, + "loss": 4.3055, + "step": 6200 + }, + { + "epoch": 0.6429696258805384, + "grad_norm": 0.62890625, + "learning_rate": 0.00017822763351885623, + "loss": 4.308, + "step": 6201 + }, + { + "epoch": 0.6430733139350264, + "grad_norm": 0.671875, + "learning_rate": 0.00017822086659159738, + "loss": 4.3525, + "step": 6202 + }, + { + "epoch": 0.6431770019895146, + "grad_norm": 0.62890625, + "learning_rate": 0.00017821409874141497, + "loss": 4.3367, + "step": 6203 + }, + { + "epoch": 0.6432806900440026, + "grad_norm": 0.70703125, + "learning_rate": 0.0001782073299683889, + "loss": 4.332, + "step": 6204 + }, + { + "epoch": 0.6433843780984907, + "grad_norm": 0.58203125, + "learning_rate": 0.00017820056027259895, + "loss": 4.3704, + "step": 6205 + }, + { + "epoch": 0.6434880661529787, + "grad_norm": 0.6875, + "learning_rate": 0.0001781937896541251, + "loss": 4.3155, + "step": 6206 + }, + { + "epoch": 0.6435917542074668, + "grad_norm": 0.5703125, + "learning_rate": 0.00017818701811304717, + "loss": 4.2937, + "step": 6207 + }, + { + "epoch": 0.6436954422619549, + "grad_norm": 0.671875, + "learning_rate": 0.00017818024564944507, + "loss": 4.2761, + "step": 6208 + }, + { + "epoch": 0.643799130316443, + "grad_norm": 0.5703125, + "learning_rate": 0.00017817347226339872, + "loss": 4.2736, + "step": 6209 + }, + { + "epoch": 0.6439028183709311, + "grad_norm": 0.640625, + "learning_rate": 0.00017816669795498805, + "loss": 4.309, + "step": 6210 + }, + { + "epoch": 0.6440065064254191, + "grad_norm": 0.62890625, + "learning_rate": 0.00017815992272429294, + "loss": 4.2983, + "step": 6211 + }, + { + "epoch": 0.6441101944799072, + "grad_norm": 0.66015625, + "learning_rate": 0.00017815314657139336, + "loss": 4.315, + "step": 6212 + }, + { + "epoch": 0.6442138825343953, + "grad_norm": 0.6640625, + "learning_rate": 0.00017814636949636928, + "loss": 4.2786, + "step": 6213 + }, + { + "epoch": 0.6443175705888834, + "grad_norm": 0.73828125, + "learning_rate": 0.0001781395914993006, + "loss": 4.2994, + "step": 6214 + }, + { + "epoch": 0.6444212586433714, + "grad_norm": 0.65234375, + "learning_rate": 0.0001781328125802674, + "loss": 4.3356, + "step": 6215 + }, + { + "epoch": 0.6445249466978595, + "grad_norm": 0.71875, + "learning_rate": 0.00017812603273934956, + "loss": 4.3215, + "step": 6216 + }, + { + "epoch": 0.6446286347523476, + "grad_norm": 0.70703125, + "learning_rate": 0.00017811925197662714, + "loss": 4.3373, + "step": 6217 + }, + { + "epoch": 0.6447323228068357, + "grad_norm": 0.7578125, + "learning_rate": 0.0001781124702921801, + "loss": 4.317, + "step": 6218 + }, + { + "epoch": 0.6448360108613237, + "grad_norm": 0.66796875, + "learning_rate": 0.00017810568768608848, + "loss": 4.3405, + "step": 6219 + }, + { + "epoch": 0.6449396989158118, + "grad_norm": 0.6953125, + "learning_rate": 0.00017809890415843236, + "loss": 4.3034, + "step": 6220 + }, + { + "epoch": 0.6450433869702998, + "grad_norm": 0.6484375, + "learning_rate": 0.00017809211970929166, + "loss": 4.3331, + "step": 6221 + }, + { + "epoch": 0.645147075024788, + "grad_norm": 0.67578125, + "learning_rate": 0.00017808533433874652, + "loss": 4.3232, + "step": 6222 + }, + { + "epoch": 0.645250763079276, + "grad_norm": 0.6875, + "learning_rate": 0.00017807854804687695, + "loss": 4.3423, + "step": 6223 + }, + { + "epoch": 0.6453544511337641, + "grad_norm": 0.6796875, + "learning_rate": 0.0001780717608337631, + "loss": 4.3282, + "step": 6224 + }, + { + "epoch": 0.6454581391882521, + "grad_norm": 0.80078125, + "learning_rate": 0.00017806497269948497, + "loss": 4.3107, + "step": 6225 + }, + { + "epoch": 0.6455618272427402, + "grad_norm": 0.72265625, + "learning_rate": 0.00017805818364412268, + "loss": 4.3284, + "step": 6226 + }, + { + "epoch": 0.6456655152972283, + "grad_norm": 0.703125, + "learning_rate": 0.0001780513936677563, + "loss": 4.3306, + "step": 6227 + }, + { + "epoch": 0.6457692033517164, + "grad_norm": 0.77734375, + "learning_rate": 0.000178044602770466, + "loss": 4.3592, + "step": 6228 + }, + { + "epoch": 0.6458728914062044, + "grad_norm": 0.72265625, + "learning_rate": 0.00017803781095233193, + "loss": 4.3433, + "step": 6229 + }, + { + "epoch": 0.6459765794606925, + "grad_norm": 0.671875, + "learning_rate": 0.00017803101821343411, + "loss": 4.3218, + "step": 6230 + }, + { + "epoch": 0.6460802675151806, + "grad_norm": 0.71484375, + "learning_rate": 0.00017802422455385282, + "loss": 4.3262, + "step": 6231 + }, + { + "epoch": 0.6461839555696687, + "grad_norm": 0.6953125, + "learning_rate": 0.0001780174299736681, + "loss": 4.3295, + "step": 6232 + }, + { + "epoch": 0.6462876436241567, + "grad_norm": 0.703125, + "learning_rate": 0.0001780106344729602, + "loss": 4.3481, + "step": 6233 + }, + { + "epoch": 0.6463913316786448, + "grad_norm": 0.70703125, + "learning_rate": 0.0001780038380518093, + "loss": 4.3348, + "step": 6234 + }, + { + "epoch": 0.6464950197331328, + "grad_norm": 0.72265625, + "learning_rate": 0.00017799704071029554, + "loss": 4.3457, + "step": 6235 + }, + { + "epoch": 0.646598707787621, + "grad_norm": 0.734375, + "learning_rate": 0.00017799024244849912, + "loss": 4.3509, + "step": 6236 + }, + { + "epoch": 0.646702395842109, + "grad_norm": 0.63671875, + "learning_rate": 0.00017798344326650032, + "loss": 4.3605, + "step": 6237 + }, + { + "epoch": 0.6468060838965971, + "grad_norm": 0.703125, + "learning_rate": 0.00017797664316437928, + "loss": 4.3658, + "step": 6238 + }, + { + "epoch": 0.6469097719510851, + "grad_norm": 0.671875, + "learning_rate": 0.00017796984214221633, + "loss": 4.3078, + "step": 6239 + }, + { + "epoch": 0.6470134600055732, + "grad_norm": 0.671875, + "learning_rate": 0.00017796304020009163, + "loss": 4.3317, + "step": 6240 + }, + { + "epoch": 0.6471171480600613, + "grad_norm": 0.6328125, + "learning_rate": 0.00017795623733808544, + "loss": 4.3386, + "step": 6241 + }, + { + "epoch": 0.6472208361145494, + "grad_norm": 0.6875, + "learning_rate": 0.0001779494335562781, + "loss": 4.2955, + "step": 6242 + }, + { + "epoch": 0.6473245241690374, + "grad_norm": 0.64453125, + "learning_rate": 0.0001779426288547498, + "loss": 4.3589, + "step": 6243 + }, + { + "epoch": 0.6474282122235255, + "grad_norm": 0.65625, + "learning_rate": 0.00017793582323358088, + "loss": 4.3091, + "step": 6244 + }, + { + "epoch": 0.6475319002780136, + "grad_norm": 0.7265625, + "learning_rate": 0.00017792901669285165, + "loss": 4.3449, + "step": 6245 + }, + { + "epoch": 0.6476355883325017, + "grad_norm": 0.625, + "learning_rate": 0.00017792220923264237, + "loss": 4.3256, + "step": 6246 + }, + { + "epoch": 0.6477392763869897, + "grad_norm": 0.71875, + "learning_rate": 0.0001779154008530334, + "loss": 4.3174, + "step": 6247 + }, + { + "epoch": 0.6478429644414778, + "grad_norm": 0.64453125, + "learning_rate": 0.00017790859155410508, + "loss": 4.2901, + "step": 6248 + }, + { + "epoch": 0.6479466524959658, + "grad_norm": 0.69921875, + "learning_rate": 0.00017790178133593768, + "loss": 4.3499, + "step": 6249 + }, + { + "epoch": 0.648050340550454, + "grad_norm": 0.7734375, + "learning_rate": 0.00017789497019861166, + "loss": 4.3747, + "step": 6250 + }, + { + "epoch": 0.648154028604942, + "grad_norm": 0.68359375, + "learning_rate": 0.00017788815814220733, + "loss": 4.291, + "step": 6251 + }, + { + "epoch": 0.6482577166594301, + "grad_norm": 0.8125, + "learning_rate": 0.00017788134516680504, + "loss": 4.2845, + "step": 6252 + }, + { + "epoch": 0.6483614047139182, + "grad_norm": 0.76953125, + "learning_rate": 0.00017787453127248522, + "loss": 4.3211, + "step": 6253 + }, + { + "epoch": 0.6484650927684062, + "grad_norm": 0.76953125, + "learning_rate": 0.00017786771645932824, + "loss": 4.3539, + "step": 6254 + }, + { + "epoch": 0.6485687808228944, + "grad_norm": 0.68359375, + "learning_rate": 0.00017786090072741448, + "loss": 4.329, + "step": 6255 + }, + { + "epoch": 0.6486724688773824, + "grad_norm": 0.7734375, + "learning_rate": 0.00017785408407682444, + "loss": 4.3404, + "step": 6256 + }, + { + "epoch": 0.6487761569318705, + "grad_norm": 0.74609375, + "learning_rate": 0.00017784726650763846, + "loss": 4.3098, + "step": 6257 + }, + { + "epoch": 0.6488798449863585, + "grad_norm": 0.7265625, + "learning_rate": 0.00017784044801993706, + "loss": 4.305, + "step": 6258 + }, + { + "epoch": 0.6489835330408467, + "grad_norm": 0.73046875, + "learning_rate": 0.00017783362861380065, + "loss": 4.3093, + "step": 6259 + }, + { + "epoch": 0.6490872210953347, + "grad_norm": 0.67578125, + "learning_rate": 0.00017782680828930968, + "loss": 4.3413, + "step": 6260 + }, + { + "epoch": 0.6491909091498228, + "grad_norm": 0.671875, + "learning_rate": 0.00017781998704654466, + "loss": 4.2956, + "step": 6261 + }, + { + "epoch": 0.6492945972043108, + "grad_norm": 0.64453125, + "learning_rate": 0.00017781316488558603, + "loss": 4.3282, + "step": 6262 + }, + { + "epoch": 0.649398285258799, + "grad_norm": 0.68359375, + "learning_rate": 0.00017780634180651432, + "loss": 4.2974, + "step": 6263 + }, + { + "epoch": 0.649501973313287, + "grad_norm": 0.609375, + "learning_rate": 0.00017779951780941, + "loss": 4.3213, + "step": 6264 + }, + { + "epoch": 0.6496056613677751, + "grad_norm": 0.6484375, + "learning_rate": 0.00017779269289435365, + "loss": 4.3303, + "step": 6265 + }, + { + "epoch": 0.6497093494222631, + "grad_norm": 0.67578125, + "learning_rate": 0.00017778586706142572, + "loss": 4.3236, + "step": 6266 + }, + { + "epoch": 0.6498130374767512, + "grad_norm": 0.66015625, + "learning_rate": 0.00017777904031070682, + "loss": 4.3598, + "step": 6267 + }, + { + "epoch": 0.6499167255312392, + "grad_norm": 0.64453125, + "learning_rate": 0.0001777722126422774, + "loss": 4.3371, + "step": 6268 + }, + { + "epoch": 0.6500204135857274, + "grad_norm": 0.6953125, + "learning_rate": 0.00017776538405621812, + "loss": 4.3058, + "step": 6269 + }, + { + "epoch": 0.6501241016402154, + "grad_norm": 0.6328125, + "learning_rate": 0.00017775855455260954, + "loss": 4.2711, + "step": 6270 + }, + { + "epoch": 0.6502277896947035, + "grad_norm": 0.6875, + "learning_rate": 0.00017775172413153216, + "loss": 4.3393, + "step": 6271 + }, + { + "epoch": 0.6503314777491915, + "grad_norm": 0.640625, + "learning_rate": 0.00017774489279306663, + "loss": 4.3036, + "step": 6272 + }, + { + "epoch": 0.6504351658036797, + "grad_norm": 0.63671875, + "learning_rate": 0.00017773806053729353, + "loss": 4.3239, + "step": 6273 + }, + { + "epoch": 0.6505388538581677, + "grad_norm": 0.6328125, + "learning_rate": 0.00017773122736429353, + "loss": 4.3377, + "step": 6274 + }, + { + "epoch": 0.6506425419126558, + "grad_norm": 0.59375, + "learning_rate": 0.0001777243932741472, + "loss": 4.3155, + "step": 6275 + }, + { + "epoch": 0.6507462299671438, + "grad_norm": 0.625, + "learning_rate": 0.00017771755826693518, + "loss": 4.2653, + "step": 6276 + }, + { + "epoch": 0.650849918021632, + "grad_norm": 0.6640625, + "learning_rate": 0.00017771072234273812, + "loss": 4.3229, + "step": 6277 + }, + { + "epoch": 0.65095360607612, + "grad_norm": 0.640625, + "learning_rate": 0.00017770388550163667, + "loss": 4.3287, + "step": 6278 + }, + { + "epoch": 0.6510572941306081, + "grad_norm": 0.67578125, + "learning_rate": 0.0001776970477437115, + "loss": 4.2721, + "step": 6279 + }, + { + "epoch": 0.6511609821850961, + "grad_norm": 0.6015625, + "learning_rate": 0.00017769020906904332, + "loss": 4.3, + "step": 6280 + }, + { + "epoch": 0.6512646702395842, + "grad_norm": 0.66796875, + "learning_rate": 0.00017768336947771282, + "loss": 4.3336, + "step": 6281 + }, + { + "epoch": 0.6513683582940722, + "grad_norm": 0.64453125, + "learning_rate": 0.00017767652896980062, + "loss": 4.3592, + "step": 6282 + }, + { + "epoch": 0.6514720463485604, + "grad_norm": 0.625, + "learning_rate": 0.00017766968754538755, + "loss": 4.3167, + "step": 6283 + }, + { + "epoch": 0.6515757344030484, + "grad_norm": 0.71875, + "learning_rate": 0.0001776628452045542, + "loss": 4.278, + "step": 6284 + }, + { + "epoch": 0.6516794224575365, + "grad_norm": 0.68359375, + "learning_rate": 0.0001776560019473814, + "loss": 4.3557, + "step": 6285 + }, + { + "epoch": 0.6517831105120245, + "grad_norm": 0.72265625, + "learning_rate": 0.00017764915777394985, + "loss": 4.3205, + "step": 6286 + }, + { + "epoch": 0.6518867985665127, + "grad_norm": 0.74609375, + "learning_rate": 0.00017764231268434035, + "loss": 4.2882, + "step": 6287 + }, + { + "epoch": 0.6519904866210007, + "grad_norm": 0.64453125, + "learning_rate": 0.0001776354666786336, + "loss": 4.3018, + "step": 6288 + }, + { + "epoch": 0.6520941746754888, + "grad_norm": 0.75390625, + "learning_rate": 0.00017762861975691044, + "loss": 4.3372, + "step": 6289 + }, + { + "epoch": 0.6521978627299768, + "grad_norm": 0.66015625, + "learning_rate": 0.0001776217719192516, + "loss": 4.342, + "step": 6290 + }, + { + "epoch": 0.652301550784465, + "grad_norm": 0.84375, + "learning_rate": 0.0001776149231657379, + "loss": 4.2978, + "step": 6291 + }, + { + "epoch": 0.652405238838953, + "grad_norm": 0.6171875, + "learning_rate": 0.00017760807349645016, + "loss": 4.348, + "step": 6292 + }, + { + "epoch": 0.6525089268934411, + "grad_norm": 0.72265625, + "learning_rate": 0.00017760122291146917, + "loss": 4.2836, + "step": 6293 + }, + { + "epoch": 0.6526126149479291, + "grad_norm": 0.67578125, + "learning_rate": 0.0001775943714108758, + "loss": 4.3148, + "step": 6294 + }, + { + "epoch": 0.6527163030024172, + "grad_norm": 0.68359375, + "learning_rate": 0.00017758751899475087, + "loss": 4.304, + "step": 6295 + }, + { + "epoch": 0.6528199910569052, + "grad_norm": 0.6640625, + "learning_rate": 0.0001775806656631752, + "loss": 4.3578, + "step": 6296 + }, + { + "epoch": 0.6529236791113934, + "grad_norm": 0.7109375, + "learning_rate": 0.00017757381141622971, + "loss": 4.3294, + "step": 6297 + }, + { + "epoch": 0.6530273671658815, + "grad_norm": 0.62109375, + "learning_rate": 0.00017756695625399522, + "loss": 4.3513, + "step": 6298 + }, + { + "epoch": 0.6531310552203695, + "grad_norm": 0.609375, + "learning_rate": 0.00017756010017655266, + "loss": 4.3071, + "step": 6299 + }, + { + "epoch": 0.6532347432748576, + "grad_norm": 0.65625, + "learning_rate": 0.00017755324318398288, + "loss": 4.326, + "step": 6300 + }, + { + "epoch": 0.6533384313293457, + "grad_norm": 0.6484375, + "learning_rate": 0.00017754638527636682, + "loss": 4.2787, + "step": 6301 + }, + { + "epoch": 0.6534421193838338, + "grad_norm": 0.72265625, + "learning_rate": 0.0001775395264537854, + "loss": 4.2901, + "step": 6302 + }, + { + "epoch": 0.6535458074383218, + "grad_norm": 0.6640625, + "learning_rate": 0.0001775326667163195, + "loss": 4.3453, + "step": 6303 + }, + { + "epoch": 0.6536494954928099, + "grad_norm": 0.6796875, + "learning_rate": 0.0001775258060640501, + "loss": 4.3247, + "step": 6304 + }, + { + "epoch": 0.653753183547298, + "grad_norm": 0.7265625, + "learning_rate": 0.00017751894449705814, + "loss": 4.3338, + "step": 6305 + }, + { + "epoch": 0.6538568716017861, + "grad_norm": 0.61328125, + "learning_rate": 0.00017751208201542457, + "loss": 4.3341, + "step": 6306 + }, + { + "epoch": 0.6539605596562741, + "grad_norm": 0.7109375, + "learning_rate": 0.00017750521861923036, + "loss": 4.334, + "step": 6307 + }, + { + "epoch": 0.6540642477107622, + "grad_norm": 0.69921875, + "learning_rate": 0.00017749835430855653, + "loss": 4.3381, + "step": 6308 + }, + { + "epoch": 0.6541679357652502, + "grad_norm": 0.7265625, + "learning_rate": 0.000177491489083484, + "loss": 4.3229, + "step": 6309 + }, + { + "epoch": 0.6542716238197384, + "grad_norm": 0.66015625, + "learning_rate": 0.00017748462294409385, + "loss": 4.2798, + "step": 6310 + }, + { + "epoch": 0.6543753118742264, + "grad_norm": 0.6953125, + "learning_rate": 0.00017747775589046702, + "loss": 4.3167, + "step": 6311 + }, + { + "epoch": 0.6544789999287145, + "grad_norm": 0.72265625, + "learning_rate": 0.0001774708879226846, + "loss": 4.3234, + "step": 6312 + }, + { + "epoch": 0.6545826879832025, + "grad_norm": 0.80078125, + "learning_rate": 0.00017746401904082757, + "loss": 4.3201, + "step": 6313 + }, + { + "epoch": 0.6546863760376906, + "grad_norm": 0.6328125, + "learning_rate": 0.00017745714924497698, + "loss": 4.3389, + "step": 6314 + }, + { + "epoch": 0.6547900640921787, + "grad_norm": 0.796875, + "learning_rate": 0.00017745027853521394, + "loss": 4.2997, + "step": 6315 + }, + { + "epoch": 0.6548937521466668, + "grad_norm": 0.67578125, + "learning_rate": 0.00017744340691161947, + "loss": 4.3189, + "step": 6316 + }, + { + "epoch": 0.6549974402011548, + "grad_norm": 0.7734375, + "learning_rate": 0.00017743653437427465, + "loss": 4.2672, + "step": 6317 + }, + { + "epoch": 0.6551011282556429, + "grad_norm": 0.73046875, + "learning_rate": 0.0001774296609232606, + "loss": 4.3556, + "step": 6318 + }, + { + "epoch": 0.6552048163101309, + "grad_norm": 0.6953125, + "learning_rate": 0.00017742278655865837, + "loss": 4.3441, + "step": 6319 + }, + { + "epoch": 0.6553085043646191, + "grad_norm": 0.71875, + "learning_rate": 0.00017741591128054912, + "loss": 4.3296, + "step": 6320 + }, + { + "epoch": 0.6554121924191071, + "grad_norm": 0.7578125, + "learning_rate": 0.00017740903508901395, + "loss": 4.3605, + "step": 6321 + }, + { + "epoch": 0.6555158804735952, + "grad_norm": 0.6640625, + "learning_rate": 0.00017740215798413397, + "loss": 4.3189, + "step": 6322 + }, + { + "epoch": 0.6556195685280832, + "grad_norm": 0.70703125, + "learning_rate": 0.00017739527996599034, + "loss": 4.3237, + "step": 6323 + }, + { + "epoch": 0.6557232565825714, + "grad_norm": 0.6796875, + "learning_rate": 0.00017738840103466422, + "loss": 4.3346, + "step": 6324 + }, + { + "epoch": 0.6558269446370594, + "grad_norm": 0.6953125, + "learning_rate": 0.0001773815211902368, + "loss": 4.2839, + "step": 6325 + }, + { + "epoch": 0.6559306326915475, + "grad_norm": 0.6953125, + "learning_rate": 0.0001773746404327892, + "loss": 4.2721, + "step": 6326 + }, + { + "epoch": 0.6560343207460355, + "grad_norm": 0.734375, + "learning_rate": 0.00017736775876240265, + "loss": 4.2895, + "step": 6327 + }, + { + "epoch": 0.6561380088005236, + "grad_norm": 0.6875, + "learning_rate": 0.0001773608761791583, + "loss": 4.3336, + "step": 6328 + }, + { + "epoch": 0.6562416968550117, + "grad_norm": 0.71875, + "learning_rate": 0.00017735399268313743, + "loss": 4.2716, + "step": 6329 + }, + { + "epoch": 0.6563453849094998, + "grad_norm": 0.76171875, + "learning_rate": 0.00017734710827442118, + "loss": 4.2834, + "step": 6330 + }, + { + "epoch": 0.6564490729639878, + "grad_norm": 0.75, + "learning_rate": 0.0001773402229530908, + "loss": 4.35, + "step": 6331 + }, + { + "epoch": 0.6565527610184759, + "grad_norm": 0.71875, + "learning_rate": 0.00017733333671922756, + "loss": 4.3204, + "step": 6332 + }, + { + "epoch": 0.6566564490729639, + "grad_norm": 0.64453125, + "learning_rate": 0.0001773264495729127, + "loss": 4.3374, + "step": 6333 + }, + { + "epoch": 0.6567601371274521, + "grad_norm": 0.70703125, + "learning_rate": 0.00017731956151422745, + "loss": 4.3615, + "step": 6334 + }, + { + "epoch": 0.6568638251819401, + "grad_norm": 0.67578125, + "learning_rate": 0.00017731267254325316, + "loss": 4.3116, + "step": 6335 + }, + { + "epoch": 0.6569675132364282, + "grad_norm": 0.765625, + "learning_rate": 0.00017730578266007097, + "loss": 4.3278, + "step": 6336 + }, + { + "epoch": 0.6570712012909162, + "grad_norm": 0.63671875, + "learning_rate": 0.00017729889186476232, + "loss": 4.3253, + "step": 6337 + }, + { + "epoch": 0.6571748893454044, + "grad_norm": 0.7421875, + "learning_rate": 0.00017729200015740844, + "loss": 4.2923, + "step": 6338 + }, + { + "epoch": 0.6572785773998924, + "grad_norm": 0.65625, + "learning_rate": 0.00017728510753809066, + "loss": 4.2721, + "step": 6339 + }, + { + "epoch": 0.6573822654543805, + "grad_norm": 0.76171875, + "learning_rate": 0.0001772782140068903, + "loss": 4.3146, + "step": 6340 + }, + { + "epoch": 0.6574859535088685, + "grad_norm": 0.62890625, + "learning_rate": 0.00017727131956388873, + "loss": 4.3398, + "step": 6341 + }, + { + "epoch": 0.6575896415633566, + "grad_norm": 0.78125, + "learning_rate": 0.00017726442420916723, + "loss": 4.3239, + "step": 6342 + }, + { + "epoch": 0.6576933296178448, + "grad_norm": 0.75, + "learning_rate": 0.0001772575279428072, + "loss": 4.3379, + "step": 6343 + }, + { + "epoch": 0.6577970176723328, + "grad_norm": 0.7421875, + "learning_rate": 0.00017725063076489003, + "loss": 4.3078, + "step": 6344 + }, + { + "epoch": 0.6579007057268209, + "grad_norm": 0.80078125, + "learning_rate": 0.00017724373267549704, + "loss": 4.3641, + "step": 6345 + }, + { + "epoch": 0.6580043937813089, + "grad_norm": 0.72265625, + "learning_rate": 0.00017723683367470966, + "loss": 4.3075, + "step": 6346 + }, + { + "epoch": 0.658108081835797, + "grad_norm": 0.75, + "learning_rate": 0.00017722993376260933, + "loss": 4.3449, + "step": 6347 + }, + { + "epoch": 0.6582117698902851, + "grad_norm": 0.64453125, + "learning_rate": 0.00017722303293927737, + "loss": 4.345, + "step": 6348 + }, + { + "epoch": 0.6583154579447732, + "grad_norm": 0.71484375, + "learning_rate": 0.00017721613120479524, + "loss": 4.3421, + "step": 6349 + }, + { + "epoch": 0.6584191459992612, + "grad_norm": 0.640625, + "learning_rate": 0.00017720922855924442, + "loss": 4.31, + "step": 6350 + }, + { + "epoch": 0.6585228340537493, + "grad_norm": 0.68359375, + "learning_rate": 0.0001772023250027063, + "loss": 4.2911, + "step": 6351 + }, + { + "epoch": 0.6586265221082374, + "grad_norm": 0.6328125, + "learning_rate": 0.0001771954205352624, + "loss": 4.3133, + "step": 6352 + }, + { + "epoch": 0.6587302101627255, + "grad_norm": 0.68359375, + "learning_rate": 0.00017718851515699407, + "loss": 4.2883, + "step": 6353 + }, + { + "epoch": 0.6588338982172135, + "grad_norm": 0.75390625, + "learning_rate": 0.00017718160886798288, + "loss": 4.3507, + "step": 6354 + }, + { + "epoch": 0.6589375862717016, + "grad_norm": 0.703125, + "learning_rate": 0.00017717470166831028, + "loss": 4.3606, + "step": 6355 + }, + { + "epoch": 0.6590412743261896, + "grad_norm": 0.69140625, + "learning_rate": 0.0001771677935580578, + "loss": 4.3079, + "step": 6356 + }, + { + "epoch": 0.6591449623806778, + "grad_norm": 0.72265625, + "learning_rate": 0.00017716088453730692, + "loss": 4.3126, + "step": 6357 + }, + { + "epoch": 0.6592486504351658, + "grad_norm": 0.71484375, + "learning_rate": 0.0001771539746061391, + "loss": 4.3268, + "step": 6358 + }, + { + "epoch": 0.6593523384896539, + "grad_norm": 0.75, + "learning_rate": 0.00017714706376463602, + "loss": 4.3424, + "step": 6359 + }, + { + "epoch": 0.6594560265441419, + "grad_norm": 0.67578125, + "learning_rate": 0.00017714015201287912, + "loss": 4.3265, + "step": 6360 + }, + { + "epoch": 0.65955971459863, + "grad_norm": 0.7421875, + "learning_rate": 0.00017713323935094995, + "loss": 4.3295, + "step": 6361 + }, + { + "epoch": 0.6596634026531181, + "grad_norm": 0.6640625, + "learning_rate": 0.0001771263257789301, + "loss": 4.3506, + "step": 6362 + }, + { + "epoch": 0.6597670907076062, + "grad_norm": 0.75390625, + "learning_rate": 0.0001771194112969011, + "loss": 4.3425, + "step": 6363 + }, + { + "epoch": 0.6598707787620942, + "grad_norm": 0.68359375, + "learning_rate": 0.0001771124959049446, + "loss": 4.3069, + "step": 6364 + }, + { + "epoch": 0.6599744668165823, + "grad_norm": 0.66015625, + "learning_rate": 0.00017710557960314214, + "loss": 4.3119, + "step": 6365 + }, + { + "epoch": 0.6600781548710704, + "grad_norm": 0.68359375, + "learning_rate": 0.00017709866239157536, + "loss": 4.3332, + "step": 6366 + }, + { + "epoch": 0.6601818429255585, + "grad_norm": 0.6875, + "learning_rate": 0.00017709174427032583, + "loss": 4.3551, + "step": 6367 + }, + { + "epoch": 0.6602855309800465, + "grad_norm": 0.74609375, + "learning_rate": 0.00017708482523947524, + "loss": 4.3009, + "step": 6368 + }, + { + "epoch": 0.6603892190345346, + "grad_norm": 0.65234375, + "learning_rate": 0.0001770779052991052, + "loss": 4.2607, + "step": 6369 + }, + { + "epoch": 0.6604929070890226, + "grad_norm": 0.73046875, + "learning_rate": 0.00017707098444929732, + "loss": 4.277, + "step": 6370 + }, + { + "epoch": 0.6605965951435108, + "grad_norm": 0.64453125, + "learning_rate": 0.00017706406269013333, + "loss": 4.3004, + "step": 6371 + }, + { + "epoch": 0.6607002831979988, + "grad_norm": 0.7578125, + "learning_rate": 0.0001770571400216948, + "loss": 4.325, + "step": 6372 + }, + { + "epoch": 0.6608039712524869, + "grad_norm": 0.6640625, + "learning_rate": 0.00017705021644406354, + "loss": 4.2894, + "step": 6373 + }, + { + "epoch": 0.6609076593069749, + "grad_norm": 0.734375, + "learning_rate": 0.00017704329195732113, + "loss": 4.3139, + "step": 6374 + }, + { + "epoch": 0.661011347361463, + "grad_norm": 0.734375, + "learning_rate": 0.0001770363665615493, + "loss": 4.3121, + "step": 6375 + }, + { + "epoch": 0.6611150354159511, + "grad_norm": 0.765625, + "learning_rate": 0.00017702944025682981, + "loss": 4.3245, + "step": 6376 + }, + { + "epoch": 0.6612187234704392, + "grad_norm": 0.83203125, + "learning_rate": 0.00017702251304324435, + "loss": 4.328, + "step": 6377 + }, + { + "epoch": 0.6613224115249272, + "grad_norm": 0.7265625, + "learning_rate": 0.00017701558492087463, + "loss": 4.329, + "step": 6378 + }, + { + "epoch": 0.6614260995794153, + "grad_norm": 0.67578125, + "learning_rate": 0.00017700865588980244, + "loss": 4.3363, + "step": 6379 + }, + { + "epoch": 0.6615297876339034, + "grad_norm": 0.82421875, + "learning_rate": 0.0001770017259501095, + "loss": 4.2961, + "step": 6380 + }, + { + "epoch": 0.6616334756883915, + "grad_norm": 0.6484375, + "learning_rate": 0.0001769947951018776, + "loss": 4.3053, + "step": 6381 + }, + { + "epoch": 0.6617371637428795, + "grad_norm": 0.7734375, + "learning_rate": 0.00017698786334518848, + "loss": 4.3419, + "step": 6382 + }, + { + "epoch": 0.6618408517973676, + "grad_norm": 0.640625, + "learning_rate": 0.00017698093068012398, + "loss": 4.3493, + "step": 6383 + }, + { + "epoch": 0.6619445398518556, + "grad_norm": 0.74609375, + "learning_rate": 0.00017697399710676586, + "loss": 4.2962, + "step": 6384 + }, + { + "epoch": 0.6620482279063438, + "grad_norm": 0.66796875, + "learning_rate": 0.00017696706262519592, + "loss": 4.3222, + "step": 6385 + }, + { + "epoch": 0.6621519159608318, + "grad_norm": 0.73828125, + "learning_rate": 0.00017696012723549602, + "loss": 4.2923, + "step": 6386 + }, + { + "epoch": 0.6622556040153199, + "grad_norm": 0.6953125, + "learning_rate": 0.00017695319093774796, + "loss": 4.3247, + "step": 6387 + }, + { + "epoch": 0.662359292069808, + "grad_norm": 0.70703125, + "learning_rate": 0.0001769462537320336, + "loss": 4.2963, + "step": 6388 + }, + { + "epoch": 0.662462980124296, + "grad_norm": 0.640625, + "learning_rate": 0.00017693931561843477, + "loss": 4.3119, + "step": 6389 + }, + { + "epoch": 0.6625666681787842, + "grad_norm": 0.71484375, + "learning_rate": 0.00017693237659703335, + "loss": 4.3639, + "step": 6390 + }, + { + "epoch": 0.6626703562332722, + "grad_norm": 0.68359375, + "learning_rate": 0.00017692543666791123, + "loss": 4.2999, + "step": 6391 + }, + { + "epoch": 0.6627740442877603, + "grad_norm": 0.72265625, + "learning_rate": 0.00017691849583115023, + "loss": 4.3166, + "step": 6392 + }, + { + "epoch": 0.6628777323422483, + "grad_norm": 0.7109375, + "learning_rate": 0.0001769115540868323, + "loss": 4.3059, + "step": 6393 + }, + { + "epoch": 0.6629814203967365, + "grad_norm": 0.78125, + "learning_rate": 0.00017690461143503932, + "loss": 4.3199, + "step": 6394 + }, + { + "epoch": 0.6630851084512245, + "grad_norm": 0.625, + "learning_rate": 0.00017689766787585325, + "loss": 4.3414, + "step": 6395 + }, + { + "epoch": 0.6631887965057126, + "grad_norm": 0.8203125, + "learning_rate": 0.00017689072340935596, + "loss": 4.2836, + "step": 6396 + }, + { + "epoch": 0.6632924845602006, + "grad_norm": 0.7109375, + "learning_rate": 0.00017688377803562938, + "loss": 4.3232, + "step": 6397 + }, + { + "epoch": 0.6633961726146888, + "grad_norm": 0.7265625, + "learning_rate": 0.00017687683175475556, + "loss": 4.3835, + "step": 6398 + }, + { + "epoch": 0.6634998606691768, + "grad_norm": 0.77734375, + "learning_rate": 0.00017686988456681632, + "loss": 4.3314, + "step": 6399 + }, + { + "epoch": 0.6636035487236649, + "grad_norm": 0.79296875, + "learning_rate": 0.00017686293647189373, + "loss": 4.3314, + "step": 6400 + }, + { + "epoch": 0.6637072367781529, + "grad_norm": 0.75390625, + "learning_rate": 0.00017685598747006976, + "loss": 4.2765, + "step": 6401 + }, + { + "epoch": 0.663810924832641, + "grad_norm": 0.84765625, + "learning_rate": 0.00017684903756142635, + "loss": 4.2805, + "step": 6402 + }, + { + "epoch": 0.663914612887129, + "grad_norm": 0.671875, + "learning_rate": 0.0001768420867460455, + "loss": 4.3317, + "step": 6403 + }, + { + "epoch": 0.6640183009416172, + "grad_norm": 0.76171875, + "learning_rate": 0.0001768351350240093, + "loss": 4.3102, + "step": 6404 + }, + { + "epoch": 0.6641219889961052, + "grad_norm": 0.71875, + "learning_rate": 0.00017682818239539972, + "loss": 4.3507, + "step": 6405 + }, + { + "epoch": 0.6642256770505933, + "grad_norm": 0.73828125, + "learning_rate": 0.0001768212288602988, + "loss": 4.2997, + "step": 6406 + }, + { + "epoch": 0.6643293651050813, + "grad_norm": 0.73046875, + "learning_rate": 0.00017681427441878855, + "loss": 4.2939, + "step": 6407 + }, + { + "epoch": 0.6644330531595695, + "grad_norm": 0.7265625, + "learning_rate": 0.00017680731907095107, + "loss": 4.3014, + "step": 6408 + }, + { + "epoch": 0.6645367412140575, + "grad_norm": 0.76953125, + "learning_rate": 0.00017680036281686844, + "loss": 4.3149, + "step": 6409 + }, + { + "epoch": 0.6646404292685456, + "grad_norm": 0.7578125, + "learning_rate": 0.00017679340565662267, + "loss": 4.2542, + "step": 6410 + }, + { + "epoch": 0.6647441173230336, + "grad_norm": 0.79296875, + "learning_rate": 0.00017678644759029592, + "loss": 4.333, + "step": 6411 + }, + { + "epoch": 0.6648478053775218, + "grad_norm": 0.69921875, + "learning_rate": 0.00017677948861797026, + "loss": 4.3367, + "step": 6412 + }, + { + "epoch": 0.6649514934320098, + "grad_norm": 0.7421875, + "learning_rate": 0.00017677252873972776, + "loss": 4.3524, + "step": 6413 + }, + { + "epoch": 0.6650551814864979, + "grad_norm": 0.734375, + "learning_rate": 0.00017676556795565061, + "loss": 4.318, + "step": 6414 + }, + { + "epoch": 0.6651588695409859, + "grad_norm": 0.7578125, + "learning_rate": 0.0001767586062658209, + "loss": 4.3213, + "step": 6415 + }, + { + "epoch": 0.665262557595474, + "grad_norm": 0.8125, + "learning_rate": 0.00017675164367032077, + "loss": 4.3024, + "step": 6416 + }, + { + "epoch": 0.665366245649962, + "grad_norm": 0.8359375, + "learning_rate": 0.00017674468016923232, + "loss": 4.2829, + "step": 6417 + }, + { + "epoch": 0.6654699337044502, + "grad_norm": 0.6953125, + "learning_rate": 0.00017673771576263782, + "loss": 4.3319, + "step": 6418 + }, + { + "epoch": 0.6655736217589382, + "grad_norm": 0.87890625, + "learning_rate": 0.00017673075045061937, + "loss": 4.3104, + "step": 6419 + }, + { + "epoch": 0.6656773098134263, + "grad_norm": 0.74609375, + "learning_rate": 0.0001767237842332592, + "loss": 4.3173, + "step": 6420 + }, + { + "epoch": 0.6657809978679143, + "grad_norm": 0.90625, + "learning_rate": 0.00017671681711063945, + "loss": 4.3071, + "step": 6421 + }, + { + "epoch": 0.6658846859224025, + "grad_norm": 0.828125, + "learning_rate": 0.00017670984908284236, + "loss": 4.338, + "step": 6422 + }, + { + "epoch": 0.6659883739768905, + "grad_norm": 0.85546875, + "learning_rate": 0.00017670288014995014, + "loss": 4.3091, + "step": 6423 + }, + { + "epoch": 0.6660920620313786, + "grad_norm": 0.77734375, + "learning_rate": 0.000176695910312045, + "loss": 4.3015, + "step": 6424 + }, + { + "epoch": 0.6661957500858666, + "grad_norm": 0.8046875, + "learning_rate": 0.0001766889395692092, + "loss": 4.2442, + "step": 6425 + }, + { + "epoch": 0.6662994381403547, + "grad_norm": 0.6953125, + "learning_rate": 0.00017668196792152498, + "loss": 4.2853, + "step": 6426 + }, + { + "epoch": 0.6664031261948428, + "grad_norm": 0.91015625, + "learning_rate": 0.00017667499536907458, + "loss": 4.2546, + "step": 6427 + }, + { + "epoch": 0.6665068142493309, + "grad_norm": 0.75390625, + "learning_rate": 0.00017666802191194033, + "loss": 4.3191, + "step": 6428 + }, + { + "epoch": 0.6666105023038189, + "grad_norm": 0.85546875, + "learning_rate": 0.0001766610475502044, + "loss": 4.2634, + "step": 6429 + }, + { + "epoch": 0.666714190358307, + "grad_norm": 0.734375, + "learning_rate": 0.00017665407228394916, + "loss": 4.3394, + "step": 6430 + }, + { + "epoch": 0.666817878412795, + "grad_norm": 0.796875, + "learning_rate": 0.0001766470961132569, + "loss": 4.3419, + "step": 6431 + }, + { + "epoch": 0.6669215664672832, + "grad_norm": 0.828125, + "learning_rate": 0.00017664011903820994, + "loss": 4.2822, + "step": 6432 + }, + { + "epoch": 0.6670252545217713, + "grad_norm": 0.7421875, + "learning_rate": 0.00017663314105889058, + "loss": 4.3083, + "step": 6433 + }, + { + "epoch": 0.6671289425762593, + "grad_norm": 0.91796875, + "learning_rate": 0.00017662616217538118, + "loss": 4.3549, + "step": 6434 + }, + { + "epoch": 0.6672326306307474, + "grad_norm": 0.8046875, + "learning_rate": 0.00017661918238776403, + "loss": 4.2849, + "step": 6435 + }, + { + "epoch": 0.6673363186852355, + "grad_norm": 0.7890625, + "learning_rate": 0.0001766122016961215, + "loss": 4.3556, + "step": 6436 + }, + { + "epoch": 0.6674400067397236, + "grad_norm": 0.78515625, + "learning_rate": 0.00017660522010053603, + "loss": 4.3536, + "step": 6437 + }, + { + "epoch": 0.6675436947942116, + "grad_norm": 0.6875, + "learning_rate": 0.0001765982376010899, + "loss": 4.3415, + "step": 6438 + }, + { + "epoch": 0.6676473828486997, + "grad_norm": 0.7421875, + "learning_rate": 0.00017659125419786556, + "loss": 4.2903, + "step": 6439 + }, + { + "epoch": 0.6677510709031877, + "grad_norm": 0.765625, + "learning_rate": 0.00017658426989094534, + "loss": 4.297, + "step": 6440 + }, + { + "epoch": 0.6678547589576759, + "grad_norm": 0.8125, + "learning_rate": 0.00017657728468041173, + "loss": 4.2782, + "step": 6441 + }, + { + "epoch": 0.6679584470121639, + "grad_norm": 0.81640625, + "learning_rate": 0.00017657029856634707, + "loss": 4.2852, + "step": 6442 + }, + { + "epoch": 0.668062135066652, + "grad_norm": 0.87109375, + "learning_rate": 0.00017656331154883385, + "loss": 4.3173, + "step": 6443 + }, + { + "epoch": 0.66816582312114, + "grad_norm": 0.72265625, + "learning_rate": 0.00017655632362795448, + "loss": 4.3095, + "step": 6444 + }, + { + "epoch": 0.6682695111756282, + "grad_norm": 0.8046875, + "learning_rate": 0.0001765493348037914, + "loss": 4.2713, + "step": 6445 + }, + { + "epoch": 0.6683731992301162, + "grad_norm": 0.703125, + "learning_rate": 0.00017654234507642711, + "loss": 4.3224, + "step": 6446 + }, + { + "epoch": 0.6684768872846043, + "grad_norm": 0.7109375, + "learning_rate": 0.00017653535444594405, + "loss": 4.3031, + "step": 6447 + }, + { + "epoch": 0.6685805753390923, + "grad_norm": 0.7421875, + "learning_rate": 0.0001765283629124247, + "loss": 4.3493, + "step": 6448 + }, + { + "epoch": 0.6686842633935804, + "grad_norm": 0.6875, + "learning_rate": 0.00017652137047595155, + "loss": 4.3091, + "step": 6449 + }, + { + "epoch": 0.6687879514480685, + "grad_norm": 0.66015625, + "learning_rate": 0.00017651437713660714, + "loss": 4.3148, + "step": 6450 + }, + { + "epoch": 0.6688916395025566, + "grad_norm": 0.66796875, + "learning_rate": 0.00017650738289447398, + "loss": 4.3442, + "step": 6451 + }, + { + "epoch": 0.6689953275570446, + "grad_norm": 0.6640625, + "learning_rate": 0.00017650038774963452, + "loss": 4.352, + "step": 6452 + }, + { + "epoch": 0.6690990156115327, + "grad_norm": 0.625, + "learning_rate": 0.00017649339170217138, + "loss": 4.3011, + "step": 6453 + }, + { + "epoch": 0.6692027036660207, + "grad_norm": 0.72265625, + "learning_rate": 0.00017648639475216705, + "loss": 4.3289, + "step": 6454 + }, + { + "epoch": 0.6693063917205089, + "grad_norm": 0.6875, + "learning_rate": 0.00017647939689970413, + "loss": 4.3572, + "step": 6455 + }, + { + "epoch": 0.6694100797749969, + "grad_norm": 0.734375, + "learning_rate": 0.00017647239814486517, + "loss": 4.2908, + "step": 6456 + }, + { + "epoch": 0.669513767829485, + "grad_norm": 0.70703125, + "learning_rate": 0.00017646539848773275, + "loss": 4.339, + "step": 6457 + }, + { + "epoch": 0.669617455883973, + "grad_norm": 0.7421875, + "learning_rate": 0.00017645839792838946, + "loss": 4.3345, + "step": 6458 + }, + { + "epoch": 0.6697211439384612, + "grad_norm": 0.8828125, + "learning_rate": 0.00017645139646691788, + "loss": 4.3301, + "step": 6459 + }, + { + "epoch": 0.6698248319929492, + "grad_norm": 0.75, + "learning_rate": 0.00017644439410340062, + "loss": 4.3495, + "step": 6460 + }, + { + "epoch": 0.6699285200474373, + "grad_norm": 0.796875, + "learning_rate": 0.00017643739083792034, + "loss": 4.2701, + "step": 6461 + }, + { + "epoch": 0.6700322081019253, + "grad_norm": 0.7890625, + "learning_rate": 0.00017643038667055966, + "loss": 4.3344, + "step": 6462 + }, + { + "epoch": 0.6701358961564134, + "grad_norm": 0.8125, + "learning_rate": 0.00017642338160140118, + "loss": 4.2788, + "step": 6463 + }, + { + "epoch": 0.6702395842109015, + "grad_norm": 0.7734375, + "learning_rate": 0.00017641637563052756, + "loss": 4.3089, + "step": 6464 + }, + { + "epoch": 0.6703432722653896, + "grad_norm": 0.76953125, + "learning_rate": 0.00017640936875802155, + "loss": 4.2687, + "step": 6465 + }, + { + "epoch": 0.6704469603198776, + "grad_norm": 0.81640625, + "learning_rate": 0.0001764023609839657, + "loss": 4.303, + "step": 6466 + }, + { + "epoch": 0.6705506483743657, + "grad_norm": 0.86328125, + "learning_rate": 0.00017639535230844276, + "loss": 4.3321, + "step": 6467 + }, + { + "epoch": 0.6706543364288537, + "grad_norm": 0.7421875, + "learning_rate": 0.00017638834273153546, + "loss": 4.3334, + "step": 6468 + }, + { + "epoch": 0.6707580244833419, + "grad_norm": 0.828125, + "learning_rate": 0.0001763813322533264, + "loss": 4.3177, + "step": 6469 + }, + { + "epoch": 0.6708617125378299, + "grad_norm": 0.76953125, + "learning_rate": 0.0001763743208738984, + "loss": 4.3005, + "step": 6470 + }, + { + "epoch": 0.670965400592318, + "grad_norm": 0.76171875, + "learning_rate": 0.00017636730859333413, + "loss": 4.256, + "step": 6471 + }, + { + "epoch": 0.671069088646806, + "grad_norm": 0.765625, + "learning_rate": 0.00017636029541171633, + "loss": 4.2535, + "step": 6472 + }, + { + "epoch": 0.6711727767012942, + "grad_norm": 0.82421875, + "learning_rate": 0.00017635328132912777, + "loss": 4.3104, + "step": 6473 + }, + { + "epoch": 0.6712764647557822, + "grad_norm": 0.78515625, + "learning_rate": 0.0001763462663456512, + "loss": 4.3169, + "step": 6474 + }, + { + "epoch": 0.6713801528102703, + "grad_norm": 0.81640625, + "learning_rate": 0.0001763392504613694, + "loss": 4.3292, + "step": 6475 + }, + { + "epoch": 0.6714838408647583, + "grad_norm": 0.765625, + "learning_rate": 0.00017633223367636514, + "loss": 4.317, + "step": 6476 + }, + { + "epoch": 0.6715875289192464, + "grad_norm": 0.77734375, + "learning_rate": 0.00017632521599072118, + "loss": 4.3153, + "step": 6477 + }, + { + "epoch": 0.6716912169737346, + "grad_norm": 0.76171875, + "learning_rate": 0.00017631819740452037, + "loss": 4.3043, + "step": 6478 + }, + { + "epoch": 0.6717949050282226, + "grad_norm": 0.76953125, + "learning_rate": 0.0001763111779178455, + "loss": 4.3376, + "step": 6479 + }, + { + "epoch": 0.6718985930827107, + "grad_norm": 0.609375, + "learning_rate": 0.0001763041575307794, + "loss": 4.3351, + "step": 6480 + }, + { + "epoch": 0.6720022811371987, + "grad_norm": 0.75, + "learning_rate": 0.0001762971362434049, + "loss": 4.3147, + "step": 6481 + }, + { + "epoch": 0.6721059691916869, + "grad_norm": 0.65625, + "learning_rate": 0.00017629011405580482, + "loss": 4.3209, + "step": 6482 + }, + { + "epoch": 0.6722096572461749, + "grad_norm": 0.6875, + "learning_rate": 0.00017628309096806206, + "loss": 4.3264, + "step": 6483 + }, + { + "epoch": 0.672313345300663, + "grad_norm": 0.6640625, + "learning_rate": 0.00017627606698025945, + "loss": 4.2596, + "step": 6484 + }, + { + "epoch": 0.672417033355151, + "grad_norm": 0.765625, + "learning_rate": 0.00017626904209247987, + "loss": 4.3037, + "step": 6485 + }, + { + "epoch": 0.6725207214096391, + "grad_norm": 0.734375, + "learning_rate": 0.00017626201630480622, + "loss": 4.3338, + "step": 6486 + }, + { + "epoch": 0.6726244094641272, + "grad_norm": 0.69921875, + "learning_rate": 0.00017625498961732142, + "loss": 4.3313, + "step": 6487 + }, + { + "epoch": 0.6727280975186153, + "grad_norm": 0.734375, + "learning_rate": 0.0001762479620301083, + "loss": 4.2831, + "step": 6488 + }, + { + "epoch": 0.6728317855731033, + "grad_norm": 0.65234375, + "learning_rate": 0.00017624093354324987, + "loss": 4.3158, + "step": 6489 + }, + { + "epoch": 0.6729354736275914, + "grad_norm": 0.734375, + "learning_rate": 0.000176233904156829, + "loss": 4.3364, + "step": 6490 + }, + { + "epoch": 0.6730391616820794, + "grad_norm": 0.63671875, + "learning_rate": 0.0001762268738709286, + "loss": 4.3593, + "step": 6491 + }, + { + "epoch": 0.6731428497365676, + "grad_norm": 0.76953125, + "learning_rate": 0.00017621984268563173, + "loss": 4.3485, + "step": 6492 + }, + { + "epoch": 0.6732465377910556, + "grad_norm": 0.6953125, + "learning_rate": 0.00017621281060102123, + "loss": 4.3344, + "step": 6493 + }, + { + "epoch": 0.6733502258455437, + "grad_norm": 0.7578125, + "learning_rate": 0.00017620577761718015, + "loss": 4.2714, + "step": 6494 + }, + { + "epoch": 0.6734539139000317, + "grad_norm": 0.65234375, + "learning_rate": 0.00017619874373419144, + "loss": 4.3282, + "step": 6495 + }, + { + "epoch": 0.6735576019545199, + "grad_norm": 0.796875, + "learning_rate": 0.00017619170895213812, + "loss": 4.2696, + "step": 6496 + }, + { + "epoch": 0.6736612900090079, + "grad_norm": 0.671875, + "learning_rate": 0.00017618467327110317, + "loss": 4.3398, + "step": 6497 + }, + { + "epoch": 0.673764978063496, + "grad_norm": 0.7734375, + "learning_rate": 0.00017617763669116958, + "loss": 4.3381, + "step": 6498 + }, + { + "epoch": 0.673868666117984, + "grad_norm": 0.6796875, + "learning_rate": 0.00017617059921242042, + "loss": 4.3148, + "step": 6499 + }, + { + "epoch": 0.6739723541724721, + "grad_norm": 0.71484375, + "learning_rate": 0.0001761635608349387, + "loss": 4.2602, + "step": 6500 + }, + { + "epoch": 0.6740760422269602, + "grad_norm": 0.65625, + "learning_rate": 0.00017615652155880747, + "loss": 4.3324, + "step": 6501 + }, + { + "epoch": 0.6741797302814483, + "grad_norm": 0.78515625, + "learning_rate": 0.00017614948138410975, + "loss": 4.341, + "step": 6502 + }, + { + "epoch": 0.6742834183359363, + "grad_norm": 0.69921875, + "learning_rate": 0.0001761424403109287, + "loss": 4.3384, + "step": 6503 + }, + { + "epoch": 0.6743871063904244, + "grad_norm": 0.76171875, + "learning_rate": 0.0001761353983393473, + "loss": 4.2924, + "step": 6504 + }, + { + "epoch": 0.6744907944449124, + "grad_norm": 0.75, + "learning_rate": 0.0001761283554694487, + "loss": 4.3056, + "step": 6505 + }, + { + "epoch": 0.6745944824994006, + "grad_norm": 0.71484375, + "learning_rate": 0.000176121311701316, + "loss": 4.3578, + "step": 6506 + }, + { + "epoch": 0.6746981705538886, + "grad_norm": 0.765625, + "learning_rate": 0.00017611426703503224, + "loss": 4.327, + "step": 6507 + }, + { + "epoch": 0.6748018586083767, + "grad_norm": 0.6796875, + "learning_rate": 0.0001761072214706806, + "loss": 4.3091, + "step": 6508 + }, + { + "epoch": 0.6749055466628647, + "grad_norm": 0.7890625, + "learning_rate": 0.0001761001750083442, + "loss": 4.3275, + "step": 6509 + }, + { + "epoch": 0.6750092347173529, + "grad_norm": 0.70703125, + "learning_rate": 0.0001760931276481062, + "loss": 4.316, + "step": 6510 + }, + { + "epoch": 0.6751129227718409, + "grad_norm": 0.76171875, + "learning_rate": 0.0001760860793900497, + "loss": 4.3116, + "step": 6511 + }, + { + "epoch": 0.675216610826329, + "grad_norm": 0.7265625, + "learning_rate": 0.00017607903023425788, + "loss": 4.3461, + "step": 6512 + }, + { + "epoch": 0.675320298880817, + "grad_norm": 0.80859375, + "learning_rate": 0.00017607198018081396, + "loss": 4.3162, + "step": 6513 + }, + { + "epoch": 0.6754239869353051, + "grad_norm": 0.82421875, + "learning_rate": 0.00017606492922980104, + "loss": 4.3131, + "step": 6514 + }, + { + "epoch": 0.6755276749897932, + "grad_norm": 0.8359375, + "learning_rate": 0.0001760578773813024, + "loss": 4.2896, + "step": 6515 + }, + { + "epoch": 0.6756313630442813, + "grad_norm": 0.8828125, + "learning_rate": 0.00017605082463540117, + "loss": 4.3247, + "step": 6516 + }, + { + "epoch": 0.6757350510987693, + "grad_norm": 0.78125, + "learning_rate": 0.0001760437709921806, + "loss": 4.336, + "step": 6517 + }, + { + "epoch": 0.6758387391532574, + "grad_norm": 0.90234375, + "learning_rate": 0.00017603671645172395, + "loss": 4.3142, + "step": 6518 + }, + { + "epoch": 0.6759424272077454, + "grad_norm": 0.7734375, + "learning_rate": 0.00017602966101411437, + "loss": 4.2958, + "step": 6519 + }, + { + "epoch": 0.6760461152622336, + "grad_norm": 0.8046875, + "learning_rate": 0.00017602260467943517, + "loss": 4.301, + "step": 6520 + }, + { + "epoch": 0.6761498033167217, + "grad_norm": 0.734375, + "learning_rate": 0.00017601554744776964, + "loss": 4.3737, + "step": 6521 + }, + { + "epoch": 0.6762534913712097, + "grad_norm": 0.9140625, + "learning_rate": 0.00017600848931920098, + "loss": 4.3691, + "step": 6522 + }, + { + "epoch": 0.6763571794256978, + "grad_norm": 0.65234375, + "learning_rate": 0.00017600143029381247, + "loss": 4.3371, + "step": 6523 + }, + { + "epoch": 0.6764608674801859, + "grad_norm": 0.8359375, + "learning_rate": 0.00017599437037168746, + "loss": 4.3148, + "step": 6524 + }, + { + "epoch": 0.676564555534674, + "grad_norm": 0.58984375, + "learning_rate": 0.00017598730955290917, + "loss": 4.3323, + "step": 6525 + }, + { + "epoch": 0.676668243589162, + "grad_norm": 0.77734375, + "learning_rate": 0.00017598024783756095, + "loss": 4.3187, + "step": 6526 + }, + { + "epoch": 0.6767719316436501, + "grad_norm": 0.68359375, + "learning_rate": 0.00017597318522572612, + "loss": 4.3184, + "step": 6527 + }, + { + "epoch": 0.6768756196981381, + "grad_norm": 0.66015625, + "learning_rate": 0.00017596612171748803, + "loss": 4.3484, + "step": 6528 + }, + { + "epoch": 0.6769793077526263, + "grad_norm": 0.7265625, + "learning_rate": 0.00017595905731292998, + "loss": 4.3241, + "step": 6529 + }, + { + "epoch": 0.6770829958071143, + "grad_norm": 0.6875, + "learning_rate": 0.00017595199201213534, + "loss": 4.3255, + "step": 6530 + }, + { + "epoch": 0.6771866838616024, + "grad_norm": 0.8046875, + "learning_rate": 0.00017594492581518752, + "loss": 4.3215, + "step": 6531 + }, + { + "epoch": 0.6772903719160904, + "grad_norm": 0.65625, + "learning_rate": 0.00017593785872216982, + "loss": 4.3453, + "step": 6532 + }, + { + "epoch": 0.6773940599705786, + "grad_norm": 0.8359375, + "learning_rate": 0.00017593079073316566, + "loss": 4.3257, + "step": 6533 + }, + { + "epoch": 0.6774977480250666, + "grad_norm": 0.69921875, + "learning_rate": 0.00017592372184825847, + "loss": 4.318, + "step": 6534 + }, + { + "epoch": 0.6776014360795547, + "grad_norm": 0.7734375, + "learning_rate": 0.0001759166520675316, + "loss": 4.3414, + "step": 6535 + }, + { + "epoch": 0.6777051241340427, + "grad_norm": 0.76171875, + "learning_rate": 0.00017590958139106845, + "loss": 4.3062, + "step": 6536 + }, + { + "epoch": 0.6778088121885308, + "grad_norm": 0.73828125, + "learning_rate": 0.0001759025098189525, + "loss": 4.3148, + "step": 6537 + }, + { + "epoch": 0.6779125002430189, + "grad_norm": 0.7421875, + "learning_rate": 0.00017589543735126718, + "loss": 4.2926, + "step": 6538 + }, + { + "epoch": 0.678016188297507, + "grad_norm": 0.77734375, + "learning_rate": 0.0001758883639880959, + "loss": 4.3369, + "step": 6539 + }, + { + "epoch": 0.678119876351995, + "grad_norm": 0.765625, + "learning_rate": 0.00017588128972952216, + "loss": 4.2984, + "step": 6540 + }, + { + "epoch": 0.6782235644064831, + "grad_norm": 0.75, + "learning_rate": 0.0001758742145756294, + "loss": 4.3083, + "step": 6541 + }, + { + "epoch": 0.6783272524609711, + "grad_norm": 0.73046875, + "learning_rate": 0.0001758671385265011, + "loss": 4.2944, + "step": 6542 + }, + { + "epoch": 0.6784309405154593, + "grad_norm": 0.734375, + "learning_rate": 0.00017586006158222078, + "loss": 4.3253, + "step": 6543 + }, + { + "epoch": 0.6785346285699473, + "grad_norm": 0.765625, + "learning_rate": 0.0001758529837428719, + "loss": 4.3334, + "step": 6544 + }, + { + "epoch": 0.6786383166244354, + "grad_norm": 0.76953125, + "learning_rate": 0.00017584590500853802, + "loss": 4.3115, + "step": 6545 + }, + { + "epoch": 0.6787420046789234, + "grad_norm": 0.71484375, + "learning_rate": 0.0001758388253793026, + "loss": 4.3329, + "step": 6546 + }, + { + "epoch": 0.6788456927334116, + "grad_norm": 0.73046875, + "learning_rate": 0.00017583174485524925, + "loss": 4.2879, + "step": 6547 + }, + { + "epoch": 0.6789493807878996, + "grad_norm": 0.7578125, + "learning_rate": 0.00017582466343646144, + "loss": 4.32, + "step": 6548 + }, + { + "epoch": 0.6790530688423877, + "grad_norm": 0.77734375, + "learning_rate": 0.00017581758112302276, + "loss": 4.375, + "step": 6549 + }, + { + "epoch": 0.6791567568968757, + "grad_norm": 0.6953125, + "learning_rate": 0.00017581049791501677, + "loss": 4.3272, + "step": 6550 + }, + { + "epoch": 0.6792604449513638, + "grad_norm": 0.7109375, + "learning_rate": 0.00017580341381252703, + "loss": 4.2906, + "step": 6551 + }, + { + "epoch": 0.6793641330058519, + "grad_norm": 0.796875, + "learning_rate": 0.0001757963288156371, + "loss": 4.3129, + "step": 6552 + }, + { + "epoch": 0.67946782106034, + "grad_norm": 0.7734375, + "learning_rate": 0.00017578924292443066, + "loss": 4.2775, + "step": 6553 + }, + { + "epoch": 0.679571509114828, + "grad_norm": 0.734375, + "learning_rate": 0.00017578215613899128, + "loss": 4.3718, + "step": 6554 + }, + { + "epoch": 0.6796751971693161, + "grad_norm": 0.82421875, + "learning_rate": 0.00017577506845940254, + "loss": 4.2961, + "step": 6555 + }, + { + "epoch": 0.6797788852238041, + "grad_norm": 0.78125, + "learning_rate": 0.00017576797988574808, + "loss": 4.294, + "step": 6556 + }, + { + "epoch": 0.6798825732782923, + "grad_norm": 0.96875, + "learning_rate": 0.00017576089041811151, + "loss": 4.3179, + "step": 6557 + }, + { + "epoch": 0.6799862613327803, + "grad_norm": 0.82421875, + "learning_rate": 0.00017575380005657658, + "loss": 4.2949, + "step": 6558 + }, + { + "epoch": 0.6800899493872684, + "grad_norm": 0.98828125, + "learning_rate": 0.00017574670880122687, + "loss": 4.3174, + "step": 6559 + }, + { + "epoch": 0.6801936374417564, + "grad_norm": 0.8203125, + "learning_rate": 0.00017573961665214607, + "loss": 4.3556, + "step": 6560 + }, + { + "epoch": 0.6802973254962446, + "grad_norm": 0.84375, + "learning_rate": 0.00017573252360941785, + "loss": 4.3147, + "step": 6561 + }, + { + "epoch": 0.6804010135507326, + "grad_norm": 0.93359375, + "learning_rate": 0.00017572542967312586, + "loss": 4.2997, + "step": 6562 + }, + { + "epoch": 0.6805047016052207, + "grad_norm": 0.67578125, + "learning_rate": 0.0001757183348433539, + "loss": 4.3231, + "step": 6563 + }, + { + "epoch": 0.6806083896597087, + "grad_norm": 0.89453125, + "learning_rate": 0.0001757112391201856, + "loss": 4.2858, + "step": 6564 + }, + { + "epoch": 0.6807120777141968, + "grad_norm": 0.71875, + "learning_rate": 0.00017570414250370472, + "loss": 4.3065, + "step": 6565 + }, + { + "epoch": 0.680815765768685, + "grad_norm": 0.78515625, + "learning_rate": 0.00017569704499399496, + "loss": 4.2943, + "step": 6566 + }, + { + "epoch": 0.680919453823173, + "grad_norm": 0.6953125, + "learning_rate": 0.00017568994659114008, + "loss": 4.3219, + "step": 6567 + }, + { + "epoch": 0.6810231418776611, + "grad_norm": 0.80078125, + "learning_rate": 0.0001756828472952239, + "loss": 4.3509, + "step": 6568 + }, + { + "epoch": 0.6811268299321491, + "grad_norm": 0.76953125, + "learning_rate": 0.00017567574710633006, + "loss": 4.3214, + "step": 6569 + }, + { + "epoch": 0.6812305179866373, + "grad_norm": 0.6796875, + "learning_rate": 0.00017566864602454243, + "loss": 4.3195, + "step": 6570 + }, + { + "epoch": 0.6813342060411253, + "grad_norm": 0.76171875, + "learning_rate": 0.00017566154404994471, + "loss": 4.3133, + "step": 6571 + }, + { + "epoch": 0.6814378940956134, + "grad_norm": 0.66015625, + "learning_rate": 0.00017565444118262081, + "loss": 4.3164, + "step": 6572 + }, + { + "epoch": 0.6815415821501014, + "grad_norm": 0.69140625, + "learning_rate": 0.00017564733742265445, + "loss": 4.3009, + "step": 6573 + }, + { + "epoch": 0.6816452702045895, + "grad_norm": 0.6171875, + "learning_rate": 0.0001756402327701295, + "loss": 4.3113, + "step": 6574 + }, + { + "epoch": 0.6817489582590776, + "grad_norm": 0.6796875, + "learning_rate": 0.0001756331272251297, + "loss": 4.3586, + "step": 6575 + }, + { + "epoch": 0.6818526463135657, + "grad_norm": 0.62890625, + "learning_rate": 0.00017562602078773898, + "loss": 4.3112, + "step": 6576 + }, + { + "epoch": 0.6819563343680537, + "grad_norm": 0.65625, + "learning_rate": 0.00017561891345804117, + "loss": 4.3211, + "step": 6577 + }, + { + "epoch": 0.6820600224225418, + "grad_norm": 0.66796875, + "learning_rate": 0.00017561180523612008, + "loss": 4.3358, + "step": 6578 + }, + { + "epoch": 0.6821637104770298, + "grad_norm": 0.7109375, + "learning_rate": 0.00017560469612205965, + "loss": 4.3131, + "step": 6579 + }, + { + "epoch": 0.682267398531518, + "grad_norm": 0.64453125, + "learning_rate": 0.0001755975861159437, + "loss": 4.3321, + "step": 6580 + }, + { + "epoch": 0.682371086586006, + "grad_norm": 0.6640625, + "learning_rate": 0.00017559047521785613, + "loss": 4.2691, + "step": 6581 + }, + { + "epoch": 0.6824747746404941, + "grad_norm": 0.71875, + "learning_rate": 0.00017558336342788088, + "loss": 4.3178, + "step": 6582 + }, + { + "epoch": 0.6825784626949821, + "grad_norm": 0.7265625, + "learning_rate": 0.00017557625074610185, + "loss": 4.3003, + "step": 6583 + }, + { + "epoch": 0.6826821507494703, + "grad_norm": 0.69140625, + "learning_rate": 0.0001755691371726029, + "loss": 4.2979, + "step": 6584 + }, + { + "epoch": 0.6827858388039583, + "grad_norm": 0.71484375, + "learning_rate": 0.00017556202270746805, + "loss": 4.3183, + "step": 6585 + }, + { + "epoch": 0.6828895268584464, + "grad_norm": 0.7109375, + "learning_rate": 0.0001755549073507812, + "loss": 4.2924, + "step": 6586 + }, + { + "epoch": 0.6829932149129344, + "grad_norm": 0.65234375, + "learning_rate": 0.0001755477911026263, + "loss": 4.3058, + "step": 6587 + }, + { + "epoch": 0.6830969029674225, + "grad_norm": 0.7421875, + "learning_rate": 0.00017554067396308731, + "loss": 4.2995, + "step": 6588 + }, + { + "epoch": 0.6832005910219106, + "grad_norm": 0.76953125, + "learning_rate": 0.00017553355593224822, + "loss": 4.3376, + "step": 6589 + }, + { + "epoch": 0.6833042790763987, + "grad_norm": 0.69921875, + "learning_rate": 0.00017552643701019305, + "loss": 4.2894, + "step": 6590 + }, + { + "epoch": 0.6834079671308867, + "grad_norm": 0.67578125, + "learning_rate": 0.0001755193171970057, + "loss": 4.297, + "step": 6591 + }, + { + "epoch": 0.6835116551853748, + "grad_norm": 0.73046875, + "learning_rate": 0.00017551219649277028, + "loss": 4.3418, + "step": 6592 + }, + { + "epoch": 0.6836153432398628, + "grad_norm": 0.65234375, + "learning_rate": 0.00017550507489757076, + "loss": 4.3075, + "step": 6593 + }, + { + "epoch": 0.683719031294351, + "grad_norm": 0.76953125, + "learning_rate": 0.00017549795241149116, + "loss": 4.3406, + "step": 6594 + }, + { + "epoch": 0.683822719348839, + "grad_norm": 0.65234375, + "learning_rate": 0.00017549082903461552, + "loss": 4.2859, + "step": 6595 + }, + { + "epoch": 0.6839264074033271, + "grad_norm": 0.76953125, + "learning_rate": 0.0001754837047670279, + "loss": 4.3241, + "step": 6596 + }, + { + "epoch": 0.6840300954578151, + "grad_norm": 0.66796875, + "learning_rate": 0.00017547657960881235, + "loss": 4.3338, + "step": 6597 + }, + { + "epoch": 0.6841337835123033, + "grad_norm": 0.73828125, + "learning_rate": 0.00017546945356005294, + "loss": 4.3292, + "step": 6598 + }, + { + "epoch": 0.6842374715667913, + "grad_norm": 0.62109375, + "learning_rate": 0.00017546232662083377, + "loss": 4.3014, + "step": 6599 + }, + { + "epoch": 0.6843411596212794, + "grad_norm": 0.6796875, + "learning_rate": 0.00017545519879123887, + "loss": 4.2827, + "step": 6600 + }, + { + "epoch": 0.6844448476757674, + "grad_norm": 0.64453125, + "learning_rate": 0.00017544807007135243, + "loss": 4.2965, + "step": 6601 + }, + { + "epoch": 0.6845485357302555, + "grad_norm": 0.671875, + "learning_rate": 0.0001754409404612585, + "loss": 4.2946, + "step": 6602 + }, + { + "epoch": 0.6846522237847436, + "grad_norm": 0.63671875, + "learning_rate": 0.00017543380996104123, + "loss": 4.2934, + "step": 6603 + }, + { + "epoch": 0.6847559118392317, + "grad_norm": 0.71875, + "learning_rate": 0.00017542667857078472, + "loss": 4.3368, + "step": 6604 + }, + { + "epoch": 0.6848595998937197, + "grad_norm": 0.71484375, + "learning_rate": 0.00017541954629057314, + "loss": 4.3014, + "step": 6605 + }, + { + "epoch": 0.6849632879482078, + "grad_norm": 0.71484375, + "learning_rate": 0.00017541241312049062, + "loss": 4.312, + "step": 6606 + }, + { + "epoch": 0.6850669760026958, + "grad_norm": 0.6875, + "learning_rate": 0.00017540527906062135, + "loss": 4.3363, + "step": 6607 + }, + { + "epoch": 0.685170664057184, + "grad_norm": 0.69921875, + "learning_rate": 0.00017539814411104949, + "loss": 4.2708, + "step": 6608 + }, + { + "epoch": 0.685274352111672, + "grad_norm": 0.69921875, + "learning_rate": 0.00017539100827185925, + "loss": 4.3057, + "step": 6609 + }, + { + "epoch": 0.6853780401661601, + "grad_norm": 0.80078125, + "learning_rate": 0.0001753838715431348, + "loss": 4.3111, + "step": 6610 + }, + { + "epoch": 0.6854817282206482, + "grad_norm": 0.68359375, + "learning_rate": 0.0001753767339249603, + "loss": 4.2907, + "step": 6611 + }, + { + "epoch": 0.6855854162751363, + "grad_norm": 0.7421875, + "learning_rate": 0.00017536959541742007, + "loss": 4.2412, + "step": 6612 + }, + { + "epoch": 0.6856891043296244, + "grad_norm": 0.7265625, + "learning_rate": 0.00017536245602059827, + "loss": 4.2948, + "step": 6613 + }, + { + "epoch": 0.6857927923841124, + "grad_norm": 0.77734375, + "learning_rate": 0.00017535531573457914, + "loss": 4.3325, + "step": 6614 + }, + { + "epoch": 0.6858964804386005, + "grad_norm": 0.6875, + "learning_rate": 0.00017534817455944698, + "loss": 4.2668, + "step": 6615 + }, + { + "epoch": 0.6860001684930885, + "grad_norm": 0.66015625, + "learning_rate": 0.00017534103249528595, + "loss": 4.3183, + "step": 6616 + }, + { + "epoch": 0.6861038565475767, + "grad_norm": 0.7265625, + "learning_rate": 0.0001753338895421804, + "loss": 4.3117, + "step": 6617 + }, + { + "epoch": 0.6862075446020647, + "grad_norm": 0.6484375, + "learning_rate": 0.00017532674570021458, + "loss": 4.3003, + "step": 6618 + }, + { + "epoch": 0.6863112326565528, + "grad_norm": 0.62890625, + "learning_rate": 0.0001753196009694728, + "loss": 4.3061, + "step": 6619 + }, + { + "epoch": 0.6864149207110408, + "grad_norm": 0.6953125, + "learning_rate": 0.00017531245535003934, + "loss": 4.3236, + "step": 6620 + }, + { + "epoch": 0.686518608765529, + "grad_norm": 0.66015625, + "learning_rate": 0.0001753053088419985, + "loss": 4.3037, + "step": 6621 + }, + { + "epoch": 0.686622296820017, + "grad_norm": 0.72265625, + "learning_rate": 0.00017529816144543463, + "loss": 4.302, + "step": 6622 + }, + { + "epoch": 0.6867259848745051, + "grad_norm": 0.78515625, + "learning_rate": 0.00017529101316043203, + "loss": 4.3131, + "step": 6623 + }, + { + "epoch": 0.6868296729289931, + "grad_norm": 0.734375, + "learning_rate": 0.0001752838639870751, + "loss": 4.3355, + "step": 6624 + }, + { + "epoch": 0.6869333609834812, + "grad_norm": 0.73828125, + "learning_rate": 0.00017527671392544812, + "loss": 4.3358, + "step": 6625 + }, + { + "epoch": 0.6870370490379692, + "grad_norm": 0.7734375, + "learning_rate": 0.0001752695629756355, + "loss": 4.2912, + "step": 6626 + }, + { + "epoch": 0.6871407370924574, + "grad_norm": 0.76171875, + "learning_rate": 0.00017526241113772158, + "loss": 4.2923, + "step": 6627 + }, + { + "epoch": 0.6872444251469454, + "grad_norm": 0.71875, + "learning_rate": 0.00017525525841179077, + "loss": 4.3022, + "step": 6628 + }, + { + "epoch": 0.6873481132014335, + "grad_norm": 0.75, + "learning_rate": 0.00017524810479792747, + "loss": 4.3173, + "step": 6629 + }, + { + "epoch": 0.6874518012559215, + "grad_norm": 0.6796875, + "learning_rate": 0.0001752409502962161, + "loss": 4.3274, + "step": 6630 + }, + { + "epoch": 0.6875554893104097, + "grad_norm": 0.66796875, + "learning_rate": 0.00017523379490674102, + "loss": 4.3059, + "step": 6631 + }, + { + "epoch": 0.6876591773648977, + "grad_norm": 0.7109375, + "learning_rate": 0.00017522663862958667, + "loss": 4.2875, + "step": 6632 + }, + { + "epoch": 0.6877628654193858, + "grad_norm": 0.6953125, + "learning_rate": 0.00017521948146483754, + "loss": 4.2669, + "step": 6633 + }, + { + "epoch": 0.6878665534738738, + "grad_norm": 0.671875, + "learning_rate": 0.000175212323412578, + "loss": 4.3299, + "step": 6634 + }, + { + "epoch": 0.687970241528362, + "grad_norm": 0.6953125, + "learning_rate": 0.00017520516447289258, + "loss": 4.3181, + "step": 6635 + }, + { + "epoch": 0.68807392958285, + "grad_norm": 0.66796875, + "learning_rate": 0.00017519800464586572, + "loss": 4.3058, + "step": 6636 + }, + { + "epoch": 0.6881776176373381, + "grad_norm": 0.6875, + "learning_rate": 0.00017519084393158185, + "loss": 4.3505, + "step": 6637 + }, + { + "epoch": 0.6882813056918261, + "grad_norm": 0.6015625, + "learning_rate": 0.0001751836823301255, + "loss": 4.3358, + "step": 6638 + }, + { + "epoch": 0.6883849937463142, + "grad_norm": 0.70703125, + "learning_rate": 0.00017517651984158122, + "loss": 4.3308, + "step": 6639 + }, + { + "epoch": 0.6884886818008022, + "grad_norm": 0.640625, + "learning_rate": 0.00017516935646603345, + "loss": 4.3068, + "step": 6640 + }, + { + "epoch": 0.6885923698552904, + "grad_norm": 0.69921875, + "learning_rate": 0.00017516219220356673, + "loss": 4.3641, + "step": 6641 + }, + { + "epoch": 0.6886960579097784, + "grad_norm": 0.64453125, + "learning_rate": 0.0001751550270542656, + "loss": 4.2864, + "step": 6642 + }, + { + "epoch": 0.6887997459642665, + "grad_norm": 0.68359375, + "learning_rate": 0.00017514786101821458, + "loss": 4.2931, + "step": 6643 + }, + { + "epoch": 0.6889034340187545, + "grad_norm": 0.70703125, + "learning_rate": 0.00017514069409549823, + "loss": 4.2846, + "step": 6644 + }, + { + "epoch": 0.6890071220732427, + "grad_norm": 0.71875, + "learning_rate": 0.00017513352628620115, + "loss": 4.3249, + "step": 6645 + }, + { + "epoch": 0.6891108101277307, + "grad_norm": 0.6796875, + "learning_rate": 0.00017512635759040784, + "loss": 4.2103, + "step": 6646 + }, + { + "epoch": 0.6892144981822188, + "grad_norm": 0.71484375, + "learning_rate": 0.00017511918800820292, + "loss": 4.3311, + "step": 6647 + }, + { + "epoch": 0.6893181862367068, + "grad_norm": 0.74609375, + "learning_rate": 0.00017511201753967102, + "loss": 4.3075, + "step": 6648 + }, + { + "epoch": 0.689421874291195, + "grad_norm": 0.765625, + "learning_rate": 0.00017510484618489668, + "loss": 4.323, + "step": 6649 + }, + { + "epoch": 0.689525562345683, + "grad_norm": 0.7109375, + "learning_rate": 0.00017509767394396458, + "loss": 4.3195, + "step": 6650 + }, + { + "epoch": 0.6896292504001711, + "grad_norm": 0.81640625, + "learning_rate": 0.00017509050081695925, + "loss": 4.3298, + "step": 6651 + }, + { + "epoch": 0.6897329384546591, + "grad_norm": 0.72265625, + "learning_rate": 0.00017508332680396543, + "loss": 4.2887, + "step": 6652 + }, + { + "epoch": 0.6898366265091472, + "grad_norm": 0.76953125, + "learning_rate": 0.0001750761519050677, + "loss": 4.3267, + "step": 6653 + }, + { + "epoch": 0.6899403145636352, + "grad_norm": 0.6796875, + "learning_rate": 0.0001750689761203508, + "loss": 4.3083, + "step": 6654 + }, + { + "epoch": 0.6900440026181234, + "grad_norm": 0.72265625, + "learning_rate": 0.00017506179944989928, + "loss": 4.2902, + "step": 6655 + }, + { + "epoch": 0.6901476906726115, + "grad_norm": 0.7109375, + "learning_rate": 0.00017505462189379783, + "loss": 4.3244, + "step": 6656 + }, + { + "epoch": 0.6902513787270995, + "grad_norm": 0.82421875, + "learning_rate": 0.00017504744345213122, + "loss": 4.3278, + "step": 6657 + }, + { + "epoch": 0.6903550667815876, + "grad_norm": 0.7265625, + "learning_rate": 0.00017504026412498412, + "loss": 4.3396, + "step": 6658 + }, + { + "epoch": 0.6904587548360757, + "grad_norm": 0.74609375, + "learning_rate": 0.0001750330839124412, + "loss": 4.2807, + "step": 6659 + }, + { + "epoch": 0.6905624428905638, + "grad_norm": 0.765625, + "learning_rate": 0.0001750259028145872, + "loss": 4.2965, + "step": 6660 + }, + { + "epoch": 0.6906661309450518, + "grad_norm": 0.69921875, + "learning_rate": 0.00017501872083150688, + "loss": 4.3143, + "step": 6661 + }, + { + "epoch": 0.6907698189995399, + "grad_norm": 0.74609375, + "learning_rate": 0.00017501153796328493, + "loss": 4.314, + "step": 6662 + }, + { + "epoch": 0.690873507054028, + "grad_norm": 0.71484375, + "learning_rate": 0.00017500435421000611, + "loss": 4.2631, + "step": 6663 + }, + { + "epoch": 0.6909771951085161, + "grad_norm": 0.75, + "learning_rate": 0.00017499716957175524, + "loss": 4.277, + "step": 6664 + }, + { + "epoch": 0.6910808831630041, + "grad_norm": 0.73046875, + "learning_rate": 0.00017498998404861702, + "loss": 4.3147, + "step": 6665 + }, + { + "epoch": 0.6911845712174922, + "grad_norm": 0.66796875, + "learning_rate": 0.00017498279764067623, + "loss": 4.2968, + "step": 6666 + }, + { + "epoch": 0.6912882592719802, + "grad_norm": 0.7421875, + "learning_rate": 0.00017497561034801772, + "loss": 4.3201, + "step": 6667 + }, + { + "epoch": 0.6913919473264684, + "grad_norm": 0.671875, + "learning_rate": 0.00017496842217072626, + "loss": 4.2557, + "step": 6668 + }, + { + "epoch": 0.6914956353809564, + "grad_norm": 0.76171875, + "learning_rate": 0.00017496123310888667, + "loss": 4.3024, + "step": 6669 + }, + { + "epoch": 0.6915993234354445, + "grad_norm": 0.703125, + "learning_rate": 0.00017495404316258376, + "loss": 4.3402, + "step": 6670 + }, + { + "epoch": 0.6917030114899325, + "grad_norm": 0.83984375, + "learning_rate": 0.00017494685233190234, + "loss": 4.3003, + "step": 6671 + }, + { + "epoch": 0.6918066995444206, + "grad_norm": 0.7421875, + "learning_rate": 0.0001749396606169273, + "loss": 4.2904, + "step": 6672 + }, + { + "epoch": 0.6919103875989087, + "grad_norm": 0.85546875, + "learning_rate": 0.0001749324680177435, + "loss": 4.2703, + "step": 6673 + }, + { + "epoch": 0.6920140756533968, + "grad_norm": 0.75, + "learning_rate": 0.00017492527453443578, + "loss": 4.3059, + "step": 6674 + }, + { + "epoch": 0.6921177637078848, + "grad_norm": 0.87890625, + "learning_rate": 0.00017491808016708899, + "loss": 4.2864, + "step": 6675 + }, + { + "epoch": 0.6922214517623729, + "grad_norm": 0.6953125, + "learning_rate": 0.00017491088491578807, + "loss": 4.2779, + "step": 6676 + }, + { + "epoch": 0.692325139816861, + "grad_norm": 0.87890625, + "learning_rate": 0.0001749036887806179, + "loss": 4.3136, + "step": 6677 + }, + { + "epoch": 0.6924288278713491, + "grad_norm": 0.79296875, + "learning_rate": 0.00017489649176166336, + "loss": 4.32, + "step": 6678 + }, + { + "epoch": 0.6925325159258371, + "grad_norm": 0.78515625, + "learning_rate": 0.0001748892938590094, + "loss": 4.3044, + "step": 6679 + }, + { + "epoch": 0.6926362039803252, + "grad_norm": 0.8046875, + "learning_rate": 0.00017488209507274095, + "loss": 4.3141, + "step": 6680 + }, + { + "epoch": 0.6927398920348132, + "grad_norm": 0.828125, + "learning_rate": 0.0001748748954029429, + "loss": 4.3011, + "step": 6681 + }, + { + "epoch": 0.6928435800893014, + "grad_norm": 0.8046875, + "learning_rate": 0.00017486769484970026, + "loss": 4.2975, + "step": 6682 + }, + { + "epoch": 0.6929472681437894, + "grad_norm": 0.76953125, + "learning_rate": 0.0001748604934130979, + "loss": 4.323, + "step": 6683 + }, + { + "epoch": 0.6930509561982775, + "grad_norm": 0.828125, + "learning_rate": 0.00017485329109322089, + "loss": 4.3262, + "step": 6684 + }, + { + "epoch": 0.6931546442527655, + "grad_norm": 0.875, + "learning_rate": 0.00017484608789015418, + "loss": 4.2892, + "step": 6685 + }, + { + "epoch": 0.6932583323072536, + "grad_norm": 0.80078125, + "learning_rate": 0.0001748388838039827, + "loss": 4.3314, + "step": 6686 + }, + { + "epoch": 0.6933620203617417, + "grad_norm": 0.828125, + "learning_rate": 0.00017483167883479157, + "loss": 4.3239, + "step": 6687 + }, + { + "epoch": 0.6934657084162298, + "grad_norm": 0.8125, + "learning_rate": 0.0001748244729826657, + "loss": 4.2554, + "step": 6688 + }, + { + "epoch": 0.6935693964707178, + "grad_norm": 0.83203125, + "learning_rate": 0.00017481726624769012, + "loss": 4.301, + "step": 6689 + }, + { + "epoch": 0.6936730845252059, + "grad_norm": 0.83203125, + "learning_rate": 0.0001748100586299499, + "loss": 4.2807, + "step": 6690 + }, + { + "epoch": 0.693776772579694, + "grad_norm": 0.76171875, + "learning_rate": 0.00017480285012953006, + "loss": 4.3359, + "step": 6691 + }, + { + "epoch": 0.6938804606341821, + "grad_norm": 0.765625, + "learning_rate": 0.00017479564074651568, + "loss": 4.3141, + "step": 6692 + }, + { + "epoch": 0.6939841486886701, + "grad_norm": 0.81640625, + "learning_rate": 0.00017478843048099178, + "loss": 4.2948, + "step": 6693 + }, + { + "epoch": 0.6940878367431582, + "grad_norm": 0.78125, + "learning_rate": 0.00017478121933304345, + "loss": 4.3005, + "step": 6694 + }, + { + "epoch": 0.6941915247976462, + "grad_norm": 0.8125, + "learning_rate": 0.0001747740073027558, + "loss": 4.3416, + "step": 6695 + }, + { + "epoch": 0.6942952128521344, + "grad_norm": 0.7421875, + "learning_rate": 0.0001747667943902139, + "loss": 4.3027, + "step": 6696 + }, + { + "epoch": 0.6943989009066224, + "grad_norm": 0.80859375, + "learning_rate": 0.00017475958059550285, + "loss": 4.3, + "step": 6697 + }, + { + "epoch": 0.6945025889611105, + "grad_norm": 0.7734375, + "learning_rate": 0.0001747523659187078, + "loss": 4.3291, + "step": 6698 + }, + { + "epoch": 0.6946062770155985, + "grad_norm": 0.7734375, + "learning_rate": 0.0001747451503599138, + "loss": 4.3234, + "step": 6699 + }, + { + "epoch": 0.6947099650700866, + "grad_norm": 0.83203125, + "learning_rate": 0.00017473793391920608, + "loss": 4.3066, + "step": 6700 + }, + { + "epoch": 0.6948136531245748, + "grad_norm": 0.796875, + "learning_rate": 0.0001747307165966697, + "loss": 4.3168, + "step": 6701 + }, + { + "epoch": 0.6949173411790628, + "grad_norm": 0.76171875, + "learning_rate": 0.00017472349839238989, + "loss": 4.3572, + "step": 6702 + }, + { + "epoch": 0.6950210292335509, + "grad_norm": 0.796875, + "learning_rate": 0.00017471627930645175, + "loss": 4.2868, + "step": 6703 + }, + { + "epoch": 0.6951247172880389, + "grad_norm": 0.7265625, + "learning_rate": 0.00017470905933894052, + "loss": 4.3121, + "step": 6704 + }, + { + "epoch": 0.6952284053425271, + "grad_norm": 0.78125, + "learning_rate": 0.00017470183848994139, + "loss": 4.3402, + "step": 6705 + }, + { + "epoch": 0.6953320933970151, + "grad_norm": 0.76953125, + "learning_rate": 0.0001746946167595395, + "loss": 4.301, + "step": 6706 + }, + { + "epoch": 0.6954357814515032, + "grad_norm": 0.69140625, + "learning_rate": 0.00017468739414782007, + "loss": 4.2396, + "step": 6707 + }, + { + "epoch": 0.6955394695059912, + "grad_norm": 0.73046875, + "learning_rate": 0.00017468017065486836, + "loss": 4.2664, + "step": 6708 + }, + { + "epoch": 0.6956431575604793, + "grad_norm": 0.73046875, + "learning_rate": 0.00017467294628076955, + "loss": 4.303, + "step": 6709 + }, + { + "epoch": 0.6957468456149674, + "grad_norm": 0.68359375, + "learning_rate": 0.00017466572102560894, + "loss": 4.3181, + "step": 6710 + }, + { + "epoch": 0.6958505336694555, + "grad_norm": 0.7265625, + "learning_rate": 0.0001746584948894717, + "loss": 4.329, + "step": 6711 + }, + { + "epoch": 0.6959542217239435, + "grad_norm": 0.65234375, + "learning_rate": 0.0001746512678724432, + "loss": 4.2602, + "step": 6712 + }, + { + "epoch": 0.6960579097784316, + "grad_norm": 0.80078125, + "learning_rate": 0.0001746440399746086, + "loss": 4.2684, + "step": 6713 + }, + { + "epoch": 0.6961615978329196, + "grad_norm": 0.61328125, + "learning_rate": 0.00017463681119605324, + "loss": 4.3137, + "step": 6714 + }, + { + "epoch": 0.6962652858874078, + "grad_norm": 0.76171875, + "learning_rate": 0.00017462958153686243, + "loss": 4.3403, + "step": 6715 + }, + { + "epoch": 0.6963689739418958, + "grad_norm": 0.65625, + "learning_rate": 0.00017462235099712143, + "loss": 4.313, + "step": 6716 + }, + { + "epoch": 0.6964726619963839, + "grad_norm": 0.66796875, + "learning_rate": 0.00017461511957691554, + "loss": 4.2659, + "step": 6717 + }, + { + "epoch": 0.6965763500508719, + "grad_norm": 0.6875, + "learning_rate": 0.00017460788727633014, + "loss": 4.3121, + "step": 6718 + }, + { + "epoch": 0.69668003810536, + "grad_norm": 0.625, + "learning_rate": 0.00017460065409545053, + "loss": 4.2943, + "step": 6719 + }, + { + "epoch": 0.6967837261598481, + "grad_norm": 0.66015625, + "learning_rate": 0.00017459342003436204, + "loss": 4.2951, + "step": 6720 + }, + { + "epoch": 0.6968874142143362, + "grad_norm": 0.7109375, + "learning_rate": 0.00017458618509315005, + "loss": 4.3544, + "step": 6721 + }, + { + "epoch": 0.6969911022688242, + "grad_norm": 0.6015625, + "learning_rate": 0.00017457894927189996, + "loss": 4.286, + "step": 6722 + }, + { + "epoch": 0.6970947903233123, + "grad_norm": 0.70703125, + "learning_rate": 0.00017457171257069707, + "loss": 4.3755, + "step": 6723 + }, + { + "epoch": 0.6971984783778004, + "grad_norm": 0.62890625, + "learning_rate": 0.00017456447498962678, + "loss": 4.3603, + "step": 6724 + }, + { + "epoch": 0.6973021664322885, + "grad_norm": 0.6640625, + "learning_rate": 0.0001745572365287745, + "loss": 4.3034, + "step": 6725 + }, + { + "epoch": 0.6974058544867765, + "grad_norm": 0.66015625, + "learning_rate": 0.00017454999718822566, + "loss": 4.3108, + "step": 6726 + }, + { + "epoch": 0.6975095425412646, + "grad_norm": 0.765625, + "learning_rate": 0.00017454275696806567, + "loss": 4.2932, + "step": 6727 + }, + { + "epoch": 0.6976132305957526, + "grad_norm": 0.671875, + "learning_rate": 0.0001745355158683799, + "loss": 4.2865, + "step": 6728 + }, + { + "epoch": 0.6977169186502408, + "grad_norm": 0.74609375, + "learning_rate": 0.00017452827388925388, + "loss": 4.3343, + "step": 6729 + }, + { + "epoch": 0.6978206067047288, + "grad_norm": 0.73046875, + "learning_rate": 0.000174521031030773, + "loss": 4.2954, + "step": 6730 + }, + { + "epoch": 0.6979242947592169, + "grad_norm": 0.7578125, + "learning_rate": 0.00017451378729302271, + "loss": 4.3043, + "step": 6731 + }, + { + "epoch": 0.6980279828137049, + "grad_norm": 0.7109375, + "learning_rate": 0.00017450654267608847, + "loss": 4.3119, + "step": 6732 + }, + { + "epoch": 0.698131670868193, + "grad_norm": 0.77734375, + "learning_rate": 0.00017449929718005582, + "loss": 4.2998, + "step": 6733 + }, + { + "epoch": 0.6982353589226811, + "grad_norm": 0.67578125, + "learning_rate": 0.00017449205080501018, + "loss": 4.2894, + "step": 6734 + }, + { + "epoch": 0.6983390469771692, + "grad_norm": 0.703125, + "learning_rate": 0.0001744848035510371, + "loss": 4.2802, + "step": 6735 + }, + { + "epoch": 0.6984427350316572, + "grad_norm": 0.72265625, + "learning_rate": 0.00017447755541822208, + "loss": 4.2742, + "step": 6736 + }, + { + "epoch": 0.6985464230861453, + "grad_norm": 0.7578125, + "learning_rate": 0.00017447030640665062, + "loss": 4.2813, + "step": 6737 + }, + { + "epoch": 0.6986501111406334, + "grad_norm": 0.78125, + "learning_rate": 0.00017446305651640825, + "loss": 4.2958, + "step": 6738 + }, + { + "epoch": 0.6987537991951215, + "grad_norm": 0.7578125, + "learning_rate": 0.00017445580574758056, + "loss": 4.2717, + "step": 6739 + }, + { + "epoch": 0.6988574872496095, + "grad_norm": 0.84765625, + "learning_rate": 0.00017444855410025305, + "loss": 4.3151, + "step": 6740 + }, + { + "epoch": 0.6989611753040976, + "grad_norm": 0.7734375, + "learning_rate": 0.0001744413015745113, + "loss": 4.3139, + "step": 6741 + }, + { + "epoch": 0.6990648633585856, + "grad_norm": 0.8046875, + "learning_rate": 0.00017443404817044087, + "loss": 4.3068, + "step": 6742 + }, + { + "epoch": 0.6991685514130738, + "grad_norm": 0.7890625, + "learning_rate": 0.00017442679388812737, + "loss": 4.3164, + "step": 6743 + }, + { + "epoch": 0.6992722394675618, + "grad_norm": 0.8359375, + "learning_rate": 0.00017441953872765638, + "loss": 4.2802, + "step": 6744 + }, + { + "epoch": 0.6993759275220499, + "grad_norm": 0.71484375, + "learning_rate": 0.00017441228268911347, + "loss": 4.332, + "step": 6745 + }, + { + "epoch": 0.699479615576538, + "grad_norm": 0.9453125, + "learning_rate": 0.00017440502577258427, + "loss": 4.2957, + "step": 6746 + }, + { + "epoch": 0.699583303631026, + "grad_norm": 0.8046875, + "learning_rate": 0.00017439776797815445, + "loss": 4.3194, + "step": 6747 + }, + { + "epoch": 0.6996869916855142, + "grad_norm": 0.92578125, + "learning_rate": 0.00017439050930590958, + "loss": 4.2783, + "step": 6748 + }, + { + "epoch": 0.6997906797400022, + "grad_norm": 0.8203125, + "learning_rate": 0.00017438324975593538, + "loss": 4.2853, + "step": 6749 + }, + { + "epoch": 0.6998943677944903, + "grad_norm": 0.8671875, + "learning_rate": 0.0001743759893283174, + "loss": 4.2653, + "step": 6750 + }, + { + "epoch": 0.6999980558489783, + "grad_norm": 0.9375, + "learning_rate": 0.00017436872802314141, + "loss": 4.309, + "step": 6751 + }, + { + "epoch": 0.7001017439034665, + "grad_norm": 0.8203125, + "learning_rate": 0.00017436146584049302, + "loss": 4.3477, + "step": 6752 + }, + { + "epoch": 0.7002054319579545, + "grad_norm": 0.9765625, + "learning_rate": 0.00017435420278045794, + "loss": 4.3174, + "step": 6753 + }, + { + "epoch": 0.7003091200124426, + "grad_norm": 0.79296875, + "learning_rate": 0.00017434693884312184, + "loss": 4.2983, + "step": 6754 + }, + { + "epoch": 0.7004128080669306, + "grad_norm": 0.94140625, + "learning_rate": 0.0001743396740285705, + "loss": 4.3341, + "step": 6755 + }, + { + "epoch": 0.7005164961214188, + "grad_norm": 0.80078125, + "learning_rate": 0.00017433240833688955, + "loss": 4.3021, + "step": 6756 + }, + { + "epoch": 0.7006201841759068, + "grad_norm": 0.734375, + "learning_rate": 0.00017432514176816478, + "loss": 4.3089, + "step": 6757 + }, + { + "epoch": 0.7007238722303949, + "grad_norm": 0.80859375, + "learning_rate": 0.00017431787432248188, + "loss": 4.3165, + "step": 6758 + }, + { + "epoch": 0.7008275602848829, + "grad_norm": 0.8125, + "learning_rate": 0.00017431060599992662, + "loss": 4.321, + "step": 6759 + }, + { + "epoch": 0.700931248339371, + "grad_norm": 0.83203125, + "learning_rate": 0.00017430333680058476, + "loss": 4.3034, + "step": 6760 + }, + { + "epoch": 0.701034936393859, + "grad_norm": 0.828125, + "learning_rate": 0.0001742960667245421, + "loss": 4.2945, + "step": 6761 + }, + { + "epoch": 0.7011386244483472, + "grad_norm": 0.95703125, + "learning_rate": 0.00017428879577188435, + "loss": 4.2914, + "step": 6762 + }, + { + "epoch": 0.7012423125028352, + "grad_norm": 0.7265625, + "learning_rate": 0.00017428152394269731, + "loss": 4.3338, + "step": 6763 + }, + { + "epoch": 0.7013460005573233, + "grad_norm": 0.79296875, + "learning_rate": 0.00017427425123706688, + "loss": 4.3143, + "step": 6764 + }, + { + "epoch": 0.7014496886118113, + "grad_norm": 0.78125, + "learning_rate": 0.00017426697765507876, + "loss": 4.2955, + "step": 6765 + }, + { + "epoch": 0.7015533766662995, + "grad_norm": 0.75390625, + "learning_rate": 0.00017425970319681882, + "loss": 4.2712, + "step": 6766 + }, + { + "epoch": 0.7016570647207875, + "grad_norm": 0.84765625, + "learning_rate": 0.00017425242786237285, + "loss": 4.2621, + "step": 6767 + }, + { + "epoch": 0.7017607527752756, + "grad_norm": 0.80078125, + "learning_rate": 0.00017424515165182674, + "loss": 4.3243, + "step": 6768 + }, + { + "epoch": 0.7018644408297636, + "grad_norm": 0.78515625, + "learning_rate": 0.00017423787456526634, + "loss": 4.2579, + "step": 6769 + }, + { + "epoch": 0.7019681288842518, + "grad_norm": 0.78515625, + "learning_rate": 0.00017423059660277742, + "loss": 4.2895, + "step": 6770 + }, + { + "epoch": 0.7020718169387398, + "grad_norm": 0.859375, + "learning_rate": 0.00017422331776444598, + "loss": 4.3058, + "step": 6771 + }, + { + "epoch": 0.7021755049932279, + "grad_norm": 0.91796875, + "learning_rate": 0.00017421603805035785, + "loss": 4.2943, + "step": 6772 + }, + { + "epoch": 0.7022791930477159, + "grad_norm": 0.7890625, + "learning_rate": 0.00017420875746059893, + "loss": 4.2972, + "step": 6773 + }, + { + "epoch": 0.702382881102204, + "grad_norm": 0.90625, + "learning_rate": 0.0001742014759952551, + "loss": 4.3516, + "step": 6774 + }, + { + "epoch": 0.702486569156692, + "grad_norm": 0.78515625, + "learning_rate": 0.00017419419365441227, + "loss": 4.3031, + "step": 6775 + }, + { + "epoch": 0.7025902572111802, + "grad_norm": 0.890625, + "learning_rate": 0.0001741869104381564, + "loss": 4.3006, + "step": 6776 + }, + { + "epoch": 0.7026939452656682, + "grad_norm": 0.796875, + "learning_rate": 0.0001741796263465734, + "loss": 4.3177, + "step": 6777 + }, + { + "epoch": 0.7027976333201563, + "grad_norm": 0.8125, + "learning_rate": 0.00017417234137974923, + "loss": 4.354, + "step": 6778 + }, + { + "epoch": 0.7029013213746443, + "grad_norm": 0.83203125, + "learning_rate": 0.00017416505553776983, + "loss": 4.3367, + "step": 6779 + }, + { + "epoch": 0.7030050094291325, + "grad_norm": 0.79296875, + "learning_rate": 0.00017415776882072118, + "loss": 4.2742, + "step": 6780 + }, + { + "epoch": 0.7031086974836205, + "grad_norm": 0.90625, + "learning_rate": 0.00017415048122868923, + "loss": 4.2843, + "step": 6781 + }, + { + "epoch": 0.7032123855381086, + "grad_norm": 0.765625, + "learning_rate": 0.00017414319276175995, + "loss": 4.3533, + "step": 6782 + }, + { + "epoch": 0.7033160735925966, + "grad_norm": 0.90234375, + "learning_rate": 0.00017413590342001944, + "loss": 4.3043, + "step": 6783 + }, + { + "epoch": 0.7034197616470848, + "grad_norm": 0.71875, + "learning_rate": 0.0001741286132035536, + "loss": 4.3679, + "step": 6784 + }, + { + "epoch": 0.7035234497015728, + "grad_norm": 0.8359375, + "learning_rate": 0.00017412132211244846, + "loss": 4.2908, + "step": 6785 + }, + { + "epoch": 0.7036271377560609, + "grad_norm": 0.71484375, + "learning_rate": 0.0001741140301467901, + "loss": 4.2883, + "step": 6786 + }, + { + "epoch": 0.7037308258105489, + "grad_norm": 0.80859375, + "learning_rate": 0.00017410673730666452, + "loss": 4.3207, + "step": 6787 + }, + { + "epoch": 0.703834513865037, + "grad_norm": 0.72265625, + "learning_rate": 0.00017409944359215779, + "loss": 4.2686, + "step": 6788 + }, + { + "epoch": 0.7039382019195252, + "grad_norm": 0.83984375, + "learning_rate": 0.00017409214900335592, + "loss": 4.3197, + "step": 6789 + }, + { + "epoch": 0.7040418899740132, + "grad_norm": 0.671875, + "learning_rate": 0.000174084853540345, + "loss": 4.2796, + "step": 6790 + }, + { + "epoch": 0.7041455780285013, + "grad_norm": 0.89453125, + "learning_rate": 0.00017407755720321116, + "loss": 4.2983, + "step": 6791 + }, + { + "epoch": 0.7042492660829893, + "grad_norm": 0.69921875, + "learning_rate": 0.00017407025999204042, + "loss": 4.2933, + "step": 6792 + }, + { + "epoch": 0.7043529541374774, + "grad_norm": 0.76953125, + "learning_rate": 0.00017406296190691892, + "loss": 4.3228, + "step": 6793 + }, + { + "epoch": 0.7044566421919655, + "grad_norm": 0.6953125, + "learning_rate": 0.00017405566294793277, + "loss": 4.2952, + "step": 6794 + }, + { + "epoch": 0.7045603302464536, + "grad_norm": 0.83984375, + "learning_rate": 0.00017404836311516806, + "loss": 4.3146, + "step": 6795 + }, + { + "epoch": 0.7046640183009416, + "grad_norm": 0.71484375, + "learning_rate": 0.00017404106240871093, + "loss": 4.316, + "step": 6796 + }, + { + "epoch": 0.7047677063554297, + "grad_norm": 0.74609375, + "learning_rate": 0.00017403376082864754, + "loss": 4.2951, + "step": 6797 + }, + { + "epoch": 0.7048713944099178, + "grad_norm": 0.7734375, + "learning_rate": 0.000174026458375064, + "loss": 4.2902, + "step": 6798 + }, + { + "epoch": 0.7049750824644059, + "grad_norm": 0.75390625, + "learning_rate": 0.00017401915504804656, + "loss": 4.2828, + "step": 6799 + }, + { + "epoch": 0.7050787705188939, + "grad_norm": 0.85546875, + "learning_rate": 0.0001740118508476813, + "loss": 4.34, + "step": 6800 + }, + { + "epoch": 0.705182458573382, + "grad_norm": 0.7890625, + "learning_rate": 0.00017400454577405443, + "loss": 4.3162, + "step": 6801 + }, + { + "epoch": 0.70528614662787, + "grad_norm": 0.86328125, + "learning_rate": 0.00017399723982725217, + "loss": 4.3065, + "step": 6802 + }, + { + "epoch": 0.7053898346823582, + "grad_norm": 0.83984375, + "learning_rate": 0.00017398993300736065, + "loss": 4.2622, + "step": 6803 + }, + { + "epoch": 0.7054935227368462, + "grad_norm": 0.7734375, + "learning_rate": 0.00017398262531446616, + "loss": 4.3342, + "step": 6804 + }, + { + "epoch": 0.7055972107913343, + "grad_norm": 0.83984375, + "learning_rate": 0.0001739753167486549, + "loss": 4.2971, + "step": 6805 + }, + { + "epoch": 0.7057008988458223, + "grad_norm": 0.8125, + "learning_rate": 0.0001739680073100131, + "loss": 4.3237, + "step": 6806 + }, + { + "epoch": 0.7058045869003104, + "grad_norm": 0.8984375, + "learning_rate": 0.000173960696998627, + "loss": 4.3068, + "step": 6807 + }, + { + "epoch": 0.7059082749547985, + "grad_norm": 0.76953125, + "learning_rate": 0.00017395338581458286, + "loss": 4.2803, + "step": 6808 + }, + { + "epoch": 0.7060119630092866, + "grad_norm": 0.796875, + "learning_rate": 0.00017394607375796693, + "loss": 4.2956, + "step": 6809 + }, + { + "epoch": 0.7061156510637746, + "grad_norm": 0.76953125, + "learning_rate": 0.00017393876082886546, + "loss": 4.3073, + "step": 6810 + }, + { + "epoch": 0.7062193391182627, + "grad_norm": 0.73828125, + "learning_rate": 0.0001739314470273648, + "loss": 4.2763, + "step": 6811 + }, + { + "epoch": 0.7063230271727508, + "grad_norm": 0.81640625, + "learning_rate": 0.00017392413235355124, + "loss": 4.3287, + "step": 6812 + }, + { + "epoch": 0.7064267152272389, + "grad_norm": 0.73046875, + "learning_rate": 0.000173916816807511, + "loss": 4.2653, + "step": 6813 + }, + { + "epoch": 0.7065304032817269, + "grad_norm": 0.765625, + "learning_rate": 0.0001739095003893305, + "loss": 4.3285, + "step": 6814 + }, + { + "epoch": 0.706634091336215, + "grad_norm": 0.8203125, + "learning_rate": 0.00017390218309909603, + "loss": 4.3328, + "step": 6815 + }, + { + "epoch": 0.706737779390703, + "grad_norm": 0.72265625, + "learning_rate": 0.00017389486493689388, + "loss": 4.2716, + "step": 6816 + }, + { + "epoch": 0.7068414674451912, + "grad_norm": 0.921875, + "learning_rate": 0.00017388754590281046, + "loss": 4.2941, + "step": 6817 + }, + { + "epoch": 0.7069451554996792, + "grad_norm": 0.7421875, + "learning_rate": 0.0001738802259969321, + "loss": 4.3477, + "step": 6818 + }, + { + "epoch": 0.7070488435541673, + "grad_norm": 0.99609375, + "learning_rate": 0.00017387290521934517, + "loss": 4.2487, + "step": 6819 + }, + { + "epoch": 0.7071525316086553, + "grad_norm": 0.7109375, + "learning_rate": 0.00017386558357013602, + "loss": 4.2578, + "step": 6820 + }, + { + "epoch": 0.7072562196631434, + "grad_norm": 0.95703125, + "learning_rate": 0.00017385826104939108, + "loss": 4.2883, + "step": 6821 + }, + { + "epoch": 0.7073599077176315, + "grad_norm": 0.72265625, + "learning_rate": 0.00017385093765719673, + "loss": 4.2993, + "step": 6822 + }, + { + "epoch": 0.7074635957721196, + "grad_norm": 0.890625, + "learning_rate": 0.0001738436133936394, + "loss": 4.2944, + "step": 6823 + }, + { + "epoch": 0.7075672838266076, + "grad_norm": 0.75, + "learning_rate": 0.00017383628825880546, + "loss": 4.2836, + "step": 6824 + }, + { + "epoch": 0.7076709718810957, + "grad_norm": 0.8046875, + "learning_rate": 0.0001738289622527814, + "loss": 4.2791, + "step": 6825 + }, + { + "epoch": 0.7077746599355837, + "grad_norm": 0.80859375, + "learning_rate": 0.00017382163537565357, + "loss": 4.2917, + "step": 6826 + }, + { + "epoch": 0.7078783479900719, + "grad_norm": 0.7734375, + "learning_rate": 0.0001738143076275085, + "loss": 4.2644, + "step": 6827 + }, + { + "epoch": 0.7079820360445599, + "grad_norm": 0.71875, + "learning_rate": 0.00017380697900843263, + "loss": 4.3196, + "step": 6828 + }, + { + "epoch": 0.708085724099048, + "grad_norm": 0.7734375, + "learning_rate": 0.00017379964951851244, + "loss": 4.3053, + "step": 6829 + }, + { + "epoch": 0.708189412153536, + "grad_norm": 0.734375, + "learning_rate": 0.0001737923191578344, + "loss": 4.2747, + "step": 6830 + }, + { + "epoch": 0.7082931002080242, + "grad_norm": 0.78515625, + "learning_rate": 0.00017378498792648496, + "loss": 4.2809, + "step": 6831 + }, + { + "epoch": 0.7083967882625122, + "grad_norm": 0.82421875, + "learning_rate": 0.00017377765582455069, + "loss": 4.3281, + "step": 6832 + }, + { + "epoch": 0.7085004763170003, + "grad_norm": 0.8203125, + "learning_rate": 0.00017377032285211804, + "loss": 4.3215, + "step": 6833 + }, + { + "epoch": 0.7086041643714884, + "grad_norm": 0.7109375, + "learning_rate": 0.00017376298900927356, + "loss": 4.3022, + "step": 6834 + }, + { + "epoch": 0.7087078524259764, + "grad_norm": 0.8515625, + "learning_rate": 0.0001737556542961038, + "loss": 4.2739, + "step": 6835 + }, + { + "epoch": 0.7088115404804646, + "grad_norm": 0.69921875, + "learning_rate": 0.00017374831871269528, + "loss": 4.3283, + "step": 6836 + }, + { + "epoch": 0.7089152285349526, + "grad_norm": 0.78125, + "learning_rate": 0.00017374098225913454, + "loss": 4.2811, + "step": 6837 + }, + { + "epoch": 0.7090189165894407, + "grad_norm": 0.6484375, + "learning_rate": 0.00017373364493550815, + "loss": 4.3492, + "step": 6838 + }, + { + "epoch": 0.7091226046439287, + "grad_norm": 0.8125, + "learning_rate": 0.00017372630674190274, + "loss": 4.249, + "step": 6839 + }, + { + "epoch": 0.7092262926984169, + "grad_norm": 0.6953125, + "learning_rate": 0.00017371896767840478, + "loss": 4.3491, + "step": 6840 + }, + { + "epoch": 0.7093299807529049, + "grad_norm": 0.8671875, + "learning_rate": 0.00017371162774510097, + "loss": 4.3126, + "step": 6841 + }, + { + "epoch": 0.709433668807393, + "grad_norm": 0.7109375, + "learning_rate": 0.00017370428694207782, + "loss": 4.3509, + "step": 6842 + }, + { + "epoch": 0.709537356861881, + "grad_norm": 0.75390625, + "learning_rate": 0.00017369694526942208, + "loss": 4.2861, + "step": 6843 + }, + { + "epoch": 0.7096410449163691, + "grad_norm": 0.75390625, + "learning_rate": 0.00017368960272722022, + "loss": 4.3139, + "step": 6844 + }, + { + "epoch": 0.7097447329708572, + "grad_norm": 0.625, + "learning_rate": 0.00017368225931555892, + "loss": 4.3315, + "step": 6845 + }, + { + "epoch": 0.7098484210253453, + "grad_norm": 0.75390625, + "learning_rate": 0.0001736749150345249, + "loss": 4.2999, + "step": 6846 + }, + { + "epoch": 0.7099521090798333, + "grad_norm": 0.6484375, + "learning_rate": 0.00017366756988420473, + "loss": 4.3054, + "step": 6847 + }, + { + "epoch": 0.7100557971343214, + "grad_norm": 0.73046875, + "learning_rate": 0.00017366022386468513, + "loss": 4.2595, + "step": 6848 + }, + { + "epoch": 0.7101594851888094, + "grad_norm": 0.6953125, + "learning_rate": 0.00017365287697605273, + "loss": 4.2923, + "step": 6849 + }, + { + "epoch": 0.7102631732432976, + "grad_norm": 0.75, + "learning_rate": 0.00017364552921839423, + "loss": 4.3015, + "step": 6850 + }, + { + "epoch": 0.7103668612977856, + "grad_norm": 0.79296875, + "learning_rate": 0.00017363818059179634, + "loss": 4.3324, + "step": 6851 + }, + { + "epoch": 0.7104705493522737, + "grad_norm": 0.69921875, + "learning_rate": 0.00017363083109634577, + "loss": 4.3018, + "step": 6852 + }, + { + "epoch": 0.7105742374067617, + "grad_norm": 0.8125, + "learning_rate": 0.0001736234807321292, + "loss": 4.2822, + "step": 6853 + }, + { + "epoch": 0.7106779254612499, + "grad_norm": 0.703125, + "learning_rate": 0.00017361612949923344, + "loss": 4.3018, + "step": 6854 + }, + { + "epoch": 0.7107816135157379, + "grad_norm": 0.84765625, + "learning_rate": 0.0001736087773977451, + "loss": 4.309, + "step": 6855 + }, + { + "epoch": 0.710885301570226, + "grad_norm": 0.78515625, + "learning_rate": 0.00017360142442775104, + "loss": 4.3042, + "step": 6856 + }, + { + "epoch": 0.710988989624714, + "grad_norm": 0.74609375, + "learning_rate": 0.00017359407058933793, + "loss": 4.3146, + "step": 6857 + }, + { + "epoch": 0.7110926776792021, + "grad_norm": 0.73046875, + "learning_rate": 0.00017358671588259261, + "loss": 4.298, + "step": 6858 + }, + { + "epoch": 0.7111963657336902, + "grad_norm": 0.796875, + "learning_rate": 0.00017357936030760183, + "loss": 4.3325, + "step": 6859 + }, + { + "epoch": 0.7113000537881783, + "grad_norm": 0.828125, + "learning_rate": 0.00017357200386445233, + "loss": 4.3342, + "step": 6860 + }, + { + "epoch": 0.7114037418426663, + "grad_norm": 0.76953125, + "learning_rate": 0.000173564646553231, + "loss": 4.321, + "step": 6861 + }, + { + "epoch": 0.7115074298971544, + "grad_norm": 0.8046875, + "learning_rate": 0.00017355728837402458, + "loss": 4.274, + "step": 6862 + }, + { + "epoch": 0.7116111179516424, + "grad_norm": 0.76953125, + "learning_rate": 0.00017354992932691992, + "loss": 4.2502, + "step": 6863 + }, + { + "epoch": 0.7117148060061306, + "grad_norm": 0.7734375, + "learning_rate": 0.00017354256941200385, + "loss": 4.2805, + "step": 6864 + }, + { + "epoch": 0.7118184940606186, + "grad_norm": 0.66015625, + "learning_rate": 0.00017353520862936316, + "loss": 4.2811, + "step": 6865 + }, + { + "epoch": 0.7119221821151067, + "grad_norm": 0.7578125, + "learning_rate": 0.00017352784697908478, + "loss": 4.3037, + "step": 6866 + }, + { + "epoch": 0.7120258701695947, + "grad_norm": 0.7109375, + "learning_rate": 0.00017352048446125551, + "loss": 4.2977, + "step": 6867 + }, + { + "epoch": 0.7121295582240829, + "grad_norm": 0.85546875, + "learning_rate": 0.00017351312107596225, + "loss": 4.3676, + "step": 6868 + }, + { + "epoch": 0.7122332462785709, + "grad_norm": 0.7578125, + "learning_rate": 0.00017350575682329185, + "loss": 4.3402, + "step": 6869 + }, + { + "epoch": 0.712336934333059, + "grad_norm": 0.83203125, + "learning_rate": 0.0001734983917033312, + "loss": 4.3002, + "step": 6870 + }, + { + "epoch": 0.712440622387547, + "grad_norm": 0.83203125, + "learning_rate": 0.00017349102571616727, + "loss": 4.3039, + "step": 6871 + }, + { + "epoch": 0.7125443104420351, + "grad_norm": 0.796875, + "learning_rate": 0.0001734836588618869, + "loss": 4.3131, + "step": 6872 + }, + { + "epoch": 0.7126479984965232, + "grad_norm": 0.73046875, + "learning_rate": 0.00017347629114057705, + "loss": 4.3066, + "step": 6873 + }, + { + "epoch": 0.7127516865510113, + "grad_norm": 0.703125, + "learning_rate": 0.0001734689225523246, + "loss": 4.2851, + "step": 6874 + }, + { + "epoch": 0.7128553746054993, + "grad_norm": 0.8359375, + "learning_rate": 0.00017346155309721652, + "loss": 4.2915, + "step": 6875 + }, + { + "epoch": 0.7129590626599874, + "grad_norm": 0.7109375, + "learning_rate": 0.00017345418277533978, + "loss": 4.3027, + "step": 6876 + }, + { + "epoch": 0.7130627507144754, + "grad_norm": 0.77734375, + "learning_rate": 0.00017344681158678132, + "loss": 4.3315, + "step": 6877 + }, + { + "epoch": 0.7131664387689636, + "grad_norm": 0.70703125, + "learning_rate": 0.0001734394395316281, + "loss": 4.3191, + "step": 6878 + }, + { + "epoch": 0.7132701268234517, + "grad_norm": 0.7421875, + "learning_rate": 0.0001734320666099672, + "loss": 4.2969, + "step": 6879 + }, + { + "epoch": 0.7133738148779397, + "grad_norm": 0.79296875, + "learning_rate": 0.00017342469282188546, + "loss": 4.3088, + "step": 6880 + }, + { + "epoch": 0.7134775029324278, + "grad_norm": 0.76953125, + "learning_rate": 0.00017341731816746997, + "loss": 4.2874, + "step": 6881 + }, + { + "epoch": 0.7135811909869159, + "grad_norm": 0.7578125, + "learning_rate": 0.00017340994264680774, + "loss": 4.2924, + "step": 6882 + }, + { + "epoch": 0.713684879041404, + "grad_norm": 0.74609375, + "learning_rate": 0.0001734025662599858, + "loss": 4.31, + "step": 6883 + }, + { + "epoch": 0.713788567095892, + "grad_norm": 0.71484375, + "learning_rate": 0.00017339518900709116, + "loss": 4.3129, + "step": 6884 + }, + { + "epoch": 0.7138922551503801, + "grad_norm": 0.71875, + "learning_rate": 0.00017338781088821085, + "loss": 4.3051, + "step": 6885 + }, + { + "epoch": 0.7139959432048681, + "grad_norm": 0.74609375, + "learning_rate": 0.00017338043190343196, + "loss": 4.3367, + "step": 6886 + }, + { + "epoch": 0.7140996312593563, + "grad_norm": 0.7890625, + "learning_rate": 0.00017337305205284155, + "loss": 4.326, + "step": 6887 + }, + { + "epoch": 0.7142033193138443, + "grad_norm": 0.62890625, + "learning_rate": 0.00017336567133652668, + "loss": 4.3013, + "step": 6888 + }, + { + "epoch": 0.7143070073683324, + "grad_norm": 0.7734375, + "learning_rate": 0.00017335828975457445, + "loss": 4.3218, + "step": 6889 + }, + { + "epoch": 0.7144106954228204, + "grad_norm": 0.6171875, + "learning_rate": 0.0001733509073070719, + "loss": 4.2486, + "step": 6890 + }, + { + "epoch": 0.7145143834773086, + "grad_norm": 0.7890625, + "learning_rate": 0.00017334352399410623, + "loss": 4.285, + "step": 6891 + }, + { + "epoch": 0.7146180715317966, + "grad_norm": 0.6875, + "learning_rate": 0.0001733361398157645, + "loss": 4.2822, + "step": 6892 + }, + { + "epoch": 0.7147217595862847, + "grad_norm": 0.8203125, + "learning_rate": 0.00017332875477213378, + "loss": 4.3236, + "step": 6893 + }, + { + "epoch": 0.7148254476407727, + "grad_norm": 0.6953125, + "learning_rate": 0.00017332136886330136, + "loss": 4.2788, + "step": 6894 + }, + { + "epoch": 0.7149291356952608, + "grad_norm": 0.765625, + "learning_rate": 0.0001733139820893542, + "loss": 4.3271, + "step": 6895 + }, + { + "epoch": 0.7150328237497489, + "grad_norm": 0.61328125, + "learning_rate": 0.0001733065944503796, + "loss": 4.2728, + "step": 6896 + }, + { + "epoch": 0.715136511804237, + "grad_norm": 0.77734375, + "learning_rate": 0.00017329920594646466, + "loss": 4.3192, + "step": 6897 + }, + { + "epoch": 0.715240199858725, + "grad_norm": 0.64453125, + "learning_rate": 0.00017329181657769658, + "loss": 4.2962, + "step": 6898 + }, + { + "epoch": 0.7153438879132131, + "grad_norm": 0.85546875, + "learning_rate": 0.00017328442634416253, + "loss": 4.3042, + "step": 6899 + }, + { + "epoch": 0.7154475759677011, + "grad_norm": 0.78515625, + "learning_rate": 0.00017327703524594971, + "loss": 4.3336, + "step": 6900 + }, + { + "epoch": 0.7155512640221893, + "grad_norm": 0.734375, + "learning_rate": 0.00017326964328314532, + "loss": 4.2664, + "step": 6901 + }, + { + "epoch": 0.7156549520766773, + "grad_norm": 0.8828125, + "learning_rate": 0.0001732622504558366, + "loss": 4.2763, + "step": 6902 + }, + { + "epoch": 0.7157586401311654, + "grad_norm": 0.7421875, + "learning_rate": 0.0001732548567641108, + "loss": 4.2979, + "step": 6903 + }, + { + "epoch": 0.7158623281856534, + "grad_norm": 0.83203125, + "learning_rate": 0.00017324746220805508, + "loss": 4.3024, + "step": 6904 + }, + { + "epoch": 0.7159660162401416, + "grad_norm": 0.81640625, + "learning_rate": 0.00017324006678775674, + "loss": 4.3234, + "step": 6905 + }, + { + "epoch": 0.7160697042946296, + "grad_norm": 0.83984375, + "learning_rate": 0.00017323267050330302, + "loss": 4.3004, + "step": 6906 + }, + { + "epoch": 0.7161733923491177, + "grad_norm": 0.7265625, + "learning_rate": 0.00017322527335478122, + "loss": 4.3169, + "step": 6907 + }, + { + "epoch": 0.7162770804036057, + "grad_norm": 0.86328125, + "learning_rate": 0.00017321787534227862, + "loss": 4.3255, + "step": 6908 + }, + { + "epoch": 0.7163807684580938, + "grad_norm": 0.75, + "learning_rate": 0.00017321047646588243, + "loss": 4.3095, + "step": 6909 + }, + { + "epoch": 0.7164844565125819, + "grad_norm": 0.8046875, + "learning_rate": 0.00017320307672568004, + "loss": 4.3014, + "step": 6910 + }, + { + "epoch": 0.71658814456707, + "grad_norm": 0.84765625, + "learning_rate": 0.00017319567612175872, + "loss": 4.3117, + "step": 6911 + }, + { + "epoch": 0.716691832621558, + "grad_norm": 0.75, + "learning_rate": 0.00017318827465420577, + "loss": 4.273, + "step": 6912 + }, + { + "epoch": 0.7167955206760461, + "grad_norm": 0.90625, + "learning_rate": 0.00017318087232310857, + "loss": 4.2808, + "step": 6913 + }, + { + "epoch": 0.7168992087305341, + "grad_norm": 0.6953125, + "learning_rate": 0.00017317346912855443, + "loss": 4.2904, + "step": 6914 + }, + { + "epoch": 0.7170028967850223, + "grad_norm": 0.83203125, + "learning_rate": 0.0001731660650706307, + "loss": 4.3138, + "step": 6915 + }, + { + "epoch": 0.7171065848395103, + "grad_norm": 0.76171875, + "learning_rate": 0.00017315866014942473, + "loss": 4.2877, + "step": 6916 + }, + { + "epoch": 0.7172102728939984, + "grad_norm": 0.80078125, + "learning_rate": 0.0001731512543650239, + "loss": 4.3118, + "step": 6917 + }, + { + "epoch": 0.7173139609484864, + "grad_norm": 0.70703125, + "learning_rate": 0.00017314384771751563, + "loss": 4.3041, + "step": 6918 + }, + { + "epoch": 0.7174176490029746, + "grad_norm": 0.75, + "learning_rate": 0.00017313644020698724, + "loss": 4.282, + "step": 6919 + }, + { + "epoch": 0.7175213370574626, + "grad_norm": 0.73828125, + "learning_rate": 0.00017312903183352616, + "loss": 4.266, + "step": 6920 + }, + { + "epoch": 0.7176250251119507, + "grad_norm": 0.75, + "learning_rate": 0.0001731216225972198, + "loss": 4.2921, + "step": 6921 + }, + { + "epoch": 0.7177287131664387, + "grad_norm": 0.75390625, + "learning_rate": 0.0001731142124981556, + "loss": 4.2855, + "step": 6922 + }, + { + "epoch": 0.7178324012209268, + "grad_norm": 0.703125, + "learning_rate": 0.00017310680153642097, + "loss": 4.2506, + "step": 6923 + }, + { + "epoch": 0.717936089275415, + "grad_norm": 0.73828125, + "learning_rate": 0.00017309938971210337, + "loss": 4.3639, + "step": 6924 + }, + { + "epoch": 0.718039777329903, + "grad_norm": 0.69140625, + "learning_rate": 0.00017309197702529026, + "loss": 4.2782, + "step": 6925 + }, + { + "epoch": 0.7181434653843911, + "grad_norm": 0.76171875, + "learning_rate": 0.00017308456347606905, + "loss": 4.332, + "step": 6926 + }, + { + "epoch": 0.7182471534388791, + "grad_norm": 0.71484375, + "learning_rate": 0.00017307714906452724, + "loss": 4.3266, + "step": 6927 + }, + { + "epoch": 0.7183508414933673, + "grad_norm": 0.71875, + "learning_rate": 0.00017306973379075234, + "loss": 4.3125, + "step": 6928 + }, + { + "epoch": 0.7184545295478553, + "grad_norm": 0.74609375, + "learning_rate": 0.0001730623176548318, + "loss": 4.2936, + "step": 6929 + }, + { + "epoch": 0.7185582176023434, + "grad_norm": 0.75390625, + "learning_rate": 0.00017305490065685318, + "loss": 4.3082, + "step": 6930 + }, + { + "epoch": 0.7186619056568314, + "grad_norm": 0.7109375, + "learning_rate": 0.00017304748279690392, + "loss": 4.3166, + "step": 6931 + }, + { + "epoch": 0.7187655937113195, + "grad_norm": 0.69140625, + "learning_rate": 0.0001730400640750716, + "loss": 4.3189, + "step": 6932 + }, + { + "epoch": 0.7188692817658076, + "grad_norm": 0.73046875, + "learning_rate": 0.00017303264449144369, + "loss": 4.2606, + "step": 6933 + }, + { + "epoch": 0.7189729698202957, + "grad_norm": 0.80859375, + "learning_rate": 0.0001730252240461078, + "loss": 4.28, + "step": 6934 + }, + { + "epoch": 0.7190766578747837, + "grad_norm": 0.65625, + "learning_rate": 0.00017301780273915144, + "loss": 4.3002, + "step": 6935 + }, + { + "epoch": 0.7191803459292718, + "grad_norm": 0.7109375, + "learning_rate": 0.00017301038057066222, + "loss": 4.3399, + "step": 6936 + }, + { + "epoch": 0.7192840339837598, + "grad_norm": 0.72265625, + "learning_rate": 0.00017300295754072768, + "loss": 4.2772, + "step": 6937 + }, + { + "epoch": 0.719387722038248, + "grad_norm": 0.70703125, + "learning_rate": 0.0001729955336494354, + "loss": 4.319, + "step": 6938 + }, + { + "epoch": 0.719491410092736, + "grad_norm": 0.76171875, + "learning_rate": 0.00017298810889687297, + "loss": 4.3156, + "step": 6939 + }, + { + "epoch": 0.7195950981472241, + "grad_norm": 0.81640625, + "learning_rate": 0.00017298068328312804, + "loss": 4.3124, + "step": 6940 + }, + { + "epoch": 0.7196987862017121, + "grad_norm": 0.7734375, + "learning_rate": 0.00017297325680828816, + "loss": 4.323, + "step": 6941 + }, + { + "epoch": 0.7198024742562003, + "grad_norm": 0.80859375, + "learning_rate": 0.000172965829472441, + "loss": 4.3011, + "step": 6942 + }, + { + "epoch": 0.7199061623106883, + "grad_norm": 0.7734375, + "learning_rate": 0.00017295840127567417, + "loss": 4.2752, + "step": 6943 + }, + { + "epoch": 0.7200098503651764, + "grad_norm": 0.7890625, + "learning_rate": 0.00017295097221807534, + "loss": 4.2404, + "step": 6944 + }, + { + "epoch": 0.7201135384196644, + "grad_norm": 0.77734375, + "learning_rate": 0.00017294354229973214, + "loss": 4.2955, + "step": 6945 + }, + { + "epoch": 0.7202172264741525, + "grad_norm": 0.79296875, + "learning_rate": 0.00017293611152073224, + "loss": 4.3117, + "step": 6946 + }, + { + "epoch": 0.7203209145286406, + "grad_norm": 0.734375, + "learning_rate": 0.00017292867988116334, + "loss": 4.307, + "step": 6947 + }, + { + "epoch": 0.7204246025831287, + "grad_norm": 0.74609375, + "learning_rate": 0.00017292124738111308, + "loss": 4.3032, + "step": 6948 + }, + { + "epoch": 0.7205282906376167, + "grad_norm": 0.83984375, + "learning_rate": 0.00017291381402066922, + "loss": 4.306, + "step": 6949 + }, + { + "epoch": 0.7206319786921048, + "grad_norm": 0.76171875, + "learning_rate": 0.0001729063797999194, + "loss": 4.2715, + "step": 6950 + }, + { + "epoch": 0.7207356667465928, + "grad_norm": 0.80078125, + "learning_rate": 0.0001728989447189514, + "loss": 4.3337, + "step": 6951 + }, + { + "epoch": 0.720839354801081, + "grad_norm": 0.84375, + "learning_rate": 0.00017289150877785282, + "loss": 4.2688, + "step": 6952 + }, + { + "epoch": 0.720943042855569, + "grad_norm": 0.7265625, + "learning_rate": 0.00017288407197671156, + "loss": 4.3162, + "step": 6953 + }, + { + "epoch": 0.7210467309100571, + "grad_norm": 0.77734375, + "learning_rate": 0.00017287663431561528, + "loss": 4.2812, + "step": 6954 + }, + { + "epoch": 0.7211504189645451, + "grad_norm": 0.73046875, + "learning_rate": 0.00017286919579465176, + "loss": 4.3023, + "step": 6955 + }, + { + "epoch": 0.7212541070190333, + "grad_norm": 0.734375, + "learning_rate": 0.00017286175641390873, + "loss": 4.2777, + "step": 6956 + }, + { + "epoch": 0.7213577950735213, + "grad_norm": 0.7421875, + "learning_rate": 0.000172854316173474, + "loss": 4.3601, + "step": 6957 + }, + { + "epoch": 0.7214614831280094, + "grad_norm": 0.828125, + "learning_rate": 0.0001728468750734354, + "loss": 4.3323, + "step": 6958 + }, + { + "epoch": 0.7215651711824974, + "grad_norm": 0.66796875, + "learning_rate": 0.0001728394331138806, + "loss": 4.2778, + "step": 6959 + }, + { + "epoch": 0.7216688592369855, + "grad_norm": 0.8046875, + "learning_rate": 0.00017283199029489752, + "loss": 4.3155, + "step": 6960 + }, + { + "epoch": 0.7217725472914736, + "grad_norm": 0.67578125, + "learning_rate": 0.00017282454661657391, + "loss": 4.2807, + "step": 6961 + }, + { + "epoch": 0.7218762353459617, + "grad_norm": 0.78515625, + "learning_rate": 0.00017281710207899765, + "loss": 4.3196, + "step": 6962 + }, + { + "epoch": 0.7219799234004497, + "grad_norm": 0.72265625, + "learning_rate": 0.0001728096566822566, + "loss": 4.3321, + "step": 6963 + }, + { + "epoch": 0.7220836114549378, + "grad_norm": 0.71875, + "learning_rate": 0.0001728022104264385, + "loss": 4.2515, + "step": 6964 + }, + { + "epoch": 0.7221872995094258, + "grad_norm": 0.81640625, + "learning_rate": 0.0001727947633116313, + "loss": 4.2605, + "step": 6965 + }, + { + "epoch": 0.722290987563914, + "grad_norm": 0.70703125, + "learning_rate": 0.00017278731533792283, + "loss": 4.3116, + "step": 6966 + }, + { + "epoch": 0.722394675618402, + "grad_norm": 0.953125, + "learning_rate": 0.000172779866505401, + "loss": 4.3079, + "step": 6967 + }, + { + "epoch": 0.7224983636728901, + "grad_norm": 0.8515625, + "learning_rate": 0.00017277241681415366, + "loss": 4.2762, + "step": 6968 + }, + { + "epoch": 0.7226020517273782, + "grad_norm": 0.765625, + "learning_rate": 0.00017276496626426874, + "loss": 4.2775, + "step": 6969 + }, + { + "epoch": 0.7227057397818663, + "grad_norm": 0.82421875, + "learning_rate": 0.0001727575148558341, + "loss": 4.3017, + "step": 6970 + }, + { + "epoch": 0.7228094278363544, + "grad_norm": 0.75390625, + "learning_rate": 0.00017275006258893775, + "loss": 4.2978, + "step": 6971 + }, + { + "epoch": 0.7229131158908424, + "grad_norm": 0.79296875, + "learning_rate": 0.0001727426094636675, + "loss": 4.2782, + "step": 6972 + }, + { + "epoch": 0.7230168039453305, + "grad_norm": 0.78125, + "learning_rate": 0.00017273515548011138, + "loss": 4.2956, + "step": 6973 + }, + { + "epoch": 0.7231204919998185, + "grad_norm": 0.77734375, + "learning_rate": 0.00017272770063835732, + "loss": 4.2611, + "step": 6974 + }, + { + "epoch": 0.7232241800543067, + "grad_norm": 0.71484375, + "learning_rate": 0.00017272024493849325, + "loss": 4.2621, + "step": 6975 + }, + { + "epoch": 0.7233278681087947, + "grad_norm": 0.8125, + "learning_rate": 0.00017271278838060719, + "loss": 4.3214, + "step": 6976 + }, + { + "epoch": 0.7234315561632828, + "grad_norm": 0.73828125, + "learning_rate": 0.00017270533096478704, + "loss": 4.3202, + "step": 6977 + }, + { + "epoch": 0.7235352442177708, + "grad_norm": 0.7734375, + "learning_rate": 0.00017269787269112084, + "loss": 4.2535, + "step": 6978 + }, + { + "epoch": 0.723638932272259, + "grad_norm": 0.78515625, + "learning_rate": 0.0001726904135596966, + "loss": 4.2887, + "step": 6979 + }, + { + "epoch": 0.723742620326747, + "grad_norm": 0.74609375, + "learning_rate": 0.00017268295357060235, + "loss": 4.3077, + "step": 6980 + }, + { + "epoch": 0.7238463083812351, + "grad_norm": 0.85546875, + "learning_rate": 0.00017267549272392607, + "loss": 4.289, + "step": 6981 + }, + { + "epoch": 0.7239499964357231, + "grad_norm": 0.70703125, + "learning_rate": 0.00017266803101975576, + "loss": 4.2946, + "step": 6982 + }, + { + "epoch": 0.7240536844902112, + "grad_norm": 0.9453125, + "learning_rate": 0.0001726605684581795, + "loss": 4.262, + "step": 6983 + }, + { + "epoch": 0.7241573725446993, + "grad_norm": 0.625, + "learning_rate": 0.0001726531050392854, + "loss": 4.3065, + "step": 6984 + }, + { + "epoch": 0.7242610605991874, + "grad_norm": 0.8671875, + "learning_rate": 0.00017264564076316136, + "loss": 4.2718, + "step": 6985 + }, + { + "epoch": 0.7243647486536754, + "grad_norm": 0.71484375, + "learning_rate": 0.00017263817562989563, + "loss": 4.326, + "step": 6986 + }, + { + "epoch": 0.7244684367081635, + "grad_norm": 0.8125, + "learning_rate": 0.0001726307096395762, + "loss": 4.2844, + "step": 6987 + }, + { + "epoch": 0.7245721247626515, + "grad_norm": 0.8203125, + "learning_rate": 0.00017262324279229113, + "loss": 4.3137, + "step": 6988 + }, + { + "epoch": 0.7246758128171397, + "grad_norm": 0.75390625, + "learning_rate": 0.0001726157750881286, + "loss": 4.2779, + "step": 6989 + }, + { + "epoch": 0.7247795008716277, + "grad_norm": 0.796875, + "learning_rate": 0.00017260830652717665, + "loss": 4.364, + "step": 6990 + }, + { + "epoch": 0.7248831889261158, + "grad_norm": 0.70703125, + "learning_rate": 0.00017260083710952343, + "loss": 4.298, + "step": 6991 + }, + { + "epoch": 0.7249868769806038, + "grad_norm": 0.84375, + "learning_rate": 0.0001725933668352571, + "loss": 4.3133, + "step": 6992 + }, + { + "epoch": 0.725090565035092, + "grad_norm": 0.72265625, + "learning_rate": 0.00017258589570446576, + "loss": 4.3327, + "step": 6993 + }, + { + "epoch": 0.72519425308958, + "grad_norm": 0.8046875, + "learning_rate": 0.0001725784237172376, + "loss": 4.3136, + "step": 6994 + }, + { + "epoch": 0.7252979411440681, + "grad_norm": 0.7734375, + "learning_rate": 0.00017257095087366074, + "loss": 4.3083, + "step": 6995 + }, + { + "epoch": 0.7254016291985561, + "grad_norm": 0.80859375, + "learning_rate": 0.00017256347717382338, + "loss": 4.2735, + "step": 6996 + }, + { + "epoch": 0.7255053172530442, + "grad_norm": 0.71484375, + "learning_rate": 0.0001725560026178137, + "loss": 4.3301, + "step": 6997 + }, + { + "epoch": 0.7256090053075323, + "grad_norm": 0.8125, + "learning_rate": 0.00017254852720571982, + "loss": 4.2935, + "step": 6998 + }, + { + "epoch": 0.7257126933620204, + "grad_norm": 0.734375, + "learning_rate": 0.00017254105093763008, + "loss": 4.2975, + "step": 6999 + }, + { + "epoch": 0.7258163814165084, + "grad_norm": 0.828125, + "learning_rate": 0.0001725335738136326, + "loss": 4.2722, + "step": 7000 + }, + { + "epoch": 0.7259200694709965, + "grad_norm": 0.70703125, + "learning_rate": 0.0001725260958338156, + "loss": 4.297, + "step": 7001 + }, + { + "epoch": 0.7260237575254845, + "grad_norm": 0.88671875, + "learning_rate": 0.00017251861699826737, + "loss": 4.2929, + "step": 7002 + }, + { + "epoch": 0.7261274455799727, + "grad_norm": 0.63671875, + "learning_rate": 0.00017251113730707608, + "loss": 4.3054, + "step": 7003 + }, + { + "epoch": 0.7262311336344607, + "grad_norm": 0.8359375, + "learning_rate": 0.00017250365676033002, + "loss": 4.2856, + "step": 7004 + }, + { + "epoch": 0.7263348216889488, + "grad_norm": 0.84375, + "learning_rate": 0.0001724961753581175, + "loss": 4.2623, + "step": 7005 + }, + { + "epoch": 0.7264385097434368, + "grad_norm": 0.7890625, + "learning_rate": 0.0001724886931005267, + "loss": 4.3196, + "step": 7006 + }, + { + "epoch": 0.726542197797925, + "grad_norm": 0.796875, + "learning_rate": 0.00017248120998764591, + "loss": 4.3348, + "step": 7007 + }, + { + "epoch": 0.726645885852413, + "grad_norm": 0.77734375, + "learning_rate": 0.0001724737260195635, + "loss": 4.2863, + "step": 7008 + }, + { + "epoch": 0.7267495739069011, + "grad_norm": 0.8203125, + "learning_rate": 0.00017246624119636773, + "loss": 4.2792, + "step": 7009 + }, + { + "epoch": 0.7268532619613891, + "grad_norm": 0.71875, + "learning_rate": 0.00017245875551814689, + "loss": 4.3015, + "step": 7010 + }, + { + "epoch": 0.7269569500158772, + "grad_norm": 0.80859375, + "learning_rate": 0.00017245126898498934, + "loss": 4.3235, + "step": 7011 + }, + { + "epoch": 0.7270606380703653, + "grad_norm": 0.74609375, + "learning_rate": 0.00017244378159698344, + "loss": 4.2926, + "step": 7012 + }, + { + "epoch": 0.7271643261248534, + "grad_norm": 0.81640625, + "learning_rate": 0.00017243629335421748, + "loss": 4.2849, + "step": 7013 + }, + { + "epoch": 0.7272680141793415, + "grad_norm": 0.83984375, + "learning_rate": 0.0001724288042567798, + "loss": 4.2794, + "step": 7014 + }, + { + "epoch": 0.7273717022338295, + "grad_norm": 0.86328125, + "learning_rate": 0.00017242131430475878, + "loss": 4.2771, + "step": 7015 + }, + { + "epoch": 0.7274753902883176, + "grad_norm": 0.80859375, + "learning_rate": 0.00017241382349824283, + "loss": 4.3026, + "step": 7016 + }, + { + "epoch": 0.7275790783428057, + "grad_norm": 0.765625, + "learning_rate": 0.00017240633183732032, + "loss": 4.3207, + "step": 7017 + }, + { + "epoch": 0.7276827663972938, + "grad_norm": 0.7578125, + "learning_rate": 0.0001723988393220796, + "loss": 4.3208, + "step": 7018 + }, + { + "epoch": 0.7277864544517818, + "grad_norm": 0.6953125, + "learning_rate": 0.0001723913459526091, + "loss": 4.3088, + "step": 7019 + }, + { + "epoch": 0.7278901425062699, + "grad_norm": 0.73828125, + "learning_rate": 0.00017238385172899727, + "loss": 4.257, + "step": 7020 + }, + { + "epoch": 0.727993830560758, + "grad_norm": 0.72265625, + "learning_rate": 0.00017237635665133248, + "loss": 4.2831, + "step": 7021 + }, + { + "epoch": 0.7280975186152461, + "grad_norm": 0.68359375, + "learning_rate": 0.0001723688607197032, + "loss": 4.2535, + "step": 7022 + }, + { + "epoch": 0.7282012066697341, + "grad_norm": 0.76953125, + "learning_rate": 0.00017236136393419783, + "loss": 4.3271, + "step": 7023 + }, + { + "epoch": 0.7283048947242222, + "grad_norm": 0.671875, + "learning_rate": 0.0001723538662949049, + "loss": 4.2639, + "step": 7024 + }, + { + "epoch": 0.7284085827787102, + "grad_norm": 0.7734375, + "learning_rate": 0.00017234636780191282, + "loss": 4.2348, + "step": 7025 + }, + { + "epoch": 0.7285122708331984, + "grad_norm": 0.71875, + "learning_rate": 0.00017233886845531004, + "loss": 4.3681, + "step": 7026 + }, + { + "epoch": 0.7286159588876864, + "grad_norm": 0.7734375, + "learning_rate": 0.00017233136825518512, + "loss": 4.2736, + "step": 7027 + }, + { + "epoch": 0.7287196469421745, + "grad_norm": 0.73828125, + "learning_rate": 0.00017232386720162648, + "loss": 4.3037, + "step": 7028 + }, + { + "epoch": 0.7288233349966625, + "grad_norm": 0.78125, + "learning_rate": 0.00017231636529472266, + "loss": 4.2867, + "step": 7029 + }, + { + "epoch": 0.7289270230511506, + "grad_norm": 0.765625, + "learning_rate": 0.00017230886253456217, + "loss": 4.2942, + "step": 7030 + }, + { + "epoch": 0.7290307111056387, + "grad_norm": 0.7890625, + "learning_rate": 0.00017230135892123358, + "loss": 4.3403, + "step": 7031 + }, + { + "epoch": 0.7291343991601268, + "grad_norm": 0.93359375, + "learning_rate": 0.00017229385445482532, + "loss": 4.3001, + "step": 7032 + }, + { + "epoch": 0.7292380872146148, + "grad_norm": 0.7109375, + "learning_rate": 0.00017228634913542604, + "loss": 4.3063, + "step": 7033 + }, + { + "epoch": 0.7293417752691029, + "grad_norm": 0.79296875, + "learning_rate": 0.0001722788429631242, + "loss": 4.3232, + "step": 7034 + }, + { + "epoch": 0.729445463323591, + "grad_norm": 0.7578125, + "learning_rate": 0.00017227133593800847, + "loss": 4.2879, + "step": 7035 + }, + { + "epoch": 0.7295491513780791, + "grad_norm": 0.76171875, + "learning_rate": 0.00017226382806016734, + "loss": 4.3101, + "step": 7036 + }, + { + "epoch": 0.7296528394325671, + "grad_norm": 0.7578125, + "learning_rate": 0.00017225631932968945, + "loss": 4.2876, + "step": 7037 + }, + { + "epoch": 0.7297565274870552, + "grad_norm": 0.78515625, + "learning_rate": 0.00017224880974666337, + "loss": 4.2636, + "step": 7038 + }, + { + "epoch": 0.7298602155415432, + "grad_norm": 0.78515625, + "learning_rate": 0.00017224129931117768, + "loss": 4.3189, + "step": 7039 + }, + { + "epoch": 0.7299639035960314, + "grad_norm": 0.78515625, + "learning_rate": 0.00017223378802332098, + "loss": 4.3109, + "step": 7040 + }, + { + "epoch": 0.7300675916505194, + "grad_norm": 0.78515625, + "learning_rate": 0.000172226275883182, + "loss": 4.2895, + "step": 7041 + }, + { + "epoch": 0.7301712797050075, + "grad_norm": 0.8046875, + "learning_rate": 0.0001722187628908493, + "loss": 4.2554, + "step": 7042 + }, + { + "epoch": 0.7302749677594955, + "grad_norm": 0.8671875, + "learning_rate": 0.00017221124904641153, + "loss": 4.3007, + "step": 7043 + }, + { + "epoch": 0.7303786558139836, + "grad_norm": 0.7578125, + "learning_rate": 0.00017220373434995737, + "loss": 4.2808, + "step": 7044 + }, + { + "epoch": 0.7304823438684717, + "grad_norm": 0.796875, + "learning_rate": 0.00017219621880157544, + "loss": 4.275, + "step": 7045 + }, + { + "epoch": 0.7305860319229598, + "grad_norm": 0.9140625, + "learning_rate": 0.00017218870240135446, + "loss": 4.3002, + "step": 7046 + }, + { + "epoch": 0.7306897199774478, + "grad_norm": 0.703125, + "learning_rate": 0.00017218118514938309, + "loss": 4.3309, + "step": 7047 + }, + { + "epoch": 0.7307934080319359, + "grad_norm": 0.8046875, + "learning_rate": 0.00017217366704575, + "loss": 4.3054, + "step": 7048 + }, + { + "epoch": 0.730897096086424, + "grad_norm": 0.734375, + "learning_rate": 0.00017216614809054402, + "loss": 4.311, + "step": 7049 + }, + { + "epoch": 0.7310007841409121, + "grad_norm": 0.69140625, + "learning_rate": 0.0001721586282838537, + "loss": 4.2963, + "step": 7050 + }, + { + "epoch": 0.7311044721954001, + "grad_norm": 0.765625, + "learning_rate": 0.00017215110762576786, + "loss": 4.289, + "step": 7051 + }, + { + "epoch": 0.7312081602498882, + "grad_norm": 0.6484375, + "learning_rate": 0.00017214358611637522, + "loss": 4.3274, + "step": 7052 + }, + { + "epoch": 0.7313118483043762, + "grad_norm": 0.75, + "learning_rate": 0.0001721360637557645, + "loss": 4.2937, + "step": 7053 + }, + { + "epoch": 0.7314155363588644, + "grad_norm": 0.7734375, + "learning_rate": 0.00017212854054402451, + "loss": 4.2802, + "step": 7054 + }, + { + "epoch": 0.7315192244133524, + "grad_norm": 0.71875, + "learning_rate": 0.00017212101648124398, + "loss": 4.301, + "step": 7055 + }, + { + "epoch": 0.7316229124678405, + "grad_norm": 0.72265625, + "learning_rate": 0.00017211349156751168, + "loss": 4.2797, + "step": 7056 + }, + { + "epoch": 0.7317266005223286, + "grad_norm": 0.6953125, + "learning_rate": 0.00017210596580291644, + "loss": 4.3053, + "step": 7057 + }, + { + "epoch": 0.7318302885768166, + "grad_norm": 0.7578125, + "learning_rate": 0.00017209843918754698, + "loss": 4.2942, + "step": 7058 + }, + { + "epoch": 0.7319339766313048, + "grad_norm": 0.6875, + "learning_rate": 0.00017209091172149215, + "loss": 4.3132, + "step": 7059 + }, + { + "epoch": 0.7320376646857928, + "grad_norm": 0.703125, + "learning_rate": 0.0001720833834048408, + "loss": 4.2949, + "step": 7060 + }, + { + "epoch": 0.7321413527402809, + "grad_norm": 0.76171875, + "learning_rate": 0.0001720758542376817, + "loss": 4.3509, + "step": 7061 + }, + { + "epoch": 0.7322450407947689, + "grad_norm": 0.6640625, + "learning_rate": 0.0001720683242201037, + "loss": 4.3095, + "step": 7062 + }, + { + "epoch": 0.7323487288492571, + "grad_norm": 0.765625, + "learning_rate": 0.00017206079335219568, + "loss": 4.2731, + "step": 7063 + }, + { + "epoch": 0.7324524169037451, + "grad_norm": 0.6953125, + "learning_rate": 0.00017205326163404646, + "loss": 4.3128, + "step": 7064 + }, + { + "epoch": 0.7325561049582332, + "grad_norm": 0.7734375, + "learning_rate": 0.0001720457290657449, + "loss": 4.3371, + "step": 7065 + }, + { + "epoch": 0.7326597930127212, + "grad_norm": 0.74609375, + "learning_rate": 0.0001720381956473799, + "loss": 4.3332, + "step": 7066 + }, + { + "epoch": 0.7327634810672093, + "grad_norm": 0.74609375, + "learning_rate": 0.00017203066137904034, + "loss": 4.2822, + "step": 7067 + }, + { + "epoch": 0.7328671691216974, + "grad_norm": 0.734375, + "learning_rate": 0.0001720231262608151, + "loss": 4.305, + "step": 7068 + }, + { + "epoch": 0.7329708571761855, + "grad_norm": 0.74609375, + "learning_rate": 0.00017201559029279313, + "loss": 4.3058, + "step": 7069 + }, + { + "epoch": 0.7330745452306735, + "grad_norm": 0.73046875, + "learning_rate": 0.0001720080534750633, + "loss": 4.3136, + "step": 7070 + }, + { + "epoch": 0.7331782332851616, + "grad_norm": 0.7109375, + "learning_rate": 0.00017200051580771458, + "loss": 4.2996, + "step": 7071 + }, + { + "epoch": 0.7332819213396496, + "grad_norm": 0.75390625, + "learning_rate": 0.00017199297729083584, + "loss": 4.2357, + "step": 7072 + }, + { + "epoch": 0.7333856093941378, + "grad_norm": 0.71484375, + "learning_rate": 0.00017198543792451608, + "loss": 4.2651, + "step": 7073 + }, + { + "epoch": 0.7334892974486258, + "grad_norm": 0.828125, + "learning_rate": 0.00017197789770884424, + "loss": 4.2955, + "step": 7074 + }, + { + "epoch": 0.7335929855031139, + "grad_norm": 0.76953125, + "learning_rate": 0.00017197035664390936, + "loss": 4.308, + "step": 7075 + }, + { + "epoch": 0.7336966735576019, + "grad_norm": 0.78515625, + "learning_rate": 0.00017196281472980026, + "loss": 4.2533, + "step": 7076 + }, + { + "epoch": 0.7338003616120901, + "grad_norm": 0.7265625, + "learning_rate": 0.00017195527196660603, + "loss": 4.3313, + "step": 7077 + }, + { + "epoch": 0.7339040496665781, + "grad_norm": 0.80078125, + "learning_rate": 0.0001719477283544157, + "loss": 4.2987, + "step": 7078 + }, + { + "epoch": 0.7340077377210662, + "grad_norm": 0.75390625, + "learning_rate": 0.00017194018389331817, + "loss": 4.3059, + "step": 7079 + }, + { + "epoch": 0.7341114257755542, + "grad_norm": 0.8671875, + "learning_rate": 0.00017193263858340254, + "loss": 4.2652, + "step": 7080 + }, + { + "epoch": 0.7342151138300423, + "grad_norm": 0.66796875, + "learning_rate": 0.00017192509242475783, + "loss": 4.3037, + "step": 7081 + }, + { + "epoch": 0.7343188018845304, + "grad_norm": 0.796875, + "learning_rate": 0.00017191754541747301, + "loss": 4.2889, + "step": 7082 + }, + { + "epoch": 0.7344224899390185, + "grad_norm": 0.7265625, + "learning_rate": 0.00017190999756163723, + "loss": 4.3051, + "step": 7083 + }, + { + "epoch": 0.7345261779935065, + "grad_norm": 0.765625, + "learning_rate": 0.00017190244885733946, + "loss": 4.3293, + "step": 7084 + }, + { + "epoch": 0.7346298660479946, + "grad_norm": 0.796875, + "learning_rate": 0.00017189489930466878, + "loss": 4.3515, + "step": 7085 + }, + { + "epoch": 0.7347335541024826, + "grad_norm": 0.75, + "learning_rate": 0.0001718873489037143, + "loss": 4.309, + "step": 7086 + }, + { + "epoch": 0.7348372421569708, + "grad_norm": 0.859375, + "learning_rate": 0.00017187979765456512, + "loss": 4.2999, + "step": 7087 + }, + { + "epoch": 0.7349409302114588, + "grad_norm": 0.71875, + "learning_rate": 0.0001718722455573103, + "loss": 4.2773, + "step": 7088 + }, + { + "epoch": 0.7350446182659469, + "grad_norm": 0.73828125, + "learning_rate": 0.0001718646926120389, + "loss": 4.3381, + "step": 7089 + }, + { + "epoch": 0.7351483063204349, + "grad_norm": 0.73828125, + "learning_rate": 0.00017185713881884014, + "loss": 4.2982, + "step": 7090 + }, + { + "epoch": 0.7352519943749231, + "grad_norm": 0.74609375, + "learning_rate": 0.00017184958417780307, + "loss": 4.272, + "step": 7091 + }, + { + "epoch": 0.7353556824294111, + "grad_norm": 0.7265625, + "learning_rate": 0.00017184202868901691, + "loss": 4.3178, + "step": 7092 + }, + { + "epoch": 0.7354593704838992, + "grad_norm": 0.828125, + "learning_rate": 0.0001718344723525707, + "loss": 4.3005, + "step": 7093 + }, + { + "epoch": 0.7355630585383872, + "grad_norm": 0.6953125, + "learning_rate": 0.00017182691516855368, + "loss": 4.3045, + "step": 7094 + }, + { + "epoch": 0.7356667465928753, + "grad_norm": 0.72265625, + "learning_rate": 0.00017181935713705496, + "loss": 4.3003, + "step": 7095 + }, + { + "epoch": 0.7357704346473634, + "grad_norm": 0.68359375, + "learning_rate": 0.00017181179825816374, + "loss": 4.3082, + "step": 7096 + }, + { + "epoch": 0.7358741227018515, + "grad_norm": 0.65625, + "learning_rate": 0.00017180423853196923, + "loss": 4.3297, + "step": 7097 + }, + { + "epoch": 0.7359778107563395, + "grad_norm": 0.68359375, + "learning_rate": 0.0001717966779585606, + "loss": 4.2416, + "step": 7098 + }, + { + "epoch": 0.7360814988108276, + "grad_norm": 0.71484375, + "learning_rate": 0.00017178911653802705, + "loss": 4.2868, + "step": 7099 + }, + { + "epoch": 0.7361851868653156, + "grad_norm": 0.66015625, + "learning_rate": 0.00017178155427045782, + "loss": 4.3129, + "step": 7100 + }, + { + "epoch": 0.7362888749198038, + "grad_norm": 0.73828125, + "learning_rate": 0.00017177399115594213, + "loss": 4.3162, + "step": 7101 + }, + { + "epoch": 0.7363925629742919, + "grad_norm": 0.6953125, + "learning_rate": 0.0001717664271945692, + "loss": 4.2818, + "step": 7102 + }, + { + "epoch": 0.7364962510287799, + "grad_norm": 0.81640625, + "learning_rate": 0.00017175886238642832, + "loss": 4.3238, + "step": 7103 + }, + { + "epoch": 0.736599939083268, + "grad_norm": 0.703125, + "learning_rate": 0.00017175129673160865, + "loss": 4.2752, + "step": 7104 + }, + { + "epoch": 0.7367036271377561, + "grad_norm": 0.76171875, + "learning_rate": 0.00017174373023019958, + "loss": 4.3085, + "step": 7105 + }, + { + "epoch": 0.7368073151922442, + "grad_norm": 0.78515625, + "learning_rate": 0.0001717361628822903, + "loss": 4.2717, + "step": 7106 + }, + { + "epoch": 0.7369110032467322, + "grad_norm": 0.6953125, + "learning_rate": 0.00017172859468797015, + "loss": 4.3096, + "step": 7107 + }, + { + "epoch": 0.7370146913012203, + "grad_norm": 0.8671875, + "learning_rate": 0.0001717210256473284, + "loss": 4.3106, + "step": 7108 + }, + { + "epoch": 0.7371183793557083, + "grad_norm": 0.703125, + "learning_rate": 0.00017171345576045437, + "loss": 4.3068, + "step": 7109 + }, + { + "epoch": 0.7372220674101965, + "grad_norm": 0.890625, + "learning_rate": 0.00017170588502743735, + "loss": 4.2665, + "step": 7110 + }, + { + "epoch": 0.7373257554646845, + "grad_norm": 0.69921875, + "learning_rate": 0.00017169831344836668, + "loss": 4.3333, + "step": 7111 + }, + { + "epoch": 0.7374294435191726, + "grad_norm": 0.79296875, + "learning_rate": 0.0001716907410233317, + "loss": 4.2849, + "step": 7112 + }, + { + "epoch": 0.7375331315736606, + "grad_norm": 0.71875, + "learning_rate": 0.00017168316775242174, + "loss": 4.2733, + "step": 7113 + }, + { + "epoch": 0.7376368196281488, + "grad_norm": 0.74609375, + "learning_rate": 0.0001716755936357262, + "loss": 4.2969, + "step": 7114 + }, + { + "epoch": 0.7377405076826368, + "grad_norm": 0.8125, + "learning_rate": 0.00017166801867333443, + "loss": 4.3362, + "step": 7115 + }, + { + "epoch": 0.7378441957371249, + "grad_norm": 0.75390625, + "learning_rate": 0.00017166044286533576, + "loss": 4.2884, + "step": 7116 + }, + { + "epoch": 0.7379478837916129, + "grad_norm": 0.75, + "learning_rate": 0.00017165286621181961, + "loss": 4.2443, + "step": 7117 + }, + { + "epoch": 0.738051571846101, + "grad_norm": 0.734375, + "learning_rate": 0.0001716452887128754, + "loss": 4.3151, + "step": 7118 + }, + { + "epoch": 0.7381552599005891, + "grad_norm": 0.671875, + "learning_rate": 0.00017163771036859252, + "loss": 4.3142, + "step": 7119 + }, + { + "epoch": 0.7382589479550772, + "grad_norm": 0.7109375, + "learning_rate": 0.0001716301311790604, + "loss": 4.2677, + "step": 7120 + }, + { + "epoch": 0.7383626360095652, + "grad_norm": 0.66796875, + "learning_rate": 0.00017162255114436842, + "loss": 4.2915, + "step": 7121 + }, + { + "epoch": 0.7384663240640533, + "grad_norm": 0.6953125, + "learning_rate": 0.00017161497026460605, + "loss": 4.2741, + "step": 7122 + }, + { + "epoch": 0.7385700121185413, + "grad_norm": 0.66796875, + "learning_rate": 0.00017160738853986272, + "loss": 4.2836, + "step": 7123 + }, + { + "epoch": 0.7386737001730295, + "grad_norm": 0.6640625, + "learning_rate": 0.0001715998059702279, + "loss": 4.2986, + "step": 7124 + }, + { + "epoch": 0.7387773882275175, + "grad_norm": 0.68359375, + "learning_rate": 0.00017159222255579105, + "loss": 4.3167, + "step": 7125 + }, + { + "epoch": 0.7388810762820056, + "grad_norm": 0.6953125, + "learning_rate": 0.00017158463829664169, + "loss": 4.3071, + "step": 7126 + }, + { + "epoch": 0.7389847643364936, + "grad_norm": 0.66015625, + "learning_rate": 0.00017157705319286923, + "loss": 4.2997, + "step": 7127 + }, + { + "epoch": 0.7390884523909818, + "grad_norm": 0.734375, + "learning_rate": 0.00017156946724456321, + "loss": 4.3084, + "step": 7128 + }, + { + "epoch": 0.7391921404454698, + "grad_norm": 0.65625, + "learning_rate": 0.00017156188045181313, + "loss": 4.3035, + "step": 7129 + }, + { + "epoch": 0.7392958284999579, + "grad_norm": 0.74609375, + "learning_rate": 0.00017155429281470852, + "loss": 4.3053, + "step": 7130 + }, + { + "epoch": 0.7393995165544459, + "grad_norm": 0.625, + "learning_rate": 0.00017154670433333887, + "loss": 4.2988, + "step": 7131 + }, + { + "epoch": 0.739503204608934, + "grad_norm": 0.69921875, + "learning_rate": 0.00017153911500779377, + "loss": 4.2948, + "step": 7132 + }, + { + "epoch": 0.739606892663422, + "grad_norm": 0.7109375, + "learning_rate": 0.00017153152483816267, + "loss": 4.2857, + "step": 7133 + }, + { + "epoch": 0.7397105807179102, + "grad_norm": 0.73828125, + "learning_rate": 0.00017152393382453523, + "loss": 4.2967, + "step": 7134 + }, + { + "epoch": 0.7398142687723982, + "grad_norm": 0.78515625, + "learning_rate": 0.00017151634196700097, + "loss": 4.3033, + "step": 7135 + }, + { + "epoch": 0.7399179568268863, + "grad_norm": 0.70703125, + "learning_rate": 0.00017150874926564948, + "loss": 4.2786, + "step": 7136 + }, + { + "epoch": 0.7400216448813743, + "grad_norm": 0.73828125, + "learning_rate": 0.00017150115572057032, + "loss": 4.2795, + "step": 7137 + }, + { + "epoch": 0.7401253329358625, + "grad_norm": 0.74609375, + "learning_rate": 0.00017149356133185312, + "loss": 4.2819, + "step": 7138 + }, + { + "epoch": 0.7402290209903505, + "grad_norm": 0.796875, + "learning_rate": 0.00017148596609958746, + "loss": 4.2671, + "step": 7139 + }, + { + "epoch": 0.7403327090448386, + "grad_norm": 0.7109375, + "learning_rate": 0.00017147837002386295, + "loss": 4.2576, + "step": 7140 + }, + { + "epoch": 0.7404363970993266, + "grad_norm": 0.8125, + "learning_rate": 0.00017147077310476923, + "loss": 4.2497, + "step": 7141 + }, + { + "epoch": 0.7405400851538148, + "grad_norm": 0.7265625, + "learning_rate": 0.00017146317534239597, + "loss": 4.3075, + "step": 7142 + }, + { + "epoch": 0.7406437732083028, + "grad_norm": 0.75390625, + "learning_rate": 0.00017145557673683273, + "loss": 4.3005, + "step": 7143 + }, + { + "epoch": 0.7407474612627909, + "grad_norm": 0.7421875, + "learning_rate": 0.00017144797728816928, + "loss": 4.3209, + "step": 7144 + }, + { + "epoch": 0.7408511493172789, + "grad_norm": 0.80859375, + "learning_rate": 0.0001714403769964952, + "loss": 4.284, + "step": 7145 + }, + { + "epoch": 0.740954837371767, + "grad_norm": 0.6328125, + "learning_rate": 0.00017143277586190015, + "loss": 4.3253, + "step": 7146 + }, + { + "epoch": 0.7410585254262552, + "grad_norm": 0.7890625, + "learning_rate": 0.00017142517388447388, + "loss": 4.3014, + "step": 7147 + }, + { + "epoch": 0.7411622134807432, + "grad_norm": 0.67578125, + "learning_rate": 0.00017141757106430605, + "loss": 4.2871, + "step": 7148 + }, + { + "epoch": 0.7412659015352313, + "grad_norm": 0.7421875, + "learning_rate": 0.00017140996740148636, + "loss": 4.2668, + "step": 7149 + }, + { + "epoch": 0.7413695895897193, + "grad_norm": 0.65625, + "learning_rate": 0.00017140236289610457, + "loss": 4.3317, + "step": 7150 + }, + { + "epoch": 0.7414732776442075, + "grad_norm": 0.7734375, + "learning_rate": 0.00017139475754825037, + "loss": 4.279, + "step": 7151 + }, + { + "epoch": 0.7415769656986955, + "grad_norm": 0.67578125, + "learning_rate": 0.00017138715135801347, + "loss": 4.2713, + "step": 7152 + }, + { + "epoch": 0.7416806537531836, + "grad_norm": 0.796875, + "learning_rate": 0.00017137954432548365, + "loss": 4.271, + "step": 7153 + }, + { + "epoch": 0.7417843418076716, + "grad_norm": 0.72265625, + "learning_rate": 0.00017137193645075068, + "loss": 4.3096, + "step": 7154 + }, + { + "epoch": 0.7418880298621597, + "grad_norm": 0.765625, + "learning_rate": 0.00017136432773390427, + "loss": 4.3234, + "step": 7155 + }, + { + "epoch": 0.7419917179166478, + "grad_norm": 0.796875, + "learning_rate": 0.00017135671817503426, + "loss": 4.3376, + "step": 7156 + }, + { + "epoch": 0.7420954059711359, + "grad_norm": 0.76171875, + "learning_rate": 0.0001713491077742304, + "loss": 4.2886, + "step": 7157 + }, + { + "epoch": 0.7421990940256239, + "grad_norm": 0.77734375, + "learning_rate": 0.0001713414965315825, + "loss": 4.299, + "step": 7158 + }, + { + "epoch": 0.742302782080112, + "grad_norm": 0.7421875, + "learning_rate": 0.0001713338844471803, + "loss": 4.2673, + "step": 7159 + }, + { + "epoch": 0.7424064701346, + "grad_norm": 0.75390625, + "learning_rate": 0.00017132627152111372, + "loss": 4.297, + "step": 7160 + }, + { + "epoch": 0.7425101581890882, + "grad_norm": 0.71875, + "learning_rate": 0.00017131865775347249, + "loss": 4.2811, + "step": 7161 + }, + { + "epoch": 0.7426138462435762, + "grad_norm": 0.75, + "learning_rate": 0.00017131104314434652, + "loss": 4.2929, + "step": 7162 + }, + { + "epoch": 0.7427175342980643, + "grad_norm": 0.8046875, + "learning_rate": 0.00017130342769382562, + "loss": 4.3183, + "step": 7163 + }, + { + "epoch": 0.7428212223525523, + "grad_norm": 0.6875, + "learning_rate": 0.00017129581140199962, + "loss": 4.2843, + "step": 7164 + }, + { + "epoch": 0.7429249104070405, + "grad_norm": 0.76953125, + "learning_rate": 0.00017128819426895841, + "loss": 4.288, + "step": 7165 + }, + { + "epoch": 0.7430285984615285, + "grad_norm": 0.71875, + "learning_rate": 0.00017128057629479186, + "loss": 4.2776, + "step": 7166 + }, + { + "epoch": 0.7431322865160166, + "grad_norm": 0.72265625, + "learning_rate": 0.0001712729574795899, + "loss": 4.2689, + "step": 7167 + }, + { + "epoch": 0.7432359745705046, + "grad_norm": 0.8203125, + "learning_rate": 0.00017126533782344235, + "loss": 4.3065, + "step": 7168 + }, + { + "epoch": 0.7433396626249927, + "grad_norm": 0.69921875, + "learning_rate": 0.00017125771732643915, + "loss": 4.3338, + "step": 7169 + }, + { + "epoch": 0.7434433506794808, + "grad_norm": 0.8125, + "learning_rate": 0.00017125009598867018, + "loss": 4.2874, + "step": 7170 + }, + { + "epoch": 0.7435470387339689, + "grad_norm": 0.79296875, + "learning_rate": 0.0001712424738102254, + "loss": 4.2767, + "step": 7171 + }, + { + "epoch": 0.7436507267884569, + "grad_norm": 0.80859375, + "learning_rate": 0.00017123485079119477, + "loss": 4.3092, + "step": 7172 + }, + { + "epoch": 0.743754414842945, + "grad_norm": 0.73046875, + "learning_rate": 0.00017122722693166815, + "loss": 4.2933, + "step": 7173 + }, + { + "epoch": 0.743858102897433, + "grad_norm": 0.80859375, + "learning_rate": 0.00017121960223173558, + "loss": 4.2672, + "step": 7174 + }, + { + "epoch": 0.7439617909519212, + "grad_norm": 0.80859375, + "learning_rate": 0.000171211976691487, + "loss": 4.2863, + "step": 7175 + }, + { + "epoch": 0.7440654790064092, + "grad_norm": 0.7734375, + "learning_rate": 0.00017120435031101232, + "loss": 4.2946, + "step": 7176 + }, + { + "epoch": 0.7441691670608973, + "grad_norm": 0.78125, + "learning_rate": 0.0001711967230904016, + "loss": 4.2853, + "step": 7177 + }, + { + "epoch": 0.7442728551153853, + "grad_norm": 0.74609375, + "learning_rate": 0.00017118909502974482, + "loss": 4.3053, + "step": 7178 + }, + { + "epoch": 0.7443765431698735, + "grad_norm": 0.8828125, + "learning_rate": 0.00017118146612913192, + "loss": 4.251, + "step": 7179 + }, + { + "epoch": 0.7444802312243615, + "grad_norm": 0.8046875, + "learning_rate": 0.00017117383638865302, + "loss": 4.2856, + "step": 7180 + }, + { + "epoch": 0.7445839192788496, + "grad_norm": 0.8046875, + "learning_rate": 0.00017116620580839804, + "loss": 4.3232, + "step": 7181 + }, + { + "epoch": 0.7446876073333376, + "grad_norm": 0.83984375, + "learning_rate": 0.00017115857438845708, + "loss": 4.2889, + "step": 7182 + }, + { + "epoch": 0.7447912953878257, + "grad_norm": 0.72265625, + "learning_rate": 0.00017115094212892017, + "loss": 4.2871, + "step": 7183 + }, + { + "epoch": 0.7448949834423138, + "grad_norm": 0.79296875, + "learning_rate": 0.00017114330902987733, + "loss": 4.2325, + "step": 7184 + }, + { + "epoch": 0.7449986714968019, + "grad_norm": 0.80859375, + "learning_rate": 0.00017113567509141863, + "loss": 4.2844, + "step": 7185 + }, + { + "epoch": 0.7451023595512899, + "grad_norm": 0.75, + "learning_rate": 0.0001711280403136342, + "loss": 4.2512, + "step": 7186 + }, + { + "epoch": 0.745206047605778, + "grad_norm": 0.90625, + "learning_rate": 0.00017112040469661409, + "loss": 4.3283, + "step": 7187 + }, + { + "epoch": 0.745309735660266, + "grad_norm": 0.703125, + "learning_rate": 0.00017111276824044832, + "loss": 4.2993, + "step": 7188 + }, + { + "epoch": 0.7454134237147542, + "grad_norm": 0.9140625, + "learning_rate": 0.00017110513094522708, + "loss": 4.2461, + "step": 7189 + }, + { + "epoch": 0.7455171117692422, + "grad_norm": 0.71484375, + "learning_rate": 0.00017109749281104048, + "loss": 4.3055, + "step": 7190 + }, + { + "epoch": 0.7456207998237303, + "grad_norm": 0.890625, + "learning_rate": 0.0001710898538379786, + "loss": 4.2779, + "step": 7191 + }, + { + "epoch": 0.7457244878782184, + "grad_norm": 0.80078125, + "learning_rate": 0.0001710822140261316, + "loss": 4.2817, + "step": 7192 + }, + { + "epoch": 0.7458281759327064, + "grad_norm": 0.81640625, + "learning_rate": 0.00017107457337558958, + "loss": 4.3003, + "step": 7193 + }, + { + "epoch": 0.7459318639871946, + "grad_norm": 0.78125, + "learning_rate": 0.00017106693188644276, + "loss": 4.2541, + "step": 7194 + }, + { + "epoch": 0.7460355520416826, + "grad_norm": 0.75, + "learning_rate": 0.00017105928955878127, + "loss": 4.2842, + "step": 7195 + }, + { + "epoch": 0.7461392400961707, + "grad_norm": 0.859375, + "learning_rate": 0.00017105164639269526, + "loss": 4.2931, + "step": 7196 + }, + { + "epoch": 0.7462429281506587, + "grad_norm": 0.79296875, + "learning_rate": 0.0001710440023882749, + "loss": 4.3405, + "step": 7197 + }, + { + "epoch": 0.7463466162051469, + "grad_norm": 0.890625, + "learning_rate": 0.00017103635754561045, + "loss": 4.3018, + "step": 7198 + }, + { + "epoch": 0.7464503042596349, + "grad_norm": 0.80078125, + "learning_rate": 0.00017102871186479206, + "loss": 4.3131, + "step": 7199 + }, + { + "epoch": 0.746553992314123, + "grad_norm": 0.77734375, + "learning_rate": 0.00017102106534590993, + "loss": 4.3459, + "step": 7200 + }, + { + "epoch": 0.746657680368611, + "grad_norm": 0.734375, + "learning_rate": 0.00017101341798905434, + "loss": 4.3024, + "step": 7201 + }, + { + "epoch": 0.7467613684230991, + "grad_norm": 0.75390625, + "learning_rate": 0.00017100576979431543, + "loss": 4.2866, + "step": 7202 + }, + { + "epoch": 0.7468650564775872, + "grad_norm": 0.7421875, + "learning_rate": 0.00017099812076178355, + "loss": 4.295, + "step": 7203 + }, + { + "epoch": 0.7469687445320753, + "grad_norm": 0.69921875, + "learning_rate": 0.00017099047089154886, + "loss": 4.2849, + "step": 7204 + }, + { + "epoch": 0.7470724325865633, + "grad_norm": 0.8203125, + "learning_rate": 0.00017098282018370163, + "loss": 4.3026, + "step": 7205 + }, + { + "epoch": 0.7471761206410514, + "grad_norm": 0.7109375, + "learning_rate": 0.00017097516863833222, + "loss": 4.2857, + "step": 7206 + }, + { + "epoch": 0.7472798086955394, + "grad_norm": 0.84375, + "learning_rate": 0.0001709675162555308, + "loss": 4.2794, + "step": 7207 + }, + { + "epoch": 0.7473834967500276, + "grad_norm": 0.76953125, + "learning_rate": 0.0001709598630353877, + "loss": 4.2753, + "step": 7208 + }, + { + "epoch": 0.7474871848045156, + "grad_norm": 0.8046875, + "learning_rate": 0.00017095220897799323, + "loss": 4.2956, + "step": 7209 + }, + { + "epoch": 0.7475908728590037, + "grad_norm": 0.875, + "learning_rate": 0.00017094455408343768, + "loss": 4.2803, + "step": 7210 + }, + { + "epoch": 0.7476945609134917, + "grad_norm": 0.76953125, + "learning_rate": 0.0001709368983518114, + "loss": 4.2632, + "step": 7211 + }, + { + "epoch": 0.7477982489679799, + "grad_norm": 0.7890625, + "learning_rate": 0.0001709292417832047, + "loss": 4.3143, + "step": 7212 + }, + { + "epoch": 0.7479019370224679, + "grad_norm": 0.79296875, + "learning_rate": 0.00017092158437770794, + "loss": 4.2838, + "step": 7213 + }, + { + "epoch": 0.748005625076956, + "grad_norm": 0.8046875, + "learning_rate": 0.00017091392613541144, + "loss": 4.284, + "step": 7214 + }, + { + "epoch": 0.748109313131444, + "grad_norm": 0.73046875, + "learning_rate": 0.00017090626705640557, + "loss": 4.2753, + "step": 7215 + }, + { + "epoch": 0.7482130011859321, + "grad_norm": 0.7421875, + "learning_rate": 0.0001708986071407807, + "loss": 4.3097, + "step": 7216 + }, + { + "epoch": 0.7483166892404202, + "grad_norm": 0.703125, + "learning_rate": 0.0001708909463886272, + "loss": 4.2965, + "step": 7217 + }, + { + "epoch": 0.7484203772949083, + "grad_norm": 0.8203125, + "learning_rate": 0.00017088328480003545, + "loss": 4.2942, + "step": 7218 + }, + { + "epoch": 0.7485240653493963, + "grad_norm": 0.63671875, + "learning_rate": 0.00017087562237509592, + "loss": 4.3179, + "step": 7219 + }, + { + "epoch": 0.7486277534038844, + "grad_norm": 0.8125, + "learning_rate": 0.00017086795911389895, + "loss": 4.289, + "step": 7220 + }, + { + "epoch": 0.7487314414583724, + "grad_norm": 0.7265625, + "learning_rate": 0.00017086029501653496, + "loss": 4.2506, + "step": 7221 + }, + { + "epoch": 0.7488351295128606, + "grad_norm": 0.9140625, + "learning_rate": 0.00017085263008309438, + "loss": 4.3148, + "step": 7222 + }, + { + "epoch": 0.7489388175673486, + "grad_norm": 0.80859375, + "learning_rate": 0.00017084496431366767, + "loss": 4.2945, + "step": 7223 + }, + { + "epoch": 0.7490425056218367, + "grad_norm": 0.921875, + "learning_rate": 0.00017083729770834527, + "loss": 4.3027, + "step": 7224 + }, + { + "epoch": 0.7491461936763247, + "grad_norm": 0.87890625, + "learning_rate": 0.00017082963026721762, + "loss": 4.2845, + "step": 7225 + }, + { + "epoch": 0.7492498817308129, + "grad_norm": 0.86328125, + "learning_rate": 0.0001708219619903752, + "loss": 4.3067, + "step": 7226 + }, + { + "epoch": 0.7493535697853009, + "grad_norm": 0.921875, + "learning_rate": 0.00017081429287790854, + "loss": 4.3316, + "step": 7227 + }, + { + "epoch": 0.749457257839789, + "grad_norm": 0.80859375, + "learning_rate": 0.00017080662292990803, + "loss": 4.309, + "step": 7228 + }, + { + "epoch": 0.749560945894277, + "grad_norm": 0.9765625, + "learning_rate": 0.0001707989521464642, + "loss": 4.2783, + "step": 7229 + }, + { + "epoch": 0.7496646339487651, + "grad_norm": 0.86328125, + "learning_rate": 0.00017079128052766764, + "loss": 4.2817, + "step": 7230 + }, + { + "epoch": 0.7497683220032532, + "grad_norm": 0.82421875, + "learning_rate": 0.00017078360807360875, + "loss": 4.2768, + "step": 7231 + }, + { + "epoch": 0.7498720100577413, + "grad_norm": 0.9453125, + "learning_rate": 0.00017077593478437812, + "loss": 4.2631, + "step": 7232 + }, + { + "epoch": 0.7499756981122293, + "grad_norm": 0.80078125, + "learning_rate": 0.00017076826066006627, + "loss": 4.2689, + "step": 7233 + }, + { + "epoch": 0.7499756981122293, + "eval_loss": 4.309548377990723, + "eval_runtime": 0.441, + "eval_samples_per_second": 337.873, + "eval_steps_per_second": 15.873, + "step": 7233 + }, + { + "epoch": 0.7500793861667174, + "grad_norm": 0.890625, + "learning_rate": 0.00017076058570076374, + "loss": 4.2999, + "step": 7234 + }, + { + "epoch": 0.7501830742212054, + "grad_norm": 0.96484375, + "learning_rate": 0.0001707529099065611, + "loss": 4.2887, + "step": 7235 + }, + { + "epoch": 0.7502867622756936, + "grad_norm": 0.84375, + "learning_rate": 0.00017074523327754895, + "loss": 4.2617, + "step": 7236 + }, + { + "epoch": 0.7503904503301817, + "grad_norm": 0.9140625, + "learning_rate": 0.00017073755581381778, + "loss": 4.3132, + "step": 7237 + }, + { + "epoch": 0.7504941383846697, + "grad_norm": 0.8671875, + "learning_rate": 0.00017072987751545827, + "loss": 4.2769, + "step": 7238 + }, + { + "epoch": 0.7505978264391578, + "grad_norm": 0.8125, + "learning_rate": 0.00017072219838256092, + "loss": 4.3216, + "step": 7239 + }, + { + "epoch": 0.7507015144936459, + "grad_norm": 0.97265625, + "learning_rate": 0.0001707145184152164, + "loss": 4.3209, + "step": 7240 + }, + { + "epoch": 0.750805202548134, + "grad_norm": 0.81640625, + "learning_rate": 0.00017070683761351532, + "loss": 4.3313, + "step": 7241 + }, + { + "epoch": 0.750908890602622, + "grad_norm": 0.79296875, + "learning_rate": 0.00017069915597754827, + "loss": 4.2942, + "step": 7242 + }, + { + "epoch": 0.7510125786571101, + "grad_norm": 0.984375, + "learning_rate": 0.00017069147350740594, + "loss": 4.3034, + "step": 7243 + }, + { + "epoch": 0.7511162667115981, + "grad_norm": 0.84375, + "learning_rate": 0.00017068379020317894, + "loss": 4.3169, + "step": 7244 + }, + { + "epoch": 0.7512199547660863, + "grad_norm": 0.74609375, + "learning_rate": 0.00017067610606495795, + "loss": 4.3012, + "step": 7245 + }, + { + "epoch": 0.7513236428205743, + "grad_norm": 0.9296875, + "learning_rate": 0.00017066842109283358, + "loss": 4.3494, + "step": 7246 + }, + { + "epoch": 0.7514273308750624, + "grad_norm": 0.765625, + "learning_rate": 0.00017066073528689658, + "loss": 4.3039, + "step": 7247 + }, + { + "epoch": 0.7515310189295504, + "grad_norm": 0.94921875, + "learning_rate": 0.00017065304864723757, + "loss": 4.3302, + "step": 7248 + }, + { + "epoch": 0.7516347069840386, + "grad_norm": 0.9453125, + "learning_rate": 0.00017064536117394728, + "loss": 4.2775, + "step": 7249 + }, + { + "epoch": 0.7517383950385266, + "grad_norm": 0.796875, + "learning_rate": 0.0001706376728671164, + "loss": 4.2653, + "step": 7250 + }, + { + "epoch": 0.7518420830930147, + "grad_norm": 1.1015625, + "learning_rate": 0.00017062998372683566, + "loss": 4.3025, + "step": 7251 + }, + { + "epoch": 0.7519457711475027, + "grad_norm": 0.73828125, + "learning_rate": 0.00017062229375319573, + "loss": 4.2539, + "step": 7252 + }, + { + "epoch": 0.7520494592019908, + "grad_norm": 1.2578125, + "learning_rate": 0.00017061460294628744, + "loss": 4.2528, + "step": 7253 + }, + { + "epoch": 0.7521531472564789, + "grad_norm": 0.8359375, + "learning_rate": 0.00017060691130620144, + "loss": 4.318, + "step": 7254 + }, + { + "epoch": 0.752256835310967, + "grad_norm": 1.3125, + "learning_rate": 0.00017059921883302853, + "loss": 4.3094, + "step": 7255 + }, + { + "epoch": 0.752360523365455, + "grad_norm": 1.0390625, + "learning_rate": 0.0001705915255268595, + "loss": 4.3801, + "step": 7256 + }, + { + "epoch": 0.7524642114199431, + "grad_norm": 1.828125, + "learning_rate": 0.00017058383138778504, + "loss": 4.319, + "step": 7257 + }, + { + "epoch": 0.7525678994744311, + "grad_norm": 1.6328125, + "learning_rate": 0.00017057613641589597, + "loss": 4.2533, + "step": 7258 + }, + { + "epoch": 0.7526715875289193, + "grad_norm": 1.5, + "learning_rate": 0.00017056844061128312, + "loss": 4.2892, + "step": 7259 + }, + { + "epoch": 0.7527752755834073, + "grad_norm": 1.484375, + "learning_rate": 0.00017056074397403726, + "loss": 4.2962, + "step": 7260 + }, + { + "epoch": 0.7528789636378954, + "grad_norm": 1.203125, + "learning_rate": 0.00017055304650424923, + "loss": 4.3135, + "step": 7261 + }, + { + "epoch": 0.7529826516923834, + "grad_norm": 1.25, + "learning_rate": 0.00017054534820200982, + "loss": 4.2944, + "step": 7262 + }, + { + "epoch": 0.7530863397468716, + "grad_norm": 1.3671875, + "learning_rate": 0.0001705376490674098, + "loss": 4.2739, + "step": 7263 + }, + { + "epoch": 0.7531900278013596, + "grad_norm": 1.21875, + "learning_rate": 0.00017052994910054018, + "loss": 4.2843, + "step": 7264 + }, + { + "epoch": 0.7532937158558477, + "grad_norm": 1.5703125, + "learning_rate": 0.00017052224830149166, + "loss": 4.3169, + "step": 7265 + }, + { + "epoch": 0.7533974039103357, + "grad_norm": 1.2890625, + "learning_rate": 0.00017051454667035517, + "loss": 4.2898, + "step": 7266 + }, + { + "epoch": 0.7535010919648238, + "grad_norm": 1.6171875, + "learning_rate": 0.00017050684420722155, + "loss": 4.244, + "step": 7267 + }, + { + "epoch": 0.7536047800193119, + "grad_norm": 1.4375, + "learning_rate": 0.00017049914091218175, + "loss": 4.3089, + "step": 7268 + }, + { + "epoch": 0.7537084680738, + "grad_norm": 1.7265625, + "learning_rate": 0.00017049143678532654, + "loss": 4.284, + "step": 7269 + }, + { + "epoch": 0.753812156128288, + "grad_norm": 1.6796875, + "learning_rate": 0.00017048373182674694, + "loss": 4.3093, + "step": 7270 + }, + { + "epoch": 0.7539158441827761, + "grad_norm": 1.28125, + "learning_rate": 0.0001704760260365338, + "loss": 4.2962, + "step": 7271 + }, + { + "epoch": 0.7540195322372641, + "grad_norm": 1.2578125, + "learning_rate": 0.00017046831941477803, + "loss": 4.3126, + "step": 7272 + }, + { + "epoch": 0.7541232202917523, + "grad_norm": 1.5, + "learning_rate": 0.0001704606119615706, + "loss": 4.2415, + "step": 7273 + }, + { + "epoch": 0.7542269083462403, + "grad_norm": 1.3984375, + "learning_rate": 0.0001704529036770024, + "loss": 4.3483, + "step": 7274 + }, + { + "epoch": 0.7543305964007284, + "grad_norm": 1.578125, + "learning_rate": 0.00017044519456116443, + "loss": 4.3222, + "step": 7275 + }, + { + "epoch": 0.7544342844552164, + "grad_norm": 1.40625, + "learning_rate": 0.00017043748461414762, + "loss": 4.2517, + "step": 7276 + }, + { + "epoch": 0.7545379725097046, + "grad_norm": 1.4921875, + "learning_rate": 0.00017042977383604295, + "loss": 4.3102, + "step": 7277 + }, + { + "epoch": 0.7546416605641926, + "grad_norm": 1.3515625, + "learning_rate": 0.0001704220622269414, + "loss": 4.3405, + "step": 7278 + }, + { + "epoch": 0.7547453486186807, + "grad_norm": 1.5703125, + "learning_rate": 0.00017041434978693393, + "loss": 4.3233, + "step": 7279 + }, + { + "epoch": 0.7548490366731688, + "grad_norm": 1.4921875, + "learning_rate": 0.00017040663651611158, + "loss": 4.2985, + "step": 7280 + }, + { + "epoch": 0.7549527247276568, + "grad_norm": 1.390625, + "learning_rate": 0.00017039892241456537, + "loss": 4.2694, + "step": 7281 + }, + { + "epoch": 0.755056412782145, + "grad_norm": 1.2734375, + "learning_rate": 0.00017039120748238627, + "loss": 4.3193, + "step": 7282 + }, + { + "epoch": 0.755160100836633, + "grad_norm": 1.5859375, + "learning_rate": 0.0001703834917196653, + "loss": 4.2847, + "step": 7283 + }, + { + "epoch": 0.7552637888911211, + "grad_norm": 1.484375, + "learning_rate": 0.00017037577512649357, + "loss": 4.2836, + "step": 7284 + }, + { + "epoch": 0.7553674769456091, + "grad_norm": 1.28125, + "learning_rate": 0.00017036805770296207, + "loss": 4.2952, + "step": 7285 + }, + { + "epoch": 0.7554711650000973, + "grad_norm": 1.2890625, + "learning_rate": 0.00017036033944916188, + "loss": 4.3062, + "step": 7286 + }, + { + "epoch": 0.7555748530545853, + "grad_norm": 1.3671875, + "learning_rate": 0.00017035262036518403, + "loss": 4.2864, + "step": 7287 + }, + { + "epoch": 0.7556785411090734, + "grad_norm": 1.28125, + "learning_rate": 0.00017034490045111963, + "loss": 4.2936, + "step": 7288 + }, + { + "epoch": 0.7557822291635614, + "grad_norm": 1.484375, + "learning_rate": 0.0001703371797070598, + "loss": 4.265, + "step": 7289 + }, + { + "epoch": 0.7558859172180495, + "grad_norm": 1.3828125, + "learning_rate": 0.00017032945813309555, + "loss": 4.3142, + "step": 7290 + }, + { + "epoch": 0.7559896052725376, + "grad_norm": 1.234375, + "learning_rate": 0.00017032173572931807, + "loss": 4.2957, + "step": 7291 + }, + { + "epoch": 0.7560932933270257, + "grad_norm": 1.1484375, + "learning_rate": 0.00017031401249581842, + "loss": 4.3144, + "step": 7292 + }, + { + "epoch": 0.7561969813815137, + "grad_norm": 1.359375, + "learning_rate": 0.00017030628843268776, + "loss": 4.2702, + "step": 7293 + }, + { + "epoch": 0.7563006694360018, + "grad_norm": 1.2890625, + "learning_rate": 0.0001702985635400172, + "loss": 4.3049, + "step": 7294 + }, + { + "epoch": 0.7564043574904898, + "grad_norm": 1.3828125, + "learning_rate": 0.00017029083781789793, + "loss": 4.278, + "step": 7295 + }, + { + "epoch": 0.756508045544978, + "grad_norm": 1.3203125, + "learning_rate": 0.00017028311126642107, + "loss": 4.2731, + "step": 7296 + }, + { + "epoch": 0.756611733599466, + "grad_norm": 1.21875, + "learning_rate": 0.00017027538388567778, + "loss": 4.3164, + "step": 7297 + }, + { + "epoch": 0.7567154216539541, + "grad_norm": 1.1015625, + "learning_rate": 0.00017026765567575923, + "loss": 4.3086, + "step": 7298 + }, + { + "epoch": 0.7568191097084421, + "grad_norm": 1.4140625, + "learning_rate": 0.0001702599266367566, + "loss": 4.3135, + "step": 7299 + }, + { + "epoch": 0.7569227977629303, + "grad_norm": 1.25, + "learning_rate": 0.00017025219676876114, + "loss": 4.2603, + "step": 7300 + }, + { + "epoch": 0.7570264858174183, + "grad_norm": 1.5859375, + "learning_rate": 0.00017024446607186402, + "loss": 4.2973, + "step": 7301 + }, + { + "epoch": 0.7571301738719064, + "grad_norm": 1.375, + "learning_rate": 0.00017023673454615645, + "loss": 4.3145, + "step": 7302 + }, + { + "epoch": 0.7572338619263944, + "grad_norm": 1.4140625, + "learning_rate": 0.00017022900219172964, + "loss": 4.2712, + "step": 7303 + }, + { + "epoch": 0.7573375499808825, + "grad_norm": 1.40625, + "learning_rate": 0.00017022126900867484, + "loss": 4.313, + "step": 7304 + }, + { + "epoch": 0.7574412380353706, + "grad_norm": 1.328125, + "learning_rate": 0.00017021353499708331, + "loss": 4.3111, + "step": 7305 + }, + { + "epoch": 0.7575449260898587, + "grad_norm": 1.2265625, + "learning_rate": 0.00017020580015704627, + "loss": 4.3204, + "step": 7306 + }, + { + "epoch": 0.7576486141443467, + "grad_norm": 1.4765625, + "learning_rate": 0.00017019806448865502, + "loss": 4.3063, + "step": 7307 + }, + { + "epoch": 0.7577523021988348, + "grad_norm": 1.28125, + "learning_rate": 0.0001701903279920008, + "loss": 4.3014, + "step": 7308 + }, + { + "epoch": 0.7578559902533228, + "grad_norm": 1.796875, + "learning_rate": 0.00017018259066717487, + "loss": 4.2751, + "step": 7309 + }, + { + "epoch": 0.757959678307811, + "grad_norm": 1.6484375, + "learning_rate": 0.0001701748525142686, + "loss": 4.2672, + "step": 7310 + }, + { + "epoch": 0.758063366362299, + "grad_norm": 1.203125, + "learning_rate": 0.00017016711353337325, + "loss": 4.2977, + "step": 7311 + }, + { + "epoch": 0.7581670544167871, + "grad_norm": 1.2109375, + "learning_rate": 0.00017015937372458009, + "loss": 4.2664, + "step": 7312 + }, + { + "epoch": 0.7582707424712751, + "grad_norm": 1.25, + "learning_rate": 0.0001701516330879805, + "loss": 4.3269, + "step": 7313 + }, + { + "epoch": 0.7583744305257633, + "grad_norm": 1.09375, + "learning_rate": 0.0001701438916236658, + "loss": 4.2875, + "step": 7314 + }, + { + "epoch": 0.7584781185802513, + "grad_norm": 1.6015625, + "learning_rate": 0.00017013614933172733, + "loss": 4.3359, + "step": 7315 + }, + { + "epoch": 0.7585818066347394, + "grad_norm": 1.53125, + "learning_rate": 0.00017012840621225643, + "loss": 4.3346, + "step": 7316 + }, + { + "epoch": 0.7586854946892274, + "grad_norm": 1.4453125, + "learning_rate": 0.00017012066226534446, + "loss": 4.321, + "step": 7317 + }, + { + "epoch": 0.7587891827437155, + "grad_norm": 1.359375, + "learning_rate": 0.00017011291749108282, + "loss": 4.2992, + "step": 7318 + }, + { + "epoch": 0.7588928707982036, + "grad_norm": 1.28125, + "learning_rate": 0.00017010517188956284, + "loss": 4.2377, + "step": 7319 + }, + { + "epoch": 0.7589965588526917, + "grad_norm": 1.2265625, + "learning_rate": 0.00017009742546087594, + "loss": 4.2905, + "step": 7320 + }, + { + "epoch": 0.7591002469071797, + "grad_norm": 1.328125, + "learning_rate": 0.00017008967820511352, + "loss": 4.273, + "step": 7321 + }, + { + "epoch": 0.7592039349616678, + "grad_norm": 1.2421875, + "learning_rate": 0.000170081930122367, + "loss": 4.2929, + "step": 7322 + }, + { + "epoch": 0.7593076230161558, + "grad_norm": 1.4453125, + "learning_rate": 0.00017007418121272775, + "loss": 4.3323, + "step": 7323 + }, + { + "epoch": 0.759411311070644, + "grad_norm": 1.3515625, + "learning_rate": 0.00017006643147628726, + "loss": 4.3453, + "step": 7324 + }, + { + "epoch": 0.7595149991251321, + "grad_norm": 1.296875, + "learning_rate": 0.00017005868091313694, + "loss": 4.3426, + "step": 7325 + }, + { + "epoch": 0.7596186871796201, + "grad_norm": 1.1953125, + "learning_rate": 0.00017005092952336823, + "loss": 4.2514, + "step": 7326 + }, + { + "epoch": 0.7597223752341082, + "grad_norm": 1.4609375, + "learning_rate": 0.00017004317730707263, + "loss": 4.2899, + "step": 7327 + }, + { + "epoch": 0.7598260632885963, + "grad_norm": 1.375, + "learning_rate": 0.0001700354242643415, + "loss": 4.2561, + "step": 7328 + }, + { + "epoch": 0.7599297513430844, + "grad_norm": 1.46875, + "learning_rate": 0.00017002767039526646, + "loss": 4.3167, + "step": 7329 + }, + { + "epoch": 0.7600334393975724, + "grad_norm": 1.4140625, + "learning_rate": 0.0001700199156999389, + "loss": 4.3333, + "step": 7330 + }, + { + "epoch": 0.7601371274520605, + "grad_norm": 1.2734375, + "learning_rate": 0.00017001216017845038, + "loss": 4.2794, + "step": 7331 + }, + { + "epoch": 0.7602408155065485, + "grad_norm": 1.15625, + "learning_rate": 0.00017000440383089239, + "loss": 4.2827, + "step": 7332 + }, + { + "epoch": 0.7603445035610367, + "grad_norm": 1.5703125, + "learning_rate": 0.00016999664665735638, + "loss": 4.2929, + "step": 7333 + }, + { + "epoch": 0.7604481916155247, + "grad_norm": 1.3203125, + "learning_rate": 0.00016998888865793396, + "loss": 4.3017, + "step": 7334 + }, + { + "epoch": 0.7605518796700128, + "grad_norm": 1.515625, + "learning_rate": 0.0001699811298327166, + "loss": 4.2976, + "step": 7335 + }, + { + "epoch": 0.7606555677245008, + "grad_norm": 1.3984375, + "learning_rate": 0.0001699733701817959, + "loss": 4.2537, + "step": 7336 + }, + { + "epoch": 0.760759255778989, + "grad_norm": 1.2890625, + "learning_rate": 0.0001699656097052634, + "loss": 4.3015, + "step": 7337 + }, + { + "epoch": 0.760862943833477, + "grad_norm": 1.1953125, + "learning_rate": 0.0001699578484032107, + "loss": 4.2592, + "step": 7338 + }, + { + "epoch": 0.7609666318879651, + "grad_norm": 1.3671875, + "learning_rate": 0.00016995008627572933, + "loss": 4.3026, + "step": 7339 + }, + { + "epoch": 0.7610703199424531, + "grad_norm": 1.140625, + "learning_rate": 0.00016994232332291084, + "loss": 4.2917, + "step": 7340 + }, + { + "epoch": 0.7611740079969412, + "grad_norm": 1.6015625, + "learning_rate": 0.00016993455954484687, + "loss": 4.2922, + "step": 7341 + }, + { + "epoch": 0.7612776960514293, + "grad_norm": 1.5078125, + "learning_rate": 0.00016992679494162903, + "loss": 4.2461, + "step": 7342 + }, + { + "epoch": 0.7613813841059174, + "grad_norm": 1.4296875, + "learning_rate": 0.00016991902951334894, + "loss": 4.3031, + "step": 7343 + }, + { + "epoch": 0.7614850721604054, + "grad_norm": 1.3046875, + "learning_rate": 0.00016991126326009818, + "loss": 4.2936, + "step": 7344 + }, + { + "epoch": 0.7615887602148935, + "grad_norm": 1.3046875, + "learning_rate": 0.00016990349618196845, + "loss": 4.2804, + "step": 7345 + }, + { + "epoch": 0.7616924482693815, + "grad_norm": 1.2734375, + "learning_rate": 0.00016989572827905134, + "loss": 4.2966, + "step": 7346 + }, + { + "epoch": 0.7617961363238697, + "grad_norm": 1.4765625, + "learning_rate": 0.00016988795955143852, + "loss": 4.275, + "step": 7347 + }, + { + "epoch": 0.7618998243783577, + "grad_norm": 1.2890625, + "learning_rate": 0.00016988018999922167, + "loss": 4.256, + "step": 7348 + }, + { + "epoch": 0.7620035124328458, + "grad_norm": 1.4140625, + "learning_rate": 0.0001698724196224924, + "loss": 4.3285, + "step": 7349 + }, + { + "epoch": 0.7621072004873338, + "grad_norm": 1.28125, + "learning_rate": 0.00016986464842134246, + "loss": 4.3024, + "step": 7350 + }, + { + "epoch": 0.762210888541822, + "grad_norm": 1.2890625, + "learning_rate": 0.00016985687639586354, + "loss": 4.3106, + "step": 7351 + }, + { + "epoch": 0.76231457659631, + "grad_norm": 1.296875, + "learning_rate": 0.00016984910354614732, + "loss": 4.3155, + "step": 7352 + }, + { + "epoch": 0.7624182646507981, + "grad_norm": 1.3359375, + "learning_rate": 0.00016984132987228547, + "loss": 4.3108, + "step": 7353 + }, + { + "epoch": 0.7625219527052861, + "grad_norm": 1.1953125, + "learning_rate": 0.00016983355537436977, + "loss": 4.3093, + "step": 7354 + }, + { + "epoch": 0.7626256407597742, + "grad_norm": 1.3203125, + "learning_rate": 0.00016982578005249197, + "loss": 4.3132, + "step": 7355 + }, + { + "epoch": 0.7627293288142623, + "grad_norm": 1.234375, + "learning_rate": 0.0001698180039067437, + "loss": 4.2788, + "step": 7356 + }, + { + "epoch": 0.7628330168687504, + "grad_norm": 1.4140625, + "learning_rate": 0.00016981022693721688, + "loss": 4.299, + "step": 7357 + }, + { + "epoch": 0.7629367049232384, + "grad_norm": 1.28125, + "learning_rate": 0.0001698024491440031, + "loss": 4.2963, + "step": 7358 + }, + { + "epoch": 0.7630403929777265, + "grad_norm": 1.1953125, + "learning_rate": 0.00016979467052719423, + "loss": 4.2889, + "step": 7359 + }, + { + "epoch": 0.7631440810322145, + "grad_norm": 1.109375, + "learning_rate": 0.00016978689108688202, + "loss": 4.2818, + "step": 7360 + }, + { + "epoch": 0.7632477690867027, + "grad_norm": 1.328125, + "learning_rate": 0.00016977911082315827, + "loss": 4.2491, + "step": 7361 + }, + { + "epoch": 0.7633514571411907, + "grad_norm": 1.1171875, + "learning_rate": 0.00016977132973611475, + "loss": 4.2968, + "step": 7362 + }, + { + "epoch": 0.7634551451956788, + "grad_norm": 1.3671875, + "learning_rate": 0.00016976354782584333, + "loss": 4.3111, + "step": 7363 + }, + { + "epoch": 0.7635588332501668, + "grad_norm": 1.2265625, + "learning_rate": 0.00016975576509243578, + "loss": 4.296, + "step": 7364 + }, + { + "epoch": 0.763662521304655, + "grad_norm": 1.2578125, + "learning_rate": 0.00016974798153598393, + "loss": 4.289, + "step": 7365 + }, + { + "epoch": 0.763766209359143, + "grad_norm": 1.171875, + "learning_rate": 0.0001697401971565796, + "loss": 4.3073, + "step": 7366 + }, + { + "epoch": 0.7638698974136311, + "grad_norm": 1.3359375, + "learning_rate": 0.00016973241195431468, + "loss": 4.3217, + "step": 7367 + }, + { + "epoch": 0.7639735854681191, + "grad_norm": 1.203125, + "learning_rate": 0.000169724625929281, + "loss": 4.3015, + "step": 7368 + }, + { + "epoch": 0.7640772735226072, + "grad_norm": 1.421875, + "learning_rate": 0.00016971683908157046, + "loss": 4.2788, + "step": 7369 + }, + { + "epoch": 0.7641809615770954, + "grad_norm": 1.3359375, + "learning_rate": 0.0001697090514112749, + "loss": 4.2878, + "step": 7370 + }, + { + "epoch": 0.7642846496315834, + "grad_norm": 1.1875, + "learning_rate": 0.0001697012629184862, + "loss": 4.2664, + "step": 7371 + }, + { + "epoch": 0.7643883376860715, + "grad_norm": 1.078125, + "learning_rate": 0.0001696934736032963, + "loss": 4.3435, + "step": 7372 + }, + { + "epoch": 0.7644920257405595, + "grad_norm": 1.328125, + "learning_rate": 0.00016968568346579707, + "loss": 4.2861, + "step": 7373 + }, + { + "epoch": 0.7645957137950476, + "grad_norm": 1.1953125, + "learning_rate": 0.00016967789250608046, + "loss": 4.2991, + "step": 7374 + }, + { + "epoch": 0.7646994018495357, + "grad_norm": 1.3984375, + "learning_rate": 0.00016967010072423835, + "loss": 4.2715, + "step": 7375 + }, + { + "epoch": 0.7648030899040238, + "grad_norm": 1.359375, + "learning_rate": 0.00016966230812036267, + "loss": 4.3264, + "step": 7376 + }, + { + "epoch": 0.7649067779585118, + "grad_norm": 1.265625, + "learning_rate": 0.00016965451469454546, + "loss": 4.2816, + "step": 7377 + }, + { + "epoch": 0.7650104660129999, + "grad_norm": 1.125, + "learning_rate": 0.00016964672044687853, + "loss": 4.2958, + "step": 7378 + }, + { + "epoch": 0.765114154067488, + "grad_norm": 1.1328125, + "learning_rate": 0.00016963892537745395, + "loss": 4.2681, + "step": 7379 + }, + { + "epoch": 0.7652178421219761, + "grad_norm": 1.078125, + "learning_rate": 0.0001696311294863637, + "loss": 4.2735, + "step": 7380 + }, + { + "epoch": 0.7653215301764641, + "grad_norm": 1.328125, + "learning_rate": 0.00016962333277369967, + "loss": 4.2949, + "step": 7381 + }, + { + "epoch": 0.7654252182309522, + "grad_norm": 1.25, + "learning_rate": 0.00016961553523955393, + "loss": 4.2902, + "step": 7382 + }, + { + "epoch": 0.7655289062854402, + "grad_norm": 1.28125, + "learning_rate": 0.00016960773688401846, + "loss": 4.2752, + "step": 7383 + }, + { + "epoch": 0.7656325943399284, + "grad_norm": 1.2734375, + "learning_rate": 0.00016959993770718524, + "loss": 4.3044, + "step": 7384 + }, + { + "epoch": 0.7657362823944164, + "grad_norm": 1.125, + "learning_rate": 0.00016959213770914637, + "loss": 4.3009, + "step": 7385 + }, + { + "epoch": 0.7658399704489045, + "grad_norm": 1.0390625, + "learning_rate": 0.00016958433688999381, + "loss": 4.2998, + "step": 7386 + }, + { + "epoch": 0.7659436585033925, + "grad_norm": 1.3515625, + "learning_rate": 0.00016957653524981968, + "loss": 4.2673, + "step": 7387 + }, + { + "epoch": 0.7660473465578806, + "grad_norm": 1.2578125, + "learning_rate": 0.00016956873278871592, + "loss": 4.3262, + "step": 7388 + }, + { + "epoch": 0.7661510346123687, + "grad_norm": 1.5546875, + "learning_rate": 0.0001695609295067747, + "loss": 4.3042, + "step": 7389 + }, + { + "epoch": 0.7662547226668568, + "grad_norm": 1.4453125, + "learning_rate": 0.000169553125404088, + "loss": 4.2929, + "step": 7390 + }, + { + "epoch": 0.7663584107213448, + "grad_norm": 1.2890625, + "learning_rate": 0.00016954532048074793, + "loss": 4.287, + "step": 7391 + }, + { + "epoch": 0.7664620987758329, + "grad_norm": 1.2578125, + "learning_rate": 0.00016953751473684664, + "loss": 4.276, + "step": 7392 + }, + { + "epoch": 0.766565786830321, + "grad_norm": 1.25, + "learning_rate": 0.00016952970817247614, + "loss": 4.2847, + "step": 7393 + }, + { + "epoch": 0.7666694748848091, + "grad_norm": 1.21875, + "learning_rate": 0.0001695219007877286, + "loss": 4.2912, + "step": 7394 + }, + { + "epoch": 0.7667731629392971, + "grad_norm": 1.4453125, + "learning_rate": 0.00016951409258269612, + "loss": 4.2992, + "step": 7395 + }, + { + "epoch": 0.7668768509937852, + "grad_norm": 1.2890625, + "learning_rate": 0.0001695062835574708, + "loss": 4.3023, + "step": 7396 + }, + { + "epoch": 0.7669805390482732, + "grad_norm": 1.2890625, + "learning_rate": 0.00016949847371214484, + "loss": 4.2982, + "step": 7397 + }, + { + "epoch": 0.7670842271027614, + "grad_norm": 1.21875, + "learning_rate": 0.00016949066304681033, + "loss": 4.2983, + "step": 7398 + }, + { + "epoch": 0.7671879151572494, + "grad_norm": 1.15625, + "learning_rate": 0.00016948285156155945, + "loss": 4.2468, + "step": 7399 + }, + { + "epoch": 0.7672916032117375, + "grad_norm": 0.98046875, + "learning_rate": 0.0001694750392564844, + "loss": 4.3253, + "step": 7400 + }, + { + "epoch": 0.7673952912662255, + "grad_norm": 1.2890625, + "learning_rate": 0.0001694672261316773, + "loss": 4.3069, + "step": 7401 + }, + { + "epoch": 0.7674989793207136, + "grad_norm": 1.1484375, + "learning_rate": 0.00016945941218723037, + "loss": 4.3027, + "step": 7402 + }, + { + "epoch": 0.7676026673752017, + "grad_norm": 1.53125, + "learning_rate": 0.0001694515974232358, + "loss": 4.3055, + "step": 7403 + }, + { + "epoch": 0.7677063554296898, + "grad_norm": 1.4140625, + "learning_rate": 0.00016944378183978578, + "loss": 4.3045, + "step": 7404 + }, + { + "epoch": 0.7678100434841778, + "grad_norm": 1.2734375, + "learning_rate": 0.00016943596543697256, + "loss": 4.3241, + "step": 7405 + }, + { + "epoch": 0.7679137315386659, + "grad_norm": 1.2109375, + "learning_rate": 0.00016942814821488834, + "loss": 4.2945, + "step": 7406 + }, + { + "epoch": 0.768017419593154, + "grad_norm": 1.375, + "learning_rate": 0.00016942033017362533, + "loss": 4.2977, + "step": 7407 + }, + { + "epoch": 0.7681211076476421, + "grad_norm": 1.1484375, + "learning_rate": 0.00016941251131327581, + "loss": 4.2918, + "step": 7408 + }, + { + "epoch": 0.7682247957021301, + "grad_norm": 1.46875, + "learning_rate": 0.00016940469163393207, + "loss": 4.3023, + "step": 7409 + }, + { + "epoch": 0.7683284837566182, + "grad_norm": 1.328125, + "learning_rate": 0.0001693968711356863, + "loss": 4.2654, + "step": 7410 + }, + { + "epoch": 0.7684321718111062, + "grad_norm": 1.3515625, + "learning_rate": 0.0001693890498186308, + "loss": 4.3465, + "step": 7411 + }, + { + "epoch": 0.7685358598655944, + "grad_norm": 1.2109375, + "learning_rate": 0.00016938122768285786, + "loss": 4.2624, + "step": 7412 + }, + { + "epoch": 0.7686395479200824, + "grad_norm": 1.3828125, + "learning_rate": 0.0001693734047284598, + "loss": 4.3084, + "step": 7413 + }, + { + "epoch": 0.7687432359745705, + "grad_norm": 1.3046875, + "learning_rate": 0.00016936558095552887, + "loss": 4.3405, + "step": 7414 + }, + { + "epoch": 0.7688469240290586, + "grad_norm": 1.40625, + "learning_rate": 0.00016935775636415742, + "loss": 4.2755, + "step": 7415 + }, + { + "epoch": 0.7689506120835466, + "grad_norm": 1.3359375, + "learning_rate": 0.00016934993095443776, + "loss": 4.3349, + "step": 7416 + }, + { + "epoch": 0.7690543001380348, + "grad_norm": 1.203125, + "learning_rate": 0.0001693421047264622, + "loss": 4.2602, + "step": 7417 + }, + { + "epoch": 0.7691579881925228, + "grad_norm": 1.1484375, + "learning_rate": 0.00016933427768032314, + "loss": 4.2741, + "step": 7418 + }, + { + "epoch": 0.7692616762470109, + "grad_norm": 1.4375, + "learning_rate": 0.00016932644981611288, + "loss": 4.312, + "step": 7419 + }, + { + "epoch": 0.7693653643014989, + "grad_norm": 1.25, + "learning_rate": 0.0001693186211339238, + "loss": 4.2978, + "step": 7420 + }, + { + "epoch": 0.7694690523559871, + "grad_norm": 1.4609375, + "learning_rate": 0.00016931079163384823, + "loss": 4.2716, + "step": 7421 + }, + { + "epoch": 0.7695727404104751, + "grad_norm": 1.4453125, + "learning_rate": 0.00016930296131597862, + "loss": 4.3112, + "step": 7422 + }, + { + "epoch": 0.7696764284649632, + "grad_norm": 1.2109375, + "learning_rate": 0.00016929513018040728, + "loss": 4.2339, + "step": 7423 + }, + { + "epoch": 0.7697801165194512, + "grad_norm": 1.140625, + "learning_rate": 0.00016928729822722667, + "loss": 4.2802, + "step": 7424 + }, + { + "epoch": 0.7698838045739393, + "grad_norm": 1.203125, + "learning_rate": 0.00016927946545652918, + "loss": 4.2905, + "step": 7425 + }, + { + "epoch": 0.7699874926284274, + "grad_norm": 1.109375, + "learning_rate": 0.00016927163186840726, + "loss": 4.2913, + "step": 7426 + }, + { + "epoch": 0.7700911806829155, + "grad_norm": 1.46875, + "learning_rate": 0.0001692637974629533, + "loss": 4.2782, + "step": 7427 + }, + { + "epoch": 0.7701948687374035, + "grad_norm": 1.421875, + "learning_rate": 0.0001692559622402597, + "loss": 4.2655, + "step": 7428 + }, + { + "epoch": 0.7702985567918916, + "grad_norm": 1.203125, + "learning_rate": 0.000169248126200419, + "loss": 4.2924, + "step": 7429 + }, + { + "epoch": 0.7704022448463796, + "grad_norm": 1.1875, + "learning_rate": 0.0001692402893435236, + "loss": 4.307, + "step": 7430 + }, + { + "epoch": 0.7705059329008678, + "grad_norm": 1.2890625, + "learning_rate": 0.00016923245166966595, + "loss": 4.2712, + "step": 7431 + }, + { + "epoch": 0.7706096209553558, + "grad_norm": 1.0703125, + "learning_rate": 0.00016922461317893855, + "loss": 4.3062, + "step": 7432 + }, + { + "epoch": 0.7707133090098439, + "grad_norm": 1.5078125, + "learning_rate": 0.00016921677387143392, + "loss": 4.2883, + "step": 7433 + }, + { + "epoch": 0.7708169970643319, + "grad_norm": 1.2421875, + "learning_rate": 0.00016920893374724455, + "loss": 4.2437, + "step": 7434 + }, + { + "epoch": 0.7709206851188201, + "grad_norm": 1.4609375, + "learning_rate": 0.00016920109280646285, + "loss": 4.2196, + "step": 7435 + }, + { + "epoch": 0.7710243731733081, + "grad_norm": 1.3828125, + "learning_rate": 0.00016919325104918143, + "loss": 4.3101, + "step": 7436 + }, + { + "epoch": 0.7711280612277962, + "grad_norm": 1.3828125, + "learning_rate": 0.00016918540847549277, + "loss": 4.3135, + "step": 7437 + }, + { + "epoch": 0.7712317492822842, + "grad_norm": 1.3125, + "learning_rate": 0.00016917756508548948, + "loss": 4.3182, + "step": 7438 + }, + { + "epoch": 0.7713354373367723, + "grad_norm": 1.4453125, + "learning_rate": 0.000169169720879264, + "loss": 4.2481, + "step": 7439 + }, + { + "epoch": 0.7714391253912604, + "grad_norm": 1.3671875, + "learning_rate": 0.00016916187585690895, + "loss": 4.3098, + "step": 7440 + }, + { + "epoch": 0.7715428134457485, + "grad_norm": 1.4375, + "learning_rate": 0.00016915403001851683, + "loss": 4.2693, + "step": 7441 + }, + { + "epoch": 0.7716465015002365, + "grad_norm": 1.3203125, + "learning_rate": 0.00016914618336418032, + "loss": 4.2724, + "step": 7442 + }, + { + "epoch": 0.7717501895547246, + "grad_norm": 1.3984375, + "learning_rate": 0.0001691383358939919, + "loss": 4.3085, + "step": 7443 + }, + { + "epoch": 0.7718538776092126, + "grad_norm": 1.3828125, + "learning_rate": 0.00016913048760804417, + "loss": 4.2871, + "step": 7444 + }, + { + "epoch": 0.7719575656637008, + "grad_norm": 1.234375, + "learning_rate": 0.00016912263850642981, + "loss": 4.3041, + "step": 7445 + }, + { + "epoch": 0.7720612537181888, + "grad_norm": 1.15625, + "learning_rate": 0.00016911478858924137, + "loss": 4.2777, + "step": 7446 + }, + { + "epoch": 0.7721649417726769, + "grad_norm": 1.1875, + "learning_rate": 0.00016910693785657146, + "loss": 4.2965, + "step": 7447 + }, + { + "epoch": 0.7722686298271649, + "grad_norm": 1.125, + "learning_rate": 0.00016909908630851277, + "loss": 4.3142, + "step": 7448 + }, + { + "epoch": 0.7723723178816531, + "grad_norm": 1.453125, + "learning_rate": 0.00016909123394515785, + "loss": 4.3168, + "step": 7449 + }, + { + "epoch": 0.7724760059361411, + "grad_norm": 1.390625, + "learning_rate": 0.00016908338076659945, + "loss": 4.3266, + "step": 7450 + }, + { + "epoch": 0.7725796939906292, + "grad_norm": 1.3203125, + "learning_rate": 0.00016907552677293018, + "loss": 4.3167, + "step": 7451 + }, + { + "epoch": 0.7726833820451172, + "grad_norm": 1.2109375, + "learning_rate": 0.0001690676719642427, + "loss": 4.2969, + "step": 7452 + }, + { + "epoch": 0.7727870700996053, + "grad_norm": 1.2421875, + "learning_rate": 0.00016905981634062967, + "loss": 4.2735, + "step": 7453 + }, + { + "epoch": 0.7728907581540934, + "grad_norm": 1.1640625, + "learning_rate": 0.0001690519599021838, + "loss": 4.3345, + "step": 7454 + }, + { + "epoch": 0.7729944462085815, + "grad_norm": 1.578125, + "learning_rate": 0.00016904410264899787, + "loss": 4.3112, + "step": 7455 + }, + { + "epoch": 0.7730981342630695, + "grad_norm": 1.4609375, + "learning_rate": 0.00016903624458116444, + "loss": 4.2882, + "step": 7456 + }, + { + "epoch": 0.7732018223175576, + "grad_norm": 1.234375, + "learning_rate": 0.0001690283856987763, + "loss": 4.271, + "step": 7457 + }, + { + "epoch": 0.7733055103720456, + "grad_norm": 1.203125, + "learning_rate": 0.0001690205260019262, + "loss": 4.3036, + "step": 7458 + }, + { + "epoch": 0.7734091984265338, + "grad_norm": 1.125, + "learning_rate": 0.00016901266549070688, + "loss": 4.2912, + "step": 7459 + }, + { + "epoch": 0.7735128864810219, + "grad_norm": 1.125, + "learning_rate": 0.00016900480416521103, + "loss": 4.2563, + "step": 7460 + }, + { + "epoch": 0.7736165745355099, + "grad_norm": 1.3359375, + "learning_rate": 0.00016899694202553143, + "loss": 4.2761, + "step": 7461 + }, + { + "epoch": 0.773720262589998, + "grad_norm": 1.25, + "learning_rate": 0.00016898907907176084, + "loss": 4.2598, + "step": 7462 + }, + { + "epoch": 0.7738239506444861, + "grad_norm": 1.296875, + "learning_rate": 0.00016898121530399202, + "loss": 4.2726, + "step": 7463 + }, + { + "epoch": 0.7739276386989742, + "grad_norm": 1.1796875, + "learning_rate": 0.0001689733507223178, + "loss": 4.2995, + "step": 7464 + }, + { + "epoch": 0.7740313267534622, + "grad_norm": 1.2890625, + "learning_rate": 0.00016896548532683098, + "loss": 4.31, + "step": 7465 + }, + { + "epoch": 0.7741350148079503, + "grad_norm": 1.25, + "learning_rate": 0.0001689576191176243, + "loss": 4.2907, + "step": 7466 + }, + { + "epoch": 0.7742387028624383, + "grad_norm": 1.1796875, + "learning_rate": 0.0001689497520947906, + "loss": 4.3051, + "step": 7467 + }, + { + "epoch": 0.7743423909169265, + "grad_norm": 1.125, + "learning_rate": 0.00016894188425842273, + "loss": 4.3089, + "step": 7468 + }, + { + "epoch": 0.7744460789714145, + "grad_norm": 1.2265625, + "learning_rate": 0.00016893401560861347, + "loss": 4.2659, + "step": 7469 + }, + { + "epoch": 0.7745497670259026, + "grad_norm": 1.1328125, + "learning_rate": 0.00016892614614545572, + "loss": 4.281, + "step": 7470 + }, + { + "epoch": 0.7746534550803906, + "grad_norm": 1.2421875, + "learning_rate": 0.00016891827586904226, + "loss": 4.2811, + "step": 7471 + }, + { + "epoch": 0.7747571431348788, + "grad_norm": 1.2109375, + "learning_rate": 0.00016891040477946606, + "loss": 4.2619, + "step": 7472 + }, + { + "epoch": 0.7748608311893668, + "grad_norm": 1.1953125, + "learning_rate": 0.0001689025328768199, + "loss": 4.2999, + "step": 7473 + }, + { + "epoch": 0.7749645192438549, + "grad_norm": 1.1484375, + "learning_rate": 0.00016889466016119665, + "loss": 4.3273, + "step": 7474 + }, + { + "epoch": 0.7750682072983429, + "grad_norm": 1.078125, + "learning_rate": 0.00016888678663268925, + "loss": 4.2815, + "step": 7475 + }, + { + "epoch": 0.775171895352831, + "grad_norm": 0.96484375, + "learning_rate": 0.00016887891229139064, + "loss": 4.2958, + "step": 7476 + }, + { + "epoch": 0.7752755834073191, + "grad_norm": 1.3046875, + "learning_rate": 0.00016887103713739363, + "loss": 4.2667, + "step": 7477 + }, + { + "epoch": 0.7753792714618072, + "grad_norm": 1.109375, + "learning_rate": 0.0001688631611707912, + "loss": 4.3166, + "step": 7478 + }, + { + "epoch": 0.7754829595162952, + "grad_norm": 1.2890625, + "learning_rate": 0.0001688552843916762, + "loss": 4.3385, + "step": 7479 + }, + { + "epoch": 0.7755866475707833, + "grad_norm": 1.1796875, + "learning_rate": 0.00016884740680014169, + "loss": 4.2935, + "step": 7480 + }, + { + "epoch": 0.7756903356252713, + "grad_norm": 1.140625, + "learning_rate": 0.00016883952839628053, + "loss": 4.2766, + "step": 7481 + }, + { + "epoch": 0.7757940236797595, + "grad_norm": 1.046875, + "learning_rate": 0.0001688316491801857, + "loss": 4.2529, + "step": 7482 + }, + { + "epoch": 0.7758977117342475, + "grad_norm": 1.2890625, + "learning_rate": 0.00016882376915195019, + "loss": 4.3134, + "step": 7483 + }, + { + "epoch": 0.7760013997887356, + "grad_norm": 1.109375, + "learning_rate": 0.00016881588831166696, + "loss": 4.2798, + "step": 7484 + }, + { + "epoch": 0.7761050878432236, + "grad_norm": 1.421875, + "learning_rate": 0.00016880800665942895, + "loss": 4.297, + "step": 7485 + }, + { + "epoch": 0.7762087758977118, + "grad_norm": 1.2734375, + "learning_rate": 0.0001688001241953292, + "loss": 4.3174, + "step": 7486 + }, + { + "epoch": 0.7763124639521998, + "grad_norm": 1.359375, + "learning_rate": 0.0001687922409194607, + "loss": 4.2795, + "step": 7487 + }, + { + "epoch": 0.7764161520066879, + "grad_norm": 1.28125, + "learning_rate": 0.0001687843568319165, + "loss": 4.3163, + "step": 7488 + }, + { + "epoch": 0.7765198400611759, + "grad_norm": 1.21875, + "learning_rate": 0.0001687764719327896, + "loss": 4.2761, + "step": 7489 + }, + { + "epoch": 0.776623528115664, + "grad_norm": 1.0625, + "learning_rate": 0.00016876858622217304, + "loss": 4.299, + "step": 7490 + }, + { + "epoch": 0.7767272161701521, + "grad_norm": 1.34375, + "learning_rate": 0.00016876069970015985, + "loss": 4.2744, + "step": 7491 + }, + { + "epoch": 0.7768309042246402, + "grad_norm": 1.2734375, + "learning_rate": 0.00016875281236684305, + "loss": 4.2877, + "step": 7492 + }, + { + "epoch": 0.7769345922791282, + "grad_norm": 1.359375, + "learning_rate": 0.00016874492422231578, + "loss": 4.313, + "step": 7493 + }, + { + "epoch": 0.7770382803336163, + "grad_norm": 1.3203125, + "learning_rate": 0.00016873703526667106, + "loss": 4.2781, + "step": 7494 + }, + { + "epoch": 0.7771419683881043, + "grad_norm": 1.140625, + "learning_rate": 0.00016872914550000197, + "loss": 4.2608, + "step": 7495 + }, + { + "epoch": 0.7772456564425925, + "grad_norm": 1.0546875, + "learning_rate": 0.00016872125492240161, + "loss": 4.2584, + "step": 7496 + }, + { + "epoch": 0.7773493444970805, + "grad_norm": 1.1953125, + "learning_rate": 0.0001687133635339631, + "loss": 4.2925, + "step": 7497 + }, + { + "epoch": 0.7774530325515686, + "grad_norm": 1.0078125, + "learning_rate": 0.0001687054713347795, + "loss": 4.3133, + "step": 7498 + }, + { + "epoch": 0.7775567206060566, + "grad_norm": 1.46875, + "learning_rate": 0.000168697578324944, + "loss": 4.315, + "step": 7499 + }, + { + "epoch": 0.7776604086605448, + "grad_norm": 1.3515625, + "learning_rate": 0.00016868968450454966, + "loss": 4.2717, + "step": 7500 + }, + { + "epoch": 0.7777640967150328, + "grad_norm": 1.34375, + "learning_rate": 0.0001686817898736897, + "loss": 4.2838, + "step": 7501 + }, + { + "epoch": 0.7778677847695209, + "grad_norm": 1.3125, + "learning_rate": 0.0001686738944324572, + "loss": 4.2746, + "step": 7502 + }, + { + "epoch": 0.7779714728240089, + "grad_norm": 1.109375, + "learning_rate": 0.00016866599818094531, + "loss": 4.3302, + "step": 7503 + }, + { + "epoch": 0.778075160878497, + "grad_norm": 1.078125, + "learning_rate": 0.00016865810111924728, + "loss": 4.2894, + "step": 7504 + }, + { + "epoch": 0.7781788489329852, + "grad_norm": 1.390625, + "learning_rate": 0.00016865020324745617, + "loss": 4.2608, + "step": 7505 + }, + { + "epoch": 0.7782825369874732, + "grad_norm": 1.1328125, + "learning_rate": 0.00016864230456566527, + "loss": 4.2797, + "step": 7506 + }, + { + "epoch": 0.7783862250419613, + "grad_norm": 1.5546875, + "learning_rate": 0.0001686344050739677, + "loss": 4.2473, + "step": 7507 + }, + { + "epoch": 0.7784899130964493, + "grad_norm": 1.453125, + "learning_rate": 0.00016862650477245672, + "loss": 4.289, + "step": 7508 + }, + { + "epoch": 0.7785936011509375, + "grad_norm": 1.3359375, + "learning_rate": 0.00016861860366122552, + "loss": 4.3078, + "step": 7509 + }, + { + "epoch": 0.7786972892054255, + "grad_norm": 1.1953125, + "learning_rate": 0.00016861070174036734, + "loss": 4.31, + "step": 7510 + }, + { + "epoch": 0.7788009772599136, + "grad_norm": 1.2734375, + "learning_rate": 0.0001686027990099754, + "loss": 4.2991, + "step": 7511 + }, + { + "epoch": 0.7789046653144016, + "grad_norm": 1.0703125, + "learning_rate": 0.00016859489547014293, + "loss": 4.2701, + "step": 7512 + }, + { + "epoch": 0.7790083533688897, + "grad_norm": 1.640625, + "learning_rate": 0.00016858699112096322, + "loss": 4.2792, + "step": 7513 + }, + { + "epoch": 0.7791120414233778, + "grad_norm": 1.578125, + "learning_rate": 0.00016857908596252952, + "loss": 4.3019, + "step": 7514 + }, + { + "epoch": 0.7792157294778659, + "grad_norm": 1.2890625, + "learning_rate": 0.0001685711799949351, + "loss": 4.3028, + "step": 7515 + }, + { + "epoch": 0.7793194175323539, + "grad_norm": 1.2578125, + "learning_rate": 0.0001685632732182732, + "loss": 4.3022, + "step": 7516 + }, + { + "epoch": 0.779423105586842, + "grad_norm": 1.234375, + "learning_rate": 0.00016855536563263715, + "loss": 4.2835, + "step": 7517 + }, + { + "epoch": 0.77952679364133, + "grad_norm": 1.15625, + "learning_rate": 0.00016854745723812032, + "loss": 4.332, + "step": 7518 + }, + { + "epoch": 0.7796304816958182, + "grad_norm": 1.5234375, + "learning_rate": 0.0001685395480348159, + "loss": 4.3164, + "step": 7519 + }, + { + "epoch": 0.7797341697503062, + "grad_norm": 1.4140625, + "learning_rate": 0.00016853163802281728, + "loss": 4.2839, + "step": 7520 + }, + { + "epoch": 0.7798378578047943, + "grad_norm": 1.3515625, + "learning_rate": 0.00016852372720221775, + "loss": 4.3423, + "step": 7521 + }, + { + "epoch": 0.7799415458592823, + "grad_norm": 1.2421875, + "learning_rate": 0.00016851581557311072, + "loss": 4.3084, + "step": 7522 + }, + { + "epoch": 0.7800452339137705, + "grad_norm": 1.28125, + "learning_rate": 0.00016850790313558943, + "loss": 4.3181, + "step": 7523 + }, + { + "epoch": 0.7801489219682585, + "grad_norm": 1.1640625, + "learning_rate": 0.00016849998988974733, + "loss": 4.2593, + "step": 7524 + }, + { + "epoch": 0.7802526100227466, + "grad_norm": 1.25, + "learning_rate": 0.00016849207583567776, + "loss": 4.2949, + "step": 7525 + }, + { + "epoch": 0.7803562980772346, + "grad_norm": 1.1171875, + "learning_rate": 0.00016848416097347407, + "loss": 4.2945, + "step": 7526 + }, + { + "epoch": 0.7804599861317227, + "grad_norm": 1.3671875, + "learning_rate": 0.0001684762453032297, + "loss": 4.3089, + "step": 7527 + }, + { + "epoch": 0.7805636741862108, + "grad_norm": 1.2890625, + "learning_rate": 0.000168468328825038, + "loss": 4.2943, + "step": 7528 + }, + { + "epoch": 0.7806673622406989, + "grad_norm": 1.328125, + "learning_rate": 0.00016846041153899242, + "loss": 4.333, + "step": 7529 + }, + { + "epoch": 0.7807710502951869, + "grad_norm": 1.2109375, + "learning_rate": 0.00016845249344518633, + "loss": 4.3041, + "step": 7530 + }, + { + "epoch": 0.780874738349675, + "grad_norm": 1.15625, + "learning_rate": 0.00016844457454371316, + "loss": 4.2783, + "step": 7531 + }, + { + "epoch": 0.780978426404163, + "grad_norm": 1.109375, + "learning_rate": 0.00016843665483466636, + "loss": 4.3132, + "step": 7532 + }, + { + "epoch": 0.7810821144586512, + "grad_norm": 1.421875, + "learning_rate": 0.0001684287343181394, + "loss": 4.2628, + "step": 7533 + }, + { + "epoch": 0.7811858025131392, + "grad_norm": 1.265625, + "learning_rate": 0.0001684208129942257, + "loss": 4.2853, + "step": 7534 + }, + { + "epoch": 0.7812894905676273, + "grad_norm": 1.4140625, + "learning_rate": 0.00016841289086301874, + "loss": 4.3297, + "step": 7535 + }, + { + "epoch": 0.7813931786221153, + "grad_norm": 1.328125, + "learning_rate": 0.00016840496792461194, + "loss": 4.2628, + "step": 7536 + }, + { + "epoch": 0.7814968666766035, + "grad_norm": 1.1484375, + "learning_rate": 0.00016839704417909886, + "loss": 4.2701, + "step": 7537 + }, + { + "epoch": 0.7816005547310915, + "grad_norm": 1.1015625, + "learning_rate": 0.00016838911962657296, + "loss": 4.3012, + "step": 7538 + }, + { + "epoch": 0.7817042427855796, + "grad_norm": 1.2109375, + "learning_rate": 0.00016838119426712775, + "loss": 4.3009, + "step": 7539 + }, + { + "epoch": 0.7818079308400676, + "grad_norm": 1.1484375, + "learning_rate": 0.00016837326810085668, + "loss": 4.2807, + "step": 7540 + }, + { + "epoch": 0.7819116188945557, + "grad_norm": 1.2578125, + "learning_rate": 0.00016836534112785336, + "loss": 4.3013, + "step": 7541 + }, + { + "epoch": 0.7820153069490438, + "grad_norm": 1.203125, + "learning_rate": 0.00016835741334821126, + "loss": 4.2881, + "step": 7542 + }, + { + "epoch": 0.7821189950035319, + "grad_norm": 1.2734375, + "learning_rate": 0.000168349484762024, + "loss": 4.2784, + "step": 7543 + }, + { + "epoch": 0.7822226830580199, + "grad_norm": 1.1953125, + "learning_rate": 0.000168341555369385, + "loss": 4.2688, + "step": 7544 + }, + { + "epoch": 0.782326371112508, + "grad_norm": 1.2421875, + "learning_rate": 0.0001683336251703879, + "loss": 4.3094, + "step": 7545 + }, + { + "epoch": 0.782430059166996, + "grad_norm": 1.140625, + "learning_rate": 0.00016832569416512624, + "loss": 4.2929, + "step": 7546 + }, + { + "epoch": 0.7825337472214842, + "grad_norm": 1.1796875, + "learning_rate": 0.00016831776235369365, + "loss": 4.2823, + "step": 7547 + }, + { + "epoch": 0.7826374352759723, + "grad_norm": 1.0546875, + "learning_rate": 0.00016830982973618368, + "loss": 4.2762, + "step": 7548 + }, + { + "epoch": 0.7827411233304603, + "grad_norm": 1.3359375, + "learning_rate": 0.00016830189631268988, + "loss": 4.3096, + "step": 7549 + }, + { + "epoch": 0.7828448113849484, + "grad_norm": 1.15625, + "learning_rate": 0.00016829396208330595, + "loss": 4.2435, + "step": 7550 + }, + { + "epoch": 0.7829484994394365, + "grad_norm": 1.25, + "learning_rate": 0.00016828602704812543, + "loss": 4.2706, + "step": 7551 + }, + { + "epoch": 0.7830521874939246, + "grad_norm": 1.1328125, + "learning_rate": 0.00016827809120724199, + "loss": 4.3124, + "step": 7552 + }, + { + "epoch": 0.7831558755484126, + "grad_norm": 1.296875, + "learning_rate": 0.00016827015456074925, + "loss": 4.3115, + "step": 7553 + }, + { + "epoch": 0.7832595636029007, + "grad_norm": 1.1953125, + "learning_rate": 0.0001682622171087408, + "loss": 4.2603, + "step": 7554 + }, + { + "epoch": 0.7833632516573887, + "grad_norm": 1.1953125, + "learning_rate": 0.0001682542788513104, + "loss": 4.2819, + "step": 7555 + }, + { + "epoch": 0.7834669397118769, + "grad_norm": 1.1484375, + "learning_rate": 0.00016824633978855164, + "loss": 4.3383, + "step": 7556 + }, + { + "epoch": 0.7835706277663649, + "grad_norm": 1.296875, + "learning_rate": 0.0001682383999205582, + "loss": 4.2508, + "step": 7557 + }, + { + "epoch": 0.783674315820853, + "grad_norm": 1.15625, + "learning_rate": 0.0001682304592474238, + "loss": 4.2703, + "step": 7558 + }, + { + "epoch": 0.783778003875341, + "grad_norm": 1.34375, + "learning_rate": 0.00016822251776924208, + "loss": 4.2566, + "step": 7559 + }, + { + "epoch": 0.7838816919298291, + "grad_norm": 1.2734375, + "learning_rate": 0.00016821457548610677, + "loss": 4.3067, + "step": 7560 + }, + { + "epoch": 0.7839853799843172, + "grad_norm": 1.4921875, + "learning_rate": 0.00016820663239811156, + "loss": 4.3048, + "step": 7561 + }, + { + "epoch": 0.7840890680388053, + "grad_norm": 1.296875, + "learning_rate": 0.0001681986885053502, + "loss": 4.2958, + "step": 7562 + }, + { + "epoch": 0.7841927560932933, + "grad_norm": 1.3828125, + "learning_rate": 0.00016819074380791638, + "loss": 4.3104, + "step": 7563 + }, + { + "epoch": 0.7842964441477814, + "grad_norm": 1.2421875, + "learning_rate": 0.00016818279830590392, + "loss": 4.3193, + "step": 7564 + }, + { + "epoch": 0.7844001322022695, + "grad_norm": 1.2734375, + "learning_rate": 0.00016817485199940648, + "loss": 4.2839, + "step": 7565 + }, + { + "epoch": 0.7845038202567576, + "grad_norm": 1.1953125, + "learning_rate": 0.00016816690488851785, + "loss": 4.2268, + "step": 7566 + }, + { + "epoch": 0.7846075083112456, + "grad_norm": 1.265625, + "learning_rate": 0.00016815895697333178, + "loss": 4.2412, + "step": 7567 + }, + { + "epoch": 0.7847111963657337, + "grad_norm": 1.171875, + "learning_rate": 0.0001681510082539421, + "loss": 4.2996, + "step": 7568 + }, + { + "epoch": 0.7848148844202217, + "grad_norm": 1.328125, + "learning_rate": 0.00016814305873044254, + "loss": 4.2629, + "step": 7569 + }, + { + "epoch": 0.7849185724747099, + "grad_norm": 1.2578125, + "learning_rate": 0.0001681351084029269, + "loss": 4.2047, + "step": 7570 + }, + { + "epoch": 0.7850222605291979, + "grad_norm": 1.078125, + "learning_rate": 0.00016812715727148906, + "loss": 4.2992, + "step": 7571 + }, + { + "epoch": 0.785125948583686, + "grad_norm": 1.09375, + "learning_rate": 0.00016811920533622274, + "loss": 4.31, + "step": 7572 + }, + { + "epoch": 0.785229636638174, + "grad_norm": 1.21875, + "learning_rate": 0.0001681112525972218, + "loss": 4.2708, + "step": 7573 + }, + { + "epoch": 0.7853333246926621, + "grad_norm": 1.09375, + "learning_rate": 0.00016810329905458008, + "loss": 4.2971, + "step": 7574 + }, + { + "epoch": 0.7854370127471502, + "grad_norm": 1.3671875, + "learning_rate": 0.0001680953447083914, + "loss": 4.2911, + "step": 7575 + }, + { + "epoch": 0.7855407008016383, + "grad_norm": 1.2890625, + "learning_rate": 0.00016808738955874968, + "loss": 4.3078, + "step": 7576 + }, + { + "epoch": 0.7856443888561263, + "grad_norm": 1.1796875, + "learning_rate": 0.0001680794336057487, + "loss": 4.266, + "step": 7577 + }, + { + "epoch": 0.7857480769106144, + "grad_norm": 1.1015625, + "learning_rate": 0.00016807147684948237, + "loss": 4.2838, + "step": 7578 + }, + { + "epoch": 0.7858517649651025, + "grad_norm": 1.234375, + "learning_rate": 0.00016806351929004457, + "loss": 4.3373, + "step": 7579 + }, + { + "epoch": 0.7859554530195906, + "grad_norm": 1.140625, + "learning_rate": 0.00016805556092752921, + "loss": 4.2938, + "step": 7580 + }, + { + "epoch": 0.7860591410740786, + "grad_norm": 1.3046875, + "learning_rate": 0.00016804760176203015, + "loss": 4.2999, + "step": 7581 + }, + { + "epoch": 0.7861628291285667, + "grad_norm": 1.1640625, + "learning_rate": 0.00016803964179364132, + "loss": 4.3438, + "step": 7582 + }, + { + "epoch": 0.7862665171830547, + "grad_norm": 1.140625, + "learning_rate": 0.0001680316810224566, + "loss": 4.3247, + "step": 7583 + }, + { + "epoch": 0.7863702052375429, + "grad_norm": 1.09375, + "learning_rate": 0.00016802371944857, + "loss": 4.3308, + "step": 7584 + }, + { + "epoch": 0.7864738932920309, + "grad_norm": 1.140625, + "learning_rate": 0.00016801575707207542, + "loss": 4.2712, + "step": 7585 + }, + { + "epoch": 0.786577581346519, + "grad_norm": 0.984375, + "learning_rate": 0.00016800779389306678, + "loss": 4.3176, + "step": 7586 + }, + { + "epoch": 0.786681269401007, + "grad_norm": 1.2734375, + "learning_rate": 0.00016799982991163808, + "loss": 4.2888, + "step": 7587 + }, + { + "epoch": 0.7867849574554951, + "grad_norm": 1.1171875, + "learning_rate": 0.00016799186512788324, + "loss": 4.2784, + "step": 7588 + }, + { + "epoch": 0.7868886455099832, + "grad_norm": 1.3984375, + "learning_rate": 0.0001679838995418963, + "loss": 4.2813, + "step": 7589 + }, + { + "epoch": 0.7869923335644713, + "grad_norm": 1.3125, + "learning_rate": 0.0001679759331537712, + "loss": 4.3295, + "step": 7590 + }, + { + "epoch": 0.7870960216189593, + "grad_norm": 1.1953125, + "learning_rate": 0.0001679679659636019, + "loss": 4.2676, + "step": 7591 + }, + { + "epoch": 0.7871997096734474, + "grad_norm": 1.1015625, + "learning_rate": 0.00016795999797148244, + "loss": 4.2144, + "step": 7592 + }, + { + "epoch": 0.7873033977279356, + "grad_norm": 1.2890625, + "learning_rate": 0.00016795202917750692, + "loss": 4.2928, + "step": 7593 + }, + { + "epoch": 0.7874070857824236, + "grad_norm": 1.1640625, + "learning_rate": 0.0001679440595817692, + "loss": 4.2746, + "step": 7594 + }, + { + "epoch": 0.7875107738369117, + "grad_norm": 1.3984375, + "learning_rate": 0.00016793608918436347, + "loss": 4.308, + "step": 7595 + }, + { + "epoch": 0.7876144618913997, + "grad_norm": 1.3203125, + "learning_rate": 0.00016792811798538364, + "loss": 4.2943, + "step": 7596 + }, + { + "epoch": 0.7877181499458878, + "grad_norm": 1.1640625, + "learning_rate": 0.00016792014598492386, + "loss": 4.2543, + "step": 7597 + }, + { + "epoch": 0.7878218380003759, + "grad_norm": 1.125, + "learning_rate": 0.00016791217318307814, + "loss": 4.2815, + "step": 7598 + }, + { + "epoch": 0.787925526054864, + "grad_norm": 1.109375, + "learning_rate": 0.00016790419957994054, + "loss": 4.2256, + "step": 7599 + }, + { + "epoch": 0.788029214109352, + "grad_norm": 1.0234375, + "learning_rate": 0.0001678962251756052, + "loss": 4.2607, + "step": 7600 + }, + { + "epoch": 0.7881329021638401, + "grad_norm": 1.234375, + "learning_rate": 0.00016788824997016616, + "loss": 4.2988, + "step": 7601 + }, + { + "epoch": 0.7882365902183281, + "grad_norm": 1.0546875, + "learning_rate": 0.00016788027396371751, + "loss": 4.2565, + "step": 7602 + }, + { + "epoch": 0.7883402782728163, + "grad_norm": 1.421875, + "learning_rate": 0.00016787229715635343, + "loss": 4.3026, + "step": 7603 + }, + { + "epoch": 0.7884439663273043, + "grad_norm": 1.3671875, + "learning_rate": 0.00016786431954816792, + "loss": 4.3119, + "step": 7604 + }, + { + "epoch": 0.7885476543817924, + "grad_norm": 1.21875, + "learning_rate": 0.00016785634113925522, + "loss": 4.232, + "step": 7605 + }, + { + "epoch": 0.7886513424362804, + "grad_norm": 1.21875, + "learning_rate": 0.0001678483619297094, + "loss": 4.3448, + "step": 7606 + }, + { + "epoch": 0.7887550304907686, + "grad_norm": 1.109375, + "learning_rate": 0.00016784038191962463, + "loss": 4.272, + "step": 7607 + }, + { + "epoch": 0.7888587185452566, + "grad_norm": 1.0, + "learning_rate": 0.00016783240110909508, + "loss": 4.3144, + "step": 7608 + }, + { + "epoch": 0.7889624065997447, + "grad_norm": 1.34375, + "learning_rate": 0.0001678244194982149, + "loss": 4.2749, + "step": 7609 + }, + { + "epoch": 0.7890660946542327, + "grad_norm": 1.1875, + "learning_rate": 0.00016781643708707828, + "loss": 4.2794, + "step": 7610 + }, + { + "epoch": 0.7891697827087208, + "grad_norm": 1.34375, + "learning_rate": 0.00016780845387577936, + "loss": 4.295, + "step": 7611 + }, + { + "epoch": 0.7892734707632089, + "grad_norm": 1.2109375, + "learning_rate": 0.00016780046986441236, + "loss": 4.3052, + "step": 7612 + }, + { + "epoch": 0.789377158817697, + "grad_norm": 1.3125, + "learning_rate": 0.0001677924850530715, + "loss": 4.3186, + "step": 7613 + }, + { + "epoch": 0.789480846872185, + "grad_norm": 1.25, + "learning_rate": 0.00016778449944185092, + "loss": 4.3061, + "step": 7614 + }, + { + "epoch": 0.7895845349266731, + "grad_norm": 1.1640625, + "learning_rate": 0.00016777651303084495, + "loss": 4.2802, + "step": 7615 + }, + { + "epoch": 0.7896882229811611, + "grad_norm": 1.0625, + "learning_rate": 0.00016776852582014778, + "loss": 4.2582, + "step": 7616 + }, + { + "epoch": 0.7897919110356493, + "grad_norm": 1.2578125, + "learning_rate": 0.0001677605378098536, + "loss": 4.3021, + "step": 7617 + }, + { + "epoch": 0.7898955990901373, + "grad_norm": 1.171875, + "learning_rate": 0.00016775254900005675, + "loss": 4.2635, + "step": 7618 + }, + { + "epoch": 0.7899992871446254, + "grad_norm": 1.1875, + "learning_rate": 0.00016774455939085137, + "loss": 4.3136, + "step": 7619 + }, + { + "epoch": 0.7901029751991134, + "grad_norm": 1.0703125, + "learning_rate": 0.00016773656898233185, + "loss": 4.2791, + "step": 7620 + }, + { + "epoch": 0.7902066632536016, + "grad_norm": 1.1328125, + "learning_rate": 0.0001677285777745924, + "loss": 4.2588, + "step": 7621 + }, + { + "epoch": 0.7903103513080896, + "grad_norm": 1.046875, + "learning_rate": 0.00016772058576772735, + "loss": 4.2855, + "step": 7622 + }, + { + "epoch": 0.7904140393625777, + "grad_norm": 1.46875, + "learning_rate": 0.00016771259296183093, + "loss": 4.2701, + "step": 7623 + }, + { + "epoch": 0.7905177274170657, + "grad_norm": 1.3359375, + "learning_rate": 0.0001677045993569975, + "loss": 4.2737, + "step": 7624 + }, + { + "epoch": 0.7906214154715538, + "grad_norm": 1.1640625, + "learning_rate": 0.00016769660495332138, + "loss": 4.2514, + "step": 7625 + }, + { + "epoch": 0.7907251035260419, + "grad_norm": 1.1171875, + "learning_rate": 0.0001676886097508969, + "loss": 4.3126, + "step": 7626 + }, + { + "epoch": 0.79082879158053, + "grad_norm": 1.171875, + "learning_rate": 0.00016768061374981832, + "loss": 4.3186, + "step": 7627 + }, + { + "epoch": 0.790932479635018, + "grad_norm": 1.0703125, + "learning_rate": 0.00016767261695018007, + "loss": 4.2789, + "step": 7628 + }, + { + "epoch": 0.7910361676895061, + "grad_norm": 1.359375, + "learning_rate": 0.00016766461935207647, + "loss": 4.2943, + "step": 7629 + }, + { + "epoch": 0.7911398557439941, + "grad_norm": 1.3203125, + "learning_rate": 0.0001676566209556019, + "loss": 4.3317, + "step": 7630 + }, + { + "epoch": 0.7912435437984823, + "grad_norm": 1.2265625, + "learning_rate": 0.0001676486217608507, + "loss": 4.2931, + "step": 7631 + }, + { + "epoch": 0.7913472318529703, + "grad_norm": 1.1171875, + "learning_rate": 0.00016764062176791725, + "loss": 4.2701, + "step": 7632 + }, + { + "epoch": 0.7914509199074584, + "grad_norm": 1.1796875, + "learning_rate": 0.000167632620976896, + "loss": 4.2703, + "step": 7633 + }, + { + "epoch": 0.7915546079619464, + "grad_norm": 1.1171875, + "learning_rate": 0.00016762461938788132, + "loss": 4.2488, + "step": 7634 + }, + { + "epoch": 0.7916582960164346, + "grad_norm": 1.234375, + "learning_rate": 0.0001676166170009676, + "loss": 4.2902, + "step": 7635 + }, + { + "epoch": 0.7917619840709226, + "grad_norm": 1.21875, + "learning_rate": 0.00016760861381624926, + "loss": 4.312, + "step": 7636 + }, + { + "epoch": 0.7918656721254107, + "grad_norm": 1.3359375, + "learning_rate": 0.00016760060983382077, + "loss": 4.2857, + "step": 7637 + }, + { + "epoch": 0.7919693601798988, + "grad_norm": 1.21875, + "learning_rate": 0.00016759260505377652, + "loss": 4.2868, + "step": 7638 + }, + { + "epoch": 0.7920730482343868, + "grad_norm": 1.2265625, + "learning_rate": 0.00016758459947621097, + "loss": 4.2934, + "step": 7639 + }, + { + "epoch": 0.792176736288875, + "grad_norm": 1.1328125, + "learning_rate": 0.0001675765931012186, + "loss": 4.3025, + "step": 7640 + }, + { + "epoch": 0.792280424343363, + "grad_norm": 1.4453125, + "learning_rate": 0.00016756858592889383, + "loss": 4.3105, + "step": 7641 + }, + { + "epoch": 0.7923841123978511, + "grad_norm": 1.2421875, + "learning_rate": 0.00016756057795933122, + "loss": 4.2634, + "step": 7642 + }, + { + "epoch": 0.7924878004523391, + "grad_norm": 1.5, + "learning_rate": 0.00016755256919262517, + "loss": 4.2992, + "step": 7643 + }, + { + "epoch": 0.7925914885068273, + "grad_norm": 1.3671875, + "learning_rate": 0.00016754455962887023, + "loss": 4.2771, + "step": 7644 + }, + { + "epoch": 0.7926951765613153, + "grad_norm": 1.3984375, + "learning_rate": 0.00016753654926816088, + "loss": 4.3138, + "step": 7645 + }, + { + "epoch": 0.7927988646158034, + "grad_norm": 1.3828125, + "learning_rate": 0.00016752853811059163, + "loss": 4.2902, + "step": 7646 + }, + { + "epoch": 0.7929025526702914, + "grad_norm": 1.21875, + "learning_rate": 0.00016752052615625704, + "loss": 4.3219, + "step": 7647 + }, + { + "epoch": 0.7930062407247795, + "grad_norm": 1.1953125, + "learning_rate": 0.00016751251340525159, + "loss": 4.3061, + "step": 7648 + }, + { + "epoch": 0.7931099287792676, + "grad_norm": 1.2890625, + "learning_rate": 0.00016750449985766984, + "loss": 4.3073, + "step": 7649 + }, + { + "epoch": 0.7932136168337557, + "grad_norm": 1.140625, + "learning_rate": 0.00016749648551360634, + "loss": 4.2367, + "step": 7650 + }, + { + "epoch": 0.7933173048882437, + "grad_norm": 1.4609375, + "learning_rate": 0.00016748847037315566, + "loss": 4.284, + "step": 7651 + }, + { + "epoch": 0.7934209929427318, + "grad_norm": 1.34375, + "learning_rate": 0.0001674804544364124, + "loss": 4.3115, + "step": 7652 + }, + { + "epoch": 0.7935246809972198, + "grad_norm": 1.21875, + "learning_rate": 0.0001674724377034711, + "loss": 4.2805, + "step": 7653 + }, + { + "epoch": 0.793628369051708, + "grad_norm": 1.203125, + "learning_rate": 0.00016746442017442632, + "loss": 4.3198, + "step": 7654 + }, + { + "epoch": 0.793732057106196, + "grad_norm": 1.09375, + "learning_rate": 0.00016745640184937272, + "loss": 4.3037, + "step": 7655 + }, + { + "epoch": 0.7938357451606841, + "grad_norm": 0.94921875, + "learning_rate": 0.00016744838272840488, + "loss": 4.2644, + "step": 7656 + }, + { + "epoch": 0.7939394332151721, + "grad_norm": 1.171875, + "learning_rate": 0.0001674403628116174, + "loss": 4.2905, + "step": 7657 + }, + { + "epoch": 0.7940431212696603, + "grad_norm": 0.953125, + "learning_rate": 0.00016743234209910498, + "loss": 4.3112, + "step": 7658 + }, + { + "epoch": 0.7941468093241483, + "grad_norm": 1.1796875, + "learning_rate": 0.00016742432059096214, + "loss": 4.2794, + "step": 7659 + }, + { + "epoch": 0.7942504973786364, + "grad_norm": 1.0703125, + "learning_rate": 0.00016741629828728364, + "loss": 4.2744, + "step": 7660 + }, + { + "epoch": 0.7943541854331244, + "grad_norm": 1.5546875, + "learning_rate": 0.00016740827518816405, + "loss": 4.2451, + "step": 7661 + }, + { + "epoch": 0.7944578734876125, + "grad_norm": 1.4765625, + "learning_rate": 0.00016740025129369807, + "loss": 4.3041, + "step": 7662 + }, + { + "epoch": 0.7945615615421006, + "grad_norm": 1.1171875, + "learning_rate": 0.00016739222660398038, + "loss": 4.3188, + "step": 7663 + }, + { + "epoch": 0.7946652495965887, + "grad_norm": 1.15625, + "learning_rate": 0.00016738420111910566, + "loss": 4.2473, + "step": 7664 + }, + { + "epoch": 0.7947689376510767, + "grad_norm": 1.1484375, + "learning_rate": 0.0001673761748391686, + "loss": 4.2878, + "step": 7665 + }, + { + "epoch": 0.7948726257055648, + "grad_norm": 0.98828125, + "learning_rate": 0.00016736814776426387, + "loss": 4.2988, + "step": 7666 + }, + { + "epoch": 0.7949763137600528, + "grad_norm": 1.4921875, + "learning_rate": 0.0001673601198944862, + "loss": 4.2549, + "step": 7667 + }, + { + "epoch": 0.795080001814541, + "grad_norm": 1.2890625, + "learning_rate": 0.00016735209122993033, + "loss": 4.3131, + "step": 7668 + }, + { + "epoch": 0.795183689869029, + "grad_norm": 1.4140625, + "learning_rate": 0.000167344061770691, + "loss": 4.3129, + "step": 7669 + }, + { + "epoch": 0.7952873779235171, + "grad_norm": 1.3671875, + "learning_rate": 0.00016733603151686288, + "loss": 4.2754, + "step": 7670 + }, + { + "epoch": 0.7953910659780051, + "grad_norm": 1.0078125, + "learning_rate": 0.00016732800046854082, + "loss": 4.3063, + "step": 7671 + }, + { + "epoch": 0.7954947540324933, + "grad_norm": 1.09375, + "learning_rate": 0.0001673199686258195, + "loss": 4.2615, + "step": 7672 + }, + { + "epoch": 0.7955984420869813, + "grad_norm": 1.046875, + "learning_rate": 0.0001673119359887937, + "loss": 4.3246, + "step": 7673 + }, + { + "epoch": 0.7957021301414694, + "grad_norm": 0.8828125, + "learning_rate": 0.0001673039025575582, + "loss": 4.2785, + "step": 7674 + }, + { + "epoch": 0.7958058181959574, + "grad_norm": 1.2578125, + "learning_rate": 0.00016729586833220782, + "loss": 4.2913, + "step": 7675 + }, + { + "epoch": 0.7959095062504455, + "grad_norm": 0.93359375, + "learning_rate": 0.00016728783331283734, + "loss": 4.3111, + "step": 7676 + }, + { + "epoch": 0.7960131943049336, + "grad_norm": 1.390625, + "learning_rate": 0.00016727979749954153, + "loss": 4.3188, + "step": 7677 + }, + { + "epoch": 0.7961168823594217, + "grad_norm": 1.2421875, + "learning_rate": 0.00016727176089241521, + "loss": 4.2945, + "step": 7678 + }, + { + "epoch": 0.7962205704139097, + "grad_norm": 1.359375, + "learning_rate": 0.00016726372349155325, + "loss": 4.2384, + "step": 7679 + }, + { + "epoch": 0.7963242584683978, + "grad_norm": 1.2578125, + "learning_rate": 0.0001672556852970504, + "loss": 4.2974, + "step": 7680 + }, + { + "epoch": 0.7964279465228858, + "grad_norm": 1.3125, + "learning_rate": 0.00016724764630900163, + "loss": 4.2904, + "step": 7681 + }, + { + "epoch": 0.796531634577374, + "grad_norm": 1.171875, + "learning_rate": 0.0001672396065275017, + "loss": 4.2817, + "step": 7682 + }, + { + "epoch": 0.7966353226318621, + "grad_norm": 1.4765625, + "learning_rate": 0.00016723156595264545, + "loss": 4.2707, + "step": 7683 + }, + { + "epoch": 0.7967390106863501, + "grad_norm": 1.421875, + "learning_rate": 0.00016722352458452782, + "loss": 4.3052, + "step": 7684 + }, + { + "epoch": 0.7968426987408382, + "grad_norm": 1.296875, + "learning_rate": 0.00016721548242324365, + "loss": 4.3118, + "step": 7685 + }, + { + "epoch": 0.7969463867953263, + "grad_norm": 1.21875, + "learning_rate": 0.00016720743946888785, + "loss": 4.2763, + "step": 7686 + }, + { + "epoch": 0.7970500748498144, + "grad_norm": 1.1875, + "learning_rate": 0.00016719939572155528, + "loss": 4.2655, + "step": 7687 + }, + { + "epoch": 0.7971537629043024, + "grad_norm": 1.0390625, + "learning_rate": 0.00016719135118134092, + "loss": 4.3199, + "step": 7688 + }, + { + "epoch": 0.7972574509587905, + "grad_norm": 1.3203125, + "learning_rate": 0.00016718330584833958, + "loss": 4.3064, + "step": 7689 + }, + { + "epoch": 0.7973611390132785, + "grad_norm": 1.2421875, + "learning_rate": 0.00016717525972264626, + "loss": 4.2913, + "step": 7690 + }, + { + "epoch": 0.7974648270677667, + "grad_norm": 1.375, + "learning_rate": 0.0001671672128043559, + "loss": 4.3011, + "step": 7691 + }, + { + "epoch": 0.7975685151222547, + "grad_norm": 1.2734375, + "learning_rate": 0.00016715916509356344, + "loss": 4.2835, + "step": 7692 + }, + { + "epoch": 0.7976722031767428, + "grad_norm": 1.3046875, + "learning_rate": 0.0001671511165903638, + "loss": 4.272, + "step": 7693 + }, + { + "epoch": 0.7977758912312308, + "grad_norm": 1.2734375, + "learning_rate": 0.00016714306729485195, + "loss": 4.2619, + "step": 7694 + }, + { + "epoch": 0.797879579285719, + "grad_norm": 1.0703125, + "learning_rate": 0.0001671350172071229, + "loss": 4.2933, + "step": 7695 + }, + { + "epoch": 0.797983267340207, + "grad_norm": 1.0859375, + "learning_rate": 0.00016712696632727164, + "loss": 4.3161, + "step": 7696 + }, + { + "epoch": 0.7980869553946951, + "grad_norm": 1.1328125, + "learning_rate": 0.0001671189146553931, + "loss": 4.2422, + "step": 7697 + }, + { + "epoch": 0.7981906434491831, + "grad_norm": 0.97265625, + "learning_rate": 0.0001671108621915823, + "loss": 4.2794, + "step": 7698 + }, + { + "epoch": 0.7982943315036712, + "grad_norm": 1.34375, + "learning_rate": 0.0001671028089359343, + "loss": 4.2902, + "step": 7699 + }, + { + "epoch": 0.7983980195581593, + "grad_norm": 1.2578125, + "learning_rate": 0.00016709475488854407, + "loss": 4.3202, + "step": 7700 + }, + { + "epoch": 0.7985017076126474, + "grad_norm": 1.296875, + "learning_rate": 0.00016708670004950666, + "loss": 4.3195, + "step": 7701 + }, + { + "epoch": 0.7986053956671354, + "grad_norm": 1.296875, + "learning_rate": 0.0001670786444189171, + "loss": 4.3011, + "step": 7702 + }, + { + "epoch": 0.7987090837216235, + "grad_norm": 1.09375, + "learning_rate": 0.00016707058799687044, + "loss": 4.3031, + "step": 7703 + }, + { + "epoch": 0.7988127717761115, + "grad_norm": 1.046875, + "learning_rate": 0.00016706253078346175, + "loss": 4.3169, + "step": 7704 + }, + { + "epoch": 0.7989164598305997, + "grad_norm": 1.2734375, + "learning_rate": 0.00016705447277878607, + "loss": 4.2572, + "step": 7705 + }, + { + "epoch": 0.7990201478850877, + "grad_norm": 1.140625, + "learning_rate": 0.0001670464139829385, + "loss": 4.294, + "step": 7706 + }, + { + "epoch": 0.7991238359395758, + "grad_norm": 1.3671875, + "learning_rate": 0.00016703835439601413, + "loss": 4.3035, + "step": 7707 + }, + { + "epoch": 0.7992275239940638, + "grad_norm": 1.2734375, + "learning_rate": 0.00016703029401810802, + "loss": 4.2962, + "step": 7708 + }, + { + "epoch": 0.799331212048552, + "grad_norm": 1.171875, + "learning_rate": 0.00016702223284931528, + "loss": 4.3065, + "step": 7709 + }, + { + "epoch": 0.79943490010304, + "grad_norm": 1.1328125, + "learning_rate": 0.00016701417088973107, + "loss": 4.3254, + "step": 7710 + }, + { + "epoch": 0.7995385881575281, + "grad_norm": 1.09375, + "learning_rate": 0.00016700610813945044, + "loss": 4.3224, + "step": 7711 + }, + { + "epoch": 0.7996422762120161, + "grad_norm": 1.109375, + "learning_rate": 0.00016699804459856862, + "loss": 4.3194, + "step": 7712 + }, + { + "epoch": 0.7997459642665042, + "grad_norm": 1.0703125, + "learning_rate": 0.00016698998026718064, + "loss": 4.2604, + "step": 7713 + }, + { + "epoch": 0.7998496523209923, + "grad_norm": 0.94921875, + "learning_rate": 0.00016698191514538172, + "loss": 4.3033, + "step": 7714 + }, + { + "epoch": 0.7999533403754804, + "grad_norm": 1.1484375, + "learning_rate": 0.00016697384923326704, + "loss": 4.3104, + "step": 7715 + }, + { + "epoch": 0.8000570284299684, + "grad_norm": 1.0390625, + "learning_rate": 0.0001669657825309317, + "loss": 4.315, + "step": 7716 + }, + { + "epoch": 0.8001607164844565, + "grad_norm": 1.4375, + "learning_rate": 0.00016695771503847092, + "loss": 4.3228, + "step": 7717 + }, + { + "epoch": 0.8002644045389445, + "grad_norm": 1.3359375, + "learning_rate": 0.00016694964675597986, + "loss": 4.2683, + "step": 7718 + }, + { + "epoch": 0.8003680925934327, + "grad_norm": 1.2421875, + "learning_rate": 0.00016694157768355376, + "loss": 4.298, + "step": 7719 + }, + { + "epoch": 0.8004717806479207, + "grad_norm": 1.1953125, + "learning_rate": 0.00016693350782128778, + "loss": 4.2719, + "step": 7720 + }, + { + "epoch": 0.8005754687024088, + "grad_norm": 1.1875, + "learning_rate": 0.00016692543716927718, + "loss": 4.2916, + "step": 7721 + }, + { + "epoch": 0.8006791567568968, + "grad_norm": 1.0859375, + "learning_rate": 0.00016691736572761715, + "loss": 4.3065, + "step": 7722 + }, + { + "epoch": 0.800782844811385, + "grad_norm": 1.2578125, + "learning_rate": 0.00016690929349640296, + "loss": 4.3154, + "step": 7723 + }, + { + "epoch": 0.800886532865873, + "grad_norm": 1.1015625, + "learning_rate": 0.00016690122047572983, + "loss": 4.3292, + "step": 7724 + }, + { + "epoch": 0.8009902209203611, + "grad_norm": 1.3671875, + "learning_rate": 0.000166893146665693, + "loss": 4.3223, + "step": 7725 + }, + { + "epoch": 0.8010939089748491, + "grad_norm": 1.3671875, + "learning_rate": 0.00016688507206638777, + "loss": 4.2767, + "step": 7726 + }, + { + "epoch": 0.8011975970293372, + "grad_norm": 1.0234375, + "learning_rate": 0.00016687699667790936, + "loss": 4.3026, + "step": 7727 + }, + { + "epoch": 0.8013012850838254, + "grad_norm": 1.0234375, + "learning_rate": 0.0001668689205003531, + "loss": 4.2776, + "step": 7728 + }, + { + "epoch": 0.8014049731383134, + "grad_norm": 1.09375, + "learning_rate": 0.00016686084353381426, + "loss": 4.2744, + "step": 7729 + }, + { + "epoch": 0.8015086611928015, + "grad_norm": 0.921875, + "learning_rate": 0.00016685276577838815, + "loss": 4.2787, + "step": 7730 + }, + { + "epoch": 0.8016123492472895, + "grad_norm": 1.328125, + "learning_rate": 0.00016684468723417005, + "loss": 4.2959, + "step": 7731 + }, + { + "epoch": 0.8017160373017777, + "grad_norm": 1.171875, + "learning_rate": 0.00016683660790125533, + "loss": 4.3154, + "step": 7732 + }, + { + "epoch": 0.8018197253562657, + "grad_norm": 1.453125, + "learning_rate": 0.0001668285277797393, + "loss": 4.2811, + "step": 7733 + }, + { + "epoch": 0.8019234134107538, + "grad_norm": 1.3046875, + "learning_rate": 0.0001668204468697172, + "loss": 4.2388, + "step": 7734 + }, + { + "epoch": 0.8020271014652418, + "grad_norm": 1.0703125, + "learning_rate": 0.0001668123651712845, + "loss": 4.3022, + "step": 7735 + }, + { + "epoch": 0.8021307895197299, + "grad_norm": 1.0390625, + "learning_rate": 0.00016680428268453653, + "loss": 4.2492, + "step": 7736 + }, + { + "epoch": 0.802234477574218, + "grad_norm": 1.234375, + "learning_rate": 0.00016679619940956864, + "loss": 4.2857, + "step": 7737 + }, + { + "epoch": 0.8023381656287061, + "grad_norm": 1.1015625, + "learning_rate": 0.0001667881153464762, + "loss": 4.3029, + "step": 7738 + }, + { + "epoch": 0.8024418536831941, + "grad_norm": 1.3046875, + "learning_rate": 0.00016678003049535459, + "loss": 4.2936, + "step": 7739 + }, + { + "epoch": 0.8025455417376822, + "grad_norm": 1.171875, + "learning_rate": 0.0001667719448562992, + "loss": 4.3072, + "step": 7740 + }, + { + "epoch": 0.8026492297921702, + "grad_norm": 1.2578125, + "learning_rate": 0.00016676385842940547, + "loss": 4.2703, + "step": 7741 + }, + { + "epoch": 0.8027529178466584, + "grad_norm": 1.171875, + "learning_rate": 0.00016675577121476876, + "loss": 4.2924, + "step": 7742 + }, + { + "epoch": 0.8028566059011464, + "grad_norm": 1.21875, + "learning_rate": 0.00016674768321248452, + "loss": 4.3096, + "step": 7743 + }, + { + "epoch": 0.8029602939556345, + "grad_norm": 1.1328125, + "learning_rate": 0.0001667395944226482, + "loss": 4.3014, + "step": 7744 + }, + { + "epoch": 0.8030639820101225, + "grad_norm": 1.2578125, + "learning_rate": 0.00016673150484535518, + "loss": 4.2482, + "step": 7745 + }, + { + "epoch": 0.8031676700646107, + "grad_norm": 1.265625, + "learning_rate": 0.00016672341448070095, + "loss": 4.2823, + "step": 7746 + }, + { + "epoch": 0.8032713581190987, + "grad_norm": 1.171875, + "learning_rate": 0.00016671532332878094, + "loss": 4.3267, + "step": 7747 + }, + { + "epoch": 0.8033750461735868, + "grad_norm": 1.15625, + "learning_rate": 0.00016670723138969065, + "loss": 4.3252, + "step": 7748 + }, + { + "epoch": 0.8034787342280748, + "grad_norm": 1.25, + "learning_rate": 0.00016669913866352556, + "loss": 4.2556, + "step": 7749 + }, + { + "epoch": 0.8035824222825629, + "grad_norm": 1.1953125, + "learning_rate": 0.0001666910451503811, + "loss": 4.2379, + "step": 7750 + }, + { + "epoch": 0.803686110337051, + "grad_norm": 1.171875, + "learning_rate": 0.00016668295085035286, + "loss": 4.2545, + "step": 7751 + }, + { + "epoch": 0.8037897983915391, + "grad_norm": 1.1015625, + "learning_rate": 0.00016667485576353624, + "loss": 4.3009, + "step": 7752 + }, + { + "epoch": 0.8038934864460271, + "grad_norm": 1.1953125, + "learning_rate": 0.0001666667598900268, + "loss": 4.3247, + "step": 7753 + }, + { + "epoch": 0.8039971745005152, + "grad_norm": 1.0234375, + "learning_rate": 0.00016665866322992007, + "loss": 4.2759, + "step": 7754 + }, + { + "epoch": 0.8041008625550032, + "grad_norm": 1.265625, + "learning_rate": 0.0001666505657833116, + "loss": 4.2795, + "step": 7755 + }, + { + "epoch": 0.8042045506094914, + "grad_norm": 1.1953125, + "learning_rate": 0.0001666424675502969, + "loss": 4.2908, + "step": 7756 + }, + { + "epoch": 0.8043082386639794, + "grad_norm": 1.078125, + "learning_rate": 0.0001666343685309715, + "loss": 4.2934, + "step": 7757 + }, + { + "epoch": 0.8044119267184675, + "grad_norm": 1.078125, + "learning_rate": 0.000166626268725431, + "loss": 4.2682, + "step": 7758 + }, + { + "epoch": 0.8045156147729555, + "grad_norm": 0.97265625, + "learning_rate": 0.00016661816813377095, + "loss": 4.2481, + "step": 7759 + }, + { + "epoch": 0.8046193028274436, + "grad_norm": 0.99609375, + "learning_rate": 0.00016661006675608694, + "loss": 4.2937, + "step": 7760 + }, + { + "epoch": 0.8047229908819317, + "grad_norm": 1.2109375, + "learning_rate": 0.00016660196459247458, + "loss": 4.3113, + "step": 7761 + }, + { + "epoch": 0.8048266789364198, + "grad_norm": 1.0703125, + "learning_rate": 0.0001665938616430294, + "loss": 4.3276, + "step": 7762 + }, + { + "epoch": 0.8049303669909078, + "grad_norm": 1.2734375, + "learning_rate": 0.00016658575790784704, + "loss": 4.2966, + "step": 7763 + }, + { + "epoch": 0.8050340550453959, + "grad_norm": 1.15625, + "learning_rate": 0.00016657765338702317, + "loss": 4.3491, + "step": 7764 + }, + { + "epoch": 0.805137743099884, + "grad_norm": 1.265625, + "learning_rate": 0.00016656954808065333, + "loss": 4.3034, + "step": 7765 + }, + { + "epoch": 0.8052414311543721, + "grad_norm": 1.125, + "learning_rate": 0.0001665614419888332, + "loss": 4.3113, + "step": 7766 + }, + { + "epoch": 0.8053451192088601, + "grad_norm": 1.25, + "learning_rate": 0.0001665533351116584, + "loss": 4.3144, + "step": 7767 + }, + { + "epoch": 0.8054488072633482, + "grad_norm": 1.1953125, + "learning_rate": 0.00016654522744922461, + "loss": 4.32, + "step": 7768 + }, + { + "epoch": 0.8055524953178362, + "grad_norm": 1.03125, + "learning_rate": 0.00016653711900162748, + "loss": 4.293, + "step": 7769 + }, + { + "epoch": 0.8056561833723244, + "grad_norm": 0.96875, + "learning_rate": 0.00016652900976896263, + "loss": 4.3441, + "step": 7770 + }, + { + "epoch": 0.8057598714268124, + "grad_norm": 1.296875, + "learning_rate": 0.00016652089975132585, + "loss": 4.2961, + "step": 7771 + }, + { + "epoch": 0.8058635594813005, + "grad_norm": 1.1484375, + "learning_rate": 0.00016651278894881278, + "loss": 4.2946, + "step": 7772 + }, + { + "epoch": 0.8059672475357886, + "grad_norm": 1.3046875, + "learning_rate": 0.00016650467736151905, + "loss": 4.3085, + "step": 7773 + }, + { + "epoch": 0.8060709355902766, + "grad_norm": 1.2734375, + "learning_rate": 0.00016649656498954042, + "loss": 4.2685, + "step": 7774 + }, + { + "epoch": 0.8061746236447648, + "grad_norm": 1.21875, + "learning_rate": 0.00016648845183297266, + "loss": 4.2989, + "step": 7775 + }, + { + "epoch": 0.8062783116992528, + "grad_norm": 1.09375, + "learning_rate": 0.00016648033789191144, + "loss": 4.3006, + "step": 7776 + }, + { + "epoch": 0.8063819997537409, + "grad_norm": 1.265625, + "learning_rate": 0.0001664722231664525, + "loss": 4.2543, + "step": 7777 + }, + { + "epoch": 0.8064856878082289, + "grad_norm": 1.125, + "learning_rate": 0.0001664641076566916, + "loss": 4.2665, + "step": 7778 + }, + { + "epoch": 0.8065893758627171, + "grad_norm": 1.2421875, + "learning_rate": 0.00016645599136272447, + "loss": 4.3282, + "step": 7779 + }, + { + "epoch": 0.8066930639172051, + "grad_norm": 1.1171875, + "learning_rate": 0.0001664478742846469, + "loss": 4.3053, + "step": 7780 + }, + { + "epoch": 0.8067967519716932, + "grad_norm": 1.390625, + "learning_rate": 0.00016643975642255466, + "loss": 4.293, + "step": 7781 + }, + { + "epoch": 0.8069004400261812, + "grad_norm": 1.2734375, + "learning_rate": 0.0001664316377765435, + "loss": 4.2947, + "step": 7782 + }, + { + "epoch": 0.8070041280806693, + "grad_norm": 1.421875, + "learning_rate": 0.00016642351834670924, + "loss": 4.2965, + "step": 7783 + }, + { + "epoch": 0.8071078161351574, + "grad_norm": 1.328125, + "learning_rate": 0.00016641539813314768, + "loss": 4.2896, + "step": 7784 + }, + { + "epoch": 0.8072115041896455, + "grad_norm": 1.2421875, + "learning_rate": 0.0001664072771359546, + "loss": 4.2974, + "step": 7785 + }, + { + "epoch": 0.8073151922441335, + "grad_norm": 1.140625, + "learning_rate": 0.0001663991553552259, + "loss": 4.2654, + "step": 7786 + }, + { + "epoch": 0.8074188802986216, + "grad_norm": 1.234375, + "learning_rate": 0.0001663910327910573, + "loss": 4.2828, + "step": 7787 + }, + { + "epoch": 0.8075225683531096, + "grad_norm": 1.1875, + "learning_rate": 0.00016638290944354474, + "loss": 4.2936, + "step": 7788 + }, + { + "epoch": 0.8076262564075978, + "grad_norm": 1.3828125, + "learning_rate": 0.000166374785312784, + "loss": 4.3182, + "step": 7789 + }, + { + "epoch": 0.8077299444620858, + "grad_norm": 1.296875, + "learning_rate": 0.00016636666039887094, + "loss": 4.2962, + "step": 7790 + }, + { + "epoch": 0.8078336325165739, + "grad_norm": 1.28125, + "learning_rate": 0.00016635853470190146, + "loss": 4.316, + "step": 7791 + }, + { + "epoch": 0.8079373205710619, + "grad_norm": 1.15625, + "learning_rate": 0.00016635040822197142, + "loss": 4.2575, + "step": 7792 + }, + { + "epoch": 0.8080410086255501, + "grad_norm": 1.1953125, + "learning_rate": 0.00016634228095917667, + "loss": 4.2889, + "step": 7793 + }, + { + "epoch": 0.8081446966800381, + "grad_norm": 1.1328125, + "learning_rate": 0.00016633415291361314, + "loss": 4.2599, + "step": 7794 + }, + { + "epoch": 0.8082483847345262, + "grad_norm": 1.3515625, + "learning_rate": 0.0001663260240853767, + "loss": 4.2845, + "step": 7795 + }, + { + "epoch": 0.8083520727890142, + "grad_norm": 1.2578125, + "learning_rate": 0.00016631789447456337, + "loss": 4.2388, + "step": 7796 + }, + { + "epoch": 0.8084557608435023, + "grad_norm": 1.421875, + "learning_rate": 0.00016630976408126891, + "loss": 4.2798, + "step": 7797 + }, + { + "epoch": 0.8085594488979904, + "grad_norm": 1.34375, + "learning_rate": 0.00016630163290558937, + "loss": 4.2705, + "step": 7798 + }, + { + "epoch": 0.8086631369524785, + "grad_norm": 1.0390625, + "learning_rate": 0.0001662935009476206, + "loss": 4.2865, + "step": 7799 + }, + { + "epoch": 0.8087668250069665, + "grad_norm": 1.0078125, + "learning_rate": 0.0001662853682074586, + "loss": 4.2664, + "step": 7800 + }, + { + "epoch": 0.8088705130614546, + "grad_norm": 1.2734375, + "learning_rate": 0.00016627723468519935, + "loss": 4.2632, + "step": 7801 + }, + { + "epoch": 0.8089742011159426, + "grad_norm": 1.09375, + "learning_rate": 0.00016626910038093878, + "loss": 4.296, + "step": 7802 + }, + { + "epoch": 0.8090778891704308, + "grad_norm": 1.46875, + "learning_rate": 0.00016626096529477288, + "loss": 4.3453, + "step": 7803 + }, + { + "epoch": 0.8091815772249188, + "grad_norm": 1.390625, + "learning_rate": 0.0001662528294267976, + "loss": 4.2582, + "step": 7804 + }, + { + "epoch": 0.8092852652794069, + "grad_norm": 1.078125, + "learning_rate": 0.000166244692777109, + "loss": 4.2668, + "step": 7805 + }, + { + "epoch": 0.8093889533338949, + "grad_norm": 1.078125, + "learning_rate": 0.00016623655534580303, + "loss": 4.2773, + "step": 7806 + }, + { + "epoch": 0.8094926413883831, + "grad_norm": 1.1328125, + "learning_rate": 0.0001662284171329757, + "loss": 4.3475, + "step": 7807 + }, + { + "epoch": 0.8095963294428711, + "grad_norm": 1.0546875, + "learning_rate": 0.00016622027813872312, + "loss": 4.3152, + "step": 7808 + }, + { + "epoch": 0.8097000174973592, + "grad_norm": 1.3125, + "learning_rate": 0.00016621213836314123, + "loss": 4.2213, + "step": 7809 + }, + { + "epoch": 0.8098037055518472, + "grad_norm": 1.2109375, + "learning_rate": 0.00016620399780632608, + "loss": 4.2859, + "step": 7810 + }, + { + "epoch": 0.8099073936063353, + "grad_norm": 1.2265625, + "learning_rate": 0.00016619585646837374, + "loss": 4.2515, + "step": 7811 + }, + { + "epoch": 0.8100110816608234, + "grad_norm": 1.1953125, + "learning_rate": 0.00016618771434938025, + "loss": 4.2569, + "step": 7812 + }, + { + "epoch": 0.8101147697153115, + "grad_norm": 1.0859375, + "learning_rate": 0.00016617957144944172, + "loss": 4.2405, + "step": 7813 + }, + { + "epoch": 0.8102184577697995, + "grad_norm": 1.03125, + "learning_rate": 0.0001661714277686542, + "loss": 4.2934, + "step": 7814 + }, + { + "epoch": 0.8103221458242876, + "grad_norm": 1.171875, + "learning_rate": 0.00016616328330711376, + "loss": 4.3069, + "step": 7815 + }, + { + "epoch": 0.8104258338787758, + "grad_norm": 1.078125, + "learning_rate": 0.00016615513806491654, + "loss": 4.2408, + "step": 7816 + }, + { + "epoch": 0.8105295219332638, + "grad_norm": 1.21875, + "learning_rate": 0.0001661469920421586, + "loss": 4.334, + "step": 7817 + }, + { + "epoch": 0.8106332099877519, + "grad_norm": 1.1015625, + "learning_rate": 0.00016613884523893608, + "loss": 4.2613, + "step": 7818 + }, + { + "epoch": 0.8107368980422399, + "grad_norm": 1.234375, + "learning_rate": 0.0001661306976553451, + "loss": 4.3069, + "step": 7819 + }, + { + "epoch": 0.810840586096728, + "grad_norm": 1.171875, + "learning_rate": 0.00016612254929148179, + "loss": 4.3041, + "step": 7820 + }, + { + "epoch": 0.8109442741512161, + "grad_norm": 1.171875, + "learning_rate": 0.0001661144001474423, + "loss": 4.3017, + "step": 7821 + }, + { + "epoch": 0.8110479622057042, + "grad_norm": 1.125, + "learning_rate": 0.00016610625022332276, + "loss": 4.2651, + "step": 7822 + }, + { + "epoch": 0.8111516502601922, + "grad_norm": 1.1328125, + "learning_rate": 0.00016609809951921936, + "loss": 4.3191, + "step": 7823 + }, + { + "epoch": 0.8112553383146803, + "grad_norm": 1.0546875, + "learning_rate": 0.00016608994803522824, + "loss": 4.2134, + "step": 7824 + }, + { + "epoch": 0.8113590263691683, + "grad_norm": 1.375, + "learning_rate": 0.0001660817957714456, + "loss": 4.3125, + "step": 7825 + }, + { + "epoch": 0.8114627144236565, + "grad_norm": 1.25, + "learning_rate": 0.00016607364272796762, + "loss": 4.2822, + "step": 7826 + }, + { + "epoch": 0.8115664024781445, + "grad_norm": 1.2109375, + "learning_rate": 0.0001660654889048905, + "loss": 4.2994, + "step": 7827 + }, + { + "epoch": 0.8116700905326326, + "grad_norm": 1.1796875, + "learning_rate": 0.00016605733430231044, + "loss": 4.2685, + "step": 7828 + }, + { + "epoch": 0.8117737785871206, + "grad_norm": 1.1171875, + "learning_rate": 0.00016604917892032366, + "loss": 4.2875, + "step": 7829 + }, + { + "epoch": 0.8118774666416088, + "grad_norm": 1.078125, + "learning_rate": 0.0001660410227590264, + "loss": 4.2983, + "step": 7830 + }, + { + "epoch": 0.8119811546960968, + "grad_norm": 1.1796875, + "learning_rate": 0.00016603286581851488, + "loss": 4.2994, + "step": 7831 + }, + { + "epoch": 0.8120848427505849, + "grad_norm": 1.0546875, + "learning_rate": 0.0001660247080988853, + "loss": 4.2526, + "step": 7832 + }, + { + "epoch": 0.8121885308050729, + "grad_norm": 1.21875, + "learning_rate": 0.00016601654960023398, + "loss": 4.2908, + "step": 7833 + }, + { + "epoch": 0.812292218859561, + "grad_norm": 1.109375, + "learning_rate": 0.0001660083903226572, + "loss": 4.2445, + "step": 7834 + }, + { + "epoch": 0.8123959069140491, + "grad_norm": 1.25, + "learning_rate": 0.00016600023026625116, + "loss": 4.2771, + "step": 7835 + }, + { + "epoch": 0.8124995949685372, + "grad_norm": 1.1484375, + "learning_rate": 0.00016599206943111215, + "loss": 4.2637, + "step": 7836 + }, + { + "epoch": 0.8126032830230252, + "grad_norm": 1.1328125, + "learning_rate": 0.0001659839078173365, + "loss": 4.2666, + "step": 7837 + }, + { + "epoch": 0.8127069710775133, + "grad_norm": 1.0546875, + "learning_rate": 0.00016597574542502047, + "loss": 4.2428, + "step": 7838 + }, + { + "epoch": 0.8128106591320013, + "grad_norm": 1.2578125, + "learning_rate": 0.0001659675822542604, + "loss": 4.28, + "step": 7839 + }, + { + "epoch": 0.8129143471864895, + "grad_norm": 1.15625, + "learning_rate": 0.00016595941830515256, + "loss": 4.2637, + "step": 7840 + }, + { + "epoch": 0.8130180352409775, + "grad_norm": 1.375, + "learning_rate": 0.00016595125357779332, + "loss": 4.272, + "step": 7841 + }, + { + "epoch": 0.8131217232954656, + "grad_norm": 1.2109375, + "learning_rate": 0.00016594308807227904, + "loss": 4.265, + "step": 7842 + }, + { + "epoch": 0.8132254113499536, + "grad_norm": 1.09375, + "learning_rate": 0.00016593492178870598, + "loss": 4.2538, + "step": 7843 + }, + { + "epoch": 0.8133290994044418, + "grad_norm": 1.0390625, + "learning_rate": 0.00016592675472717054, + "loss": 4.2471, + "step": 7844 + }, + { + "epoch": 0.8134327874589298, + "grad_norm": 1.171875, + "learning_rate": 0.0001659185868877691, + "loss": 4.2553, + "step": 7845 + }, + { + "epoch": 0.8135364755134179, + "grad_norm": 1.1015625, + "learning_rate": 0.00016591041827059802, + "loss": 4.3116, + "step": 7846 + }, + { + "epoch": 0.8136401635679059, + "grad_norm": 1.2890625, + "learning_rate": 0.00016590224887575366, + "loss": 4.2769, + "step": 7847 + }, + { + "epoch": 0.813743851622394, + "grad_norm": 1.171875, + "learning_rate": 0.00016589407870333243, + "loss": 4.3086, + "step": 7848 + }, + { + "epoch": 0.8138475396768821, + "grad_norm": 1.1796875, + "learning_rate": 0.00016588590775343072, + "loss": 4.2312, + "step": 7849 + }, + { + "epoch": 0.8139512277313702, + "grad_norm": 1.25, + "learning_rate": 0.00016587773602614494, + "loss": 4.3059, + "step": 7850 + }, + { + "epoch": 0.8140549157858582, + "grad_norm": 1.0703125, + "learning_rate": 0.0001658695635215715, + "loss": 4.2705, + "step": 7851 + }, + { + "epoch": 0.8141586038403463, + "grad_norm": 1.0703125, + "learning_rate": 0.00016586139023980686, + "loss": 4.2674, + "step": 7852 + }, + { + "epoch": 0.8142622918948343, + "grad_norm": 1.1953125, + "learning_rate": 0.00016585321618094742, + "loss": 4.3153, + "step": 7853 + }, + { + "epoch": 0.8143659799493225, + "grad_norm": 1.140625, + "learning_rate": 0.00016584504134508965, + "loss": 4.3233, + "step": 7854 + }, + { + "epoch": 0.8144696680038105, + "grad_norm": 1.2890625, + "learning_rate": 0.00016583686573233, + "loss": 4.304, + "step": 7855 + }, + { + "epoch": 0.8145733560582986, + "grad_norm": 1.2109375, + "learning_rate": 0.0001658286893427649, + "loss": 4.2762, + "step": 7856 + }, + { + "epoch": 0.8146770441127866, + "grad_norm": 1.09375, + "learning_rate": 0.00016582051217649087, + "loss": 4.3054, + "step": 7857 + }, + { + "epoch": 0.8147807321672748, + "grad_norm": 1.0703125, + "learning_rate": 0.00016581233423360433, + "loss": 4.2704, + "step": 7858 + }, + { + "epoch": 0.8148844202217628, + "grad_norm": 1.2109375, + "learning_rate": 0.00016580415551420189, + "loss": 4.3082, + "step": 7859 + }, + { + "epoch": 0.8149881082762509, + "grad_norm": 1.125, + "learning_rate": 0.00016579597601837993, + "loss": 4.2897, + "step": 7860 + }, + { + "epoch": 0.815091796330739, + "grad_norm": 1.234375, + "learning_rate": 0.000165787795746235, + "loss": 4.3225, + "step": 7861 + }, + { + "epoch": 0.815195484385227, + "grad_norm": 1.203125, + "learning_rate": 0.00016577961469786364, + "loss": 4.3007, + "step": 7862 + }, + { + "epoch": 0.8152991724397152, + "grad_norm": 1.109375, + "learning_rate": 0.00016577143287336234, + "loss": 4.3154, + "step": 7863 + }, + { + "epoch": 0.8154028604942032, + "grad_norm": 1.109375, + "learning_rate": 0.00016576325027282764, + "loss": 4.2933, + "step": 7864 + }, + { + "epoch": 0.8155065485486913, + "grad_norm": 1.109375, + "learning_rate": 0.00016575506689635612, + "loss": 4.2656, + "step": 7865 + }, + { + "epoch": 0.8156102366031793, + "grad_norm": 1.0703125, + "learning_rate": 0.00016574688274404432, + "loss": 4.239, + "step": 7866 + }, + { + "epoch": 0.8157139246576675, + "grad_norm": 1.3125, + "learning_rate": 0.00016573869781598882, + "loss": 4.2593, + "step": 7867 + }, + { + "epoch": 0.8158176127121555, + "grad_norm": 1.2109375, + "learning_rate": 0.00016573051211228614, + "loss": 4.2934, + "step": 7868 + }, + { + "epoch": 0.8159213007666436, + "grad_norm": 1.265625, + "learning_rate": 0.00016572232563303292, + "loss": 4.2886, + "step": 7869 + }, + { + "epoch": 0.8160249888211316, + "grad_norm": 1.203125, + "learning_rate": 0.0001657141383783257, + "loss": 4.2693, + "step": 7870 + }, + { + "epoch": 0.8161286768756197, + "grad_norm": 1.078125, + "learning_rate": 0.00016570595034826115, + "loss": 4.3098, + "step": 7871 + }, + { + "epoch": 0.8162323649301078, + "grad_norm": 1.0625, + "learning_rate": 0.0001656977615429358, + "loss": 4.2957, + "step": 7872 + }, + { + "epoch": 0.8163360529845959, + "grad_norm": 1.1953125, + "learning_rate": 0.00016568957196244632, + "loss": 4.2775, + "step": 7873 + }, + { + "epoch": 0.8164397410390839, + "grad_norm": 1.046875, + "learning_rate": 0.00016568138160688936, + "loss": 4.2855, + "step": 7874 + }, + { + "epoch": 0.816543429093572, + "grad_norm": 1.3125, + "learning_rate": 0.0001656731904763615, + "loss": 4.2679, + "step": 7875 + }, + { + "epoch": 0.81664711714806, + "grad_norm": 1.1640625, + "learning_rate": 0.00016566499857095942, + "loss": 4.3152, + "step": 7876 + }, + { + "epoch": 0.8167508052025482, + "grad_norm": 1.3984375, + "learning_rate": 0.00016565680589077976, + "loss": 4.2852, + "step": 7877 + }, + { + "epoch": 0.8168544932570362, + "grad_norm": 1.25, + "learning_rate": 0.00016564861243591918, + "loss": 4.2883, + "step": 7878 + }, + { + "epoch": 0.8169581813115243, + "grad_norm": 1.171875, + "learning_rate": 0.00016564041820647438, + "loss": 4.2654, + "step": 7879 + }, + { + "epoch": 0.8170618693660123, + "grad_norm": 1.1484375, + "learning_rate": 0.00016563222320254206, + "loss": 4.3057, + "step": 7880 + }, + { + "epoch": 0.8171655574205005, + "grad_norm": 1.2578125, + "learning_rate": 0.00016562402742421883, + "loss": 4.2914, + "step": 7881 + }, + { + "epoch": 0.8172692454749885, + "grad_norm": 1.1484375, + "learning_rate": 0.00016561583087160145, + "loss": 4.2848, + "step": 7882 + }, + { + "epoch": 0.8173729335294766, + "grad_norm": 1.3359375, + "learning_rate": 0.00016560763354478666, + "loss": 4.2953, + "step": 7883 + }, + { + "epoch": 0.8174766215839646, + "grad_norm": 1.25, + "learning_rate": 0.00016559943544387114, + "loss": 4.2588, + "step": 7884 + }, + { + "epoch": 0.8175803096384527, + "grad_norm": 1.2421875, + "learning_rate": 0.00016559123656895158, + "loss": 4.2727, + "step": 7885 + }, + { + "epoch": 0.8176839976929408, + "grad_norm": 1.2265625, + "learning_rate": 0.00016558303692012482, + "loss": 4.2575, + "step": 7886 + }, + { + "epoch": 0.8177876857474289, + "grad_norm": 1.1171875, + "learning_rate": 0.0001655748364974875, + "loss": 4.293, + "step": 7887 + }, + { + "epoch": 0.8178913738019169, + "grad_norm": 1.0234375, + "learning_rate": 0.00016556663530113648, + "loss": 4.267, + "step": 7888 + }, + { + "epoch": 0.817995061856405, + "grad_norm": 1.359375, + "learning_rate": 0.0001655584333311684, + "loss": 4.3019, + "step": 7889 + }, + { + "epoch": 0.818098749910893, + "grad_norm": 1.2578125, + "learning_rate": 0.00016555023058768015, + "loss": 4.2912, + "step": 7890 + }, + { + "epoch": 0.8182024379653812, + "grad_norm": 1.1796875, + "learning_rate": 0.00016554202707076847, + "loss": 4.3066, + "step": 7891 + }, + { + "epoch": 0.8183061260198692, + "grad_norm": 1.21875, + "learning_rate": 0.00016553382278053014, + "loss": 4.2105, + "step": 7892 + }, + { + "epoch": 0.8184098140743573, + "grad_norm": 0.96875, + "learning_rate": 0.00016552561771706196, + "loss": 4.2934, + "step": 7893 + }, + { + "epoch": 0.8185135021288453, + "grad_norm": 0.984375, + "learning_rate": 0.00016551741188046076, + "loss": 4.2425, + "step": 7894 + }, + { + "epoch": 0.8186171901833335, + "grad_norm": 1.125, + "learning_rate": 0.00016550920527082336, + "loss": 4.2912, + "step": 7895 + }, + { + "epoch": 0.8187208782378215, + "grad_norm": 0.8828125, + "learning_rate": 0.0001655009978882466, + "loss": 4.2229, + "step": 7896 + }, + { + "epoch": 0.8188245662923096, + "grad_norm": 1.15625, + "learning_rate": 0.0001654927897328273, + "loss": 4.3013, + "step": 7897 + }, + { + "epoch": 0.8189282543467976, + "grad_norm": 0.95703125, + "learning_rate": 0.0001654845808046623, + "loss": 4.2932, + "step": 7898 + }, + { + "epoch": 0.8190319424012857, + "grad_norm": 1.328125, + "learning_rate": 0.00016547637110384846, + "loss": 4.3038, + "step": 7899 + }, + { + "epoch": 0.8191356304557738, + "grad_norm": 1.1484375, + "learning_rate": 0.00016546816063048268, + "loss": 4.2764, + "step": 7900 + }, + { + "epoch": 0.8192393185102619, + "grad_norm": 1.2265625, + "learning_rate": 0.00016545994938466175, + "loss": 4.2641, + "step": 7901 + }, + { + "epoch": 0.8193430065647499, + "grad_norm": 1.203125, + "learning_rate": 0.00016545173736648265, + "loss": 4.2777, + "step": 7902 + }, + { + "epoch": 0.819446694619238, + "grad_norm": 1.015625, + "learning_rate": 0.00016544352457604225, + "loss": 4.3514, + "step": 7903 + }, + { + "epoch": 0.819550382673726, + "grad_norm": 0.99609375, + "learning_rate": 0.0001654353110134374, + "loss": 4.266, + "step": 7904 + }, + { + "epoch": 0.8196540707282142, + "grad_norm": 1.1015625, + "learning_rate": 0.0001654270966787651, + "loss": 4.2791, + "step": 7905 + }, + { + "epoch": 0.8197577587827023, + "grad_norm": 0.96484375, + "learning_rate": 0.00016541888157212218, + "loss": 4.2612, + "step": 7906 + }, + { + "epoch": 0.8198614468371903, + "grad_norm": 1.3515625, + "learning_rate": 0.0001654106656936056, + "loss": 4.3454, + "step": 7907 + }, + { + "epoch": 0.8199651348916784, + "grad_norm": 1.25, + "learning_rate": 0.0001654024490433123, + "loss": 4.2956, + "step": 7908 + }, + { + "epoch": 0.8200688229461665, + "grad_norm": 1.2109375, + "learning_rate": 0.00016539423162133926, + "loss": 4.2367, + "step": 7909 + }, + { + "epoch": 0.8201725110006546, + "grad_norm": 1.1328125, + "learning_rate": 0.00016538601342778344, + "loss": 4.2403, + "step": 7910 + }, + { + "epoch": 0.8202761990551426, + "grad_norm": 0.98046875, + "learning_rate": 0.00016537779446274174, + "loss": 4.3065, + "step": 7911 + }, + { + "epoch": 0.8203798871096307, + "grad_norm": 0.91015625, + "learning_rate": 0.00016536957472631115, + "loss": 4.2632, + "step": 7912 + }, + { + "epoch": 0.8204835751641187, + "grad_norm": 0.96484375, + "learning_rate": 0.00016536135421858875, + "loss": 4.2896, + "step": 7913 + }, + { + "epoch": 0.8205872632186069, + "grad_norm": 0.84375, + "learning_rate": 0.0001653531329396714, + "loss": 4.2405, + "step": 7914 + }, + { + "epoch": 0.8206909512730949, + "grad_norm": 0.94140625, + "learning_rate": 0.00016534491088965615, + "loss": 4.275, + "step": 7915 + }, + { + "epoch": 0.820794639327583, + "grad_norm": 0.8515625, + "learning_rate": 0.00016533668806864007, + "loss": 4.2577, + "step": 7916 + }, + { + "epoch": 0.820898327382071, + "grad_norm": 0.9296875, + "learning_rate": 0.00016532846447672012, + "loss": 4.2587, + "step": 7917 + }, + { + "epoch": 0.8210020154365592, + "grad_norm": 0.796875, + "learning_rate": 0.00016532024011399335, + "loss": 4.3051, + "step": 7918 + }, + { + "epoch": 0.8211057034910472, + "grad_norm": 0.94140625, + "learning_rate": 0.00016531201498055676, + "loss": 4.3112, + "step": 7919 + }, + { + "epoch": 0.8212093915455353, + "grad_norm": 0.79296875, + "learning_rate": 0.00016530378907650745, + "loss": 4.28, + "step": 7920 + }, + { + "epoch": 0.8213130796000233, + "grad_norm": 0.87890625, + "learning_rate": 0.0001652955624019425, + "loss": 4.2912, + "step": 7921 + }, + { + "epoch": 0.8214167676545114, + "grad_norm": 0.796875, + "learning_rate": 0.00016528733495695886, + "loss": 4.2139, + "step": 7922 + }, + { + "epoch": 0.8215204557089995, + "grad_norm": 0.86328125, + "learning_rate": 0.00016527910674165374, + "loss": 4.289, + "step": 7923 + }, + { + "epoch": 0.8216241437634876, + "grad_norm": 0.75390625, + "learning_rate": 0.00016527087775612413, + "loss": 4.2447, + "step": 7924 + }, + { + "epoch": 0.8217278318179756, + "grad_norm": 0.9375, + "learning_rate": 0.00016526264800046717, + "loss": 4.305, + "step": 7925 + }, + { + "epoch": 0.8218315198724637, + "grad_norm": 0.71484375, + "learning_rate": 0.00016525441747477994, + "loss": 4.3099, + "step": 7926 + }, + { + "epoch": 0.8219352079269517, + "grad_norm": 0.87890625, + "learning_rate": 0.00016524618617915957, + "loss": 4.2869, + "step": 7927 + }, + { + "epoch": 0.8220388959814399, + "grad_norm": 0.79296875, + "learning_rate": 0.00016523795411370316, + "loss": 4.2937, + "step": 7928 + }, + { + "epoch": 0.8221425840359279, + "grad_norm": 0.86328125, + "learning_rate": 0.00016522972127850784, + "loss": 4.2678, + "step": 7929 + }, + { + "epoch": 0.822246272090416, + "grad_norm": 0.76953125, + "learning_rate": 0.0001652214876736708, + "loss": 4.2553, + "step": 7930 + }, + { + "epoch": 0.822349960144904, + "grad_norm": 0.85546875, + "learning_rate": 0.00016521325329928911, + "loss": 4.3384, + "step": 7931 + }, + { + "epoch": 0.8224536481993922, + "grad_norm": 0.8515625, + "learning_rate": 0.00016520501815546, + "loss": 4.3044, + "step": 7932 + }, + { + "epoch": 0.8225573362538802, + "grad_norm": 0.796875, + "learning_rate": 0.00016519678224228055, + "loss": 4.2717, + "step": 7933 + }, + { + "epoch": 0.8226610243083683, + "grad_norm": 0.84765625, + "learning_rate": 0.00016518854555984803, + "loss": 4.2566, + "step": 7934 + }, + { + "epoch": 0.8227647123628563, + "grad_norm": 0.7421875, + "learning_rate": 0.00016518030810825957, + "loss": 4.3309, + "step": 7935 + }, + { + "epoch": 0.8228684004173444, + "grad_norm": 0.8125, + "learning_rate": 0.00016517206988761238, + "loss": 4.3035, + "step": 7936 + }, + { + "epoch": 0.8229720884718325, + "grad_norm": 0.6796875, + "learning_rate": 0.00016516383089800363, + "loss": 4.2951, + "step": 7937 + }, + { + "epoch": 0.8230757765263206, + "grad_norm": 0.79296875, + "learning_rate": 0.00016515559113953056, + "loss": 4.2759, + "step": 7938 + }, + { + "epoch": 0.8231794645808086, + "grad_norm": 0.7109375, + "learning_rate": 0.0001651473506122904, + "loss": 4.2353, + "step": 7939 + }, + { + "epoch": 0.8232831526352967, + "grad_norm": 0.7109375, + "learning_rate": 0.00016513910931638038, + "loss": 4.3049, + "step": 7940 + }, + { + "epoch": 0.8233868406897847, + "grad_norm": 0.77734375, + "learning_rate": 0.0001651308672518977, + "loss": 4.2876, + "step": 7941 + }, + { + "epoch": 0.8234905287442729, + "grad_norm": 0.734375, + "learning_rate": 0.00016512262441893967, + "loss": 4.2671, + "step": 7942 + }, + { + "epoch": 0.8235942167987609, + "grad_norm": 0.73828125, + "learning_rate": 0.00016511438081760348, + "loss": 4.2758, + "step": 7943 + }, + { + "epoch": 0.823697904853249, + "grad_norm": 0.734375, + "learning_rate": 0.00016510613644798642, + "loss": 4.3016, + "step": 7944 + }, + { + "epoch": 0.823801592907737, + "grad_norm": 0.82421875, + "learning_rate": 0.00016509789131018577, + "loss": 4.2983, + "step": 7945 + }, + { + "epoch": 0.8239052809622252, + "grad_norm": 0.7890625, + "learning_rate": 0.00016508964540429882, + "loss": 4.2565, + "step": 7946 + }, + { + "epoch": 0.8240089690167132, + "grad_norm": 0.7578125, + "learning_rate": 0.00016508139873042287, + "loss": 4.3073, + "step": 7947 + }, + { + "epoch": 0.8241126570712013, + "grad_norm": 0.75390625, + "learning_rate": 0.00016507315128865517, + "loss": 4.3019, + "step": 7948 + }, + { + "epoch": 0.8242163451256893, + "grad_norm": 0.75, + "learning_rate": 0.0001650649030790931, + "loss": 4.2778, + "step": 7949 + }, + { + "epoch": 0.8243200331801774, + "grad_norm": 0.77734375, + "learning_rate": 0.00016505665410183396, + "loss": 4.2676, + "step": 7950 + }, + { + "epoch": 0.8244237212346656, + "grad_norm": 0.75, + "learning_rate": 0.00016504840435697504, + "loss": 4.2584, + "step": 7951 + }, + { + "epoch": 0.8245274092891536, + "grad_norm": 0.74609375, + "learning_rate": 0.0001650401538446137, + "loss": 4.2756, + "step": 7952 + }, + { + "epoch": 0.8246310973436417, + "grad_norm": 0.71875, + "learning_rate": 0.00016503190256484732, + "loss": 4.3243, + "step": 7953 + }, + { + "epoch": 0.8247347853981297, + "grad_norm": 0.73046875, + "learning_rate": 0.00016502365051777326, + "loss": 4.3209, + "step": 7954 + }, + { + "epoch": 0.8248384734526178, + "grad_norm": 0.72265625, + "learning_rate": 0.0001650153977034888, + "loss": 4.3155, + "step": 7955 + }, + { + "epoch": 0.8249421615071059, + "grad_norm": 0.703125, + "learning_rate": 0.0001650071441220914, + "loss": 4.3073, + "step": 7956 + }, + { + "epoch": 0.825045849561594, + "grad_norm": 0.67578125, + "learning_rate": 0.00016499888977367842, + "loss": 4.2505, + "step": 7957 + }, + { + "epoch": 0.825149537616082, + "grad_norm": 0.75, + "learning_rate": 0.00016499063465834723, + "loss": 4.2585, + "step": 7958 + }, + { + "epoch": 0.8252532256705701, + "grad_norm": 0.60546875, + "learning_rate": 0.00016498237877619526, + "loss": 4.2695, + "step": 7959 + }, + { + "epoch": 0.8253569137250581, + "grad_norm": 0.765625, + "learning_rate": 0.00016497412212731992, + "loss": 4.2895, + "step": 7960 + }, + { + "epoch": 0.8254606017795463, + "grad_norm": 0.6640625, + "learning_rate": 0.00016496586471181863, + "loss": 4.3072, + "step": 7961 + }, + { + "epoch": 0.8255642898340343, + "grad_norm": 0.62109375, + "learning_rate": 0.00016495760652978877, + "loss": 4.2812, + "step": 7962 + }, + { + "epoch": 0.8256679778885224, + "grad_norm": 0.66796875, + "learning_rate": 0.00016494934758132782, + "loss": 4.2581, + "step": 7963 + }, + { + "epoch": 0.8257716659430104, + "grad_norm": 0.7421875, + "learning_rate": 0.00016494108786653327, + "loss": 4.2804, + "step": 7964 + }, + { + "epoch": 0.8258753539974986, + "grad_norm": 0.6875, + "learning_rate": 0.00016493282738550246, + "loss": 4.2712, + "step": 7965 + }, + { + "epoch": 0.8259790420519866, + "grad_norm": 0.75390625, + "learning_rate": 0.00016492456613833299, + "loss": 4.3037, + "step": 7966 + }, + { + "epoch": 0.8260827301064747, + "grad_norm": 0.69921875, + "learning_rate": 0.00016491630412512223, + "loss": 4.311, + "step": 7967 + }, + { + "epoch": 0.8261864181609627, + "grad_norm": 0.65625, + "learning_rate": 0.0001649080413459677, + "loss": 4.2494, + "step": 7968 + }, + { + "epoch": 0.8262901062154508, + "grad_norm": 0.65234375, + "learning_rate": 0.0001648997778009669, + "loss": 4.2881, + "step": 7969 + }, + { + "epoch": 0.8263937942699389, + "grad_norm": 0.70703125, + "learning_rate": 0.00016489151349021732, + "loss": 4.3125, + "step": 7970 + }, + { + "epoch": 0.826497482324427, + "grad_norm": 0.578125, + "learning_rate": 0.00016488324841381648, + "loss": 4.2862, + "step": 7971 + }, + { + "epoch": 0.826601170378915, + "grad_norm": 0.70703125, + "learning_rate": 0.0001648749825718619, + "loss": 4.286, + "step": 7972 + }, + { + "epoch": 0.8267048584334031, + "grad_norm": 0.625, + "learning_rate": 0.00016486671596445109, + "loss": 4.2535, + "step": 7973 + }, + { + "epoch": 0.8268085464878911, + "grad_norm": 0.63671875, + "learning_rate": 0.0001648584485916816, + "loss": 4.2822, + "step": 7974 + }, + { + "epoch": 0.8269122345423793, + "grad_norm": 0.7109375, + "learning_rate": 0.000164850180453651, + "loss": 4.2299, + "step": 7975 + }, + { + "epoch": 0.8270159225968673, + "grad_norm": 0.59375, + "learning_rate": 0.0001648419115504568, + "loss": 4.3058, + "step": 7976 + }, + { + "epoch": 0.8271196106513554, + "grad_norm": 0.69921875, + "learning_rate": 0.00016483364188219656, + "loss": 4.3092, + "step": 7977 + }, + { + "epoch": 0.8272232987058434, + "grad_norm": 0.60546875, + "learning_rate": 0.0001648253714489679, + "loss": 4.3129, + "step": 7978 + }, + { + "epoch": 0.8273269867603316, + "grad_norm": 0.6484375, + "learning_rate": 0.0001648171002508684, + "loss": 4.3436, + "step": 7979 + }, + { + "epoch": 0.8274306748148196, + "grad_norm": 0.6953125, + "learning_rate": 0.0001648088282879956, + "loss": 4.2833, + "step": 7980 + }, + { + "epoch": 0.8275343628693077, + "grad_norm": 0.625, + "learning_rate": 0.00016480055556044714, + "loss": 4.2834, + "step": 7981 + }, + { + "epoch": 0.8276380509237957, + "grad_norm": 0.71484375, + "learning_rate": 0.00016479228206832064, + "loss": 4.2804, + "step": 7982 + }, + { + "epoch": 0.8277417389782838, + "grad_norm": 0.65234375, + "learning_rate": 0.00016478400781171366, + "loss": 4.2894, + "step": 7983 + }, + { + "epoch": 0.8278454270327719, + "grad_norm": 0.6796875, + "learning_rate": 0.00016477573279072391, + "loss": 4.2667, + "step": 7984 + }, + { + "epoch": 0.82794911508726, + "grad_norm": 0.6640625, + "learning_rate": 0.00016476745700544896, + "loss": 4.3018, + "step": 7985 + }, + { + "epoch": 0.828052803141748, + "grad_norm": 0.62890625, + "learning_rate": 0.00016475918045598647, + "loss": 4.2352, + "step": 7986 + }, + { + "epoch": 0.8281564911962361, + "grad_norm": 0.6953125, + "learning_rate": 0.00016475090314243412, + "loss": 4.2554, + "step": 7987 + }, + { + "epoch": 0.8282601792507241, + "grad_norm": 0.64453125, + "learning_rate": 0.00016474262506488958, + "loss": 4.3044, + "step": 7988 + }, + { + "epoch": 0.8283638673052123, + "grad_norm": 0.640625, + "learning_rate": 0.00016473434622345047, + "loss": 4.2943, + "step": 7989 + }, + { + "epoch": 0.8284675553597003, + "grad_norm": 0.64453125, + "learning_rate": 0.0001647260666182145, + "loss": 4.3148, + "step": 7990 + }, + { + "epoch": 0.8285712434141884, + "grad_norm": 0.59375, + "learning_rate": 0.00016471778624927938, + "loss": 4.2165, + "step": 7991 + }, + { + "epoch": 0.8286749314686764, + "grad_norm": 0.7109375, + "learning_rate": 0.00016470950511674278, + "loss": 4.2778, + "step": 7992 + }, + { + "epoch": 0.8287786195231646, + "grad_norm": 0.640625, + "learning_rate": 0.00016470122322070245, + "loss": 4.274, + "step": 7993 + }, + { + "epoch": 0.8288823075776526, + "grad_norm": 0.68359375, + "learning_rate": 0.00016469294056125602, + "loss": 4.2821, + "step": 7994 + }, + { + "epoch": 0.8289859956321407, + "grad_norm": 0.62890625, + "learning_rate": 0.0001646846571385013, + "loss": 4.2289, + "step": 7995 + }, + { + "epoch": 0.8290896836866288, + "grad_norm": 0.72265625, + "learning_rate": 0.00016467637295253603, + "loss": 4.3043, + "step": 7996 + }, + { + "epoch": 0.8291933717411168, + "grad_norm": 0.62109375, + "learning_rate": 0.0001646680880034579, + "loss": 4.2625, + "step": 7997 + }, + { + "epoch": 0.829297059795605, + "grad_norm": 0.6875, + "learning_rate": 0.00016465980229136471, + "loss": 4.274, + "step": 7998 + }, + { + "epoch": 0.829400747850093, + "grad_norm": 0.7265625, + "learning_rate": 0.00016465151581635415, + "loss": 4.2753, + "step": 7999 + }, + { + "epoch": 0.8295044359045811, + "grad_norm": 0.7421875, + "learning_rate": 0.0001646432285785241, + "loss": 4.2849, + "step": 8000 + }, + { + "epoch": 0.8296081239590691, + "grad_norm": 0.73828125, + "learning_rate": 0.00016463494057797226, + "loss": 4.2604, + "step": 8001 + }, + { + "epoch": 0.8297118120135573, + "grad_norm": 0.73828125, + "learning_rate": 0.00016462665181479644, + "loss": 4.2643, + "step": 8002 + }, + { + "epoch": 0.8298155000680453, + "grad_norm": 0.77734375, + "learning_rate": 0.00016461836228909445, + "loss": 4.3256, + "step": 8003 + }, + { + "epoch": 0.8299191881225334, + "grad_norm": 0.70703125, + "learning_rate": 0.00016461007200096407, + "loss": 4.309, + "step": 8004 + }, + { + "epoch": 0.8300228761770214, + "grad_norm": 0.69921875, + "learning_rate": 0.00016460178095050316, + "loss": 4.2878, + "step": 8005 + }, + { + "epoch": 0.8301265642315095, + "grad_norm": 0.76171875, + "learning_rate": 0.0001645934891378095, + "loss": 4.3001, + "step": 8006 + }, + { + "epoch": 0.8302302522859976, + "grad_norm": 0.6640625, + "learning_rate": 0.00016458519656298095, + "loss": 4.3005, + "step": 8007 + }, + { + "epoch": 0.8303339403404857, + "grad_norm": 0.72265625, + "learning_rate": 0.00016457690322611537, + "loss": 4.2724, + "step": 8008 + }, + { + "epoch": 0.8304376283949737, + "grad_norm": 0.69921875, + "learning_rate": 0.0001645686091273106, + "loss": 4.2699, + "step": 8009 + }, + { + "epoch": 0.8305413164494618, + "grad_norm": 0.75, + "learning_rate": 0.00016456031426666446, + "loss": 4.3013, + "step": 8010 + }, + { + "epoch": 0.8306450045039498, + "grad_norm": 0.63671875, + "learning_rate": 0.00016455201864427486, + "loss": 4.2032, + "step": 8011 + }, + { + "epoch": 0.830748692558438, + "grad_norm": 0.734375, + "learning_rate": 0.0001645437222602397, + "loss": 4.276, + "step": 8012 + }, + { + "epoch": 0.830852380612926, + "grad_norm": 0.59375, + "learning_rate": 0.00016453542511465682, + "loss": 4.2788, + "step": 8013 + }, + { + "epoch": 0.8309560686674141, + "grad_norm": 0.68359375, + "learning_rate": 0.00016452712720762416, + "loss": 4.2797, + "step": 8014 + }, + { + "epoch": 0.8310597567219021, + "grad_norm": 0.65625, + "learning_rate": 0.00016451882853923958, + "loss": 4.2979, + "step": 8015 + }, + { + "epoch": 0.8311634447763903, + "grad_norm": 0.609375, + "learning_rate": 0.00016451052910960106, + "loss": 4.3467, + "step": 8016 + }, + { + "epoch": 0.8312671328308783, + "grad_norm": 0.66796875, + "learning_rate": 0.00016450222891880646, + "loss": 4.2835, + "step": 8017 + }, + { + "epoch": 0.8313708208853664, + "grad_norm": 0.6015625, + "learning_rate": 0.00016449392796695373, + "loss": 4.284, + "step": 8018 + }, + { + "epoch": 0.8314745089398544, + "grad_norm": 0.671875, + "learning_rate": 0.00016448562625414085, + "loss": 4.32, + "step": 8019 + }, + { + "epoch": 0.8315781969943425, + "grad_norm": 0.5703125, + "learning_rate": 0.0001644773237804657, + "loss": 4.2855, + "step": 8020 + }, + { + "epoch": 0.8316818850488306, + "grad_norm": 0.63671875, + "learning_rate": 0.00016446902054602635, + "loss": 4.294, + "step": 8021 + }, + { + "epoch": 0.8317855731033187, + "grad_norm": 0.54296875, + "learning_rate": 0.00016446071655092068, + "loss": 4.2968, + "step": 8022 + }, + { + "epoch": 0.8318892611578067, + "grad_norm": 0.6640625, + "learning_rate": 0.00016445241179524668, + "loss": 4.2859, + "step": 8023 + }, + { + "epoch": 0.8319929492122948, + "grad_norm": 0.6328125, + "learning_rate": 0.00016444410627910235, + "loss": 4.2405, + "step": 8024 + }, + { + "epoch": 0.8320966372667828, + "grad_norm": 0.62890625, + "learning_rate": 0.0001644358000025857, + "loss": 4.2623, + "step": 8025 + }, + { + "epoch": 0.832200325321271, + "grad_norm": 0.6328125, + "learning_rate": 0.0001644274929657947, + "loss": 4.2707, + "step": 8026 + }, + { + "epoch": 0.832304013375759, + "grad_norm": 0.62890625, + "learning_rate": 0.0001644191851688274, + "loss": 4.2848, + "step": 8027 + }, + { + "epoch": 0.8324077014302471, + "grad_norm": 0.65625, + "learning_rate": 0.00016441087661178181, + "loss": 4.3119, + "step": 8028 + }, + { + "epoch": 0.8325113894847351, + "grad_norm": 0.64453125, + "learning_rate": 0.00016440256729475597, + "loss": 4.3303, + "step": 8029 + }, + { + "epoch": 0.8326150775392233, + "grad_norm": 0.671875, + "learning_rate": 0.0001643942572178479, + "loss": 4.3078, + "step": 8030 + }, + { + "epoch": 0.8327187655937113, + "grad_norm": 0.66015625, + "learning_rate": 0.00016438594638115567, + "loss": 4.3261, + "step": 8031 + }, + { + "epoch": 0.8328224536481994, + "grad_norm": 0.62109375, + "learning_rate": 0.00016437763478477732, + "loss": 4.2575, + "step": 8032 + }, + { + "epoch": 0.8329261417026874, + "grad_norm": 0.72265625, + "learning_rate": 0.00016436932242881093, + "loss": 4.2534, + "step": 8033 + }, + { + "epoch": 0.8330298297571755, + "grad_norm": 0.60546875, + "learning_rate": 0.00016436100931335458, + "loss": 4.2907, + "step": 8034 + }, + { + "epoch": 0.8331335178116636, + "grad_norm": 0.671875, + "learning_rate": 0.00016435269543850636, + "loss": 4.2756, + "step": 8035 + }, + { + "epoch": 0.8332372058661517, + "grad_norm": 0.6875, + "learning_rate": 0.00016434438080436436, + "loss": 4.267, + "step": 8036 + }, + { + "epoch": 0.8333408939206397, + "grad_norm": 0.73828125, + "learning_rate": 0.00016433606541102667, + "loss": 4.2986, + "step": 8037 + }, + { + "epoch": 0.8334445819751278, + "grad_norm": 0.6328125, + "learning_rate": 0.00016432774925859142, + "loss": 4.2979, + "step": 8038 + }, + { + "epoch": 0.8335482700296158, + "grad_norm": 0.71875, + "learning_rate": 0.00016431943234715673, + "loss": 4.2951, + "step": 8039 + }, + { + "epoch": 0.833651958084104, + "grad_norm": 0.67578125, + "learning_rate": 0.0001643111146768207, + "loss": 4.2659, + "step": 8040 + }, + { + "epoch": 0.8337556461385921, + "grad_norm": 0.66796875, + "learning_rate": 0.00016430279624768152, + "loss": 4.2604, + "step": 8041 + }, + { + "epoch": 0.8338593341930801, + "grad_norm": 0.6015625, + "learning_rate": 0.00016429447705983732, + "loss": 4.3355, + "step": 8042 + }, + { + "epoch": 0.8339630222475682, + "grad_norm": 0.70703125, + "learning_rate": 0.00016428615711338624, + "loss": 4.2892, + "step": 8043 + }, + { + "epoch": 0.8340667103020563, + "grad_norm": 0.6328125, + "learning_rate": 0.00016427783640842646, + "loss": 4.282, + "step": 8044 + }, + { + "epoch": 0.8341703983565444, + "grad_norm": 0.7265625, + "learning_rate": 0.00016426951494505617, + "loss": 4.3052, + "step": 8045 + }, + { + "epoch": 0.8342740864110324, + "grad_norm": 0.64453125, + "learning_rate": 0.00016426119272337352, + "loss": 4.2935, + "step": 8046 + }, + { + "epoch": 0.8343777744655205, + "grad_norm": 0.73046875, + "learning_rate": 0.00016425286974347674, + "loss": 4.2614, + "step": 8047 + }, + { + "epoch": 0.8344814625200085, + "grad_norm": 0.6171875, + "learning_rate": 0.000164244546005464, + "loss": 4.2344, + "step": 8048 + }, + { + "epoch": 0.8345851505744967, + "grad_norm": 0.7578125, + "learning_rate": 0.00016423622150943355, + "loss": 4.2243, + "step": 8049 + }, + { + "epoch": 0.8346888386289847, + "grad_norm": 0.65234375, + "learning_rate": 0.00016422789625548356, + "loss": 4.2501, + "step": 8050 + }, + { + "epoch": 0.8347925266834728, + "grad_norm": 0.7734375, + "learning_rate": 0.0001642195702437123, + "loss": 4.2824, + "step": 8051 + }, + { + "epoch": 0.8348962147379608, + "grad_norm": 0.66796875, + "learning_rate": 0.000164211243474218, + "loss": 4.2817, + "step": 8052 + }, + { + "epoch": 0.834999902792449, + "grad_norm": 0.80859375, + "learning_rate": 0.00016420291594709889, + "loss": 4.3298, + "step": 8053 + }, + { + "epoch": 0.835103590846937, + "grad_norm": 0.6796875, + "learning_rate": 0.00016419458766245323, + "loss": 4.283, + "step": 8054 + }, + { + "epoch": 0.8352072789014251, + "grad_norm": 0.86328125, + "learning_rate": 0.00016418625862037932, + "loss": 4.334, + "step": 8055 + }, + { + "epoch": 0.8353109669559131, + "grad_norm": 0.734375, + "learning_rate": 0.00016417792882097537, + "loss": 4.2788, + "step": 8056 + }, + { + "epoch": 0.8354146550104012, + "grad_norm": 0.85546875, + "learning_rate": 0.00016416959826433974, + "loss": 4.3003, + "step": 8057 + }, + { + "epoch": 0.8355183430648893, + "grad_norm": 0.7265625, + "learning_rate": 0.00016416126695057063, + "loss": 4.269, + "step": 8058 + }, + { + "epoch": 0.8356220311193774, + "grad_norm": 0.875, + "learning_rate": 0.00016415293487976644, + "loss": 4.2525, + "step": 8059 + }, + { + "epoch": 0.8357257191738654, + "grad_norm": 0.703125, + "learning_rate": 0.00016414460205202539, + "loss": 4.2904, + "step": 8060 + }, + { + "epoch": 0.8358294072283535, + "grad_norm": 0.92578125, + "learning_rate": 0.00016413626846744584, + "loss": 4.2732, + "step": 8061 + }, + { + "epoch": 0.8359330952828415, + "grad_norm": 0.74609375, + "learning_rate": 0.00016412793412612614, + "loss": 4.2605, + "step": 8062 + }, + { + "epoch": 0.8360367833373297, + "grad_norm": 0.94921875, + "learning_rate": 0.00016411959902816462, + "loss": 4.2769, + "step": 8063 + }, + { + "epoch": 0.8361404713918177, + "grad_norm": 0.7421875, + "learning_rate": 0.00016411126317365958, + "loss": 4.2547, + "step": 8064 + }, + { + "epoch": 0.8362441594463058, + "grad_norm": 0.8671875, + "learning_rate": 0.00016410292656270943, + "loss": 4.2837, + "step": 8065 + }, + { + "epoch": 0.8363478475007938, + "grad_norm": 0.82421875, + "learning_rate": 0.00016409458919541248, + "loss": 4.2996, + "step": 8066 + }, + { + "epoch": 0.836451535555282, + "grad_norm": 0.8671875, + "learning_rate": 0.00016408625107186713, + "loss": 4.2966, + "step": 8067 + }, + { + "epoch": 0.83655522360977, + "grad_norm": 0.84765625, + "learning_rate": 0.00016407791219217178, + "loss": 4.2681, + "step": 8068 + }, + { + "epoch": 0.8366589116642581, + "grad_norm": 0.73828125, + "learning_rate": 0.0001640695725564248, + "loss": 4.2706, + "step": 8069 + }, + { + "epoch": 0.8367625997187461, + "grad_norm": 0.7890625, + "learning_rate": 0.00016406123216472452, + "loss": 4.2618, + "step": 8070 + }, + { + "epoch": 0.8368662877732342, + "grad_norm": 0.765625, + "learning_rate": 0.00016405289101716953, + "loss": 4.3088, + "step": 8071 + }, + { + "epoch": 0.8369699758277223, + "grad_norm": 0.75, + "learning_rate": 0.00016404454911385805, + "loss": 4.2826, + "step": 8072 + }, + { + "epoch": 0.8370736638822104, + "grad_norm": 0.81640625, + "learning_rate": 0.00016403620645488858, + "loss": 4.2745, + "step": 8073 + }, + { + "epoch": 0.8371773519366984, + "grad_norm": 0.71875, + "learning_rate": 0.00016402786304035963, + "loss": 4.3094, + "step": 8074 + }, + { + "epoch": 0.8372810399911865, + "grad_norm": 0.73828125, + "learning_rate": 0.00016401951887036952, + "loss": 4.2542, + "step": 8075 + }, + { + "epoch": 0.8373847280456745, + "grad_norm": 0.68359375, + "learning_rate": 0.0001640111739450168, + "loss": 4.3313, + "step": 8076 + }, + { + "epoch": 0.8374884161001627, + "grad_norm": 0.69921875, + "learning_rate": 0.00016400282826439986, + "loss": 4.31, + "step": 8077 + }, + { + "epoch": 0.8375921041546507, + "grad_norm": 0.8125, + "learning_rate": 0.0001639944818286172, + "loss": 4.2806, + "step": 8078 + }, + { + "epoch": 0.8376957922091388, + "grad_norm": 0.7109375, + "learning_rate": 0.0001639861346377673, + "loss": 4.2812, + "step": 8079 + }, + { + "epoch": 0.8377994802636268, + "grad_norm": 0.90625, + "learning_rate": 0.0001639777866919487, + "loss": 4.2328, + "step": 8080 + }, + { + "epoch": 0.837903168318115, + "grad_norm": 0.6875, + "learning_rate": 0.0001639694379912598, + "loss": 4.2796, + "step": 8081 + }, + { + "epoch": 0.838006856372603, + "grad_norm": 0.85546875, + "learning_rate": 0.00016396108853579912, + "loss": 4.2733, + "step": 8082 + }, + { + "epoch": 0.8381105444270911, + "grad_norm": 0.69140625, + "learning_rate": 0.00016395273832566523, + "loss": 4.292, + "step": 8083 + }, + { + "epoch": 0.8382142324815792, + "grad_norm": 0.90234375, + "learning_rate": 0.00016394438736095667, + "loss": 4.304, + "step": 8084 + }, + { + "epoch": 0.8383179205360672, + "grad_norm": 0.6875, + "learning_rate": 0.0001639360356417719, + "loss": 4.2959, + "step": 8085 + }, + { + "epoch": 0.8384216085905554, + "grad_norm": 0.84375, + "learning_rate": 0.00016392768316820947, + "loss": 4.2942, + "step": 8086 + }, + { + "epoch": 0.8385252966450434, + "grad_norm": 0.671875, + "learning_rate": 0.000163919329940368, + "loss": 4.2869, + "step": 8087 + }, + { + "epoch": 0.8386289846995315, + "grad_norm": 0.859375, + "learning_rate": 0.00016391097595834598, + "loss": 4.2924, + "step": 8088 + }, + { + "epoch": 0.8387326727540195, + "grad_norm": 0.66015625, + "learning_rate": 0.00016390262122224195, + "loss": 4.3011, + "step": 8089 + }, + { + "epoch": 0.8388363608085077, + "grad_norm": 0.90625, + "learning_rate": 0.00016389426573215458, + "loss": 4.2705, + "step": 8090 + }, + { + "epoch": 0.8389400488629957, + "grad_norm": 0.67578125, + "learning_rate": 0.00016388590948818245, + "loss": 4.2856, + "step": 8091 + }, + { + "epoch": 0.8390437369174838, + "grad_norm": 0.83984375, + "learning_rate": 0.00016387755249042406, + "loss": 4.2641, + "step": 8092 + }, + { + "epoch": 0.8391474249719718, + "grad_norm": 0.6796875, + "learning_rate": 0.00016386919473897812, + "loss": 4.3175, + "step": 8093 + }, + { + "epoch": 0.8392511130264599, + "grad_norm": 0.7890625, + "learning_rate": 0.00016386083623394314, + "loss": 4.2739, + "step": 8094 + }, + { + "epoch": 0.839354801080948, + "grad_norm": 0.734375, + "learning_rate": 0.00016385247697541785, + "loss": 4.2844, + "step": 8095 + }, + { + "epoch": 0.8394584891354361, + "grad_norm": 0.7890625, + "learning_rate": 0.00016384411696350083, + "loss": 4.2292, + "step": 8096 + }, + { + "epoch": 0.8395621771899241, + "grad_norm": 0.75, + "learning_rate": 0.00016383575619829065, + "loss": 4.2973, + "step": 8097 + }, + { + "epoch": 0.8396658652444122, + "grad_norm": 0.74609375, + "learning_rate": 0.0001638273946798861, + "loss": 4.2837, + "step": 8098 + }, + { + "epoch": 0.8397695532989002, + "grad_norm": 0.7734375, + "learning_rate": 0.0001638190324083857, + "loss": 4.2605, + "step": 8099 + }, + { + "epoch": 0.8398732413533884, + "grad_norm": 0.75390625, + "learning_rate": 0.0001638106693838882, + "loss": 4.2699, + "step": 8100 + }, + { + "epoch": 0.8399769294078764, + "grad_norm": 0.75390625, + "learning_rate": 0.00016380230560649226, + "loss": 4.2393, + "step": 8101 + }, + { + "epoch": 0.8400806174623645, + "grad_norm": 0.71875, + "learning_rate": 0.00016379394107629656, + "loss": 4.314, + "step": 8102 + }, + { + "epoch": 0.8401843055168525, + "grad_norm": 0.73828125, + "learning_rate": 0.00016378557579339978, + "loss": 4.2962, + "step": 8103 + }, + { + "epoch": 0.8402879935713407, + "grad_norm": 0.71875, + "learning_rate": 0.00016377720975790062, + "loss": 4.301, + "step": 8104 + }, + { + "epoch": 0.8403916816258287, + "grad_norm": 0.75390625, + "learning_rate": 0.00016376884296989783, + "loss": 4.2674, + "step": 8105 + }, + { + "epoch": 0.8404953696803168, + "grad_norm": 0.69140625, + "learning_rate": 0.00016376047542949007, + "loss": 4.3346, + "step": 8106 + }, + { + "epoch": 0.8405990577348048, + "grad_norm": 0.74609375, + "learning_rate": 0.00016375210713677612, + "loss": 4.28, + "step": 8107 + }, + { + "epoch": 0.8407027457892929, + "grad_norm": 0.703125, + "learning_rate": 0.0001637437380918547, + "loss": 4.3053, + "step": 8108 + }, + { + "epoch": 0.840806433843781, + "grad_norm": 0.69140625, + "learning_rate": 0.0001637353682948245, + "loss": 4.3224, + "step": 8109 + }, + { + "epoch": 0.8409101218982691, + "grad_norm": 0.76171875, + "learning_rate": 0.00016372699774578435, + "loss": 4.3216, + "step": 8110 + }, + { + "epoch": 0.8410138099527571, + "grad_norm": 0.65234375, + "learning_rate": 0.000163718626444833, + "loss": 4.2668, + "step": 8111 + }, + { + "epoch": 0.8411174980072452, + "grad_norm": 0.76953125, + "learning_rate": 0.00016371025439206922, + "loss": 4.2696, + "step": 8112 + }, + { + "epoch": 0.8412211860617332, + "grad_norm": 0.6484375, + "learning_rate": 0.00016370188158759176, + "loss": 4.2908, + "step": 8113 + }, + { + "epoch": 0.8413248741162214, + "grad_norm": 0.78515625, + "learning_rate": 0.00016369350803149946, + "loss": 4.3051, + "step": 8114 + }, + { + "epoch": 0.8414285621707094, + "grad_norm": 0.6875, + "learning_rate": 0.00016368513372389107, + "loss": 4.2693, + "step": 8115 + }, + { + "epoch": 0.8415322502251975, + "grad_norm": 0.6640625, + "learning_rate": 0.0001636767586648654, + "loss": 4.3012, + "step": 8116 + }, + { + "epoch": 0.8416359382796855, + "grad_norm": 0.59375, + "learning_rate": 0.00016366838285452134, + "loss": 4.2654, + "step": 8117 + }, + { + "epoch": 0.8417396263341737, + "grad_norm": 0.66796875, + "learning_rate": 0.00016366000629295763, + "loss": 4.2762, + "step": 8118 + }, + { + "epoch": 0.8418433143886617, + "grad_norm": 0.59375, + "learning_rate": 0.00016365162898027315, + "loss": 4.2977, + "step": 8119 + }, + { + "epoch": 0.8419470024431498, + "grad_norm": 0.69140625, + "learning_rate": 0.0001636432509165667, + "loss": 4.2502, + "step": 8120 + }, + { + "epoch": 0.8420506904976378, + "grad_norm": 0.59375, + "learning_rate": 0.00016363487210193723, + "loss": 4.3008, + "step": 8121 + }, + { + "epoch": 0.8421543785521259, + "grad_norm": 0.6484375, + "learning_rate": 0.0001636264925364835, + "loss": 4.2472, + "step": 8122 + }, + { + "epoch": 0.842258066606614, + "grad_norm": 0.62109375, + "learning_rate": 0.0001636181122203044, + "loss": 4.2992, + "step": 8123 + }, + { + "epoch": 0.8423617546611021, + "grad_norm": 0.6484375, + "learning_rate": 0.0001636097311534988, + "loss": 4.2512, + "step": 8124 + }, + { + "epoch": 0.8424654427155901, + "grad_norm": 0.59765625, + "learning_rate": 0.00016360134933616566, + "loss": 4.288, + "step": 8125 + }, + { + "epoch": 0.8425691307700782, + "grad_norm": 0.65234375, + "learning_rate": 0.00016359296676840378, + "loss": 4.3224, + "step": 8126 + }, + { + "epoch": 0.8426728188245662, + "grad_norm": 0.63671875, + "learning_rate": 0.00016358458345031216, + "loss": 4.2729, + "step": 8127 + }, + { + "epoch": 0.8427765068790544, + "grad_norm": 0.65625, + "learning_rate": 0.00016357619938198962, + "loss": 4.3161, + "step": 8128 + }, + { + "epoch": 0.8428801949335425, + "grad_norm": 0.64453125, + "learning_rate": 0.00016356781456353518, + "loss": 4.3149, + "step": 8129 + }, + { + "epoch": 0.8429838829880305, + "grad_norm": 0.73828125, + "learning_rate": 0.0001635594289950477, + "loss": 4.2801, + "step": 8130 + }, + { + "epoch": 0.8430875710425186, + "grad_norm": 0.68359375, + "learning_rate": 0.00016355104267662613, + "loss": 4.2477, + "step": 8131 + }, + { + "epoch": 0.8431912590970067, + "grad_norm": 0.6484375, + "learning_rate": 0.00016354265560836946, + "loss": 4.2534, + "step": 8132 + }, + { + "epoch": 0.8432949471514948, + "grad_norm": 0.72265625, + "learning_rate": 0.0001635342677903766, + "loss": 4.2756, + "step": 8133 + }, + { + "epoch": 0.8433986352059828, + "grad_norm": 0.66015625, + "learning_rate": 0.00016352587922274653, + "loss": 4.2894, + "step": 8134 + }, + { + "epoch": 0.8435023232604709, + "grad_norm": 0.71484375, + "learning_rate": 0.00016351748990557825, + "loss": 4.2701, + "step": 8135 + }, + { + "epoch": 0.8436060113149589, + "grad_norm": 0.77734375, + "learning_rate": 0.00016350909983897075, + "loss": 4.2833, + "step": 8136 + }, + { + "epoch": 0.8437096993694471, + "grad_norm": 0.64453125, + "learning_rate": 0.00016350070902302298, + "loss": 4.301, + "step": 8137 + }, + { + "epoch": 0.8438133874239351, + "grad_norm": 0.78515625, + "learning_rate": 0.00016349231745783397, + "loss": 4.3139, + "step": 8138 + }, + { + "epoch": 0.8439170754784232, + "grad_norm": 0.6875, + "learning_rate": 0.00016348392514350273, + "loss": 4.3215, + "step": 8139 + }, + { + "epoch": 0.8440207635329112, + "grad_norm": 0.72265625, + "learning_rate": 0.00016347553208012828, + "loss": 4.2747, + "step": 8140 + }, + { + "epoch": 0.8441244515873993, + "grad_norm": 0.74609375, + "learning_rate": 0.00016346713826780965, + "loss": 4.277, + "step": 8141 + }, + { + "epoch": 0.8442281396418874, + "grad_norm": 0.734375, + "learning_rate": 0.00016345874370664583, + "loss": 4.2962, + "step": 8142 + }, + { + "epoch": 0.8443318276963755, + "grad_norm": 0.71484375, + "learning_rate": 0.00016345034839673598, + "loss": 4.2753, + "step": 8143 + }, + { + "epoch": 0.8444355157508635, + "grad_norm": 0.70703125, + "learning_rate": 0.00016344195233817908, + "loss": 4.2507, + "step": 8144 + }, + { + "epoch": 0.8445392038053516, + "grad_norm": 0.80078125, + "learning_rate": 0.00016343355553107415, + "loss": 4.2846, + "step": 8145 + }, + { + "epoch": 0.8446428918598397, + "grad_norm": 0.69921875, + "learning_rate": 0.0001634251579755204, + "loss": 4.2818, + "step": 8146 + }, + { + "epoch": 0.8447465799143278, + "grad_norm": 0.8359375, + "learning_rate": 0.00016341675967161676, + "loss": 4.3059, + "step": 8147 + }, + { + "epoch": 0.8448502679688158, + "grad_norm": 0.7109375, + "learning_rate": 0.00016340836061946244, + "loss": 4.2929, + "step": 8148 + }, + { + "epoch": 0.8449539560233039, + "grad_norm": 0.84375, + "learning_rate": 0.00016339996081915644, + "loss": 4.3118, + "step": 8149 + }, + { + "epoch": 0.8450576440777919, + "grad_norm": 0.76953125, + "learning_rate": 0.00016339156027079797, + "loss": 4.2649, + "step": 8150 + }, + { + "epoch": 0.8451613321322801, + "grad_norm": 0.77734375, + "learning_rate": 0.00016338315897448605, + "loss": 4.2986, + "step": 8151 + }, + { + "epoch": 0.8452650201867681, + "grad_norm": 0.7734375, + "learning_rate": 0.00016337475693031984, + "loss": 4.3008, + "step": 8152 + }, + { + "epoch": 0.8453687082412562, + "grad_norm": 0.76953125, + "learning_rate": 0.00016336635413839853, + "loss": 4.2551, + "step": 8153 + }, + { + "epoch": 0.8454723962957442, + "grad_norm": 0.78125, + "learning_rate": 0.00016335795059882125, + "loss": 4.3054, + "step": 8154 + }, + { + "epoch": 0.8455760843502323, + "grad_norm": 0.76171875, + "learning_rate": 0.00016334954631168708, + "loss": 4.2829, + "step": 8155 + }, + { + "epoch": 0.8456797724047204, + "grad_norm": 0.7734375, + "learning_rate": 0.00016334114127709523, + "loss": 4.2798, + "step": 8156 + }, + { + "epoch": 0.8457834604592085, + "grad_norm": 0.85546875, + "learning_rate": 0.00016333273549514488, + "loss": 4.3232, + "step": 8157 + }, + { + "epoch": 0.8458871485136965, + "grad_norm": 0.828125, + "learning_rate": 0.0001633243289659352, + "loss": 4.2421, + "step": 8158 + }, + { + "epoch": 0.8459908365681846, + "grad_norm": 0.796875, + "learning_rate": 0.00016331592168956533, + "loss": 4.2662, + "step": 8159 + }, + { + "epoch": 0.8460945246226727, + "grad_norm": 0.77734375, + "learning_rate": 0.00016330751366613454, + "loss": 4.2964, + "step": 8160 + }, + { + "epoch": 0.8461982126771608, + "grad_norm": 0.765625, + "learning_rate": 0.00016329910489574202, + "loss": 4.3097, + "step": 8161 + }, + { + "epoch": 0.8463019007316488, + "grad_norm": 0.82421875, + "learning_rate": 0.00016329069537848696, + "loss": 4.249, + "step": 8162 + }, + { + "epoch": 0.8464055887861369, + "grad_norm": 0.73828125, + "learning_rate": 0.0001632822851144686, + "loss": 4.308, + "step": 8163 + }, + { + "epoch": 0.8465092768406249, + "grad_norm": 0.82421875, + "learning_rate": 0.00016327387410378617, + "loss": 4.2608, + "step": 8164 + }, + { + "epoch": 0.8466129648951131, + "grad_norm": 0.73828125, + "learning_rate": 0.00016326546234653893, + "loss": 4.3032, + "step": 8165 + }, + { + "epoch": 0.8467166529496011, + "grad_norm": 0.76953125, + "learning_rate": 0.0001632570498428261, + "loss": 4.2588, + "step": 8166 + }, + { + "epoch": 0.8468203410040892, + "grad_norm": 0.66015625, + "learning_rate": 0.00016324863659274692, + "loss": 4.2746, + "step": 8167 + }, + { + "epoch": 0.8469240290585772, + "grad_norm": 0.734375, + "learning_rate": 0.0001632402225964007, + "loss": 4.2657, + "step": 8168 + }, + { + "epoch": 0.8470277171130653, + "grad_norm": 0.73828125, + "learning_rate": 0.0001632318078538867, + "loss": 4.2658, + "step": 8169 + }, + { + "epoch": 0.8471314051675534, + "grad_norm": 0.75, + "learning_rate": 0.0001632233923653042, + "loss": 4.2676, + "step": 8170 + }, + { + "epoch": 0.8472350932220415, + "grad_norm": 0.7734375, + "learning_rate": 0.00016321497613075252, + "loss": 4.2574, + "step": 8171 + }, + { + "epoch": 0.8473387812765295, + "grad_norm": 0.8359375, + "learning_rate": 0.0001632065591503309, + "loss": 4.2547, + "step": 8172 + }, + { + "epoch": 0.8474424693310176, + "grad_norm": 0.69921875, + "learning_rate": 0.00016319814142413874, + "loss": 4.2806, + "step": 8173 + }, + { + "epoch": 0.8475461573855058, + "grad_norm": 0.81640625, + "learning_rate": 0.00016318972295227527, + "loss": 4.2752, + "step": 8174 + }, + { + "epoch": 0.8476498454399938, + "grad_norm": 0.66015625, + "learning_rate": 0.00016318130373483994, + "loss": 4.2638, + "step": 8175 + }, + { + "epoch": 0.8477535334944819, + "grad_norm": 0.8046875, + "learning_rate": 0.00016317288377193197, + "loss": 4.2541, + "step": 8176 + }, + { + "epoch": 0.8478572215489699, + "grad_norm": 0.72265625, + "learning_rate": 0.00016316446306365072, + "loss": 4.3027, + "step": 8177 + }, + { + "epoch": 0.847960909603458, + "grad_norm": 0.83984375, + "learning_rate": 0.0001631560416100956, + "loss": 4.2608, + "step": 8178 + }, + { + "epoch": 0.8480645976579461, + "grad_norm": 0.7734375, + "learning_rate": 0.00016314761941136594, + "loss": 4.2515, + "step": 8179 + }, + { + "epoch": 0.8481682857124342, + "grad_norm": 0.79296875, + "learning_rate": 0.00016313919646756113, + "loss": 4.2605, + "step": 8180 + }, + { + "epoch": 0.8482719737669222, + "grad_norm": 0.7265625, + "learning_rate": 0.00016313077277878052, + "loss": 4.3121, + "step": 8181 + }, + { + "epoch": 0.8483756618214103, + "grad_norm": 0.71875, + "learning_rate": 0.00016312234834512355, + "loss": 4.2779, + "step": 8182 + }, + { + "epoch": 0.8484793498758983, + "grad_norm": 0.671875, + "learning_rate": 0.0001631139231666896, + "loss": 4.2677, + "step": 8183 + }, + { + "epoch": 0.8485830379303865, + "grad_norm": 0.7109375, + "learning_rate": 0.00016310549724357802, + "loss": 4.2662, + "step": 8184 + }, + { + "epoch": 0.8486867259848745, + "grad_norm": 0.65625, + "learning_rate": 0.00016309707057588833, + "loss": 4.305, + "step": 8185 + }, + { + "epoch": 0.8487904140393626, + "grad_norm": 0.6875, + "learning_rate": 0.00016308864316371988, + "loss": 4.2841, + "step": 8186 + }, + { + "epoch": 0.8488941020938506, + "grad_norm": 0.7421875, + "learning_rate": 0.00016308021500717213, + "loss": 4.2788, + "step": 8187 + }, + { + "epoch": 0.8489977901483388, + "grad_norm": 0.71484375, + "learning_rate": 0.00016307178610634453, + "loss": 4.307, + "step": 8188 + }, + { + "epoch": 0.8491014782028268, + "grad_norm": 0.796875, + "learning_rate": 0.00016306335646133652, + "loss": 4.2861, + "step": 8189 + }, + { + "epoch": 0.8492051662573149, + "grad_norm": 0.7265625, + "learning_rate": 0.00016305492607224755, + "loss": 4.2639, + "step": 8190 + }, + { + "epoch": 0.8493088543118029, + "grad_norm": 0.75, + "learning_rate": 0.00016304649493917715, + "loss": 4.324, + "step": 8191 + }, + { + "epoch": 0.849412542366291, + "grad_norm": 0.66015625, + "learning_rate": 0.00016303806306222472, + "loss": 4.2783, + "step": 8192 + }, + { + "epoch": 0.8495162304207791, + "grad_norm": 0.79296875, + "learning_rate": 0.00016302963044148978, + "loss": 4.2854, + "step": 8193 + }, + { + "epoch": 0.8496199184752672, + "grad_norm": 0.69921875, + "learning_rate": 0.00016302119707707182, + "loss": 4.2542, + "step": 8194 + }, + { + "epoch": 0.8497236065297552, + "grad_norm": 0.7421875, + "learning_rate": 0.00016301276296907033, + "loss": 4.2758, + "step": 8195 + }, + { + "epoch": 0.8498272945842433, + "grad_norm": 0.6875, + "learning_rate": 0.0001630043281175849, + "loss": 4.248, + "step": 8196 + }, + { + "epoch": 0.8499309826387313, + "grad_norm": 0.7890625, + "learning_rate": 0.00016299589252271495, + "loss": 4.2591, + "step": 8197 + }, + { + "epoch": 0.8500346706932195, + "grad_norm": 0.78125, + "learning_rate": 0.0001629874561845601, + "loss": 4.2892, + "step": 8198 + }, + { + "epoch": 0.8501383587477075, + "grad_norm": 0.8046875, + "learning_rate": 0.0001629790191032198, + "loss": 4.3016, + "step": 8199 + }, + { + "epoch": 0.8502420468021956, + "grad_norm": 0.70703125, + "learning_rate": 0.00016297058127879367, + "loss": 4.3234, + "step": 8200 + }, + { + "epoch": 0.8503457348566836, + "grad_norm": 0.71484375, + "learning_rate": 0.00016296214271138126, + "loss": 4.3115, + "step": 8201 + }, + { + "epoch": 0.8504494229111718, + "grad_norm": 0.73046875, + "learning_rate": 0.0001629537034010821, + "loss": 4.2885, + "step": 8202 + }, + { + "epoch": 0.8505531109656598, + "grad_norm": 0.65234375, + "learning_rate": 0.00016294526334799576, + "loss": 4.2767, + "step": 8203 + }, + { + "epoch": 0.8506567990201479, + "grad_norm": 0.77734375, + "learning_rate": 0.00016293682255222192, + "loss": 4.2572, + "step": 8204 + }, + { + "epoch": 0.8507604870746359, + "grad_norm": 0.640625, + "learning_rate": 0.00016292838101386004, + "loss": 4.2418, + "step": 8205 + }, + { + "epoch": 0.850864175129124, + "grad_norm": 0.73828125, + "learning_rate": 0.00016291993873300979, + "loss": 4.2631, + "step": 8206 + }, + { + "epoch": 0.8509678631836121, + "grad_norm": 0.80078125, + "learning_rate": 0.00016291149570977077, + "loss": 4.2422, + "step": 8207 + }, + { + "epoch": 0.8510715512381002, + "grad_norm": 0.703125, + "learning_rate": 0.00016290305194424263, + "loss": 4.3361, + "step": 8208 + }, + { + "epoch": 0.8511752392925882, + "grad_norm": 0.72265625, + "learning_rate": 0.00016289460743652495, + "loss": 4.3048, + "step": 8209 + }, + { + "epoch": 0.8512789273470763, + "grad_norm": 0.671875, + "learning_rate": 0.0001628861621867174, + "loss": 4.2793, + "step": 8210 + }, + { + "epoch": 0.8513826154015643, + "grad_norm": 0.81640625, + "learning_rate": 0.00016287771619491958, + "loss": 4.2686, + "step": 8211 + }, + { + "epoch": 0.8514863034560525, + "grad_norm": 0.703125, + "learning_rate": 0.0001628692694612312, + "loss": 4.2783, + "step": 8212 + }, + { + "epoch": 0.8515899915105405, + "grad_norm": 0.859375, + "learning_rate": 0.00016286082198575187, + "loss": 4.2605, + "step": 8213 + }, + { + "epoch": 0.8516936795650286, + "grad_norm": 0.67578125, + "learning_rate": 0.0001628523737685813, + "loss": 4.3076, + "step": 8214 + }, + { + "epoch": 0.8517973676195166, + "grad_norm": 0.82421875, + "learning_rate": 0.00016284392480981916, + "loss": 4.2711, + "step": 8215 + }, + { + "epoch": 0.8519010556740048, + "grad_norm": 0.68359375, + "learning_rate": 0.00016283547510956514, + "loss": 4.2939, + "step": 8216 + }, + { + "epoch": 0.8520047437284928, + "grad_norm": 0.8125, + "learning_rate": 0.00016282702466791893, + "loss": 4.2644, + "step": 8217 + }, + { + "epoch": 0.8521084317829809, + "grad_norm": 0.75390625, + "learning_rate": 0.00016281857348498027, + "loss": 4.3043, + "step": 8218 + }, + { + "epoch": 0.852212119837469, + "grad_norm": 0.83203125, + "learning_rate": 0.0001628101215608488, + "loss": 4.2732, + "step": 8219 + }, + { + "epoch": 0.852315807891957, + "grad_norm": 0.7109375, + "learning_rate": 0.0001628016688956243, + "loss": 4.248, + "step": 8220 + }, + { + "epoch": 0.8524194959464452, + "grad_norm": 0.79296875, + "learning_rate": 0.0001627932154894065, + "loss": 4.26, + "step": 8221 + }, + { + "epoch": 0.8525231840009332, + "grad_norm": 0.7109375, + "learning_rate": 0.00016278476134229514, + "loss": 4.2633, + "step": 8222 + }, + { + "epoch": 0.8526268720554213, + "grad_norm": 0.8515625, + "learning_rate": 0.00016277630645438991, + "loss": 4.2904, + "step": 8223 + }, + { + "epoch": 0.8527305601099093, + "grad_norm": 0.7578125, + "learning_rate": 0.0001627678508257907, + "loss": 4.2674, + "step": 8224 + }, + { + "epoch": 0.8528342481643975, + "grad_norm": 0.8046875, + "learning_rate": 0.00016275939445659714, + "loss": 4.2886, + "step": 8225 + }, + { + "epoch": 0.8529379362188855, + "grad_norm": 0.83984375, + "learning_rate": 0.0001627509373469091, + "loss": 4.2872, + "step": 8226 + }, + { + "epoch": 0.8530416242733736, + "grad_norm": 0.78515625, + "learning_rate": 0.0001627424794968263, + "loss": 4.26, + "step": 8227 + }, + { + "epoch": 0.8531453123278616, + "grad_norm": 0.828125, + "learning_rate": 0.00016273402090644855, + "loss": 4.2756, + "step": 8228 + }, + { + "epoch": 0.8532490003823497, + "grad_norm": 0.79296875, + "learning_rate": 0.00016272556157587574, + "loss": 4.264, + "step": 8229 + }, + { + "epoch": 0.8533526884368378, + "grad_norm": 0.7890625, + "learning_rate": 0.00016271710150520753, + "loss": 4.2563, + "step": 8230 + }, + { + "epoch": 0.8534563764913259, + "grad_norm": 0.7890625, + "learning_rate": 0.00016270864069454385, + "loss": 4.2632, + "step": 8231 + }, + { + "epoch": 0.8535600645458139, + "grad_norm": 0.76953125, + "learning_rate": 0.00016270017914398453, + "loss": 4.2908, + "step": 8232 + }, + { + "epoch": 0.853663752600302, + "grad_norm": 0.80078125, + "learning_rate": 0.00016269171685362932, + "loss": 4.3089, + "step": 8233 + }, + { + "epoch": 0.85376744065479, + "grad_norm": 0.703125, + "learning_rate": 0.00016268325382357815, + "loss": 4.3015, + "step": 8234 + }, + { + "epoch": 0.8538711287092782, + "grad_norm": 0.76953125, + "learning_rate": 0.0001626747900539308, + "loss": 4.271, + "step": 8235 + }, + { + "epoch": 0.8539748167637662, + "grad_norm": 0.75390625, + "learning_rate": 0.00016266632554478723, + "loss": 4.2738, + "step": 8236 + }, + { + "epoch": 0.8540785048182543, + "grad_norm": 0.875, + "learning_rate": 0.00016265786029624724, + "loss": 4.2862, + "step": 8237 + }, + { + "epoch": 0.8541821928727423, + "grad_norm": 0.7890625, + "learning_rate": 0.00016264939430841072, + "loss": 4.2891, + "step": 8238 + }, + { + "epoch": 0.8542858809272305, + "grad_norm": 0.875, + "learning_rate": 0.00016264092758137757, + "loss": 4.2754, + "step": 8239 + }, + { + "epoch": 0.8543895689817185, + "grad_norm": 0.8125, + "learning_rate": 0.0001626324601152477, + "loss": 4.2848, + "step": 8240 + }, + { + "epoch": 0.8544932570362066, + "grad_norm": 0.79296875, + "learning_rate": 0.00016262399191012102, + "loss": 4.2629, + "step": 8241 + }, + { + "epoch": 0.8545969450906946, + "grad_norm": 0.796875, + "learning_rate": 0.00016261552296609742, + "loss": 4.2785, + "step": 8242 + }, + { + "epoch": 0.8547006331451827, + "grad_norm": 0.7890625, + "learning_rate": 0.00016260705328327682, + "loss": 4.2961, + "step": 8243 + }, + { + "epoch": 0.8548043211996708, + "grad_norm": 0.7578125, + "learning_rate": 0.0001625985828617592, + "loss": 4.2976, + "step": 8244 + }, + { + "epoch": 0.8549080092541589, + "grad_norm": 0.68359375, + "learning_rate": 0.00016259011170164443, + "loss": 4.3077, + "step": 8245 + }, + { + "epoch": 0.8550116973086469, + "grad_norm": 0.69921875, + "learning_rate": 0.00016258163980303254, + "loss": 4.264, + "step": 8246 + }, + { + "epoch": 0.855115385363135, + "grad_norm": 0.72265625, + "learning_rate": 0.00016257316716602343, + "loss": 4.3042, + "step": 8247 + }, + { + "epoch": 0.855219073417623, + "grad_norm": 0.73828125, + "learning_rate": 0.0001625646937907171, + "loss": 4.2716, + "step": 8248 + }, + { + "epoch": 0.8553227614721112, + "grad_norm": 0.76171875, + "learning_rate": 0.0001625562196772135, + "loss": 4.199, + "step": 8249 + }, + { + "epoch": 0.8554264495265992, + "grad_norm": 0.765625, + "learning_rate": 0.00016254774482561267, + "loss": 4.315, + "step": 8250 + }, + { + "epoch": 0.8555301375810873, + "grad_norm": 0.70703125, + "learning_rate": 0.00016253926923601454, + "loss": 4.2734, + "step": 8251 + }, + { + "epoch": 0.8556338256355753, + "grad_norm": 0.73828125, + "learning_rate": 0.00016253079290851915, + "loss": 4.2881, + "step": 8252 + }, + { + "epoch": 0.8557375136900635, + "grad_norm": 0.71484375, + "learning_rate": 0.00016252231584322648, + "loss": 4.3141, + "step": 8253 + }, + { + "epoch": 0.8558412017445515, + "grad_norm": 0.77734375, + "learning_rate": 0.0001625138380402366, + "loss": 4.3066, + "step": 8254 + }, + { + "epoch": 0.8559448897990396, + "grad_norm": 0.796875, + "learning_rate": 0.00016250535949964952, + "loss": 4.3038, + "step": 8255 + }, + { + "epoch": 0.8560485778535276, + "grad_norm": 0.6796875, + "learning_rate": 0.00016249688022156524, + "loss": 4.2631, + "step": 8256 + }, + { + "epoch": 0.8561522659080157, + "grad_norm": 0.69921875, + "learning_rate": 0.00016248840020608384, + "loss": 4.2473, + "step": 8257 + }, + { + "epoch": 0.8562559539625038, + "grad_norm": 0.69140625, + "learning_rate": 0.0001624799194533054, + "loss": 4.2545, + "step": 8258 + }, + { + "epoch": 0.8563596420169919, + "grad_norm": 0.75, + "learning_rate": 0.0001624714379633299, + "loss": 4.2926, + "step": 8259 + }, + { + "epoch": 0.8564633300714799, + "grad_norm": 0.6796875, + "learning_rate": 0.0001624629557362575, + "loss": 4.2821, + "step": 8260 + }, + { + "epoch": 0.856567018125968, + "grad_norm": 0.76171875, + "learning_rate": 0.00016245447277218823, + "loss": 4.2727, + "step": 8261 + }, + { + "epoch": 0.856670706180456, + "grad_norm": 0.6171875, + "learning_rate": 0.00016244598907122222, + "loss": 4.2537, + "step": 8262 + }, + { + "epoch": 0.8567743942349442, + "grad_norm": 0.66015625, + "learning_rate": 0.00016243750463345953, + "loss": 4.274, + "step": 8263 + }, + { + "epoch": 0.8568780822894323, + "grad_norm": 0.671875, + "learning_rate": 0.00016242901945900031, + "loss": 4.2731, + "step": 8264 + }, + { + "epoch": 0.8569817703439203, + "grad_norm": 0.5625, + "learning_rate": 0.00016242053354794463, + "loss": 4.2518, + "step": 8265 + }, + { + "epoch": 0.8570854583984084, + "grad_norm": 0.63671875, + "learning_rate": 0.00016241204690039266, + "loss": 4.2789, + "step": 8266 + }, + { + "epoch": 0.8571891464528965, + "grad_norm": 0.6796875, + "learning_rate": 0.00016240355951644447, + "loss": 4.2943, + "step": 8267 + }, + { + "epoch": 0.8572928345073846, + "grad_norm": 0.6640625, + "learning_rate": 0.00016239507139620028, + "loss": 4.2876, + "step": 8268 + }, + { + "epoch": 0.8573965225618726, + "grad_norm": 0.68359375, + "learning_rate": 0.00016238658253976018, + "loss": 4.2763, + "step": 8269 + }, + { + "epoch": 0.8575002106163607, + "grad_norm": 0.6953125, + "learning_rate": 0.00016237809294722435, + "loss": 4.2659, + "step": 8270 + }, + { + "epoch": 0.8576038986708487, + "grad_norm": 0.6796875, + "learning_rate": 0.00016236960261869297, + "loss": 4.2733, + "step": 8271 + }, + { + "epoch": 0.8577075867253369, + "grad_norm": 0.69921875, + "learning_rate": 0.0001623611115542662, + "loss": 4.2487, + "step": 8272 + }, + { + "epoch": 0.8578112747798249, + "grad_norm": 0.7734375, + "learning_rate": 0.00016235261975404422, + "loss": 4.2883, + "step": 8273 + }, + { + "epoch": 0.857914962834313, + "grad_norm": 0.71875, + "learning_rate": 0.00016234412721812726, + "loss": 4.3047, + "step": 8274 + }, + { + "epoch": 0.858018650888801, + "grad_norm": 0.72265625, + "learning_rate": 0.0001623356339466155, + "loss": 4.28, + "step": 8275 + }, + { + "epoch": 0.8581223389432892, + "grad_norm": 0.6796875, + "learning_rate": 0.00016232713993960914, + "loss": 4.255, + "step": 8276 + }, + { + "epoch": 0.8582260269977772, + "grad_norm": 0.76953125, + "learning_rate": 0.0001623186451972084, + "loss": 4.2602, + "step": 8277 + }, + { + "epoch": 0.8583297150522653, + "grad_norm": 0.68359375, + "learning_rate": 0.00016231014971951352, + "loss": 4.2697, + "step": 8278 + }, + { + "epoch": 0.8584334031067533, + "grad_norm": 0.7578125, + "learning_rate": 0.00016230165350662477, + "loss": 4.2785, + "step": 8279 + }, + { + "epoch": 0.8585370911612414, + "grad_norm": 0.6875, + "learning_rate": 0.00016229315655864234, + "loss": 4.3177, + "step": 8280 + }, + { + "epoch": 0.8586407792157295, + "grad_norm": 0.71875, + "learning_rate": 0.00016228465887566652, + "loss": 4.2407, + "step": 8281 + }, + { + "epoch": 0.8587444672702176, + "grad_norm": 0.6953125, + "learning_rate": 0.00016227616045779754, + "loss": 4.2322, + "step": 8282 + }, + { + "epoch": 0.8588481553247056, + "grad_norm": 0.71484375, + "learning_rate": 0.00016226766130513574, + "loss": 4.2513, + "step": 8283 + }, + { + "epoch": 0.8589518433791937, + "grad_norm": 0.6171875, + "learning_rate": 0.00016225916141778132, + "loss": 4.274, + "step": 8284 + }, + { + "epoch": 0.8590555314336817, + "grad_norm": 0.67578125, + "learning_rate": 0.0001622506607958346, + "loss": 4.2683, + "step": 8285 + }, + { + "epoch": 0.8591592194881699, + "grad_norm": 0.68359375, + "learning_rate": 0.00016224215943939593, + "loss": 4.2584, + "step": 8286 + }, + { + "epoch": 0.8592629075426579, + "grad_norm": 0.73046875, + "learning_rate": 0.00016223365734856552, + "loss": 4.2968, + "step": 8287 + }, + { + "epoch": 0.859366595597146, + "grad_norm": 0.73828125, + "learning_rate": 0.00016222515452344376, + "loss": 4.32, + "step": 8288 + }, + { + "epoch": 0.859470283651634, + "grad_norm": 0.66015625, + "learning_rate": 0.00016221665096413095, + "loss": 4.2641, + "step": 8289 + }, + { + "epoch": 0.8595739717061222, + "grad_norm": 0.65625, + "learning_rate": 0.0001622081466707274, + "loss": 4.2229, + "step": 8290 + }, + { + "epoch": 0.8596776597606102, + "grad_norm": 0.72265625, + "learning_rate": 0.00016219964164333351, + "loss": 4.2653, + "step": 8291 + }, + { + "epoch": 0.8597813478150983, + "grad_norm": 0.65625, + "learning_rate": 0.0001621911358820496, + "loss": 4.2972, + "step": 8292 + }, + { + "epoch": 0.8598850358695863, + "grad_norm": 0.69921875, + "learning_rate": 0.00016218262938697602, + "loss": 4.3156, + "step": 8293 + }, + { + "epoch": 0.8599887239240744, + "grad_norm": 0.67578125, + "learning_rate": 0.00016217412215821313, + "loss": 4.29, + "step": 8294 + }, + { + "epoch": 0.8600924119785625, + "grad_norm": 0.69140625, + "learning_rate": 0.0001621656141958613, + "loss": 4.2752, + "step": 8295 + }, + { + "epoch": 0.8601961000330506, + "grad_norm": 0.6484375, + "learning_rate": 0.00016215710550002098, + "loss": 4.2789, + "step": 8296 + }, + { + "epoch": 0.8602997880875386, + "grad_norm": 0.67578125, + "learning_rate": 0.0001621485960707925, + "loss": 4.2865, + "step": 8297 + }, + { + "epoch": 0.8604034761420267, + "grad_norm": 0.63671875, + "learning_rate": 0.00016214008590827628, + "loss": 4.2618, + "step": 8298 + }, + { + "epoch": 0.8605071641965147, + "grad_norm": 0.671875, + "learning_rate": 0.0001621315750125727, + "loss": 4.2809, + "step": 8299 + }, + { + "epoch": 0.8606108522510029, + "grad_norm": 0.63671875, + "learning_rate": 0.00016212306338378226, + "loss": 4.286, + "step": 8300 + }, + { + "epoch": 0.8607145403054909, + "grad_norm": 0.74609375, + "learning_rate": 0.00016211455102200533, + "loss": 4.2798, + "step": 8301 + }, + { + "epoch": 0.860818228359979, + "grad_norm": 0.72265625, + "learning_rate": 0.00016210603792734233, + "loss": 4.2832, + "step": 8302 + }, + { + "epoch": 0.860921916414467, + "grad_norm": 0.7109375, + "learning_rate": 0.00016209752409989374, + "loss": 4.271, + "step": 8303 + }, + { + "epoch": 0.8610256044689552, + "grad_norm": 0.6875, + "learning_rate": 0.00016208900953976004, + "loss": 4.2956, + "step": 8304 + }, + { + "epoch": 0.8611292925234432, + "grad_norm": 0.70703125, + "learning_rate": 0.00016208049424704162, + "loss": 4.2594, + "step": 8305 + }, + { + "epoch": 0.8612329805779313, + "grad_norm": 0.75390625, + "learning_rate": 0.000162071978221839, + "loss": 4.2507, + "step": 8306 + }, + { + "epoch": 0.8613366686324193, + "grad_norm": 0.69140625, + "learning_rate": 0.00016206346146425263, + "loss": 4.2584, + "step": 8307 + }, + { + "epoch": 0.8614403566869074, + "grad_norm": 0.703125, + "learning_rate": 0.00016205494397438303, + "loss": 4.2283, + "step": 8308 + }, + { + "epoch": 0.8615440447413956, + "grad_norm": 0.75, + "learning_rate": 0.00016204642575233072, + "loss": 4.3046, + "step": 8309 + }, + { + "epoch": 0.8616477327958836, + "grad_norm": 0.75390625, + "learning_rate": 0.00016203790679819612, + "loss": 4.2764, + "step": 8310 + }, + { + "epoch": 0.8617514208503717, + "grad_norm": 0.75390625, + "learning_rate": 0.00016202938711207984, + "loss": 4.2966, + "step": 8311 + }, + { + "epoch": 0.8618551089048597, + "grad_norm": 0.6953125, + "learning_rate": 0.00016202086669408233, + "loss": 4.3013, + "step": 8312 + }, + { + "epoch": 0.8619587969593479, + "grad_norm": 0.76171875, + "learning_rate": 0.00016201234554430417, + "loss": 4.2819, + "step": 8313 + }, + { + "epoch": 0.8620624850138359, + "grad_norm": 0.73046875, + "learning_rate": 0.00016200382366284588, + "loss": 4.2945, + "step": 8314 + }, + { + "epoch": 0.862166173068324, + "grad_norm": 0.77734375, + "learning_rate": 0.000161995301049808, + "loss": 4.2843, + "step": 8315 + }, + { + "epoch": 0.862269861122812, + "grad_norm": 0.671875, + "learning_rate": 0.00016198677770529112, + "loss": 4.2686, + "step": 8316 + }, + { + "epoch": 0.8623735491773001, + "grad_norm": 0.74609375, + "learning_rate": 0.00016197825362939577, + "loss": 4.3057, + "step": 8317 + }, + { + "epoch": 0.8624772372317882, + "grad_norm": 0.68359375, + "learning_rate": 0.00016196972882222255, + "loss": 4.3086, + "step": 8318 + }, + { + "epoch": 0.8625809252862763, + "grad_norm": 0.703125, + "learning_rate": 0.00016196120328387204, + "loss": 4.262, + "step": 8319 + }, + { + "epoch": 0.8626846133407643, + "grad_norm": 0.7109375, + "learning_rate": 0.0001619526770144448, + "loss": 4.3007, + "step": 8320 + }, + { + "epoch": 0.8627883013952524, + "grad_norm": 0.6484375, + "learning_rate": 0.00016194415001404147, + "loss": 4.2464, + "step": 8321 + }, + { + "epoch": 0.8628919894497404, + "grad_norm": 0.671875, + "learning_rate": 0.00016193562228276263, + "loss": 4.273, + "step": 8322 + }, + { + "epoch": 0.8629956775042286, + "grad_norm": 0.734375, + "learning_rate": 0.00016192709382070896, + "loss": 4.2781, + "step": 8323 + }, + { + "epoch": 0.8630993655587166, + "grad_norm": 0.6484375, + "learning_rate": 0.000161918564627981, + "loss": 4.2916, + "step": 8324 + }, + { + "epoch": 0.8632030536132047, + "grad_norm": 0.671875, + "learning_rate": 0.00016191003470467948, + "loss": 4.3182, + "step": 8325 + }, + { + "epoch": 0.8633067416676927, + "grad_norm": 0.66015625, + "learning_rate": 0.00016190150405090496, + "loss": 4.2624, + "step": 8326 + }, + { + "epoch": 0.8634104297221808, + "grad_norm": 0.74609375, + "learning_rate": 0.0001618929726667581, + "loss": 4.2549, + "step": 8327 + }, + { + "epoch": 0.8635141177766689, + "grad_norm": 0.7265625, + "learning_rate": 0.00016188444055233961, + "loss": 4.253, + "step": 8328 + }, + { + "epoch": 0.863617805831157, + "grad_norm": 0.71875, + "learning_rate": 0.00016187590770775016, + "loss": 4.2926, + "step": 8329 + }, + { + "epoch": 0.863721493885645, + "grad_norm": 0.6953125, + "learning_rate": 0.00016186737413309037, + "loss": 4.2453, + "step": 8330 + }, + { + "epoch": 0.8638251819401331, + "grad_norm": 0.71484375, + "learning_rate": 0.00016185883982846097, + "loss": 4.2331, + "step": 8331 + }, + { + "epoch": 0.8639288699946212, + "grad_norm": 0.66796875, + "learning_rate": 0.00016185030479396267, + "loss": 4.2789, + "step": 8332 + }, + { + "epoch": 0.8640325580491093, + "grad_norm": 0.7109375, + "learning_rate": 0.00016184176902969615, + "loss": 4.2669, + "step": 8333 + }, + { + "epoch": 0.8641362461035973, + "grad_norm": 0.703125, + "learning_rate": 0.0001618332325357621, + "loss": 4.2263, + "step": 8334 + }, + { + "epoch": 0.8642399341580854, + "grad_norm": 0.76171875, + "learning_rate": 0.00016182469531226125, + "loss": 4.2698, + "step": 8335 + }, + { + "epoch": 0.8643436222125734, + "grad_norm": 0.703125, + "learning_rate": 0.0001618161573592944, + "loss": 4.314, + "step": 8336 + }, + { + "epoch": 0.8644473102670616, + "grad_norm": 0.75, + "learning_rate": 0.0001618076186769622, + "loss": 4.282, + "step": 8337 + }, + { + "epoch": 0.8645509983215496, + "grad_norm": 0.74609375, + "learning_rate": 0.00016179907926536547, + "loss": 4.2735, + "step": 8338 + }, + { + "epoch": 0.8646546863760377, + "grad_norm": 0.796875, + "learning_rate": 0.00016179053912460486, + "loss": 4.3129, + "step": 8339 + }, + { + "epoch": 0.8647583744305257, + "grad_norm": 0.80078125, + "learning_rate": 0.00016178199825478125, + "loss": 4.3089, + "step": 8340 + }, + { + "epoch": 0.8648620624850138, + "grad_norm": 0.7734375, + "learning_rate": 0.00016177345665599536, + "loss": 4.2849, + "step": 8341 + }, + { + "epoch": 0.8649657505395019, + "grad_norm": 0.80859375, + "learning_rate": 0.00016176491432834798, + "loss": 4.2581, + "step": 8342 + }, + { + "epoch": 0.86506943859399, + "grad_norm": 0.87890625, + "learning_rate": 0.0001617563712719399, + "loss": 4.2763, + "step": 8343 + }, + { + "epoch": 0.865173126648478, + "grad_norm": 0.8203125, + "learning_rate": 0.00016174782748687192, + "loss": 4.2793, + "step": 8344 + }, + { + "epoch": 0.8652768147029661, + "grad_norm": 0.75390625, + "learning_rate": 0.00016173928297324484, + "loss": 4.2949, + "step": 8345 + }, + { + "epoch": 0.8653805027574542, + "grad_norm": 0.7890625, + "learning_rate": 0.0001617307377311595, + "loss": 4.2183, + "step": 8346 + }, + { + "epoch": 0.8654841908119423, + "grad_norm": 0.87109375, + "learning_rate": 0.0001617221917607167, + "loss": 4.2548, + "step": 8347 + }, + { + "epoch": 0.8655878788664303, + "grad_norm": 0.76171875, + "learning_rate": 0.00016171364506201727, + "loss": 4.2516, + "step": 8348 + }, + { + "epoch": 0.8656915669209184, + "grad_norm": 0.80859375, + "learning_rate": 0.00016170509763516205, + "loss": 4.2631, + "step": 8349 + }, + { + "epoch": 0.8657952549754064, + "grad_norm": 0.8125, + "learning_rate": 0.00016169654948025193, + "loss": 4.2793, + "step": 8350 + }, + { + "epoch": 0.8658989430298946, + "grad_norm": 0.79296875, + "learning_rate": 0.00016168800059738773, + "loss": 4.2804, + "step": 8351 + }, + { + "epoch": 0.8660026310843827, + "grad_norm": 0.86328125, + "learning_rate": 0.0001616794509866703, + "loss": 4.2818, + "step": 8352 + }, + { + "epoch": 0.8661063191388707, + "grad_norm": 0.7421875, + "learning_rate": 0.0001616709006482006, + "loss": 4.232, + "step": 8353 + }, + { + "epoch": 0.8662100071933588, + "grad_norm": 0.83203125, + "learning_rate": 0.00016166234958207946, + "loss": 4.2797, + "step": 8354 + }, + { + "epoch": 0.8663136952478468, + "grad_norm": 0.82421875, + "learning_rate": 0.00016165379778840776, + "loss": 4.2438, + "step": 8355 + }, + { + "epoch": 0.866417383302335, + "grad_norm": 0.78125, + "learning_rate": 0.00016164524526728638, + "loss": 4.2721, + "step": 8356 + }, + { + "epoch": 0.866521071356823, + "grad_norm": 0.8125, + "learning_rate": 0.0001616366920188163, + "loss": 4.3177, + "step": 8357 + }, + { + "epoch": 0.8666247594113111, + "grad_norm": 0.66796875, + "learning_rate": 0.00016162813804309841, + "loss": 4.2867, + "step": 8358 + }, + { + "epoch": 0.8667284474657991, + "grad_norm": 0.8203125, + "learning_rate": 0.00016161958334023365, + "loss": 4.2791, + "step": 8359 + }, + { + "epoch": 0.8668321355202873, + "grad_norm": 0.703125, + "learning_rate": 0.0001616110279103229, + "loss": 4.2265, + "step": 8360 + }, + { + "epoch": 0.8669358235747753, + "grad_norm": 0.71484375, + "learning_rate": 0.00016160247175346716, + "loss": 4.282, + "step": 8361 + }, + { + "epoch": 0.8670395116292634, + "grad_norm": 0.7265625, + "learning_rate": 0.00016159391486976737, + "loss": 4.2605, + "step": 8362 + }, + { + "epoch": 0.8671431996837514, + "grad_norm": 0.73046875, + "learning_rate": 0.00016158535725932453, + "loss": 4.2556, + "step": 8363 + }, + { + "epoch": 0.8672468877382395, + "grad_norm": 0.71875, + "learning_rate": 0.00016157679892223953, + "loss": 4.2746, + "step": 8364 + }, + { + "epoch": 0.8673505757927276, + "grad_norm": 0.69921875, + "learning_rate": 0.0001615682398586134, + "loss": 4.2715, + "step": 8365 + }, + { + "epoch": 0.8674542638472157, + "grad_norm": 0.69921875, + "learning_rate": 0.00016155968006854713, + "loss": 4.3402, + "step": 8366 + }, + { + "epoch": 0.8675579519017037, + "grad_norm": 0.68359375, + "learning_rate": 0.00016155111955214172, + "loss": 4.2307, + "step": 8367 + }, + { + "epoch": 0.8676616399561918, + "grad_norm": 0.7578125, + "learning_rate": 0.00016154255830949814, + "loss": 4.3315, + "step": 8368 + }, + { + "epoch": 0.8677653280106798, + "grad_norm": 0.6953125, + "learning_rate": 0.00016153399634071742, + "loss": 4.3183, + "step": 8369 + }, + { + "epoch": 0.867869016065168, + "grad_norm": 0.734375, + "learning_rate": 0.00016152543364590058, + "loss": 4.2694, + "step": 8370 + }, + { + "epoch": 0.867972704119656, + "grad_norm": 0.65234375, + "learning_rate": 0.0001615168702251487, + "loss": 4.2861, + "step": 8371 + }, + { + "epoch": 0.8680763921741441, + "grad_norm": 0.75, + "learning_rate": 0.00016150830607856276, + "loss": 4.2575, + "step": 8372 + }, + { + "epoch": 0.8681800802286321, + "grad_norm": 0.69140625, + "learning_rate": 0.0001614997412062438, + "loss": 4.2933, + "step": 8373 + }, + { + "epoch": 0.8682837682831203, + "grad_norm": 0.703125, + "learning_rate": 0.0001614911756082929, + "loss": 4.3062, + "step": 8374 + }, + { + "epoch": 0.8683874563376083, + "grad_norm": 0.71875, + "learning_rate": 0.00016148260928481117, + "loss": 4.3121, + "step": 8375 + }, + { + "epoch": 0.8684911443920964, + "grad_norm": 0.72265625, + "learning_rate": 0.0001614740422358996, + "loss": 4.3087, + "step": 8376 + }, + { + "epoch": 0.8685948324465844, + "grad_norm": 0.66796875, + "learning_rate": 0.00016146547446165935, + "loss": 4.2649, + "step": 8377 + }, + { + "epoch": 0.8686985205010725, + "grad_norm": 0.68359375, + "learning_rate": 0.00016145690596219142, + "loss": 4.2673, + "step": 8378 + }, + { + "epoch": 0.8688022085555606, + "grad_norm": 0.69921875, + "learning_rate": 0.000161448336737597, + "loss": 4.2901, + "step": 8379 + }, + { + "epoch": 0.8689058966100487, + "grad_norm": 0.74609375, + "learning_rate": 0.00016143976678797717, + "loss": 4.2934, + "step": 8380 + }, + { + "epoch": 0.8690095846645367, + "grad_norm": 0.69140625, + "learning_rate": 0.00016143119611343302, + "loss": 4.2689, + "step": 8381 + }, + { + "epoch": 0.8691132727190248, + "grad_norm": 0.78515625, + "learning_rate": 0.00016142262471406568, + "loss": 4.2696, + "step": 8382 + }, + { + "epoch": 0.8692169607735128, + "grad_norm": 0.6953125, + "learning_rate": 0.00016141405258997632, + "loss": 4.2788, + "step": 8383 + }, + { + "epoch": 0.869320648828001, + "grad_norm": 0.75, + "learning_rate": 0.00016140547974126602, + "loss": 4.2743, + "step": 8384 + }, + { + "epoch": 0.869424336882489, + "grad_norm": 0.66015625, + "learning_rate": 0.000161396906168036, + "loss": 4.2375, + "step": 8385 + }, + { + "epoch": 0.8695280249369771, + "grad_norm": 0.76953125, + "learning_rate": 0.00016138833187038737, + "loss": 4.2954, + "step": 8386 + }, + { + "epoch": 0.8696317129914651, + "grad_norm": 0.66796875, + "learning_rate": 0.00016137975684842135, + "loss": 4.2602, + "step": 8387 + }, + { + "epoch": 0.8697354010459533, + "grad_norm": 0.77734375, + "learning_rate": 0.00016137118110223905, + "loss": 4.2751, + "step": 8388 + }, + { + "epoch": 0.8698390891004413, + "grad_norm": 0.7421875, + "learning_rate": 0.00016136260463194167, + "loss": 4.2718, + "step": 8389 + }, + { + "epoch": 0.8699427771549294, + "grad_norm": 0.77734375, + "learning_rate": 0.00016135402743763043, + "loss": 4.2387, + "step": 8390 + }, + { + "epoch": 0.8700464652094174, + "grad_norm": 0.734375, + "learning_rate": 0.00016134544951940656, + "loss": 4.2696, + "step": 8391 + }, + { + "epoch": 0.8701501532639055, + "grad_norm": 0.703125, + "learning_rate": 0.00016133687087737118, + "loss": 4.2762, + "step": 8392 + }, + { + "epoch": 0.8702538413183936, + "grad_norm": 0.734375, + "learning_rate": 0.00016132829151162557, + "loss": 4.299, + "step": 8393 + }, + { + "epoch": 0.8703575293728817, + "grad_norm": 0.63671875, + "learning_rate": 0.00016131971142227097, + "loss": 4.2447, + "step": 8394 + }, + { + "epoch": 0.8704612174273697, + "grad_norm": 0.7890625, + "learning_rate": 0.00016131113060940857, + "loss": 4.2158, + "step": 8395 + }, + { + "epoch": 0.8705649054818578, + "grad_norm": 0.7265625, + "learning_rate": 0.00016130254907313965, + "loss": 4.2522, + "step": 8396 + }, + { + "epoch": 0.870668593536346, + "grad_norm": 0.70703125, + "learning_rate": 0.00016129396681356545, + "loss": 4.2729, + "step": 8397 + }, + { + "epoch": 0.870772281590834, + "grad_norm": 0.71484375, + "learning_rate": 0.0001612853838307872, + "loss": 4.303, + "step": 8398 + }, + { + "epoch": 0.8708759696453221, + "grad_norm": 0.75, + "learning_rate": 0.00016127680012490624, + "loss": 4.2124, + "step": 8399 + }, + { + "epoch": 0.8709796576998101, + "grad_norm": 0.74609375, + "learning_rate": 0.00016126821569602381, + "loss": 4.2845, + "step": 8400 + }, + { + "epoch": 0.8710833457542982, + "grad_norm": 0.68359375, + "learning_rate": 0.0001612596305442412, + "loss": 4.3315, + "step": 8401 + }, + { + "epoch": 0.8711870338087863, + "grad_norm": 0.87109375, + "learning_rate": 0.00016125104466965966, + "loss": 4.3241, + "step": 8402 + }, + { + "epoch": 0.8712907218632744, + "grad_norm": 0.6796875, + "learning_rate": 0.0001612424580723806, + "loss": 4.2597, + "step": 8403 + }, + { + "epoch": 0.8713944099177624, + "grad_norm": 0.91796875, + "learning_rate": 0.00016123387075250522, + "loss": 4.2711, + "step": 8404 + }, + { + "epoch": 0.8714980979722505, + "grad_norm": 0.7265625, + "learning_rate": 0.0001612252827101349, + "loss": 4.3072, + "step": 8405 + }, + { + "epoch": 0.8716017860267385, + "grad_norm": 0.96875, + "learning_rate": 0.00016121669394537097, + "loss": 4.2542, + "step": 8406 + }, + { + "epoch": 0.8717054740812267, + "grad_norm": 0.7421875, + "learning_rate": 0.00016120810445831478, + "loss": 4.231, + "step": 8407 + }, + { + "epoch": 0.8718091621357147, + "grad_norm": 1.0234375, + "learning_rate": 0.00016119951424906764, + "loss": 4.3089, + "step": 8408 + }, + { + "epoch": 0.8719128501902028, + "grad_norm": 0.71875, + "learning_rate": 0.00016119092331773094, + "loss": 4.2775, + "step": 8409 + }, + { + "epoch": 0.8720165382446908, + "grad_norm": 1.015625, + "learning_rate": 0.000161182331664406, + "loss": 4.2568, + "step": 8410 + }, + { + "epoch": 0.872120226299179, + "grad_norm": 0.7109375, + "learning_rate": 0.00016117373928919423, + "loss": 4.248, + "step": 8411 + }, + { + "epoch": 0.872223914353667, + "grad_norm": 1.0234375, + "learning_rate": 0.000161165146192197, + "loss": 4.237, + "step": 8412 + }, + { + "epoch": 0.8723276024081551, + "grad_norm": 0.6640625, + "learning_rate": 0.00016115655237351572, + "loss": 4.2702, + "step": 8413 + }, + { + "epoch": 0.8724312904626431, + "grad_norm": 0.890625, + "learning_rate": 0.00016114795783325173, + "loss": 4.2676, + "step": 8414 + }, + { + "epoch": 0.8725349785171312, + "grad_norm": 0.69921875, + "learning_rate": 0.00016113936257150649, + "loss": 4.2378, + "step": 8415 + }, + { + "epoch": 0.8726386665716193, + "grad_norm": 0.953125, + "learning_rate": 0.0001611307665883814, + "loss": 4.3031, + "step": 8416 + }, + { + "epoch": 0.8727423546261074, + "grad_norm": 0.83984375, + "learning_rate": 0.00016112216988397788, + "loss": 4.276, + "step": 8417 + }, + { + "epoch": 0.8728460426805954, + "grad_norm": 0.9375, + "learning_rate": 0.0001611135724583974, + "loss": 4.2516, + "step": 8418 + }, + { + "epoch": 0.8729497307350835, + "grad_norm": 0.68359375, + "learning_rate": 0.0001611049743117413, + "loss": 4.3003, + "step": 8419 + }, + { + "epoch": 0.8730534187895715, + "grad_norm": 0.9921875, + "learning_rate": 0.00016109637544411113, + "loss": 4.2494, + "step": 8420 + }, + { + "epoch": 0.8731571068440597, + "grad_norm": 0.75, + "learning_rate": 0.0001610877758556083, + "loss": 4.3208, + "step": 8421 + }, + { + "epoch": 0.8732607948985477, + "grad_norm": 0.98046875, + "learning_rate": 0.00016107917554633429, + "loss": 4.2399, + "step": 8422 + }, + { + "epoch": 0.8733644829530358, + "grad_norm": 0.68359375, + "learning_rate": 0.00016107057451639057, + "loss": 4.2707, + "step": 8423 + }, + { + "epoch": 0.8734681710075238, + "grad_norm": 0.9296875, + "learning_rate": 0.00016106197276587864, + "loss": 4.274, + "step": 8424 + }, + { + "epoch": 0.873571859062012, + "grad_norm": 0.69140625, + "learning_rate": 0.00016105337029489997, + "loss": 4.256, + "step": 8425 + }, + { + "epoch": 0.8736755471165, + "grad_norm": 0.94140625, + "learning_rate": 0.00016104476710355608, + "loss": 4.2639, + "step": 8426 + }, + { + "epoch": 0.8737792351709881, + "grad_norm": 0.73046875, + "learning_rate": 0.0001610361631919484, + "loss": 4.2571, + "step": 8427 + }, + { + "epoch": 0.8738829232254761, + "grad_norm": 0.91015625, + "learning_rate": 0.00016102755856017858, + "loss": 4.3151, + "step": 8428 + }, + { + "epoch": 0.8739866112799642, + "grad_norm": 0.7421875, + "learning_rate": 0.00016101895320834805, + "loss": 4.3228, + "step": 8429 + }, + { + "epoch": 0.8740902993344523, + "grad_norm": 0.76953125, + "learning_rate": 0.0001610103471365584, + "loss": 4.2533, + "step": 8430 + }, + { + "epoch": 0.8741939873889404, + "grad_norm": 0.74609375, + "learning_rate": 0.0001610017403449111, + "loss": 4.2683, + "step": 8431 + }, + { + "epoch": 0.8742976754434284, + "grad_norm": 0.76953125, + "learning_rate": 0.0001609931328335078, + "loss": 4.2586, + "step": 8432 + }, + { + "epoch": 0.8744013634979165, + "grad_norm": 0.7265625, + "learning_rate": 0.00016098452460245, + "loss": 4.3005, + "step": 8433 + }, + { + "epoch": 0.8745050515524045, + "grad_norm": 0.73828125, + "learning_rate": 0.0001609759156518392, + "loss": 4.3092, + "step": 8434 + }, + { + "epoch": 0.8746087396068927, + "grad_norm": 0.65625, + "learning_rate": 0.0001609673059817771, + "loss": 4.2785, + "step": 8435 + }, + { + "epoch": 0.8747124276613807, + "grad_norm": 0.8203125, + "learning_rate": 0.00016095869559236526, + "loss": 4.2505, + "step": 8436 + }, + { + "epoch": 0.8748161157158688, + "grad_norm": 0.77734375, + "learning_rate": 0.0001609500844837052, + "loss": 4.2825, + "step": 8437 + }, + { + "epoch": 0.8749198037703568, + "grad_norm": 0.85546875, + "learning_rate": 0.0001609414726558986, + "loss": 4.2787, + "step": 8438 + }, + { + "epoch": 0.875023491824845, + "grad_norm": 0.73046875, + "learning_rate": 0.00016093286010904705, + "loss": 4.2771, + "step": 8439 + }, + { + "epoch": 0.875127179879333, + "grad_norm": 0.84765625, + "learning_rate": 0.00016092424684325214, + "loss": 4.2597, + "step": 8440 + }, + { + "epoch": 0.8752308679338211, + "grad_norm": 0.734375, + "learning_rate": 0.00016091563285861553, + "loss": 4.2585, + "step": 8441 + }, + { + "epoch": 0.8753345559883092, + "grad_norm": 0.796875, + "learning_rate": 0.0001609070181552388, + "loss": 4.2992, + "step": 8442 + }, + { + "epoch": 0.8754382440427972, + "grad_norm": 0.66796875, + "learning_rate": 0.00016089840273322367, + "loss": 4.262, + "step": 8443 + }, + { + "epoch": 0.8755419320972854, + "grad_norm": 0.76171875, + "learning_rate": 0.00016088978659267177, + "loss": 4.2614, + "step": 8444 + }, + { + "epoch": 0.8756456201517734, + "grad_norm": 0.640625, + "learning_rate": 0.00016088116973368477, + "loss": 4.2499, + "step": 8445 + }, + { + "epoch": 0.8757493082062615, + "grad_norm": 0.828125, + "learning_rate": 0.00016087255215636428, + "loss": 4.2869, + "step": 8446 + }, + { + "epoch": 0.8758529962607495, + "grad_norm": 0.609375, + "learning_rate": 0.000160863933860812, + "loss": 4.2898, + "step": 8447 + }, + { + "epoch": 0.8759566843152377, + "grad_norm": 0.76171875, + "learning_rate": 0.00016085531484712968, + "loss": 4.2726, + "step": 8448 + }, + { + "epoch": 0.8760603723697257, + "grad_norm": 0.66796875, + "learning_rate": 0.00016084669511541897, + "loss": 4.2572, + "step": 8449 + }, + { + "epoch": 0.8761640604242138, + "grad_norm": 0.7265625, + "learning_rate": 0.00016083807466578157, + "loss": 4.2457, + "step": 8450 + }, + { + "epoch": 0.8762677484787018, + "grad_norm": 0.6484375, + "learning_rate": 0.0001608294534983192, + "loss": 4.2877, + "step": 8451 + }, + { + "epoch": 0.8763714365331899, + "grad_norm": 0.73046875, + "learning_rate": 0.00016082083161313355, + "loss": 4.3083, + "step": 8452 + }, + { + "epoch": 0.876475124587678, + "grad_norm": 0.66796875, + "learning_rate": 0.00016081220901032638, + "loss": 4.3249, + "step": 8453 + }, + { + "epoch": 0.8765788126421661, + "grad_norm": 0.71875, + "learning_rate": 0.00016080358568999948, + "loss": 4.2733, + "step": 8454 + }, + { + "epoch": 0.8766825006966541, + "grad_norm": 0.6953125, + "learning_rate": 0.00016079496165225448, + "loss": 4.2688, + "step": 8455 + }, + { + "epoch": 0.8767861887511422, + "grad_norm": 0.73828125, + "learning_rate": 0.00016078633689719322, + "loss": 4.3026, + "step": 8456 + }, + { + "epoch": 0.8768898768056302, + "grad_norm": 0.68359375, + "learning_rate": 0.00016077771142491745, + "loss": 4.2397, + "step": 8457 + }, + { + "epoch": 0.8769935648601184, + "grad_norm": 0.734375, + "learning_rate": 0.00016076908523552893, + "loss": 4.2921, + "step": 8458 + }, + { + "epoch": 0.8770972529146064, + "grad_norm": 0.7578125, + "learning_rate": 0.00016076045832912942, + "loss": 4.2783, + "step": 8459 + }, + { + "epoch": 0.8772009409690945, + "grad_norm": 0.703125, + "learning_rate": 0.00016075183070582072, + "loss": 4.2605, + "step": 8460 + }, + { + "epoch": 0.8773046290235825, + "grad_norm": 0.7265625, + "learning_rate": 0.00016074320236570463, + "loss": 4.2831, + "step": 8461 + }, + { + "epoch": 0.8774083170780707, + "grad_norm": 0.69140625, + "learning_rate": 0.00016073457330888303, + "loss": 4.2776, + "step": 8462 + }, + { + "epoch": 0.8775120051325587, + "grad_norm": 0.703125, + "learning_rate": 0.0001607259435354576, + "loss": 4.2924, + "step": 8463 + }, + { + "epoch": 0.8776156931870468, + "grad_norm": 0.6875, + "learning_rate": 0.00016071731304553025, + "loss": 4.2821, + "step": 8464 + }, + { + "epoch": 0.8777193812415348, + "grad_norm": 0.68359375, + "learning_rate": 0.00016070868183920277, + "loss": 4.3323, + "step": 8465 + }, + { + "epoch": 0.8778230692960229, + "grad_norm": 0.671875, + "learning_rate": 0.00016070004991657703, + "loss": 4.2644, + "step": 8466 + }, + { + "epoch": 0.877926757350511, + "grad_norm": 0.64453125, + "learning_rate": 0.00016069141727775485, + "loss": 4.2329, + "step": 8467 + }, + { + "epoch": 0.8780304454049991, + "grad_norm": 0.7421875, + "learning_rate": 0.00016068278392283812, + "loss": 4.2735, + "step": 8468 + }, + { + "epoch": 0.8781341334594871, + "grad_norm": 0.64453125, + "learning_rate": 0.0001606741498519287, + "loss": 4.2749, + "step": 8469 + }, + { + "epoch": 0.8782378215139752, + "grad_norm": 0.69140625, + "learning_rate": 0.0001606655150651284, + "loss": 4.2961, + "step": 8470 + }, + { + "epoch": 0.8783415095684632, + "grad_norm": 0.6875, + "learning_rate": 0.00016065687956253916, + "loss": 4.2416, + "step": 8471 + }, + { + "epoch": 0.8784451976229514, + "grad_norm": 0.69921875, + "learning_rate": 0.00016064824334426287, + "loss": 4.3016, + "step": 8472 + }, + { + "epoch": 0.8785488856774394, + "grad_norm": 0.73828125, + "learning_rate": 0.0001606396064104014, + "loss": 4.2524, + "step": 8473 + }, + { + "epoch": 0.8786525737319275, + "grad_norm": 0.734375, + "learning_rate": 0.0001606309687610567, + "loss": 4.2683, + "step": 8474 + }, + { + "epoch": 0.8787562617864155, + "grad_norm": 0.7421875, + "learning_rate": 0.00016062233039633065, + "loss": 4.2676, + "step": 8475 + }, + { + "epoch": 0.8788599498409037, + "grad_norm": 0.734375, + "learning_rate": 0.00016061369131632516, + "loss": 4.2734, + "step": 8476 + }, + { + "epoch": 0.8789636378953917, + "grad_norm": 0.671875, + "learning_rate": 0.00016060505152114222, + "loss": 4.277, + "step": 8477 + }, + { + "epoch": 0.8790673259498798, + "grad_norm": 0.69921875, + "learning_rate": 0.00016059641101088369, + "loss": 4.2818, + "step": 8478 + }, + { + "epoch": 0.8791710140043678, + "grad_norm": 0.58203125, + "learning_rate": 0.00016058776978565163, + "loss": 4.3112, + "step": 8479 + }, + { + "epoch": 0.8792747020588559, + "grad_norm": 0.7109375, + "learning_rate": 0.00016057912784554786, + "loss": 4.2279, + "step": 8480 + }, + { + "epoch": 0.879378390113344, + "grad_norm": 0.68359375, + "learning_rate": 0.00016057048519067446, + "loss": 4.2666, + "step": 8481 + }, + { + "epoch": 0.8794820781678321, + "grad_norm": 0.765625, + "learning_rate": 0.00016056184182113335, + "loss": 4.2834, + "step": 8482 + }, + { + "epoch": 0.8795857662223201, + "grad_norm": 0.60546875, + "learning_rate": 0.00016055319773702653, + "loss": 4.2535, + "step": 8483 + }, + { + "epoch": 0.8796894542768082, + "grad_norm": 0.7734375, + "learning_rate": 0.00016054455293845597, + "loss": 4.2957, + "step": 8484 + }, + { + "epoch": 0.8797931423312962, + "grad_norm": 0.59375, + "learning_rate": 0.00016053590742552367, + "loss": 4.2143, + "step": 8485 + }, + { + "epoch": 0.8798968303857844, + "grad_norm": 0.73046875, + "learning_rate": 0.00016052726119833167, + "loss": 4.3376, + "step": 8486 + }, + { + "epoch": 0.8800005184402725, + "grad_norm": 0.703125, + "learning_rate": 0.000160518614256982, + "loss": 4.227, + "step": 8487 + }, + { + "epoch": 0.8801042064947605, + "grad_norm": 0.65625, + "learning_rate": 0.0001605099666015766, + "loss": 4.2465, + "step": 8488 + }, + { + "epoch": 0.8802078945492486, + "grad_norm": 0.67578125, + "learning_rate": 0.00016050131823221756, + "loss": 4.2975, + "step": 8489 + }, + { + "epoch": 0.8803115826037367, + "grad_norm": 0.671875, + "learning_rate": 0.00016049266914900694, + "loss": 4.2436, + "step": 8490 + }, + { + "epoch": 0.8804152706582248, + "grad_norm": 0.66015625, + "learning_rate": 0.00016048401935204676, + "loss": 4.3068, + "step": 8491 + }, + { + "epoch": 0.8805189587127128, + "grad_norm": 0.66796875, + "learning_rate": 0.0001604753688414391, + "loss": 4.2671, + "step": 8492 + }, + { + "epoch": 0.8806226467672009, + "grad_norm": 0.703125, + "learning_rate": 0.00016046671761728597, + "loss": 4.2261, + "step": 8493 + }, + { + "epoch": 0.8807263348216889, + "grad_norm": 0.76953125, + "learning_rate": 0.00016045806567968954, + "loss": 4.3031, + "step": 8494 + }, + { + "epoch": 0.8808300228761771, + "grad_norm": 0.78125, + "learning_rate": 0.0001604494130287518, + "loss": 4.2663, + "step": 8495 + }, + { + "epoch": 0.8809337109306651, + "grad_norm": 0.6875, + "learning_rate": 0.0001604407596645749, + "loss": 4.2643, + "step": 8496 + }, + { + "epoch": 0.8810373989851532, + "grad_norm": 0.78125, + "learning_rate": 0.00016043210558726095, + "loss": 4.2479, + "step": 8497 + }, + { + "epoch": 0.8811410870396412, + "grad_norm": 0.671875, + "learning_rate": 0.000160423450796912, + "loss": 4.2624, + "step": 8498 + }, + { + "epoch": 0.8812447750941294, + "grad_norm": 0.73046875, + "learning_rate": 0.0001604147952936302, + "loss": 4.2587, + "step": 8499 + }, + { + "epoch": 0.8813484631486174, + "grad_norm": 0.66015625, + "learning_rate": 0.00016040613907751769, + "loss": 4.2823, + "step": 8500 + }, + { + "epoch": 0.8814521512031055, + "grad_norm": 0.68359375, + "learning_rate": 0.00016039748214867662, + "loss": 4.2809, + "step": 8501 + }, + { + "epoch": 0.8815558392575935, + "grad_norm": 0.68359375, + "learning_rate": 0.00016038882450720906, + "loss": 4.2661, + "step": 8502 + }, + { + "epoch": 0.8816595273120816, + "grad_norm": 0.6484375, + "learning_rate": 0.0001603801661532172, + "loss": 4.2395, + "step": 8503 + }, + { + "epoch": 0.8817632153665697, + "grad_norm": 0.74609375, + "learning_rate": 0.00016037150708680324, + "loss": 4.2503, + "step": 8504 + }, + { + "epoch": 0.8818669034210578, + "grad_norm": 0.67578125, + "learning_rate": 0.0001603628473080693, + "loss": 4.3186, + "step": 8505 + }, + { + "epoch": 0.8819705914755458, + "grad_norm": 0.6875, + "learning_rate": 0.00016035418681711755, + "loss": 4.2744, + "step": 8506 + }, + { + "epoch": 0.8820742795300339, + "grad_norm": 0.66015625, + "learning_rate": 0.00016034552561405022, + "loss": 4.2869, + "step": 8507 + }, + { + "epoch": 0.8821779675845219, + "grad_norm": 0.77734375, + "learning_rate": 0.00016033686369896946, + "loss": 4.2335, + "step": 8508 + }, + { + "epoch": 0.8822816556390101, + "grad_norm": 0.69140625, + "learning_rate": 0.0001603282010719775, + "loss": 4.3039, + "step": 8509 + }, + { + "epoch": 0.8823853436934981, + "grad_norm": 0.76171875, + "learning_rate": 0.0001603195377331765, + "loss": 4.3127, + "step": 8510 + }, + { + "epoch": 0.8824890317479862, + "grad_norm": 0.6484375, + "learning_rate": 0.00016031087368266876, + "loss": 4.2854, + "step": 8511 + }, + { + "epoch": 0.8825927198024742, + "grad_norm": 0.77734375, + "learning_rate": 0.00016030220892055642, + "loss": 4.2955, + "step": 8512 + }, + { + "epoch": 0.8826964078569624, + "grad_norm": 0.6953125, + "learning_rate": 0.0001602935434469418, + "loss": 4.2926, + "step": 8513 + }, + { + "epoch": 0.8828000959114504, + "grad_norm": 0.7578125, + "learning_rate": 0.00016028487726192707, + "loss": 4.3063, + "step": 8514 + }, + { + "epoch": 0.8829037839659385, + "grad_norm": 0.76953125, + "learning_rate": 0.00016027621036561454, + "loss": 4.3108, + "step": 8515 + }, + { + "epoch": 0.8830074720204265, + "grad_norm": 0.734375, + "learning_rate": 0.0001602675427581064, + "loss": 4.2623, + "step": 8516 + }, + { + "epoch": 0.8831111600749146, + "grad_norm": 0.71875, + "learning_rate": 0.000160258874439505, + "loss": 4.2864, + "step": 8517 + }, + { + "epoch": 0.8832148481294027, + "grad_norm": 0.75, + "learning_rate": 0.00016025020540991257, + "loss": 4.2584, + "step": 8518 + }, + { + "epoch": 0.8833185361838908, + "grad_norm": 0.71484375, + "learning_rate": 0.00016024153566943138, + "loss": 4.2823, + "step": 8519 + }, + { + "epoch": 0.8834222242383788, + "grad_norm": 0.72265625, + "learning_rate": 0.00016023286521816375, + "loss": 4.2925, + "step": 8520 + }, + { + "epoch": 0.8835259122928669, + "grad_norm": 0.796875, + "learning_rate": 0.000160224194056212, + "loss": 4.3236, + "step": 8521 + }, + { + "epoch": 0.8836296003473549, + "grad_norm": 0.77734375, + "learning_rate": 0.00016021552218367844, + "loss": 4.3257, + "step": 8522 + }, + { + "epoch": 0.8837332884018431, + "grad_norm": 0.73046875, + "learning_rate": 0.00016020684960066532, + "loss": 4.2686, + "step": 8523 + }, + { + "epoch": 0.8838369764563311, + "grad_norm": 0.703125, + "learning_rate": 0.00016019817630727505, + "loss": 4.2799, + "step": 8524 + }, + { + "epoch": 0.8839406645108192, + "grad_norm": 0.75, + "learning_rate": 0.00016018950230360993, + "loss": 4.3084, + "step": 8525 + }, + { + "epoch": 0.8840443525653072, + "grad_norm": 0.69921875, + "learning_rate": 0.00016018082758977227, + "loss": 4.3094, + "step": 8526 + }, + { + "epoch": 0.8841480406197954, + "grad_norm": 0.76171875, + "learning_rate": 0.00016017215216586447, + "loss": 4.262, + "step": 8527 + }, + { + "epoch": 0.8842517286742834, + "grad_norm": 0.703125, + "learning_rate": 0.00016016347603198892, + "loss": 4.3118, + "step": 8528 + }, + { + "epoch": 0.8843554167287715, + "grad_norm": 0.7890625, + "learning_rate": 0.0001601547991882479, + "loss": 4.2696, + "step": 8529 + }, + { + "epoch": 0.8844591047832595, + "grad_norm": 0.73828125, + "learning_rate": 0.00016014612163474387, + "loss": 4.2951, + "step": 8530 + }, + { + "epoch": 0.8845627928377476, + "grad_norm": 0.8203125, + "learning_rate": 0.00016013744337157917, + "loss": 4.2564, + "step": 8531 + }, + { + "epoch": 0.8846664808922358, + "grad_norm": 0.6953125, + "learning_rate": 0.0001601287643988562, + "loss": 4.2667, + "step": 8532 + }, + { + "epoch": 0.8847701689467238, + "grad_norm": 0.82421875, + "learning_rate": 0.0001601200847166774, + "loss": 4.2845, + "step": 8533 + }, + { + "epoch": 0.8848738570012119, + "grad_norm": 0.7109375, + "learning_rate": 0.00016011140432514511, + "loss": 4.271, + "step": 8534 + }, + { + "epoch": 0.8849775450556999, + "grad_norm": 0.796875, + "learning_rate": 0.0001601027232243618, + "loss": 4.2858, + "step": 8535 + }, + { + "epoch": 0.885081233110188, + "grad_norm": 0.734375, + "learning_rate": 0.00016009404141442988, + "loss": 4.2776, + "step": 8536 + }, + { + "epoch": 0.8851849211646761, + "grad_norm": 0.7265625, + "learning_rate": 0.00016008535889545182, + "loss": 4.2547, + "step": 8537 + }, + { + "epoch": 0.8852886092191642, + "grad_norm": 0.72265625, + "learning_rate": 0.00016007667566753, + "loss": 4.2769, + "step": 8538 + }, + { + "epoch": 0.8853922972736522, + "grad_norm": 0.70703125, + "learning_rate": 0.00016006799173076694, + "loss": 4.2438, + "step": 8539 + }, + { + "epoch": 0.8854959853281403, + "grad_norm": 0.6875, + "learning_rate": 0.00016005930708526506, + "loss": 4.2793, + "step": 8540 + }, + { + "epoch": 0.8855996733826283, + "grad_norm": 0.6640625, + "learning_rate": 0.00016005062173112688, + "loss": 4.243, + "step": 8541 + }, + { + "epoch": 0.8857033614371165, + "grad_norm": 0.66796875, + "learning_rate": 0.00016004193566845478, + "loss": 4.2848, + "step": 8542 + }, + { + "epoch": 0.8858070494916045, + "grad_norm": 0.66796875, + "learning_rate": 0.00016003324889735134, + "loss": 4.2932, + "step": 8543 + }, + { + "epoch": 0.8859107375460926, + "grad_norm": 0.62890625, + "learning_rate": 0.00016002456141791903, + "loss": 4.2527, + "step": 8544 + }, + { + "epoch": 0.8860144256005806, + "grad_norm": 0.7265625, + "learning_rate": 0.0001600158732302603, + "loss": 4.2389, + "step": 8545 + }, + { + "epoch": 0.8861181136550688, + "grad_norm": 0.66015625, + "learning_rate": 0.00016000718433447774, + "loss": 4.2734, + "step": 8546 + }, + { + "epoch": 0.8862218017095568, + "grad_norm": 0.703125, + "learning_rate": 0.00015999849473067386, + "loss": 4.236, + "step": 8547 + }, + { + "epoch": 0.8863254897640449, + "grad_norm": 0.6796875, + "learning_rate": 0.0001599898044189511, + "loss": 4.2682, + "step": 8548 + }, + { + "epoch": 0.8864291778185329, + "grad_norm": 0.72265625, + "learning_rate": 0.0001599811133994121, + "loss": 4.2913, + "step": 8549 + }, + { + "epoch": 0.886532865873021, + "grad_norm": 0.69921875, + "learning_rate": 0.00015997242167215935, + "loss": 4.2738, + "step": 8550 + }, + { + "epoch": 0.8866365539275091, + "grad_norm": 0.74609375, + "learning_rate": 0.00015996372923729544, + "loss": 4.2653, + "step": 8551 + }, + { + "epoch": 0.8867402419819972, + "grad_norm": 0.73828125, + "learning_rate": 0.00015995503609492288, + "loss": 4.2559, + "step": 8552 + }, + { + "epoch": 0.8868439300364852, + "grad_norm": 0.76171875, + "learning_rate": 0.00015994634224514428, + "loss": 4.2434, + "step": 8553 + }, + { + "epoch": 0.8869476180909733, + "grad_norm": 0.76171875, + "learning_rate": 0.00015993764768806222, + "loss": 4.2631, + "step": 8554 + }, + { + "epoch": 0.8870513061454613, + "grad_norm": 0.765625, + "learning_rate": 0.00015992895242377927, + "loss": 4.2876, + "step": 8555 + }, + { + "epoch": 0.8871549941999495, + "grad_norm": 0.765625, + "learning_rate": 0.000159920256452398, + "loss": 4.2772, + "step": 8556 + }, + { + "epoch": 0.8872586822544375, + "grad_norm": 0.8515625, + "learning_rate": 0.00015991155977402108, + "loss": 4.29, + "step": 8557 + }, + { + "epoch": 0.8873623703089256, + "grad_norm": 0.6796875, + "learning_rate": 0.00015990286238875108, + "loss": 4.2843, + "step": 8558 + }, + { + "epoch": 0.8874660583634136, + "grad_norm": 0.84765625, + "learning_rate": 0.00015989416429669062, + "loss": 4.3156, + "step": 8559 + }, + { + "epoch": 0.8875697464179018, + "grad_norm": 0.7890625, + "learning_rate": 0.0001598854654979423, + "loss": 4.2318, + "step": 8560 + }, + { + "epoch": 0.8876734344723898, + "grad_norm": 0.87109375, + "learning_rate": 0.00015987676599260882, + "loss": 4.2924, + "step": 8561 + }, + { + "epoch": 0.8877771225268779, + "grad_norm": 0.74609375, + "learning_rate": 0.0001598680657807928, + "loss": 4.2523, + "step": 8562 + }, + { + "epoch": 0.8878808105813659, + "grad_norm": 0.890625, + "learning_rate": 0.00015985936486259688, + "loss": 4.253, + "step": 8563 + }, + { + "epoch": 0.887984498635854, + "grad_norm": 0.81640625, + "learning_rate": 0.00015985066323812372, + "loss": 4.2691, + "step": 8564 + }, + { + "epoch": 0.8880881866903421, + "grad_norm": 0.828125, + "learning_rate": 0.000159841960907476, + "loss": 4.3306, + "step": 8565 + }, + { + "epoch": 0.8881918747448302, + "grad_norm": 0.87109375, + "learning_rate": 0.00015983325787075642, + "loss": 4.2717, + "step": 8566 + }, + { + "epoch": 0.8882955627993182, + "grad_norm": 0.8203125, + "learning_rate": 0.00015982455412806763, + "loss": 4.271, + "step": 8567 + }, + { + "epoch": 0.8883992508538063, + "grad_norm": 0.8671875, + "learning_rate": 0.0001598158496795123, + "loss": 4.2797, + "step": 8568 + }, + { + "epoch": 0.8885029389082943, + "grad_norm": 0.7734375, + "learning_rate": 0.00015980714452519323, + "loss": 4.2774, + "step": 8569 + }, + { + "epoch": 0.8886066269627825, + "grad_norm": 0.9375, + "learning_rate": 0.00015979843866521302, + "loss": 4.2678, + "step": 8570 + }, + { + "epoch": 0.8887103150172705, + "grad_norm": 0.86328125, + "learning_rate": 0.0001597897320996745, + "loss": 4.2739, + "step": 8571 + }, + { + "epoch": 0.8888140030717586, + "grad_norm": 0.9140625, + "learning_rate": 0.00015978102482868032, + "loss": 4.2616, + "step": 8572 + }, + { + "epoch": 0.8889176911262466, + "grad_norm": 0.8515625, + "learning_rate": 0.0001597723168523332, + "loss": 4.2571, + "step": 8573 + }, + { + "epoch": 0.8890213791807348, + "grad_norm": 0.83203125, + "learning_rate": 0.00015976360817073596, + "loss": 4.3135, + "step": 8574 + }, + { + "epoch": 0.8891250672352228, + "grad_norm": 0.7265625, + "learning_rate": 0.0001597548987839913, + "loss": 4.3112, + "step": 8575 + }, + { + "epoch": 0.8892287552897109, + "grad_norm": 0.734375, + "learning_rate": 0.00015974618869220203, + "loss": 4.273, + "step": 8576 + }, + { + "epoch": 0.889332443344199, + "grad_norm": 0.70703125, + "learning_rate": 0.00015973747789547086, + "loss": 4.2647, + "step": 8577 + }, + { + "epoch": 0.889436131398687, + "grad_norm": 0.74609375, + "learning_rate": 0.00015972876639390058, + "loss": 4.2989, + "step": 8578 + }, + { + "epoch": 0.8895398194531752, + "grad_norm": 0.72265625, + "learning_rate": 0.00015972005418759405, + "loss": 4.256, + "step": 8579 + }, + { + "epoch": 0.8896435075076632, + "grad_norm": 0.7734375, + "learning_rate": 0.00015971134127665395, + "loss": 4.3002, + "step": 8580 + }, + { + "epoch": 0.8897471955621513, + "grad_norm": 0.79296875, + "learning_rate": 0.00015970262766118318, + "loss": 4.2529, + "step": 8581 + }, + { + "epoch": 0.8898508836166393, + "grad_norm": 0.71875, + "learning_rate": 0.00015969391334128447, + "loss": 4.2892, + "step": 8582 + }, + { + "epoch": 0.8899545716711275, + "grad_norm": 0.75390625, + "learning_rate": 0.00015968519831706073, + "loss": 4.2553, + "step": 8583 + }, + { + "epoch": 0.8900582597256155, + "grad_norm": 0.7109375, + "learning_rate": 0.00015967648258861472, + "loss": 4.2492, + "step": 8584 + }, + { + "epoch": 0.8901619477801036, + "grad_norm": 0.78125, + "learning_rate": 0.00015966776615604928, + "loss": 4.2728, + "step": 8585 + }, + { + "epoch": 0.8902656358345916, + "grad_norm": 0.7265625, + "learning_rate": 0.0001596590490194673, + "loss": 4.2634, + "step": 8586 + }, + { + "epoch": 0.8903693238890797, + "grad_norm": 0.76953125, + "learning_rate": 0.0001596503311789716, + "loss": 4.2454, + "step": 8587 + }, + { + "epoch": 0.8904730119435678, + "grad_norm": 0.703125, + "learning_rate": 0.00015964161263466503, + "loss": 4.3347, + "step": 8588 + }, + { + "epoch": 0.8905766999980559, + "grad_norm": 0.6953125, + "learning_rate": 0.00015963289338665046, + "loss": 4.294, + "step": 8589 + }, + { + "epoch": 0.8906803880525439, + "grad_norm": 0.65234375, + "learning_rate": 0.0001596241734350308, + "loss": 4.3227, + "step": 8590 + }, + { + "epoch": 0.890784076107032, + "grad_norm": 0.75, + "learning_rate": 0.0001596154527799089, + "loss": 4.2758, + "step": 8591 + }, + { + "epoch": 0.89088776416152, + "grad_norm": 0.6875, + "learning_rate": 0.00015960673142138775, + "loss": 4.3, + "step": 8592 + }, + { + "epoch": 0.8909914522160082, + "grad_norm": 0.703125, + "learning_rate": 0.0001595980093595701, + "loss": 4.278, + "step": 8593 + }, + { + "epoch": 0.8910951402704962, + "grad_norm": 0.69140625, + "learning_rate": 0.00015958928659455895, + "loss": 4.2745, + "step": 8594 + }, + { + "epoch": 0.8911988283249843, + "grad_norm": 0.6953125, + "learning_rate": 0.0001595805631264572, + "loss": 4.2686, + "step": 8595 + }, + { + "epoch": 0.8913025163794723, + "grad_norm": 0.6796875, + "learning_rate": 0.0001595718389553678, + "loss": 4.2547, + "step": 8596 + }, + { + "epoch": 0.8914062044339605, + "grad_norm": 0.640625, + "learning_rate": 0.00015956311408139365, + "loss": 4.2825, + "step": 8597 + }, + { + "epoch": 0.8915098924884485, + "grad_norm": 0.66796875, + "learning_rate": 0.00015955438850463776, + "loss": 4.2491, + "step": 8598 + }, + { + "epoch": 0.8916135805429366, + "grad_norm": 0.703125, + "learning_rate": 0.000159545662225203, + "loss": 4.2302, + "step": 8599 + }, + { + "epoch": 0.8917172685974246, + "grad_norm": 0.64453125, + "learning_rate": 0.0001595369352431924, + "loss": 4.2561, + "step": 8600 + }, + { + "epoch": 0.8918209566519127, + "grad_norm": 0.63671875, + "learning_rate": 0.00015952820755870886, + "loss": 4.2302, + "step": 8601 + }, + { + "epoch": 0.8919246447064008, + "grad_norm": 0.64453125, + "learning_rate": 0.0001595194791718554, + "loss": 4.2646, + "step": 8602 + }, + { + "epoch": 0.8920283327608889, + "grad_norm": 0.68359375, + "learning_rate": 0.00015951075008273504, + "loss": 4.2478, + "step": 8603 + }, + { + "epoch": 0.8921320208153769, + "grad_norm": 0.66796875, + "learning_rate": 0.00015950202029145068, + "loss": 4.305, + "step": 8604 + }, + { + "epoch": 0.892235708869865, + "grad_norm": 0.6953125, + "learning_rate": 0.0001594932897981054, + "loss": 4.3149, + "step": 8605 + }, + { + "epoch": 0.892339396924353, + "grad_norm": 0.68359375, + "learning_rate": 0.0001594845586028022, + "loss": 4.2881, + "step": 8606 + }, + { + "epoch": 0.8924430849788412, + "grad_norm": 0.73046875, + "learning_rate": 0.00015947582670564406, + "loss": 4.222, + "step": 8607 + }, + { + "epoch": 0.8925467730333292, + "grad_norm": 0.69921875, + "learning_rate": 0.0001594670941067341, + "loss": 4.2469, + "step": 8608 + }, + { + "epoch": 0.8926504610878173, + "grad_norm": 0.7265625, + "learning_rate": 0.0001594583608061752, + "loss": 4.244, + "step": 8609 + }, + { + "epoch": 0.8927541491423053, + "grad_norm": 0.703125, + "learning_rate": 0.00015944962680407057, + "loss": 4.2382, + "step": 8610 + }, + { + "epoch": 0.8928578371967935, + "grad_norm": 0.8515625, + "learning_rate": 0.00015944089210052312, + "loss": 4.2642, + "step": 8611 + }, + { + "epoch": 0.8929615252512815, + "grad_norm": 0.6328125, + "learning_rate": 0.000159432156695636, + "loss": 4.2569, + "step": 8612 + }, + { + "epoch": 0.8930652133057696, + "grad_norm": 0.81640625, + "learning_rate": 0.00015942342058951225, + "loss": 4.2415, + "step": 8613 + }, + { + "epoch": 0.8931689013602576, + "grad_norm": 0.66015625, + "learning_rate": 0.00015941468378225497, + "loss": 4.2533, + "step": 8614 + }, + { + "epoch": 0.8932725894147457, + "grad_norm": 0.87109375, + "learning_rate": 0.00015940594627396722, + "loss": 4.2619, + "step": 8615 + }, + { + "epoch": 0.8933762774692338, + "grad_norm": 0.68359375, + "learning_rate": 0.0001593972080647521, + "loss": 4.2925, + "step": 8616 + }, + { + "epoch": 0.8934799655237219, + "grad_norm": 0.84765625, + "learning_rate": 0.0001593884691547127, + "loss": 4.2796, + "step": 8617 + }, + { + "epoch": 0.8935836535782099, + "grad_norm": 0.66015625, + "learning_rate": 0.00015937972954395213, + "loss": 4.3201, + "step": 8618 + }, + { + "epoch": 0.893687341632698, + "grad_norm": 0.72265625, + "learning_rate": 0.00015937098923257353, + "loss": 4.2301, + "step": 8619 + }, + { + "epoch": 0.8937910296871862, + "grad_norm": 0.7578125, + "learning_rate": 0.00015936224822068, + "loss": 4.3146, + "step": 8620 + }, + { + "epoch": 0.8938947177416742, + "grad_norm": 0.71875, + "learning_rate": 0.0001593535065083747, + "loss": 4.2895, + "step": 8621 + }, + { + "epoch": 0.8939984057961623, + "grad_norm": 0.77734375, + "learning_rate": 0.00015934476409576077, + "loss": 4.2772, + "step": 8622 + }, + { + "epoch": 0.8941020938506503, + "grad_norm": 0.6640625, + "learning_rate": 0.00015933602098294137, + "loss": 4.2764, + "step": 8623 + }, + { + "epoch": 0.8942057819051384, + "grad_norm": 0.7578125, + "learning_rate": 0.00015932727717001962, + "loss": 4.2412, + "step": 8624 + }, + { + "epoch": 0.8943094699596265, + "grad_norm": 0.6328125, + "learning_rate": 0.0001593185326570987, + "loss": 4.2683, + "step": 8625 + }, + { + "epoch": 0.8944131580141146, + "grad_norm": 0.70703125, + "learning_rate": 0.00015930978744428182, + "loss": 4.2657, + "step": 8626 + }, + { + "epoch": 0.8945168460686026, + "grad_norm": 0.69140625, + "learning_rate": 0.00015930104153167213, + "loss": 4.2425, + "step": 8627 + }, + { + "epoch": 0.8946205341230907, + "grad_norm": 0.671875, + "learning_rate": 0.00015929229491937286, + "loss": 4.2625, + "step": 8628 + }, + { + "epoch": 0.8947242221775787, + "grad_norm": 0.734375, + "learning_rate": 0.00015928354760748716, + "loss": 4.2919, + "step": 8629 + }, + { + "epoch": 0.8948279102320669, + "grad_norm": 0.68359375, + "learning_rate": 0.00015927479959611826, + "loss": 4.2587, + "step": 8630 + }, + { + "epoch": 0.8949315982865549, + "grad_norm": 0.69921875, + "learning_rate": 0.00015926605088536938, + "loss": 4.2266, + "step": 8631 + }, + { + "epoch": 0.895035286341043, + "grad_norm": 0.6796875, + "learning_rate": 0.00015925730147534374, + "loss": 4.301, + "step": 8632 + }, + { + "epoch": 0.895138974395531, + "grad_norm": 0.69921875, + "learning_rate": 0.0001592485513661446, + "loss": 4.289, + "step": 8633 + }, + { + "epoch": 0.8952426624500192, + "grad_norm": 0.703125, + "learning_rate": 0.00015923980055787518, + "loss": 4.2844, + "step": 8634 + }, + { + "epoch": 0.8953463505045072, + "grad_norm": 0.6640625, + "learning_rate": 0.0001592310490506387, + "loss": 4.2731, + "step": 8635 + }, + { + "epoch": 0.8954500385589953, + "grad_norm": 0.73828125, + "learning_rate": 0.0001592222968445385, + "loss": 4.2905, + "step": 8636 + }, + { + "epoch": 0.8955537266134833, + "grad_norm": 0.69921875, + "learning_rate": 0.00015921354393967779, + "loss": 4.3063, + "step": 8637 + }, + { + "epoch": 0.8956574146679714, + "grad_norm": 0.6484375, + "learning_rate": 0.00015920479033615983, + "loss": 4.2803, + "step": 8638 + }, + { + "epoch": 0.8957611027224595, + "grad_norm": 0.73828125, + "learning_rate": 0.00015919603603408792, + "loss": 4.3242, + "step": 8639 + }, + { + "epoch": 0.8958647907769476, + "grad_norm": 0.62109375, + "learning_rate": 0.00015918728103356538, + "loss": 4.2296, + "step": 8640 + }, + { + "epoch": 0.8959684788314356, + "grad_norm": 0.71484375, + "learning_rate": 0.00015917852533469548, + "loss": 4.3023, + "step": 8641 + }, + { + "epoch": 0.8960721668859237, + "grad_norm": 0.765625, + "learning_rate": 0.00015916976893758154, + "loss": 4.2442, + "step": 8642 + }, + { + "epoch": 0.8961758549404117, + "grad_norm": 0.796875, + "learning_rate": 0.00015916101184232687, + "loss": 4.278, + "step": 8643 + }, + { + "epoch": 0.8962795429948999, + "grad_norm": 0.6640625, + "learning_rate": 0.0001591522540490348, + "loss": 4.2436, + "step": 8644 + }, + { + "epoch": 0.8963832310493879, + "grad_norm": 0.78515625, + "learning_rate": 0.00015914349555780865, + "loss": 4.2421, + "step": 8645 + }, + { + "epoch": 0.896486919103876, + "grad_norm": 0.70703125, + "learning_rate": 0.0001591347363687518, + "loss": 4.3135, + "step": 8646 + }, + { + "epoch": 0.896590607158364, + "grad_norm": 0.7890625, + "learning_rate": 0.00015912597648196753, + "loss": 4.2696, + "step": 8647 + }, + { + "epoch": 0.8966942952128522, + "grad_norm": 0.69921875, + "learning_rate": 0.0001591172158975592, + "loss": 4.2535, + "step": 8648 + }, + { + "epoch": 0.8967979832673402, + "grad_norm": 0.66015625, + "learning_rate": 0.00015910845461563028, + "loss": 4.2705, + "step": 8649 + }, + { + "epoch": 0.8969016713218283, + "grad_norm": 0.75, + "learning_rate": 0.00015909969263628408, + "loss": 4.2643, + "step": 8650 + }, + { + "epoch": 0.8970053593763163, + "grad_norm": 0.6953125, + "learning_rate": 0.00015909092995962393, + "loss": 4.2681, + "step": 8651 + }, + { + "epoch": 0.8971090474308044, + "grad_norm": 0.6875, + "learning_rate": 0.0001590821665857533, + "loss": 4.302, + "step": 8652 + }, + { + "epoch": 0.8972127354852925, + "grad_norm": 0.79296875, + "learning_rate": 0.00015907340251477558, + "loss": 4.2141, + "step": 8653 + }, + { + "epoch": 0.8973164235397806, + "grad_norm": 0.66796875, + "learning_rate": 0.0001590646377467941, + "loss": 4.324, + "step": 8654 + }, + { + "epoch": 0.8974201115942686, + "grad_norm": 0.77734375, + "learning_rate": 0.00015905587228191235, + "loss": 4.2819, + "step": 8655 + }, + { + "epoch": 0.8975237996487567, + "grad_norm": 0.71875, + "learning_rate": 0.00015904710612023372, + "loss": 4.284, + "step": 8656 + }, + { + "epoch": 0.8976274877032447, + "grad_norm": 0.75390625, + "learning_rate": 0.00015903833926186168, + "loss": 4.228, + "step": 8657 + }, + { + "epoch": 0.8977311757577329, + "grad_norm": 0.6796875, + "learning_rate": 0.00015902957170689966, + "loss": 4.2526, + "step": 8658 + }, + { + "epoch": 0.8978348638122209, + "grad_norm": 0.69140625, + "learning_rate": 0.00015902080345545102, + "loss": 4.2731, + "step": 8659 + }, + { + "epoch": 0.897938551866709, + "grad_norm": 0.6953125, + "learning_rate": 0.00015901203450761932, + "loss": 4.303, + "step": 8660 + }, + { + "epoch": 0.898042239921197, + "grad_norm": 0.7578125, + "learning_rate": 0.000159003264863508, + "loss": 4.3071, + "step": 8661 + }, + { + "epoch": 0.8981459279756852, + "grad_norm": 0.72265625, + "learning_rate": 0.00015899449452322055, + "loss": 4.2771, + "step": 8662 + }, + { + "epoch": 0.8982496160301732, + "grad_norm": 0.734375, + "learning_rate": 0.00015898572348686037, + "loss": 4.3085, + "step": 8663 + }, + { + "epoch": 0.8983533040846613, + "grad_norm": 0.8203125, + "learning_rate": 0.00015897695175453105, + "loss": 4.2636, + "step": 8664 + }, + { + "epoch": 0.8984569921391494, + "grad_norm": 0.6953125, + "learning_rate": 0.000158968179326336, + "loss": 4.2446, + "step": 8665 + }, + { + "epoch": 0.8985606801936374, + "grad_norm": 0.82421875, + "learning_rate": 0.0001589594062023788, + "loss": 4.2981, + "step": 8666 + }, + { + "epoch": 0.8986643682481256, + "grad_norm": 0.69140625, + "learning_rate": 0.00015895063238276292, + "loss": 4.2697, + "step": 8667 + }, + { + "epoch": 0.8987680563026136, + "grad_norm": 0.73828125, + "learning_rate": 0.00015894185786759189, + "loss": 4.2807, + "step": 8668 + }, + { + "epoch": 0.8988717443571017, + "grad_norm": 0.6796875, + "learning_rate": 0.00015893308265696923, + "loss": 4.2924, + "step": 8669 + }, + { + "epoch": 0.8989754324115897, + "grad_norm": 0.8359375, + "learning_rate": 0.0001589243067509985, + "loss": 4.2421, + "step": 8670 + }, + { + "epoch": 0.8990791204660779, + "grad_norm": 0.671875, + "learning_rate": 0.00015891553014978324, + "loss": 4.2539, + "step": 8671 + }, + { + "epoch": 0.8991828085205659, + "grad_norm": 0.76953125, + "learning_rate": 0.000158906752853427, + "loss": 4.2523, + "step": 8672 + }, + { + "epoch": 0.899286496575054, + "grad_norm": 0.6796875, + "learning_rate": 0.00015889797486203332, + "loss": 4.2962, + "step": 8673 + }, + { + "epoch": 0.899390184629542, + "grad_norm": 0.78515625, + "learning_rate": 0.00015888919617570584, + "loss": 4.2683, + "step": 8674 + }, + { + "epoch": 0.8994938726840301, + "grad_norm": 0.7265625, + "learning_rate": 0.00015888041679454806, + "loss": 4.2513, + "step": 8675 + }, + { + "epoch": 0.8995975607385182, + "grad_norm": 0.80859375, + "learning_rate": 0.0001588716367186636, + "loss": 4.2272, + "step": 8676 + }, + { + "epoch": 0.8997012487930063, + "grad_norm": 0.6875, + "learning_rate": 0.00015886285594815606, + "loss": 4.2718, + "step": 8677 + }, + { + "epoch": 0.8998049368474943, + "grad_norm": 0.76953125, + "learning_rate": 0.00015885407448312905, + "loss": 4.2413, + "step": 8678 + }, + { + "epoch": 0.8999086249019824, + "grad_norm": 0.69921875, + "learning_rate": 0.0001588452923236862, + "loss": 4.304, + "step": 8679 + }, + { + "epoch": 0.9000123129564704, + "grad_norm": 0.75, + "learning_rate": 0.00015883650946993104, + "loss": 4.2845, + "step": 8680 + }, + { + "epoch": 0.9001160010109586, + "grad_norm": 0.71484375, + "learning_rate": 0.0001588277259219673, + "loss": 4.2831, + "step": 8681 + }, + { + "epoch": 0.9002196890654466, + "grad_norm": 0.734375, + "learning_rate": 0.00015881894167989855, + "loss": 4.2684, + "step": 8682 + }, + { + "epoch": 0.9003233771199347, + "grad_norm": 0.703125, + "learning_rate": 0.0001588101567438285, + "loss": 4.2956, + "step": 8683 + }, + { + "epoch": 0.9004270651744227, + "grad_norm": 0.80859375, + "learning_rate": 0.00015880137111386075, + "loss": 4.2602, + "step": 8684 + }, + { + "epoch": 0.9005307532289109, + "grad_norm": 0.7109375, + "learning_rate": 0.00015879258479009896, + "loss": 4.257, + "step": 8685 + }, + { + "epoch": 0.9006344412833989, + "grad_norm": 0.765625, + "learning_rate": 0.0001587837977726468, + "loss": 4.2761, + "step": 8686 + }, + { + "epoch": 0.900738129337887, + "grad_norm": 0.75390625, + "learning_rate": 0.00015877501006160804, + "loss": 4.2571, + "step": 8687 + }, + { + "epoch": 0.900841817392375, + "grad_norm": 0.76171875, + "learning_rate": 0.00015876622165708623, + "loss": 4.2385, + "step": 8688 + }, + { + "epoch": 0.9009455054468631, + "grad_norm": 0.7890625, + "learning_rate": 0.00015875743255918512, + "loss": 4.2775, + "step": 8689 + }, + { + "epoch": 0.9010491935013512, + "grad_norm": 0.80078125, + "learning_rate": 0.00015874864276800845, + "loss": 4.27, + "step": 8690 + }, + { + "epoch": 0.9011528815558393, + "grad_norm": 0.81640625, + "learning_rate": 0.00015873985228365988, + "loss": 4.2789, + "step": 8691 + }, + { + "epoch": 0.9012565696103273, + "grad_norm": 0.75390625, + "learning_rate": 0.00015873106110624315, + "loss": 4.3142, + "step": 8692 + }, + { + "epoch": 0.9013602576648154, + "grad_norm": 0.80078125, + "learning_rate": 0.00015872226923586196, + "loss": 4.2239, + "step": 8693 + }, + { + "epoch": 0.9014639457193034, + "grad_norm": 0.71875, + "learning_rate": 0.00015871347667262007, + "loss": 4.2556, + "step": 8694 + }, + { + "epoch": 0.9015676337737916, + "grad_norm": 0.74609375, + "learning_rate": 0.00015870468341662124, + "loss": 4.3006, + "step": 8695 + }, + { + "epoch": 0.9016713218282796, + "grad_norm": 0.76171875, + "learning_rate": 0.00015869588946796918, + "loss": 4.2797, + "step": 8696 + }, + { + "epoch": 0.9017750098827677, + "grad_norm": 0.86328125, + "learning_rate": 0.00015868709482676766, + "loss": 4.3279, + "step": 8697 + }, + { + "epoch": 0.9018786979372557, + "grad_norm": 0.80859375, + "learning_rate": 0.00015867829949312045, + "loss": 4.3145, + "step": 8698 + }, + { + "epoch": 0.9019823859917439, + "grad_norm": 0.8203125, + "learning_rate": 0.00015866950346713136, + "loss": 4.3113, + "step": 8699 + }, + { + "epoch": 0.9020860740462319, + "grad_norm": 0.828125, + "learning_rate": 0.00015866070674890414, + "loss": 4.2536, + "step": 8700 + }, + { + "epoch": 0.90218976210072, + "grad_norm": 0.734375, + "learning_rate": 0.00015865190933854259, + "loss": 4.2619, + "step": 8701 + }, + { + "epoch": 0.902293450155208, + "grad_norm": 0.71875, + "learning_rate": 0.0001586431112361505, + "loss": 4.2475, + "step": 8702 + }, + { + "epoch": 0.9023971382096961, + "grad_norm": 0.74609375, + "learning_rate": 0.00015863431244183168, + "loss": 4.2817, + "step": 8703 + }, + { + "epoch": 0.9025008262641842, + "grad_norm": 0.75, + "learning_rate": 0.00015862551295568996, + "loss": 4.2685, + "step": 8704 + }, + { + "epoch": 0.9026045143186723, + "grad_norm": 0.6796875, + "learning_rate": 0.00015861671277782918, + "loss": 4.2415, + "step": 8705 + }, + { + "epoch": 0.9027082023731603, + "grad_norm": 0.85546875, + "learning_rate": 0.0001586079119083531, + "loss": 4.2506, + "step": 8706 + }, + { + "epoch": 0.9028118904276484, + "grad_norm": 0.69140625, + "learning_rate": 0.00015859911034736562, + "loss": 4.2706, + "step": 8707 + }, + { + "epoch": 0.9029155784821364, + "grad_norm": 0.83984375, + "learning_rate": 0.0001585903080949706, + "loss": 4.2659, + "step": 8708 + }, + { + "epoch": 0.9030192665366246, + "grad_norm": 0.6953125, + "learning_rate": 0.0001585815051512719, + "loss": 4.2601, + "step": 8709 + }, + { + "epoch": 0.9031229545911127, + "grad_norm": 0.76171875, + "learning_rate": 0.0001585727015163733, + "loss": 4.2822, + "step": 8710 + }, + { + "epoch": 0.9032266426456007, + "grad_norm": 0.73046875, + "learning_rate": 0.00015856389719037876, + "loss": 4.2895, + "step": 8711 + }, + { + "epoch": 0.9033303307000888, + "grad_norm": 0.6953125, + "learning_rate": 0.00015855509217339219, + "loss": 4.2873, + "step": 8712 + }, + { + "epoch": 0.9034340187545769, + "grad_norm": 0.71875, + "learning_rate": 0.00015854628646551737, + "loss": 4.2613, + "step": 8713 + }, + { + "epoch": 0.903537706809065, + "grad_norm": 0.69140625, + "learning_rate": 0.0001585374800668583, + "loss": 4.2723, + "step": 8714 + }, + { + "epoch": 0.903641394863553, + "grad_norm": 0.72265625, + "learning_rate": 0.00015852867297751877, + "loss": 4.2495, + "step": 8715 + }, + { + "epoch": 0.9037450829180411, + "grad_norm": 0.7421875, + "learning_rate": 0.00015851986519760286, + "loss": 4.2634, + "step": 8716 + }, + { + "epoch": 0.9038487709725291, + "grad_norm": 0.7890625, + "learning_rate": 0.00015851105672721433, + "loss": 4.2665, + "step": 8717 + }, + { + "epoch": 0.9039524590270173, + "grad_norm": 0.7265625, + "learning_rate": 0.00015850224756645717, + "loss": 4.3059, + "step": 8718 + }, + { + "epoch": 0.9040561470815053, + "grad_norm": 0.79296875, + "learning_rate": 0.00015849343771543538, + "loss": 4.2917, + "step": 8719 + }, + { + "epoch": 0.9041598351359934, + "grad_norm": 0.76953125, + "learning_rate": 0.00015848462717425282, + "loss": 4.2838, + "step": 8720 + }, + { + "epoch": 0.9042635231904814, + "grad_norm": 0.9765625, + "learning_rate": 0.0001584758159430135, + "loss": 4.2943, + "step": 8721 + }, + { + "epoch": 0.9043672112449695, + "grad_norm": 0.76953125, + "learning_rate": 0.00015846700402182134, + "loss": 4.267, + "step": 8722 + }, + { + "epoch": 0.9044708992994576, + "grad_norm": 0.83984375, + "learning_rate": 0.00015845819141078037, + "loss": 4.2886, + "step": 8723 + }, + { + "epoch": 0.9045745873539457, + "grad_norm": 0.80859375, + "learning_rate": 0.00015844937810999453, + "loss": 4.2663, + "step": 8724 + }, + { + "epoch": 0.9046782754084337, + "grad_norm": 0.78125, + "learning_rate": 0.00015844056411956777, + "loss": 4.2901, + "step": 8725 + }, + { + "epoch": 0.9047819634629218, + "grad_norm": 0.75, + "learning_rate": 0.00015843174943960418, + "loss": 4.3164, + "step": 8726 + }, + { + "epoch": 0.9048856515174099, + "grad_norm": 0.7734375, + "learning_rate": 0.00015842293407020765, + "loss": 4.2537, + "step": 8727 + }, + { + "epoch": 0.904989339571898, + "grad_norm": 0.73046875, + "learning_rate": 0.00015841411801148232, + "loss": 4.27, + "step": 8728 + }, + { + "epoch": 0.905093027626386, + "grad_norm": 0.82421875, + "learning_rate": 0.00015840530126353213, + "loss": 4.2543, + "step": 8729 + }, + { + "epoch": 0.9051967156808741, + "grad_norm": 0.71875, + "learning_rate": 0.0001583964838264611, + "loss": 4.2713, + "step": 8730 + }, + { + "epoch": 0.9053004037353621, + "grad_norm": 0.734375, + "learning_rate": 0.00015838766570037326, + "loss": 4.2698, + "step": 8731 + }, + { + "epoch": 0.9054040917898503, + "grad_norm": 0.76953125, + "learning_rate": 0.0001583788468853727, + "loss": 4.2757, + "step": 8732 + }, + { + "epoch": 0.9055077798443383, + "grad_norm": 0.7890625, + "learning_rate": 0.0001583700273815635, + "loss": 4.291, + "step": 8733 + }, + { + "epoch": 0.9056114678988264, + "grad_norm": 0.77734375, + "learning_rate": 0.00015836120718904967, + "loss": 4.2981, + "step": 8734 + }, + { + "epoch": 0.9057151559533144, + "grad_norm": 0.765625, + "learning_rate": 0.00015835238630793524, + "loss": 4.3023, + "step": 8735 + }, + { + "epoch": 0.9058188440078025, + "grad_norm": 0.76953125, + "learning_rate": 0.00015834356473832438, + "loss": 4.2939, + "step": 8736 + }, + { + "epoch": 0.9059225320622906, + "grad_norm": 0.76953125, + "learning_rate": 0.00015833474248032111, + "loss": 4.2536, + "step": 8737 + }, + { + "epoch": 0.9060262201167787, + "grad_norm": 0.765625, + "learning_rate": 0.0001583259195340295, + "loss": 4.2877, + "step": 8738 + }, + { + "epoch": 0.9061299081712667, + "grad_norm": 0.7421875, + "learning_rate": 0.00015831709589955376, + "loss": 4.2692, + "step": 8739 + }, + { + "epoch": 0.9062335962257548, + "grad_norm": 0.7734375, + "learning_rate": 0.0001583082715769979, + "loss": 4.2686, + "step": 8740 + }, + { + "epoch": 0.9063372842802428, + "grad_norm": 0.7578125, + "learning_rate": 0.00015829944656646608, + "loss": 4.2784, + "step": 8741 + }, + { + "epoch": 0.906440972334731, + "grad_norm": 0.8515625, + "learning_rate": 0.0001582906208680624, + "loss": 4.2617, + "step": 8742 + }, + { + "epoch": 0.906544660389219, + "grad_norm": 0.73828125, + "learning_rate": 0.00015828179448189103, + "loss": 4.2634, + "step": 8743 + }, + { + "epoch": 0.9066483484437071, + "grad_norm": 0.828125, + "learning_rate": 0.00015827296740805608, + "loss": 4.2801, + "step": 8744 + }, + { + "epoch": 0.9067520364981951, + "grad_norm": 0.703125, + "learning_rate": 0.00015826413964666172, + "loss": 4.2969, + "step": 8745 + }, + { + "epoch": 0.9068557245526833, + "grad_norm": 0.8046875, + "learning_rate": 0.0001582553111978121, + "loss": 4.2976, + "step": 8746 + }, + { + "epoch": 0.9069594126071713, + "grad_norm": 0.6953125, + "learning_rate": 0.00015824648206161138, + "loss": 4.2384, + "step": 8747 + }, + { + "epoch": 0.9070631006616594, + "grad_norm": 0.7265625, + "learning_rate": 0.00015823765223816372, + "loss": 4.2754, + "step": 8748 + }, + { + "epoch": 0.9071667887161474, + "grad_norm": 0.75, + "learning_rate": 0.00015822882172757333, + "loss": 4.3017, + "step": 8749 + }, + { + "epoch": 0.9072704767706355, + "grad_norm": 0.7578125, + "learning_rate": 0.00015821999052994441, + "loss": 4.2288, + "step": 8750 + }, + { + "epoch": 0.9073741648251236, + "grad_norm": 0.78515625, + "learning_rate": 0.00015821115864538113, + "loss": 4.2528, + "step": 8751 + }, + { + "epoch": 0.9074778528796117, + "grad_norm": 0.7578125, + "learning_rate": 0.00015820232607398772, + "loss": 4.2473, + "step": 8752 + }, + { + "epoch": 0.9075815409340997, + "grad_norm": 0.78515625, + "learning_rate": 0.0001581934928158684, + "loss": 4.2968, + "step": 8753 + }, + { + "epoch": 0.9076852289885878, + "grad_norm": 0.69140625, + "learning_rate": 0.00015818465887112738, + "loss": 4.24, + "step": 8754 + }, + { + "epoch": 0.907788917043076, + "grad_norm": 0.73828125, + "learning_rate": 0.00015817582423986886, + "loss": 4.2909, + "step": 8755 + }, + { + "epoch": 0.907892605097564, + "grad_norm": 0.71484375, + "learning_rate": 0.0001581669889221971, + "loss": 4.2621, + "step": 8756 + }, + { + "epoch": 0.9079962931520521, + "grad_norm": 0.75, + "learning_rate": 0.00015815815291821638, + "loss": 4.3012, + "step": 8757 + }, + { + "epoch": 0.9080999812065401, + "grad_norm": 0.703125, + "learning_rate": 0.00015814931622803094, + "loss": 4.2564, + "step": 8758 + }, + { + "epoch": 0.9082036692610282, + "grad_norm": 0.6875, + "learning_rate": 0.000158140478851745, + "loss": 4.299, + "step": 8759 + }, + { + "epoch": 0.9083073573155163, + "grad_norm": 0.7734375, + "learning_rate": 0.0001581316407894629, + "loss": 4.2498, + "step": 8760 + }, + { + "epoch": 0.9084110453700044, + "grad_norm": 0.71875, + "learning_rate": 0.00015812280204128887, + "loss": 4.2857, + "step": 8761 + }, + { + "epoch": 0.9085147334244924, + "grad_norm": 0.78515625, + "learning_rate": 0.00015811396260732722, + "loss": 4.2647, + "step": 8762 + }, + { + "epoch": 0.9086184214789805, + "grad_norm": 0.75, + "learning_rate": 0.00015810512248768226, + "loss": 4.2407, + "step": 8763 + }, + { + "epoch": 0.9087221095334685, + "grad_norm": 0.69921875, + "learning_rate": 0.00015809628168245821, + "loss": 4.2563, + "step": 8764 + }, + { + "epoch": 0.9088257975879567, + "grad_norm": 0.6640625, + "learning_rate": 0.00015808744019175951, + "loss": 4.2673, + "step": 8765 + }, + { + "epoch": 0.9089294856424447, + "grad_norm": 0.72265625, + "learning_rate": 0.00015807859801569043, + "loss": 4.2512, + "step": 8766 + }, + { + "epoch": 0.9090331736969328, + "grad_norm": 0.609375, + "learning_rate": 0.00015806975515435522, + "loss": 4.3002, + "step": 8767 + }, + { + "epoch": 0.9091368617514208, + "grad_norm": 0.8046875, + "learning_rate": 0.0001580609116078583, + "loss": 4.2859, + "step": 8768 + }, + { + "epoch": 0.909240549805909, + "grad_norm": 0.66015625, + "learning_rate": 0.00015805206737630402, + "loss": 4.2534, + "step": 8769 + }, + { + "epoch": 0.909344237860397, + "grad_norm": 0.67578125, + "learning_rate": 0.0001580432224597967, + "loss": 4.3108, + "step": 8770 + }, + { + "epoch": 0.9094479259148851, + "grad_norm": 0.6875, + "learning_rate": 0.00015803437685844073, + "loss": 4.2701, + "step": 8771 + }, + { + "epoch": 0.9095516139693731, + "grad_norm": 0.69140625, + "learning_rate": 0.0001580255305723404, + "loss": 4.2773, + "step": 8772 + }, + { + "epoch": 0.9096553020238612, + "grad_norm": 0.671875, + "learning_rate": 0.0001580166836016002, + "loss": 4.2535, + "step": 8773 + }, + { + "epoch": 0.9097589900783493, + "grad_norm": 0.68359375, + "learning_rate": 0.00015800783594632444, + "loss": 4.2504, + "step": 8774 + }, + { + "epoch": 0.9098626781328374, + "grad_norm": 0.640625, + "learning_rate": 0.00015799898760661757, + "loss": 4.2086, + "step": 8775 + }, + { + "epoch": 0.9099663661873254, + "grad_norm": 0.7109375, + "learning_rate": 0.00015799013858258388, + "loss": 4.2892, + "step": 8776 + }, + { + "epoch": 0.9100700542418135, + "grad_norm": 0.6640625, + "learning_rate": 0.0001579812888743279, + "loss": 4.2683, + "step": 8777 + }, + { + "epoch": 0.9101737422963015, + "grad_norm": 0.765625, + "learning_rate": 0.000157972438481954, + "loss": 4.2804, + "step": 8778 + }, + { + "epoch": 0.9102774303507897, + "grad_norm": 0.68359375, + "learning_rate": 0.00015796358740556658, + "loss": 4.2349, + "step": 8779 + }, + { + "epoch": 0.9103811184052777, + "grad_norm": 0.79296875, + "learning_rate": 0.00015795473564527012, + "loss": 4.2952, + "step": 8780 + }, + { + "epoch": 0.9104848064597658, + "grad_norm": 0.68359375, + "learning_rate": 0.00015794588320116904, + "loss": 4.3084, + "step": 8781 + }, + { + "epoch": 0.9105884945142538, + "grad_norm": 0.8125, + "learning_rate": 0.0001579370300733678, + "loss": 4.2992, + "step": 8782 + }, + { + "epoch": 0.910692182568742, + "grad_norm": 0.69921875, + "learning_rate": 0.00015792817626197084, + "loss": 4.2066, + "step": 8783 + }, + { + "epoch": 0.91079587062323, + "grad_norm": 0.83984375, + "learning_rate": 0.0001579193217670826, + "loss": 4.2721, + "step": 8784 + }, + { + "epoch": 0.9108995586777181, + "grad_norm": 0.73046875, + "learning_rate": 0.00015791046658880759, + "loss": 4.3195, + "step": 8785 + }, + { + "epoch": 0.9110032467322061, + "grad_norm": 0.84765625, + "learning_rate": 0.00015790161072725033, + "loss": 4.2495, + "step": 8786 + }, + { + "epoch": 0.9111069347866942, + "grad_norm": 0.79296875, + "learning_rate": 0.00015789275418251527, + "loss": 4.2673, + "step": 8787 + }, + { + "epoch": 0.9112106228411823, + "grad_norm": 0.78515625, + "learning_rate": 0.00015788389695470687, + "loss": 4.2598, + "step": 8788 + }, + { + "epoch": 0.9113143108956704, + "grad_norm": 0.79296875, + "learning_rate": 0.00015787503904392969, + "loss": 4.2678, + "step": 8789 + }, + { + "epoch": 0.9114179989501584, + "grad_norm": 0.78125, + "learning_rate": 0.0001578661804502882, + "loss": 4.2586, + "step": 8790 + }, + { + "epoch": 0.9115216870046465, + "grad_norm": 0.6875, + "learning_rate": 0.00015785732117388698, + "loss": 4.2912, + "step": 8791 + }, + { + "epoch": 0.9116253750591345, + "grad_norm": 0.73046875, + "learning_rate": 0.00015784846121483053, + "loss": 4.3045, + "step": 8792 + }, + { + "epoch": 0.9117290631136227, + "grad_norm": 0.69921875, + "learning_rate": 0.00015783960057322335, + "loss": 4.2654, + "step": 8793 + }, + { + "epoch": 0.9118327511681107, + "grad_norm": 0.75, + "learning_rate": 0.00015783073924917004, + "loss": 4.2305, + "step": 8794 + }, + { + "epoch": 0.9119364392225988, + "grad_norm": 0.72265625, + "learning_rate": 0.00015782187724277519, + "loss": 4.2843, + "step": 8795 + }, + { + "epoch": 0.9120401272770868, + "grad_norm": 0.78125, + "learning_rate": 0.00015781301455414324, + "loss": 4.2485, + "step": 8796 + }, + { + "epoch": 0.912143815331575, + "grad_norm": 0.66796875, + "learning_rate": 0.00015780415118337885, + "loss": 4.2562, + "step": 8797 + }, + { + "epoch": 0.912247503386063, + "grad_norm": 0.76953125, + "learning_rate": 0.00015779528713058655, + "loss": 4.2176, + "step": 8798 + }, + { + "epoch": 0.9123511914405511, + "grad_norm": 0.72265625, + "learning_rate": 0.000157786422395871, + "loss": 4.3066, + "step": 8799 + }, + { + "epoch": 0.9124548794950392, + "grad_norm": 0.7734375, + "learning_rate": 0.00015777755697933673, + "loss": 4.2327, + "step": 8800 + }, + { + "epoch": 0.9125585675495272, + "grad_norm": 0.7421875, + "learning_rate": 0.00015776869088108834, + "loss": 4.2274, + "step": 8801 + }, + { + "epoch": 0.9126622556040154, + "grad_norm": 0.71484375, + "learning_rate": 0.0001577598241012305, + "loss": 4.2839, + "step": 8802 + }, + { + "epoch": 0.9127659436585034, + "grad_norm": 0.74609375, + "learning_rate": 0.00015775095663986778, + "loss": 4.2804, + "step": 8803 + }, + { + "epoch": 0.9128696317129915, + "grad_norm": 0.66796875, + "learning_rate": 0.00015774208849710482, + "loss": 4.299, + "step": 8804 + }, + { + "epoch": 0.9129733197674795, + "grad_norm": 0.7578125, + "learning_rate": 0.00015773321967304624, + "loss": 4.272, + "step": 8805 + }, + { + "epoch": 0.9130770078219677, + "grad_norm": 0.71484375, + "learning_rate": 0.0001577243501677967, + "loss": 4.2984, + "step": 8806 + }, + { + "epoch": 0.9131806958764557, + "grad_norm": 0.75, + "learning_rate": 0.00015771547998146086, + "loss": 4.2805, + "step": 8807 + }, + { + "epoch": 0.9132843839309438, + "grad_norm": 0.73046875, + "learning_rate": 0.00015770660911414332, + "loss": 4.2923, + "step": 8808 + }, + { + "epoch": 0.9133880719854318, + "grad_norm": 0.65234375, + "learning_rate": 0.00015769773756594885, + "loss": 4.2588, + "step": 8809 + }, + { + "epoch": 0.9134917600399199, + "grad_norm": 0.7421875, + "learning_rate": 0.000157688865336982, + "loss": 4.2737, + "step": 8810 + }, + { + "epoch": 0.913595448094408, + "grad_norm": 0.67578125, + "learning_rate": 0.00015767999242734756, + "loss": 4.2774, + "step": 8811 + }, + { + "epoch": 0.9136991361488961, + "grad_norm": 0.796875, + "learning_rate": 0.00015767111883715018, + "loss": 4.2914, + "step": 8812 + }, + { + "epoch": 0.9138028242033841, + "grad_norm": 0.68359375, + "learning_rate": 0.00015766224456649453, + "loss": 4.2476, + "step": 8813 + }, + { + "epoch": 0.9139065122578722, + "grad_norm": 0.8515625, + "learning_rate": 0.00015765336961548538, + "loss": 4.2929, + "step": 8814 + }, + { + "epoch": 0.9140102003123602, + "grad_norm": 0.73046875, + "learning_rate": 0.00015764449398422738, + "loss": 4.2975, + "step": 8815 + }, + { + "epoch": 0.9141138883668484, + "grad_norm": 0.79296875, + "learning_rate": 0.00015763561767282534, + "loss": 4.2574, + "step": 8816 + }, + { + "epoch": 0.9142175764213364, + "grad_norm": 0.75390625, + "learning_rate": 0.0001576267406813839, + "loss": 4.2037, + "step": 8817 + }, + { + "epoch": 0.9143212644758245, + "grad_norm": 0.81640625, + "learning_rate": 0.0001576178630100078, + "loss": 4.2451, + "step": 8818 + }, + { + "epoch": 0.9144249525303125, + "grad_norm": 0.78125, + "learning_rate": 0.00015760898465880184, + "loss": 4.2579, + "step": 8819 + }, + { + "epoch": 0.9145286405848007, + "grad_norm": 0.8203125, + "learning_rate": 0.00015760010562787077, + "loss": 4.289, + "step": 8820 + }, + { + "epoch": 0.9146323286392887, + "grad_norm": 0.6953125, + "learning_rate": 0.00015759122591731936, + "loss": 4.2257, + "step": 8821 + }, + { + "epoch": 0.9147360166937768, + "grad_norm": 0.671875, + "learning_rate": 0.00015758234552725234, + "loss": 4.224, + "step": 8822 + }, + { + "epoch": 0.9148397047482648, + "grad_norm": 0.734375, + "learning_rate": 0.00015757346445777448, + "loss": 4.2634, + "step": 8823 + }, + { + "epoch": 0.9149433928027529, + "grad_norm": 0.75390625, + "learning_rate": 0.00015756458270899066, + "loss": 4.3201, + "step": 8824 + }, + { + "epoch": 0.915047080857241, + "grad_norm": 0.72265625, + "learning_rate": 0.00015755570028100558, + "loss": 4.2593, + "step": 8825 + }, + { + "epoch": 0.9151507689117291, + "grad_norm": 0.70703125, + "learning_rate": 0.00015754681717392407, + "loss": 4.2888, + "step": 8826 + }, + { + "epoch": 0.9152544569662171, + "grad_norm": 0.71875, + "learning_rate": 0.00015753793338785092, + "loss": 4.2838, + "step": 8827 + }, + { + "epoch": 0.9153581450207052, + "grad_norm": 0.79296875, + "learning_rate": 0.00015752904892289102, + "loss": 4.2517, + "step": 8828 + }, + { + "epoch": 0.9154618330751932, + "grad_norm": 0.6953125, + "learning_rate": 0.00015752016377914915, + "loss": 4.3019, + "step": 8829 + }, + { + "epoch": 0.9155655211296814, + "grad_norm": 0.76953125, + "learning_rate": 0.00015751127795673013, + "loss": 4.283, + "step": 8830 + }, + { + "epoch": 0.9156692091841694, + "grad_norm": 0.65625, + "learning_rate": 0.00015750239145573885, + "loss": 4.2745, + "step": 8831 + }, + { + "epoch": 0.9157728972386575, + "grad_norm": 0.75390625, + "learning_rate": 0.00015749350427628008, + "loss": 4.3084, + "step": 8832 + }, + { + "epoch": 0.9158765852931455, + "grad_norm": 0.66015625, + "learning_rate": 0.00015748461641845878, + "loss": 4.2416, + "step": 8833 + }, + { + "epoch": 0.9159802733476337, + "grad_norm": 0.734375, + "learning_rate": 0.00015747572788237977, + "loss": 4.2328, + "step": 8834 + }, + { + "epoch": 0.9160839614021217, + "grad_norm": 0.78515625, + "learning_rate": 0.0001574668386681479, + "loss": 4.2722, + "step": 8835 + }, + { + "epoch": 0.9161876494566098, + "grad_norm": 0.80859375, + "learning_rate": 0.0001574579487758681, + "loss": 4.2735, + "step": 8836 + }, + { + "epoch": 0.9162913375110978, + "grad_norm": 0.78515625, + "learning_rate": 0.00015744905820564524, + "loss": 4.2824, + "step": 8837 + }, + { + "epoch": 0.9163950255655859, + "grad_norm": 0.69921875, + "learning_rate": 0.0001574401669575842, + "loss": 4.2671, + "step": 8838 + }, + { + "epoch": 0.916498713620074, + "grad_norm": 0.80078125, + "learning_rate": 0.00015743127503178994, + "loss": 4.2662, + "step": 8839 + }, + { + "epoch": 0.9166024016745621, + "grad_norm": 0.69921875, + "learning_rate": 0.0001574223824283673, + "loss": 4.2068, + "step": 8840 + }, + { + "epoch": 0.9167060897290501, + "grad_norm": 0.703125, + "learning_rate": 0.00015741348914742126, + "loss": 4.2622, + "step": 8841 + }, + { + "epoch": 0.9168097777835382, + "grad_norm": 0.73828125, + "learning_rate": 0.00015740459518905677, + "loss": 4.3, + "step": 8842 + }, + { + "epoch": 0.9169134658380262, + "grad_norm": 0.64453125, + "learning_rate": 0.0001573957005533787, + "loss": 4.2615, + "step": 8843 + }, + { + "epoch": 0.9170171538925144, + "grad_norm": 0.71484375, + "learning_rate": 0.00015738680524049202, + "loss": 4.2809, + "step": 8844 + }, + { + "epoch": 0.9171208419470025, + "grad_norm": 0.69140625, + "learning_rate": 0.00015737790925050173, + "loss": 4.2593, + "step": 8845 + }, + { + "epoch": 0.9172245300014905, + "grad_norm": 0.6875, + "learning_rate": 0.00015736901258351277, + "loss": 4.271, + "step": 8846 + }, + { + "epoch": 0.9173282180559786, + "grad_norm": 0.67578125, + "learning_rate": 0.00015736011523963006, + "loss": 4.2784, + "step": 8847 + }, + { + "epoch": 0.9174319061104667, + "grad_norm": 0.70703125, + "learning_rate": 0.00015735121721895866, + "loss": 4.2922, + "step": 8848 + }, + { + "epoch": 0.9175355941649548, + "grad_norm": 0.6875, + "learning_rate": 0.0001573423185216035, + "loss": 4.236, + "step": 8849 + }, + { + "epoch": 0.9176392822194428, + "grad_norm": 0.73828125, + "learning_rate": 0.00015733341914766962, + "loss": 4.2735, + "step": 8850 + }, + { + "epoch": 0.9177429702739309, + "grad_norm": 0.70703125, + "learning_rate": 0.00015732451909726195, + "loss": 4.2984, + "step": 8851 + }, + { + "epoch": 0.9178466583284189, + "grad_norm": 0.7265625, + "learning_rate": 0.0001573156183704856, + "loss": 4.2864, + "step": 8852 + }, + { + "epoch": 0.9179503463829071, + "grad_norm": 0.6875, + "learning_rate": 0.00015730671696744555, + "loss": 4.2688, + "step": 8853 + }, + { + "epoch": 0.9180540344373951, + "grad_norm": 0.8203125, + "learning_rate": 0.0001572978148882468, + "loss": 4.3005, + "step": 8854 + }, + { + "epoch": 0.9181577224918832, + "grad_norm": 0.71875, + "learning_rate": 0.00015728891213299435, + "loss": 4.2281, + "step": 8855 + }, + { + "epoch": 0.9182614105463712, + "grad_norm": 0.7265625, + "learning_rate": 0.0001572800087017933, + "loss": 4.2397, + "step": 8856 + }, + { + "epoch": 0.9183650986008594, + "grad_norm": 0.6953125, + "learning_rate": 0.00015727110459474878, + "loss": 4.1826, + "step": 8857 + }, + { + "epoch": 0.9184687866553474, + "grad_norm": 0.70703125, + "learning_rate": 0.00015726219981196567, + "loss": 4.2488, + "step": 8858 + }, + { + "epoch": 0.9185724747098355, + "grad_norm": 0.80078125, + "learning_rate": 0.00015725329435354918, + "loss": 4.24, + "step": 8859 + }, + { + "epoch": 0.9186761627643235, + "grad_norm": 0.6015625, + "learning_rate": 0.00015724438821960432, + "loss": 4.2979, + "step": 8860 + }, + { + "epoch": 0.9187798508188116, + "grad_norm": 0.734375, + "learning_rate": 0.00015723548141023617, + "loss": 4.3016, + "step": 8861 + }, + { + "epoch": 0.9188835388732997, + "grad_norm": 0.625, + "learning_rate": 0.00015722657392554988, + "loss": 4.274, + "step": 8862 + }, + { + "epoch": 0.9189872269277878, + "grad_norm": 0.71484375, + "learning_rate": 0.00015721766576565048, + "loss": 4.2504, + "step": 8863 + }, + { + "epoch": 0.9190909149822758, + "grad_norm": 0.73046875, + "learning_rate": 0.00015720875693064313, + "loss": 4.2652, + "step": 8864 + }, + { + "epoch": 0.9191946030367639, + "grad_norm": 0.65234375, + "learning_rate": 0.00015719984742063292, + "loss": 4.2431, + "step": 8865 + }, + { + "epoch": 0.9192982910912519, + "grad_norm": 0.7734375, + "learning_rate": 0.00015719093723572496, + "loss": 4.2611, + "step": 8866 + }, + { + "epoch": 0.9194019791457401, + "grad_norm": 0.6484375, + "learning_rate": 0.0001571820263760244, + "loss": 4.2941, + "step": 8867 + }, + { + "epoch": 0.9195056672002281, + "grad_norm": 0.80078125, + "learning_rate": 0.00015717311484163634, + "loss": 4.2652, + "step": 8868 + }, + { + "epoch": 0.9196093552547162, + "grad_norm": 0.6953125, + "learning_rate": 0.000157164202632666, + "loss": 4.2666, + "step": 8869 + }, + { + "epoch": 0.9197130433092042, + "grad_norm": 0.7265625, + "learning_rate": 0.00015715528974921848, + "loss": 4.2376, + "step": 8870 + }, + { + "epoch": 0.9198167313636924, + "grad_norm": 0.73828125, + "learning_rate": 0.00015714637619139898, + "loss": 4.3034, + "step": 8871 + }, + { + "epoch": 0.9199204194181804, + "grad_norm": 0.70703125, + "learning_rate": 0.00015713746195931264, + "loss": 4.2802, + "step": 8872 + }, + { + "epoch": 0.9200241074726685, + "grad_norm": 0.7890625, + "learning_rate": 0.0001571285470530646, + "loss": 4.2629, + "step": 8873 + }, + { + "epoch": 0.9201277955271565, + "grad_norm": 0.7421875, + "learning_rate": 0.00015711963147276015, + "loss": 4.2792, + "step": 8874 + }, + { + "epoch": 0.9202314835816446, + "grad_norm": 0.828125, + "learning_rate": 0.00015711071521850437, + "loss": 4.2617, + "step": 8875 + }, + { + "epoch": 0.9203351716361327, + "grad_norm": 0.71484375, + "learning_rate": 0.00015710179829040257, + "loss": 4.3119, + "step": 8876 + }, + { + "epoch": 0.9204388596906208, + "grad_norm": 0.87109375, + "learning_rate": 0.00015709288068855987, + "loss": 4.3074, + "step": 8877 + }, + { + "epoch": 0.9205425477451088, + "grad_norm": 0.765625, + "learning_rate": 0.00015708396241308155, + "loss": 4.2577, + "step": 8878 + }, + { + "epoch": 0.9206462357995969, + "grad_norm": 0.8671875, + "learning_rate": 0.00015707504346407282, + "loss": 4.2658, + "step": 8879 + }, + { + "epoch": 0.9207499238540849, + "grad_norm": 0.71875, + "learning_rate": 0.00015706612384163888, + "loss": 4.2884, + "step": 8880 + }, + { + "epoch": 0.9208536119085731, + "grad_norm": 0.8046875, + "learning_rate": 0.00015705720354588503, + "loss": 4.2734, + "step": 8881 + }, + { + "epoch": 0.9209572999630611, + "grad_norm": 0.6796875, + "learning_rate": 0.00015704828257691647, + "loss": 4.2624, + "step": 8882 + }, + { + "epoch": 0.9210609880175492, + "grad_norm": 0.80078125, + "learning_rate": 0.00015703936093483848, + "loss": 4.2541, + "step": 8883 + }, + { + "epoch": 0.9211646760720372, + "grad_norm": 0.73046875, + "learning_rate": 0.00015703043861975635, + "loss": 4.2992, + "step": 8884 + }, + { + "epoch": 0.9212683641265254, + "grad_norm": 0.8203125, + "learning_rate": 0.00015702151563177531, + "loss": 4.2204, + "step": 8885 + }, + { + "epoch": 0.9213720521810134, + "grad_norm": 0.75, + "learning_rate": 0.00015701259197100067, + "loss": 4.2905, + "step": 8886 + }, + { + "epoch": 0.9214757402355015, + "grad_norm": 0.79296875, + "learning_rate": 0.00015700366763753772, + "loss": 4.2781, + "step": 8887 + }, + { + "epoch": 0.9215794282899896, + "grad_norm": 0.76953125, + "learning_rate": 0.0001569947426314917, + "loss": 4.2797, + "step": 8888 + }, + { + "epoch": 0.9216831163444776, + "grad_norm": 0.79296875, + "learning_rate": 0.00015698581695296798, + "loss": 4.2928, + "step": 8889 + }, + { + "epoch": 0.9217868043989658, + "grad_norm": 0.76953125, + "learning_rate": 0.00015697689060207188, + "loss": 4.2258, + "step": 8890 + }, + { + "epoch": 0.9218904924534538, + "grad_norm": 0.7734375, + "learning_rate": 0.00015696796357890868, + "loss": 4.2968, + "step": 8891 + }, + { + "epoch": 0.9219941805079419, + "grad_norm": 0.8203125, + "learning_rate": 0.00015695903588358373, + "loss": 4.2534, + "step": 8892 + }, + { + "epoch": 0.9220978685624299, + "grad_norm": 0.8046875, + "learning_rate": 0.00015695010751620237, + "loss": 4.2481, + "step": 8893 + }, + { + "epoch": 0.922201556616918, + "grad_norm": 0.6796875, + "learning_rate": 0.0001569411784768699, + "loss": 4.2957, + "step": 8894 + }, + { + "epoch": 0.9223052446714061, + "grad_norm": 0.78515625, + "learning_rate": 0.00015693224876569178, + "loss": 4.2965, + "step": 8895 + }, + { + "epoch": 0.9224089327258942, + "grad_norm": 0.63671875, + "learning_rate": 0.00015692331838277327, + "loss": 4.2929, + "step": 8896 + }, + { + "epoch": 0.9225126207803822, + "grad_norm": 0.7421875, + "learning_rate": 0.00015691438732821979, + "loss": 4.3019, + "step": 8897 + }, + { + "epoch": 0.9226163088348703, + "grad_norm": 0.75, + "learning_rate": 0.00015690545560213668, + "loss": 4.2573, + "step": 8898 + }, + { + "epoch": 0.9227199968893584, + "grad_norm": 0.71484375, + "learning_rate": 0.00015689652320462937, + "loss": 4.2578, + "step": 8899 + }, + { + "epoch": 0.9228236849438465, + "grad_norm": 0.796875, + "learning_rate": 0.0001568875901358032, + "loss": 4.2717, + "step": 8900 + }, + { + "epoch": 0.9229273729983345, + "grad_norm": 0.703125, + "learning_rate": 0.00015687865639576361, + "loss": 4.1983, + "step": 8901 + }, + { + "epoch": 0.9230310610528226, + "grad_norm": 0.8046875, + "learning_rate": 0.000156869721984616, + "loss": 4.2582, + "step": 8902 + }, + { + "epoch": 0.9231347491073106, + "grad_norm": 0.75, + "learning_rate": 0.0001568607869024658, + "loss": 4.295, + "step": 8903 + }, + { + "epoch": 0.9232384371617988, + "grad_norm": 0.8046875, + "learning_rate": 0.0001568518511494184, + "loss": 4.3088, + "step": 8904 + }, + { + "epoch": 0.9233421252162868, + "grad_norm": 0.8125, + "learning_rate": 0.00015684291472557927, + "loss": 4.2766, + "step": 8905 + }, + { + "epoch": 0.9234458132707749, + "grad_norm": 0.76953125, + "learning_rate": 0.0001568339776310538, + "loss": 4.2754, + "step": 8906 + }, + { + "epoch": 0.9235495013252629, + "grad_norm": 0.80078125, + "learning_rate": 0.0001568250398659475, + "loss": 4.3055, + "step": 8907 + }, + { + "epoch": 0.923653189379751, + "grad_norm": 0.7578125, + "learning_rate": 0.00015681610143036579, + "loss": 4.2631, + "step": 8908 + }, + { + "epoch": 0.9237568774342391, + "grad_norm": 0.83984375, + "learning_rate": 0.0001568071623244141, + "loss": 4.2663, + "step": 8909 + }, + { + "epoch": 0.9238605654887272, + "grad_norm": 0.6953125, + "learning_rate": 0.00015679822254819798, + "loss": 4.2789, + "step": 8910 + }, + { + "epoch": 0.9239642535432152, + "grad_norm": 0.8125, + "learning_rate": 0.0001567892821018229, + "loss": 4.2898, + "step": 8911 + }, + { + "epoch": 0.9240679415977033, + "grad_norm": 0.7421875, + "learning_rate": 0.00015678034098539427, + "loss": 4.267, + "step": 8912 + }, + { + "epoch": 0.9241716296521914, + "grad_norm": 0.8125, + "learning_rate": 0.00015677139919901764, + "loss": 4.2639, + "step": 8913 + }, + { + "epoch": 0.9242753177066795, + "grad_norm": 0.80859375, + "learning_rate": 0.00015676245674279855, + "loss": 4.2804, + "step": 8914 + }, + { + "epoch": 0.9243790057611675, + "grad_norm": 0.828125, + "learning_rate": 0.00015675351361684242, + "loss": 4.2755, + "step": 8915 + }, + { + "epoch": 0.9244826938156556, + "grad_norm": 0.76171875, + "learning_rate": 0.00015674456982125485, + "loss": 4.2354, + "step": 8916 + }, + { + "epoch": 0.9245863818701436, + "grad_norm": 0.83203125, + "learning_rate": 0.00015673562535614135, + "loss": 4.2736, + "step": 8917 + }, + { + "epoch": 0.9246900699246318, + "grad_norm": 0.68359375, + "learning_rate": 0.0001567266802216074, + "loss": 4.2275, + "step": 8918 + }, + { + "epoch": 0.9247937579791198, + "grad_norm": 0.78515625, + "learning_rate": 0.0001567177344177586, + "loss": 4.2584, + "step": 8919 + }, + { + "epoch": 0.9248974460336079, + "grad_norm": 0.7109375, + "learning_rate": 0.0001567087879447005, + "loss": 4.2091, + "step": 8920 + }, + { + "epoch": 0.9250011340880959, + "grad_norm": 0.78125, + "learning_rate": 0.00015669984080253865, + "loss": 4.2761, + "step": 8921 + }, + { + "epoch": 0.925104822142584, + "grad_norm": 0.70703125, + "learning_rate": 0.00015669089299137862, + "loss": 4.3291, + "step": 8922 + }, + { + "epoch": 0.9252085101970721, + "grad_norm": 0.70703125, + "learning_rate": 0.00015668194451132596, + "loss": 4.2769, + "step": 8923 + }, + { + "epoch": 0.9253121982515602, + "grad_norm": 0.74609375, + "learning_rate": 0.00015667299536248626, + "loss": 4.3015, + "step": 8924 + }, + { + "epoch": 0.9254158863060482, + "grad_norm": 0.69140625, + "learning_rate": 0.00015666404554496514, + "loss": 4.275, + "step": 8925 + }, + { + "epoch": 0.9255195743605363, + "grad_norm": 0.703125, + "learning_rate": 0.0001566550950588682, + "loss": 4.2754, + "step": 8926 + }, + { + "epoch": 0.9256232624150244, + "grad_norm": 0.640625, + "learning_rate": 0.000156646143904301, + "loss": 4.298, + "step": 8927 + }, + { + "epoch": 0.9257269504695125, + "grad_norm": 0.6953125, + "learning_rate": 0.00015663719208136917, + "loss": 4.2431, + "step": 8928 + }, + { + "epoch": 0.9258306385240005, + "grad_norm": 0.6484375, + "learning_rate": 0.00015662823959017836, + "loss": 4.1848, + "step": 8929 + }, + { + "epoch": 0.9259343265784886, + "grad_norm": 0.66796875, + "learning_rate": 0.00015661928643083417, + "loss": 4.2754, + "step": 8930 + }, + { + "epoch": 0.9260380146329766, + "grad_norm": 0.5703125, + "learning_rate": 0.00015661033260344224, + "loss": 4.2398, + "step": 8931 + }, + { + "epoch": 0.9261417026874648, + "grad_norm": 0.609375, + "learning_rate": 0.00015660137810810825, + "loss": 4.2746, + "step": 8932 + }, + { + "epoch": 0.9262453907419529, + "grad_norm": 0.67578125, + "learning_rate": 0.0001565924229449378, + "loss": 4.2448, + "step": 8933 + }, + { + "epoch": 0.9263490787964409, + "grad_norm": 0.7421875, + "learning_rate": 0.00015658346711403662, + "loss": 4.2526, + "step": 8934 + }, + { + "epoch": 0.926452766850929, + "grad_norm": 0.6953125, + "learning_rate": 0.0001565745106155103, + "loss": 4.3112, + "step": 8935 + }, + { + "epoch": 0.926556454905417, + "grad_norm": 0.70703125, + "learning_rate": 0.0001565655534494646, + "loss": 4.2805, + "step": 8936 + }, + { + "epoch": 0.9266601429599052, + "grad_norm": 0.625, + "learning_rate": 0.0001565565956160051, + "loss": 4.2638, + "step": 8937 + }, + { + "epoch": 0.9267638310143932, + "grad_norm": 0.73828125, + "learning_rate": 0.0001565476371152376, + "loss": 4.2515, + "step": 8938 + }, + { + "epoch": 0.9268675190688813, + "grad_norm": 0.66796875, + "learning_rate": 0.00015653867794726773, + "loss": 4.2495, + "step": 8939 + }, + { + "epoch": 0.9269712071233693, + "grad_norm": 0.78125, + "learning_rate": 0.00015652971811220125, + "loss": 4.2629, + "step": 8940 + }, + { + "epoch": 0.9270748951778575, + "grad_norm": 0.7265625, + "learning_rate": 0.00015652075761014384, + "loss": 4.2958, + "step": 8941 + }, + { + "epoch": 0.9271785832323455, + "grad_norm": 0.68359375, + "learning_rate": 0.00015651179644120123, + "loss": 4.1946, + "step": 8942 + }, + { + "epoch": 0.9272822712868336, + "grad_norm": 0.71875, + "learning_rate": 0.00015650283460547914, + "loss": 4.259, + "step": 8943 + }, + { + "epoch": 0.9273859593413216, + "grad_norm": 0.69140625, + "learning_rate": 0.00015649387210308336, + "loss": 4.2503, + "step": 8944 + }, + { + "epoch": 0.9274896473958097, + "grad_norm": 0.7265625, + "learning_rate": 0.0001564849089341196, + "loss": 4.2597, + "step": 8945 + }, + { + "epoch": 0.9275933354502978, + "grad_norm": 0.7109375, + "learning_rate": 0.0001564759450986936, + "loss": 4.2625, + "step": 8946 + }, + { + "epoch": 0.9276970235047859, + "grad_norm": 0.71875, + "learning_rate": 0.0001564669805969112, + "loss": 4.3084, + "step": 8947 + }, + { + "epoch": 0.9278007115592739, + "grad_norm": 0.72265625, + "learning_rate": 0.00015645801542887806, + "loss": 4.2666, + "step": 8948 + }, + { + "epoch": 0.927904399613762, + "grad_norm": 0.71875, + "learning_rate": 0.00015644904959470003, + "loss": 4.2831, + "step": 8949 + }, + { + "epoch": 0.92800808766825, + "grad_norm": 0.734375, + "learning_rate": 0.00015644008309448291, + "loss": 4.2836, + "step": 8950 + }, + { + "epoch": 0.9281117757227382, + "grad_norm": 0.70703125, + "learning_rate": 0.00015643111592833245, + "loss": 4.2428, + "step": 8951 + }, + { + "epoch": 0.9282154637772262, + "grad_norm": 0.69140625, + "learning_rate": 0.00015642214809635444, + "loss": 4.2951, + "step": 8952 + }, + { + "epoch": 0.9283191518317143, + "grad_norm": 0.71484375, + "learning_rate": 0.00015641317959865476, + "loss": 4.2916, + "step": 8953 + }, + { + "epoch": 0.9284228398862023, + "grad_norm": 0.703125, + "learning_rate": 0.0001564042104353392, + "loss": 4.3157, + "step": 8954 + }, + { + "epoch": 0.9285265279406905, + "grad_norm": 0.71875, + "learning_rate": 0.00015639524060651356, + "loss": 4.2512, + "step": 8955 + }, + { + "epoch": 0.9286302159951785, + "grad_norm": 0.72265625, + "learning_rate": 0.0001563862701122837, + "loss": 4.2779, + "step": 8956 + }, + { + "epoch": 0.9287339040496666, + "grad_norm": 0.75, + "learning_rate": 0.00015637729895275545, + "loss": 4.2798, + "step": 8957 + }, + { + "epoch": 0.9288375921041546, + "grad_norm": 0.75390625, + "learning_rate": 0.0001563683271280347, + "loss": 4.2495, + "step": 8958 + }, + { + "epoch": 0.9289412801586427, + "grad_norm": 0.78515625, + "learning_rate": 0.00015635935463822722, + "loss": 4.2622, + "step": 8959 + }, + { + "epoch": 0.9290449682131308, + "grad_norm": 0.7578125, + "learning_rate": 0.00015635038148343895, + "loss": 4.2685, + "step": 8960 + }, + { + "epoch": 0.9291486562676189, + "grad_norm": 0.85546875, + "learning_rate": 0.00015634140766377573, + "loss": 4.264, + "step": 8961 + }, + { + "epoch": 0.9292523443221069, + "grad_norm": 0.734375, + "learning_rate": 0.0001563324331793435, + "loss": 4.31, + "step": 8962 + }, + { + "epoch": 0.929356032376595, + "grad_norm": 0.97265625, + "learning_rate": 0.00015632345803024805, + "loss": 4.2839, + "step": 8963 + }, + { + "epoch": 0.929459720431083, + "grad_norm": 0.73828125, + "learning_rate": 0.00015631448221659536, + "loss": 4.3095, + "step": 8964 + }, + { + "epoch": 0.9295634084855712, + "grad_norm": 0.86328125, + "learning_rate": 0.00015630550573849132, + "loss": 4.2685, + "step": 8965 + }, + { + "epoch": 0.9296670965400592, + "grad_norm": 0.7890625, + "learning_rate": 0.00015629652859604182, + "loss": 4.2873, + "step": 8966 + }, + { + "epoch": 0.9297707845945473, + "grad_norm": 0.6953125, + "learning_rate": 0.00015628755078935278, + "loss": 4.2562, + "step": 8967 + }, + { + "epoch": 0.9298744726490353, + "grad_norm": 0.87890625, + "learning_rate": 0.00015627857231853014, + "loss": 4.2917, + "step": 8968 + }, + { + "epoch": 0.9299781607035235, + "grad_norm": 0.6953125, + "learning_rate": 0.0001562695931836798, + "loss": 4.2779, + "step": 8969 + }, + { + "epoch": 0.9300818487580115, + "grad_norm": 0.796875, + "learning_rate": 0.00015626061338490783, + "loss": 4.2989, + "step": 8970 + }, + { + "epoch": 0.9301855368124996, + "grad_norm": 0.8046875, + "learning_rate": 0.00015625163292232004, + "loss": 4.2377, + "step": 8971 + }, + { + "epoch": 0.9302892248669876, + "grad_norm": 0.85546875, + "learning_rate": 0.00015624265179602244, + "loss": 4.2382, + "step": 8972 + }, + { + "epoch": 0.9303929129214757, + "grad_norm": 0.78515625, + "learning_rate": 0.00015623367000612098, + "loss": 4.2848, + "step": 8973 + }, + { + "epoch": 0.9304966009759638, + "grad_norm": 0.859375, + "learning_rate": 0.0001562246875527217, + "loss": 4.2626, + "step": 8974 + }, + { + "epoch": 0.9306002890304519, + "grad_norm": 0.765625, + "learning_rate": 0.00015621570443593053, + "loss": 4.2344, + "step": 8975 + }, + { + "epoch": 0.9307039770849399, + "grad_norm": 0.8359375, + "learning_rate": 0.00015620672065585345, + "loss": 4.2367, + "step": 8976 + }, + { + "epoch": 0.930807665139428, + "grad_norm": 0.7578125, + "learning_rate": 0.00015619773621259648, + "loss": 4.2997, + "step": 8977 + }, + { + "epoch": 0.9309113531939162, + "grad_norm": 0.7734375, + "learning_rate": 0.00015618875110626562, + "loss": 4.2668, + "step": 8978 + }, + { + "epoch": 0.9310150412484042, + "grad_norm": 0.77734375, + "learning_rate": 0.00015617976533696692, + "loss": 4.2893, + "step": 8979 + }, + { + "epoch": 0.9311187293028923, + "grad_norm": 0.81640625, + "learning_rate": 0.00015617077890480637, + "loss": 4.2655, + "step": 8980 + }, + { + "epoch": 0.9312224173573803, + "grad_norm": 0.70703125, + "learning_rate": 0.00015616179180988996, + "loss": 4.2781, + "step": 8981 + }, + { + "epoch": 0.9313261054118684, + "grad_norm": 0.80859375, + "learning_rate": 0.00015615280405232383, + "loss": 4.2794, + "step": 8982 + }, + { + "epoch": 0.9314297934663565, + "grad_norm": 0.828125, + "learning_rate": 0.00015614381563221395, + "loss": 4.307, + "step": 8983 + }, + { + "epoch": 0.9315334815208446, + "grad_norm": 0.7578125, + "learning_rate": 0.00015613482654966638, + "loss": 4.286, + "step": 8984 + }, + { + "epoch": 0.9316371695753326, + "grad_norm": 0.78125, + "learning_rate": 0.00015612583680478718, + "loss": 4.23, + "step": 8985 + }, + { + "epoch": 0.9317408576298207, + "grad_norm": 0.765625, + "learning_rate": 0.0001561168463976825, + "loss": 4.2303, + "step": 8986 + }, + { + "epoch": 0.9318445456843087, + "grad_norm": 0.79296875, + "learning_rate": 0.0001561078553284583, + "loss": 4.33, + "step": 8987 + }, + { + "epoch": 0.9319482337387969, + "grad_norm": 0.8359375, + "learning_rate": 0.0001560988635972207, + "loss": 4.2243, + "step": 8988 + }, + { + "epoch": 0.9320519217932849, + "grad_norm": 0.859375, + "learning_rate": 0.0001560898712040758, + "loss": 4.2901, + "step": 8989 + }, + { + "epoch": 0.932155609847773, + "grad_norm": 0.80078125, + "learning_rate": 0.00015608087814912978, + "loss": 4.2306, + "step": 8990 + }, + { + "epoch": 0.932259297902261, + "grad_norm": 0.76953125, + "learning_rate": 0.00015607188443248862, + "loss": 4.2759, + "step": 8991 + }, + { + "epoch": 0.9323629859567492, + "grad_norm": 0.7734375, + "learning_rate": 0.00015606289005425848, + "loss": 4.2388, + "step": 8992 + }, + { + "epoch": 0.9324666740112372, + "grad_norm": 0.84375, + "learning_rate": 0.00015605389501454554, + "loss": 4.2662, + "step": 8993 + }, + { + "epoch": 0.9325703620657253, + "grad_norm": 0.75390625, + "learning_rate": 0.00015604489931345588, + "loss": 4.246, + "step": 8994 + }, + { + "epoch": 0.9326740501202133, + "grad_norm": 0.83203125, + "learning_rate": 0.00015603590295109565, + "loss": 4.2991, + "step": 8995 + }, + { + "epoch": 0.9327777381747014, + "grad_norm": 0.76953125, + "learning_rate": 0.000156026905927571, + "loss": 4.2641, + "step": 8996 + }, + { + "epoch": 0.9328814262291895, + "grad_norm": 0.98828125, + "learning_rate": 0.00015601790824298808, + "loss": 4.2938, + "step": 8997 + }, + { + "epoch": 0.9329851142836776, + "grad_norm": 0.82421875, + "learning_rate": 0.00015600890989745305, + "loss": 4.2682, + "step": 8998 + }, + { + "epoch": 0.9330888023381656, + "grad_norm": 0.91796875, + "learning_rate": 0.00015599991089107208, + "loss": 4.2853, + "step": 8999 + }, + { + "epoch": 0.9331924903926537, + "grad_norm": 0.8203125, + "learning_rate": 0.00015599091122395139, + "loss": 4.2797, + "step": 9000 + }, + { + "epoch": 0.9332961784471417, + "grad_norm": 0.921875, + "learning_rate": 0.0001559819108961971, + "loss": 4.2417, + "step": 9001 + }, + { + "epoch": 0.9333998665016299, + "grad_norm": 0.77734375, + "learning_rate": 0.00015597290990791544, + "loss": 4.2504, + "step": 9002 + }, + { + "epoch": 0.9335035545561179, + "grad_norm": 0.91796875, + "learning_rate": 0.00015596390825921264, + "loss": 4.2459, + "step": 9003 + }, + { + "epoch": 0.933607242610606, + "grad_norm": 0.71875, + "learning_rate": 0.00015595490595019483, + "loss": 4.2283, + "step": 9004 + }, + { + "epoch": 0.933710930665094, + "grad_norm": 0.86328125, + "learning_rate": 0.00015594590298096832, + "loss": 4.2622, + "step": 9005 + }, + { + "epoch": 0.9338146187195822, + "grad_norm": 0.7890625, + "learning_rate": 0.00015593689935163924, + "loss": 4.2009, + "step": 9006 + }, + { + "epoch": 0.9339183067740702, + "grad_norm": 0.76953125, + "learning_rate": 0.00015592789506231394, + "loss": 4.2585, + "step": 9007 + }, + { + "epoch": 0.9340219948285583, + "grad_norm": 0.75390625, + "learning_rate": 0.00015591889011309858, + "loss": 4.2724, + "step": 9008 + }, + { + "epoch": 0.9341256828830463, + "grad_norm": 0.75390625, + "learning_rate": 0.00015590988450409939, + "loss": 4.301, + "step": 9009 + }, + { + "epoch": 0.9342293709375344, + "grad_norm": 0.68359375, + "learning_rate": 0.00015590087823542267, + "loss": 4.2327, + "step": 9010 + }, + { + "epoch": 0.9343330589920225, + "grad_norm": 0.78515625, + "learning_rate": 0.0001558918713071747, + "loss": 4.2382, + "step": 9011 + }, + { + "epoch": 0.9344367470465106, + "grad_norm": 0.6328125, + "learning_rate": 0.0001558828637194617, + "loss": 4.2565, + "step": 9012 + }, + { + "epoch": 0.9345404351009986, + "grad_norm": 0.7734375, + "learning_rate": 0.00015587385547239, + "loss": 4.2808, + "step": 9013 + }, + { + "epoch": 0.9346441231554867, + "grad_norm": 0.671875, + "learning_rate": 0.0001558648465660659, + "loss": 4.2589, + "step": 9014 + }, + { + "epoch": 0.9347478112099747, + "grad_norm": 0.83984375, + "learning_rate": 0.00015585583700059564, + "loss": 4.2889, + "step": 9015 + }, + { + "epoch": 0.9348514992644629, + "grad_norm": 0.73046875, + "learning_rate": 0.0001558468267760855, + "loss": 4.2689, + "step": 9016 + }, + { + "epoch": 0.9349551873189509, + "grad_norm": 0.74609375, + "learning_rate": 0.00015583781589264186, + "loss": 4.2566, + "step": 9017 + }, + { + "epoch": 0.935058875373439, + "grad_norm": 0.703125, + "learning_rate": 0.00015582880435037103, + "loss": 4.2746, + "step": 9018 + }, + { + "epoch": 0.935162563427927, + "grad_norm": 0.73828125, + "learning_rate": 0.00015581979214937932, + "loss": 4.2494, + "step": 9019 + }, + { + "epoch": 0.9352662514824152, + "grad_norm": 0.71875, + "learning_rate": 0.00015581077928977308, + "loss": 4.3052, + "step": 9020 + }, + { + "epoch": 0.9353699395369032, + "grad_norm": 0.78515625, + "learning_rate": 0.0001558017657716586, + "loss": 4.2869, + "step": 9021 + }, + { + "epoch": 0.9354736275913913, + "grad_norm": 0.7421875, + "learning_rate": 0.0001557927515951423, + "loss": 4.1821, + "step": 9022 + }, + { + "epoch": 0.9355773156458794, + "grad_norm": 0.765625, + "learning_rate": 0.0001557837367603305, + "loss": 4.2827, + "step": 9023 + }, + { + "epoch": 0.9356810037003674, + "grad_norm": 0.73828125, + "learning_rate": 0.0001557747212673296, + "loss": 4.2785, + "step": 9024 + }, + { + "epoch": 0.9357846917548556, + "grad_norm": 0.6875, + "learning_rate": 0.0001557657051162459, + "loss": 4.2271, + "step": 9025 + }, + { + "epoch": 0.9358883798093436, + "grad_norm": 0.6796875, + "learning_rate": 0.00015575668830718583, + "loss": 4.26, + "step": 9026 + }, + { + "epoch": 0.9359920678638317, + "grad_norm": 0.77734375, + "learning_rate": 0.0001557476708402558, + "loss": 4.2341, + "step": 9027 + }, + { + "epoch": 0.9360957559183197, + "grad_norm": 0.7265625, + "learning_rate": 0.00015573865271556217, + "loss": 4.2817, + "step": 9028 + }, + { + "epoch": 0.9361994439728079, + "grad_norm": 0.72265625, + "learning_rate": 0.00015572963393321136, + "loss": 4.294, + "step": 9029 + }, + { + "epoch": 0.9363031320272959, + "grad_norm": 0.67578125, + "learning_rate": 0.00015572061449330976, + "loss": 4.2312, + "step": 9030 + }, + { + "epoch": 0.936406820081784, + "grad_norm": 0.7109375, + "learning_rate": 0.00015571159439596382, + "loss": 4.2447, + "step": 9031 + }, + { + "epoch": 0.936510508136272, + "grad_norm": 0.6875, + "learning_rate": 0.00015570257364127995, + "loss": 4.2631, + "step": 9032 + }, + { + "epoch": 0.9366141961907601, + "grad_norm": 0.65625, + "learning_rate": 0.0001556935522293646, + "loss": 4.2454, + "step": 9033 + }, + { + "epoch": 0.9367178842452482, + "grad_norm": 0.765625, + "learning_rate": 0.0001556845301603242, + "loss": 4.2791, + "step": 9034 + }, + { + "epoch": 0.9368215722997363, + "grad_norm": 0.69140625, + "learning_rate": 0.0001556755074342652, + "loss": 4.2614, + "step": 9035 + }, + { + "epoch": 0.9369252603542243, + "grad_norm": 0.72265625, + "learning_rate": 0.0001556664840512941, + "loss": 4.2768, + "step": 9036 + }, + { + "epoch": 0.9370289484087124, + "grad_norm": 0.734375, + "learning_rate": 0.0001556574600115173, + "loss": 4.2986, + "step": 9037 + }, + { + "epoch": 0.9371326364632004, + "grad_norm": 0.7421875, + "learning_rate": 0.0001556484353150413, + "loss": 4.3204, + "step": 9038 + }, + { + "epoch": 0.9372363245176886, + "grad_norm": 0.6953125, + "learning_rate": 0.0001556394099619726, + "loss": 4.262, + "step": 9039 + }, + { + "epoch": 0.9373400125721766, + "grad_norm": 0.7421875, + "learning_rate": 0.00015563038395241766, + "loss": 4.3063, + "step": 9040 + }, + { + "epoch": 0.9374437006266647, + "grad_norm": 0.734375, + "learning_rate": 0.00015562135728648303, + "loss": 4.2485, + "step": 9041 + }, + { + "epoch": 0.9375473886811527, + "grad_norm": 0.703125, + "learning_rate": 0.00015561232996427513, + "loss": 4.2451, + "step": 9042 + }, + { + "epoch": 0.9376510767356409, + "grad_norm": 0.6328125, + "learning_rate": 0.00015560330198590054, + "loss": 4.2971, + "step": 9043 + }, + { + "epoch": 0.9377547647901289, + "grad_norm": 0.6875, + "learning_rate": 0.00015559427335146579, + "loss": 4.2891, + "step": 9044 + }, + { + "epoch": 0.937858452844617, + "grad_norm": 0.71875, + "learning_rate": 0.00015558524406107733, + "loss": 4.27, + "step": 9045 + }, + { + "epoch": 0.937962140899105, + "grad_norm": 0.64453125, + "learning_rate": 0.00015557621411484176, + "loss": 4.2278, + "step": 9046 + }, + { + "epoch": 0.9380658289535931, + "grad_norm": 0.73828125, + "learning_rate": 0.00015556718351286564, + "loss": 4.2598, + "step": 9047 + }, + { + "epoch": 0.9381695170080812, + "grad_norm": 0.65234375, + "learning_rate": 0.00015555815225525547, + "loss": 4.2699, + "step": 9048 + }, + { + "epoch": 0.9382732050625693, + "grad_norm": 0.65625, + "learning_rate": 0.00015554912034211783, + "loss": 4.2374, + "step": 9049 + }, + { + "epoch": 0.9383768931170573, + "grad_norm": 0.6640625, + "learning_rate": 0.00015554008777355927, + "loss": 4.2409, + "step": 9050 + }, + { + "epoch": 0.9384805811715454, + "grad_norm": 0.63671875, + "learning_rate": 0.00015553105454968642, + "loss": 4.2517, + "step": 9051 + }, + { + "epoch": 0.9385842692260334, + "grad_norm": 0.65234375, + "learning_rate": 0.0001555220206706058, + "loss": 4.2676, + "step": 9052 + }, + { + "epoch": 0.9386879572805216, + "grad_norm": 0.65234375, + "learning_rate": 0.00015551298613642405, + "loss": 4.2632, + "step": 9053 + }, + { + "epoch": 0.9387916453350096, + "grad_norm": 0.73828125, + "learning_rate": 0.0001555039509472477, + "loss": 4.2829, + "step": 9054 + }, + { + "epoch": 0.9388953333894977, + "grad_norm": 0.6171875, + "learning_rate": 0.00015549491510318344, + "loss": 4.2757, + "step": 9055 + }, + { + "epoch": 0.9389990214439857, + "grad_norm": 0.7578125, + "learning_rate": 0.00015548587860433782, + "loss": 4.2739, + "step": 9056 + }, + { + "epoch": 0.9391027094984739, + "grad_norm": 0.70703125, + "learning_rate": 0.0001554768414508175, + "loss": 4.3261, + "step": 9057 + }, + { + "epoch": 0.9392063975529619, + "grad_norm": 0.72265625, + "learning_rate": 0.0001554678036427291, + "loss": 4.2985, + "step": 9058 + }, + { + "epoch": 0.93931008560745, + "grad_norm": 0.7421875, + "learning_rate": 0.00015545876518017923, + "loss": 4.2876, + "step": 9059 + }, + { + "epoch": 0.939413773661938, + "grad_norm": 0.734375, + "learning_rate": 0.00015544972606327458, + "loss": 4.256, + "step": 9060 + }, + { + "epoch": 0.9395174617164261, + "grad_norm": 0.78125, + "learning_rate": 0.00015544068629212173, + "loss": 4.2775, + "step": 9061 + }, + { + "epoch": 0.9396211497709142, + "grad_norm": 0.71484375, + "learning_rate": 0.00015543164586682742, + "loss": 4.28, + "step": 9062 + }, + { + "epoch": 0.9397248378254023, + "grad_norm": 0.71484375, + "learning_rate": 0.00015542260478749827, + "loss": 4.2796, + "step": 9063 + }, + { + "epoch": 0.9398285258798903, + "grad_norm": 0.7109375, + "learning_rate": 0.00015541356305424095, + "loss": 4.2735, + "step": 9064 + }, + { + "epoch": 0.9399322139343784, + "grad_norm": 0.6640625, + "learning_rate": 0.0001554045206671622, + "loss": 4.2218, + "step": 9065 + }, + { + "epoch": 0.9400359019888664, + "grad_norm": 0.72265625, + "learning_rate": 0.00015539547762636864, + "loss": 4.2663, + "step": 9066 + }, + { + "epoch": 0.9401395900433546, + "grad_norm": 0.6484375, + "learning_rate": 0.00015538643393196703, + "loss": 4.2504, + "step": 9067 + }, + { + "epoch": 0.9402432780978427, + "grad_norm": 0.71875, + "learning_rate": 0.000155377389584064, + "loss": 4.1995, + "step": 9068 + }, + { + "epoch": 0.9403469661523307, + "grad_norm": 0.66796875, + "learning_rate": 0.00015536834458276634, + "loss": 4.2034, + "step": 9069 + }, + { + "epoch": 0.9404506542068188, + "grad_norm": 0.75390625, + "learning_rate": 0.00015535929892818073, + "loss": 4.2722, + "step": 9070 + }, + { + "epoch": 0.9405543422613069, + "grad_norm": 0.76171875, + "learning_rate": 0.0001553502526204139, + "loss": 4.2463, + "step": 9071 + }, + { + "epoch": 0.940658030315795, + "grad_norm": 0.7421875, + "learning_rate": 0.00015534120565957263, + "loss": 4.2895, + "step": 9072 + }, + { + "epoch": 0.940761718370283, + "grad_norm": 0.7890625, + "learning_rate": 0.00015533215804576362, + "loss": 4.2403, + "step": 9073 + }, + { + "epoch": 0.9408654064247711, + "grad_norm": 0.68359375, + "learning_rate": 0.0001553231097790936, + "loss": 4.2882, + "step": 9074 + }, + { + "epoch": 0.9409690944792591, + "grad_norm": 0.703125, + "learning_rate": 0.0001553140608596694, + "loss": 4.2911, + "step": 9075 + }, + { + "epoch": 0.9410727825337473, + "grad_norm": 0.72265625, + "learning_rate": 0.00015530501128759773, + "loss": 4.2612, + "step": 9076 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.79296875, + "learning_rate": 0.00015529596106298542, + "loss": 4.2227, + "step": 9077 + }, + { + "epoch": 0.9412801586427234, + "grad_norm": 0.75, + "learning_rate": 0.00015528691018593918, + "loss": 4.2334, + "step": 9078 + }, + { + "epoch": 0.9413838466972114, + "grad_norm": 0.7578125, + "learning_rate": 0.00015527785865656588, + "loss": 4.2482, + "step": 9079 + }, + { + "epoch": 0.9414875347516996, + "grad_norm": 0.76171875, + "learning_rate": 0.00015526880647497224, + "loss": 4.2654, + "step": 9080 + }, + { + "epoch": 0.9415912228061876, + "grad_norm": 0.7109375, + "learning_rate": 0.00015525975364126513, + "loss": 4.3063, + "step": 9081 + }, + { + "epoch": 0.9416949108606757, + "grad_norm": 0.83203125, + "learning_rate": 0.00015525070015555135, + "loss": 4.2706, + "step": 9082 + }, + { + "epoch": 0.9417985989151637, + "grad_norm": 0.73046875, + "learning_rate": 0.00015524164601793768, + "loss": 4.2785, + "step": 9083 + }, + { + "epoch": 0.9419022869696518, + "grad_norm": 0.8671875, + "learning_rate": 0.000155232591228531, + "loss": 4.234, + "step": 9084 + }, + { + "epoch": 0.9420059750241399, + "grad_norm": 0.70703125, + "learning_rate": 0.0001552235357874381, + "loss": 4.2799, + "step": 9085 + }, + { + "epoch": 0.942109663078628, + "grad_norm": 0.78125, + "learning_rate": 0.0001552144796947659, + "loss": 4.2756, + "step": 9086 + }, + { + "epoch": 0.942213351133116, + "grad_norm": 0.765625, + "learning_rate": 0.00015520542295062117, + "loss": 4.2696, + "step": 9087 + }, + { + "epoch": 0.9423170391876041, + "grad_norm": 0.71484375, + "learning_rate": 0.00015519636555511078, + "loss": 4.2314, + "step": 9088 + }, + { + "epoch": 0.9424207272420921, + "grad_norm": 0.77734375, + "learning_rate": 0.00015518730750834167, + "loss": 4.2627, + "step": 9089 + }, + { + "epoch": 0.9425244152965803, + "grad_norm": 0.74609375, + "learning_rate": 0.00015517824881042066, + "loss": 4.2633, + "step": 9090 + }, + { + "epoch": 0.9426281033510683, + "grad_norm": 0.7265625, + "learning_rate": 0.0001551691894614546, + "loss": 4.2876, + "step": 9091 + }, + { + "epoch": 0.9427317914055564, + "grad_norm": 0.8046875, + "learning_rate": 0.00015516012946155047, + "loss": 4.231, + "step": 9092 + }, + { + "epoch": 0.9428354794600444, + "grad_norm": 0.7109375, + "learning_rate": 0.00015515106881081508, + "loss": 4.3029, + "step": 9093 + }, + { + "epoch": 0.9429391675145326, + "grad_norm": 0.67578125, + "learning_rate": 0.00015514200750935543, + "loss": 4.28, + "step": 9094 + }, + { + "epoch": 0.9430428555690206, + "grad_norm": 0.80078125, + "learning_rate": 0.00015513294555727833, + "loss": 4.2345, + "step": 9095 + }, + { + "epoch": 0.9431465436235087, + "grad_norm": 0.69921875, + "learning_rate": 0.00015512388295469077, + "loss": 4.2922, + "step": 9096 + }, + { + "epoch": 0.9432502316779967, + "grad_norm": 0.8046875, + "learning_rate": 0.00015511481970169964, + "loss": 4.2515, + "step": 9097 + }, + { + "epoch": 0.9433539197324848, + "grad_norm": 0.734375, + "learning_rate": 0.00015510575579841192, + "loss": 4.3032, + "step": 9098 + }, + { + "epoch": 0.9434576077869729, + "grad_norm": 0.8828125, + "learning_rate": 0.00015509669124493452, + "loss": 4.2944, + "step": 9099 + }, + { + "epoch": 0.943561295841461, + "grad_norm": 0.71875, + "learning_rate": 0.00015508762604137438, + "loss": 4.2555, + "step": 9100 + }, + { + "epoch": 0.943664983895949, + "grad_norm": 0.83203125, + "learning_rate": 0.0001550785601878385, + "loss": 4.2433, + "step": 9101 + }, + { + "epoch": 0.9437686719504371, + "grad_norm": 0.71875, + "learning_rate": 0.00015506949368443386, + "loss": 4.2553, + "step": 9102 + }, + { + "epoch": 0.9438723600049251, + "grad_norm": 0.80078125, + "learning_rate": 0.00015506042653126736, + "loss": 4.2619, + "step": 9103 + }, + { + "epoch": 0.9439760480594133, + "grad_norm": 0.79296875, + "learning_rate": 0.00015505135872844608, + "loss": 4.2375, + "step": 9104 + }, + { + "epoch": 0.9440797361139013, + "grad_norm": 0.8984375, + "learning_rate": 0.0001550422902760769, + "loss": 4.322, + "step": 9105 + }, + { + "epoch": 0.9441834241683894, + "grad_norm": 0.77734375, + "learning_rate": 0.0001550332211742669, + "loss": 4.2436, + "step": 9106 + }, + { + "epoch": 0.9442871122228774, + "grad_norm": 0.91015625, + "learning_rate": 0.0001550241514231231, + "loss": 4.2417, + "step": 9107 + }, + { + "epoch": 0.9443908002773655, + "grad_norm": 0.72265625, + "learning_rate": 0.0001550150810227524, + "loss": 4.3237, + "step": 9108 + }, + { + "epoch": 0.9444944883318536, + "grad_norm": 0.9453125, + "learning_rate": 0.00015500600997326195, + "loss": 4.2396, + "step": 9109 + }, + { + "epoch": 0.9445981763863417, + "grad_norm": 0.6796875, + "learning_rate": 0.00015499693827475874, + "loss": 4.2631, + "step": 9110 + }, + { + "epoch": 0.9447018644408297, + "grad_norm": 1.0, + "learning_rate": 0.00015498786592734977, + "loss": 4.2638, + "step": 9111 + }, + { + "epoch": 0.9448055524953178, + "grad_norm": 0.6640625, + "learning_rate": 0.0001549787929311421, + "loss": 4.2598, + "step": 9112 + }, + { + "epoch": 0.944909240549806, + "grad_norm": 0.90625, + "learning_rate": 0.0001549697192862428, + "loss": 4.2533, + "step": 9113 + }, + { + "epoch": 0.945012928604294, + "grad_norm": 0.703125, + "learning_rate": 0.0001549606449927589, + "loss": 4.2861, + "step": 9114 + }, + { + "epoch": 0.9451166166587821, + "grad_norm": 0.85546875, + "learning_rate": 0.00015495157005079751, + "loss": 4.2727, + "step": 9115 + }, + { + "epoch": 0.9452203047132701, + "grad_norm": 0.76171875, + "learning_rate": 0.00015494249446046566, + "loss": 4.3136, + "step": 9116 + }, + { + "epoch": 0.9453239927677582, + "grad_norm": 0.81640625, + "learning_rate": 0.00015493341822187047, + "loss": 4.2814, + "step": 9117 + }, + { + "epoch": 0.9454276808222463, + "grad_norm": 0.7734375, + "learning_rate": 0.000154924341335119, + "loss": 4.2213, + "step": 9118 + }, + { + "epoch": 0.9455313688767344, + "grad_norm": 0.75, + "learning_rate": 0.00015491526380031836, + "loss": 4.257, + "step": 9119 + }, + { + "epoch": 0.9456350569312224, + "grad_norm": 0.71484375, + "learning_rate": 0.00015490618561757566, + "loss": 4.3009, + "step": 9120 + }, + { + "epoch": 0.9457387449857105, + "grad_norm": 0.78125, + "learning_rate": 0.000154897106786998, + "loss": 4.2634, + "step": 9121 + }, + { + "epoch": 0.9458424330401985, + "grad_norm": 0.77734375, + "learning_rate": 0.00015488802730869252, + "loss": 4.3353, + "step": 9122 + }, + { + "epoch": 0.9459461210946867, + "grad_norm": 0.8203125, + "learning_rate": 0.00015487894718276633, + "loss": 4.2733, + "step": 9123 + }, + { + "epoch": 0.9460498091491747, + "grad_norm": 0.68359375, + "learning_rate": 0.00015486986640932658, + "loss": 4.2482, + "step": 9124 + }, + { + "epoch": 0.9461534972036628, + "grad_norm": 0.71875, + "learning_rate": 0.0001548607849884804, + "loss": 4.2753, + "step": 9125 + }, + { + "epoch": 0.9462571852581508, + "grad_norm": 0.625, + "learning_rate": 0.00015485170292033496, + "loss": 4.2713, + "step": 9126 + }, + { + "epoch": 0.946360873312639, + "grad_norm": 0.78125, + "learning_rate": 0.0001548426202049974, + "loss": 4.2871, + "step": 9127 + }, + { + "epoch": 0.946464561367127, + "grad_norm": 0.68359375, + "learning_rate": 0.00015483353684257487, + "loss": 4.2852, + "step": 9128 + }, + { + "epoch": 0.9465682494216151, + "grad_norm": 0.71875, + "learning_rate": 0.0001548244528331746, + "loss": 4.2846, + "step": 9129 + }, + { + "epoch": 0.9466719374761031, + "grad_norm": 0.72265625, + "learning_rate": 0.0001548153681769037, + "loss": 4.2558, + "step": 9130 + }, + { + "epoch": 0.9467756255305912, + "grad_norm": 0.81640625, + "learning_rate": 0.0001548062828738694, + "loss": 4.2772, + "step": 9131 + }, + { + "epoch": 0.9468793135850793, + "grad_norm": 0.74609375, + "learning_rate": 0.00015479719692417892, + "loss": 4.2572, + "step": 9132 + }, + { + "epoch": 0.9469830016395674, + "grad_norm": 0.8984375, + "learning_rate": 0.00015478811032793946, + "loss": 4.288, + "step": 9133 + }, + { + "epoch": 0.9470866896940554, + "grad_norm": 0.734375, + "learning_rate": 0.00015477902308525817, + "loss": 4.2884, + "step": 9134 + }, + { + "epoch": 0.9471903777485435, + "grad_norm": 1.015625, + "learning_rate": 0.00015476993519624233, + "loss": 4.2595, + "step": 9135 + }, + { + "epoch": 0.9472940658030315, + "grad_norm": 0.703125, + "learning_rate": 0.00015476084666099914, + "loss": 4.2354, + "step": 9136 + }, + { + "epoch": 0.9473977538575197, + "grad_norm": 1.015625, + "learning_rate": 0.00015475175747963586, + "loss": 4.1977, + "step": 9137 + }, + { + "epoch": 0.9475014419120077, + "grad_norm": 0.68359375, + "learning_rate": 0.00015474266765225968, + "loss": 4.294, + "step": 9138 + }, + { + "epoch": 0.9476051299664958, + "grad_norm": 1.03125, + "learning_rate": 0.00015473357717897792, + "loss": 4.2882, + "step": 9139 + }, + { + "epoch": 0.9477088180209838, + "grad_norm": 0.75, + "learning_rate": 0.00015472448605989783, + "loss": 4.2827, + "step": 9140 + }, + { + "epoch": 0.947812506075472, + "grad_norm": 1.0859375, + "learning_rate": 0.0001547153942951266, + "loss": 4.2591, + "step": 9141 + }, + { + "epoch": 0.94791619412996, + "grad_norm": 0.796875, + "learning_rate": 0.0001547063018847716, + "loss": 4.2223, + "step": 9142 + }, + { + "epoch": 0.9480198821844481, + "grad_norm": 1.265625, + "learning_rate": 0.00015469720882894, + "loss": 4.2614, + "step": 9143 + }, + { + "epoch": 0.9481235702389361, + "grad_norm": 0.85546875, + "learning_rate": 0.00015468811512773923, + "loss": 4.2703, + "step": 9144 + }, + { + "epoch": 0.9482272582934242, + "grad_norm": 1.4453125, + "learning_rate": 0.00015467902078127645, + "loss": 4.2755, + "step": 9145 + }, + { + "epoch": 0.9483309463479123, + "grad_norm": 1.1875, + "learning_rate": 0.00015466992578965907, + "loss": 4.2887, + "step": 9146 + }, + { + "epoch": 0.9484346344024004, + "grad_norm": 1.578125, + "learning_rate": 0.00015466083015299432, + "loss": 4.2957, + "step": 9147 + }, + { + "epoch": 0.9485383224568884, + "grad_norm": 1.5390625, + "learning_rate": 0.00015465173387138955, + "loss": 4.2571, + "step": 9148 + }, + { + "epoch": 0.9486420105113765, + "grad_norm": 1.0625, + "learning_rate": 0.0001546426369449521, + "loss": 4.2698, + "step": 9149 + }, + { + "epoch": 0.9487456985658645, + "grad_norm": 1.046875, + "learning_rate": 0.0001546335393737893, + "loss": 4.2932, + "step": 9150 + }, + { + "epoch": 0.9488493866203527, + "grad_norm": 1.1953125, + "learning_rate": 0.0001546244411580085, + "loss": 4.2408, + "step": 9151 + }, + { + "epoch": 0.9489530746748407, + "grad_norm": 1.015625, + "learning_rate": 0.00015461534229771698, + "loss": 4.2668, + "step": 9152 + }, + { + "epoch": 0.9490567627293288, + "grad_norm": 1.46875, + "learning_rate": 0.0001546062427930222, + "loss": 4.2251, + "step": 9153 + }, + { + "epoch": 0.9491604507838168, + "grad_norm": 1.2734375, + "learning_rate": 0.00015459714264403149, + "loss": 4.2717, + "step": 9154 + }, + { + "epoch": 0.949264138838305, + "grad_norm": 1.515625, + "learning_rate": 0.00015458804185085215, + "loss": 4.2636, + "step": 9155 + }, + { + "epoch": 0.9493678268927931, + "grad_norm": 1.4375, + "learning_rate": 0.00015457894041359164, + "loss": 4.2856, + "step": 9156 + }, + { + "epoch": 0.9494715149472811, + "grad_norm": 1.1640625, + "learning_rate": 0.00015456983833235735, + "loss": 4.2362, + "step": 9157 + }, + { + "epoch": 0.9495752030017692, + "grad_norm": 1.203125, + "learning_rate": 0.00015456073560725665, + "loss": 4.2561, + "step": 9158 + }, + { + "epoch": 0.9496788910562572, + "grad_norm": 1.015625, + "learning_rate": 0.00015455163223839692, + "loss": 4.2903, + "step": 9159 + }, + { + "epoch": 0.9497825791107454, + "grad_norm": 1.1484375, + "learning_rate": 0.0001545425282258856, + "loss": 4.3087, + "step": 9160 + }, + { + "epoch": 0.9498862671652334, + "grad_norm": 1.078125, + "learning_rate": 0.00015453342356983014, + "loss": 4.3171, + "step": 9161 + }, + { + "epoch": 0.9499899552197215, + "grad_norm": 1.0078125, + "learning_rate": 0.0001545243182703379, + "loss": 4.2506, + "step": 9162 + }, + { + "epoch": 0.9500936432742095, + "grad_norm": 1.4765625, + "learning_rate": 0.00015451521232751628, + "loss": 4.2497, + "step": 9163 + }, + { + "epoch": 0.9501973313286977, + "grad_norm": 1.2109375, + "learning_rate": 0.00015450610574147284, + "loss": 4.2478, + "step": 9164 + }, + { + "epoch": 0.9503010193831857, + "grad_norm": 1.4609375, + "learning_rate": 0.000154496998512315, + "loss": 4.2293, + "step": 9165 + }, + { + "epoch": 0.9504047074376738, + "grad_norm": 1.421875, + "learning_rate": 0.0001544878906401501, + "loss": 4.2779, + "step": 9166 + }, + { + "epoch": 0.9505083954921618, + "grad_norm": 1.171875, + "learning_rate": 0.00015447878212508572, + "loss": 4.226, + "step": 9167 + }, + { + "epoch": 0.9506120835466499, + "grad_norm": 1.2265625, + "learning_rate": 0.0001544696729672293, + "loss": 4.2892, + "step": 9168 + }, + { + "epoch": 0.950715771601138, + "grad_norm": 1.1171875, + "learning_rate": 0.00015446056316668837, + "loss": 4.2678, + "step": 9169 + }, + { + "epoch": 0.9508194596556261, + "grad_norm": 0.98046875, + "learning_rate": 0.00015445145272357028, + "loss": 4.2994, + "step": 9170 + }, + { + "epoch": 0.9509231477101141, + "grad_norm": 1.375, + "learning_rate": 0.00015444234163798263, + "loss": 4.2739, + "step": 9171 + }, + { + "epoch": 0.9510268357646022, + "grad_norm": 1.0234375, + "learning_rate": 0.0001544332299100329, + "loss": 4.2484, + "step": 9172 + }, + { + "epoch": 0.9511305238190902, + "grad_norm": 1.703125, + "learning_rate": 0.00015442411753982863, + "loss": 4.2454, + "step": 9173 + }, + { + "epoch": 0.9512342118735784, + "grad_norm": 1.671875, + "learning_rate": 0.00015441500452747725, + "loss": 4.2962, + "step": 9174 + }, + { + "epoch": 0.9513378999280664, + "grad_norm": 1.1640625, + "learning_rate": 0.00015440589087308636, + "loss": 4.2961, + "step": 9175 + }, + { + "epoch": 0.9514415879825545, + "grad_norm": 1.1875, + "learning_rate": 0.00015439677657676346, + "loss": 4.2194, + "step": 9176 + }, + { + "epoch": 0.9515452760370425, + "grad_norm": 0.98828125, + "learning_rate": 0.0001543876616386161, + "loss": 4.3136, + "step": 9177 + }, + { + "epoch": 0.9516489640915307, + "grad_norm": 0.92578125, + "learning_rate": 0.0001543785460587518, + "loss": 4.2778, + "step": 9178 + }, + { + "epoch": 0.9517526521460187, + "grad_norm": 1.0390625, + "learning_rate": 0.0001543694298372782, + "loss": 4.2829, + "step": 9179 + }, + { + "epoch": 0.9518563402005068, + "grad_norm": 0.79296875, + "learning_rate": 0.00015436031297430276, + "loss": 4.2372, + "step": 9180 + }, + { + "epoch": 0.9519600282549948, + "grad_norm": 0.96875, + "learning_rate": 0.00015435119546993307, + "loss": 4.2937, + "step": 9181 + }, + { + "epoch": 0.9520637163094829, + "grad_norm": 0.77734375, + "learning_rate": 0.00015434207732427675, + "loss": 4.2077, + "step": 9182 + }, + { + "epoch": 0.952167404363971, + "grad_norm": 0.9921875, + "learning_rate": 0.00015433295853744134, + "loss": 4.2331, + "step": 9183 + }, + { + "epoch": 0.9522710924184591, + "grad_norm": 0.77734375, + "learning_rate": 0.00015432383910953448, + "loss": 4.2744, + "step": 9184 + }, + { + "epoch": 0.9523747804729471, + "grad_norm": 1.03125, + "learning_rate": 0.00015431471904066372, + "loss": 4.2522, + "step": 9185 + }, + { + "epoch": 0.9524784685274352, + "grad_norm": 0.796875, + "learning_rate": 0.00015430559833093672, + "loss": 4.2428, + "step": 9186 + }, + { + "epoch": 0.9525821565819232, + "grad_norm": 1.140625, + "learning_rate": 0.00015429647698046104, + "loss": 4.2723, + "step": 9187 + }, + { + "epoch": 0.9526858446364114, + "grad_norm": 0.85546875, + "learning_rate": 0.0001542873549893443, + "loss": 4.2884, + "step": 9188 + }, + { + "epoch": 0.9527895326908994, + "grad_norm": 1.4375, + "learning_rate": 0.00015427823235769416, + "loss": 4.2651, + "step": 9189 + }, + { + "epoch": 0.9528932207453875, + "grad_norm": 1.25, + "learning_rate": 0.0001542691090856183, + "loss": 4.2941, + "step": 9190 + }, + { + "epoch": 0.9529969087998755, + "grad_norm": 1.4765625, + "learning_rate": 0.00015425998517322426, + "loss": 4.2693, + "step": 9191 + }, + { + "epoch": 0.9531005968543637, + "grad_norm": 1.4609375, + "learning_rate": 0.0001542508606206198, + "loss": 4.2732, + "step": 9192 + }, + { + "epoch": 0.9532042849088517, + "grad_norm": 1.0625, + "learning_rate": 0.00015424173542791253, + "loss": 4.2487, + "step": 9193 + }, + { + "epoch": 0.9533079729633398, + "grad_norm": 1.015625, + "learning_rate": 0.00015423260959521008, + "loss": 4.2602, + "step": 9194 + }, + { + "epoch": 0.9534116610178278, + "grad_norm": 1.234375, + "learning_rate": 0.0001542234831226202, + "loss": 4.325, + "step": 9195 + }, + { + "epoch": 0.9535153490723159, + "grad_norm": 1.03125, + "learning_rate": 0.00015421435601025048, + "loss": 4.2692, + "step": 9196 + }, + { + "epoch": 0.953619037126804, + "grad_norm": 1.546875, + "learning_rate": 0.0001542052282582087, + "loss": 4.251, + "step": 9197 + }, + { + "epoch": 0.9537227251812921, + "grad_norm": 1.390625, + "learning_rate": 0.00015419609986660256, + "loss": 4.2267, + "step": 9198 + }, + { + "epoch": 0.9538264132357801, + "grad_norm": 1.3203125, + "learning_rate": 0.00015418697083553968, + "loss": 4.2704, + "step": 9199 + }, + { + "epoch": 0.9539301012902682, + "grad_norm": 1.3046875, + "learning_rate": 0.00015417784116512784, + "loss": 4.2562, + "step": 9200 + }, + { + "epoch": 0.9540337893447564, + "grad_norm": 1.234375, + "learning_rate": 0.00015416871085547474, + "loss": 4.2652, + "step": 9201 + }, + { + "epoch": 0.9541374773992444, + "grad_norm": 1.171875, + "learning_rate": 0.00015415957990668808, + "loss": 4.2949, + "step": 9202 + }, + { + "epoch": 0.9542411654537325, + "grad_norm": 1.0546875, + "learning_rate": 0.00015415044831887568, + "loss": 4.2267, + "step": 9203 + }, + { + "epoch": 0.9543448535082205, + "grad_norm": 1.0078125, + "learning_rate": 0.00015414131609214522, + "loss": 4.2524, + "step": 9204 + }, + { + "epoch": 0.9544485415627086, + "grad_norm": 1.203125, + "learning_rate": 0.00015413218322660443, + "loss": 4.2644, + "step": 9205 + }, + { + "epoch": 0.9545522296171967, + "grad_norm": 1.1328125, + "learning_rate": 0.00015412304972236112, + "loss": 4.2714, + "step": 9206 + }, + { + "epoch": 0.9546559176716848, + "grad_norm": 1.2734375, + "learning_rate": 0.00015411391557952303, + "loss": 4.3073, + "step": 9207 + }, + { + "epoch": 0.9547596057261728, + "grad_norm": 1.171875, + "learning_rate": 0.00015410478079819795, + "loss": 4.2954, + "step": 9208 + }, + { + "epoch": 0.9548632937806609, + "grad_norm": 1.203125, + "learning_rate": 0.0001540956453784936, + "loss": 4.2384, + "step": 9209 + }, + { + "epoch": 0.9549669818351489, + "grad_norm": 1.15625, + "learning_rate": 0.00015408650932051786, + "loss": 4.2725, + "step": 9210 + }, + { + "epoch": 0.9550706698896371, + "grad_norm": 1.15625, + "learning_rate": 0.00015407737262437848, + "loss": 4.2239, + "step": 9211 + }, + { + "epoch": 0.9551743579441251, + "grad_norm": 1.1015625, + "learning_rate": 0.00015406823529018327, + "loss": 4.247, + "step": 9212 + }, + { + "epoch": 0.9552780459986132, + "grad_norm": 1.2578125, + "learning_rate": 0.00015405909731804003, + "loss": 4.2726, + "step": 9213 + }, + { + "epoch": 0.9553817340531012, + "grad_norm": 1.15625, + "learning_rate": 0.00015404995870805656, + "loss": 4.25, + "step": 9214 + }, + { + "epoch": 0.9554854221075894, + "grad_norm": 1.2578125, + "learning_rate": 0.00015404081946034074, + "loss": 4.2884, + "step": 9215 + }, + { + "epoch": 0.9555891101620774, + "grad_norm": 1.1171875, + "learning_rate": 0.00015403167957500036, + "loss": 4.2743, + "step": 9216 + }, + { + "epoch": 0.9556927982165655, + "grad_norm": 1.28125, + "learning_rate": 0.00015402253905214325, + "loss": 4.2711, + "step": 9217 + }, + { + "epoch": 0.9557964862710535, + "grad_norm": 1.2734375, + "learning_rate": 0.00015401339789187731, + "loss": 4.2759, + "step": 9218 + }, + { + "epoch": 0.9559001743255416, + "grad_norm": 1.2109375, + "learning_rate": 0.0001540042560943104, + "loss": 4.2727, + "step": 9219 + }, + { + "epoch": 0.9560038623800297, + "grad_norm": 1.140625, + "learning_rate": 0.00015399511365955034, + "loss": 4.2178, + "step": 9220 + }, + { + "epoch": 0.9561075504345178, + "grad_norm": 1.09375, + "learning_rate": 0.00015398597058770497, + "loss": 4.2876, + "step": 9221 + }, + { + "epoch": 0.9562112384890058, + "grad_norm": 1.015625, + "learning_rate": 0.00015397682687888224, + "loss": 4.274, + "step": 9222 + }, + { + "epoch": 0.9563149265434939, + "grad_norm": 1.28125, + "learning_rate": 0.00015396768253319004, + "loss": 4.2447, + "step": 9223 + }, + { + "epoch": 0.9564186145979819, + "grad_norm": 1.125, + "learning_rate": 0.00015395853755073617, + "loss": 4.2406, + "step": 9224 + }, + { + "epoch": 0.9565223026524701, + "grad_norm": 1.5703125, + "learning_rate": 0.0001539493919316286, + "loss": 4.2687, + "step": 9225 + }, + { + "epoch": 0.9566259907069581, + "grad_norm": 1.46875, + "learning_rate": 0.00015394024567597528, + "loss": 4.2737, + "step": 9226 + }, + { + "epoch": 0.9567296787614462, + "grad_norm": 1.28125, + "learning_rate": 0.00015393109878388404, + "loss": 4.2817, + "step": 9227 + }, + { + "epoch": 0.9568333668159342, + "grad_norm": 1.1796875, + "learning_rate": 0.00015392195125546286, + "loss": 4.2752, + "step": 9228 + }, + { + "epoch": 0.9569370548704224, + "grad_norm": 1.15625, + "learning_rate": 0.00015391280309081968, + "loss": 4.2593, + "step": 9229 + }, + { + "epoch": 0.9570407429249104, + "grad_norm": 1.0859375, + "learning_rate": 0.00015390365429006236, + "loss": 4.2702, + "step": 9230 + }, + { + "epoch": 0.9571444309793985, + "grad_norm": 1.140625, + "learning_rate": 0.00015389450485329894, + "loss": 4.2631, + "step": 9231 + }, + { + "epoch": 0.9572481190338865, + "grad_norm": 1.0234375, + "learning_rate": 0.0001538853547806373, + "loss": 4.2499, + "step": 9232 + }, + { + "epoch": 0.9573518070883746, + "grad_norm": 1.2421875, + "learning_rate": 0.00015387620407218545, + "loss": 4.2652, + "step": 9233 + }, + { + "epoch": 0.9574554951428627, + "grad_norm": 1.078125, + "learning_rate": 0.00015386705272805133, + "loss": 4.2738, + "step": 9234 + }, + { + "epoch": 0.9575591831973508, + "grad_norm": 1.46875, + "learning_rate": 0.0001538579007483429, + "loss": 4.2667, + "step": 9235 + }, + { + "epoch": 0.9576628712518388, + "grad_norm": 1.4453125, + "learning_rate": 0.00015384874813316826, + "loss": 4.2577, + "step": 9236 + }, + { + "epoch": 0.9577665593063269, + "grad_norm": 1.046875, + "learning_rate": 0.00015383959488263525, + "loss": 4.2821, + "step": 9237 + }, + { + "epoch": 0.9578702473608149, + "grad_norm": 1.109375, + "learning_rate": 0.00015383044099685192, + "loss": 4.3023, + "step": 9238 + }, + { + "epoch": 0.9579739354153031, + "grad_norm": 0.9375, + "learning_rate": 0.00015382128647592632, + "loss": 4.2862, + "step": 9239 + }, + { + "epoch": 0.9580776234697911, + "grad_norm": 0.90625, + "learning_rate": 0.00015381213131996643, + "loss": 4.2631, + "step": 9240 + }, + { + "epoch": 0.9581813115242792, + "grad_norm": 0.98046875, + "learning_rate": 0.00015380297552908026, + "loss": 4.2545, + "step": 9241 + }, + { + "epoch": 0.9582849995787672, + "grad_norm": 0.765625, + "learning_rate": 0.00015379381910337583, + "loss": 4.2514, + "step": 9242 + }, + { + "epoch": 0.9583886876332554, + "grad_norm": 0.99609375, + "learning_rate": 0.00015378466204296121, + "loss": 4.2767, + "step": 9243 + }, + { + "epoch": 0.9584923756877434, + "grad_norm": 0.72265625, + "learning_rate": 0.00015377550434794447, + "loss": 4.29, + "step": 9244 + }, + { + "epoch": 0.9585960637422315, + "grad_norm": 0.9296875, + "learning_rate": 0.00015376634601843358, + "loss": 4.2716, + "step": 9245 + }, + { + "epoch": 0.9586997517967196, + "grad_norm": 0.72265625, + "learning_rate": 0.00015375718705453663, + "loss": 4.2961, + "step": 9246 + }, + { + "epoch": 0.9588034398512076, + "grad_norm": 0.9375, + "learning_rate": 0.00015374802745636172, + "loss": 4.2452, + "step": 9247 + }, + { + "epoch": 0.9589071279056958, + "grad_norm": 0.75390625, + "learning_rate": 0.00015373886722401693, + "loss": 4.2632, + "step": 9248 + }, + { + "epoch": 0.9590108159601838, + "grad_norm": 0.77734375, + "learning_rate": 0.00015372970635761025, + "loss": 4.2766, + "step": 9249 + }, + { + "epoch": 0.9591145040146719, + "grad_norm": 0.734375, + "learning_rate": 0.00015372054485724988, + "loss": 4.2929, + "step": 9250 + }, + { + "epoch": 0.9592181920691599, + "grad_norm": 0.73046875, + "learning_rate": 0.00015371138272304383, + "loss": 4.2799, + "step": 9251 + }, + { + "epoch": 0.959321880123648, + "grad_norm": 0.75, + "learning_rate": 0.00015370221995510025, + "loss": 4.2487, + "step": 9252 + }, + { + "epoch": 0.9594255681781361, + "grad_norm": 0.83203125, + "learning_rate": 0.0001536930565535273, + "loss": 4.2844, + "step": 9253 + }, + { + "epoch": 0.9595292562326242, + "grad_norm": 0.6953125, + "learning_rate": 0.00015368389251843297, + "loss": 4.258, + "step": 9254 + }, + { + "epoch": 0.9596329442871122, + "grad_norm": 0.71875, + "learning_rate": 0.00015367472784992545, + "loss": 4.2792, + "step": 9255 + }, + { + "epoch": 0.9597366323416003, + "grad_norm": 0.8359375, + "learning_rate": 0.0001536655625481129, + "loss": 4.2807, + "step": 9256 + }, + { + "epoch": 0.9598403203960884, + "grad_norm": 0.6875, + "learning_rate": 0.00015365639661310342, + "loss": 4.2627, + "step": 9257 + }, + { + "epoch": 0.9599440084505765, + "grad_norm": 0.796875, + "learning_rate": 0.00015364723004500518, + "loss": 4.2875, + "step": 9258 + }, + { + "epoch": 0.9600476965050645, + "grad_norm": 0.72265625, + "learning_rate": 0.00015363806284392637, + "loss": 4.2563, + "step": 9259 + }, + { + "epoch": 0.9601513845595526, + "grad_norm": 0.86328125, + "learning_rate": 0.0001536288950099751, + "loss": 4.266, + "step": 9260 + }, + { + "epoch": 0.9602550726140406, + "grad_norm": 0.69921875, + "learning_rate": 0.00015361972654325952, + "loss": 4.2698, + "step": 9261 + }, + { + "epoch": 0.9603587606685288, + "grad_norm": 0.79296875, + "learning_rate": 0.00015361055744388789, + "loss": 4.2405, + "step": 9262 + }, + { + "epoch": 0.9604624487230168, + "grad_norm": 0.76953125, + "learning_rate": 0.00015360138771196833, + "loss": 4.2959, + "step": 9263 + }, + { + "epoch": 0.9605661367775049, + "grad_norm": 0.87890625, + "learning_rate": 0.00015359221734760907, + "loss": 4.2977, + "step": 9264 + }, + { + "epoch": 0.9606698248319929, + "grad_norm": 0.7421875, + "learning_rate": 0.0001535830463509183, + "loss": 4.2779, + "step": 9265 + }, + { + "epoch": 0.960773512886481, + "grad_norm": 0.7890625, + "learning_rate": 0.00015357387472200417, + "loss": 4.2732, + "step": 9266 + }, + { + "epoch": 0.9608772009409691, + "grad_norm": 0.7421875, + "learning_rate": 0.000153564702460975, + "loss": 4.266, + "step": 9267 + }, + { + "epoch": 0.9609808889954572, + "grad_norm": 0.84375, + "learning_rate": 0.00015355552956793891, + "loss": 4.2521, + "step": 9268 + }, + { + "epoch": 0.9610845770499452, + "grad_norm": 0.75, + "learning_rate": 0.00015354635604300425, + "loss": 4.2541, + "step": 9269 + }, + { + "epoch": 0.9611882651044333, + "grad_norm": 0.79296875, + "learning_rate": 0.00015353718188627915, + "loss": 4.2916, + "step": 9270 + }, + { + "epoch": 0.9612919531589214, + "grad_norm": 0.75390625, + "learning_rate": 0.0001535280070978719, + "loss": 4.2589, + "step": 9271 + }, + { + "epoch": 0.9613956412134095, + "grad_norm": 0.7890625, + "learning_rate": 0.00015351883167789073, + "loss": 4.2507, + "step": 9272 + }, + { + "epoch": 0.9614993292678975, + "grad_norm": 0.8125, + "learning_rate": 0.00015350965562644397, + "loss": 4.2826, + "step": 9273 + }, + { + "epoch": 0.9616030173223856, + "grad_norm": 0.828125, + "learning_rate": 0.00015350047894363978, + "loss": 4.2858, + "step": 9274 + }, + { + "epoch": 0.9617067053768736, + "grad_norm": 0.8046875, + "learning_rate": 0.00015349130162958653, + "loss": 4.3188, + "step": 9275 + }, + { + "epoch": 0.9618103934313618, + "grad_norm": 0.7890625, + "learning_rate": 0.00015348212368439246, + "loss": 4.2959, + "step": 9276 + }, + { + "epoch": 0.9619140814858498, + "grad_norm": 0.765625, + "learning_rate": 0.00015347294510816584, + "loss": 4.2796, + "step": 9277 + }, + { + "epoch": 0.9620177695403379, + "grad_norm": 0.72265625, + "learning_rate": 0.000153463765901015, + "loss": 4.2404, + "step": 9278 + }, + { + "epoch": 0.9621214575948259, + "grad_norm": 0.8125, + "learning_rate": 0.00015345458606304827, + "loss": 4.2171, + "step": 9279 + }, + { + "epoch": 0.962225145649314, + "grad_norm": 0.7421875, + "learning_rate": 0.0001534454055943739, + "loss": 4.3025, + "step": 9280 + }, + { + "epoch": 0.9623288337038021, + "grad_norm": 0.70703125, + "learning_rate": 0.0001534362244951002, + "loss": 4.2821, + "step": 9281 + }, + { + "epoch": 0.9624325217582902, + "grad_norm": 0.7421875, + "learning_rate": 0.00015342704276533558, + "loss": 4.2664, + "step": 9282 + }, + { + "epoch": 0.9625362098127782, + "grad_norm": 0.66796875, + "learning_rate": 0.0001534178604051883, + "loss": 4.2356, + "step": 9283 + }, + { + "epoch": 0.9626398978672663, + "grad_norm": 0.77734375, + "learning_rate": 0.00015340867741476676, + "loss": 4.2559, + "step": 9284 + }, + { + "epoch": 0.9627435859217544, + "grad_norm": 0.640625, + "learning_rate": 0.00015339949379417927, + "loss": 4.2702, + "step": 9285 + }, + { + "epoch": 0.9628472739762425, + "grad_norm": 0.69140625, + "learning_rate": 0.00015339030954353424, + "loss": 4.2583, + "step": 9286 + }, + { + "epoch": 0.9629509620307305, + "grad_norm": 0.68359375, + "learning_rate": 0.00015338112466293992, + "loss": 4.2732, + "step": 9287 + }, + { + "epoch": 0.9630546500852186, + "grad_norm": 0.7265625, + "learning_rate": 0.0001533719391525048, + "loss": 4.2976, + "step": 9288 + }, + { + "epoch": 0.9631583381397066, + "grad_norm": 0.796875, + "learning_rate": 0.00015336275301233723, + "loss": 4.2614, + "step": 9289 + }, + { + "epoch": 0.9632620261941948, + "grad_norm": 0.69921875, + "learning_rate": 0.00015335356624254556, + "loss": 4.2133, + "step": 9290 + }, + { + "epoch": 0.9633657142486829, + "grad_norm": 0.7421875, + "learning_rate": 0.0001533443788432382, + "loss": 4.2687, + "step": 9291 + }, + { + "epoch": 0.9634694023031709, + "grad_norm": 0.71875, + "learning_rate": 0.00015333519081452357, + "loss": 4.3103, + "step": 9292 + }, + { + "epoch": 0.963573090357659, + "grad_norm": 0.64453125, + "learning_rate": 0.00015332600215651004, + "loss": 4.2408, + "step": 9293 + }, + { + "epoch": 0.963676778412147, + "grad_norm": 0.73828125, + "learning_rate": 0.0001533168128693061, + "loss": 4.2903, + "step": 9294 + }, + { + "epoch": 0.9637804664666352, + "grad_norm": 0.6796875, + "learning_rate": 0.00015330762295302008, + "loss": 4.2581, + "step": 9295 + }, + { + "epoch": 0.9638841545211232, + "grad_norm": 0.73828125, + "learning_rate": 0.00015329843240776049, + "loss": 4.2235, + "step": 9296 + }, + { + "epoch": 0.9639878425756113, + "grad_norm": 0.77734375, + "learning_rate": 0.0001532892412336357, + "loss": 4.2837, + "step": 9297 + }, + { + "epoch": 0.9640915306300993, + "grad_norm": 0.75390625, + "learning_rate": 0.0001532800494307542, + "loss": 4.261, + "step": 9298 + }, + { + "epoch": 0.9641952186845875, + "grad_norm": 0.75, + "learning_rate": 0.00015327085699922446, + "loss": 4.2868, + "step": 9299 + }, + { + "epoch": 0.9642989067390755, + "grad_norm": 0.734375, + "learning_rate": 0.0001532616639391549, + "loss": 4.2561, + "step": 9300 + }, + { + "epoch": 0.9644025947935636, + "grad_norm": 0.76953125, + "learning_rate": 0.00015325247025065403, + "loss": 4.2627, + "step": 9301 + }, + { + "epoch": 0.9645062828480516, + "grad_norm": 0.82421875, + "learning_rate": 0.00015324327593383027, + "loss": 4.2958, + "step": 9302 + }, + { + "epoch": 0.9646099709025397, + "grad_norm": 0.81640625, + "learning_rate": 0.00015323408098879217, + "loss": 4.2615, + "step": 9303 + }, + { + "epoch": 0.9647136589570278, + "grad_norm": 0.86328125, + "learning_rate": 0.00015322488541564817, + "loss": 4.2625, + "step": 9304 + }, + { + "epoch": 0.9648173470115159, + "grad_norm": 0.78125, + "learning_rate": 0.00015321568921450676, + "loss": 4.2513, + "step": 9305 + }, + { + "epoch": 0.9649210350660039, + "grad_norm": 0.86328125, + "learning_rate": 0.0001532064923854765, + "loss": 4.2893, + "step": 9306 + }, + { + "epoch": 0.965024723120492, + "grad_norm": 0.7578125, + "learning_rate": 0.00015319729492866584, + "loss": 4.2635, + "step": 9307 + }, + { + "epoch": 0.96512841117498, + "grad_norm": 0.875, + "learning_rate": 0.00015318809684418337, + "loss": 4.2596, + "step": 9308 + }, + { + "epoch": 0.9652320992294682, + "grad_norm": 0.7421875, + "learning_rate": 0.00015317889813213753, + "loss": 4.2766, + "step": 9309 + }, + { + "epoch": 0.9653357872839562, + "grad_norm": 0.8828125, + "learning_rate": 0.00015316969879263694, + "loss": 4.2442, + "step": 9310 + }, + { + "epoch": 0.9654394753384443, + "grad_norm": 0.7265625, + "learning_rate": 0.0001531604988257901, + "loss": 4.2975, + "step": 9311 + }, + { + "epoch": 0.9655431633929323, + "grad_norm": 0.9140625, + "learning_rate": 0.00015315129823170555, + "loss": 4.2573, + "step": 9312 + }, + { + "epoch": 0.9656468514474205, + "grad_norm": 0.6953125, + "learning_rate": 0.0001531420970104919, + "loss": 4.2364, + "step": 9313 + }, + { + "epoch": 0.9657505395019085, + "grad_norm": 0.953125, + "learning_rate": 0.00015313289516225766, + "loss": 4.2463, + "step": 9314 + }, + { + "epoch": 0.9658542275563966, + "grad_norm": 0.72265625, + "learning_rate": 0.00015312369268711144, + "loss": 4.2555, + "step": 9315 + }, + { + "epoch": 0.9659579156108846, + "grad_norm": 0.87890625, + "learning_rate": 0.00015311448958516176, + "loss": 4.2938, + "step": 9316 + }, + { + "epoch": 0.9660616036653727, + "grad_norm": 0.64453125, + "learning_rate": 0.00015310528585651725, + "loss": 4.2862, + "step": 9317 + }, + { + "epoch": 0.9661652917198608, + "grad_norm": 0.90234375, + "learning_rate": 0.00015309608150128654, + "loss": 4.2608, + "step": 9318 + }, + { + "epoch": 0.9662689797743489, + "grad_norm": 0.66015625, + "learning_rate": 0.00015308687651957817, + "loss": 4.2827, + "step": 9319 + }, + { + "epoch": 0.9663726678288369, + "grad_norm": 0.84765625, + "learning_rate": 0.00015307767091150078, + "loss": 4.31, + "step": 9320 + }, + { + "epoch": 0.966476355883325, + "grad_norm": 0.66015625, + "learning_rate": 0.00015306846467716295, + "loss": 4.2828, + "step": 9321 + }, + { + "epoch": 0.966580043937813, + "grad_norm": 0.80859375, + "learning_rate": 0.00015305925781667335, + "loss": 4.2935, + "step": 9322 + }, + { + "epoch": 0.9666837319923012, + "grad_norm": 0.79296875, + "learning_rate": 0.00015305005033014064, + "loss": 4.2706, + "step": 9323 + }, + { + "epoch": 0.9667874200467892, + "grad_norm": 0.75, + "learning_rate": 0.00015304084221767335, + "loss": 4.2371, + "step": 9324 + }, + { + "epoch": 0.9668911081012773, + "grad_norm": 0.73828125, + "learning_rate": 0.0001530316334793802, + "loss": 4.2946, + "step": 9325 + }, + { + "epoch": 0.9669947961557653, + "grad_norm": 0.69921875, + "learning_rate": 0.00015302242411536988, + "loss": 4.2853, + "step": 9326 + }, + { + "epoch": 0.9670984842102535, + "grad_norm": 0.8046875, + "learning_rate": 0.00015301321412575095, + "loss": 4.273, + "step": 9327 + }, + { + "epoch": 0.9672021722647415, + "grad_norm": 0.68359375, + "learning_rate": 0.00015300400351063215, + "loss": 4.2475, + "step": 9328 + }, + { + "epoch": 0.9673058603192296, + "grad_norm": 0.83203125, + "learning_rate": 0.00015299479227012214, + "loss": 4.3119, + "step": 9329 + }, + { + "epoch": 0.9674095483737176, + "grad_norm": 0.68359375, + "learning_rate": 0.0001529855804043296, + "loss": 4.2729, + "step": 9330 + }, + { + "epoch": 0.9675132364282057, + "grad_norm": 0.90625, + "learning_rate": 0.0001529763679133632, + "loss": 4.2845, + "step": 9331 + }, + { + "epoch": 0.9676169244826938, + "grad_norm": 0.71875, + "learning_rate": 0.0001529671547973317, + "loss": 4.2634, + "step": 9332 + }, + { + "epoch": 0.9677206125371819, + "grad_norm": 0.97265625, + "learning_rate": 0.00015295794105634372, + "loss": 4.2497, + "step": 9333 + }, + { + "epoch": 0.9678243005916699, + "grad_norm": 0.7109375, + "learning_rate": 0.00015294872669050798, + "loss": 4.2846, + "step": 9334 + }, + { + "epoch": 0.967927988646158, + "grad_norm": 0.87109375, + "learning_rate": 0.00015293951169993331, + "loss": 4.272, + "step": 9335 + }, + { + "epoch": 0.9680316767006462, + "grad_norm": 0.7109375, + "learning_rate": 0.00015293029608472834, + "loss": 4.2757, + "step": 9336 + }, + { + "epoch": 0.9681353647551342, + "grad_norm": 0.80859375, + "learning_rate": 0.00015292107984500182, + "loss": 4.2667, + "step": 9337 + }, + { + "epoch": 0.9682390528096223, + "grad_norm": 0.6796875, + "learning_rate": 0.00015291186298086248, + "loss": 4.2901, + "step": 9338 + }, + { + "epoch": 0.9683427408641103, + "grad_norm": 0.828125, + "learning_rate": 0.0001529026454924191, + "loss": 4.284, + "step": 9339 + }, + { + "epoch": 0.9684464289185984, + "grad_norm": 0.703125, + "learning_rate": 0.00015289342737978044, + "loss": 4.2652, + "step": 9340 + }, + { + "epoch": 0.9685501169730865, + "grad_norm": 0.78125, + "learning_rate": 0.00015288420864305522, + "loss": 4.2741, + "step": 9341 + }, + { + "epoch": 0.9686538050275746, + "grad_norm": 0.76171875, + "learning_rate": 0.00015287498928235227, + "loss": 4.2619, + "step": 9342 + }, + { + "epoch": 0.9687574930820626, + "grad_norm": 0.796875, + "learning_rate": 0.0001528657692977803, + "loss": 4.2337, + "step": 9343 + }, + { + "epoch": 0.9688611811365507, + "grad_norm": 0.8515625, + "learning_rate": 0.00015285654868944817, + "loss": 4.2751, + "step": 9344 + }, + { + "epoch": 0.9689648691910387, + "grad_norm": 0.890625, + "learning_rate": 0.0001528473274574646, + "loss": 4.2606, + "step": 9345 + }, + { + "epoch": 0.9690685572455269, + "grad_norm": 0.80859375, + "learning_rate": 0.00015283810560193846, + "loss": 4.2429, + "step": 9346 + }, + { + "epoch": 0.9691722453000149, + "grad_norm": 0.9609375, + "learning_rate": 0.0001528288831229785, + "loss": 4.2399, + "step": 9347 + }, + { + "epoch": 0.969275933354503, + "grad_norm": 0.76171875, + "learning_rate": 0.00015281966002069362, + "loss": 4.284, + "step": 9348 + }, + { + "epoch": 0.969379621408991, + "grad_norm": 0.80078125, + "learning_rate": 0.00015281043629519252, + "loss": 4.2841, + "step": 9349 + }, + { + "epoch": 0.9694833094634792, + "grad_norm": 0.7265625, + "learning_rate": 0.0001528012119465841, + "loss": 4.292, + "step": 9350 + }, + { + "epoch": 0.9695869975179672, + "grad_norm": 0.77734375, + "learning_rate": 0.00015279198697497722, + "loss": 4.2625, + "step": 9351 + }, + { + "epoch": 0.9696906855724553, + "grad_norm": 0.7109375, + "learning_rate": 0.00015278276138048068, + "loss": 4.2883, + "step": 9352 + }, + { + "epoch": 0.9697943736269433, + "grad_norm": 0.70703125, + "learning_rate": 0.00015277353516320337, + "loss": 4.2763, + "step": 9353 + }, + { + "epoch": 0.9698980616814314, + "grad_norm": 0.75390625, + "learning_rate": 0.0001527643083232541, + "loss": 4.2784, + "step": 9354 + }, + { + "epoch": 0.9700017497359195, + "grad_norm": 0.6953125, + "learning_rate": 0.00015275508086074176, + "loss": 4.235, + "step": 9355 + }, + { + "epoch": 0.9701054377904076, + "grad_norm": 0.68359375, + "learning_rate": 0.0001527458527757753, + "loss": 4.3079, + "step": 9356 + }, + { + "epoch": 0.9702091258448956, + "grad_norm": 0.640625, + "learning_rate": 0.0001527366240684635, + "loss": 4.2871, + "step": 9357 + }, + { + "epoch": 0.9703128138993837, + "grad_norm": 0.69921875, + "learning_rate": 0.0001527273947389152, + "loss": 4.2377, + "step": 9358 + }, + { + "epoch": 0.9704165019538717, + "grad_norm": 0.671875, + "learning_rate": 0.00015271816478723945, + "loss": 4.3026, + "step": 9359 + }, + { + "epoch": 0.9705201900083599, + "grad_norm": 0.72265625, + "learning_rate": 0.00015270893421354508, + "loss": 4.2523, + "step": 9360 + }, + { + "epoch": 0.9706238780628479, + "grad_norm": 0.69921875, + "learning_rate": 0.00015269970301794102, + "loss": 4.2463, + "step": 9361 + }, + { + "epoch": 0.970727566117336, + "grad_norm": 0.671875, + "learning_rate": 0.0001526904712005361, + "loss": 4.2646, + "step": 9362 + }, + { + "epoch": 0.970831254171824, + "grad_norm": 0.6640625, + "learning_rate": 0.00015268123876143938, + "loss": 4.2801, + "step": 9363 + }, + { + "epoch": 0.9709349422263122, + "grad_norm": 0.66015625, + "learning_rate": 0.00015267200570075973, + "loss": 4.2408, + "step": 9364 + }, + { + "epoch": 0.9710386302808002, + "grad_norm": 0.68359375, + "learning_rate": 0.00015266277201860608, + "loss": 4.1902, + "step": 9365 + }, + { + "epoch": 0.9711423183352883, + "grad_norm": 0.671875, + "learning_rate": 0.00015265353771508737, + "loss": 4.2369, + "step": 9366 + }, + { + "epoch": 0.9712460063897763, + "grad_norm": 0.6875, + "learning_rate": 0.00015264430279031256, + "loss": 4.2613, + "step": 9367 + }, + { + "epoch": 0.9713496944442644, + "grad_norm": 0.7421875, + "learning_rate": 0.00015263506724439064, + "loss": 4.2568, + "step": 9368 + }, + { + "epoch": 0.9714533824987525, + "grad_norm": 0.74609375, + "learning_rate": 0.00015262583107743057, + "loss": 4.2521, + "step": 9369 + }, + { + "epoch": 0.9715570705532406, + "grad_norm": 0.8828125, + "learning_rate": 0.00015261659428954133, + "loss": 4.2344, + "step": 9370 + }, + { + "epoch": 0.9716607586077286, + "grad_norm": 0.796875, + "learning_rate": 0.0001526073568808319, + "loss": 4.2492, + "step": 9371 + }, + { + "epoch": 0.9717644466622167, + "grad_norm": 0.9140625, + "learning_rate": 0.0001525981188514112, + "loss": 4.2453, + "step": 9372 + }, + { + "epoch": 0.9718681347167047, + "grad_norm": 0.78125, + "learning_rate": 0.0001525888802013884, + "loss": 4.2665, + "step": 9373 + }, + { + "epoch": 0.9719718227711929, + "grad_norm": 0.7890625, + "learning_rate": 0.00015257964093087233, + "loss": 4.2541, + "step": 9374 + }, + { + "epoch": 0.9720755108256809, + "grad_norm": 0.7578125, + "learning_rate": 0.00015257040103997208, + "loss": 4.2422, + "step": 9375 + }, + { + "epoch": 0.972179198880169, + "grad_norm": 0.8984375, + "learning_rate": 0.0001525611605287967, + "loss": 4.2632, + "step": 9376 + }, + { + "epoch": 0.972282886934657, + "grad_norm": 0.75390625, + "learning_rate": 0.00015255191939745518, + "loss": 4.2566, + "step": 9377 + }, + { + "epoch": 0.9723865749891452, + "grad_norm": 0.82421875, + "learning_rate": 0.00015254267764605657, + "loss": 4.2434, + "step": 9378 + }, + { + "epoch": 0.9724902630436332, + "grad_norm": 0.74609375, + "learning_rate": 0.00015253343527470986, + "loss": 4.287, + "step": 9379 + }, + { + "epoch": 0.9725939510981213, + "grad_norm": 0.80078125, + "learning_rate": 0.00015252419228352414, + "loss": 4.2895, + "step": 9380 + }, + { + "epoch": 0.9726976391526094, + "grad_norm": 0.82421875, + "learning_rate": 0.00015251494867260854, + "loss": 4.2692, + "step": 9381 + }, + { + "epoch": 0.9728013272070974, + "grad_norm": 0.74609375, + "learning_rate": 0.000152505704442072, + "loss": 4.3054, + "step": 9382 + }, + { + "epoch": 0.9729050152615856, + "grad_norm": 0.8046875, + "learning_rate": 0.00015249645959202366, + "loss": 4.2397, + "step": 9383 + }, + { + "epoch": 0.9730087033160736, + "grad_norm": 0.71875, + "learning_rate": 0.0001524872141225726, + "loss": 4.2267, + "step": 9384 + }, + { + "epoch": 0.9731123913705617, + "grad_norm": 0.80859375, + "learning_rate": 0.00015247796803382789, + "loss": 4.2755, + "step": 9385 + }, + { + "epoch": 0.9732160794250497, + "grad_norm": 0.6015625, + "learning_rate": 0.00015246872132589862, + "loss": 4.2202, + "step": 9386 + }, + { + "epoch": 0.9733197674795379, + "grad_norm": 0.85546875, + "learning_rate": 0.0001524594739988939, + "loss": 4.2612, + "step": 9387 + }, + { + "epoch": 0.9734234555340259, + "grad_norm": 0.6171875, + "learning_rate": 0.00015245022605292285, + "loss": 4.2896, + "step": 9388 + }, + { + "epoch": 0.973527143588514, + "grad_norm": 0.77734375, + "learning_rate": 0.00015244097748809456, + "loss": 4.2922, + "step": 9389 + }, + { + "epoch": 0.973630831643002, + "grad_norm": 0.69140625, + "learning_rate": 0.0001524317283045182, + "loss": 4.2702, + "step": 9390 + }, + { + "epoch": 0.9737345196974901, + "grad_norm": 0.79296875, + "learning_rate": 0.00015242247850230282, + "loss": 4.2298, + "step": 9391 + }, + { + "epoch": 0.9738382077519782, + "grad_norm": 0.703125, + "learning_rate": 0.00015241322808155763, + "loss": 4.2585, + "step": 9392 + }, + { + "epoch": 0.9739418958064663, + "grad_norm": 0.75390625, + "learning_rate": 0.00015240397704239177, + "loss": 4.3028, + "step": 9393 + }, + { + "epoch": 0.9740455838609543, + "grad_norm": 0.72265625, + "learning_rate": 0.00015239472538491439, + "loss": 4.2511, + "step": 9394 + }, + { + "epoch": 0.9741492719154424, + "grad_norm": 0.7265625, + "learning_rate": 0.00015238547310923457, + "loss": 4.2455, + "step": 9395 + }, + { + "epoch": 0.9742529599699304, + "grad_norm": 0.69921875, + "learning_rate": 0.00015237622021546158, + "loss": 4.2601, + "step": 9396 + }, + { + "epoch": 0.9743566480244186, + "grad_norm": 0.71875, + "learning_rate": 0.00015236696670370455, + "loss": 4.2497, + "step": 9397 + }, + { + "epoch": 0.9744603360789066, + "grad_norm": 0.7421875, + "learning_rate": 0.00015235771257407268, + "loss": 4.1968, + "step": 9398 + }, + { + "epoch": 0.9745640241333947, + "grad_norm": 0.83984375, + "learning_rate": 0.0001523484578266751, + "loss": 4.2809, + "step": 9399 + }, + { + "epoch": 0.9746677121878827, + "grad_norm": 0.7421875, + "learning_rate": 0.00015233920246162107, + "loss": 4.2493, + "step": 9400 + }, + { + "epoch": 0.9747714002423709, + "grad_norm": 0.80859375, + "learning_rate": 0.00015232994647901982, + "loss": 4.2857, + "step": 9401 + }, + { + "epoch": 0.9748750882968589, + "grad_norm": 0.84765625, + "learning_rate": 0.0001523206898789805, + "loss": 4.2667, + "step": 9402 + }, + { + "epoch": 0.974978776351347, + "grad_norm": 0.81640625, + "learning_rate": 0.00015231143266161232, + "loss": 4.2546, + "step": 9403 + }, + { + "epoch": 0.975082464405835, + "grad_norm": 0.80078125, + "learning_rate": 0.00015230217482702454, + "loss": 4.2398, + "step": 9404 + }, + { + "epoch": 0.9751861524603231, + "grad_norm": 0.8125, + "learning_rate": 0.0001522929163753264, + "loss": 4.2702, + "step": 9405 + }, + { + "epoch": 0.9752898405148112, + "grad_norm": 0.75, + "learning_rate": 0.00015228365730662712, + "loss": 4.2692, + "step": 9406 + }, + { + "epoch": 0.9753935285692993, + "grad_norm": 0.71484375, + "learning_rate": 0.00015227439762103594, + "loss": 4.3059, + "step": 9407 + }, + { + "epoch": 0.9754972166237873, + "grad_norm": 0.71484375, + "learning_rate": 0.0001522651373186621, + "loss": 4.2772, + "step": 9408 + }, + { + "epoch": 0.9756009046782754, + "grad_norm": 0.72265625, + "learning_rate": 0.00015225587639961492, + "loss": 4.2697, + "step": 9409 + }, + { + "epoch": 0.9757045927327634, + "grad_norm": 0.7734375, + "learning_rate": 0.0001522466148640036, + "loss": 4.2494, + "step": 9410 + }, + { + "epoch": 0.9758082807872516, + "grad_norm": 0.7734375, + "learning_rate": 0.00015223735271193748, + "loss": 4.2737, + "step": 9411 + }, + { + "epoch": 0.9759119688417396, + "grad_norm": 0.75, + "learning_rate": 0.00015222808994352582, + "loss": 4.2676, + "step": 9412 + }, + { + "epoch": 0.9760156568962277, + "grad_norm": 0.77734375, + "learning_rate": 0.0001522188265588779, + "loss": 4.2648, + "step": 9413 + }, + { + "epoch": 0.9761193449507157, + "grad_norm": 0.7890625, + "learning_rate": 0.00015220956255810304, + "loss": 4.2446, + "step": 9414 + }, + { + "epoch": 0.9762230330052039, + "grad_norm": 0.73828125, + "learning_rate": 0.0001522002979413105, + "loss": 4.2817, + "step": 9415 + }, + { + "epoch": 0.9763267210596919, + "grad_norm": 0.765625, + "learning_rate": 0.00015219103270860965, + "loss": 4.2879, + "step": 9416 + }, + { + "epoch": 0.97643040911418, + "grad_norm": 0.8359375, + "learning_rate": 0.00015218176686010974, + "loss": 4.2554, + "step": 9417 + }, + { + "epoch": 0.976534097168668, + "grad_norm": 0.8046875, + "learning_rate": 0.00015217250039592018, + "loss": 4.2662, + "step": 9418 + }, + { + "epoch": 0.9766377852231561, + "grad_norm": 0.77734375, + "learning_rate": 0.00015216323331615023, + "loss": 4.271, + "step": 9419 + }, + { + "epoch": 0.9767414732776442, + "grad_norm": 0.76171875, + "learning_rate": 0.0001521539656209093, + "loss": 4.261, + "step": 9420 + }, + { + "epoch": 0.9768451613321323, + "grad_norm": 0.7890625, + "learning_rate": 0.00015214469731030666, + "loss": 4.3136, + "step": 9421 + }, + { + "epoch": 0.9769488493866203, + "grad_norm": 0.71484375, + "learning_rate": 0.00015213542838445176, + "loss": 4.2493, + "step": 9422 + }, + { + "epoch": 0.9770525374411084, + "grad_norm": 0.7890625, + "learning_rate": 0.0001521261588434539, + "loss": 4.2347, + "step": 9423 + }, + { + "epoch": 0.9771562254955966, + "grad_norm": 0.66796875, + "learning_rate": 0.00015211688868742247, + "loss": 4.2623, + "step": 9424 + }, + { + "epoch": 0.9772599135500846, + "grad_norm": 0.7109375, + "learning_rate": 0.00015210761791646684, + "loss": 4.2435, + "step": 9425 + }, + { + "epoch": 0.9773636016045727, + "grad_norm": 0.72265625, + "learning_rate": 0.0001520983465306964, + "loss": 4.2485, + "step": 9426 + }, + { + "epoch": 0.9774672896590607, + "grad_norm": 0.75390625, + "learning_rate": 0.0001520890745302205, + "loss": 4.2038, + "step": 9427 + }, + { + "epoch": 0.9775709777135488, + "grad_norm": 0.703125, + "learning_rate": 0.00015207980191514863, + "loss": 4.2652, + "step": 9428 + }, + { + "epoch": 0.9776746657680369, + "grad_norm": 0.69921875, + "learning_rate": 0.00015207052868559012, + "loss": 4.2598, + "step": 9429 + }, + { + "epoch": 0.977778353822525, + "grad_norm": 0.7421875, + "learning_rate": 0.00015206125484165445, + "loss": 4.2875, + "step": 9430 + }, + { + "epoch": 0.977882041877013, + "grad_norm": 0.74609375, + "learning_rate": 0.000152051980383451, + "loss": 4.2917, + "step": 9431 + }, + { + "epoch": 0.9779857299315011, + "grad_norm": 0.73828125, + "learning_rate": 0.00015204270531108915, + "loss": 4.2378, + "step": 9432 + }, + { + "epoch": 0.9780894179859891, + "grad_norm": 0.76171875, + "learning_rate": 0.00015203342962467843, + "loss": 4.2538, + "step": 9433 + }, + { + "epoch": 0.9781931060404773, + "grad_norm": 0.671875, + "learning_rate": 0.00015202415332432826, + "loss": 4.2516, + "step": 9434 + }, + { + "epoch": 0.9782967940949653, + "grad_norm": 0.703125, + "learning_rate": 0.00015201487641014803, + "loss": 4.2188, + "step": 9435 + }, + { + "epoch": 0.9784004821494534, + "grad_norm": 0.73046875, + "learning_rate": 0.00015200559888224727, + "loss": 4.2706, + "step": 9436 + }, + { + "epoch": 0.9785041702039414, + "grad_norm": 0.77734375, + "learning_rate": 0.00015199632074073538, + "loss": 4.294, + "step": 9437 + }, + { + "epoch": 0.9786078582584296, + "grad_norm": 0.79296875, + "learning_rate": 0.00015198704198572189, + "loss": 4.2657, + "step": 9438 + }, + { + "epoch": 0.9787115463129176, + "grad_norm": 0.80859375, + "learning_rate": 0.00015197776261731627, + "loss": 4.3236, + "step": 9439 + }, + { + "epoch": 0.9788152343674057, + "grad_norm": 0.70703125, + "learning_rate": 0.000151968482635628, + "loss": 4.2368, + "step": 9440 + }, + { + "epoch": 0.9789189224218937, + "grad_norm": 0.81640625, + "learning_rate": 0.00015195920204076654, + "loss": 4.2963, + "step": 9441 + }, + { + "epoch": 0.9790226104763818, + "grad_norm": 0.78125, + "learning_rate": 0.00015194992083284142, + "loss": 4.2503, + "step": 9442 + }, + { + "epoch": 0.9791262985308699, + "grad_norm": 0.83203125, + "learning_rate": 0.00015194063901196217, + "loss": 4.2708, + "step": 9443 + }, + { + "epoch": 0.979229986585358, + "grad_norm": 0.70703125, + "learning_rate": 0.00015193135657823827, + "loss": 4.2709, + "step": 9444 + }, + { + "epoch": 0.979333674639846, + "grad_norm": 0.8359375, + "learning_rate": 0.00015192207353177922, + "loss": 4.2549, + "step": 9445 + }, + { + "epoch": 0.9794373626943341, + "grad_norm": 0.73046875, + "learning_rate": 0.00015191278987269463, + "loss": 4.2305, + "step": 9446 + }, + { + "epoch": 0.9795410507488221, + "grad_norm": 0.82421875, + "learning_rate": 0.00015190350560109398, + "loss": 4.2507, + "step": 9447 + }, + { + "epoch": 0.9796447388033103, + "grad_norm": 0.76171875, + "learning_rate": 0.00015189422071708685, + "loss": 4.2778, + "step": 9448 + }, + { + "epoch": 0.9797484268577983, + "grad_norm": 0.91015625, + "learning_rate": 0.00015188493522078276, + "loss": 4.2954, + "step": 9449 + }, + { + "epoch": 0.9798521149122864, + "grad_norm": 0.765625, + "learning_rate": 0.00015187564911229125, + "loss": 4.298, + "step": 9450 + }, + { + "epoch": 0.9799558029667744, + "grad_norm": 0.94921875, + "learning_rate": 0.00015186636239172198, + "loss": 4.2874, + "step": 9451 + }, + { + "epoch": 0.9800594910212626, + "grad_norm": 0.87109375, + "learning_rate": 0.00015185707505918438, + "loss": 4.2759, + "step": 9452 + }, + { + "epoch": 0.9801631790757506, + "grad_norm": 0.890625, + "learning_rate": 0.00015184778711478818, + "loss": 4.3086, + "step": 9453 + }, + { + "epoch": 0.9802668671302387, + "grad_norm": 0.81640625, + "learning_rate": 0.00015183849855864286, + "loss": 4.2788, + "step": 9454 + }, + { + "epoch": 0.9803705551847267, + "grad_norm": 0.91015625, + "learning_rate": 0.00015182920939085806, + "loss": 4.2358, + "step": 9455 + }, + { + "epoch": 0.9804742432392148, + "grad_norm": 0.85546875, + "learning_rate": 0.0001518199196115434, + "loss": 4.2496, + "step": 9456 + }, + { + "epoch": 0.9805779312937029, + "grad_norm": 0.8125, + "learning_rate": 0.0001518106292208084, + "loss": 4.2617, + "step": 9457 + }, + { + "epoch": 0.980681619348191, + "grad_norm": 0.80859375, + "learning_rate": 0.00015180133821876278, + "loss": 4.2754, + "step": 9458 + }, + { + "epoch": 0.980785307402679, + "grad_norm": 0.8125, + "learning_rate": 0.00015179204660551614, + "loss": 4.2655, + "step": 9459 + }, + { + "epoch": 0.9808889954571671, + "grad_norm": 0.83203125, + "learning_rate": 0.00015178275438117807, + "loss": 4.2661, + "step": 9460 + }, + { + "epoch": 0.9809926835116551, + "grad_norm": 0.77734375, + "learning_rate": 0.00015177346154585824, + "loss": 4.2846, + "step": 9461 + }, + { + "epoch": 0.9810963715661433, + "grad_norm": 0.8984375, + "learning_rate": 0.0001517641680996663, + "loss": 4.2724, + "step": 9462 + }, + { + "epoch": 0.9812000596206313, + "grad_norm": 0.78125, + "learning_rate": 0.0001517548740427119, + "loss": 4.2302, + "step": 9463 + }, + { + "epoch": 0.9813037476751194, + "grad_norm": 0.7890625, + "learning_rate": 0.00015174557937510467, + "loss": 4.2434, + "step": 9464 + }, + { + "epoch": 0.9814074357296074, + "grad_norm": 0.84765625, + "learning_rate": 0.00015173628409695432, + "loss": 4.257, + "step": 9465 + }, + { + "epoch": 0.9815111237840956, + "grad_norm": 0.90234375, + "learning_rate": 0.0001517269882083705, + "loss": 4.2477, + "step": 9466 + }, + { + "epoch": 0.9816148118385836, + "grad_norm": 0.77734375, + "learning_rate": 0.00015171769170946287, + "loss": 4.2774, + "step": 9467 + }, + { + "epoch": 0.9817184998930717, + "grad_norm": 0.9453125, + "learning_rate": 0.00015170839460034122, + "loss": 4.2568, + "step": 9468 + }, + { + "epoch": 0.9818221879475598, + "grad_norm": 0.7890625, + "learning_rate": 0.00015169909688111512, + "loss": 4.2727, + "step": 9469 + }, + { + "epoch": 0.9819258760020478, + "grad_norm": 0.8359375, + "learning_rate": 0.0001516897985518943, + "loss": 4.2676, + "step": 9470 + }, + { + "epoch": 0.982029564056536, + "grad_norm": 0.796875, + "learning_rate": 0.00015168049961278854, + "loss": 4.2741, + "step": 9471 + }, + { + "epoch": 0.982133252111024, + "grad_norm": 0.8203125, + "learning_rate": 0.0001516712000639075, + "loss": 4.2287, + "step": 9472 + }, + { + "epoch": 0.9822369401655121, + "grad_norm": 0.8125, + "learning_rate": 0.0001516618999053609, + "loss": 4.2734, + "step": 9473 + }, + { + "epoch": 0.9823406282200001, + "grad_norm": 0.83203125, + "learning_rate": 0.0001516525991372585, + "loss": 4.2906, + "step": 9474 + }, + { + "epoch": 0.9824443162744882, + "grad_norm": 0.80859375, + "learning_rate": 0.00015164329775971003, + "loss": 4.2236, + "step": 9475 + }, + { + "epoch": 0.9825480043289763, + "grad_norm": 0.7890625, + "learning_rate": 0.00015163399577282526, + "loss": 4.2571, + "step": 9476 + }, + { + "epoch": 0.9826516923834644, + "grad_norm": 0.7890625, + "learning_rate": 0.00015162469317671392, + "loss": 4.2449, + "step": 9477 + }, + { + "epoch": 0.9827553804379524, + "grad_norm": 0.7734375, + "learning_rate": 0.00015161538997148573, + "loss": 4.2457, + "step": 9478 + }, + { + "epoch": 0.9828590684924405, + "grad_norm": 0.84375, + "learning_rate": 0.00015160608615725054, + "loss": 4.2468, + "step": 9479 + }, + { + "epoch": 0.9829627565469286, + "grad_norm": 0.80078125, + "learning_rate": 0.00015159678173411805, + "loss": 4.2502, + "step": 9480 + }, + { + "epoch": 0.9830664446014167, + "grad_norm": 0.83203125, + "learning_rate": 0.0001515874767021981, + "loss": 4.2675, + "step": 9481 + }, + { + "epoch": 0.9831701326559047, + "grad_norm": 0.85546875, + "learning_rate": 0.00015157817106160044, + "loss": 4.3162, + "step": 9482 + }, + { + "epoch": 0.9832738207103928, + "grad_norm": 0.73046875, + "learning_rate": 0.0001515688648124349, + "loss": 4.2683, + "step": 9483 + }, + { + "epoch": 0.9833775087648808, + "grad_norm": 0.81640625, + "learning_rate": 0.00015155955795481124, + "loss": 4.2951, + "step": 9484 + }, + { + "epoch": 0.983481196819369, + "grad_norm": 0.734375, + "learning_rate": 0.0001515502504888393, + "loss": 4.2818, + "step": 9485 + }, + { + "epoch": 0.983584884873857, + "grad_norm": 0.765625, + "learning_rate": 0.0001515409424146289, + "loss": 4.3042, + "step": 9486 + }, + { + "epoch": 0.9836885729283451, + "grad_norm": 0.80078125, + "learning_rate": 0.00015153163373228987, + "loss": 4.2414, + "step": 9487 + }, + { + "epoch": 0.9837922609828331, + "grad_norm": 0.7890625, + "learning_rate": 0.00015152232444193202, + "loss": 4.2496, + "step": 9488 + }, + { + "epoch": 0.9838959490373212, + "grad_norm": 0.77734375, + "learning_rate": 0.00015151301454366522, + "loss": 4.2784, + "step": 9489 + }, + { + "epoch": 0.9839996370918093, + "grad_norm": 0.73828125, + "learning_rate": 0.00015150370403759929, + "loss": 4.2508, + "step": 9490 + }, + { + "epoch": 0.9841033251462974, + "grad_norm": 0.84765625, + "learning_rate": 0.0001514943929238441, + "loss": 4.2928, + "step": 9491 + }, + { + "epoch": 0.9842070132007854, + "grad_norm": 0.69921875, + "learning_rate": 0.00015148508120250948, + "loss": 4.2469, + "step": 9492 + }, + { + "epoch": 0.9843107012552735, + "grad_norm": 0.80078125, + "learning_rate": 0.00015147576887370535, + "loss": 4.2814, + "step": 9493 + }, + { + "epoch": 0.9844143893097616, + "grad_norm": 0.70703125, + "learning_rate": 0.00015146645593754155, + "loss": 4.2694, + "step": 9494 + }, + { + "epoch": 0.9845180773642497, + "grad_norm": 0.75390625, + "learning_rate": 0.000151457142394128, + "loss": 4.2363, + "step": 9495 + }, + { + "epoch": 0.9846217654187377, + "grad_norm": 0.671875, + "learning_rate": 0.0001514478282435745, + "loss": 4.2998, + "step": 9496 + }, + { + "epoch": 0.9847254534732258, + "grad_norm": 0.7734375, + "learning_rate": 0.00015143851348599108, + "loss": 4.242, + "step": 9497 + }, + { + "epoch": 0.9848291415277138, + "grad_norm": 0.765625, + "learning_rate": 0.00015142919812148752, + "loss": 4.2483, + "step": 9498 + }, + { + "epoch": 0.984932829582202, + "grad_norm": 0.7734375, + "learning_rate": 0.0001514198821501738, + "loss": 4.2775, + "step": 9499 + }, + { + "epoch": 0.98503651763669, + "grad_norm": 0.78515625, + "learning_rate": 0.00015141056557215984, + "loss": 4.2735, + "step": 9500 + }, + { + "epoch": 0.9851402056911781, + "grad_norm": 0.80078125, + "learning_rate": 0.00015140124838755554, + "loss": 4.2642, + "step": 9501 + }, + { + "epoch": 0.9852438937456661, + "grad_norm": 0.73828125, + "learning_rate": 0.00015139193059647086, + "loss": 4.2407, + "step": 9502 + }, + { + "epoch": 0.9853475818001542, + "grad_norm": 0.78125, + "learning_rate": 0.0001513826121990157, + "loss": 4.3053, + "step": 9503 + }, + { + "epoch": 0.9854512698546423, + "grad_norm": 0.7421875, + "learning_rate": 0.00015137329319530002, + "loss": 4.2636, + "step": 9504 + }, + { + "epoch": 0.9855549579091304, + "grad_norm": 0.7578125, + "learning_rate": 0.00015136397358543382, + "loss": 4.2455, + "step": 9505 + }, + { + "epoch": 0.9856586459636184, + "grad_norm": 0.8203125, + "learning_rate": 0.00015135465336952702, + "loss": 4.2578, + "step": 9506 + }, + { + "epoch": 0.9857623340181065, + "grad_norm": 0.71484375, + "learning_rate": 0.00015134533254768958, + "loss": 4.2418, + "step": 9507 + }, + { + "epoch": 0.9858660220725945, + "grad_norm": 0.84375, + "learning_rate": 0.00015133601112003145, + "loss": 4.2535, + "step": 9508 + }, + { + "epoch": 0.9859697101270827, + "grad_norm": 0.7578125, + "learning_rate": 0.0001513266890866627, + "loss": 4.2542, + "step": 9509 + }, + { + "epoch": 0.9860733981815707, + "grad_norm": 0.78125, + "learning_rate": 0.00015131736644769328, + "loss": 4.308, + "step": 9510 + }, + { + "epoch": 0.9861770862360588, + "grad_norm": 0.8046875, + "learning_rate": 0.00015130804320323318, + "loss": 4.2822, + "step": 9511 + }, + { + "epoch": 0.9862807742905468, + "grad_norm": 0.79296875, + "learning_rate": 0.00015129871935339238, + "loss": 4.2731, + "step": 9512 + }, + { + "epoch": 0.986384462345035, + "grad_norm": 0.80859375, + "learning_rate": 0.0001512893948982809, + "loss": 4.2935, + "step": 9513 + }, + { + "epoch": 0.9864881503995231, + "grad_norm": 0.8203125, + "learning_rate": 0.00015128006983800883, + "loss": 4.2167, + "step": 9514 + }, + { + "epoch": 0.9865918384540111, + "grad_norm": 0.83984375, + "learning_rate": 0.00015127074417268613, + "loss": 4.2594, + "step": 9515 + }, + { + "epoch": 0.9866955265084992, + "grad_norm": 0.78125, + "learning_rate": 0.00015126141790242282, + "loss": 4.2714, + "step": 9516 + }, + { + "epoch": 0.9867992145629872, + "grad_norm": 0.87109375, + "learning_rate": 0.00015125209102732895, + "loss": 4.2274, + "step": 9517 + }, + { + "epoch": 0.9869029026174754, + "grad_norm": 0.71484375, + "learning_rate": 0.0001512427635475146, + "loss": 4.2527, + "step": 9518 + }, + { + "epoch": 0.9870065906719634, + "grad_norm": 0.828125, + "learning_rate": 0.00015123343546308984, + "loss": 4.2661, + "step": 9519 + }, + { + "epoch": 0.9871102787264515, + "grad_norm": 0.75390625, + "learning_rate": 0.00015122410677416467, + "loss": 4.2873, + "step": 9520 + }, + { + "epoch": 0.9872139667809395, + "grad_norm": 0.859375, + "learning_rate": 0.00015121477748084917, + "loss": 4.259, + "step": 9521 + }, + { + "epoch": 0.9873176548354277, + "grad_norm": 0.6953125, + "learning_rate": 0.00015120544758325346, + "loss": 4.2133, + "step": 9522 + }, + { + "epoch": 0.9874213428899157, + "grad_norm": 0.7265625, + "learning_rate": 0.0001511961170814876, + "loss": 4.252, + "step": 9523 + }, + { + "epoch": 0.9875250309444038, + "grad_norm": 0.72265625, + "learning_rate": 0.00015118678597566163, + "loss": 4.2915, + "step": 9524 + }, + { + "epoch": 0.9876287189988918, + "grad_norm": 0.75, + "learning_rate": 0.0001511774542658857, + "loss": 4.2505, + "step": 9525 + }, + { + "epoch": 0.98773240705338, + "grad_norm": 0.78515625, + "learning_rate": 0.00015116812195226995, + "loss": 4.2576, + "step": 9526 + }, + { + "epoch": 0.987836095107868, + "grad_norm": 0.69140625, + "learning_rate": 0.00015115878903492443, + "loss": 4.3013, + "step": 9527 + }, + { + "epoch": 0.9879397831623561, + "grad_norm": 0.72265625, + "learning_rate": 0.0001511494555139593, + "loss": 4.2584, + "step": 9528 + }, + { + "epoch": 0.9880434712168441, + "grad_norm": 0.69140625, + "learning_rate": 0.0001511401213894846, + "loss": 4.2957, + "step": 9529 + }, + { + "epoch": 0.9881471592713322, + "grad_norm": 0.69140625, + "learning_rate": 0.00015113078666161055, + "loss": 4.2904, + "step": 9530 + }, + { + "epoch": 0.9882508473258202, + "grad_norm": 0.6953125, + "learning_rate": 0.00015112145133044732, + "loss": 4.2956, + "step": 9531 + }, + { + "epoch": 0.9883545353803084, + "grad_norm": 0.71875, + "learning_rate": 0.00015111211539610496, + "loss": 4.2502, + "step": 9532 + }, + { + "epoch": 0.9884582234347964, + "grad_norm": 0.6875, + "learning_rate": 0.00015110277885869362, + "loss": 4.2664, + "step": 9533 + }, + { + "epoch": 0.9885619114892845, + "grad_norm": 0.796875, + "learning_rate": 0.00015109344171832355, + "loss": 4.2533, + "step": 9534 + }, + { + "epoch": 0.9886655995437725, + "grad_norm": 0.67578125, + "learning_rate": 0.00015108410397510487, + "loss": 4.2616, + "step": 9535 + }, + { + "epoch": 0.9887692875982607, + "grad_norm": 0.8359375, + "learning_rate": 0.0001510747656291478, + "loss": 4.2467, + "step": 9536 + }, + { + "epoch": 0.9888729756527487, + "grad_norm": 0.65625, + "learning_rate": 0.00015106542668056241, + "loss": 4.2971, + "step": 9537 + }, + { + "epoch": 0.9889766637072368, + "grad_norm": 0.80859375, + "learning_rate": 0.00015105608712945903, + "loss": 4.2468, + "step": 9538 + }, + { + "epoch": 0.9890803517617248, + "grad_norm": 0.671875, + "learning_rate": 0.00015104674697594776, + "loss": 4.2745, + "step": 9539 + }, + { + "epoch": 0.989184039816213, + "grad_norm": 0.8203125, + "learning_rate": 0.00015103740622013882, + "loss": 4.2288, + "step": 9540 + }, + { + "epoch": 0.989287727870701, + "grad_norm": 0.625, + "learning_rate": 0.0001510280648621425, + "loss": 4.3209, + "step": 9541 + }, + { + "epoch": 0.9893914159251891, + "grad_norm": 0.74609375, + "learning_rate": 0.00015101872290206888, + "loss": 4.286, + "step": 9542 + }, + { + "epoch": 0.9894951039796771, + "grad_norm": 0.72265625, + "learning_rate": 0.00015100938034002826, + "loss": 4.2625, + "step": 9543 + }, + { + "epoch": 0.9895987920341652, + "grad_norm": 0.7734375, + "learning_rate": 0.00015100003717613095, + "loss": 4.249, + "step": 9544 + }, + { + "epoch": 0.9897024800886532, + "grad_norm": 0.70703125, + "learning_rate": 0.00015099069341048703, + "loss": 4.2947, + "step": 9545 + }, + { + "epoch": 0.9898061681431414, + "grad_norm": 0.80859375, + "learning_rate": 0.00015098134904320686, + "loss": 4.2678, + "step": 9546 + }, + { + "epoch": 0.9899098561976294, + "grad_norm": 0.71484375, + "learning_rate": 0.00015097200407440065, + "loss": 4.2481, + "step": 9547 + }, + { + "epoch": 0.9900135442521175, + "grad_norm": 0.76953125, + "learning_rate": 0.0001509626585041787, + "loss": 4.2628, + "step": 9548 + }, + { + "epoch": 0.9901172323066055, + "grad_norm": 0.7890625, + "learning_rate": 0.00015095331233265123, + "loss": 4.3012, + "step": 9549 + }, + { + "epoch": 0.9902209203610937, + "grad_norm": 0.7890625, + "learning_rate": 0.00015094396555992852, + "loss": 4.2533, + "step": 9550 + }, + { + "epoch": 0.9903246084155817, + "grad_norm": 0.73046875, + "learning_rate": 0.00015093461818612087, + "loss": 4.263, + "step": 9551 + }, + { + "epoch": 0.9904282964700698, + "grad_norm": 0.7734375, + "learning_rate": 0.00015092527021133857, + "loss": 4.2751, + "step": 9552 + }, + { + "epoch": 0.9905319845245578, + "grad_norm": 0.77734375, + "learning_rate": 0.00015091592163569195, + "loss": 4.2127, + "step": 9553 + }, + { + "epoch": 0.990635672579046, + "grad_norm": 0.73046875, + "learning_rate": 0.00015090657245929123, + "loss": 4.2407, + "step": 9554 + }, + { + "epoch": 0.990739360633534, + "grad_norm": 0.71484375, + "learning_rate": 0.00015089722268224676, + "loss": 4.247, + "step": 9555 + }, + { + "epoch": 0.9908430486880221, + "grad_norm": 0.796875, + "learning_rate": 0.0001508878723046689, + "loss": 4.2134, + "step": 9556 + }, + { + "epoch": 0.9909467367425101, + "grad_norm": 0.71875, + "learning_rate": 0.0001508785213266679, + "loss": 4.2577, + "step": 9557 + }, + { + "epoch": 0.9910504247969982, + "grad_norm": 0.7734375, + "learning_rate": 0.00015086916974835413, + "loss": 4.2612, + "step": 9558 + }, + { + "epoch": 0.9911541128514864, + "grad_norm": 0.71875, + "learning_rate": 0.00015085981756983796, + "loss": 4.259, + "step": 9559 + }, + { + "epoch": 0.9912578009059744, + "grad_norm": 0.8125, + "learning_rate": 0.0001508504647912297, + "loss": 4.2905, + "step": 9560 + }, + { + "epoch": 0.9913614889604625, + "grad_norm": 0.78515625, + "learning_rate": 0.0001508411114126397, + "loss": 4.2423, + "step": 9561 + }, + { + "epoch": 0.9914651770149505, + "grad_norm": 0.7265625, + "learning_rate": 0.00015083175743417832, + "loss": 4.2527, + "step": 9562 + }, + { + "epoch": 0.9915688650694386, + "grad_norm": 0.84765625, + "learning_rate": 0.00015082240285595592, + "loss": 4.2274, + "step": 9563 + }, + { + "epoch": 0.9916725531239267, + "grad_norm": 0.8046875, + "learning_rate": 0.00015081304767808294, + "loss": 4.2877, + "step": 9564 + }, + { + "epoch": 0.9917762411784148, + "grad_norm": 0.765625, + "learning_rate": 0.00015080369190066965, + "loss": 4.2442, + "step": 9565 + }, + { + "epoch": 0.9918799292329028, + "grad_norm": 0.74609375, + "learning_rate": 0.00015079433552382654, + "loss": 4.2374, + "step": 9566 + }, + { + "epoch": 0.9919836172873909, + "grad_norm": 0.75390625, + "learning_rate": 0.00015078497854766393, + "loss": 4.2583, + "step": 9567 + }, + { + "epoch": 0.9920873053418789, + "grad_norm": 0.73828125, + "learning_rate": 0.00015077562097229227, + "loss": 4.2587, + "step": 9568 + }, + { + "epoch": 0.9921909933963671, + "grad_norm": 0.8046875, + "learning_rate": 0.00015076626279782196, + "loss": 4.2521, + "step": 9569 + }, + { + "epoch": 0.9922946814508551, + "grad_norm": 0.7421875, + "learning_rate": 0.0001507569040243634, + "loss": 4.2325, + "step": 9570 + }, + { + "epoch": 0.9923983695053432, + "grad_norm": 0.77734375, + "learning_rate": 0.00015074754465202702, + "loss": 4.2582, + "step": 9571 + }, + { + "epoch": 0.9925020575598312, + "grad_norm": 0.71875, + "learning_rate": 0.00015073818468092326, + "loss": 4.2995, + "step": 9572 + }, + { + "epoch": 0.9926057456143194, + "grad_norm": 0.77734375, + "learning_rate": 0.00015072882411116257, + "loss": 4.2835, + "step": 9573 + }, + { + "epoch": 0.9927094336688074, + "grad_norm": 0.78515625, + "learning_rate": 0.00015071946294285536, + "loss": 4.2457, + "step": 9574 + }, + { + "epoch": 0.9928131217232955, + "grad_norm": 0.75390625, + "learning_rate": 0.0001507101011761121, + "loss": 4.3043, + "step": 9575 + }, + { + "epoch": 0.9929168097777835, + "grad_norm": 0.796875, + "learning_rate": 0.00015070073881104325, + "loss": 4.3035, + "step": 9576 + }, + { + "epoch": 0.9930204978322716, + "grad_norm": 0.7421875, + "learning_rate": 0.00015069137584775933, + "loss": 4.2266, + "step": 9577 + }, + { + "epoch": 0.9931241858867597, + "grad_norm": 0.79296875, + "learning_rate": 0.00015068201228637072, + "loss": 4.2521, + "step": 9578 + }, + { + "epoch": 0.9932278739412478, + "grad_norm": 0.73828125, + "learning_rate": 0.0001506726481269879, + "loss": 4.2694, + "step": 9579 + }, + { + "epoch": 0.9933315619957358, + "grad_norm": 0.75390625, + "learning_rate": 0.00015066328336972142, + "loss": 4.2715, + "step": 9580 + }, + { + "epoch": 0.9934352500502239, + "grad_norm": 0.76171875, + "learning_rate": 0.00015065391801468176, + "loss": 4.2333, + "step": 9581 + }, + { + "epoch": 0.9935389381047119, + "grad_norm": 0.71875, + "learning_rate": 0.00015064455206197937, + "loss": 4.285, + "step": 9582 + }, + { + "epoch": 0.9936426261592001, + "grad_norm": 0.796875, + "learning_rate": 0.00015063518551172486, + "loss": 4.2574, + "step": 9583 + }, + { + "epoch": 0.9937463142136881, + "grad_norm": 0.72265625, + "learning_rate": 0.00015062581836402866, + "loss": 4.2694, + "step": 9584 + }, + { + "epoch": 0.9938500022681762, + "grad_norm": 0.74609375, + "learning_rate": 0.00015061645061900132, + "loss": 4.2823, + "step": 9585 + }, + { + "epoch": 0.9939536903226642, + "grad_norm": 0.7265625, + "learning_rate": 0.00015060708227675337, + "loss": 4.2766, + "step": 9586 + }, + { + "epoch": 0.9940573783771524, + "grad_norm": 0.734375, + "learning_rate": 0.0001505977133373953, + "loss": 4.2578, + "step": 9587 + }, + { + "epoch": 0.9941610664316404, + "grad_norm": 0.76171875, + "learning_rate": 0.00015058834380103772, + "loss": 4.2511, + "step": 9588 + }, + { + "epoch": 0.9942647544861285, + "grad_norm": 0.73046875, + "learning_rate": 0.0001505789736677912, + "loss": 4.2933, + "step": 9589 + }, + { + "epoch": 0.9943684425406165, + "grad_norm": 0.87109375, + "learning_rate": 0.0001505696029377662, + "loss": 4.248, + "step": 9590 + }, + { + "epoch": 0.9944721305951046, + "grad_norm": 0.8046875, + "learning_rate": 0.0001505602316110734, + "loss": 4.2504, + "step": 9591 + }, + { + "epoch": 0.9945758186495927, + "grad_norm": 0.78125, + "learning_rate": 0.00015055085968782326, + "loss": 4.2362, + "step": 9592 + }, + { + "epoch": 0.9946795067040808, + "grad_norm": 0.7890625, + "learning_rate": 0.00015054148716812642, + "loss": 4.2888, + "step": 9593 + }, + { + "epoch": 0.9947831947585688, + "grad_norm": 0.78515625, + "learning_rate": 0.00015053211405209347, + "loss": 4.2504, + "step": 9594 + }, + { + "epoch": 0.9948868828130569, + "grad_norm": 0.75, + "learning_rate": 0.00015052274033983495, + "loss": 4.2843, + "step": 9595 + }, + { + "epoch": 0.9949905708675449, + "grad_norm": 0.75, + "learning_rate": 0.00015051336603146153, + "loss": 4.277, + "step": 9596 + }, + { + "epoch": 0.9950942589220331, + "grad_norm": 0.859375, + "learning_rate": 0.0001505039911270838, + "loss": 4.2381, + "step": 9597 + }, + { + "epoch": 0.9951979469765211, + "grad_norm": 0.79296875, + "learning_rate": 0.00015049461562681232, + "loss": 4.2435, + "step": 9598 + }, + { + "epoch": 0.9953016350310092, + "grad_norm": 0.8046875, + "learning_rate": 0.00015048523953075778, + "loss": 4.2282, + "step": 9599 + }, + { + "epoch": 0.9954053230854972, + "grad_norm": 0.80859375, + "learning_rate": 0.00015047586283903076, + "loss": 4.2414, + "step": 9600 + }, + { + "epoch": 0.9955090111399854, + "grad_norm": 0.75, + "learning_rate": 0.00015046648555174194, + "loss": 4.3018, + "step": 9601 + }, + { + "epoch": 0.9956126991944734, + "grad_norm": 0.765625, + "learning_rate": 0.00015045710766900194, + "loss": 4.2932, + "step": 9602 + }, + { + "epoch": 0.9957163872489615, + "grad_norm": 0.765625, + "learning_rate": 0.00015044772919092138, + "loss": 4.266, + "step": 9603 + }, + { + "epoch": 0.9958200753034496, + "grad_norm": 0.828125, + "learning_rate": 0.00015043835011761095, + "loss": 4.3192, + "step": 9604 + }, + { + "epoch": 0.9959237633579376, + "grad_norm": 0.80859375, + "learning_rate": 0.0001504289704491813, + "loss": 4.316, + "step": 9605 + }, + { + "epoch": 0.9960274514124258, + "grad_norm": 0.84375, + "learning_rate": 0.00015041959018574313, + "loss": 4.2825, + "step": 9606 + }, + { + "epoch": 0.9961311394669138, + "grad_norm": 0.71484375, + "learning_rate": 0.00015041020932740707, + "loss": 4.2815, + "step": 9607 + }, + { + "epoch": 0.9962348275214019, + "grad_norm": 0.84375, + "learning_rate": 0.0001504008278742838, + "loss": 4.261, + "step": 9608 + }, + { + "epoch": 0.9963385155758899, + "grad_norm": 0.75390625, + "learning_rate": 0.0001503914458264841, + "loss": 4.2402, + "step": 9609 + }, + { + "epoch": 0.996442203630378, + "grad_norm": 0.84765625, + "learning_rate": 0.00015038206318411854, + "loss": 4.2206, + "step": 9610 + }, + { + "epoch": 0.9965458916848661, + "grad_norm": 0.7890625, + "learning_rate": 0.00015037267994729795, + "loss": 4.295, + "step": 9611 + }, + { + "epoch": 0.9966495797393542, + "grad_norm": 0.7890625, + "learning_rate": 0.0001503632961161329, + "loss": 4.2348, + "step": 9612 + }, + { + "epoch": 0.9967532677938422, + "grad_norm": 0.75390625, + "learning_rate": 0.00015035391169073426, + "loss": 4.2728, + "step": 9613 + }, + { + "epoch": 0.9968569558483303, + "grad_norm": 0.89453125, + "learning_rate": 0.00015034452667121267, + "loss": 4.2732, + "step": 9614 + }, + { + "epoch": 0.9969606439028184, + "grad_norm": 0.7265625, + "learning_rate": 0.0001503351410576789, + "loss": 4.2078, + "step": 9615 + }, + { + "epoch": 0.9970643319573065, + "grad_norm": 0.78515625, + "learning_rate": 0.00015032575485024365, + "loss": 4.2402, + "step": 9616 + }, + { + "epoch": 0.9971680200117945, + "grad_norm": 0.68359375, + "learning_rate": 0.0001503163680490177, + "loss": 4.2576, + "step": 9617 + }, + { + "epoch": 0.9972717080662826, + "grad_norm": 0.75390625, + "learning_rate": 0.00015030698065411177, + "loss": 4.2684, + "step": 9618 + }, + { + "epoch": 0.9973753961207706, + "grad_norm": 0.6875, + "learning_rate": 0.00015029759266563667, + "loss": 4.2699, + "step": 9619 + }, + { + "epoch": 0.9974790841752588, + "grad_norm": 0.75, + "learning_rate": 0.00015028820408370314, + "loss": 4.2519, + "step": 9620 + }, + { + "epoch": 0.9975827722297468, + "grad_norm": 0.69140625, + "learning_rate": 0.00015027881490842194, + "loss": 4.2527, + "step": 9621 + }, + { + "epoch": 0.9976864602842349, + "grad_norm": 0.70703125, + "learning_rate": 0.0001502694251399039, + "loss": 4.2643, + "step": 9622 + }, + { + "epoch": 0.9977901483387229, + "grad_norm": 0.671875, + "learning_rate": 0.00015026003477825977, + "loss": 4.2781, + "step": 9623 + }, + { + "epoch": 0.997893836393211, + "grad_norm": 0.703125, + "learning_rate": 0.00015025064382360036, + "loss": 4.2317, + "step": 9624 + }, + { + "epoch": 0.9979975244476991, + "grad_norm": 0.71484375, + "learning_rate": 0.00015024125227603645, + "loss": 4.2581, + "step": 9625 + }, + { + "epoch": 0.9981012125021872, + "grad_norm": 0.71875, + "learning_rate": 0.00015023186013567887, + "loss": 4.2255, + "step": 9626 + }, + { + "epoch": 0.9982049005566752, + "grad_norm": 0.69921875, + "learning_rate": 0.00015022246740263844, + "loss": 4.225, + "step": 9627 + }, + { + "epoch": 0.9983085886111633, + "grad_norm": 0.66796875, + "learning_rate": 0.00015021307407702604, + "loss": 4.2266, + "step": 9628 + }, + { + "epoch": 0.9984122766656514, + "grad_norm": 0.69140625, + "learning_rate": 0.0001502036801589524, + "loss": 4.2333, + "step": 9629 + }, + { + "epoch": 0.9985159647201395, + "grad_norm": 0.6328125, + "learning_rate": 0.00015019428564852838, + "loss": 4.248, + "step": 9630 + }, + { + "epoch": 0.9986196527746275, + "grad_norm": 0.71484375, + "learning_rate": 0.0001501848905458649, + "loss": 4.2952, + "step": 9631 + }, + { + "epoch": 0.9987233408291156, + "grad_norm": 0.72265625, + "learning_rate": 0.0001501754948510727, + "loss": 4.2362, + "step": 9632 + }, + { + "epoch": 0.9988270288836036, + "grad_norm": 0.72265625, + "learning_rate": 0.0001501660985642627, + "loss": 4.2598, + "step": 9633 + }, + { + "epoch": 0.9989307169380918, + "grad_norm": 0.76953125, + "learning_rate": 0.0001501567016855458, + "loss": 4.2798, + "step": 9634 + }, + { + "epoch": 0.9990344049925798, + "grad_norm": 0.65625, + "learning_rate": 0.00015014730421503286, + "loss": 4.2741, + "step": 9635 + }, + { + "epoch": 0.9991380930470679, + "grad_norm": 0.73828125, + "learning_rate": 0.00015013790615283468, + "loss": 4.2794, + "step": 9636 + }, + { + "epoch": 0.9992417811015559, + "grad_norm": 0.68359375, + "learning_rate": 0.00015012850749906222, + "loss": 4.2997, + "step": 9637 + }, + { + "epoch": 0.999345469156044, + "grad_norm": 0.8125, + "learning_rate": 0.00015011910825382638, + "loss": 4.2481, + "step": 9638 + }, + { + "epoch": 0.9994491572105321, + "grad_norm": 0.75390625, + "learning_rate": 0.00015010970841723803, + "loss": 4.2476, + "step": 9639 + }, + { + "epoch": 0.9995528452650202, + "grad_norm": 0.77734375, + "learning_rate": 0.00015010030798940808, + "loss": 4.2513, + "step": 9640 + }, + { + "epoch": 0.9996565333195082, + "grad_norm": 0.72265625, + "learning_rate": 0.00015009090697044747, + "loss": 4.2688, + "step": 9641 + }, + { + "epoch": 0.9997602213739963, + "grad_norm": 0.78125, + "learning_rate": 0.0001500815053604671, + "loss": 4.2505, + "step": 9642 + }, + { + "epoch": 0.9998639094284844, + "grad_norm": 0.72265625, + "learning_rate": 0.00015007210315957792, + "loss": 4.2664, + "step": 9643 + }, + { + "epoch": 0.9999675974829725, + "grad_norm": 0.71484375, + "learning_rate": 0.00015006270036789084, + "loss": 4.2492, + "step": 9644 + }, + { + "epoch": 0.9999675974829725, + "eval_loss": 4.279594898223877, + "eval_runtime": 0.4479, + "eval_samples_per_second": 332.645, + "eval_steps_per_second": 15.628, + "step": 9644 } ], "logging_steps": 1, @@ -16907,7 +67562,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2411, - "total_flos": 3.4729528980175585e+18, + "total_flos": 1.3911233195677843e+19, "train_batch_size": 3, "trial_name": null, "trial_params": null