diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24590 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8211143695014663, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002346041055718475, + "grad_norm": 3.6450053402509406, + "learning_rate": 0.0, + "loss": 1.2572, + "step": 1 + }, + { + "epoch": 0.000469208211143695, + "grad_norm": 4.3485356326437845, + "learning_rate": 2.3419203747072602e-08, + "loss": 1.302, + "step": 2 + }, + { + "epoch": 0.0007038123167155425, + "grad_norm": 6.519353527947624, + "learning_rate": 4.6838407494145204e-08, + "loss": 1.3035, + "step": 3 + }, + { + "epoch": 0.00093841642228739, + "grad_norm": 8.604850613314342, + "learning_rate": 7.02576112412178e-08, + "loss": 1.3578, + "step": 4 + }, + { + "epoch": 0.0011730205278592375, + "grad_norm": 5.384869815415941, + "learning_rate": 9.367681498829041e-08, + "loss": 1.3102, + "step": 5 + }, + { + "epoch": 0.001407624633431085, + "grad_norm": 3.0808542461257487, + "learning_rate": 1.1709601873536301e-07, + "loss": 1.36, + "step": 6 + }, + { + "epoch": 0.0016422287390029325, + "grad_norm": 4.818428908064201, + "learning_rate": 1.405152224824356e-07, + "loss": 1.3193, + "step": 7 + }, + { + "epoch": 0.00187683284457478, + "grad_norm": 8.881983711459453, + "learning_rate": 1.639344262295082e-07, + "loss": 1.3436, + "step": 8 + }, + { + "epoch": 0.0021114369501466276, + "grad_norm": 6.209731653543138, + "learning_rate": 1.8735362997658082e-07, + "loss": 1.3472, + "step": 9 + }, + { + "epoch": 0.002346041055718475, + "grad_norm": 3.8408158741152936, + "learning_rate": 2.107728337236534e-07, + "loss": 1.3155, + "step": 10 + }, + { + "epoch": 0.0025806451612903226, + "grad_norm": 6.645338741206632, + "learning_rate": 2.3419203747072603e-07, + "loss": 1.2661, + "step": 11 + }, + { + "epoch": 0.00281524926686217, + "grad_norm": 16.87412496284319, + "learning_rate": 2.5761124121779863e-07, + "loss": 1.3359, + "step": 12 + }, + { + "epoch": 0.0030498533724340176, + "grad_norm": 3.774480076443614, + "learning_rate": 2.810304449648712e-07, + "loss": 1.2923, + "step": 13 + }, + { + "epoch": 0.003284457478005865, + "grad_norm": 3.8836693486151312, + "learning_rate": 3.044496487119438e-07, + "loss": 1.3094, + "step": 14 + }, + { + "epoch": 0.0035190615835777126, + "grad_norm": 5.2500074581781675, + "learning_rate": 3.278688524590164e-07, + "loss": 1.2843, + "step": 15 + }, + { + "epoch": 0.00375366568914956, + "grad_norm": 4.0449453982333266, + "learning_rate": 3.51288056206089e-07, + "loss": 1.305, + "step": 16 + }, + { + "epoch": 0.003988269794721408, + "grad_norm": 5.250683179693039, + "learning_rate": 3.7470725995316163e-07, + "loss": 1.3284, + "step": 17 + }, + { + "epoch": 0.004222873900293255, + "grad_norm": 7.063096507845875, + "learning_rate": 3.9812646370023426e-07, + "loss": 1.3711, + "step": 18 + }, + { + "epoch": 0.004457478005865103, + "grad_norm": 5.389525286489735, + "learning_rate": 4.215456674473068e-07, + "loss": 1.2755, + "step": 19 + }, + { + "epoch": 0.00469208211143695, + "grad_norm": 13.47512068224741, + "learning_rate": 4.449648711943794e-07, + "loss": 1.3309, + "step": 20 + }, + { + "epoch": 0.004926686217008798, + "grad_norm": 8.690321624905424, + "learning_rate": 4.6838407494145205e-07, + "loss": 1.3033, + "step": 21 + }, + { + "epoch": 0.005161290322580645, + "grad_norm": 5.3665280506949715, + "learning_rate": 4.918032786885246e-07, + "loss": 1.3485, + "step": 22 + }, + { + "epoch": 0.005395894428152493, + "grad_norm": 12.778939105297262, + "learning_rate": 5.152224824355973e-07, + "loss": 1.3281, + "step": 23 + }, + { + "epoch": 0.00563049853372434, + "grad_norm": 3.954317406271846, + "learning_rate": 5.386416861826698e-07, + "loss": 1.3843, + "step": 24 + }, + { + "epoch": 0.005865102639296188, + "grad_norm": 4.116531974224858, + "learning_rate": 5.620608899297424e-07, + "loss": 1.3211, + "step": 25 + }, + { + "epoch": 0.006099706744868035, + "grad_norm": 3.9250460374163683, + "learning_rate": 5.85480093676815e-07, + "loss": 1.2407, + "step": 26 + }, + { + "epoch": 0.006334310850439883, + "grad_norm": 3.519670059389327, + "learning_rate": 6.088992974238876e-07, + "loss": 1.2983, + "step": 27 + }, + { + "epoch": 0.00656891495601173, + "grad_norm": 4.136300157405571, + "learning_rate": 6.323185011709602e-07, + "loss": 1.3416, + "step": 28 + }, + { + "epoch": 0.006803519061583578, + "grad_norm": 10.279347514776052, + "learning_rate": 6.557377049180328e-07, + "loss": 1.3196, + "step": 29 + }, + { + "epoch": 0.007038123167155425, + "grad_norm": 4.765152922147381, + "learning_rate": 6.791569086651055e-07, + "loss": 1.2835, + "step": 30 + }, + { + "epoch": 0.007272727272727273, + "grad_norm": 7.208624944075502, + "learning_rate": 7.02576112412178e-07, + "loss": 1.3429, + "step": 31 + }, + { + "epoch": 0.00750733137829912, + "grad_norm": 18.131451894085693, + "learning_rate": 7.259953161592506e-07, + "loss": 1.3031, + "step": 32 + }, + { + "epoch": 0.007741935483870968, + "grad_norm": 3.251140914430706, + "learning_rate": 7.494145199063233e-07, + "loss": 1.3911, + "step": 33 + }, + { + "epoch": 0.007976539589442815, + "grad_norm": 8.79645269699139, + "learning_rate": 7.728337236533959e-07, + "loss": 1.3865, + "step": 34 + }, + { + "epoch": 0.008211143695014663, + "grad_norm": 5.75165059623209, + "learning_rate": 7.962529274004685e-07, + "loss": 1.2966, + "step": 35 + }, + { + "epoch": 0.00844574780058651, + "grad_norm": 4.8261727607857505, + "learning_rate": 8.196721311475409e-07, + "loss": 1.3001, + "step": 36 + }, + { + "epoch": 0.008680351906158358, + "grad_norm": 3.7899028159194055, + "learning_rate": 8.430913348946136e-07, + "loss": 1.3233, + "step": 37 + }, + { + "epoch": 0.008914956011730205, + "grad_norm": 77.06771197169489, + "learning_rate": 8.665105386416862e-07, + "loss": 1.4009, + "step": 38 + }, + { + "epoch": 0.009149560117302053, + "grad_norm": 5.551275085083891, + "learning_rate": 8.899297423887588e-07, + "loss": 1.3317, + "step": 39 + }, + { + "epoch": 0.0093841642228739, + "grad_norm": 6.607621487321918, + "learning_rate": 9.133489461358315e-07, + "loss": 1.2016, + "step": 40 + }, + { + "epoch": 0.009618768328445748, + "grad_norm": 4.4406064626417985, + "learning_rate": 9.367681498829041e-07, + "loss": 1.3097, + "step": 41 + }, + { + "epoch": 0.009853372434017595, + "grad_norm": 5.952053250291134, + "learning_rate": 9.601873536299766e-07, + "loss": 1.2593, + "step": 42 + }, + { + "epoch": 0.010087976539589443, + "grad_norm": 5.4801646507078505, + "learning_rate": 9.836065573770493e-07, + "loss": 1.3073, + "step": 43 + }, + { + "epoch": 0.01032258064516129, + "grad_norm": 3.277972266250071, + "learning_rate": 1.0070257611241219e-06, + "loss": 1.3387, + "step": 44 + }, + { + "epoch": 0.010557184750733138, + "grad_norm": 12.429665956983847, + "learning_rate": 1.0304449648711945e-06, + "loss": 1.3176, + "step": 45 + }, + { + "epoch": 0.010791788856304985, + "grad_norm": 37.48972536707086, + "learning_rate": 1.0538641686182672e-06, + "loss": 1.3101, + "step": 46 + }, + { + "epoch": 0.011026392961876833, + "grad_norm": 10.426155225949543, + "learning_rate": 1.0772833723653396e-06, + "loss": 1.2829, + "step": 47 + }, + { + "epoch": 0.01126099706744868, + "grad_norm": 6.811979793371468, + "learning_rate": 1.1007025761124122e-06, + "loss": 1.3173, + "step": 48 + }, + { + "epoch": 0.011495601173020528, + "grad_norm": 5.344640324674893, + "learning_rate": 1.1241217798594848e-06, + "loss": 1.3259, + "step": 49 + }, + { + "epoch": 0.011730205278592375, + "grad_norm": 8.884608922039357, + "learning_rate": 1.1475409836065575e-06, + "loss": 1.3234, + "step": 50 + }, + { + "epoch": 0.011964809384164223, + "grad_norm": 3.9335710107098607, + "learning_rate": 1.17096018735363e-06, + "loss": 1.3233, + "step": 51 + }, + { + "epoch": 0.01219941348973607, + "grad_norm": 9.519135441340808, + "learning_rate": 1.1943793911007025e-06, + "loss": 1.3034, + "step": 52 + }, + { + "epoch": 0.012434017595307918, + "grad_norm": 8.035792757540126, + "learning_rate": 1.2177985948477752e-06, + "loss": 1.2138, + "step": 53 + }, + { + "epoch": 0.012668621700879765, + "grad_norm": 4.904948094072176, + "learning_rate": 1.2412177985948478e-06, + "loss": 1.2059, + "step": 54 + }, + { + "epoch": 0.012903225806451613, + "grad_norm": 12.830754470261745, + "learning_rate": 1.2646370023419204e-06, + "loss": 1.2152, + "step": 55 + }, + { + "epoch": 0.01313782991202346, + "grad_norm": 7.256363284150356, + "learning_rate": 1.288056206088993e-06, + "loss": 1.2153, + "step": 56 + }, + { + "epoch": 0.013372434017595308, + "grad_norm": 6.491778580398843, + "learning_rate": 1.3114754098360657e-06, + "loss": 1.3016, + "step": 57 + }, + { + "epoch": 0.013607038123167155, + "grad_norm": 6.033267561982283, + "learning_rate": 1.3348946135831383e-06, + "loss": 1.2461, + "step": 58 + }, + { + "epoch": 0.013841642228739003, + "grad_norm": 3.059255577392171, + "learning_rate": 1.358313817330211e-06, + "loss": 1.2672, + "step": 59 + }, + { + "epoch": 0.01407624633431085, + "grad_norm": 3.3358502094614546, + "learning_rate": 1.3817330210772834e-06, + "loss": 1.1633, + "step": 60 + }, + { + "epoch": 0.014310850439882698, + "grad_norm": 4.572746088964143, + "learning_rate": 1.405152224824356e-06, + "loss": 1.2182, + "step": 61 + }, + { + "epoch": 0.014545454545454545, + "grad_norm": 5.754073739243907, + "learning_rate": 1.4285714285714286e-06, + "loss": 1.1733, + "step": 62 + }, + { + "epoch": 0.014780058651026393, + "grad_norm": 4.461948662542373, + "learning_rate": 1.4519906323185013e-06, + "loss": 1.2704, + "step": 63 + }, + { + "epoch": 0.01501466275659824, + "grad_norm": 3.9199423514140728, + "learning_rate": 1.4754098360655739e-06, + "loss": 1.2226, + "step": 64 + }, + { + "epoch": 0.015249266862170088, + "grad_norm": 4.243247745044473, + "learning_rate": 1.4988290398126465e-06, + "loss": 1.1889, + "step": 65 + }, + { + "epoch": 0.015483870967741935, + "grad_norm": 5.197052200876362, + "learning_rate": 1.5222482435597192e-06, + "loss": 1.1651, + "step": 66 + }, + { + "epoch": 0.015718475073313785, + "grad_norm": 5.624190656276907, + "learning_rate": 1.5456674473067918e-06, + "loss": 1.2036, + "step": 67 + }, + { + "epoch": 0.01595307917888563, + "grad_norm": 4.622812813704082, + "learning_rate": 1.5690866510538644e-06, + "loss": 1.1532, + "step": 68 + }, + { + "epoch": 0.01618768328445748, + "grad_norm": 8.192368911444243, + "learning_rate": 1.592505854800937e-06, + "loss": 1.2101, + "step": 69 + }, + { + "epoch": 0.016422287390029325, + "grad_norm": 12.034619839284026, + "learning_rate": 1.6159250585480097e-06, + "loss": 1.1385, + "step": 70 + }, + { + "epoch": 0.016656891495601175, + "grad_norm": 6.585215889253293, + "learning_rate": 1.6393442622950819e-06, + "loss": 1.102, + "step": 71 + }, + { + "epoch": 0.01689149560117302, + "grad_norm": 6.77462634029134, + "learning_rate": 1.6627634660421545e-06, + "loss": 1.1775, + "step": 72 + }, + { + "epoch": 0.01712609970674487, + "grad_norm": 2.7854183249596143, + "learning_rate": 1.6861826697892272e-06, + "loss": 1.0731, + "step": 73 + }, + { + "epoch": 0.017360703812316716, + "grad_norm": 7.808614997644782, + "learning_rate": 1.7096018735362998e-06, + "loss": 1.1439, + "step": 74 + }, + { + "epoch": 0.017595307917888565, + "grad_norm": 4.510160348409813, + "learning_rate": 1.7330210772833724e-06, + "loss": 1.104, + "step": 75 + }, + { + "epoch": 0.01782991202346041, + "grad_norm": 5.556851183428352, + "learning_rate": 1.756440281030445e-06, + "loss": 1.1709, + "step": 76 + }, + { + "epoch": 0.01806451612903226, + "grad_norm": 3.139324465797069, + "learning_rate": 1.7798594847775177e-06, + "loss": 1.1606, + "step": 77 + }, + { + "epoch": 0.018299120234604106, + "grad_norm": 3.5137450847721152, + "learning_rate": 1.8032786885245903e-06, + "loss": 1.0535, + "step": 78 + }, + { + "epoch": 0.018533724340175955, + "grad_norm": 3.213645933277671, + "learning_rate": 1.826697892271663e-06, + "loss": 1.0311, + "step": 79 + }, + { + "epoch": 0.0187683284457478, + "grad_norm": 4.812998571795422, + "learning_rate": 1.8501170960187356e-06, + "loss": 1.0272, + "step": 80 + }, + { + "epoch": 0.01900293255131965, + "grad_norm": 2.6303156249950534, + "learning_rate": 1.8735362997658082e-06, + "loss": 1.1085, + "step": 81 + }, + { + "epoch": 0.019237536656891496, + "grad_norm": 3.0519113832521416, + "learning_rate": 1.8969555035128806e-06, + "loss": 0.9947, + "step": 82 + }, + { + "epoch": 0.019472140762463345, + "grad_norm": 3.442196083590346, + "learning_rate": 1.9203747072599533e-06, + "loss": 1.0389, + "step": 83 + }, + { + "epoch": 0.01970674486803519, + "grad_norm": 3.451466998541888, + "learning_rate": 1.9437939110070257e-06, + "loss": 1.0224, + "step": 84 + }, + { + "epoch": 0.01994134897360704, + "grad_norm": 6.902751613310287, + "learning_rate": 1.9672131147540985e-06, + "loss": 0.9854, + "step": 85 + }, + { + "epoch": 0.020175953079178886, + "grad_norm": 2.1334082210475227, + "learning_rate": 1.990632318501171e-06, + "loss": 0.9993, + "step": 86 + }, + { + "epoch": 0.020410557184750735, + "grad_norm": 4.796576233245253, + "learning_rate": 2.0140515222482438e-06, + "loss": 1.085, + "step": 87 + }, + { + "epoch": 0.02064516129032258, + "grad_norm": 4.0395890106166, + "learning_rate": 2.037470725995316e-06, + "loss": 1.0382, + "step": 88 + }, + { + "epoch": 0.02087976539589443, + "grad_norm": 3.497753662084459, + "learning_rate": 2.060889929742389e-06, + "loss": 1.0052, + "step": 89 + }, + { + "epoch": 0.021114369501466276, + "grad_norm": 3.375552138252609, + "learning_rate": 2.0843091334894615e-06, + "loss": 0.9285, + "step": 90 + }, + { + "epoch": 0.021348973607038125, + "grad_norm": 4.70530115072386, + "learning_rate": 2.1077283372365343e-06, + "loss": 1.03, + "step": 91 + }, + { + "epoch": 0.02158357771260997, + "grad_norm": 8.331460010851679, + "learning_rate": 2.1311475409836067e-06, + "loss": 1.0102, + "step": 92 + }, + { + "epoch": 0.02181818181818182, + "grad_norm": 3.334930764388384, + "learning_rate": 2.154566744730679e-06, + "loss": 0.9987, + "step": 93 + }, + { + "epoch": 0.022052785923753666, + "grad_norm": 3.4622456034454814, + "learning_rate": 2.177985948477752e-06, + "loss": 0.9862, + "step": 94 + }, + { + "epoch": 0.022287390029325515, + "grad_norm": 2.9998967371732967, + "learning_rate": 2.2014051522248244e-06, + "loss": 0.9958, + "step": 95 + }, + { + "epoch": 0.02252199413489736, + "grad_norm": 23.01438365400511, + "learning_rate": 2.2248243559718973e-06, + "loss": 0.9896, + "step": 96 + }, + { + "epoch": 0.02275659824046921, + "grad_norm": 2.914651920561855, + "learning_rate": 2.2482435597189697e-06, + "loss": 0.9177, + "step": 97 + }, + { + "epoch": 0.022991202346041056, + "grad_norm": 5.57426785924633, + "learning_rate": 2.2716627634660425e-06, + "loss": 0.9969, + "step": 98 + }, + { + "epoch": 0.023225806451612905, + "grad_norm": 3.237900155474082, + "learning_rate": 2.295081967213115e-06, + "loss": 0.9954, + "step": 99 + }, + { + "epoch": 0.02346041055718475, + "grad_norm": 2.291976229975076, + "learning_rate": 2.3185011709601878e-06, + "loss": 0.9271, + "step": 100 + }, + { + "epoch": 0.0236950146627566, + "grad_norm": 3.370251844607358, + "learning_rate": 2.34192037470726e-06, + "loss": 0.9272, + "step": 101 + }, + { + "epoch": 0.023929618768328446, + "grad_norm": 2.480256748243063, + "learning_rate": 2.365339578454333e-06, + "loss": 0.9075, + "step": 102 + }, + { + "epoch": 0.024164222873900295, + "grad_norm": 2.6163399629836546, + "learning_rate": 2.388758782201405e-06, + "loss": 0.9246, + "step": 103 + }, + { + "epoch": 0.02439882697947214, + "grad_norm": 6.052092050155686, + "learning_rate": 2.412177985948478e-06, + "loss": 1.0002, + "step": 104 + }, + { + "epoch": 0.02463343108504399, + "grad_norm": 2.899665607986737, + "learning_rate": 2.4355971896955503e-06, + "loss": 0.9357, + "step": 105 + }, + { + "epoch": 0.024868035190615836, + "grad_norm": 19.84317921055058, + "learning_rate": 2.459016393442623e-06, + "loss": 0.9558, + "step": 106 + }, + { + "epoch": 0.025102639296187685, + "grad_norm": 2.698172065560486, + "learning_rate": 2.4824355971896956e-06, + "loss": 0.9606, + "step": 107 + }, + { + "epoch": 0.02533724340175953, + "grad_norm": 2.433886867758403, + "learning_rate": 2.5058548009367684e-06, + "loss": 0.9222, + "step": 108 + }, + { + "epoch": 0.02557184750733138, + "grad_norm": 2.2816602174764893, + "learning_rate": 2.529274004683841e-06, + "loss": 0.9257, + "step": 109 + }, + { + "epoch": 0.025806451612903226, + "grad_norm": 4.635696388912117, + "learning_rate": 2.5526932084309137e-06, + "loss": 0.9065, + "step": 110 + }, + { + "epoch": 0.026041055718475075, + "grad_norm": 3.0329036230732744, + "learning_rate": 2.576112412177986e-06, + "loss": 0.895, + "step": 111 + }, + { + "epoch": 0.02627565982404692, + "grad_norm": 2.5498245911594104, + "learning_rate": 2.599531615925059e-06, + "loss": 0.912, + "step": 112 + }, + { + "epoch": 0.02651026392961877, + "grad_norm": 2.45877636159667, + "learning_rate": 2.6229508196721314e-06, + "loss": 0.9333, + "step": 113 + }, + { + "epoch": 0.026744868035190616, + "grad_norm": 20.90236117147583, + "learning_rate": 2.646370023419204e-06, + "loss": 0.8683, + "step": 114 + }, + { + "epoch": 0.026979472140762465, + "grad_norm": 2.359657953967608, + "learning_rate": 2.6697892271662766e-06, + "loss": 0.9192, + "step": 115 + }, + { + "epoch": 0.02721407624633431, + "grad_norm": 2.369533504919403, + "learning_rate": 2.6932084309133495e-06, + "loss": 0.8735, + "step": 116 + }, + { + "epoch": 0.02744868035190616, + "grad_norm": 13.020543014935672, + "learning_rate": 2.716627634660422e-06, + "loss": 0.9399, + "step": 117 + }, + { + "epoch": 0.027683284457478006, + "grad_norm": 2.387279361264047, + "learning_rate": 2.7400468384074947e-06, + "loss": 0.8968, + "step": 118 + }, + { + "epoch": 0.027917888563049855, + "grad_norm": 3.089762607316902, + "learning_rate": 2.7634660421545667e-06, + "loss": 0.8932, + "step": 119 + }, + { + "epoch": 0.0281524926686217, + "grad_norm": 3.9455101031754864, + "learning_rate": 2.786885245901639e-06, + "loss": 0.8669, + "step": 120 + }, + { + "epoch": 0.02838709677419355, + "grad_norm": 2.2155994106074215, + "learning_rate": 2.810304449648712e-06, + "loss": 0.8372, + "step": 121 + }, + { + "epoch": 0.028621700879765396, + "grad_norm": 1.9575993824588998, + "learning_rate": 2.8337236533957844e-06, + "loss": 0.9265, + "step": 122 + }, + { + "epoch": 0.028856304985337245, + "grad_norm": 3.235759284809253, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.8679, + "step": 123 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 3.181817998484809, + "learning_rate": 2.8805620608899297e-06, + "loss": 0.9384, + "step": 124 + }, + { + "epoch": 0.02932551319648094, + "grad_norm": 3.6594983240678904, + "learning_rate": 2.9039812646370025e-06, + "loss": 0.8532, + "step": 125 + }, + { + "epoch": 0.029560117302052786, + "grad_norm": 2.3937017582314404, + "learning_rate": 2.927400468384075e-06, + "loss": 0.8876, + "step": 126 + }, + { + "epoch": 0.029794721407624635, + "grad_norm": 2.3734098946643485, + "learning_rate": 2.9508196721311478e-06, + "loss": 0.874, + "step": 127 + }, + { + "epoch": 0.03002932551319648, + "grad_norm": 2.4713426475904448, + "learning_rate": 2.97423887587822e-06, + "loss": 0.9142, + "step": 128 + }, + { + "epoch": 0.03026392961876833, + "grad_norm": 3.397728713320515, + "learning_rate": 2.997658079625293e-06, + "loss": 0.915, + "step": 129 + }, + { + "epoch": 0.030498533724340176, + "grad_norm": 2.0874593525371723, + "learning_rate": 3.0210772833723655e-06, + "loss": 0.9252, + "step": 130 + }, + { + "epoch": 0.030733137829912025, + "grad_norm": 6.3434303421296026, + "learning_rate": 3.0444964871194383e-06, + "loss": 0.8908, + "step": 131 + }, + { + "epoch": 0.03096774193548387, + "grad_norm": 57.7898256836101, + "learning_rate": 3.0679156908665107e-06, + "loss": 0.8971, + "step": 132 + }, + { + "epoch": 0.03120234604105572, + "grad_norm": 1.711252751491627, + "learning_rate": 3.0913348946135836e-06, + "loss": 0.9249, + "step": 133 + }, + { + "epoch": 0.03143695014662757, + "grad_norm": 5.0126693472094646, + "learning_rate": 3.114754098360656e-06, + "loss": 0.875, + "step": 134 + }, + { + "epoch": 0.03167155425219941, + "grad_norm": 5.85541088273383, + "learning_rate": 3.138173302107729e-06, + "loss": 0.9186, + "step": 135 + }, + { + "epoch": 0.03190615835777126, + "grad_norm": 3.0975868523648717, + "learning_rate": 3.1615925058548013e-06, + "loss": 0.9148, + "step": 136 + }, + { + "epoch": 0.03214076246334311, + "grad_norm": 2.606556275041088, + "learning_rate": 3.185011709601874e-06, + "loss": 0.8415, + "step": 137 + }, + { + "epoch": 0.03237536656891496, + "grad_norm": 2.484589455292374, + "learning_rate": 3.2084309133489465e-06, + "loss": 0.9124, + "step": 138 + }, + { + "epoch": 0.0326099706744868, + "grad_norm": 2.6972597309695545, + "learning_rate": 3.2318501170960194e-06, + "loss": 0.8407, + "step": 139 + }, + { + "epoch": 0.03284457478005865, + "grad_norm": 3.375807212236533, + "learning_rate": 3.2552693208430914e-06, + "loss": 0.8976, + "step": 140 + }, + { + "epoch": 0.0330791788856305, + "grad_norm": 20.40922837350589, + "learning_rate": 3.2786885245901638e-06, + "loss": 0.7912, + "step": 141 + }, + { + "epoch": 0.03331378299120235, + "grad_norm": 3.9405236611888794, + "learning_rate": 3.3021077283372366e-06, + "loss": 0.8665, + "step": 142 + }, + { + "epoch": 0.03354838709677419, + "grad_norm": 2.7508052811782795, + "learning_rate": 3.325526932084309e-06, + "loss": 0.8238, + "step": 143 + }, + { + "epoch": 0.03378299120234604, + "grad_norm": 5.017113290178286, + "learning_rate": 3.348946135831382e-06, + "loss": 0.8803, + "step": 144 + }, + { + "epoch": 0.03401759530791789, + "grad_norm": 2.180732748232138, + "learning_rate": 3.3723653395784543e-06, + "loss": 0.8724, + "step": 145 + }, + { + "epoch": 0.03425219941348974, + "grad_norm": 3.4586882050068692, + "learning_rate": 3.395784543325527e-06, + "loss": 0.7973, + "step": 146 + }, + { + "epoch": 0.03448680351906158, + "grad_norm": 6.8618732473004265, + "learning_rate": 3.4192037470725996e-06, + "loss": 0.8987, + "step": 147 + }, + { + "epoch": 0.03472140762463343, + "grad_norm": 1.6297016411593837, + "learning_rate": 3.4426229508196724e-06, + "loss": 0.8103, + "step": 148 + }, + { + "epoch": 0.03495601173020528, + "grad_norm": 2.7847740603147257, + "learning_rate": 3.466042154566745e-06, + "loss": 0.8684, + "step": 149 + }, + { + "epoch": 0.03519061583577713, + "grad_norm": 1.7603035059230738, + "learning_rate": 3.4894613583138177e-06, + "loss": 0.8308, + "step": 150 + }, + { + "epoch": 0.03542521994134897, + "grad_norm": 1.6040986957520253, + "learning_rate": 3.51288056206089e-06, + "loss": 0.8349, + "step": 151 + }, + { + "epoch": 0.03565982404692082, + "grad_norm": 3.1963778933146907, + "learning_rate": 3.536299765807963e-06, + "loss": 0.88, + "step": 152 + }, + { + "epoch": 0.03589442815249267, + "grad_norm": 3.3496492672952014, + "learning_rate": 3.5597189695550354e-06, + "loss": 0.909, + "step": 153 + }, + { + "epoch": 0.03612903225806452, + "grad_norm": 2.455904571079815, + "learning_rate": 3.583138173302108e-06, + "loss": 0.8191, + "step": 154 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 1.5896943471703224, + "learning_rate": 3.6065573770491806e-06, + "loss": 0.9028, + "step": 155 + }, + { + "epoch": 0.03659824046920821, + "grad_norm": 2.6915939177699078, + "learning_rate": 3.6299765807962535e-06, + "loss": 0.8393, + "step": 156 + }, + { + "epoch": 0.03683284457478006, + "grad_norm": 7.175761101203019, + "learning_rate": 3.653395784543326e-06, + "loss": 0.8643, + "step": 157 + }, + { + "epoch": 0.03706744868035191, + "grad_norm": 2.2229527474502735, + "learning_rate": 3.6768149882903987e-06, + "loss": 0.8513, + "step": 158 + }, + { + "epoch": 0.03730205278592375, + "grad_norm": 2.9865753215500197, + "learning_rate": 3.700234192037471e-06, + "loss": 0.8868, + "step": 159 + }, + { + "epoch": 0.0375366568914956, + "grad_norm": 2.9734325151644554, + "learning_rate": 3.723653395784544e-06, + "loss": 0.8532, + "step": 160 + }, + { + "epoch": 0.03777126099706745, + "grad_norm": 1.8588526791176345, + "learning_rate": 3.7470725995316164e-06, + "loss": 0.819, + "step": 161 + }, + { + "epoch": 0.0380058651026393, + "grad_norm": 8.645485871225635, + "learning_rate": 3.7704918032786884e-06, + "loss": 0.8521, + "step": 162 + }, + { + "epoch": 0.03824046920821114, + "grad_norm": 3.002196055017224, + "learning_rate": 3.7939110070257613e-06, + "loss": 0.8084, + "step": 163 + }, + { + "epoch": 0.03847507331378299, + "grad_norm": 3.2302818373533966, + "learning_rate": 3.817330210772834e-06, + "loss": 0.9311, + "step": 164 + }, + { + "epoch": 0.03870967741935484, + "grad_norm": 3.6213961241692036, + "learning_rate": 3.8407494145199065e-06, + "loss": 0.844, + "step": 165 + }, + { + "epoch": 0.03894428152492669, + "grad_norm": 2.7852526099599224, + "learning_rate": 3.864168618266979e-06, + "loss": 0.863, + "step": 166 + }, + { + "epoch": 0.03917888563049853, + "grad_norm": 4.334673507783282, + "learning_rate": 3.887587822014051e-06, + "loss": 0.8363, + "step": 167 + }, + { + "epoch": 0.03941348973607038, + "grad_norm": 2.048656770606946, + "learning_rate": 3.911007025761124e-06, + "loss": 0.8315, + "step": 168 + }, + { + "epoch": 0.03964809384164223, + "grad_norm": 4.177636179080158, + "learning_rate": 3.934426229508197e-06, + "loss": 0.7962, + "step": 169 + }, + { + "epoch": 0.03988269794721408, + "grad_norm": 1.8523171799760045, + "learning_rate": 3.95784543325527e-06, + "loss": 0.8058, + "step": 170 + }, + { + "epoch": 0.04011730205278592, + "grad_norm": 2.9191774008849567, + "learning_rate": 3.981264637002342e-06, + "loss": 0.876, + "step": 171 + }, + { + "epoch": 0.04035190615835777, + "grad_norm": 2.530425709085943, + "learning_rate": 4.004683840749415e-06, + "loss": 0.8415, + "step": 172 + }, + { + "epoch": 0.04058651026392962, + "grad_norm": 2.918808157433132, + "learning_rate": 4.0281030444964876e-06, + "loss": 0.8353, + "step": 173 + }, + { + "epoch": 0.04082111436950147, + "grad_norm": 2.37263511816868, + "learning_rate": 4.05152224824356e-06, + "loss": 0.856, + "step": 174 + }, + { + "epoch": 0.04105571847507331, + "grad_norm": 2.7199432299385293, + "learning_rate": 4.074941451990632e-06, + "loss": 0.8177, + "step": 175 + }, + { + "epoch": 0.04129032258064516, + "grad_norm": 1.9137699316073797, + "learning_rate": 4.098360655737705e-06, + "loss": 0.774, + "step": 176 + }, + { + "epoch": 0.04152492668621701, + "grad_norm": 1.9969742439162197, + "learning_rate": 4.121779859484778e-06, + "loss": 0.8587, + "step": 177 + }, + { + "epoch": 0.04175953079178886, + "grad_norm": 1.62573462869543, + "learning_rate": 4.145199063231851e-06, + "loss": 0.8197, + "step": 178 + }, + { + "epoch": 0.0419941348973607, + "grad_norm": 2.6789717841087635, + "learning_rate": 4.168618266978923e-06, + "loss": 0.7843, + "step": 179 + }, + { + "epoch": 0.04222873900293255, + "grad_norm": 2.8296926850030437, + "learning_rate": 4.192037470725996e-06, + "loss": 0.7829, + "step": 180 + }, + { + "epoch": 0.0424633431085044, + "grad_norm": 2.9687571177348078, + "learning_rate": 4.215456674473069e-06, + "loss": 0.8265, + "step": 181 + }, + { + "epoch": 0.04269794721407625, + "grad_norm": 8.057262005821913, + "learning_rate": 4.2388758782201415e-06, + "loss": 0.8063, + "step": 182 + }, + { + "epoch": 0.04293255131964809, + "grad_norm": 1.3309377653809522, + "learning_rate": 4.2622950819672135e-06, + "loss": 0.85, + "step": 183 + }, + { + "epoch": 0.04316715542521994, + "grad_norm": 4.796473479193148, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.8535, + "step": 184 + }, + { + "epoch": 0.04340175953079179, + "grad_norm": 2.29076687107649, + "learning_rate": 4.309133489461358e-06, + "loss": 0.8308, + "step": 185 + }, + { + "epoch": 0.04363636363636364, + "grad_norm": 1.9668639979537659, + "learning_rate": 4.332552693208431e-06, + "loss": 0.8164, + "step": 186 + }, + { + "epoch": 0.04387096774193548, + "grad_norm": 1.7492988720129186, + "learning_rate": 4.355971896955504e-06, + "loss": 0.7862, + "step": 187 + }, + { + "epoch": 0.04410557184750733, + "grad_norm": 1.902084590499895, + "learning_rate": 4.379391100702576e-06, + "loss": 0.7948, + "step": 188 + }, + { + "epoch": 0.04434017595307918, + "grad_norm": 9.860967065038771, + "learning_rate": 4.402810304449649e-06, + "loss": 0.8499, + "step": 189 + }, + { + "epoch": 0.04457478005865103, + "grad_norm": 1.5904532338079087, + "learning_rate": 4.426229508196722e-06, + "loss": 0.8706, + "step": 190 + }, + { + "epoch": 0.04480938416422287, + "grad_norm": 3.2286150229605584, + "learning_rate": 4.4496487119437945e-06, + "loss": 0.8434, + "step": 191 + }, + { + "epoch": 0.04504398826979472, + "grad_norm": 2.642953811023596, + "learning_rate": 4.4730679156908665e-06, + "loss": 0.7932, + "step": 192 + }, + { + "epoch": 0.04527859237536657, + "grad_norm": 6.708623247161702, + "learning_rate": 4.496487119437939e-06, + "loss": 0.7557, + "step": 193 + }, + { + "epoch": 0.04551319648093842, + "grad_norm": 4.245544259005578, + "learning_rate": 4.519906323185012e-06, + "loss": 0.7919, + "step": 194 + }, + { + "epoch": 0.04574780058651026, + "grad_norm": 2.2209264855596773, + "learning_rate": 4.543325526932085e-06, + "loss": 0.8008, + "step": 195 + }, + { + "epoch": 0.04598240469208211, + "grad_norm": 4.615319317159054, + "learning_rate": 4.566744730679157e-06, + "loss": 0.7972, + "step": 196 + }, + { + "epoch": 0.04621700879765396, + "grad_norm": 2.4016647930607617, + "learning_rate": 4.59016393442623e-06, + "loss": 0.7921, + "step": 197 + }, + { + "epoch": 0.04645161290322581, + "grad_norm": 2.1065053768148054, + "learning_rate": 4.613583138173303e-06, + "loss": 0.8194, + "step": 198 + }, + { + "epoch": 0.04668621700879765, + "grad_norm": 1.3854497678939341, + "learning_rate": 4.6370023419203756e-06, + "loss": 0.8471, + "step": 199 + }, + { + "epoch": 0.0469208211143695, + "grad_norm": 6.859498238314571, + "learning_rate": 4.6604215456674476e-06, + "loss": 0.8511, + "step": 200 + }, + { + "epoch": 0.04715542521994135, + "grad_norm": 2.2870014320129126, + "learning_rate": 4.68384074941452e-06, + "loss": 0.778, + "step": 201 + }, + { + "epoch": 0.0473900293255132, + "grad_norm": 3.412042568878162, + "learning_rate": 4.707259953161593e-06, + "loss": 0.7838, + "step": 202 + }, + { + "epoch": 0.04762463343108504, + "grad_norm": 1.5441510074320814, + "learning_rate": 4.730679156908666e-06, + "loss": 0.7667, + "step": 203 + }, + { + "epoch": 0.04785923753665689, + "grad_norm": 2.4279262506292065, + "learning_rate": 4.754098360655738e-06, + "loss": 0.7373, + "step": 204 + }, + { + "epoch": 0.04809384164222874, + "grad_norm": 2.68155491287197, + "learning_rate": 4.77751756440281e-06, + "loss": 0.7851, + "step": 205 + }, + { + "epoch": 0.04832844574780059, + "grad_norm": 2.5361715948287236, + "learning_rate": 4.800936768149883e-06, + "loss": 0.7991, + "step": 206 + }, + { + "epoch": 0.04856304985337243, + "grad_norm": 1.965596504616627, + "learning_rate": 4.824355971896956e-06, + "loss": 0.754, + "step": 207 + }, + { + "epoch": 0.04879765395894428, + "grad_norm": 5.682372140740957, + "learning_rate": 4.847775175644029e-06, + "loss": 0.8054, + "step": 208 + }, + { + "epoch": 0.04903225806451613, + "grad_norm": 3.5475724826725252, + "learning_rate": 4.871194379391101e-06, + "loss": 0.8108, + "step": 209 + }, + { + "epoch": 0.04926686217008798, + "grad_norm": 2.0753730912698294, + "learning_rate": 4.8946135831381735e-06, + "loss": 0.7497, + "step": 210 + }, + { + "epoch": 0.04950146627565982, + "grad_norm": 4.063420592276077, + "learning_rate": 4.918032786885246e-06, + "loss": 0.8285, + "step": 211 + }, + { + "epoch": 0.04973607038123167, + "grad_norm": 1.7307046068411553, + "learning_rate": 4.941451990632319e-06, + "loss": 0.8044, + "step": 212 + }, + { + "epoch": 0.04997067448680352, + "grad_norm": 2.2640298230945226, + "learning_rate": 4.964871194379391e-06, + "loss": 0.7995, + "step": 213 + }, + { + "epoch": 0.05020527859237537, + "grad_norm": 2.3810018614242403, + "learning_rate": 4.988290398126464e-06, + "loss": 0.786, + "step": 214 + }, + { + "epoch": 0.05043988269794721, + "grad_norm": 4.549399873923761, + "learning_rate": 5.011709601873537e-06, + "loss": 0.7859, + "step": 215 + }, + { + "epoch": 0.05067448680351906, + "grad_norm": 2.0135449375550003, + "learning_rate": 5.03512880562061e-06, + "loss": 0.7344, + "step": 216 + }, + { + "epoch": 0.05090909090909091, + "grad_norm": 3.872319715699657, + "learning_rate": 5.058548009367682e-06, + "loss": 0.7831, + "step": 217 + }, + { + "epoch": 0.05114369501466276, + "grad_norm": 1.6872682506797199, + "learning_rate": 5.0819672131147545e-06, + "loss": 0.8038, + "step": 218 + }, + { + "epoch": 0.0513782991202346, + "grad_norm": 2.1830008681001147, + "learning_rate": 5.105386416861827e-06, + "loss": 0.7964, + "step": 219 + }, + { + "epoch": 0.05161290322580645, + "grad_norm": 1.4493570098406372, + "learning_rate": 5.1288056206089e-06, + "loss": 0.7337, + "step": 220 + }, + { + "epoch": 0.0518475073313783, + "grad_norm": 2.5689272205754294, + "learning_rate": 5.152224824355972e-06, + "loss": 0.7797, + "step": 221 + }, + { + "epoch": 0.05208211143695015, + "grad_norm": 3.389990850999057, + "learning_rate": 5.175644028103045e-06, + "loss": 0.8088, + "step": 222 + }, + { + "epoch": 0.05231671554252199, + "grad_norm": 4.273411621435788, + "learning_rate": 5.199063231850118e-06, + "loss": 0.7198, + "step": 223 + }, + { + "epoch": 0.05255131964809384, + "grad_norm": 4.9769025368159685, + "learning_rate": 5.222482435597191e-06, + "loss": 0.7838, + "step": 224 + }, + { + "epoch": 0.05278592375366569, + "grad_norm": 2.167852931175855, + "learning_rate": 5.245901639344263e-06, + "loss": 0.7534, + "step": 225 + }, + { + "epoch": 0.05302052785923754, + "grad_norm": 3.798350568648415, + "learning_rate": 5.2693208430913356e-06, + "loss": 0.7651, + "step": 226 + }, + { + "epoch": 0.05325513196480938, + "grad_norm": 1.299473999291837, + "learning_rate": 5.292740046838408e-06, + "loss": 0.7829, + "step": 227 + }, + { + "epoch": 0.05348973607038123, + "grad_norm": 3.2234237743302283, + "learning_rate": 5.316159250585481e-06, + "loss": 0.799, + "step": 228 + }, + { + "epoch": 0.05372434017595308, + "grad_norm": 8.030276193203514, + "learning_rate": 5.339578454332553e-06, + "loss": 0.7717, + "step": 229 + }, + { + "epoch": 0.05395894428152493, + "grad_norm": 2.5370284044134253, + "learning_rate": 5.362997658079626e-06, + "loss": 0.7736, + "step": 230 + }, + { + "epoch": 0.05419354838709677, + "grad_norm": 3.844656305538049, + "learning_rate": 5.386416861826699e-06, + "loss": 0.749, + "step": 231 + }, + { + "epoch": 0.05442815249266862, + "grad_norm": 1.9427868676296431, + "learning_rate": 5.409836065573772e-06, + "loss": 0.7908, + "step": 232 + }, + { + "epoch": 0.05466275659824047, + "grad_norm": 3.9911344896775107, + "learning_rate": 5.433255269320844e-06, + "loss": 0.8008, + "step": 233 + }, + { + "epoch": 0.05489736070381232, + "grad_norm": 1.9734180350151789, + "learning_rate": 5.456674473067917e-06, + "loss": 0.7454, + "step": 234 + }, + { + "epoch": 0.05513196480938416, + "grad_norm": 4.702544055041282, + "learning_rate": 5.4800936768149895e-06, + "loss": 0.7252, + "step": 235 + }, + { + "epoch": 0.05536656891495601, + "grad_norm": 2.1911754420328022, + "learning_rate": 5.503512880562061e-06, + "loss": 0.7967, + "step": 236 + }, + { + "epoch": 0.05560117302052786, + "grad_norm": 1.875015352817215, + "learning_rate": 5.5269320843091335e-06, + "loss": 0.807, + "step": 237 + }, + { + "epoch": 0.05583577712609971, + "grad_norm": 3.8966209518040555, + "learning_rate": 5.550351288056206e-06, + "loss": 0.7806, + "step": 238 + }, + { + "epoch": 0.05607038123167155, + "grad_norm": 1.4214964022874772, + "learning_rate": 5.573770491803278e-06, + "loss": 0.7544, + "step": 239 + }, + { + "epoch": 0.0563049853372434, + "grad_norm": 3.967817931858348, + "learning_rate": 5.597189695550351e-06, + "loss": 0.8101, + "step": 240 + }, + { + "epoch": 0.05653958944281525, + "grad_norm": 1.8768806953038544, + "learning_rate": 5.620608899297424e-06, + "loss": 0.7151, + "step": 241 + }, + { + "epoch": 0.0567741935483871, + "grad_norm": 1.5456485323537321, + "learning_rate": 5.644028103044497e-06, + "loss": 0.7259, + "step": 242 + }, + { + "epoch": 0.05700879765395894, + "grad_norm": 11.443111964052978, + "learning_rate": 5.667447306791569e-06, + "loss": 0.752, + "step": 243 + }, + { + "epoch": 0.05724340175953079, + "grad_norm": 2.2004989352128472, + "learning_rate": 5.690866510538642e-06, + "loss": 0.8032, + "step": 244 + }, + { + "epoch": 0.05747800586510264, + "grad_norm": 2.509396292931848, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.7954, + "step": 245 + }, + { + "epoch": 0.05771260997067449, + "grad_norm": 4.62406672031256, + "learning_rate": 5.737704918032787e-06, + "loss": 0.6891, + "step": 246 + }, + { + "epoch": 0.05794721407624633, + "grad_norm": 1.7023733051611323, + "learning_rate": 5.761124121779859e-06, + "loss": 0.7457, + "step": 247 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 2.1076480192526903, + "learning_rate": 5.784543325526932e-06, + "loss": 0.7325, + "step": 248 + }, + { + "epoch": 0.05841642228739003, + "grad_norm": 3.3089258745559524, + "learning_rate": 5.807962529274005e-06, + "loss": 0.7939, + "step": 249 + }, + { + "epoch": 0.05865102639296188, + "grad_norm": 3.148213007235479, + "learning_rate": 5.831381733021078e-06, + "loss": 0.7545, + "step": 250 + }, + { + "epoch": 0.05888563049853372, + "grad_norm": 1.6415544135274773, + "learning_rate": 5.85480093676815e-06, + "loss": 0.8057, + "step": 251 + }, + { + "epoch": 0.05912023460410557, + "grad_norm": 2.419528106327016, + "learning_rate": 5.878220140515223e-06, + "loss": 0.7841, + "step": 252 + }, + { + "epoch": 0.05935483870967742, + "grad_norm": 3.1088794443157854, + "learning_rate": 5.9016393442622956e-06, + "loss": 0.7512, + "step": 253 + }, + { + "epoch": 0.05958944281524927, + "grad_norm": 1.9587822334450857, + "learning_rate": 5.925058548009368e-06, + "loss": 0.7119, + "step": 254 + }, + { + "epoch": 0.05982404692082111, + "grad_norm": 3.5891298714306776, + "learning_rate": 5.94847775175644e-06, + "loss": 0.7205, + "step": 255 + }, + { + "epoch": 0.06005865102639296, + "grad_norm": 1.7806666652881635, + "learning_rate": 5.971896955503513e-06, + "loss": 0.7718, + "step": 256 + }, + { + "epoch": 0.06029325513196481, + "grad_norm": 1.5452906290772128, + "learning_rate": 5.995316159250586e-06, + "loss": 0.7554, + "step": 257 + }, + { + "epoch": 0.06052785923753666, + "grad_norm": 1.7300711388008159, + "learning_rate": 6.018735362997659e-06, + "loss": 0.7274, + "step": 258 + }, + { + "epoch": 0.0607624633431085, + "grad_norm": 3.4093157523447273, + "learning_rate": 6.042154566744731e-06, + "loss": 0.7214, + "step": 259 + }, + { + "epoch": 0.06099706744868035, + "grad_norm": 1.6095409898986446, + "learning_rate": 6.065573770491804e-06, + "loss": 0.7483, + "step": 260 + }, + { + "epoch": 0.0612316715542522, + "grad_norm": 2.6284860137387382, + "learning_rate": 6.088992974238877e-06, + "loss": 0.7856, + "step": 261 + }, + { + "epoch": 0.06146627565982405, + "grad_norm": 2.0697469231199914, + "learning_rate": 6.1124121779859495e-06, + "loss": 0.727, + "step": 262 + }, + { + "epoch": 0.06170087976539589, + "grad_norm": 1.7751716037540561, + "learning_rate": 6.1358313817330215e-06, + "loss": 0.7186, + "step": 263 + }, + { + "epoch": 0.06193548387096774, + "grad_norm": 10.737798739408827, + "learning_rate": 6.159250585480094e-06, + "loss": 0.8159, + "step": 264 + }, + { + "epoch": 0.06217008797653959, + "grad_norm": 4.095125238646114, + "learning_rate": 6.182669789227167e-06, + "loss": 0.7606, + "step": 265 + }, + { + "epoch": 0.06240469208211144, + "grad_norm": 2.533566695384937, + "learning_rate": 6.20608899297424e-06, + "loss": 0.7428, + "step": 266 + }, + { + "epoch": 0.06263929618768328, + "grad_norm": 3.9375956780165087, + "learning_rate": 6.229508196721312e-06, + "loss": 0.7536, + "step": 267 + }, + { + "epoch": 0.06287390029325514, + "grad_norm": 3.587867577881465, + "learning_rate": 6.252927400468385e-06, + "loss": 0.8013, + "step": 268 + }, + { + "epoch": 0.06310850439882698, + "grad_norm": 1.9828841441463887, + "learning_rate": 6.276346604215458e-06, + "loss": 0.6658, + "step": 269 + }, + { + "epoch": 0.06334310850439882, + "grad_norm": 2.532887381542946, + "learning_rate": 6.2997658079625305e-06, + "loss": 0.7835, + "step": 270 + }, + { + "epoch": 0.06357771260997068, + "grad_norm": 3.5378104561825925, + "learning_rate": 6.3231850117096025e-06, + "loss": 0.7792, + "step": 271 + }, + { + "epoch": 0.06381231671554252, + "grad_norm": 8.346765497137895, + "learning_rate": 6.346604215456675e-06, + "loss": 0.6928, + "step": 272 + }, + { + "epoch": 0.06404692082111436, + "grad_norm": 1.874277091383251, + "learning_rate": 6.370023419203748e-06, + "loss": 0.7879, + "step": 273 + }, + { + "epoch": 0.06428152492668622, + "grad_norm": 2.5695005602644456, + "learning_rate": 6.393442622950821e-06, + "loss": 0.7346, + "step": 274 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.8923461355980111, + "learning_rate": 6.416861826697893e-06, + "loss": 0.7706, + "step": 275 + }, + { + "epoch": 0.06475073313782992, + "grad_norm": 2.345792784341501, + "learning_rate": 6.440281030444966e-06, + "loss": 0.7925, + "step": 276 + }, + { + "epoch": 0.06498533724340176, + "grad_norm": 1.3700649626976735, + "learning_rate": 6.463700234192039e-06, + "loss": 0.7343, + "step": 277 + }, + { + "epoch": 0.0652199413489736, + "grad_norm": 4.2957117515820915, + "learning_rate": 6.487119437939111e-06, + "loss": 0.7899, + "step": 278 + }, + { + "epoch": 0.06545454545454546, + "grad_norm": 1.8088841845847408, + "learning_rate": 6.510538641686183e-06, + "loss": 0.7432, + "step": 279 + }, + { + "epoch": 0.0656891495601173, + "grad_norm": 1.9353168320863243, + "learning_rate": 6.5339578454332556e-06, + "loss": 0.7195, + "step": 280 + }, + { + "epoch": 0.06592375366568914, + "grad_norm": 2.0971904069466185, + "learning_rate": 6.5573770491803276e-06, + "loss": 0.797, + "step": 281 + }, + { + "epoch": 0.066158357771261, + "grad_norm": 2.9785324231253605, + "learning_rate": 6.5807962529274e-06, + "loss": 0.7344, + "step": 282 + }, + { + "epoch": 0.06639296187683284, + "grad_norm": 3.4524226360583423, + "learning_rate": 6.604215456674473e-06, + "loss": 0.7101, + "step": 283 + }, + { + "epoch": 0.0666275659824047, + "grad_norm": 2.5242898267713128, + "learning_rate": 6.627634660421546e-06, + "loss": 0.753, + "step": 284 + }, + { + "epoch": 0.06686217008797654, + "grad_norm": 1.4934534118028575, + "learning_rate": 6.651053864168618e-06, + "loss": 0.7951, + "step": 285 + }, + { + "epoch": 0.06709677419354838, + "grad_norm": 5.172684338844145, + "learning_rate": 6.674473067915691e-06, + "loss": 0.7276, + "step": 286 + }, + { + "epoch": 0.06733137829912024, + "grad_norm": 13.510287957162044, + "learning_rate": 6.697892271662764e-06, + "loss": 0.7135, + "step": 287 + }, + { + "epoch": 0.06756598240469208, + "grad_norm": 2.9564063625674395, + "learning_rate": 6.721311475409837e-06, + "loss": 0.8378, + "step": 288 + }, + { + "epoch": 0.06780058651026392, + "grad_norm": 1.859065502082555, + "learning_rate": 6.744730679156909e-06, + "loss": 0.7448, + "step": 289 + }, + { + "epoch": 0.06803519061583578, + "grad_norm": 4.68045877292896, + "learning_rate": 6.7681498829039815e-06, + "loss": 0.6803, + "step": 290 + }, + { + "epoch": 0.06826979472140762, + "grad_norm": 3.0530974991612134, + "learning_rate": 6.791569086651054e-06, + "loss": 0.7392, + "step": 291 + }, + { + "epoch": 0.06850439882697948, + "grad_norm": 2.9913463833605274, + "learning_rate": 6.814988290398127e-06, + "loss": 0.7589, + "step": 292 + }, + { + "epoch": 0.06873900293255132, + "grad_norm": 2.1123062226147153, + "learning_rate": 6.838407494145199e-06, + "loss": 0.7948, + "step": 293 + }, + { + "epoch": 0.06897360703812316, + "grad_norm": 12.645487621717868, + "learning_rate": 6.861826697892272e-06, + "loss": 0.7567, + "step": 294 + }, + { + "epoch": 0.06920821114369502, + "grad_norm": 1.465655431460024, + "learning_rate": 6.885245901639345e-06, + "loss": 0.7775, + "step": 295 + }, + { + "epoch": 0.06944281524926686, + "grad_norm": 5.270313079861416, + "learning_rate": 6.908665105386418e-06, + "loss": 0.7425, + "step": 296 + }, + { + "epoch": 0.0696774193548387, + "grad_norm": 11.365986847972149, + "learning_rate": 6.93208430913349e-06, + "loss": 0.7524, + "step": 297 + }, + { + "epoch": 0.06991202346041056, + "grad_norm": 1.788096518614783, + "learning_rate": 6.9555035128805625e-06, + "loss": 0.676, + "step": 298 + }, + { + "epoch": 0.0701466275659824, + "grad_norm": 2.7086246514533583, + "learning_rate": 6.978922716627635e-06, + "loss": 0.7226, + "step": 299 + }, + { + "epoch": 0.07038123167155426, + "grad_norm": 1.3262605972809178, + "learning_rate": 7.002341920374708e-06, + "loss": 0.7954, + "step": 300 + }, + { + "epoch": 0.0706158357771261, + "grad_norm": 15.440608301240019, + "learning_rate": 7.02576112412178e-06, + "loss": 0.7638, + "step": 301 + }, + { + "epoch": 0.07085043988269794, + "grad_norm": 2.14628459786964, + "learning_rate": 7.049180327868853e-06, + "loss": 0.7971, + "step": 302 + }, + { + "epoch": 0.0710850439882698, + "grad_norm": 2.787143616061244, + "learning_rate": 7.072599531615926e-06, + "loss": 0.7924, + "step": 303 + }, + { + "epoch": 0.07131964809384164, + "grad_norm": 6.82002908629792, + "learning_rate": 7.096018735362999e-06, + "loss": 0.7978, + "step": 304 + }, + { + "epoch": 0.07155425219941348, + "grad_norm": 1.8496761797035541, + "learning_rate": 7.119437939110071e-06, + "loss": 0.7508, + "step": 305 + }, + { + "epoch": 0.07178885630498534, + "grad_norm": 4.338350528797399, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.7585, + "step": 306 + }, + { + "epoch": 0.07202346041055718, + "grad_norm": 4.857979418553227, + "learning_rate": 7.166276346604216e-06, + "loss": 0.6679, + "step": 307 + }, + { + "epoch": 0.07225806451612904, + "grad_norm": 1.7348608947080373, + "learning_rate": 7.189695550351289e-06, + "loss": 0.6959, + "step": 308 + }, + { + "epoch": 0.07249266862170088, + "grad_norm": 2.2509485892266348, + "learning_rate": 7.213114754098361e-06, + "loss": 0.7304, + "step": 309 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 1.5645365862972824, + "learning_rate": 7.236533957845434e-06, + "loss": 0.7468, + "step": 310 + }, + { + "epoch": 0.07296187683284458, + "grad_norm": 5.243283974944376, + "learning_rate": 7.259953161592507e-06, + "loss": 0.7103, + "step": 311 + }, + { + "epoch": 0.07319648093841642, + "grad_norm": 1.8240644408057207, + "learning_rate": 7.28337236533958e-06, + "loss": 0.6277, + "step": 312 + }, + { + "epoch": 0.07343108504398826, + "grad_norm": 4.009256485400117, + "learning_rate": 7.306791569086652e-06, + "loss": 0.7491, + "step": 313 + }, + { + "epoch": 0.07366568914956012, + "grad_norm": 2.5862860233978204, + "learning_rate": 7.330210772833725e-06, + "loss": 0.6999, + "step": 314 + }, + { + "epoch": 0.07390029325513196, + "grad_norm": 1.95411282236033, + "learning_rate": 7.3536299765807975e-06, + "loss": 0.7655, + "step": 315 + }, + { + "epoch": 0.07413489736070382, + "grad_norm": 2.5324854678602726, + "learning_rate": 7.3770491803278695e-06, + "loss": 0.7145, + "step": 316 + }, + { + "epoch": 0.07436950146627566, + "grad_norm": 2.2048590914639883, + "learning_rate": 7.400468384074942e-06, + "loss": 0.752, + "step": 317 + }, + { + "epoch": 0.0746041055718475, + "grad_norm": 10.802056503356317, + "learning_rate": 7.423887587822015e-06, + "loss": 0.7544, + "step": 318 + }, + { + "epoch": 0.07483870967741936, + "grad_norm": 5.947513346289248, + "learning_rate": 7.447306791569088e-06, + "loss": 0.7024, + "step": 319 + }, + { + "epoch": 0.0750733137829912, + "grad_norm": 2.2835797681082726, + "learning_rate": 7.47072599531616e-06, + "loss": 0.7484, + "step": 320 + }, + { + "epoch": 0.07530791788856304, + "grad_norm": 2.0787193869349947, + "learning_rate": 7.494145199063233e-06, + "loss": 0.747, + "step": 321 + }, + { + "epoch": 0.0755425219941349, + "grad_norm": 1.755869711708477, + "learning_rate": 7.517564402810305e-06, + "loss": 0.6877, + "step": 322 + }, + { + "epoch": 0.07577712609970674, + "grad_norm": 3.1638128679507758, + "learning_rate": 7.540983606557377e-06, + "loss": 0.7697, + "step": 323 + }, + { + "epoch": 0.0760117302052786, + "grad_norm": 1.924419722616588, + "learning_rate": 7.56440281030445e-06, + "loss": 0.7838, + "step": 324 + }, + { + "epoch": 0.07624633431085044, + "grad_norm": 1.784712238212702, + "learning_rate": 7.5878220140515225e-06, + "loss": 0.7164, + "step": 325 + }, + { + "epoch": 0.07648093841642228, + "grad_norm": 1.5556484461805318, + "learning_rate": 7.611241217798595e-06, + "loss": 0.7584, + "step": 326 + }, + { + "epoch": 0.07671554252199414, + "grad_norm": 1.6239563401361596, + "learning_rate": 7.634660421545667e-06, + "loss": 0.672, + "step": 327 + }, + { + "epoch": 0.07695014662756598, + "grad_norm": 2.7172441896685084, + "learning_rate": 7.65807962529274e-06, + "loss": 0.7238, + "step": 328 + }, + { + "epoch": 0.07718475073313782, + "grad_norm": 2.7504509737580323, + "learning_rate": 7.681498829039813e-06, + "loss": 0.7063, + "step": 329 + }, + { + "epoch": 0.07741935483870968, + "grad_norm": 4.209971644960854, + "learning_rate": 7.704918032786886e-06, + "loss": 0.7075, + "step": 330 + }, + { + "epoch": 0.07765395894428152, + "grad_norm": 3.8102324360788793, + "learning_rate": 7.728337236533959e-06, + "loss": 0.759, + "step": 331 + }, + { + "epoch": 0.07788856304985338, + "grad_norm": 2.948409700744492, + "learning_rate": 7.751756440281032e-06, + "loss": 0.6859, + "step": 332 + }, + { + "epoch": 0.07812316715542522, + "grad_norm": 2.12117726157581, + "learning_rate": 7.775175644028103e-06, + "loss": 0.7335, + "step": 333 + }, + { + "epoch": 0.07835777126099706, + "grad_norm": 3.7746062350923677, + "learning_rate": 7.798594847775176e-06, + "loss": 0.7585, + "step": 334 + }, + { + "epoch": 0.07859237536656892, + "grad_norm": 7.485470991883209, + "learning_rate": 7.822014051522248e-06, + "loss": 0.6875, + "step": 335 + }, + { + "epoch": 0.07882697947214076, + "grad_norm": 2.438658899659587, + "learning_rate": 7.845433255269321e-06, + "loss": 0.7741, + "step": 336 + }, + { + "epoch": 0.0790615835777126, + "grad_norm": 2.460708668740363, + "learning_rate": 7.868852459016394e-06, + "loss": 0.7959, + "step": 337 + }, + { + "epoch": 0.07929618768328446, + "grad_norm": 2.074075903721013, + "learning_rate": 7.892271662763467e-06, + "loss": 0.7462, + "step": 338 + }, + { + "epoch": 0.0795307917888563, + "grad_norm": 3.7240911196711077, + "learning_rate": 7.91569086651054e-06, + "loss": 0.7188, + "step": 339 + }, + { + "epoch": 0.07976539589442816, + "grad_norm": 5.283811424972736, + "learning_rate": 7.939110070257613e-06, + "loss": 0.7185, + "step": 340 + }, + { + "epoch": 0.08, + "grad_norm": 3.2104694215673786, + "learning_rate": 7.962529274004684e-06, + "loss": 0.7172, + "step": 341 + }, + { + "epoch": 0.08023460410557184, + "grad_norm": 2.997008713052917, + "learning_rate": 7.985948477751757e-06, + "loss": 0.7621, + "step": 342 + }, + { + "epoch": 0.0804692082111437, + "grad_norm": 2.506302477784963, + "learning_rate": 8.00936768149883e-06, + "loss": 0.7508, + "step": 343 + }, + { + "epoch": 0.08070381231671554, + "grad_norm": 3.1906634786571337, + "learning_rate": 8.032786885245902e-06, + "loss": 0.6898, + "step": 344 + }, + { + "epoch": 0.08093841642228738, + "grad_norm": 1.8579820431446714, + "learning_rate": 8.056206088992975e-06, + "loss": 0.744, + "step": 345 + }, + { + "epoch": 0.08117302052785924, + "grad_norm": 3.014565732438054, + "learning_rate": 8.079625292740048e-06, + "loss": 0.7069, + "step": 346 + }, + { + "epoch": 0.08140762463343108, + "grad_norm": 1.2533412435142142, + "learning_rate": 8.10304449648712e-06, + "loss": 0.6775, + "step": 347 + }, + { + "epoch": 0.08164222873900294, + "grad_norm": 8.653046483399084, + "learning_rate": 8.126463700234194e-06, + "loss": 0.7725, + "step": 348 + }, + { + "epoch": 0.08187683284457478, + "grad_norm": 1.7503738115203018, + "learning_rate": 8.149882903981265e-06, + "loss": 0.7135, + "step": 349 + }, + { + "epoch": 0.08211143695014662, + "grad_norm": 1.9139633404300693, + "learning_rate": 8.173302107728338e-06, + "loss": 0.7412, + "step": 350 + }, + { + "epoch": 0.08234604105571848, + "grad_norm": 1.536727880890203, + "learning_rate": 8.19672131147541e-06, + "loss": 0.7359, + "step": 351 + }, + { + "epoch": 0.08258064516129032, + "grad_norm": 2.8660997603136744, + "learning_rate": 8.220140515222483e-06, + "loss": 0.7667, + "step": 352 + }, + { + "epoch": 0.08281524926686216, + "grad_norm": 2.23335924032936, + "learning_rate": 8.243559718969556e-06, + "loss": 0.7349, + "step": 353 + }, + { + "epoch": 0.08304985337243402, + "grad_norm": 2.415132428444925, + "learning_rate": 8.266978922716629e-06, + "loss": 0.7546, + "step": 354 + }, + { + "epoch": 0.08328445747800586, + "grad_norm": 2.0073321543371736, + "learning_rate": 8.290398126463702e-06, + "loss": 0.7138, + "step": 355 + }, + { + "epoch": 0.08351906158357772, + "grad_norm": 3.2944999780355095, + "learning_rate": 8.313817330210773e-06, + "loss": 0.6752, + "step": 356 + }, + { + "epoch": 0.08375366568914956, + "grad_norm": 1.7139802340737562, + "learning_rate": 8.337236533957846e-06, + "loss": 0.6795, + "step": 357 + }, + { + "epoch": 0.0839882697947214, + "grad_norm": 5.315894715413043, + "learning_rate": 8.360655737704919e-06, + "loss": 0.745, + "step": 358 + }, + { + "epoch": 0.08422287390029326, + "grad_norm": 2.060548793865288, + "learning_rate": 8.384074941451992e-06, + "loss": 0.7033, + "step": 359 + }, + { + "epoch": 0.0844574780058651, + "grad_norm": 2.854342221447936, + "learning_rate": 8.407494145199064e-06, + "loss": 0.7255, + "step": 360 + }, + { + "epoch": 0.08469208211143694, + "grad_norm": 1.5296233063364677, + "learning_rate": 8.430913348946137e-06, + "loss": 0.7143, + "step": 361 + }, + { + "epoch": 0.0849266862170088, + "grad_norm": 2.9366860113406483, + "learning_rate": 8.45433255269321e-06, + "loss": 0.7498, + "step": 362 + }, + { + "epoch": 0.08516129032258064, + "grad_norm": 1.4632395156755973, + "learning_rate": 8.477751756440283e-06, + "loss": 0.6979, + "step": 363 + }, + { + "epoch": 0.0853958944281525, + "grad_norm": 4.237013919995425, + "learning_rate": 8.501170960187354e-06, + "loss": 0.7334, + "step": 364 + }, + { + "epoch": 0.08563049853372434, + "grad_norm": 2.173475328218578, + "learning_rate": 8.524590163934427e-06, + "loss": 0.7177, + "step": 365 + }, + { + "epoch": 0.08586510263929618, + "grad_norm": 2.5213528899664968, + "learning_rate": 8.5480093676815e-06, + "loss": 0.6737, + "step": 366 + }, + { + "epoch": 0.08609970674486804, + "grad_norm": 4.2993196772593345, + "learning_rate": 8.571428571428571e-06, + "loss": 0.7739, + "step": 367 + }, + { + "epoch": 0.08633431085043988, + "grad_norm": 2.5155006280989087, + "learning_rate": 8.594847775175644e-06, + "loss": 0.7361, + "step": 368 + }, + { + "epoch": 0.08656891495601172, + "grad_norm": 1.9647822308984213, + "learning_rate": 8.618266978922717e-06, + "loss": 0.7205, + "step": 369 + }, + { + "epoch": 0.08680351906158358, + "grad_norm": 4.562372816515158, + "learning_rate": 8.64168618266979e-06, + "loss": 0.7631, + "step": 370 + }, + { + "epoch": 0.08703812316715542, + "grad_norm": 3.1620928347884623, + "learning_rate": 8.665105386416862e-06, + "loss": 0.7482, + "step": 371 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 2.901318848524828, + "learning_rate": 8.688524590163935e-06, + "loss": 0.7036, + "step": 372 + }, + { + "epoch": 0.08750733137829912, + "grad_norm": 4.864603743688095, + "learning_rate": 8.711943793911008e-06, + "loss": 0.763, + "step": 373 + }, + { + "epoch": 0.08774193548387096, + "grad_norm": 1.8308419723689138, + "learning_rate": 8.73536299765808e-06, + "loss": 0.7396, + "step": 374 + }, + { + "epoch": 0.08797653958944282, + "grad_norm": 2.411984661545923, + "learning_rate": 8.758782201405152e-06, + "loss": 0.7045, + "step": 375 + }, + { + "epoch": 0.08821114369501466, + "grad_norm": 4.24982232258306, + "learning_rate": 8.782201405152225e-06, + "loss": 0.7396, + "step": 376 + }, + { + "epoch": 0.0884457478005865, + "grad_norm": 3.190757144859846, + "learning_rate": 8.805620608899298e-06, + "loss": 0.7109, + "step": 377 + }, + { + "epoch": 0.08868035190615836, + "grad_norm": 1.5464630154259653, + "learning_rate": 8.82903981264637e-06, + "loss": 0.7067, + "step": 378 + }, + { + "epoch": 0.0889149560117302, + "grad_norm": 1.4056128329371305, + "learning_rate": 8.852459016393443e-06, + "loss": 0.6872, + "step": 379 + }, + { + "epoch": 0.08914956011730206, + "grad_norm": 1.6722723816773548, + "learning_rate": 8.875878220140516e-06, + "loss": 0.7472, + "step": 380 + }, + { + "epoch": 0.0893841642228739, + "grad_norm": 7.751412884958818, + "learning_rate": 8.899297423887589e-06, + "loss": 0.6917, + "step": 381 + }, + { + "epoch": 0.08961876832844574, + "grad_norm": 3.2510202761271447, + "learning_rate": 8.922716627634662e-06, + "loss": 0.6748, + "step": 382 + }, + { + "epoch": 0.0898533724340176, + "grad_norm": 2.5669568293551728, + "learning_rate": 8.946135831381733e-06, + "loss": 0.725, + "step": 383 + }, + { + "epoch": 0.09008797653958944, + "grad_norm": 3.7440576716747813, + "learning_rate": 8.969555035128806e-06, + "loss": 0.6906, + "step": 384 + }, + { + "epoch": 0.09032258064516129, + "grad_norm": 8.296111298322213, + "learning_rate": 8.992974238875879e-06, + "loss": 0.6661, + "step": 385 + }, + { + "epoch": 0.09055718475073314, + "grad_norm": 2.716287161549124, + "learning_rate": 9.016393442622952e-06, + "loss": 0.6638, + "step": 386 + }, + { + "epoch": 0.09079178885630498, + "grad_norm": 1.4637051860404733, + "learning_rate": 9.039812646370024e-06, + "loss": 0.75, + "step": 387 + }, + { + "epoch": 0.09102639296187684, + "grad_norm": 1.9054264366067561, + "learning_rate": 9.063231850117097e-06, + "loss": 0.7222, + "step": 388 + }, + { + "epoch": 0.09126099706744868, + "grad_norm": 5.88123789755757, + "learning_rate": 9.08665105386417e-06, + "loss": 0.7677, + "step": 389 + }, + { + "epoch": 0.09149560117302052, + "grad_norm": 1.5836577273520098, + "learning_rate": 9.110070257611243e-06, + "loss": 0.6886, + "step": 390 + }, + { + "epoch": 0.09173020527859238, + "grad_norm": 1.4769405393823147, + "learning_rate": 9.133489461358314e-06, + "loss": 0.6943, + "step": 391 + }, + { + "epoch": 0.09196480938416422, + "grad_norm": 1.5393930672289444, + "learning_rate": 9.156908665105387e-06, + "loss": 0.6734, + "step": 392 + }, + { + "epoch": 0.09219941348973607, + "grad_norm": 2.04117582768801, + "learning_rate": 9.18032786885246e-06, + "loss": 0.7467, + "step": 393 + }, + { + "epoch": 0.09243401759530792, + "grad_norm": 3.3753912706732523, + "learning_rate": 9.203747072599533e-06, + "loss": 0.7235, + "step": 394 + }, + { + "epoch": 0.09266862170087976, + "grad_norm": 1.8420173745879682, + "learning_rate": 9.227166276346605e-06, + "loss": 0.6881, + "step": 395 + }, + { + "epoch": 0.09290322580645162, + "grad_norm": 2.7424273016845655, + "learning_rate": 9.250585480093678e-06, + "loss": 0.6923, + "step": 396 + }, + { + "epoch": 0.09313782991202346, + "grad_norm": 3.345559957565154, + "learning_rate": 9.274004683840751e-06, + "loss": 0.7716, + "step": 397 + }, + { + "epoch": 0.0933724340175953, + "grad_norm": 4.7620075885039945, + "learning_rate": 9.297423887587822e-06, + "loss": 0.7342, + "step": 398 + }, + { + "epoch": 0.09360703812316716, + "grad_norm": 1.8391282182105828, + "learning_rate": 9.320843091334895e-06, + "loss": 0.6778, + "step": 399 + }, + { + "epoch": 0.093841642228739, + "grad_norm": 4.021920739798764, + "learning_rate": 9.344262295081968e-06, + "loss": 0.7419, + "step": 400 + }, + { + "epoch": 0.09407624633431085, + "grad_norm": 1.7430126105824388, + "learning_rate": 9.36768149882904e-06, + "loss": 0.6887, + "step": 401 + }, + { + "epoch": 0.0943108504398827, + "grad_norm": 1.5251332201860028, + "learning_rate": 9.391100702576114e-06, + "loss": 0.6463, + "step": 402 + }, + { + "epoch": 0.09454545454545454, + "grad_norm": 2.5201755219435515, + "learning_rate": 9.414519906323187e-06, + "loss": 0.69, + "step": 403 + }, + { + "epoch": 0.0947800586510264, + "grad_norm": 1.5280311623002627, + "learning_rate": 9.43793911007026e-06, + "loss": 0.7285, + "step": 404 + }, + { + "epoch": 0.09501466275659824, + "grad_norm": 2.447952313998854, + "learning_rate": 9.461358313817332e-06, + "loss": 0.6911, + "step": 405 + }, + { + "epoch": 0.09524926686217008, + "grad_norm": 1.9015531040690363, + "learning_rate": 9.484777517564403e-06, + "loss": 0.7142, + "step": 406 + }, + { + "epoch": 0.09548387096774194, + "grad_norm": 1.9335270760208139, + "learning_rate": 9.508196721311476e-06, + "loss": 0.7182, + "step": 407 + }, + { + "epoch": 0.09571847507331378, + "grad_norm": 2.471047004477268, + "learning_rate": 9.531615925058549e-06, + "loss": 0.7297, + "step": 408 + }, + { + "epoch": 0.09595307917888563, + "grad_norm": 3.904270141650119, + "learning_rate": 9.55503512880562e-06, + "loss": 0.716, + "step": 409 + }, + { + "epoch": 0.09618768328445748, + "grad_norm": 2.3095723128690984, + "learning_rate": 9.578454332552693e-06, + "loss": 0.6413, + "step": 410 + }, + { + "epoch": 0.09642228739002932, + "grad_norm": 11.94533457021786, + "learning_rate": 9.601873536299766e-06, + "loss": 0.656, + "step": 411 + }, + { + "epoch": 0.09665689149560118, + "grad_norm": 3.348994409400017, + "learning_rate": 9.625292740046839e-06, + "loss": 0.7164, + "step": 412 + }, + { + "epoch": 0.09689149560117302, + "grad_norm": 2.5657738183621404, + "learning_rate": 9.648711943793912e-06, + "loss": 0.659, + "step": 413 + }, + { + "epoch": 0.09712609970674486, + "grad_norm": 1.9489336510081479, + "learning_rate": 9.672131147540984e-06, + "loss": 0.6836, + "step": 414 + }, + { + "epoch": 0.09736070381231672, + "grad_norm": 5.068307001090185, + "learning_rate": 9.695550351288057e-06, + "loss": 0.7652, + "step": 415 + }, + { + "epoch": 0.09759530791788856, + "grad_norm": 2.748453987481012, + "learning_rate": 9.71896955503513e-06, + "loss": 0.7079, + "step": 416 + }, + { + "epoch": 0.0978299120234604, + "grad_norm": 2.4709648008841603, + "learning_rate": 9.742388758782201e-06, + "loss": 0.6939, + "step": 417 + }, + { + "epoch": 0.09806451612903226, + "grad_norm": 1.511478089066913, + "learning_rate": 9.765807962529274e-06, + "loss": 0.6551, + "step": 418 + }, + { + "epoch": 0.0982991202346041, + "grad_norm": 2.4978493830642563, + "learning_rate": 9.789227166276347e-06, + "loss": 0.7331, + "step": 419 + }, + { + "epoch": 0.09853372434017596, + "grad_norm": 2.484188148554729, + "learning_rate": 9.81264637002342e-06, + "loss": 0.7333, + "step": 420 + }, + { + "epoch": 0.0987683284457478, + "grad_norm": 2.896283868289636, + "learning_rate": 9.836065573770493e-06, + "loss": 0.6859, + "step": 421 + }, + { + "epoch": 0.09900293255131964, + "grad_norm": 2.7543059139679955, + "learning_rate": 9.859484777517565e-06, + "loss": 0.7014, + "step": 422 + }, + { + "epoch": 0.0992375366568915, + "grad_norm": 1.93457550943428, + "learning_rate": 9.882903981264638e-06, + "loss": 0.6821, + "step": 423 + }, + { + "epoch": 0.09947214076246334, + "grad_norm": 3.541421711373985, + "learning_rate": 9.906323185011711e-06, + "loss": 0.7795, + "step": 424 + }, + { + "epoch": 0.09970674486803519, + "grad_norm": 2.3210901497458902, + "learning_rate": 9.929742388758782e-06, + "loss": 0.7066, + "step": 425 + }, + { + "epoch": 0.09994134897360704, + "grad_norm": 2.061503531228248, + "learning_rate": 9.953161592505855e-06, + "loss": 0.6978, + "step": 426 + }, + { + "epoch": 0.10017595307917888, + "grad_norm": 2.881370488654951, + "learning_rate": 9.976580796252928e-06, + "loss": 0.6827, + "step": 427 + }, + { + "epoch": 0.10041055718475074, + "grad_norm": 1.9376756360692144, + "learning_rate": 1e-05, + "loss": 0.6985, + "step": 428 + }, + { + "epoch": 0.10064516129032258, + "grad_norm": 9.185995500420793, + "learning_rate": 9.99999832232052e-06, + "loss": 0.7061, + "step": 429 + }, + { + "epoch": 0.10087976539589442, + "grad_norm": 1.3779257570194963, + "learning_rate": 9.999993289283198e-06, + "loss": 0.6873, + "step": 430 + }, + { + "epoch": 0.10111436950146628, + "grad_norm": 7.567980802908717, + "learning_rate": 9.999984900891416e-06, + "loss": 0.7555, + "step": 431 + }, + { + "epoch": 0.10134897360703812, + "grad_norm": 5.516879792515433, + "learning_rate": 9.999973157150802e-06, + "loss": 0.7445, + "step": 432 + }, + { + "epoch": 0.10158357771260997, + "grad_norm": 2.8992965270273783, + "learning_rate": 9.999958058069237e-06, + "loss": 0.7679, + "step": 433 + }, + { + "epoch": 0.10181818181818182, + "grad_norm": 3.2406301614465884, + "learning_rate": 9.999939603656855e-06, + "loss": 0.7243, + "step": 434 + }, + { + "epoch": 0.10205278592375366, + "grad_norm": 3.4869381136029345, + "learning_rate": 9.999917793926037e-06, + "loss": 0.6637, + "step": 435 + }, + { + "epoch": 0.10228739002932552, + "grad_norm": 1.3184474068296157, + "learning_rate": 9.999892628891421e-06, + "loss": 0.7128, + "step": 436 + }, + { + "epoch": 0.10252199413489736, + "grad_norm": 2.1731601294137515, + "learning_rate": 9.999864108569895e-06, + "loss": 0.7009, + "step": 437 + }, + { + "epoch": 0.1027565982404692, + "grad_norm": 1.5904788438718565, + "learning_rate": 9.999832232980598e-06, + "loss": 0.6918, + "step": 438 + }, + { + "epoch": 0.10299120234604106, + "grad_norm": 2.40384485405665, + "learning_rate": 9.99979700214492e-06, + "loss": 0.7086, + "step": 439 + }, + { + "epoch": 0.1032258064516129, + "grad_norm": 9.0473937275457, + "learning_rate": 9.999758416086503e-06, + "loss": 0.6649, + "step": 440 + }, + { + "epoch": 0.10346041055718475, + "grad_norm": 1.8878111541309193, + "learning_rate": 9.999716474831241e-06, + "loss": 0.6822, + "step": 441 + }, + { + "epoch": 0.1036950146627566, + "grad_norm": 2.01013703192227, + "learning_rate": 9.999671178407283e-06, + "loss": 0.7049, + "step": 442 + }, + { + "epoch": 0.10392961876832844, + "grad_norm": 1.8871464386483952, + "learning_rate": 9.99962252684502e-06, + "loss": 0.6495, + "step": 443 + }, + { + "epoch": 0.1041642228739003, + "grad_norm": 2.715514625300475, + "learning_rate": 9.999570520177107e-06, + "loss": 0.7042, + "step": 444 + }, + { + "epoch": 0.10439882697947214, + "grad_norm": 1.679303542504971, + "learning_rate": 9.99951515843844e-06, + "loss": 0.7118, + "step": 445 + }, + { + "epoch": 0.10463343108504398, + "grad_norm": 3.641101181087701, + "learning_rate": 9.999456441666172e-06, + "loss": 0.7259, + "step": 446 + }, + { + "epoch": 0.10486803519061584, + "grad_norm": 3.341226347625401, + "learning_rate": 9.999394369899706e-06, + "loss": 0.7389, + "step": 447 + }, + { + "epoch": 0.10510263929618768, + "grad_norm": 10.714831824909773, + "learning_rate": 9.999328943180696e-06, + "loss": 0.6824, + "step": 448 + }, + { + "epoch": 0.10533724340175953, + "grad_norm": 20.08655518501293, + "learning_rate": 9.999260161553052e-06, + "loss": 0.7064, + "step": 449 + }, + { + "epoch": 0.10557184750733138, + "grad_norm": 1.2515783765249133, + "learning_rate": 9.999188025062925e-06, + "loss": 0.6944, + "step": 450 + }, + { + "epoch": 0.10580645161290322, + "grad_norm": 1.7578886402867833, + "learning_rate": 9.99911253375873e-06, + "loss": 0.7077, + "step": 451 + }, + { + "epoch": 0.10604105571847508, + "grad_norm": 1.9250104195615887, + "learning_rate": 9.999033687691122e-06, + "loss": 0.6437, + "step": 452 + }, + { + "epoch": 0.10627565982404692, + "grad_norm": 26.687022521364753, + "learning_rate": 9.998951486913015e-06, + "loss": 0.7095, + "step": 453 + }, + { + "epoch": 0.10651026392961876, + "grad_norm": 2.7693857312683225, + "learning_rate": 9.998865931479572e-06, + "loss": 0.7758, + "step": 454 + }, + { + "epoch": 0.10674486803519062, + "grad_norm": 2.615705716385251, + "learning_rate": 9.998777021448204e-06, + "loss": 0.6635, + "step": 455 + }, + { + "epoch": 0.10697947214076246, + "grad_norm": 2.2314457566850345, + "learning_rate": 9.99868475687858e-06, + "loss": 0.6474, + "step": 456 + }, + { + "epoch": 0.1072140762463343, + "grad_norm": 2.821634138231432, + "learning_rate": 9.998589137832612e-06, + "loss": 0.6494, + "step": 457 + }, + { + "epoch": 0.10744868035190616, + "grad_norm": 4.725909833280999, + "learning_rate": 9.998490164374472e-06, + "loss": 0.7363, + "step": 458 + }, + { + "epoch": 0.107683284457478, + "grad_norm": 1.5841167891781551, + "learning_rate": 9.998387836570573e-06, + "loss": 0.7518, + "step": 459 + }, + { + "epoch": 0.10791788856304986, + "grad_norm": 1.7115189418913666, + "learning_rate": 9.998282154489588e-06, + "loss": 0.6926, + "step": 460 + }, + { + "epoch": 0.1081524926686217, + "grad_norm": 2.388898935446305, + "learning_rate": 9.998173118202438e-06, + "loss": 0.6799, + "step": 461 + }, + { + "epoch": 0.10838709677419354, + "grad_norm": 2.9397115202024615, + "learning_rate": 9.998060727782291e-06, + "loss": 0.6984, + "step": 462 + }, + { + "epoch": 0.1086217008797654, + "grad_norm": 1.8706360121603745, + "learning_rate": 9.99794498330457e-06, + "loss": 0.6952, + "step": 463 + }, + { + "epoch": 0.10885630498533724, + "grad_norm": 1.684443764370597, + "learning_rate": 9.997825884846947e-06, + "loss": 0.7163, + "step": 464 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 2.304327777151329, + "learning_rate": 9.99770343248935e-06, + "loss": 0.7277, + "step": 465 + }, + { + "epoch": 0.10932551319648094, + "grad_norm": 2.500667829864606, + "learning_rate": 9.997577626313948e-06, + "loss": 0.7621, + "step": 466 + }, + { + "epoch": 0.10956011730205278, + "grad_norm": 1.377090939725991, + "learning_rate": 9.99744846640517e-06, + "loss": 0.6609, + "step": 467 + }, + { + "epoch": 0.10979472140762464, + "grad_norm": 2.296341610377368, + "learning_rate": 9.99731595284969e-06, + "loss": 0.7315, + "step": 468 + }, + { + "epoch": 0.11002932551319648, + "grad_norm": 4.818232817430393, + "learning_rate": 9.997180085736431e-06, + "loss": 0.6855, + "step": 469 + }, + { + "epoch": 0.11026392961876832, + "grad_norm": 3.6547377355731085, + "learning_rate": 9.997040865156575e-06, + "loss": 0.6983, + "step": 470 + }, + { + "epoch": 0.11049853372434018, + "grad_norm": 5.386859126999124, + "learning_rate": 9.996898291203544e-06, + "loss": 0.6702, + "step": 471 + }, + { + "epoch": 0.11073313782991202, + "grad_norm": 2.8098348542077938, + "learning_rate": 9.99675236397302e-06, + "loss": 0.7545, + "step": 472 + }, + { + "epoch": 0.11096774193548387, + "grad_norm": 2.536389393667118, + "learning_rate": 9.996603083562928e-06, + "loss": 0.6799, + "step": 473 + }, + { + "epoch": 0.11120234604105572, + "grad_norm": 1.3917740827397262, + "learning_rate": 9.996450450073447e-06, + "loss": 0.6756, + "step": 474 + }, + { + "epoch": 0.11143695014662756, + "grad_norm": 2.345795464079582, + "learning_rate": 9.996294463607004e-06, + "loss": 0.7228, + "step": 475 + }, + { + "epoch": 0.11167155425219942, + "grad_norm": 1.5696093579763688, + "learning_rate": 9.996135124268276e-06, + "loss": 0.6942, + "step": 476 + }, + { + "epoch": 0.11190615835777126, + "grad_norm": 1.2610491066110727, + "learning_rate": 9.995972432164194e-06, + "loss": 0.7187, + "step": 477 + }, + { + "epoch": 0.1121407624633431, + "grad_norm": 2.9972015560304666, + "learning_rate": 9.995806387403935e-06, + "loss": 0.704, + "step": 478 + }, + { + "epoch": 0.11237536656891496, + "grad_norm": 2.044081312404094, + "learning_rate": 9.995636990098928e-06, + "loss": 0.7446, + "step": 479 + }, + { + "epoch": 0.1126099706744868, + "grad_norm": 1.7584032388011313, + "learning_rate": 9.995464240362847e-06, + "loss": 0.7194, + "step": 480 + }, + { + "epoch": 0.11284457478005865, + "grad_norm": 3.3210695065864937, + "learning_rate": 9.995288138311622e-06, + "loss": 0.6865, + "step": 481 + }, + { + "epoch": 0.1130791788856305, + "grad_norm": 8.73885391298322, + "learning_rate": 9.995108684063431e-06, + "loss": 0.724, + "step": 482 + }, + { + "epoch": 0.11331378299120234, + "grad_norm": 4.418388663156157, + "learning_rate": 9.9949258777387e-06, + "loss": 0.7392, + "step": 483 + }, + { + "epoch": 0.1135483870967742, + "grad_norm": 1.7514645214551325, + "learning_rate": 9.994739719460103e-06, + "loss": 0.643, + "step": 484 + }, + { + "epoch": 0.11378299120234604, + "grad_norm": 2.34653301290369, + "learning_rate": 9.994550209352569e-06, + "loss": 0.7112, + "step": 485 + }, + { + "epoch": 0.11401759530791788, + "grad_norm": 1.5689601767472114, + "learning_rate": 9.99435734754327e-06, + "loss": 0.6981, + "step": 486 + }, + { + "epoch": 0.11425219941348974, + "grad_norm": 2.7327993867035234, + "learning_rate": 9.994161134161635e-06, + "loss": 0.703, + "step": 487 + }, + { + "epoch": 0.11448680351906158, + "grad_norm": 2.6868476495035014, + "learning_rate": 9.99396156933933e-06, + "loss": 0.7232, + "step": 488 + }, + { + "epoch": 0.11472140762463343, + "grad_norm": 4.318914367666479, + "learning_rate": 9.993758653210283e-06, + "loss": 0.7436, + "step": 489 + }, + { + "epoch": 0.11495601173020528, + "grad_norm": 2.3801491122694087, + "learning_rate": 9.993552385910663e-06, + "loss": 0.6563, + "step": 490 + }, + { + "epoch": 0.11519061583577712, + "grad_norm": 3.3685873283237537, + "learning_rate": 9.993342767578888e-06, + "loss": 0.6611, + "step": 491 + }, + { + "epoch": 0.11542521994134898, + "grad_norm": 9.216519497431854, + "learning_rate": 9.993129798355633e-06, + "loss": 0.7036, + "step": 492 + }, + { + "epoch": 0.11565982404692082, + "grad_norm": 2.3057579250776237, + "learning_rate": 9.99291347838381e-06, + "loss": 0.7227, + "step": 493 + }, + { + "epoch": 0.11589442815249267, + "grad_norm": 5.09132377443943, + "learning_rate": 9.992693807808589e-06, + "loss": 0.7115, + "step": 494 + }, + { + "epoch": 0.11612903225806452, + "grad_norm": 1.784883173027611, + "learning_rate": 9.992470786777382e-06, + "loss": 0.6987, + "step": 495 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 2.374758365052141, + "learning_rate": 9.992244415439853e-06, + "loss": 0.6656, + "step": 496 + }, + { + "epoch": 0.1165982404692082, + "grad_norm": 3.146138261728129, + "learning_rate": 9.992014693947914e-06, + "loss": 0.678, + "step": 497 + }, + { + "epoch": 0.11683284457478006, + "grad_norm": 7.476451543495553, + "learning_rate": 9.991781622455723e-06, + "loss": 0.6661, + "step": 498 + }, + { + "epoch": 0.1170674486803519, + "grad_norm": 2.072243392022778, + "learning_rate": 9.991545201119692e-06, + "loss": 0.6763, + "step": 499 + }, + { + "epoch": 0.11730205278592376, + "grad_norm": 2.0321025061706512, + "learning_rate": 9.991305430098471e-06, + "loss": 0.6666, + "step": 500 + }, + { + "epoch": 0.11730205278592376, + "eval_loss": 0.6888240575790405, + "eval_runtime": 28.5702, + "eval_samples_per_second": 19.111, + "eval_steps_per_second": 0.175, + "step": 500 + }, + { + "epoch": 0.1175366568914956, + "grad_norm": 1.6318127853421331, + "learning_rate": 9.991062309552965e-06, + "loss": 0.6265, + "step": 501 + }, + { + "epoch": 0.11777126099706745, + "grad_norm": 1.7473636157017485, + "learning_rate": 9.990815839646329e-06, + "loss": 0.655, + "step": 502 + }, + { + "epoch": 0.1180058651026393, + "grad_norm": 1.5512715760294424, + "learning_rate": 9.99056602054396e-06, + "loss": 0.668, + "step": 503 + }, + { + "epoch": 0.11824046920821114, + "grad_norm": 2.3950227354199374, + "learning_rate": 9.990312852413501e-06, + "loss": 0.7096, + "step": 504 + }, + { + "epoch": 0.11847507331378299, + "grad_norm": 1.5296372584551887, + "learning_rate": 9.990056335424853e-06, + "loss": 0.6857, + "step": 505 + }, + { + "epoch": 0.11870967741935484, + "grad_norm": 1.867314419724177, + "learning_rate": 9.98979646975015e-06, + "loss": 0.659, + "step": 506 + }, + { + "epoch": 0.11894428152492668, + "grad_norm": 2.2755337401213285, + "learning_rate": 9.989533255563786e-06, + "loss": 0.7274, + "step": 507 + }, + { + "epoch": 0.11917888563049854, + "grad_norm": 6.432226473108074, + "learning_rate": 9.989266693042394e-06, + "loss": 0.6454, + "step": 508 + }, + { + "epoch": 0.11941348973607038, + "grad_norm": 7.977964543167525, + "learning_rate": 9.988996782364858e-06, + "loss": 0.6986, + "step": 509 + }, + { + "epoch": 0.11964809384164223, + "grad_norm": 2.492942939386226, + "learning_rate": 9.988723523712304e-06, + "loss": 0.7261, + "step": 510 + }, + { + "epoch": 0.11988269794721408, + "grad_norm": 13.259547924776628, + "learning_rate": 9.988446917268113e-06, + "loss": 0.6809, + "step": 511 + }, + { + "epoch": 0.12011730205278592, + "grad_norm": 2.11113904634305, + "learning_rate": 9.988166963217903e-06, + "loss": 0.7064, + "step": 512 + }, + { + "epoch": 0.12035190615835777, + "grad_norm": 1.3151089527283533, + "learning_rate": 9.987883661749548e-06, + "loss": 0.7164, + "step": 513 + }, + { + "epoch": 0.12058651026392962, + "grad_norm": 2.733472003363916, + "learning_rate": 9.987597013053161e-06, + "loss": 0.6691, + "step": 514 + }, + { + "epoch": 0.12082111436950146, + "grad_norm": 2.1188076613210702, + "learning_rate": 9.987307017321105e-06, + "loss": 0.6608, + "step": 515 + }, + { + "epoch": 0.12105571847507332, + "grad_norm": 1.413454097880752, + "learning_rate": 9.987013674747986e-06, + "loss": 0.7397, + "step": 516 + }, + { + "epoch": 0.12129032258064516, + "grad_norm": 3.0244674122202584, + "learning_rate": 9.986716985530658e-06, + "loss": 0.6533, + "step": 517 + }, + { + "epoch": 0.121524926686217, + "grad_norm": 2.1969109132092246, + "learning_rate": 9.986416949868223e-06, + "loss": 0.7449, + "step": 518 + }, + { + "epoch": 0.12175953079178886, + "grad_norm": 3.233230646965809, + "learning_rate": 9.986113567962025e-06, + "loss": 0.7096, + "step": 519 + }, + { + "epoch": 0.1219941348973607, + "grad_norm": 2.5578549246896296, + "learning_rate": 9.985806840015657e-06, + "loss": 0.7025, + "step": 520 + }, + { + "epoch": 0.12222873900293255, + "grad_norm": 2.379363537670544, + "learning_rate": 9.985496766234952e-06, + "loss": 0.7108, + "step": 521 + }, + { + "epoch": 0.1224633431085044, + "grad_norm": 1.6069661740113346, + "learning_rate": 9.985183346827995e-06, + "loss": 0.7185, + "step": 522 + }, + { + "epoch": 0.12269794721407624, + "grad_norm": 2.870991700048875, + "learning_rate": 9.984866582005112e-06, + "loss": 0.7037, + "step": 523 + }, + { + "epoch": 0.1229325513196481, + "grad_norm": 1.775199433098271, + "learning_rate": 9.984546471978872e-06, + "loss": 0.6602, + "step": 524 + }, + { + "epoch": 0.12316715542521994, + "grad_norm": 2.1573711551915564, + "learning_rate": 9.984223016964096e-06, + "loss": 0.6794, + "step": 525 + }, + { + "epoch": 0.12340175953079179, + "grad_norm": 1.502186193122687, + "learning_rate": 9.983896217177843e-06, + "loss": 0.6603, + "step": 526 + }, + { + "epoch": 0.12363636363636364, + "grad_norm": 2.0934295229608386, + "learning_rate": 9.983566072839423e-06, + "loss": 0.6766, + "step": 527 + }, + { + "epoch": 0.12387096774193548, + "grad_norm": 1.6941035922061842, + "learning_rate": 9.98323258417038e-06, + "loss": 0.66, + "step": 528 + }, + { + "epoch": 0.12410557184750733, + "grad_norm": 2.165000453568371, + "learning_rate": 9.982895751394515e-06, + "loss": 0.7038, + "step": 529 + }, + { + "epoch": 0.12434017595307918, + "grad_norm": 2.043357882178577, + "learning_rate": 9.982555574737862e-06, + "loss": 0.6648, + "step": 530 + }, + { + "epoch": 0.12457478005865102, + "grad_norm": 1.895714786932282, + "learning_rate": 9.982212054428708e-06, + "loss": 0.7147, + "step": 531 + }, + { + "epoch": 0.12480938416422288, + "grad_norm": 2.654530541516175, + "learning_rate": 9.981865190697577e-06, + "loss": 0.7089, + "step": 532 + }, + { + "epoch": 0.1250439882697947, + "grad_norm": 1.597405747196785, + "learning_rate": 9.98151498377724e-06, + "loss": 0.7359, + "step": 533 + }, + { + "epoch": 0.12527859237536657, + "grad_norm": 2.9749176981598384, + "learning_rate": 9.981161433902712e-06, + "loss": 0.6893, + "step": 534 + }, + { + "epoch": 0.12551319648093842, + "grad_norm": 2.4159566695813104, + "learning_rate": 9.980804541311252e-06, + "loss": 0.7106, + "step": 535 + }, + { + "epoch": 0.12574780058651028, + "grad_norm": 22.842593769056897, + "learning_rate": 9.980444306242355e-06, + "loss": 0.7076, + "step": 536 + }, + { + "epoch": 0.1259824046920821, + "grad_norm": 2.666099139107497, + "learning_rate": 9.98008072893777e-06, + "loss": 0.6791, + "step": 537 + }, + { + "epoch": 0.12621700879765396, + "grad_norm": 1.6269471981229704, + "learning_rate": 9.979713809641482e-06, + "loss": 0.6798, + "step": 538 + }, + { + "epoch": 0.12645161290322582, + "grad_norm": 4.202791295534201, + "learning_rate": 9.97934354859972e-06, + "loss": 0.6882, + "step": 539 + }, + { + "epoch": 0.12668621700879765, + "grad_norm": 1.5103425993192718, + "learning_rate": 9.978969946060954e-06, + "loss": 0.6643, + "step": 540 + }, + { + "epoch": 0.1269208211143695, + "grad_norm": 1.596861789287424, + "learning_rate": 9.9785930022759e-06, + "loss": 0.697, + "step": 541 + }, + { + "epoch": 0.12715542521994136, + "grad_norm": 1.1667616716218816, + "learning_rate": 9.978212717497515e-06, + "loss": 0.7428, + "step": 542 + }, + { + "epoch": 0.1273900293255132, + "grad_norm": 13.354063943546418, + "learning_rate": 9.977829091980996e-06, + "loss": 0.6302, + "step": 543 + }, + { + "epoch": 0.12762463343108504, + "grad_norm": 2.3558509951518074, + "learning_rate": 9.977442125983781e-06, + "loss": 0.6839, + "step": 544 + }, + { + "epoch": 0.1278592375366569, + "grad_norm": 3.1414319250785487, + "learning_rate": 9.977051819765558e-06, + "loss": 0.7044, + "step": 545 + }, + { + "epoch": 0.12809384164222873, + "grad_norm": 3.1115702760435413, + "learning_rate": 9.976658173588244e-06, + "loss": 0.7383, + "step": 546 + }, + { + "epoch": 0.12832844574780058, + "grad_norm": 3.5567349181056316, + "learning_rate": 9.976261187716008e-06, + "loss": 0.6763, + "step": 547 + }, + { + "epoch": 0.12856304985337244, + "grad_norm": 2.3978902826675954, + "learning_rate": 9.975860862415255e-06, + "loss": 0.7546, + "step": 548 + }, + { + "epoch": 0.12879765395894427, + "grad_norm": 29.013203676935372, + "learning_rate": 9.975457197954632e-06, + "loss": 0.7034, + "step": 549 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 2.0575848372214307, + "learning_rate": 9.975050194605027e-06, + "loss": 0.6951, + "step": 550 + }, + { + "epoch": 0.12926686217008798, + "grad_norm": 1.4098369621927678, + "learning_rate": 9.974639852639568e-06, + "loss": 0.6922, + "step": 551 + }, + { + "epoch": 0.12950146627565984, + "grad_norm": 2.897930701241758, + "learning_rate": 9.974226172333623e-06, + "loss": 0.7343, + "step": 552 + }, + { + "epoch": 0.12973607038123167, + "grad_norm": 2.59537067559108, + "learning_rate": 9.973809153964803e-06, + "loss": 0.6689, + "step": 553 + }, + { + "epoch": 0.12997067448680352, + "grad_norm": 3.027373892989576, + "learning_rate": 9.973388797812958e-06, + "loss": 0.6718, + "step": 554 + }, + { + "epoch": 0.13020527859237538, + "grad_norm": 2.113493209589616, + "learning_rate": 9.972965104160175e-06, + "loss": 0.6639, + "step": 555 + }, + { + "epoch": 0.1304398826979472, + "grad_norm": 1.3623863166221668, + "learning_rate": 9.972538073290783e-06, + "loss": 0.6278, + "step": 556 + }, + { + "epoch": 0.13067448680351906, + "grad_norm": 2.5295207930233685, + "learning_rate": 9.972107705491351e-06, + "loss": 0.6006, + "step": 557 + }, + { + "epoch": 0.13090909090909092, + "grad_norm": 1.6585614514698723, + "learning_rate": 9.971674001050687e-06, + "loss": 0.6385, + "step": 558 + }, + { + "epoch": 0.13114369501466275, + "grad_norm": 6.160387588977871, + "learning_rate": 9.971236960259836e-06, + "loss": 0.6728, + "step": 559 + }, + { + "epoch": 0.1313782991202346, + "grad_norm": 2.4013303577531806, + "learning_rate": 9.970796583412088e-06, + "loss": 0.6698, + "step": 560 + }, + { + "epoch": 0.13161290322580646, + "grad_norm": 1.8285169637361147, + "learning_rate": 9.970352870802962e-06, + "loss": 0.6668, + "step": 561 + }, + { + "epoch": 0.1318475073313783, + "grad_norm": 1.303920615682513, + "learning_rate": 9.969905822730226e-06, + "loss": 0.6264, + "step": 562 + }, + { + "epoch": 0.13208211143695014, + "grad_norm": 1.9978814715093303, + "learning_rate": 9.969455439493877e-06, + "loss": 0.6987, + "step": 563 + }, + { + "epoch": 0.132316715542522, + "grad_norm": 7.972031582327647, + "learning_rate": 9.969001721396157e-06, + "loss": 0.7122, + "step": 564 + }, + { + "epoch": 0.13255131964809383, + "grad_norm": 2.1939819423538207, + "learning_rate": 9.968544668741544e-06, + "loss": 0.7235, + "step": 565 + }, + { + "epoch": 0.13278592375366569, + "grad_norm": 2.111594241775808, + "learning_rate": 9.968084281836752e-06, + "loss": 0.7514, + "step": 566 + }, + { + "epoch": 0.13302052785923754, + "grad_norm": 1.6540698615973652, + "learning_rate": 9.967620560990733e-06, + "loss": 0.6209, + "step": 567 + }, + { + "epoch": 0.1332551319648094, + "grad_norm": 1.734001307851314, + "learning_rate": 9.967153506514677e-06, + "loss": 0.6821, + "step": 568 + }, + { + "epoch": 0.13348973607038123, + "grad_norm": 2.898214053367665, + "learning_rate": 9.966683118722014e-06, + "loss": 0.6357, + "step": 569 + }, + { + "epoch": 0.13372434017595308, + "grad_norm": 2.8199846284828034, + "learning_rate": 9.966209397928404e-06, + "loss": 0.669, + "step": 570 + }, + { + "epoch": 0.13395894428152494, + "grad_norm": 3.209199942211371, + "learning_rate": 9.965732344451751e-06, + "loss": 0.683, + "step": 571 + }, + { + "epoch": 0.13419354838709677, + "grad_norm": 5.56090036713883, + "learning_rate": 9.96525195861219e-06, + "loss": 0.7191, + "step": 572 + }, + { + "epoch": 0.13442815249266862, + "grad_norm": 2.5431957689833666, + "learning_rate": 9.964768240732094e-06, + "loss": 0.692, + "step": 573 + }, + { + "epoch": 0.13466275659824048, + "grad_norm": 1.4805250152975054, + "learning_rate": 9.964281191136073e-06, + "loss": 0.7041, + "step": 574 + }, + { + "epoch": 0.1348973607038123, + "grad_norm": 3.560789408514976, + "learning_rate": 9.963790810150975e-06, + "loss": 0.653, + "step": 575 + }, + { + "epoch": 0.13513196480938416, + "grad_norm": 3.372954282563982, + "learning_rate": 9.963297098105877e-06, + "loss": 0.6656, + "step": 576 + }, + { + "epoch": 0.13536656891495602, + "grad_norm": 1.716873774016931, + "learning_rate": 9.962800055332098e-06, + "loss": 0.6824, + "step": 577 + }, + { + "epoch": 0.13560117302052785, + "grad_norm": 1.5706571563062237, + "learning_rate": 9.962299682163185e-06, + "loss": 0.6542, + "step": 578 + }, + { + "epoch": 0.1358357771260997, + "grad_norm": 4.257196400759658, + "learning_rate": 9.961795978934932e-06, + "loss": 0.6593, + "step": 579 + }, + { + "epoch": 0.13607038123167156, + "grad_norm": 1.6639609965913367, + "learning_rate": 9.961288945985352e-06, + "loss": 0.647, + "step": 580 + }, + { + "epoch": 0.1363049853372434, + "grad_norm": 25.232129147256217, + "learning_rate": 9.960778583654704e-06, + "loss": 0.6564, + "step": 581 + }, + { + "epoch": 0.13653958944281525, + "grad_norm": 1.2901397862502486, + "learning_rate": 9.960264892285481e-06, + "loss": 0.658, + "step": 582 + }, + { + "epoch": 0.1367741935483871, + "grad_norm": 1.8257065820782765, + "learning_rate": 9.9597478722224e-06, + "loss": 0.7135, + "step": 583 + }, + { + "epoch": 0.13700879765395896, + "grad_norm": 1.1847585123600912, + "learning_rate": 9.959227523812423e-06, + "loss": 0.6541, + "step": 584 + }, + { + "epoch": 0.1372434017595308, + "grad_norm": 1.6699141702923748, + "learning_rate": 9.958703847404742e-06, + "loss": 0.7057, + "step": 585 + }, + { + "epoch": 0.13747800586510264, + "grad_norm": 1.2962710057457882, + "learning_rate": 9.958176843350778e-06, + "loss": 0.6709, + "step": 586 + }, + { + "epoch": 0.1377126099706745, + "grad_norm": 2.457488711215677, + "learning_rate": 9.957646512004191e-06, + "loss": 0.6516, + "step": 587 + }, + { + "epoch": 0.13794721407624633, + "grad_norm": 1.8892307626675284, + "learning_rate": 9.95711285372087e-06, + "loss": 0.6908, + "step": 588 + }, + { + "epoch": 0.13818181818181818, + "grad_norm": 4.763454832394259, + "learning_rate": 9.95657586885894e-06, + "loss": 0.6708, + "step": 589 + }, + { + "epoch": 0.13841642228739004, + "grad_norm": 8.020713129219255, + "learning_rate": 9.956035557778753e-06, + "loss": 0.6752, + "step": 590 + }, + { + "epoch": 0.13865102639296187, + "grad_norm": 3.4908925487869467, + "learning_rate": 9.955491920842902e-06, + "loss": 0.6909, + "step": 591 + }, + { + "epoch": 0.13888563049853372, + "grad_norm": 1.776193850374824, + "learning_rate": 9.9549449584162e-06, + "loss": 0.6748, + "step": 592 + }, + { + "epoch": 0.13912023460410558, + "grad_norm": 3.1813693527805396, + "learning_rate": 9.954394670865701e-06, + "loss": 0.66, + "step": 593 + }, + { + "epoch": 0.1393548387096774, + "grad_norm": 1.342747330197621, + "learning_rate": 9.95384105856069e-06, + "loss": 0.6375, + "step": 594 + }, + { + "epoch": 0.13958944281524927, + "grad_norm": 1.897633631785078, + "learning_rate": 9.953284121872676e-06, + "loss": 0.6633, + "step": 595 + }, + { + "epoch": 0.13982404692082112, + "grad_norm": 1.5089958808746635, + "learning_rate": 9.952723861175406e-06, + "loss": 0.6554, + "step": 596 + }, + { + "epoch": 0.14005865102639295, + "grad_norm": 2.07875619918608, + "learning_rate": 9.952160276844855e-06, + "loss": 0.7027, + "step": 597 + }, + { + "epoch": 0.1402932551319648, + "grad_norm": 19.9773501877953, + "learning_rate": 9.95159336925923e-06, + "loss": 0.7008, + "step": 598 + }, + { + "epoch": 0.14052785923753666, + "grad_norm": 3.149299323527888, + "learning_rate": 9.951023138798964e-06, + "loss": 0.7071, + "step": 599 + }, + { + "epoch": 0.14076246334310852, + "grad_norm": 1.662461198147052, + "learning_rate": 9.950449585846722e-06, + "loss": 0.6658, + "step": 600 + }, + { + "epoch": 0.14099706744868035, + "grad_norm": 1.533295748310548, + "learning_rate": 9.949872710787402e-06, + "loss": 0.684, + "step": 601 + }, + { + "epoch": 0.1412316715542522, + "grad_norm": 1.7816342527457778, + "learning_rate": 9.949292514008127e-06, + "loss": 0.6591, + "step": 602 + }, + { + "epoch": 0.14146627565982406, + "grad_norm": 3.1159357600480035, + "learning_rate": 9.948708995898251e-06, + "loss": 0.6505, + "step": 603 + }, + { + "epoch": 0.1417008797653959, + "grad_norm": 6.279947901045703, + "learning_rate": 9.948122156849357e-06, + "loss": 0.7105, + "step": 604 + }, + { + "epoch": 0.14193548387096774, + "grad_norm": 1.4514443812808602, + "learning_rate": 9.947531997255256e-06, + "loss": 0.6555, + "step": 605 + }, + { + "epoch": 0.1421700879765396, + "grad_norm": 16.123057899871036, + "learning_rate": 9.946938517511988e-06, + "loss": 0.6481, + "step": 606 + }, + { + "epoch": 0.14240469208211143, + "grad_norm": 1.8485177049511416, + "learning_rate": 9.946341718017817e-06, + "loss": 0.6444, + "step": 607 + }, + { + "epoch": 0.14263929618768328, + "grad_norm": 1.381903063248314, + "learning_rate": 9.945741599173244e-06, + "loss": 0.6847, + "step": 608 + }, + { + "epoch": 0.14287390029325514, + "grad_norm": 1.9733424925107392, + "learning_rate": 9.945138161380987e-06, + "loss": 0.6566, + "step": 609 + }, + { + "epoch": 0.14310850439882697, + "grad_norm": 1.4736215812750697, + "learning_rate": 9.944531405046001e-06, + "loss": 0.6727, + "step": 610 + }, + { + "epoch": 0.14334310850439883, + "grad_norm": 1.674427950216094, + "learning_rate": 9.943921330575457e-06, + "loss": 0.6949, + "step": 611 + }, + { + "epoch": 0.14357771260997068, + "grad_norm": 1.353555356656059, + "learning_rate": 9.943307938378762e-06, + "loss": 0.6239, + "step": 612 + }, + { + "epoch": 0.1438123167155425, + "grad_norm": 6.462783821793124, + "learning_rate": 9.942691228867548e-06, + "loss": 0.6735, + "step": 613 + }, + { + "epoch": 0.14404692082111437, + "grad_norm": 1.8066326048928583, + "learning_rate": 9.942071202455667e-06, + "loss": 0.6635, + "step": 614 + }, + { + "epoch": 0.14428152492668622, + "grad_norm": 7.481709632140974, + "learning_rate": 9.941447859559204e-06, + "loss": 0.7008, + "step": 615 + }, + { + "epoch": 0.14451612903225808, + "grad_norm": 1.868185126323605, + "learning_rate": 9.940821200596466e-06, + "loss": 0.7114, + "step": 616 + }, + { + "epoch": 0.1447507331378299, + "grad_norm": 10.616230292276052, + "learning_rate": 9.94019122598799e-06, + "loss": 0.6753, + "step": 617 + }, + { + "epoch": 0.14498533724340176, + "grad_norm": 1.6889062877523384, + "learning_rate": 9.939557936156527e-06, + "loss": 0.6832, + "step": 618 + }, + { + "epoch": 0.14521994134897362, + "grad_norm": 1.3246938175496756, + "learning_rate": 9.938921331527066e-06, + "loss": 0.6755, + "step": 619 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 1.5267349547650975, + "learning_rate": 9.938281412526811e-06, + "loss": 0.7326, + "step": 620 + }, + { + "epoch": 0.1456891495601173, + "grad_norm": 1.3327513478934103, + "learning_rate": 9.937638179585195e-06, + "loss": 0.621, + "step": 621 + }, + { + "epoch": 0.14592375366568916, + "grad_norm": 3.3950719442877033, + "learning_rate": 9.936991633133872e-06, + "loss": 0.6918, + "step": 622 + }, + { + "epoch": 0.146158357771261, + "grad_norm": 2.281479467961296, + "learning_rate": 9.936341773606723e-06, + "loss": 0.6954, + "step": 623 + }, + { + "epoch": 0.14639296187683284, + "grad_norm": 1.1802407354424511, + "learning_rate": 9.93568860143985e-06, + "loss": 0.7372, + "step": 624 + }, + { + "epoch": 0.1466275659824047, + "grad_norm": 2.315080643869437, + "learning_rate": 9.93503211707158e-06, + "loss": 0.6501, + "step": 625 + }, + { + "epoch": 0.14686217008797653, + "grad_norm": 5.117637311109828, + "learning_rate": 9.934372320942455e-06, + "loss": 0.6691, + "step": 626 + }, + { + "epoch": 0.14709677419354839, + "grad_norm": 7.696774629839776, + "learning_rate": 9.933709213495253e-06, + "loss": 0.7046, + "step": 627 + }, + { + "epoch": 0.14733137829912024, + "grad_norm": 1.4226681215442163, + "learning_rate": 9.933042795174964e-06, + "loss": 0.684, + "step": 628 + }, + { + "epoch": 0.14756598240469207, + "grad_norm": 2.984733020473841, + "learning_rate": 9.9323730664288e-06, + "loss": 0.6815, + "step": 629 + }, + { + "epoch": 0.14780058651026393, + "grad_norm": 3.935812255521951, + "learning_rate": 9.931700027706199e-06, + "loss": 0.7096, + "step": 630 + }, + { + "epoch": 0.14803519061583578, + "grad_norm": 2.4457763134413084, + "learning_rate": 9.93102367945882e-06, + "loss": 0.6311, + "step": 631 + }, + { + "epoch": 0.14826979472140764, + "grad_norm": 1.4359062113524443, + "learning_rate": 9.930344022140539e-06, + "loss": 0.6773, + "step": 632 + }, + { + "epoch": 0.14850439882697947, + "grad_norm": 1.2100526324980292, + "learning_rate": 9.929661056207455e-06, + "loss": 0.6622, + "step": 633 + }, + { + "epoch": 0.14873900293255132, + "grad_norm": 1.0190753291928116, + "learning_rate": 9.928974782117889e-06, + "loss": 0.6981, + "step": 634 + }, + { + "epoch": 0.14897360703812318, + "grad_norm": 2.799854932121048, + "learning_rate": 9.928285200332377e-06, + "loss": 0.6984, + "step": 635 + }, + { + "epoch": 0.149208211143695, + "grad_norm": 2.905366083267552, + "learning_rate": 9.927592311313681e-06, + "loss": 0.6738, + "step": 636 + }, + { + "epoch": 0.14944281524926686, + "grad_norm": 2.0090448417142257, + "learning_rate": 9.926896115526777e-06, + "loss": 0.6518, + "step": 637 + }, + { + "epoch": 0.14967741935483872, + "grad_norm": 3.414729484354847, + "learning_rate": 9.926196613438866e-06, + "loss": 0.6537, + "step": 638 + }, + { + "epoch": 0.14991202346041055, + "grad_norm": 2.6622638389494115, + "learning_rate": 9.925493805519359e-06, + "loss": 0.6597, + "step": 639 + }, + { + "epoch": 0.1501466275659824, + "grad_norm": 2.7737320980633533, + "learning_rate": 9.924787692239892e-06, + "loss": 0.6643, + "step": 640 + }, + { + "epoch": 0.15038123167155426, + "grad_norm": 1.5644793983992447, + "learning_rate": 9.92407827407432e-06, + "loss": 0.6744, + "step": 641 + }, + { + "epoch": 0.1506158357771261, + "grad_norm": 1.6708319503739981, + "learning_rate": 9.923365551498713e-06, + "loss": 0.7092, + "step": 642 + }, + { + "epoch": 0.15085043988269795, + "grad_norm": 2.016621110195836, + "learning_rate": 9.922649524991359e-06, + "loss": 0.7289, + "step": 643 + }, + { + "epoch": 0.1510850439882698, + "grad_norm": 2.504724838609121, + "learning_rate": 9.921930195032763e-06, + "loss": 0.7298, + "step": 644 + }, + { + "epoch": 0.15131964809384163, + "grad_norm": 1.4473345607651253, + "learning_rate": 9.921207562105643e-06, + "loss": 0.6649, + "step": 645 + }, + { + "epoch": 0.1515542521994135, + "grad_norm": 1.3709821954549388, + "learning_rate": 9.920481626694945e-06, + "loss": 0.6696, + "step": 646 + }, + { + "epoch": 0.15178885630498534, + "grad_norm": 1.8706584965395907, + "learning_rate": 9.919752389287818e-06, + "loss": 0.6705, + "step": 647 + }, + { + "epoch": 0.1520234604105572, + "grad_norm": 2.249129643505635, + "learning_rate": 9.919019850373635e-06, + "loss": 0.6147, + "step": 648 + }, + { + "epoch": 0.15225806451612903, + "grad_norm": 2.097343552522159, + "learning_rate": 9.918284010443981e-06, + "loss": 0.6334, + "step": 649 + }, + { + "epoch": 0.15249266862170088, + "grad_norm": 1.8768058707443809, + "learning_rate": 9.91754486999266e-06, + "loss": 0.7041, + "step": 650 + }, + { + "epoch": 0.15272727272727274, + "grad_norm": 4.491233851879105, + "learning_rate": 9.916802429515685e-06, + "loss": 0.681, + "step": 651 + }, + { + "epoch": 0.15296187683284457, + "grad_norm": 1.533031987182923, + "learning_rate": 9.916056689511287e-06, + "loss": 0.6536, + "step": 652 + }, + { + "epoch": 0.15319648093841642, + "grad_norm": 1.759911764487037, + "learning_rate": 9.915307650479915e-06, + "loss": 0.6293, + "step": 653 + }, + { + "epoch": 0.15343108504398828, + "grad_norm": 1.2374310964542206, + "learning_rate": 9.914555312924223e-06, + "loss": 0.7126, + "step": 654 + }, + { + "epoch": 0.1536656891495601, + "grad_norm": 1.8037339536830177, + "learning_rate": 9.913799677349087e-06, + "loss": 0.6788, + "step": 655 + }, + { + "epoch": 0.15390029325513196, + "grad_norm": 2.2873799899386493, + "learning_rate": 9.913040744261594e-06, + "loss": 0.6878, + "step": 656 + }, + { + "epoch": 0.15413489736070382, + "grad_norm": 8.552252314012465, + "learning_rate": 9.912278514171037e-06, + "loss": 0.6558, + "step": 657 + }, + { + "epoch": 0.15436950146627565, + "grad_norm": 4.043452822531138, + "learning_rate": 9.911512987588932e-06, + "loss": 0.6074, + "step": 658 + }, + { + "epoch": 0.1546041055718475, + "grad_norm": 1.5929166007016708, + "learning_rate": 9.910744165028998e-06, + "loss": 0.6456, + "step": 659 + }, + { + "epoch": 0.15483870967741936, + "grad_norm": 1.322345644985473, + "learning_rate": 9.909972047007176e-06, + "loss": 0.6527, + "step": 660 + }, + { + "epoch": 0.1550733137829912, + "grad_norm": 1.9134269361860932, + "learning_rate": 9.909196634041607e-06, + "loss": 0.6721, + "step": 661 + }, + { + "epoch": 0.15530791788856305, + "grad_norm": 3.2279892391112743, + "learning_rate": 9.908417926652653e-06, + "loss": 0.7016, + "step": 662 + }, + { + "epoch": 0.1555425219941349, + "grad_norm": 4.605703097073935, + "learning_rate": 9.90763592536288e-06, + "loss": 0.7173, + "step": 663 + }, + { + "epoch": 0.15577712609970676, + "grad_norm": 2.2844915719419006, + "learning_rate": 9.906850630697068e-06, + "loss": 0.684, + "step": 664 + }, + { + "epoch": 0.1560117302052786, + "grad_norm": 1.3671944830965983, + "learning_rate": 9.906062043182205e-06, + "loss": 0.7538, + "step": 665 + }, + { + "epoch": 0.15624633431085044, + "grad_norm": 17.315048588685254, + "learning_rate": 9.905270163347491e-06, + "loss": 0.6221, + "step": 666 + }, + { + "epoch": 0.1564809384164223, + "grad_norm": 1.1715675455606274, + "learning_rate": 9.904474991724335e-06, + "loss": 0.6396, + "step": 667 + }, + { + "epoch": 0.15671554252199413, + "grad_norm": 1.7500860284386124, + "learning_rate": 9.903676528846353e-06, + "loss": 0.6947, + "step": 668 + }, + { + "epoch": 0.15695014662756598, + "grad_norm": 1.3131765793711139, + "learning_rate": 9.90287477524937e-06, + "loss": 0.7223, + "step": 669 + }, + { + "epoch": 0.15718475073313784, + "grad_norm": 5.544168444593145, + "learning_rate": 9.902069731471422e-06, + "loss": 0.6371, + "step": 670 + }, + { + "epoch": 0.15741935483870967, + "grad_norm": 1.5030904405800865, + "learning_rate": 9.90126139805275e-06, + "loss": 0.6581, + "step": 671 + }, + { + "epoch": 0.15765395894428152, + "grad_norm": 2.211411041740967, + "learning_rate": 9.900449775535802e-06, + "loss": 0.6776, + "step": 672 + }, + { + "epoch": 0.15788856304985338, + "grad_norm": 101.66172969743218, + "learning_rate": 9.899634864465242e-06, + "loss": 0.63, + "step": 673 + }, + { + "epoch": 0.1581231671554252, + "grad_norm": 1.6269899042718914, + "learning_rate": 9.898816665387924e-06, + "loss": 0.6985, + "step": 674 + }, + { + "epoch": 0.15835777126099707, + "grad_norm": 25.473981421672807, + "learning_rate": 9.897995178852927e-06, + "loss": 0.6415, + "step": 675 + }, + { + "epoch": 0.15859237536656892, + "grad_norm": 2.7339580672894845, + "learning_rate": 9.897170405411522e-06, + "loss": 0.6727, + "step": 676 + }, + { + "epoch": 0.15882697947214075, + "grad_norm": 4.514988151322233, + "learning_rate": 9.896342345617193e-06, + "loss": 0.6098, + "step": 677 + }, + { + "epoch": 0.1590615835777126, + "grad_norm": 7.419269783067423, + "learning_rate": 9.89551100002563e-06, + "loss": 0.7198, + "step": 678 + }, + { + "epoch": 0.15929618768328446, + "grad_norm": 2.6347918463592412, + "learning_rate": 9.89467636919472e-06, + "loss": 0.6904, + "step": 679 + }, + { + "epoch": 0.15953079178885632, + "grad_norm": 1.6903424361035653, + "learning_rate": 9.893838453684566e-06, + "loss": 0.6073, + "step": 680 + }, + { + "epoch": 0.15976539589442815, + "grad_norm": 4.683500265566674, + "learning_rate": 9.892997254057463e-06, + "loss": 0.6521, + "step": 681 + }, + { + "epoch": 0.16, + "grad_norm": 1.898678756553585, + "learning_rate": 9.892152770877923e-06, + "loss": 0.7192, + "step": 682 + }, + { + "epoch": 0.16023460410557186, + "grad_norm": 4.566830073066921, + "learning_rate": 9.891305004712652e-06, + "loss": 0.6814, + "step": 683 + }, + { + "epoch": 0.1604692082111437, + "grad_norm": 1.5554337393038395, + "learning_rate": 9.890453956130562e-06, + "loss": 0.6378, + "step": 684 + }, + { + "epoch": 0.16070381231671554, + "grad_norm": 1.6568419750114889, + "learning_rate": 9.889599625702765e-06, + "loss": 0.6609, + "step": 685 + }, + { + "epoch": 0.1609384164222874, + "grad_norm": 1.6968141200340794, + "learning_rate": 9.888742014002583e-06, + "loss": 0.6211, + "step": 686 + }, + { + "epoch": 0.16117302052785923, + "grad_norm": 1.1858462092018027, + "learning_rate": 9.887881121605532e-06, + "loss": 0.677, + "step": 687 + }, + { + "epoch": 0.16140762463343108, + "grad_norm": 1.7694121781659031, + "learning_rate": 9.887016949089334e-06, + "loss": 0.6674, + "step": 688 + }, + { + "epoch": 0.16164222873900294, + "grad_norm": 2.193857961379748, + "learning_rate": 9.88614949703391e-06, + "loss": 0.676, + "step": 689 + }, + { + "epoch": 0.16187683284457477, + "grad_norm": 1.0564631760371397, + "learning_rate": 9.885278766021383e-06, + "loss": 0.68, + "step": 690 + }, + { + "epoch": 0.16211143695014663, + "grad_norm": 4.093060650971836, + "learning_rate": 9.884404756636074e-06, + "loss": 0.6445, + "step": 691 + }, + { + "epoch": 0.16234604105571848, + "grad_norm": 3.4529279760731915, + "learning_rate": 9.88352746946451e-06, + "loss": 0.6595, + "step": 692 + }, + { + "epoch": 0.1625806451612903, + "grad_norm": 1.7772841785991123, + "learning_rate": 9.88264690509541e-06, + "loss": 0.649, + "step": 693 + }, + { + "epoch": 0.16281524926686217, + "grad_norm": 2.349712763219154, + "learning_rate": 9.881763064119697e-06, + "loss": 0.6299, + "step": 694 + }, + { + "epoch": 0.16304985337243402, + "grad_norm": 1.954528333429281, + "learning_rate": 9.880875947130492e-06, + "loss": 0.6594, + "step": 695 + }, + { + "epoch": 0.16328445747800588, + "grad_norm": 1.588185836220011, + "learning_rate": 9.879985554723116e-06, + "loss": 0.6469, + "step": 696 + }, + { + "epoch": 0.1635190615835777, + "grad_norm": 1.774096724121723, + "learning_rate": 9.879091887495083e-06, + "loss": 0.6966, + "step": 697 + }, + { + "epoch": 0.16375366568914956, + "grad_norm": 1.328322615162832, + "learning_rate": 9.87819494604611e-06, + "loss": 0.6754, + "step": 698 + }, + { + "epoch": 0.16398826979472142, + "grad_norm": 2.221119835330553, + "learning_rate": 9.877294730978108e-06, + "loss": 0.6782, + "step": 699 + }, + { + "epoch": 0.16422287390029325, + "grad_norm": 1.4702304509045119, + "learning_rate": 9.876391242895188e-06, + "loss": 0.691, + "step": 700 + }, + { + "epoch": 0.1644574780058651, + "grad_norm": 1.9739830661007176, + "learning_rate": 9.875484482403653e-06, + "loss": 0.6662, + "step": 701 + }, + { + "epoch": 0.16469208211143696, + "grad_norm": 2.2302699298247317, + "learning_rate": 9.874574450112005e-06, + "loss": 0.608, + "step": 702 + }, + { + "epoch": 0.1649266862170088, + "grad_norm": 1.8007165172025146, + "learning_rate": 9.87366114663094e-06, + "loss": 0.6431, + "step": 703 + }, + { + "epoch": 0.16516129032258065, + "grad_norm": 1.4797559173895927, + "learning_rate": 9.872744572573352e-06, + "loss": 0.5899, + "step": 704 + }, + { + "epoch": 0.1653958944281525, + "grad_norm": 2.371687957119544, + "learning_rate": 9.871824728554329e-06, + "loss": 0.7164, + "step": 705 + }, + { + "epoch": 0.16563049853372433, + "grad_norm": 3.6577081342450564, + "learning_rate": 9.87090161519115e-06, + "loss": 0.6789, + "step": 706 + }, + { + "epoch": 0.16586510263929619, + "grad_norm": 1.3932301047626219, + "learning_rate": 9.869975233103291e-06, + "loss": 0.6866, + "step": 707 + }, + { + "epoch": 0.16609970674486804, + "grad_norm": 2.0554334361093876, + "learning_rate": 9.869045582912419e-06, + "loss": 0.6793, + "step": 708 + }, + { + "epoch": 0.16633431085043987, + "grad_norm": 2.224053361060325, + "learning_rate": 9.868112665242401e-06, + "loss": 0.6999, + "step": 709 + }, + { + "epoch": 0.16656891495601173, + "grad_norm": 1.6551641825130865, + "learning_rate": 9.867176480719287e-06, + "loss": 0.7042, + "step": 710 + }, + { + "epoch": 0.16680351906158358, + "grad_norm": 1.6116484588806546, + "learning_rate": 9.866237029971327e-06, + "loss": 0.6283, + "step": 711 + }, + { + "epoch": 0.16703812316715544, + "grad_norm": 1.794097210603171, + "learning_rate": 9.865294313628959e-06, + "loss": 0.7035, + "step": 712 + }, + { + "epoch": 0.16727272727272727, + "grad_norm": 1.8271196863853747, + "learning_rate": 9.864348332324811e-06, + "loss": 0.611, + "step": 713 + }, + { + "epoch": 0.16750733137829912, + "grad_norm": 1.3309702239327663, + "learning_rate": 9.863399086693709e-06, + "loss": 0.6366, + "step": 714 + }, + { + "epoch": 0.16774193548387098, + "grad_norm": 3.331889826028672, + "learning_rate": 9.86244657737266e-06, + "loss": 0.6506, + "step": 715 + }, + { + "epoch": 0.1679765395894428, + "grad_norm": 4.773871403810453, + "learning_rate": 9.861490805000868e-06, + "loss": 0.657, + "step": 716 + }, + { + "epoch": 0.16821114369501466, + "grad_norm": 1.6248289349019442, + "learning_rate": 9.860531770219729e-06, + "loss": 0.6469, + "step": 717 + }, + { + "epoch": 0.16844574780058652, + "grad_norm": 1.1194824056463804, + "learning_rate": 9.859569473672816e-06, + "loss": 0.6715, + "step": 718 + }, + { + "epoch": 0.16868035190615835, + "grad_norm": 3.5798125313775007, + "learning_rate": 9.858603916005907e-06, + "loss": 0.6333, + "step": 719 + }, + { + "epoch": 0.1689149560117302, + "grad_norm": 8.283484339955482, + "learning_rate": 9.857635097866955e-06, + "loss": 0.6114, + "step": 720 + }, + { + "epoch": 0.16914956011730206, + "grad_norm": 2.0421584535823474, + "learning_rate": 9.856663019906112e-06, + "loss": 0.6806, + "step": 721 + }, + { + "epoch": 0.1693841642228739, + "grad_norm": 1.8570662954694708, + "learning_rate": 9.855687682775706e-06, + "loss": 0.6636, + "step": 722 + }, + { + "epoch": 0.16961876832844575, + "grad_norm": 2.166770834539674, + "learning_rate": 9.854709087130261e-06, + "loss": 0.6291, + "step": 723 + }, + { + "epoch": 0.1698533724340176, + "grad_norm": 3.239680976176979, + "learning_rate": 9.853727233626485e-06, + "loss": 0.6652, + "step": 724 + }, + { + "epoch": 0.17008797653958943, + "grad_norm": 1.0323438162265144, + "learning_rate": 9.852742122923274e-06, + "loss": 0.6699, + "step": 725 + }, + { + "epoch": 0.1703225806451613, + "grad_norm": 3.40075441491087, + "learning_rate": 9.851753755681703e-06, + "loss": 0.6995, + "step": 726 + }, + { + "epoch": 0.17055718475073314, + "grad_norm": 5.241387160101221, + "learning_rate": 9.850762132565044e-06, + "loss": 0.6687, + "step": 727 + }, + { + "epoch": 0.170791788856305, + "grad_norm": 2.5134466364367536, + "learning_rate": 9.849767254238741e-06, + "loss": 0.687, + "step": 728 + }, + { + "epoch": 0.17102639296187683, + "grad_norm": 3.2450867951663804, + "learning_rate": 9.848769121370434e-06, + "loss": 0.7159, + "step": 729 + }, + { + "epoch": 0.17126099706744868, + "grad_norm": 1.8679718244097496, + "learning_rate": 9.847767734629938e-06, + "loss": 0.6633, + "step": 730 + }, + { + "epoch": 0.17149560117302054, + "grad_norm": 1.989276653817671, + "learning_rate": 9.846763094689255e-06, + "loss": 0.6178, + "step": 731 + }, + { + "epoch": 0.17173020527859237, + "grad_norm": 1.2513787228924333, + "learning_rate": 9.845755202222575e-06, + "loss": 0.6619, + "step": 732 + }, + { + "epoch": 0.17196480938416422, + "grad_norm": 2.136836891169221, + "learning_rate": 9.844744057906263e-06, + "loss": 0.6295, + "step": 733 + }, + { + "epoch": 0.17219941348973608, + "grad_norm": 2.4852757416980555, + "learning_rate": 9.84372966241887e-06, + "loss": 0.6192, + "step": 734 + }, + { + "epoch": 0.1724340175953079, + "grad_norm": 2.4380261568528034, + "learning_rate": 9.842712016441129e-06, + "loss": 0.7001, + "step": 735 + }, + { + "epoch": 0.17266862170087977, + "grad_norm": 1.7994954992881838, + "learning_rate": 9.841691120655952e-06, + "loss": 0.7044, + "step": 736 + }, + { + "epoch": 0.17290322580645162, + "grad_norm": 2.332061873888857, + "learning_rate": 9.840666975748432e-06, + "loss": 0.7257, + "step": 737 + }, + { + "epoch": 0.17313782991202345, + "grad_norm": 1.7137094658452967, + "learning_rate": 9.83963958240585e-06, + "loss": 0.6831, + "step": 738 + }, + { + "epoch": 0.1733724340175953, + "grad_norm": 2.715564110665805, + "learning_rate": 9.838608941317653e-06, + "loss": 0.6834, + "step": 739 + }, + { + "epoch": 0.17360703812316716, + "grad_norm": 1.4855822273183557, + "learning_rate": 9.837575053175479e-06, + "loss": 0.6729, + "step": 740 + }, + { + "epoch": 0.173841642228739, + "grad_norm": 1.7370975706002307, + "learning_rate": 9.836537918673142e-06, + "loss": 0.6588, + "step": 741 + }, + { + "epoch": 0.17407624633431085, + "grad_norm": 1.3785306739940393, + "learning_rate": 9.835497538506634e-06, + "loss": 0.7089, + "step": 742 + }, + { + "epoch": 0.1743108504398827, + "grad_norm": 3.8083059374134676, + "learning_rate": 9.83445391337412e-06, + "loss": 0.6702, + "step": 743 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 1.8835346626639433, + "learning_rate": 9.833407043975952e-06, + "loss": 0.6741, + "step": 744 + }, + { + "epoch": 0.1747800586510264, + "grad_norm": 2.524266850602126, + "learning_rate": 9.832356931014653e-06, + "loss": 0.6245, + "step": 745 + }, + { + "epoch": 0.17501466275659824, + "grad_norm": 1.6516777113546257, + "learning_rate": 9.831303575194926e-06, + "loss": 0.6318, + "step": 746 + }, + { + "epoch": 0.1752492668621701, + "grad_norm": 1.1086329070239551, + "learning_rate": 9.830246977223645e-06, + "loss": 0.7131, + "step": 747 + }, + { + "epoch": 0.17548387096774193, + "grad_norm": 18.679637874630913, + "learning_rate": 9.829187137809865e-06, + "loss": 0.6033, + "step": 748 + }, + { + "epoch": 0.17571847507331378, + "grad_norm": 3.023618669662723, + "learning_rate": 9.828124057664816e-06, + "loss": 0.6851, + "step": 749 + }, + { + "epoch": 0.17595307917888564, + "grad_norm": 1.6983262840922475, + "learning_rate": 9.827057737501897e-06, + "loss": 0.6404, + "step": 750 + }, + { + "epoch": 0.17618768328445747, + "grad_norm": 1.2721188731290216, + "learning_rate": 9.825988178036689e-06, + "loss": 0.6785, + "step": 751 + }, + { + "epoch": 0.17642228739002933, + "grad_norm": 1.1787327418597884, + "learning_rate": 9.824915379986943e-06, + "loss": 0.6468, + "step": 752 + }, + { + "epoch": 0.17665689149560118, + "grad_norm": 3.2793541700112976, + "learning_rate": 9.823839344072582e-06, + "loss": 0.6879, + "step": 753 + }, + { + "epoch": 0.176891495601173, + "grad_norm": 8.160211428506143, + "learning_rate": 9.822760071015702e-06, + "loss": 0.6498, + "step": 754 + }, + { + "epoch": 0.17712609970674487, + "grad_norm": 1.6675553229919307, + "learning_rate": 9.821677561540575e-06, + "loss": 0.6875, + "step": 755 + }, + { + "epoch": 0.17736070381231672, + "grad_norm": 7.943861265740587, + "learning_rate": 9.820591816373642e-06, + "loss": 0.6838, + "step": 756 + }, + { + "epoch": 0.17759530791788855, + "grad_norm": 2.4811259708091935, + "learning_rate": 9.819502836243518e-06, + "loss": 0.6246, + "step": 757 + }, + { + "epoch": 0.1778299120234604, + "grad_norm": 1.8298941304912442, + "learning_rate": 9.818410621880983e-06, + "loss": 0.6349, + "step": 758 + }, + { + "epoch": 0.17806451612903226, + "grad_norm": 2.0813385082191806, + "learning_rate": 9.817315174018993e-06, + "loss": 0.6656, + "step": 759 + }, + { + "epoch": 0.17829912023460412, + "grad_norm": 2.1358142132141134, + "learning_rate": 9.816216493392673e-06, + "loss": 0.7075, + "step": 760 + }, + { + "epoch": 0.17853372434017595, + "grad_norm": 1.427783318018125, + "learning_rate": 9.815114580739316e-06, + "loss": 0.6801, + "step": 761 + }, + { + "epoch": 0.1787683284457478, + "grad_norm": 1.3619377798420857, + "learning_rate": 9.814009436798383e-06, + "loss": 0.6703, + "step": 762 + }, + { + "epoch": 0.17900293255131966, + "grad_norm": 1.4922700171855006, + "learning_rate": 9.812901062311508e-06, + "loss": 0.6294, + "step": 763 + }, + { + "epoch": 0.1792375366568915, + "grad_norm": 1.1267813344280002, + "learning_rate": 9.811789458022487e-06, + "loss": 0.659, + "step": 764 + }, + { + "epoch": 0.17947214076246334, + "grad_norm": 1.058523036042874, + "learning_rate": 9.810674624677288e-06, + "loss": 0.6694, + "step": 765 + }, + { + "epoch": 0.1797067448680352, + "grad_norm": 1.4117029808866475, + "learning_rate": 9.809556563024043e-06, + "loss": 0.6793, + "step": 766 + }, + { + "epoch": 0.17994134897360703, + "grad_norm": 2.5544433600431193, + "learning_rate": 9.808435273813053e-06, + "loss": 0.6806, + "step": 767 + }, + { + "epoch": 0.18017595307917889, + "grad_norm": 1.3183795596647168, + "learning_rate": 9.807310757796782e-06, + "loss": 0.6803, + "step": 768 + }, + { + "epoch": 0.18041055718475074, + "grad_norm": 1.7218640838577965, + "learning_rate": 9.806183015729862e-06, + "loss": 0.7058, + "step": 769 + }, + { + "epoch": 0.18064516129032257, + "grad_norm": 1.5752529981894803, + "learning_rate": 9.805052048369089e-06, + "loss": 0.7531, + "step": 770 + }, + { + "epoch": 0.18087976539589443, + "grad_norm": 3.1420200938798337, + "learning_rate": 9.803917856473423e-06, + "loss": 0.6798, + "step": 771 + }, + { + "epoch": 0.18111436950146628, + "grad_norm": 3.7302991907088736, + "learning_rate": 9.802780440803986e-06, + "loss": 0.6542, + "step": 772 + }, + { + "epoch": 0.1813489736070381, + "grad_norm": 1.7749633842591563, + "learning_rate": 9.801639802124073e-06, + "loss": 0.6246, + "step": 773 + }, + { + "epoch": 0.18158357771260997, + "grad_norm": 2.541138158664817, + "learning_rate": 9.800495941199125e-06, + "loss": 0.6606, + "step": 774 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 1.6776874386735878, + "learning_rate": 9.79934885879676e-06, + "loss": 0.6663, + "step": 775 + }, + { + "epoch": 0.18205278592375368, + "grad_norm": 1.0805462857796522, + "learning_rate": 9.798198555686751e-06, + "loss": 0.6555, + "step": 776 + }, + { + "epoch": 0.1822873900293255, + "grad_norm": 3.0300715489113785, + "learning_rate": 9.797045032641036e-06, + "loss": 0.7376, + "step": 777 + }, + { + "epoch": 0.18252199413489736, + "grad_norm": 1.0673645067009438, + "learning_rate": 9.795888290433709e-06, + "loss": 0.6737, + "step": 778 + }, + { + "epoch": 0.18275659824046922, + "grad_norm": 10.552559588259303, + "learning_rate": 9.79472832984103e-06, + "loss": 0.6717, + "step": 779 + }, + { + "epoch": 0.18299120234604105, + "grad_norm": 4.436167680881757, + "learning_rate": 9.793565151641411e-06, + "loss": 0.6501, + "step": 780 + }, + { + "epoch": 0.1832258064516129, + "grad_norm": 2.055463005158563, + "learning_rate": 9.792398756615435e-06, + "loss": 0.6546, + "step": 781 + }, + { + "epoch": 0.18346041055718476, + "grad_norm": 1.9264261961298847, + "learning_rate": 9.791229145545832e-06, + "loss": 0.6632, + "step": 782 + }, + { + "epoch": 0.1836950146627566, + "grad_norm": 1.5882634716208426, + "learning_rate": 9.790056319217496e-06, + "loss": 0.6561, + "step": 783 + }, + { + "epoch": 0.18392961876832845, + "grad_norm": 1.7661044756680175, + "learning_rate": 9.788880278417479e-06, + "loss": 0.6875, + "step": 784 + }, + { + "epoch": 0.1841642228739003, + "grad_norm": 1.6240516754366172, + "learning_rate": 9.787701023934985e-06, + "loss": 0.6501, + "step": 785 + }, + { + "epoch": 0.18439882697947213, + "grad_norm": 8.357259392450423, + "learning_rate": 9.786518556561383e-06, + "loss": 0.6823, + "step": 786 + }, + { + "epoch": 0.184633431085044, + "grad_norm": 1.4589649174263475, + "learning_rate": 9.78533287709019e-06, + "loss": 0.6444, + "step": 787 + }, + { + "epoch": 0.18486803519061584, + "grad_norm": 1.968470742577988, + "learning_rate": 9.784143986317084e-06, + "loss": 0.6723, + "step": 788 + }, + { + "epoch": 0.18510263929618767, + "grad_norm": 1.2035726817002486, + "learning_rate": 9.782951885039897e-06, + "loss": 0.6581, + "step": 789 + }, + { + "epoch": 0.18533724340175953, + "grad_norm": 1.0962656011799101, + "learning_rate": 9.781756574058614e-06, + "loss": 0.666, + "step": 790 + }, + { + "epoch": 0.18557184750733138, + "grad_norm": 2.2196841349727054, + "learning_rate": 9.780558054175373e-06, + "loss": 0.7009, + "step": 791 + }, + { + "epoch": 0.18580645161290324, + "grad_norm": 1.2976460256622553, + "learning_rate": 9.779356326194466e-06, + "loss": 0.6668, + "step": 792 + }, + { + "epoch": 0.18604105571847507, + "grad_norm": 1.4330922832281572, + "learning_rate": 9.77815139092234e-06, + "loss": 0.6858, + "step": 793 + }, + { + "epoch": 0.18627565982404692, + "grad_norm": 1.1778095642510038, + "learning_rate": 9.776943249167595e-06, + "loss": 0.6462, + "step": 794 + }, + { + "epoch": 0.18651026392961878, + "grad_norm": 2.084625097304774, + "learning_rate": 9.77573190174098e-06, + "loss": 0.6666, + "step": 795 + }, + { + "epoch": 0.1867448680351906, + "grad_norm": 1.3980507530165778, + "learning_rate": 9.774517349455393e-06, + "loss": 0.694, + "step": 796 + }, + { + "epoch": 0.18697947214076246, + "grad_norm": 2.6094745942939963, + "learning_rate": 9.77329959312589e-06, + "loss": 0.6127, + "step": 797 + }, + { + "epoch": 0.18721407624633432, + "grad_norm": 1.8460884345290358, + "learning_rate": 9.772078633569671e-06, + "loss": 0.7003, + "step": 798 + }, + { + "epoch": 0.18744868035190615, + "grad_norm": 1.9135578730776541, + "learning_rate": 9.770854471606086e-06, + "loss": 0.6962, + "step": 799 + }, + { + "epoch": 0.187683284457478, + "grad_norm": 1.2722374112691082, + "learning_rate": 9.769627108056638e-06, + "loss": 0.6594, + "step": 800 + }, + { + "epoch": 0.18791788856304986, + "grad_norm": 1.9797278719422926, + "learning_rate": 9.768396543744976e-06, + "loss": 0.6509, + "step": 801 + }, + { + "epoch": 0.1881524926686217, + "grad_norm": 1.3958306309948436, + "learning_rate": 9.767162779496894e-06, + "loss": 0.5916, + "step": 802 + }, + { + "epoch": 0.18838709677419355, + "grad_norm": 5.267892629337839, + "learning_rate": 9.76592581614034e-06, + "loss": 0.6734, + "step": 803 + }, + { + "epoch": 0.1886217008797654, + "grad_norm": 1.605813259416675, + "learning_rate": 9.764685654505405e-06, + "loss": 0.6719, + "step": 804 + }, + { + "epoch": 0.18885630498533723, + "grad_norm": 2.5451781138228275, + "learning_rate": 9.763442295424323e-06, + "loss": 0.6623, + "step": 805 + }, + { + "epoch": 0.1890909090909091, + "grad_norm": 3.475430507902762, + "learning_rate": 9.762195739731482e-06, + "loss": 0.6365, + "step": 806 + }, + { + "epoch": 0.18932551319648094, + "grad_norm": 6.680299483411663, + "learning_rate": 9.760945988263406e-06, + "loss": 0.6101, + "step": 807 + }, + { + "epoch": 0.1895601173020528, + "grad_norm": 5.3399812265684865, + "learning_rate": 9.75969304185877e-06, + "loss": 0.6359, + "step": 808 + }, + { + "epoch": 0.18979472140762463, + "grad_norm": 2.407122041291883, + "learning_rate": 9.758436901358391e-06, + "loss": 0.6736, + "step": 809 + }, + { + "epoch": 0.19002932551319648, + "grad_norm": 2.9490572761081375, + "learning_rate": 9.757177567605229e-06, + "loss": 0.6364, + "step": 810 + }, + { + "epoch": 0.19026392961876834, + "grad_norm": 6.217157809007754, + "learning_rate": 9.755915041444389e-06, + "loss": 0.671, + "step": 811 + }, + { + "epoch": 0.19049853372434017, + "grad_norm": 1.9173672087643958, + "learning_rate": 9.754649323723113e-06, + "loss": 0.6805, + "step": 812 + }, + { + "epoch": 0.19073313782991203, + "grad_norm": 1.7146151689127684, + "learning_rate": 9.753380415290793e-06, + "loss": 0.6029, + "step": 813 + }, + { + "epoch": 0.19096774193548388, + "grad_norm": 3.5549682056714245, + "learning_rate": 9.752108316998955e-06, + "loss": 0.6708, + "step": 814 + }, + { + "epoch": 0.1912023460410557, + "grad_norm": 4.602087148937077, + "learning_rate": 9.750833029701268e-06, + "loss": 0.6749, + "step": 815 + }, + { + "epoch": 0.19143695014662757, + "grad_norm": 1.3035557920997363, + "learning_rate": 9.749554554253542e-06, + "loss": 0.6647, + "step": 816 + }, + { + "epoch": 0.19167155425219942, + "grad_norm": 1.2639233011244417, + "learning_rate": 9.748272891513727e-06, + "loss": 0.6321, + "step": 817 + }, + { + "epoch": 0.19190615835777125, + "grad_norm": 1.8385936360209119, + "learning_rate": 9.746988042341907e-06, + "loss": 0.6579, + "step": 818 + }, + { + "epoch": 0.1921407624633431, + "grad_norm": 1.6376446434078913, + "learning_rate": 9.745700007600312e-06, + "loss": 0.6757, + "step": 819 + }, + { + "epoch": 0.19237536656891496, + "grad_norm": 2.9475770551988254, + "learning_rate": 9.744408788153305e-06, + "loss": 0.6732, + "step": 820 + }, + { + "epoch": 0.1926099706744868, + "grad_norm": 1.172714115730443, + "learning_rate": 9.743114384867387e-06, + "loss": 0.6162, + "step": 821 + }, + { + "epoch": 0.19284457478005865, + "grad_norm": 2.1645302554961603, + "learning_rate": 9.741816798611193e-06, + "loss": 0.6618, + "step": 822 + }, + { + "epoch": 0.1930791788856305, + "grad_norm": 2.7584078984953635, + "learning_rate": 9.7405160302555e-06, + "loss": 0.6776, + "step": 823 + }, + { + "epoch": 0.19331378299120236, + "grad_norm": 2.817753347077862, + "learning_rate": 9.739212080673215e-06, + "loss": 0.6836, + "step": 824 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 1.3924384398039422, + "learning_rate": 9.73790495073938e-06, + "loss": 0.6815, + "step": 825 + }, + { + "epoch": 0.19378299120234604, + "grad_norm": 2.7466754222436913, + "learning_rate": 9.736594641331178e-06, + "loss": 0.6706, + "step": 826 + }, + { + "epoch": 0.1940175953079179, + "grad_norm": 1.4690068934712832, + "learning_rate": 9.735281153327916e-06, + "loss": 0.6652, + "step": 827 + }, + { + "epoch": 0.19425219941348973, + "grad_norm": 0.9912171222047818, + "learning_rate": 9.733964487611044e-06, + "loss": 0.6923, + "step": 828 + }, + { + "epoch": 0.19448680351906159, + "grad_norm": 2.3940548177009275, + "learning_rate": 9.73264464506413e-06, + "loss": 0.6826, + "step": 829 + }, + { + "epoch": 0.19472140762463344, + "grad_norm": 5.982181001504478, + "learning_rate": 9.731321626572894e-06, + "loss": 0.6929, + "step": 830 + }, + { + "epoch": 0.19495601173020527, + "grad_norm": 1.3475966109935826, + "learning_rate": 9.729995433025169e-06, + "loss": 0.7073, + "step": 831 + }, + { + "epoch": 0.19519061583577713, + "grad_norm": 1.043564031113599, + "learning_rate": 9.728666065310929e-06, + "loss": 0.6182, + "step": 832 + }, + { + "epoch": 0.19542521994134898, + "grad_norm": 1.7350303914688778, + "learning_rate": 9.727333524322274e-06, + "loss": 0.668, + "step": 833 + }, + { + "epoch": 0.1956598240469208, + "grad_norm": 2.0000050782238272, + "learning_rate": 9.725997810953436e-06, + "loss": 0.6098, + "step": 834 + }, + { + "epoch": 0.19589442815249267, + "grad_norm": 1.291321907822395, + "learning_rate": 9.724658926100775e-06, + "loss": 0.6728, + "step": 835 + }, + { + "epoch": 0.19612903225806452, + "grad_norm": 1.763048699714866, + "learning_rate": 9.723316870662775e-06, + "loss": 0.6375, + "step": 836 + }, + { + "epoch": 0.19636363636363635, + "grad_norm": 2.3531051325550982, + "learning_rate": 9.721971645540055e-06, + "loss": 0.6648, + "step": 837 + }, + { + "epoch": 0.1965982404692082, + "grad_norm": 2.1961219586654317, + "learning_rate": 9.720623251635358e-06, + "loss": 0.6738, + "step": 838 + }, + { + "epoch": 0.19683284457478006, + "grad_norm": 3.7024000150930774, + "learning_rate": 9.71927168985355e-06, + "loss": 0.6566, + "step": 839 + }, + { + "epoch": 0.19706744868035192, + "grad_norm": 2.624254973613114, + "learning_rate": 9.717916961101632e-06, + "loss": 0.6579, + "step": 840 + }, + { + "epoch": 0.19730205278592375, + "grad_norm": 2.1037066238923123, + "learning_rate": 9.716559066288716e-06, + "loss": 0.6406, + "step": 841 + }, + { + "epoch": 0.1975366568914956, + "grad_norm": 1.1942497303256887, + "learning_rate": 9.715198006326053e-06, + "loss": 0.7186, + "step": 842 + }, + { + "epoch": 0.19777126099706746, + "grad_norm": 2.45649961745645, + "learning_rate": 9.713833782127008e-06, + "loss": 0.7026, + "step": 843 + }, + { + "epoch": 0.1980058651026393, + "grad_norm": 1.2871905539089525, + "learning_rate": 9.712466394607079e-06, + "loss": 0.6928, + "step": 844 + }, + { + "epoch": 0.19824046920821115, + "grad_norm": 1.5272448501545626, + "learning_rate": 9.711095844683875e-06, + "loss": 0.645, + "step": 845 + }, + { + "epoch": 0.198475073313783, + "grad_norm": 1.5635761481172537, + "learning_rate": 9.709722133277137e-06, + "loss": 0.6785, + "step": 846 + }, + { + "epoch": 0.19870967741935483, + "grad_norm": 0.9778959639272299, + "learning_rate": 9.708345261308724e-06, + "loss": 0.6714, + "step": 847 + }, + { + "epoch": 0.19894428152492669, + "grad_norm": 1.7864809366163665, + "learning_rate": 9.706965229702614e-06, + "loss": 0.6575, + "step": 848 + }, + { + "epoch": 0.19917888563049854, + "grad_norm": 1.2864871417843071, + "learning_rate": 9.705582039384907e-06, + "loss": 0.6633, + "step": 849 + }, + { + "epoch": 0.19941348973607037, + "grad_norm": 2.7203996971022395, + "learning_rate": 9.704195691283826e-06, + "loss": 0.6957, + "step": 850 + }, + { + "epoch": 0.19964809384164223, + "grad_norm": 1.730215994334602, + "learning_rate": 9.702806186329707e-06, + "loss": 0.7061, + "step": 851 + }, + { + "epoch": 0.19988269794721408, + "grad_norm": 4.388761434832178, + "learning_rate": 9.70141352545501e-06, + "loss": 0.6612, + "step": 852 + }, + { + "epoch": 0.2001173020527859, + "grad_norm": 4.265248714766533, + "learning_rate": 9.70001770959431e-06, + "loss": 0.644, + "step": 853 + }, + { + "epoch": 0.20035190615835777, + "grad_norm": 1.068965180050243, + "learning_rate": 9.698618739684298e-06, + "loss": 0.6442, + "step": 854 + }, + { + "epoch": 0.20058651026392962, + "grad_norm": 1.3756426647548368, + "learning_rate": 9.697216616663782e-06, + "loss": 0.6561, + "step": 855 + }, + { + "epoch": 0.20082111436950148, + "grad_norm": 1.766210240227299, + "learning_rate": 9.695811341473694e-06, + "loss": 0.6247, + "step": 856 + }, + { + "epoch": 0.2010557184750733, + "grad_norm": 2.160118229146718, + "learning_rate": 9.694402915057066e-06, + "loss": 0.6571, + "step": 857 + }, + { + "epoch": 0.20129032258064516, + "grad_norm": 2.469393549137364, + "learning_rate": 9.69299133835906e-06, + "loss": 0.5879, + "step": 858 + }, + { + "epoch": 0.20152492668621702, + "grad_norm": 1.5273736674661544, + "learning_rate": 9.691576612326941e-06, + "loss": 0.6743, + "step": 859 + }, + { + "epoch": 0.20175953079178885, + "grad_norm": 1.0668532110943905, + "learning_rate": 9.690158737910093e-06, + "loss": 0.6445, + "step": 860 + }, + { + "epoch": 0.2019941348973607, + "grad_norm": 2.890483837414929, + "learning_rate": 9.688737716060014e-06, + "loss": 0.6298, + "step": 861 + }, + { + "epoch": 0.20222873900293256, + "grad_norm": 1.843801407096205, + "learning_rate": 9.687313547730308e-06, + "loss": 0.6854, + "step": 862 + }, + { + "epoch": 0.2024633431085044, + "grad_norm": 0.9286428173977231, + "learning_rate": 9.685886233876696e-06, + "loss": 0.6231, + "step": 863 + }, + { + "epoch": 0.20269794721407625, + "grad_norm": 1.4125851665859233, + "learning_rate": 9.684455775457009e-06, + "loss": 0.6245, + "step": 864 + }, + { + "epoch": 0.2029325513196481, + "grad_norm": 6.073805403505272, + "learning_rate": 9.683022173431184e-06, + "loss": 0.6709, + "step": 865 + }, + { + "epoch": 0.20316715542521993, + "grad_norm": 0.9691315830576945, + "learning_rate": 9.681585428761272e-06, + "loss": 0.6798, + "step": 866 + }, + { + "epoch": 0.2034017595307918, + "grad_norm": 2.5907417889006443, + "learning_rate": 9.680145542411436e-06, + "loss": 0.6738, + "step": 867 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 11.722014166762376, + "learning_rate": 9.678702515347937e-06, + "loss": 0.5994, + "step": 868 + }, + { + "epoch": 0.20387096774193547, + "grad_norm": 1.6480056607813116, + "learning_rate": 9.677256348539154e-06, + "loss": 0.6621, + "step": 869 + }, + { + "epoch": 0.20410557184750733, + "grad_norm": 2.1007903859154027, + "learning_rate": 9.675807042955567e-06, + "loss": 0.7475, + "step": 870 + }, + { + "epoch": 0.20434017595307918, + "grad_norm": 1.0714730702482032, + "learning_rate": 9.674354599569767e-06, + "loss": 0.6802, + "step": 871 + }, + { + "epoch": 0.20457478005865104, + "grad_norm": 1.5835331150662713, + "learning_rate": 9.672899019356442e-06, + "loss": 0.6558, + "step": 872 + }, + { + "epoch": 0.20480938416422287, + "grad_norm": 1.2478724145092908, + "learning_rate": 9.671440303292395e-06, + "loss": 0.6264, + "step": 873 + }, + { + "epoch": 0.20504398826979472, + "grad_norm": 1.8847929441323756, + "learning_rate": 9.669978452356528e-06, + "loss": 0.6572, + "step": 874 + }, + { + "epoch": 0.20527859237536658, + "grad_norm": 1.1087069245802912, + "learning_rate": 9.66851346752985e-06, + "loss": 0.7131, + "step": 875 + }, + { + "epoch": 0.2055131964809384, + "grad_norm": 1.6276937142263947, + "learning_rate": 9.667045349795468e-06, + "loss": 0.647, + "step": 876 + }, + { + "epoch": 0.20574780058651027, + "grad_norm": 2.9117212268666752, + "learning_rate": 9.665574100138595e-06, + "loss": 0.674, + "step": 877 + }, + { + "epoch": 0.20598240469208212, + "grad_norm": 1.802950641271152, + "learning_rate": 9.664099719546547e-06, + "loss": 0.643, + "step": 878 + }, + { + "epoch": 0.20621700879765395, + "grad_norm": 2.04554378713996, + "learning_rate": 9.662622209008735e-06, + "loss": 0.6304, + "step": 879 + }, + { + "epoch": 0.2064516129032258, + "grad_norm": 2.089329297788112, + "learning_rate": 9.661141569516682e-06, + "loss": 0.6362, + "step": 880 + }, + { + "epoch": 0.20668621700879766, + "grad_norm": 1.1748100618556383, + "learning_rate": 9.659657802063996e-06, + "loss": 0.6586, + "step": 881 + }, + { + "epoch": 0.2069208211143695, + "grad_norm": 2.1537283721903457, + "learning_rate": 9.658170907646396e-06, + "loss": 0.6507, + "step": 882 + }, + { + "epoch": 0.20715542521994135, + "grad_norm": 1.1763604556671938, + "learning_rate": 9.656680887261693e-06, + "loss": 0.6495, + "step": 883 + }, + { + "epoch": 0.2073900293255132, + "grad_norm": 1.760765827261432, + "learning_rate": 9.655187741909797e-06, + "loss": 0.6522, + "step": 884 + }, + { + "epoch": 0.20762463343108503, + "grad_norm": 2.101601249166311, + "learning_rate": 9.653691472592718e-06, + "loss": 0.6592, + "step": 885 + }, + { + "epoch": 0.2078592375366569, + "grad_norm": 1.5337659374549815, + "learning_rate": 9.65219208031456e-06, + "loss": 0.6481, + "step": 886 + }, + { + "epoch": 0.20809384164222874, + "grad_norm": 1.4401051067751012, + "learning_rate": 9.65068956608152e-06, + "loss": 0.6675, + "step": 887 + }, + { + "epoch": 0.2083284457478006, + "grad_norm": 1.2219105521679687, + "learning_rate": 9.649183930901895e-06, + "loss": 0.6642, + "step": 888 + }, + { + "epoch": 0.20856304985337243, + "grad_norm": 1.201737906274961, + "learning_rate": 9.647675175786075e-06, + "loss": 0.6724, + "step": 889 + }, + { + "epoch": 0.20879765395894428, + "grad_norm": 3.073136252336065, + "learning_rate": 9.64616330174654e-06, + "loss": 0.6887, + "step": 890 + }, + { + "epoch": 0.20903225806451614, + "grad_norm": 1.7337964061237228, + "learning_rate": 9.644648309797871e-06, + "loss": 0.6434, + "step": 891 + }, + { + "epoch": 0.20926686217008797, + "grad_norm": 2.140464167952869, + "learning_rate": 9.643130200956732e-06, + "loss": 0.6565, + "step": 892 + }, + { + "epoch": 0.20950146627565983, + "grad_norm": 1.7400320702775463, + "learning_rate": 9.641608976241883e-06, + "loss": 0.6312, + "step": 893 + }, + { + "epoch": 0.20973607038123168, + "grad_norm": 3.1966431953328693, + "learning_rate": 9.640084636674179e-06, + "loss": 0.6036, + "step": 894 + }, + { + "epoch": 0.2099706744868035, + "grad_norm": 0.9554403450244524, + "learning_rate": 9.638557183276556e-06, + "loss": 0.7085, + "step": 895 + }, + { + "epoch": 0.21020527859237537, + "grad_norm": 1.498413589018548, + "learning_rate": 9.63702661707405e-06, + "loss": 0.6753, + "step": 896 + }, + { + "epoch": 0.21043988269794722, + "grad_norm": 1.8207602804392655, + "learning_rate": 9.635492939093778e-06, + "loss": 0.7039, + "step": 897 + }, + { + "epoch": 0.21067448680351905, + "grad_norm": 1.631255264135716, + "learning_rate": 9.633956150364948e-06, + "loss": 0.6373, + "step": 898 + }, + { + "epoch": 0.2109090909090909, + "grad_norm": 2.109503092325333, + "learning_rate": 9.632416251918855e-06, + "loss": 0.6666, + "step": 899 + }, + { + "epoch": 0.21114369501466276, + "grad_norm": 1.4734913865053512, + "learning_rate": 9.630873244788884e-06, + "loss": 0.6793, + "step": 900 + }, + { + "epoch": 0.2113782991202346, + "grad_norm": 1.4997857924939262, + "learning_rate": 9.629327130010501e-06, + "loss": 0.6083, + "step": 901 + }, + { + "epoch": 0.21161290322580645, + "grad_norm": 2.307097896233073, + "learning_rate": 9.62777790862126e-06, + "loss": 0.6057, + "step": 902 + }, + { + "epoch": 0.2118475073313783, + "grad_norm": 1.1774544123047836, + "learning_rate": 9.626225581660802e-06, + "loss": 0.656, + "step": 903 + }, + { + "epoch": 0.21208211143695016, + "grad_norm": 0.9239882041669124, + "learning_rate": 9.62467015017085e-06, + "loss": 0.6021, + "step": 904 + }, + { + "epoch": 0.212316715542522, + "grad_norm": 3.663284148439789, + "learning_rate": 9.623111615195206e-06, + "loss": 0.6102, + "step": 905 + }, + { + "epoch": 0.21255131964809384, + "grad_norm": 2.22539971046485, + "learning_rate": 9.621549977779765e-06, + "loss": 0.6936, + "step": 906 + }, + { + "epoch": 0.2127859237536657, + "grad_norm": 1.901468590290738, + "learning_rate": 9.61998523897249e-06, + "loss": 0.6553, + "step": 907 + }, + { + "epoch": 0.21302052785923753, + "grad_norm": 1.6609483168416135, + "learning_rate": 9.618417399823442e-06, + "loss": 0.6995, + "step": 908 + }, + { + "epoch": 0.21325513196480939, + "grad_norm": 1.7380882151027832, + "learning_rate": 9.616846461384748e-06, + "loss": 0.6501, + "step": 909 + }, + { + "epoch": 0.21348973607038124, + "grad_norm": 1.8347208426109232, + "learning_rate": 9.61527242471062e-06, + "loss": 0.6866, + "step": 910 + }, + { + "epoch": 0.21372434017595307, + "grad_norm": 3.5676223816730133, + "learning_rate": 9.613695290857352e-06, + "loss": 0.669, + "step": 911 + }, + { + "epoch": 0.21395894428152493, + "grad_norm": 1.191285952404426, + "learning_rate": 9.612115060883315e-06, + "loss": 0.653, + "step": 912 + }, + { + "epoch": 0.21419354838709678, + "grad_norm": 1.6788989812175175, + "learning_rate": 9.610531735848953e-06, + "loss": 0.6226, + "step": 913 + }, + { + "epoch": 0.2144281524926686, + "grad_norm": 1.4586256760129428, + "learning_rate": 9.608945316816794e-06, + "loss": 0.6539, + "step": 914 + }, + { + "epoch": 0.21466275659824047, + "grad_norm": 1.403774897238066, + "learning_rate": 9.607355804851437e-06, + "loss": 0.6669, + "step": 915 + }, + { + "epoch": 0.21489736070381232, + "grad_norm": 2.0700943384283157, + "learning_rate": 9.60576320101956e-06, + "loss": 0.6422, + "step": 916 + }, + { + "epoch": 0.21513196480938415, + "grad_norm": 1.2489560665864599, + "learning_rate": 9.604167506389915e-06, + "loss": 0.6755, + "step": 917 + }, + { + "epoch": 0.215366568914956, + "grad_norm": 1.1549235647914915, + "learning_rate": 9.602568722033325e-06, + "loss": 0.6623, + "step": 918 + }, + { + "epoch": 0.21560117302052786, + "grad_norm": 1.8360283762120344, + "learning_rate": 9.600966849022692e-06, + "loss": 0.6891, + "step": 919 + }, + { + "epoch": 0.21583577712609972, + "grad_norm": 1.0393580997748637, + "learning_rate": 9.599361888432987e-06, + "loss": 0.6497, + "step": 920 + }, + { + "epoch": 0.21607038123167155, + "grad_norm": 1.7945581650248654, + "learning_rate": 9.597753841341253e-06, + "loss": 0.6318, + "step": 921 + }, + { + "epoch": 0.2163049853372434, + "grad_norm": 1.6188242066426033, + "learning_rate": 9.596142708826604e-06, + "loss": 0.6192, + "step": 922 + }, + { + "epoch": 0.21653958944281526, + "grad_norm": 2.8364396280552726, + "learning_rate": 9.594528491970228e-06, + "loss": 0.6287, + "step": 923 + }, + { + "epoch": 0.2167741935483871, + "grad_norm": 1.317938874984234, + "learning_rate": 9.592911191855381e-06, + "loss": 0.614, + "step": 924 + }, + { + "epoch": 0.21700879765395895, + "grad_norm": 1.2506757855251625, + "learning_rate": 9.591290809567384e-06, + "loss": 0.6462, + "step": 925 + }, + { + "epoch": 0.2172434017595308, + "grad_norm": 1.5113552648070603, + "learning_rate": 9.589667346193632e-06, + "loss": 0.69, + "step": 926 + }, + { + "epoch": 0.21747800586510263, + "grad_norm": 3.007533355217859, + "learning_rate": 9.588040802823585e-06, + "loss": 0.6632, + "step": 927 + }, + { + "epoch": 0.2177126099706745, + "grad_norm": 2.2544121182751016, + "learning_rate": 9.586411180548771e-06, + "loss": 0.698, + "step": 928 + }, + { + "epoch": 0.21794721407624634, + "grad_norm": 1.6804815608054733, + "learning_rate": 9.584778480462785e-06, + "loss": 0.6661, + "step": 929 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 1.8591404754299143, + "learning_rate": 9.58314270366128e-06, + "loss": 0.6638, + "step": 930 + }, + { + "epoch": 0.21841642228739003, + "grad_norm": 1.7913063387462433, + "learning_rate": 9.581503851241987e-06, + "loss": 0.6493, + "step": 931 + }, + { + "epoch": 0.21865102639296188, + "grad_norm": 1.296486739763903, + "learning_rate": 9.57986192430469e-06, + "loss": 0.6721, + "step": 932 + }, + { + "epoch": 0.2188856304985337, + "grad_norm": 1.458842281608999, + "learning_rate": 9.57821692395124e-06, + "loss": 0.6794, + "step": 933 + }, + { + "epoch": 0.21912023460410557, + "grad_norm": 1.8068416750746825, + "learning_rate": 9.57656885128555e-06, + "loss": 0.6618, + "step": 934 + }, + { + "epoch": 0.21935483870967742, + "grad_norm": 1.4064094091161847, + "learning_rate": 9.574917707413596e-06, + "loss": 0.618, + "step": 935 + }, + { + "epoch": 0.21958944281524928, + "grad_norm": 4.204221566331364, + "learning_rate": 9.573263493443415e-06, + "loss": 0.6702, + "step": 936 + }, + { + "epoch": 0.2198240469208211, + "grad_norm": 3.8912619628950247, + "learning_rate": 9.5716062104851e-06, + "loss": 0.685, + "step": 937 + }, + { + "epoch": 0.22005865102639297, + "grad_norm": 4.04575953654595, + "learning_rate": 9.56994585965081e-06, + "loss": 0.6739, + "step": 938 + }, + { + "epoch": 0.22029325513196482, + "grad_norm": 1.0417380395271678, + "learning_rate": 9.568282442054759e-06, + "loss": 0.6571, + "step": 939 + }, + { + "epoch": 0.22052785923753665, + "grad_norm": 1.1622667689308102, + "learning_rate": 9.566615958813219e-06, + "loss": 0.6343, + "step": 940 + }, + { + "epoch": 0.2207624633431085, + "grad_norm": 2.272194661714964, + "learning_rate": 9.56494641104452e-06, + "loss": 0.6957, + "step": 941 + }, + { + "epoch": 0.22099706744868036, + "grad_norm": 4.689768435648827, + "learning_rate": 9.563273799869048e-06, + "loss": 0.5986, + "step": 942 + }, + { + "epoch": 0.2212316715542522, + "grad_norm": 2.322851221802614, + "learning_rate": 9.561598126409247e-06, + "loss": 0.6414, + "step": 943 + }, + { + "epoch": 0.22146627565982405, + "grad_norm": 1.2832930768450737, + "learning_rate": 9.559919391789611e-06, + "loss": 0.6687, + "step": 944 + }, + { + "epoch": 0.2217008797653959, + "grad_norm": 1.3127832267230448, + "learning_rate": 9.558237597136695e-06, + "loss": 0.667, + "step": 945 + }, + { + "epoch": 0.22193548387096773, + "grad_norm": 1.061136215661653, + "learning_rate": 9.556552743579102e-06, + "loss": 0.6333, + "step": 946 + }, + { + "epoch": 0.2221700879765396, + "grad_norm": 1.310962500440387, + "learning_rate": 9.554864832247488e-06, + "loss": 0.6293, + "step": 947 + }, + { + "epoch": 0.22240469208211144, + "grad_norm": 1.847743699079574, + "learning_rate": 9.553173864274567e-06, + "loss": 0.6147, + "step": 948 + }, + { + "epoch": 0.22263929618768327, + "grad_norm": 1.2710568543733551, + "learning_rate": 9.551479840795098e-06, + "loss": 0.6437, + "step": 949 + }, + { + "epoch": 0.22287390029325513, + "grad_norm": 7.601474393023154, + "learning_rate": 9.54978276294589e-06, + "loss": 0.6349, + "step": 950 + }, + { + "epoch": 0.22310850439882698, + "grad_norm": 1.415799715852848, + "learning_rate": 9.548082631865806e-06, + "loss": 0.6329, + "step": 951 + }, + { + "epoch": 0.22334310850439884, + "grad_norm": 1.503562399006367, + "learning_rate": 9.546379448695758e-06, + "loss": 0.676, + "step": 952 + }, + { + "epoch": 0.22357771260997067, + "grad_norm": 1.5623571065914645, + "learning_rate": 9.544673214578699e-06, + "loss": 0.7151, + "step": 953 + }, + { + "epoch": 0.22381231671554253, + "grad_norm": 4.279176198980748, + "learning_rate": 9.54296393065964e-06, + "loss": 0.6018, + "step": 954 + }, + { + "epoch": 0.22404692082111438, + "grad_norm": 8.779017143517844, + "learning_rate": 9.541251598085631e-06, + "loss": 0.6721, + "step": 955 + }, + { + "epoch": 0.2242815249266862, + "grad_norm": 2.644174124377122, + "learning_rate": 9.53953621800577e-06, + "loss": 0.5881, + "step": 956 + }, + { + "epoch": 0.22451612903225807, + "grad_norm": 1.7114965928788233, + "learning_rate": 9.5378177915712e-06, + "loss": 0.6229, + "step": 957 + }, + { + "epoch": 0.22475073313782992, + "grad_norm": 1.1050818447192206, + "learning_rate": 9.536096319935109e-06, + "loss": 0.6163, + "step": 958 + }, + { + "epoch": 0.22498533724340175, + "grad_norm": 1.6257590344746873, + "learning_rate": 9.534371804252727e-06, + "loss": 0.6568, + "step": 959 + }, + { + "epoch": 0.2252199413489736, + "grad_norm": 1.0705841536133995, + "learning_rate": 9.53264424568133e-06, + "loss": 0.6444, + "step": 960 + }, + { + "epoch": 0.22545454545454546, + "grad_norm": 1.6502909334600855, + "learning_rate": 9.530913645380233e-06, + "loss": 0.693, + "step": 961 + }, + { + "epoch": 0.2256891495601173, + "grad_norm": 5.6394366718967515, + "learning_rate": 9.529180004510791e-06, + "loss": 0.6618, + "step": 962 + }, + { + "epoch": 0.22592375366568915, + "grad_norm": 8.228928112712278, + "learning_rate": 9.527443324236403e-06, + "loss": 0.7084, + "step": 963 + }, + { + "epoch": 0.226158357771261, + "grad_norm": 1.7709964191958698, + "learning_rate": 9.525703605722508e-06, + "loss": 0.5879, + "step": 964 + }, + { + "epoch": 0.22639296187683283, + "grad_norm": 0.9300113928923643, + "learning_rate": 9.52396085013658e-06, + "loss": 0.6626, + "step": 965 + }, + { + "epoch": 0.2266275659824047, + "grad_norm": 1.3345040668724075, + "learning_rate": 9.522215058648132e-06, + "loss": 0.668, + "step": 966 + }, + { + "epoch": 0.22686217008797654, + "grad_norm": 6.219237350890805, + "learning_rate": 9.520466232428717e-06, + "loss": 0.6748, + "step": 967 + }, + { + "epoch": 0.2270967741935484, + "grad_norm": 1.9923763181200986, + "learning_rate": 9.518714372651922e-06, + "loss": 0.6387, + "step": 968 + }, + { + "epoch": 0.22733137829912023, + "grad_norm": 1.4539746482582692, + "learning_rate": 9.516959480493373e-06, + "loss": 0.6638, + "step": 969 + }, + { + "epoch": 0.22756598240469209, + "grad_norm": 1.6238750637770847, + "learning_rate": 9.515201557130726e-06, + "loss": 0.6723, + "step": 970 + }, + { + "epoch": 0.22780058651026394, + "grad_norm": 1.2739388924616808, + "learning_rate": 9.513440603743676e-06, + "loss": 0.6362, + "step": 971 + }, + { + "epoch": 0.22803519061583577, + "grad_norm": 1.6651793574871914, + "learning_rate": 9.511676621513947e-06, + "loss": 0.6363, + "step": 972 + }, + { + "epoch": 0.22826979472140763, + "grad_norm": 1.2091741125938216, + "learning_rate": 9.509909611625298e-06, + "loss": 0.6081, + "step": 973 + }, + { + "epoch": 0.22850439882697948, + "grad_norm": 1.2741025241549604, + "learning_rate": 9.508139575263522e-06, + "loss": 0.6316, + "step": 974 + }, + { + "epoch": 0.2287390029325513, + "grad_norm": 2.724056489379569, + "learning_rate": 9.506366513616439e-06, + "loss": 0.7075, + "step": 975 + }, + { + "epoch": 0.22897360703812317, + "grad_norm": 3.217519450192786, + "learning_rate": 9.504590427873897e-06, + "loss": 0.6302, + "step": 976 + }, + { + "epoch": 0.22920821114369502, + "grad_norm": 6.242868733111834, + "learning_rate": 9.502811319227783e-06, + "loss": 0.6743, + "step": 977 + }, + { + "epoch": 0.22944281524926685, + "grad_norm": 1.3589476990754552, + "learning_rate": 9.501029188872005e-06, + "loss": 0.684, + "step": 978 + }, + { + "epoch": 0.2296774193548387, + "grad_norm": 1.3256733080498688, + "learning_rate": 9.499244038002495e-06, + "loss": 0.6038, + "step": 979 + }, + { + "epoch": 0.22991202346041056, + "grad_norm": 1.2609101984988522, + "learning_rate": 9.497455867817225e-06, + "loss": 0.6731, + "step": 980 + }, + { + "epoch": 0.2301466275659824, + "grad_norm": 1.4483857272646994, + "learning_rate": 9.495664679516183e-06, + "loss": 0.6333, + "step": 981 + }, + { + "epoch": 0.23038123167155425, + "grad_norm": 1.2304459998771808, + "learning_rate": 9.493870474301383e-06, + "loss": 0.6334, + "step": 982 + }, + { + "epoch": 0.2306158357771261, + "grad_norm": 1.0767086812601114, + "learning_rate": 9.492073253376865e-06, + "loss": 0.6207, + "step": 983 + }, + { + "epoch": 0.23085043988269796, + "grad_norm": 1.1457537977191967, + "learning_rate": 9.490273017948698e-06, + "loss": 0.6675, + "step": 984 + }, + { + "epoch": 0.2310850439882698, + "grad_norm": 1.6217273167197999, + "learning_rate": 9.488469769224966e-06, + "loss": 0.6492, + "step": 985 + }, + { + "epoch": 0.23131964809384165, + "grad_norm": 2.947889514848731, + "learning_rate": 9.486663508415777e-06, + "loss": 0.62, + "step": 986 + }, + { + "epoch": 0.2315542521994135, + "grad_norm": 1.5930045829986972, + "learning_rate": 9.484854236733265e-06, + "loss": 0.645, + "step": 987 + }, + { + "epoch": 0.23178885630498533, + "grad_norm": 1.0411270862368243, + "learning_rate": 9.48304195539158e-06, + "loss": 0.6569, + "step": 988 + }, + { + "epoch": 0.2320234604105572, + "grad_norm": 1.8714760360067142, + "learning_rate": 9.48122666560689e-06, + "loss": 0.5927, + "step": 989 + }, + { + "epoch": 0.23225806451612904, + "grad_norm": 1.2536983547748748, + "learning_rate": 9.479408368597386e-06, + "loss": 0.611, + "step": 990 + }, + { + "epoch": 0.23249266862170087, + "grad_norm": 2.198810965849353, + "learning_rate": 9.477587065583281e-06, + "loss": 0.6586, + "step": 991 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 2.1986534808557616, + "learning_rate": 9.475762757786796e-06, + "loss": 0.6528, + "step": 992 + }, + { + "epoch": 0.23296187683284458, + "grad_norm": 1.0833049104621884, + "learning_rate": 9.473935446432169e-06, + "loss": 0.6475, + "step": 993 + }, + { + "epoch": 0.2331964809384164, + "grad_norm": 2.964486398105646, + "learning_rate": 9.472105132745664e-06, + "loss": 0.624, + "step": 994 + }, + { + "epoch": 0.23343108504398827, + "grad_norm": 1.0466438463264922, + "learning_rate": 9.470271817955547e-06, + "loss": 0.6104, + "step": 995 + }, + { + "epoch": 0.23366568914956012, + "grad_norm": 3.852799335158689, + "learning_rate": 9.468435503292108e-06, + "loss": 0.6377, + "step": 996 + }, + { + "epoch": 0.23390029325513195, + "grad_norm": 1.220217041154978, + "learning_rate": 9.466596189987644e-06, + "loss": 0.6733, + "step": 997 + }, + { + "epoch": 0.2341348973607038, + "grad_norm": 1.0496610362694465, + "learning_rate": 9.464753879276467e-06, + "loss": 0.6215, + "step": 998 + }, + { + "epoch": 0.23436950146627566, + "grad_norm": 2.2817838781315336, + "learning_rate": 9.4629085723949e-06, + "loss": 0.6116, + "step": 999 + }, + { + "epoch": 0.23460410557184752, + "grad_norm": 1.3680657229056437, + "learning_rate": 9.461060270581276e-06, + "loss": 0.6267, + "step": 1000 + }, + { + "epoch": 0.23460410557184752, + "eval_loss": 0.6492986679077148, + "eval_runtime": 25.3774, + "eval_samples_per_second": 21.515, + "eval_steps_per_second": 0.197, + "step": 1000 + }, + { + "epoch": 0.23483870967741935, + "grad_norm": 5.649880362698411, + "learning_rate": 9.459208975075936e-06, + "loss": 0.5947, + "step": 1001 + }, + { + "epoch": 0.2350733137829912, + "grad_norm": 1.3704636130767294, + "learning_rate": 9.457354687121234e-06, + "loss": 0.6535, + "step": 1002 + }, + { + "epoch": 0.23530791788856306, + "grad_norm": 1.5531555837354978, + "learning_rate": 9.455497407961533e-06, + "loss": 0.6578, + "step": 1003 + }, + { + "epoch": 0.2355425219941349, + "grad_norm": 2.385298625756661, + "learning_rate": 9.453637138843196e-06, + "loss": 0.6898, + "step": 1004 + }, + { + "epoch": 0.23577712609970675, + "grad_norm": 2.18241034809193, + "learning_rate": 9.4517738810146e-06, + "loss": 0.6549, + "step": 1005 + }, + { + "epoch": 0.2360117302052786, + "grad_norm": 1.1336483483024176, + "learning_rate": 9.449907635726125e-06, + "loss": 0.6367, + "step": 1006 + }, + { + "epoch": 0.23624633431085043, + "grad_norm": 1.0691181136009236, + "learning_rate": 9.448038404230156e-06, + "loss": 0.6423, + "step": 1007 + }, + { + "epoch": 0.2364809384164223, + "grad_norm": 1.0553117097822624, + "learning_rate": 9.446166187781077e-06, + "loss": 0.6984, + "step": 1008 + }, + { + "epoch": 0.23671554252199414, + "grad_norm": 1.5341499013376068, + "learning_rate": 9.444290987635285e-06, + "loss": 0.6761, + "step": 1009 + }, + { + "epoch": 0.23695014662756597, + "grad_norm": 1.3608123842594628, + "learning_rate": 9.442412805051171e-06, + "loss": 0.6885, + "step": 1010 + }, + { + "epoch": 0.23718475073313783, + "grad_norm": 2.4917367720298387, + "learning_rate": 9.440531641289133e-06, + "loss": 0.5706, + "step": 1011 + }, + { + "epoch": 0.23741935483870968, + "grad_norm": 1.303283510097461, + "learning_rate": 9.438647497611562e-06, + "loss": 0.6636, + "step": 1012 + }, + { + "epoch": 0.2376539589442815, + "grad_norm": 1.0013777791433367, + "learning_rate": 9.436760375282858e-06, + "loss": 0.664, + "step": 1013 + }, + { + "epoch": 0.23788856304985337, + "grad_norm": 1.2638124086460538, + "learning_rate": 9.434870275569416e-06, + "loss": 0.6067, + "step": 1014 + }, + { + "epoch": 0.23812316715542522, + "grad_norm": 2.9285589828455536, + "learning_rate": 9.432977199739625e-06, + "loss": 0.642, + "step": 1015 + }, + { + "epoch": 0.23835777126099708, + "grad_norm": 2.208856498844545, + "learning_rate": 9.431081149063877e-06, + "loss": 0.6834, + "step": 1016 + }, + { + "epoch": 0.2385923753665689, + "grad_norm": 1.5849986325533076, + "learning_rate": 9.429182124814559e-06, + "loss": 0.5901, + "step": 1017 + }, + { + "epoch": 0.23882697947214077, + "grad_norm": 1.257929746202979, + "learning_rate": 9.427280128266049e-06, + "loss": 0.6423, + "step": 1018 + }, + { + "epoch": 0.23906158357771262, + "grad_norm": 2.2476085484775834, + "learning_rate": 9.425375160694728e-06, + "loss": 0.6493, + "step": 1019 + }, + { + "epoch": 0.23929618768328445, + "grad_norm": 1.0074820565121343, + "learning_rate": 9.423467223378963e-06, + "loss": 0.6378, + "step": 1020 + }, + { + "epoch": 0.2395307917888563, + "grad_norm": 1.4286848351206902, + "learning_rate": 9.42155631759912e-06, + "loss": 0.6528, + "step": 1021 + }, + { + "epoch": 0.23976539589442816, + "grad_norm": 1.605810190143817, + "learning_rate": 9.419642444637547e-06, + "loss": 0.6155, + "step": 1022 + }, + { + "epoch": 0.24, + "grad_norm": 5.124106366515897, + "learning_rate": 9.417725605778599e-06, + "loss": 0.6336, + "step": 1023 + }, + { + "epoch": 0.24023460410557185, + "grad_norm": 1.1513761949498305, + "learning_rate": 9.415805802308606e-06, + "loss": 0.6878, + "step": 1024 + }, + { + "epoch": 0.2404692082111437, + "grad_norm": 1.684519474529015, + "learning_rate": 9.413883035515897e-06, + "loss": 0.6533, + "step": 1025 + }, + { + "epoch": 0.24070381231671553, + "grad_norm": 3.3090245316020157, + "learning_rate": 9.411957306690784e-06, + "loss": 0.5911, + "step": 1026 + }, + { + "epoch": 0.2409384164222874, + "grad_norm": 1.400329534624225, + "learning_rate": 9.410028617125572e-06, + "loss": 0.6988, + "step": 1027 + }, + { + "epoch": 0.24117302052785924, + "grad_norm": 1.1639084569386746, + "learning_rate": 9.40809696811455e-06, + "loss": 0.6551, + "step": 1028 + }, + { + "epoch": 0.24140762463343107, + "grad_norm": 1.3969320801825622, + "learning_rate": 9.406162360953993e-06, + "loss": 0.6635, + "step": 1029 + }, + { + "epoch": 0.24164222873900293, + "grad_norm": 1.5563103547874322, + "learning_rate": 9.40422479694216e-06, + "loss": 0.5857, + "step": 1030 + }, + { + "epoch": 0.24187683284457479, + "grad_norm": 3.31361637878754, + "learning_rate": 9.402284277379295e-06, + "loss": 0.6511, + "step": 1031 + }, + { + "epoch": 0.24211143695014664, + "grad_norm": 1.575052667811003, + "learning_rate": 9.400340803567628e-06, + "loss": 0.6414, + "step": 1032 + }, + { + "epoch": 0.24234604105571847, + "grad_norm": 4.850669535283172, + "learning_rate": 9.398394376811368e-06, + "loss": 0.6273, + "step": 1033 + }, + { + "epoch": 0.24258064516129033, + "grad_norm": 1.8310303846415128, + "learning_rate": 9.39644499841671e-06, + "loss": 0.6402, + "step": 1034 + }, + { + "epoch": 0.24281524926686218, + "grad_norm": 1.9977695614564166, + "learning_rate": 9.394492669691822e-06, + "loss": 0.5988, + "step": 1035 + }, + { + "epoch": 0.243049853372434, + "grad_norm": 1.2847567253839351, + "learning_rate": 9.39253739194686e-06, + "loss": 0.6792, + "step": 1036 + }, + { + "epoch": 0.24328445747800587, + "grad_norm": 1.4476041025422224, + "learning_rate": 9.390579166493955e-06, + "loss": 0.6865, + "step": 1037 + }, + { + "epoch": 0.24351906158357772, + "grad_norm": 2.0841697799025276, + "learning_rate": 9.388617994647217e-06, + "loss": 0.6343, + "step": 1038 + }, + { + "epoch": 0.24375366568914955, + "grad_norm": 1.19250423254534, + "learning_rate": 9.386653877722734e-06, + "loss": 0.6307, + "step": 1039 + }, + { + "epoch": 0.2439882697947214, + "grad_norm": 2.4696550859126396, + "learning_rate": 9.384686817038569e-06, + "loss": 0.6656, + "step": 1040 + }, + { + "epoch": 0.24422287390029326, + "grad_norm": 1.7291518224078675, + "learning_rate": 9.382716813914758e-06, + "loss": 0.5661, + "step": 1041 + }, + { + "epoch": 0.2444574780058651, + "grad_norm": 1.671832316453634, + "learning_rate": 9.380743869673319e-06, + "loss": 0.6366, + "step": 1042 + }, + { + "epoch": 0.24469208211143695, + "grad_norm": 1.9079638634802105, + "learning_rate": 9.378767985638236e-06, + "loss": 0.6393, + "step": 1043 + }, + { + "epoch": 0.2449266862170088, + "grad_norm": 1.4178979802273974, + "learning_rate": 9.37678916313547e-06, + "loss": 0.621, + "step": 1044 + }, + { + "epoch": 0.24516129032258063, + "grad_norm": 4.628465027913799, + "learning_rate": 9.374807403492953e-06, + "loss": 0.6233, + "step": 1045 + }, + { + "epoch": 0.2453958944281525, + "grad_norm": 1.4082937621006473, + "learning_rate": 9.37282270804059e-06, + "loss": 0.6288, + "step": 1046 + }, + { + "epoch": 0.24563049853372435, + "grad_norm": 1.3120202178320575, + "learning_rate": 9.370835078110252e-06, + "loss": 0.6206, + "step": 1047 + }, + { + "epoch": 0.2458651026392962, + "grad_norm": 3.760185749341159, + "learning_rate": 9.36884451503578e-06, + "loss": 0.6435, + "step": 1048 + }, + { + "epoch": 0.24609970674486803, + "grad_norm": 1.6052202436924854, + "learning_rate": 9.366851020152988e-06, + "loss": 0.6317, + "step": 1049 + }, + { + "epoch": 0.24633431085043989, + "grad_norm": 1.3160901614252933, + "learning_rate": 9.364854594799653e-06, + "loss": 0.6246, + "step": 1050 + }, + { + "epoch": 0.24656891495601174, + "grad_norm": 6.051988896786264, + "learning_rate": 9.362855240315519e-06, + "loss": 0.6584, + "step": 1051 + }, + { + "epoch": 0.24680351906158357, + "grad_norm": 1.4054889298572484, + "learning_rate": 9.360852958042294e-06, + "loss": 0.7053, + "step": 1052 + }, + { + "epoch": 0.24703812316715543, + "grad_norm": 1.3057230010347616, + "learning_rate": 9.35884774932366e-06, + "loss": 0.584, + "step": 1053 + }, + { + "epoch": 0.24727272727272728, + "grad_norm": 1.960286991708724, + "learning_rate": 9.35683961550525e-06, + "loss": 0.6803, + "step": 1054 + }, + { + "epoch": 0.2475073313782991, + "grad_norm": 1.4665550768481745, + "learning_rate": 9.354828557934667e-06, + "loss": 0.6629, + "step": 1055 + }, + { + "epoch": 0.24774193548387097, + "grad_norm": 1.4036753887323576, + "learning_rate": 9.352814577961478e-06, + "loss": 0.6403, + "step": 1056 + }, + { + "epoch": 0.24797653958944282, + "grad_norm": 1.2706243050107302, + "learning_rate": 9.350797676937204e-06, + "loss": 0.6162, + "step": 1057 + }, + { + "epoch": 0.24821114369501465, + "grad_norm": 1.0365410635812358, + "learning_rate": 9.348777856215335e-06, + "loss": 0.633, + "step": 1058 + }, + { + "epoch": 0.2484457478005865, + "grad_norm": 4.164437790587883, + "learning_rate": 9.346755117151311e-06, + "loss": 0.6338, + "step": 1059 + }, + { + "epoch": 0.24868035190615836, + "grad_norm": 2.0340855529569044, + "learning_rate": 9.344729461102537e-06, + "loss": 0.7013, + "step": 1060 + }, + { + "epoch": 0.2489149560117302, + "grad_norm": 1.1934887503823248, + "learning_rate": 9.342700889428376e-06, + "loss": 0.6704, + "step": 1061 + }, + { + "epoch": 0.24914956011730205, + "grad_norm": 1.2040869613771354, + "learning_rate": 9.340669403490142e-06, + "loss": 0.6119, + "step": 1062 + }, + { + "epoch": 0.2493841642228739, + "grad_norm": 6.798485147523802, + "learning_rate": 9.338635004651108e-06, + "loss": 0.6574, + "step": 1063 + }, + { + "epoch": 0.24961876832844576, + "grad_norm": 1.2031946135972638, + "learning_rate": 9.336597694276505e-06, + "loss": 0.6772, + "step": 1064 + }, + { + "epoch": 0.2498533724340176, + "grad_norm": 2.7896772865947974, + "learning_rate": 9.33455747373351e-06, + "loss": 0.6685, + "step": 1065 + }, + { + "epoch": 0.2500879765395894, + "grad_norm": 1.434718978079452, + "learning_rate": 9.332514344391261e-06, + "loss": 0.6707, + "step": 1066 + }, + { + "epoch": 0.2503225806451613, + "grad_norm": 1.649597067527315, + "learning_rate": 9.330468307620844e-06, + "loss": 0.6524, + "step": 1067 + }, + { + "epoch": 0.25055718475073313, + "grad_norm": 0.9706807975515751, + "learning_rate": 9.328419364795295e-06, + "loss": 0.6286, + "step": 1068 + }, + { + "epoch": 0.250791788856305, + "grad_norm": 2.8391301835048055, + "learning_rate": 9.326367517289601e-06, + "loss": 0.6309, + "step": 1069 + }, + { + "epoch": 0.25102639296187684, + "grad_norm": 10.149338307211673, + "learning_rate": 9.324312766480703e-06, + "loss": 0.6086, + "step": 1070 + }, + { + "epoch": 0.2512609970674487, + "grad_norm": 3.332973642515985, + "learning_rate": 9.322255113747483e-06, + "loss": 0.668, + "step": 1071 + }, + { + "epoch": 0.25149560117302056, + "grad_norm": 1.6289507101115803, + "learning_rate": 9.320194560470774e-06, + "loss": 0.6529, + "step": 1072 + }, + { + "epoch": 0.25173020527859236, + "grad_norm": 1.057748663764242, + "learning_rate": 9.318131108033355e-06, + "loss": 0.7172, + "step": 1073 + }, + { + "epoch": 0.2519648093841642, + "grad_norm": 5.911103910329838, + "learning_rate": 9.316064757819951e-06, + "loss": 0.6615, + "step": 1074 + }, + { + "epoch": 0.25219941348973607, + "grad_norm": 0.8361519820241816, + "learning_rate": 9.313995511217234e-06, + "loss": 0.647, + "step": 1075 + }, + { + "epoch": 0.2524340175953079, + "grad_norm": 1.430006543360786, + "learning_rate": 9.311923369613814e-06, + "loss": 0.5779, + "step": 1076 + }, + { + "epoch": 0.2526686217008798, + "grad_norm": 1.3705999959998239, + "learning_rate": 9.309848334400247e-06, + "loss": 0.6185, + "step": 1077 + }, + { + "epoch": 0.25290322580645164, + "grad_norm": 1.8027003953341156, + "learning_rate": 9.307770406969032e-06, + "loss": 0.7027, + "step": 1078 + }, + { + "epoch": 0.25313782991202344, + "grad_norm": 1.147881489060793, + "learning_rate": 9.305689588714607e-06, + "loss": 0.6871, + "step": 1079 + }, + { + "epoch": 0.2533724340175953, + "grad_norm": 1.2640670079055658, + "learning_rate": 9.303605881033347e-06, + "loss": 0.6705, + "step": 1080 + }, + { + "epoch": 0.25360703812316715, + "grad_norm": 2.0806876631568647, + "learning_rate": 9.301519285323574e-06, + "loss": 0.6329, + "step": 1081 + }, + { + "epoch": 0.253841642228739, + "grad_norm": 1.0093463389901316, + "learning_rate": 9.299429802985544e-06, + "loss": 0.6615, + "step": 1082 + }, + { + "epoch": 0.25407624633431086, + "grad_norm": 0.9052612031399299, + "learning_rate": 9.297337435421447e-06, + "loss": 0.621, + "step": 1083 + }, + { + "epoch": 0.2543108504398827, + "grad_norm": 2.2904057629841077, + "learning_rate": 9.295242184035411e-06, + "loss": 0.706, + "step": 1084 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 3.3432158091490893, + "learning_rate": 9.293144050233502e-06, + "loss": 0.6377, + "step": 1085 + }, + { + "epoch": 0.2547800586510264, + "grad_norm": 1.3541565854115583, + "learning_rate": 9.29104303542372e-06, + "loss": 0.6255, + "step": 1086 + }, + { + "epoch": 0.25501466275659823, + "grad_norm": 1.36594944692631, + "learning_rate": 9.288939141015993e-06, + "loss": 0.602, + "step": 1087 + }, + { + "epoch": 0.2552492668621701, + "grad_norm": 1.5060651999404175, + "learning_rate": 9.286832368422188e-06, + "loss": 0.6483, + "step": 1088 + }, + { + "epoch": 0.25548387096774194, + "grad_norm": 14.08745174413511, + "learning_rate": 9.2847227190561e-06, + "loss": 0.6518, + "step": 1089 + }, + { + "epoch": 0.2557184750733138, + "grad_norm": 1.0285700361179184, + "learning_rate": 9.282610194333453e-06, + "loss": 0.6561, + "step": 1090 + }, + { + "epoch": 0.25595307917888566, + "grad_norm": 3.3255615306981947, + "learning_rate": 9.280494795671906e-06, + "loss": 0.6508, + "step": 1091 + }, + { + "epoch": 0.25618768328445746, + "grad_norm": 1.2084681719142458, + "learning_rate": 9.278376524491041e-06, + "loss": 0.7036, + "step": 1092 + }, + { + "epoch": 0.2564222873900293, + "grad_norm": 2.6036419960007904, + "learning_rate": 9.276255382212374e-06, + "loss": 0.6228, + "step": 1093 + }, + { + "epoch": 0.25665689149560117, + "grad_norm": 27.743670008252177, + "learning_rate": 9.274131370259338e-06, + "loss": 0.64, + "step": 1094 + }, + { + "epoch": 0.256891495601173, + "grad_norm": 1.4044619403265397, + "learning_rate": 9.272004490057301e-06, + "loss": 0.608, + "step": 1095 + }, + { + "epoch": 0.2571260997067449, + "grad_norm": 2.356120977313629, + "learning_rate": 9.269874743033552e-06, + "loss": 0.6457, + "step": 1096 + }, + { + "epoch": 0.25736070381231674, + "grad_norm": 1.2596090390033359, + "learning_rate": 9.267742130617303e-06, + "loss": 0.657, + "step": 1097 + }, + { + "epoch": 0.25759530791788854, + "grad_norm": 1.3946092978167728, + "learning_rate": 9.265606654239691e-06, + "loss": 0.6597, + "step": 1098 + }, + { + "epoch": 0.2578299120234604, + "grad_norm": 1.4432119584318348, + "learning_rate": 9.263468315333774e-06, + "loss": 0.6312, + "step": 1099 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 1.5515541825522112, + "learning_rate": 9.261327115334531e-06, + "loss": 0.599, + "step": 1100 + }, + { + "epoch": 0.2582991202346041, + "grad_norm": 1.4007693081450738, + "learning_rate": 9.259183055678862e-06, + "loss": 0.6489, + "step": 1101 + }, + { + "epoch": 0.25853372434017596, + "grad_norm": 0.9103206016592846, + "learning_rate": 9.257036137805582e-06, + "loss": 0.6335, + "step": 1102 + }, + { + "epoch": 0.2587683284457478, + "grad_norm": 2.888495959965812, + "learning_rate": 9.254886363155429e-06, + "loss": 0.6071, + "step": 1103 + }, + { + "epoch": 0.2590029325513197, + "grad_norm": 1.3821034837192627, + "learning_rate": 9.252733733171056e-06, + "loss": 0.6486, + "step": 1104 + }, + { + "epoch": 0.2592375366568915, + "grad_norm": 1.4320609764115106, + "learning_rate": 9.25057824929703e-06, + "loss": 0.6374, + "step": 1105 + }, + { + "epoch": 0.25947214076246333, + "grad_norm": 1.119572011338813, + "learning_rate": 9.248419912979839e-06, + "loss": 0.6845, + "step": 1106 + }, + { + "epoch": 0.2597067448680352, + "grad_norm": 1.2163580289103197, + "learning_rate": 9.246258725667881e-06, + "loss": 0.6264, + "step": 1107 + }, + { + "epoch": 0.25994134897360704, + "grad_norm": 1.535185463904399, + "learning_rate": 9.244094688811464e-06, + "loss": 0.6681, + "step": 1108 + }, + { + "epoch": 0.2601759530791789, + "grad_norm": 1.7914866375488323, + "learning_rate": 9.241927803862818e-06, + "loss": 0.6775, + "step": 1109 + }, + { + "epoch": 0.26041055718475076, + "grad_norm": 1.3248249659555873, + "learning_rate": 9.239758072276074e-06, + "loss": 0.6521, + "step": 1110 + }, + { + "epoch": 0.26064516129032256, + "grad_norm": 2.295648656574757, + "learning_rate": 9.23758549550728e-06, + "loss": 0.6337, + "step": 1111 + }, + { + "epoch": 0.2608797653958944, + "grad_norm": 11.75891681296019, + "learning_rate": 9.235410075014388e-06, + "loss": 0.6349, + "step": 1112 + }, + { + "epoch": 0.26111436950146627, + "grad_norm": 1.6452283269390608, + "learning_rate": 9.233231812257266e-06, + "loss": 0.5934, + "step": 1113 + }, + { + "epoch": 0.2613489736070381, + "grad_norm": 1.4467978833872515, + "learning_rate": 9.23105070869768e-06, + "loss": 0.6278, + "step": 1114 + }, + { + "epoch": 0.26158357771261, + "grad_norm": 1.113625485020427, + "learning_rate": 9.228866765799309e-06, + "loss": 0.6643, + "step": 1115 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 10.773666725709122, + "learning_rate": 9.226679985027737e-06, + "loss": 0.6215, + "step": 1116 + }, + { + "epoch": 0.26205278592375364, + "grad_norm": 1.148086209338217, + "learning_rate": 9.22449036785045e-06, + "loss": 0.7284, + "step": 1117 + }, + { + "epoch": 0.2622873900293255, + "grad_norm": 4.126390930082524, + "learning_rate": 9.222297915736835e-06, + "loss": 0.635, + "step": 1118 + }, + { + "epoch": 0.26252199413489735, + "grad_norm": 3.2049462664895296, + "learning_rate": 9.22010263015819e-06, + "loss": 0.6171, + "step": 1119 + }, + { + "epoch": 0.2627565982404692, + "grad_norm": 21.67792858284106, + "learning_rate": 9.217904512587705e-06, + "loss": 0.5676, + "step": 1120 + }, + { + "epoch": 0.26299120234604106, + "grad_norm": 12.556299898852679, + "learning_rate": 9.215703564500478e-06, + "loss": 0.6152, + "step": 1121 + }, + { + "epoch": 0.2632258064516129, + "grad_norm": 1.1779982705559455, + "learning_rate": 9.213499787373501e-06, + "loss": 0.6658, + "step": 1122 + }, + { + "epoch": 0.2634604105571848, + "grad_norm": 1.3696518802678466, + "learning_rate": 9.211293182685668e-06, + "loss": 0.5802, + "step": 1123 + }, + { + "epoch": 0.2636950146627566, + "grad_norm": 1.8874730792051249, + "learning_rate": 9.209083751917767e-06, + "loss": 0.6727, + "step": 1124 + }, + { + "epoch": 0.26392961876832843, + "grad_norm": 0.958902915394563, + "learning_rate": 9.206871496552488e-06, + "loss": 0.6486, + "step": 1125 + }, + { + "epoch": 0.2641642228739003, + "grad_norm": 6.635419383316604, + "learning_rate": 9.20465641807441e-06, + "loss": 0.7071, + "step": 1126 + }, + { + "epoch": 0.26439882697947215, + "grad_norm": 1.4702756870641926, + "learning_rate": 9.202438517970011e-06, + "loss": 0.6239, + "step": 1127 + }, + { + "epoch": 0.264633431085044, + "grad_norm": 1.7660617836374857, + "learning_rate": 9.200217797727663e-06, + "loss": 0.6057, + "step": 1128 + }, + { + "epoch": 0.26486803519061586, + "grad_norm": 1.4327848284209121, + "learning_rate": 9.197994258837627e-06, + "loss": 0.6537, + "step": 1129 + }, + { + "epoch": 0.26510263929618766, + "grad_norm": 2.2357305825059925, + "learning_rate": 9.195767902792055e-06, + "loss": 0.6545, + "step": 1130 + }, + { + "epoch": 0.2653372434017595, + "grad_norm": 1.115617574359435, + "learning_rate": 9.193538731084994e-06, + "loss": 0.6483, + "step": 1131 + }, + { + "epoch": 0.26557184750733137, + "grad_norm": 2.1127768529842204, + "learning_rate": 9.19130674521238e-06, + "loss": 0.6839, + "step": 1132 + }, + { + "epoch": 0.2658064516129032, + "grad_norm": 0.9313613458147902, + "learning_rate": 9.189071946672034e-06, + "loss": 0.6875, + "step": 1133 + }, + { + "epoch": 0.2660410557184751, + "grad_norm": 0.8318000531860501, + "learning_rate": 9.186834336963665e-06, + "loss": 0.6093, + "step": 1134 + }, + { + "epoch": 0.26627565982404694, + "grad_norm": 1.1986092453350174, + "learning_rate": 9.184593917588872e-06, + "loss": 0.6201, + "step": 1135 + }, + { + "epoch": 0.2665102639296188, + "grad_norm": 1.7391576367935673, + "learning_rate": 9.182350690051134e-06, + "loss": 0.5965, + "step": 1136 + }, + { + "epoch": 0.2667448680351906, + "grad_norm": 2.0279189111500484, + "learning_rate": 9.180104655855822e-06, + "loss": 0.7121, + "step": 1137 + }, + { + "epoch": 0.26697947214076245, + "grad_norm": 1.341125336561769, + "learning_rate": 9.177855816510184e-06, + "loss": 0.6235, + "step": 1138 + }, + { + "epoch": 0.2672140762463343, + "grad_norm": 1.540030271635298, + "learning_rate": 9.17560417352335e-06, + "loss": 0.6698, + "step": 1139 + }, + { + "epoch": 0.26744868035190617, + "grad_norm": 1.5842640964603802, + "learning_rate": 9.173349728406342e-06, + "loss": 0.6461, + "step": 1140 + }, + { + "epoch": 0.267683284457478, + "grad_norm": 1.279593804742548, + "learning_rate": 9.171092482672046e-06, + "loss": 0.7069, + "step": 1141 + }, + { + "epoch": 0.2679178885630499, + "grad_norm": 4.766485979159509, + "learning_rate": 9.16883243783524e-06, + "loss": 0.6368, + "step": 1142 + }, + { + "epoch": 0.2681524926686217, + "grad_norm": 3.4148221767290217, + "learning_rate": 9.166569595412576e-06, + "loss": 0.6509, + "step": 1143 + }, + { + "epoch": 0.26838709677419353, + "grad_norm": 1.3395580626279817, + "learning_rate": 9.164303956922582e-06, + "loss": 0.6265, + "step": 1144 + }, + { + "epoch": 0.2686217008797654, + "grad_norm": 1.9671916289784808, + "learning_rate": 9.162035523885669e-06, + "loss": 0.6307, + "step": 1145 + }, + { + "epoch": 0.26885630498533725, + "grad_norm": 1.1231349124358356, + "learning_rate": 9.159764297824111e-06, + "loss": 0.5958, + "step": 1146 + }, + { + "epoch": 0.2690909090909091, + "grad_norm": 1.19635176688707, + "learning_rate": 9.15749028026207e-06, + "loss": 0.6029, + "step": 1147 + }, + { + "epoch": 0.26932551319648096, + "grad_norm": 1.0063746732596204, + "learning_rate": 9.155213472725572e-06, + "loss": 0.6571, + "step": 1148 + }, + { + "epoch": 0.26956011730205276, + "grad_norm": 1.2724902231266055, + "learning_rate": 9.152933876742518e-06, + "loss": 0.6915, + "step": 1149 + }, + { + "epoch": 0.2697947214076246, + "grad_norm": 1.093749239254766, + "learning_rate": 9.150651493842683e-06, + "loss": 0.6117, + "step": 1150 + }, + { + "epoch": 0.27002932551319647, + "grad_norm": 13.259899934384775, + "learning_rate": 9.148366325557707e-06, + "loss": 0.6972, + "step": 1151 + }, + { + "epoch": 0.27026392961876833, + "grad_norm": 0.7952876762386353, + "learning_rate": 9.146078373421106e-06, + "loss": 0.5975, + "step": 1152 + }, + { + "epoch": 0.2704985337243402, + "grad_norm": 1.077868856066063, + "learning_rate": 9.143787638968255e-06, + "loss": 0.6165, + "step": 1153 + }, + { + "epoch": 0.27073313782991204, + "grad_norm": 1.1436430969707327, + "learning_rate": 9.141494123736403e-06, + "loss": 0.6669, + "step": 1154 + }, + { + "epoch": 0.2709677419354839, + "grad_norm": 1.441281901569953, + "learning_rate": 9.139197829264668e-06, + "loss": 0.6202, + "step": 1155 + }, + { + "epoch": 0.2712023460410557, + "grad_norm": 2.2442609787801726, + "learning_rate": 9.136898757094022e-06, + "loss": 0.6579, + "step": 1156 + }, + { + "epoch": 0.27143695014662755, + "grad_norm": 2.083902766981121, + "learning_rate": 9.13459690876731e-06, + "loss": 0.6156, + "step": 1157 + }, + { + "epoch": 0.2716715542521994, + "grad_norm": 1.4164132517497274, + "learning_rate": 9.132292285829237e-06, + "loss": 0.6866, + "step": 1158 + }, + { + "epoch": 0.27190615835777127, + "grad_norm": 1.3221074842967184, + "learning_rate": 9.129984889826373e-06, + "loss": 0.6511, + "step": 1159 + }, + { + "epoch": 0.2721407624633431, + "grad_norm": 1.452528344416446, + "learning_rate": 9.127674722307142e-06, + "loss": 0.7298, + "step": 1160 + }, + { + "epoch": 0.272375366568915, + "grad_norm": 1.2918319576786386, + "learning_rate": 9.125361784821837e-06, + "loss": 0.6537, + "step": 1161 + }, + { + "epoch": 0.2726099706744868, + "grad_norm": 0.9875594783287066, + "learning_rate": 9.123046078922601e-06, + "loss": 0.6512, + "step": 1162 + }, + { + "epoch": 0.27284457478005864, + "grad_norm": 1.6672133667940412, + "learning_rate": 9.120727606163443e-06, + "loss": 0.641, + "step": 1163 + }, + { + "epoch": 0.2730791788856305, + "grad_norm": 1.2912257331658266, + "learning_rate": 9.11840636810022e-06, + "loss": 0.6634, + "step": 1164 + }, + { + "epoch": 0.27331378299120235, + "grad_norm": 1.8275919353703045, + "learning_rate": 9.116082366290653e-06, + "loss": 0.6561, + "step": 1165 + }, + { + "epoch": 0.2735483870967742, + "grad_norm": 2.0492504335429915, + "learning_rate": 9.113755602294311e-06, + "loss": 0.6155, + "step": 1166 + }, + { + "epoch": 0.27378299120234606, + "grad_norm": 23.51595392877474, + "learning_rate": 9.111426077672623e-06, + "loss": 0.6691, + "step": 1167 + }, + { + "epoch": 0.2740175953079179, + "grad_norm": 1.336879483997251, + "learning_rate": 9.109093793988866e-06, + "loss": 0.6681, + "step": 1168 + }, + { + "epoch": 0.2742521994134897, + "grad_norm": 1.4331122193508863, + "learning_rate": 9.10675875280817e-06, + "loss": 0.6119, + "step": 1169 + }, + { + "epoch": 0.2744868035190616, + "grad_norm": 1.847249001964482, + "learning_rate": 9.104420955697513e-06, + "loss": 0.6777, + "step": 1170 + }, + { + "epoch": 0.27472140762463343, + "grad_norm": 1.1832291339880137, + "learning_rate": 9.102080404225726e-06, + "loss": 0.6798, + "step": 1171 + }, + { + "epoch": 0.2749560117302053, + "grad_norm": 4.932581301158991, + "learning_rate": 9.09973709996349e-06, + "loss": 0.5892, + "step": 1172 + }, + { + "epoch": 0.27519061583577714, + "grad_norm": 0.9293782191072257, + "learning_rate": 9.097391044483325e-06, + "loss": 0.672, + "step": 1173 + }, + { + "epoch": 0.275425219941349, + "grad_norm": 1.4098630567348378, + "learning_rate": 9.095042239359608e-06, + "loss": 0.6194, + "step": 1174 + }, + { + "epoch": 0.2756598240469208, + "grad_norm": 1.701552206764051, + "learning_rate": 9.092690686168552e-06, + "loss": 0.7233, + "step": 1175 + }, + { + "epoch": 0.27589442815249265, + "grad_norm": 2.0167746179023855, + "learning_rate": 9.090336386488222e-06, + "loss": 0.6088, + "step": 1176 + }, + { + "epoch": 0.2761290322580645, + "grad_norm": 1.4990342339704141, + "learning_rate": 9.087979341898518e-06, + "loss": 0.6503, + "step": 1177 + }, + { + "epoch": 0.27636363636363637, + "grad_norm": 3.8467707530438044, + "learning_rate": 9.085619553981186e-06, + "loss": 0.62, + "step": 1178 + }, + { + "epoch": 0.2765982404692082, + "grad_norm": 1.320421580962417, + "learning_rate": 9.083257024319817e-06, + "loss": 0.617, + "step": 1179 + }, + { + "epoch": 0.2768328445747801, + "grad_norm": 2.4487808473763626, + "learning_rate": 9.080891754499836e-06, + "loss": 0.6253, + "step": 1180 + }, + { + "epoch": 0.2770674486803519, + "grad_norm": 2.4803890794677357, + "learning_rate": 9.078523746108508e-06, + "loss": 0.6302, + "step": 1181 + }, + { + "epoch": 0.27730205278592374, + "grad_norm": 1.3205711383390113, + "learning_rate": 9.076153000734938e-06, + "loss": 0.6563, + "step": 1182 + }, + { + "epoch": 0.2775366568914956, + "grad_norm": 2.767032869268495, + "learning_rate": 9.073779519970066e-06, + "loss": 0.6445, + "step": 1183 + }, + { + "epoch": 0.27777126099706745, + "grad_norm": 1.2609096823864498, + "learning_rate": 9.071403305406664e-06, + "loss": 0.693, + "step": 1184 + }, + { + "epoch": 0.2780058651026393, + "grad_norm": 1.161706749463485, + "learning_rate": 9.069024358639352e-06, + "loss": 0.709, + "step": 1185 + }, + { + "epoch": 0.27824046920821116, + "grad_norm": 1.4075792283668007, + "learning_rate": 9.066642681264565e-06, + "loss": 0.67, + "step": 1186 + }, + { + "epoch": 0.278475073313783, + "grad_norm": 1.400424726965094, + "learning_rate": 9.064258274880583e-06, + "loss": 0.6217, + "step": 1187 + }, + { + "epoch": 0.2787096774193548, + "grad_norm": 1.488864332901986, + "learning_rate": 9.061871141087514e-06, + "loss": 0.6405, + "step": 1188 + }, + { + "epoch": 0.2789442815249267, + "grad_norm": 2.41257517341978, + "learning_rate": 9.059481281487297e-06, + "loss": 0.6633, + "step": 1189 + }, + { + "epoch": 0.27917888563049853, + "grad_norm": 1.1878581629498228, + "learning_rate": 9.057088697683695e-06, + "loss": 0.655, + "step": 1190 + }, + { + "epoch": 0.2794134897360704, + "grad_norm": 1.6685399189300316, + "learning_rate": 9.05469339128231e-06, + "loss": 0.6261, + "step": 1191 + }, + { + "epoch": 0.27964809384164224, + "grad_norm": 2.304310182312456, + "learning_rate": 9.052295363890561e-06, + "loss": 0.6444, + "step": 1192 + }, + { + "epoch": 0.2798826979472141, + "grad_norm": 1.3241671634056527, + "learning_rate": 9.049894617117697e-06, + "loss": 0.65, + "step": 1193 + }, + { + "epoch": 0.2801173020527859, + "grad_norm": 1.26071567796767, + "learning_rate": 9.04749115257479e-06, + "loss": 0.6223, + "step": 1194 + }, + { + "epoch": 0.28035190615835776, + "grad_norm": 3.4060533850546073, + "learning_rate": 9.045084971874738e-06, + "loss": 0.6414, + "step": 1195 + }, + { + "epoch": 0.2805865102639296, + "grad_norm": 1.0797584781386456, + "learning_rate": 9.042676076632262e-06, + "loss": 0.6103, + "step": 1196 + }, + { + "epoch": 0.28082111436950147, + "grad_norm": 2.3500412110531226, + "learning_rate": 9.040264468463905e-06, + "loss": 0.6658, + "step": 1197 + }, + { + "epoch": 0.2810557184750733, + "grad_norm": 2.5888536931024158, + "learning_rate": 9.037850148988025e-06, + "loss": 0.6292, + "step": 1198 + }, + { + "epoch": 0.2812903225806452, + "grad_norm": 1.2721951957423576, + "learning_rate": 9.035433119824808e-06, + "loss": 0.6561, + "step": 1199 + }, + { + "epoch": 0.28152492668621704, + "grad_norm": 1.6362863863423596, + "learning_rate": 9.033013382596251e-06, + "loss": 0.6251, + "step": 1200 + }, + { + "epoch": 0.28175953079178884, + "grad_norm": 1.6474849766998976, + "learning_rate": 9.030590938926173e-06, + "loss": 0.673, + "step": 1201 + }, + { + "epoch": 0.2819941348973607, + "grad_norm": 1.6701199617740936, + "learning_rate": 9.028165790440206e-06, + "loss": 0.6441, + "step": 1202 + }, + { + "epoch": 0.28222873900293255, + "grad_norm": 1.2187041648785473, + "learning_rate": 9.025737938765803e-06, + "loss": 0.6457, + "step": 1203 + }, + { + "epoch": 0.2824633431085044, + "grad_norm": 2.5892033320650727, + "learning_rate": 9.023307385532221e-06, + "loss": 0.6626, + "step": 1204 + }, + { + "epoch": 0.28269794721407626, + "grad_norm": 1.6534979371731078, + "learning_rate": 9.020874132370539e-06, + "loss": 0.6619, + "step": 1205 + }, + { + "epoch": 0.2829325513196481, + "grad_norm": 2.3196392706709608, + "learning_rate": 9.018438180913644e-06, + "loss": 0.6608, + "step": 1206 + }, + { + "epoch": 0.2831671554252199, + "grad_norm": 1.2616125476534032, + "learning_rate": 9.015999532796237e-06, + "loss": 0.6394, + "step": 1207 + }, + { + "epoch": 0.2834017595307918, + "grad_norm": 1.079851047458475, + "learning_rate": 9.013558189654819e-06, + "loss": 0.6734, + "step": 1208 + }, + { + "epoch": 0.28363636363636363, + "grad_norm": 0.945963473909306, + "learning_rate": 9.011114153127713e-06, + "loss": 0.6595, + "step": 1209 + }, + { + "epoch": 0.2838709677419355, + "grad_norm": 1.2202345279339784, + "learning_rate": 9.00866742485504e-06, + "loss": 0.6094, + "step": 1210 + }, + { + "epoch": 0.28410557184750734, + "grad_norm": 1.6632063032700166, + "learning_rate": 9.006218006478733e-06, + "loss": 0.6195, + "step": 1211 + }, + { + "epoch": 0.2843401759530792, + "grad_norm": 1.543917239991805, + "learning_rate": 9.003765899642522e-06, + "loss": 0.6378, + "step": 1212 + }, + { + "epoch": 0.284574780058651, + "grad_norm": 1.2988372928219616, + "learning_rate": 9.001311105991955e-06, + "loss": 0.6478, + "step": 1213 + }, + { + "epoch": 0.28480938416422286, + "grad_norm": 1.5211791632950824, + "learning_rate": 8.998853627174366e-06, + "loss": 0.5813, + "step": 1214 + }, + { + "epoch": 0.2850439882697947, + "grad_norm": 1.764491215207445, + "learning_rate": 8.996393464838906e-06, + "loss": 0.6872, + "step": 1215 + }, + { + "epoch": 0.28527859237536657, + "grad_norm": 1.457599821694021, + "learning_rate": 8.99393062063652e-06, + "loss": 0.6764, + "step": 1216 + }, + { + "epoch": 0.2855131964809384, + "grad_norm": 7.9843204821114915, + "learning_rate": 8.991465096219949e-06, + "loss": 0.6762, + "step": 1217 + }, + { + "epoch": 0.2857478005865103, + "grad_norm": 2.281328672609779, + "learning_rate": 8.988996893243742e-06, + "loss": 0.6836, + "step": 1218 + }, + { + "epoch": 0.28598240469208214, + "grad_norm": 34.42511778676702, + "learning_rate": 8.986526013364236e-06, + "loss": 0.6762, + "step": 1219 + }, + { + "epoch": 0.28621700879765394, + "grad_norm": 1.782916570198515, + "learning_rate": 8.98405245823957e-06, + "loss": 0.5941, + "step": 1220 + }, + { + "epoch": 0.2864516129032258, + "grad_norm": 1.2283119944541914, + "learning_rate": 8.98157622952968e-06, + "loss": 0.6419, + "step": 1221 + }, + { + "epoch": 0.28668621700879765, + "grad_norm": 3.963871646741732, + "learning_rate": 8.979097328896292e-06, + "loss": 0.6615, + "step": 1222 + }, + { + "epoch": 0.2869208211143695, + "grad_norm": 1.8606785533696573, + "learning_rate": 8.976615758002924e-06, + "loss": 0.6159, + "step": 1223 + }, + { + "epoch": 0.28715542521994136, + "grad_norm": 11.427896013323235, + "learning_rate": 8.97413151851489e-06, + "loss": 0.6047, + "step": 1224 + }, + { + "epoch": 0.2873900293255132, + "grad_norm": 2.3284593765249832, + "learning_rate": 8.971644612099292e-06, + "loss": 0.6466, + "step": 1225 + }, + { + "epoch": 0.287624633431085, + "grad_norm": 1.6781674773194468, + "learning_rate": 8.969155040425024e-06, + "loss": 0.5746, + "step": 1226 + }, + { + "epoch": 0.2878592375366569, + "grad_norm": 1.6035318347922205, + "learning_rate": 8.966662805162766e-06, + "loss": 0.6735, + "step": 1227 + }, + { + "epoch": 0.28809384164222873, + "grad_norm": 1.5828439617406542, + "learning_rate": 8.964167907984989e-06, + "loss": 0.6923, + "step": 1228 + }, + { + "epoch": 0.2883284457478006, + "grad_norm": 1.1751048929274226, + "learning_rate": 8.961670350565945e-06, + "loss": 0.6525, + "step": 1229 + }, + { + "epoch": 0.28856304985337244, + "grad_norm": 6.7059065740479875, + "learning_rate": 8.959170134581677e-06, + "loss": 0.5547, + "step": 1230 + }, + { + "epoch": 0.2887976539589443, + "grad_norm": 1.4608480268772748, + "learning_rate": 8.956667261710007e-06, + "loss": 0.5718, + "step": 1231 + }, + { + "epoch": 0.28903225806451616, + "grad_norm": 1.6912842066364013, + "learning_rate": 8.954161733630546e-06, + "loss": 0.6642, + "step": 1232 + }, + { + "epoch": 0.28926686217008796, + "grad_norm": 1.116960226766822, + "learning_rate": 8.951653552024681e-06, + "loss": 0.6187, + "step": 1233 + }, + { + "epoch": 0.2895014662756598, + "grad_norm": 1.9517351039051145, + "learning_rate": 8.949142718575582e-06, + "loss": 0.667, + "step": 1234 + }, + { + "epoch": 0.28973607038123167, + "grad_norm": 2.4300227400850214, + "learning_rate": 8.946629234968196e-06, + "loss": 0.6293, + "step": 1235 + }, + { + "epoch": 0.2899706744868035, + "grad_norm": 1.1995657881689836, + "learning_rate": 8.944113102889257e-06, + "loss": 0.6431, + "step": 1236 + }, + { + "epoch": 0.2902052785923754, + "grad_norm": 1.6908230703481364, + "learning_rate": 8.941594324027263e-06, + "loss": 0.6384, + "step": 1237 + }, + { + "epoch": 0.29043988269794724, + "grad_norm": 2.6265952904076557, + "learning_rate": 8.939072900072501e-06, + "loss": 0.6417, + "step": 1238 + }, + { + "epoch": 0.29067448680351904, + "grad_norm": 2.0392197385607904, + "learning_rate": 8.936548832717024e-06, + "loss": 0.6557, + "step": 1239 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 2.0104281934582895, + "learning_rate": 8.934022123654667e-06, + "loss": 0.6026, + "step": 1240 + }, + { + "epoch": 0.29114369501466275, + "grad_norm": 1.2199453887680525, + "learning_rate": 8.931492774581028e-06, + "loss": 0.6262, + "step": 1241 + }, + { + "epoch": 0.2913782991202346, + "grad_norm": 1.1753234947448552, + "learning_rate": 8.928960787193486e-06, + "loss": 0.6915, + "step": 1242 + }, + { + "epoch": 0.29161290322580646, + "grad_norm": 1.5561084137983587, + "learning_rate": 8.926426163191181e-06, + "loss": 0.6373, + "step": 1243 + }, + { + "epoch": 0.2918475073313783, + "grad_norm": 1.362777603513328, + "learning_rate": 8.923888904275036e-06, + "loss": 0.6325, + "step": 1244 + }, + { + "epoch": 0.2920821114369501, + "grad_norm": 2.8333130460272145, + "learning_rate": 8.921349012147724e-06, + "loss": 0.6566, + "step": 1245 + }, + { + "epoch": 0.292316715542522, + "grad_norm": 1.1212176544259231, + "learning_rate": 8.918806488513702e-06, + "loss": 0.6217, + "step": 1246 + }, + { + "epoch": 0.29255131964809383, + "grad_norm": 1.3257534046241175, + "learning_rate": 8.916261335079185e-06, + "loss": 0.6553, + "step": 1247 + }, + { + "epoch": 0.2927859237536657, + "grad_norm": 2.101574963760381, + "learning_rate": 8.91371355355215e-06, + "loss": 0.58, + "step": 1248 + }, + { + "epoch": 0.29302052785923755, + "grad_norm": 1.1721164736904788, + "learning_rate": 8.911163145642346e-06, + "loss": 0.6542, + "step": 1249 + }, + { + "epoch": 0.2932551319648094, + "grad_norm": 1.8243554113686955, + "learning_rate": 8.908610113061276e-06, + "loss": 0.603, + "step": 1250 + }, + { + "epoch": 0.29348973607038126, + "grad_norm": 9.405338704027788, + "learning_rate": 8.90605445752221e-06, + "loss": 0.6416, + "step": 1251 + }, + { + "epoch": 0.29372434017595306, + "grad_norm": 1.2324339661112436, + "learning_rate": 8.903496180740176e-06, + "loss": 0.5899, + "step": 1252 + }, + { + "epoch": 0.2939589442815249, + "grad_norm": 2.05393213429527, + "learning_rate": 8.900935284431962e-06, + "loss": 0.6328, + "step": 1253 + }, + { + "epoch": 0.29419354838709677, + "grad_norm": 2.229622561102952, + "learning_rate": 8.898371770316113e-06, + "loss": 0.6716, + "step": 1254 + }, + { + "epoch": 0.2944281524926686, + "grad_norm": 1.0086112497124473, + "learning_rate": 8.89580564011293e-06, + "loss": 0.6624, + "step": 1255 + }, + { + "epoch": 0.2946627565982405, + "grad_norm": 2.0998846289072532, + "learning_rate": 8.893236895544472e-06, + "loss": 0.6717, + "step": 1256 + }, + { + "epoch": 0.29489736070381234, + "grad_norm": 2.7024253872943578, + "learning_rate": 8.89066553833455e-06, + "loss": 0.6527, + "step": 1257 + }, + { + "epoch": 0.29513196480938414, + "grad_norm": 2.1374557550177475, + "learning_rate": 8.88809157020873e-06, + "loss": 0.6144, + "step": 1258 + }, + { + "epoch": 0.295366568914956, + "grad_norm": 1.387422227052281, + "learning_rate": 8.885514992894328e-06, + "loss": 0.6455, + "step": 1259 + }, + { + "epoch": 0.29560117302052785, + "grad_norm": 1.4483659812274923, + "learning_rate": 8.882935808120415e-06, + "loss": 0.6259, + "step": 1260 + }, + { + "epoch": 0.2958357771260997, + "grad_norm": 2.4045389283475775, + "learning_rate": 8.880354017617806e-06, + "loss": 0.6515, + "step": 1261 + }, + { + "epoch": 0.29607038123167156, + "grad_norm": 1.2142610769997242, + "learning_rate": 8.87776962311907e-06, + "loss": 0.6213, + "step": 1262 + }, + { + "epoch": 0.2963049853372434, + "grad_norm": 1.1932169412093907, + "learning_rate": 8.87518262635852e-06, + "loss": 0.6601, + "step": 1263 + }, + { + "epoch": 0.2965395894428153, + "grad_norm": 1.3103600311840702, + "learning_rate": 8.872593029072218e-06, + "loss": 0.6008, + "step": 1264 + }, + { + "epoch": 0.2967741935483871, + "grad_norm": 12.806866833231975, + "learning_rate": 8.870000832997969e-06, + "loss": 0.6514, + "step": 1265 + }, + { + "epoch": 0.29700879765395893, + "grad_norm": 1.8867732638569539, + "learning_rate": 8.867406039875322e-06, + "loss": 0.6072, + "step": 1266 + }, + { + "epoch": 0.2972434017595308, + "grad_norm": 1.7776538825058188, + "learning_rate": 8.864808651445571e-06, + "loss": 0.6305, + "step": 1267 + }, + { + "epoch": 0.29747800586510265, + "grad_norm": 2.5356146132409836, + "learning_rate": 8.862208669451748e-06, + "loss": 0.6274, + "step": 1268 + }, + { + "epoch": 0.2977126099706745, + "grad_norm": 1.1405033241200686, + "learning_rate": 8.85960609563863e-06, + "loss": 0.6065, + "step": 1269 + }, + { + "epoch": 0.29794721407624636, + "grad_norm": 5.247737154694485, + "learning_rate": 8.857000931752727e-06, + "loss": 0.6206, + "step": 1270 + }, + { + "epoch": 0.29818181818181816, + "grad_norm": 1.6192986589450444, + "learning_rate": 8.854393179542296e-06, + "loss": 0.649, + "step": 1271 + }, + { + "epoch": 0.29841642228739, + "grad_norm": 0.955327285671442, + "learning_rate": 8.851782840757322e-06, + "loss": 0.6322, + "step": 1272 + }, + { + "epoch": 0.29865102639296187, + "grad_norm": 10.205085139887778, + "learning_rate": 8.849169917149532e-06, + "loss": 0.6312, + "step": 1273 + }, + { + "epoch": 0.2988856304985337, + "grad_norm": 2.0419819861702027, + "learning_rate": 8.846554410472385e-06, + "loss": 0.6312, + "step": 1274 + }, + { + "epoch": 0.2991202346041056, + "grad_norm": 2.539642348519771, + "learning_rate": 8.843936322481073e-06, + "loss": 0.6329, + "step": 1275 + }, + { + "epoch": 0.29935483870967744, + "grad_norm": 1.9786206541515448, + "learning_rate": 8.841315654932521e-06, + "loss": 0.6982, + "step": 1276 + }, + { + "epoch": 0.29958944281524924, + "grad_norm": 2.714725438430519, + "learning_rate": 8.838692409585386e-06, + "loss": 0.6069, + "step": 1277 + }, + { + "epoch": 0.2998240469208211, + "grad_norm": 1.614491519495665, + "learning_rate": 8.836066588200052e-06, + "loss": 0.7046, + "step": 1278 + }, + { + "epoch": 0.30005865102639295, + "grad_norm": 2.255260228506507, + "learning_rate": 8.833438192538635e-06, + "loss": 0.6133, + "step": 1279 + }, + { + "epoch": 0.3002932551319648, + "grad_norm": 2.280776635454184, + "learning_rate": 8.830807224364978e-06, + "loss": 0.6639, + "step": 1280 + }, + { + "epoch": 0.30052785923753667, + "grad_norm": 1.1117201617882997, + "learning_rate": 8.82817368544465e-06, + "loss": 0.6253, + "step": 1281 + }, + { + "epoch": 0.3007624633431085, + "grad_norm": 1.048458771972502, + "learning_rate": 8.825537577544941e-06, + "loss": 0.5812, + "step": 1282 + }, + { + "epoch": 0.3009970674486804, + "grad_norm": 0.8928909363340157, + "learning_rate": 8.822898902434873e-06, + "loss": 0.6959, + "step": 1283 + }, + { + "epoch": 0.3012316715542522, + "grad_norm": 1.700575200742987, + "learning_rate": 8.820257661885184e-06, + "loss": 0.6136, + "step": 1284 + }, + { + "epoch": 0.30146627565982403, + "grad_norm": 0.9843673483434341, + "learning_rate": 8.817613857668336e-06, + "loss": 0.6366, + "step": 1285 + }, + { + "epoch": 0.3017008797653959, + "grad_norm": 1.065147000843776, + "learning_rate": 8.814967491558513e-06, + "loss": 0.6487, + "step": 1286 + }, + { + "epoch": 0.30193548387096775, + "grad_norm": 2.0813320709130085, + "learning_rate": 8.812318565331615e-06, + "loss": 0.6214, + "step": 1287 + }, + { + "epoch": 0.3021700879765396, + "grad_norm": 2.780332471849667, + "learning_rate": 8.809667080765262e-06, + "loss": 0.5831, + "step": 1288 + }, + { + "epoch": 0.30240469208211146, + "grad_norm": 1.1904230973336396, + "learning_rate": 8.807013039638792e-06, + "loss": 0.5899, + "step": 1289 + }, + { + "epoch": 0.30263929618768326, + "grad_norm": 0.9612417248794958, + "learning_rate": 8.804356443733252e-06, + "loss": 0.6273, + "step": 1290 + }, + { + "epoch": 0.3028739002932551, + "grad_norm": 1.4060937168302299, + "learning_rate": 8.801697294831418e-06, + "loss": 0.6508, + "step": 1291 + }, + { + "epoch": 0.303108504398827, + "grad_norm": 1.192759901327455, + "learning_rate": 8.79903559471776e-06, + "loss": 0.6359, + "step": 1292 + }, + { + "epoch": 0.30334310850439883, + "grad_norm": 1.9201539193558765, + "learning_rate": 8.796371345178477e-06, + "loss": 0.6493, + "step": 1293 + }, + { + "epoch": 0.3035777126099707, + "grad_norm": 2.1214585599349567, + "learning_rate": 8.793704548001468e-06, + "loss": 0.5816, + "step": 1294 + }, + { + "epoch": 0.30381231671554254, + "grad_norm": 0.8599548018777288, + "learning_rate": 8.791035204976346e-06, + "loss": 0.6127, + "step": 1295 + }, + { + "epoch": 0.3040469208211144, + "grad_norm": 1.5129429147192395, + "learning_rate": 8.788363317894432e-06, + "loss": 0.6351, + "step": 1296 + }, + { + "epoch": 0.3042815249266862, + "grad_norm": 2.6109877591111403, + "learning_rate": 8.785688888548756e-06, + "loss": 0.6309, + "step": 1297 + }, + { + "epoch": 0.30451612903225805, + "grad_norm": 1.0230297416784169, + "learning_rate": 8.783011918734048e-06, + "loss": 0.5925, + "step": 1298 + }, + { + "epoch": 0.3047507331378299, + "grad_norm": 2.186467330141591, + "learning_rate": 8.780332410246751e-06, + "loss": 0.5888, + "step": 1299 + }, + { + "epoch": 0.30498533724340177, + "grad_norm": 1.67491301133915, + "learning_rate": 8.777650364885004e-06, + "loss": 0.6724, + "step": 1300 + }, + { + "epoch": 0.3052199413489736, + "grad_norm": 1.1812740966549162, + "learning_rate": 8.774965784448655e-06, + "loss": 0.6233, + "step": 1301 + }, + { + "epoch": 0.3054545454545455, + "grad_norm": 1.3653435440565702, + "learning_rate": 8.772278670739249e-06, + "loss": 0.646, + "step": 1302 + }, + { + "epoch": 0.3056891495601173, + "grad_norm": 0.974001510579673, + "learning_rate": 8.76958902556003e-06, + "loss": 0.6553, + "step": 1303 + }, + { + "epoch": 0.30592375366568914, + "grad_norm": 1.8805764437627832, + "learning_rate": 8.766896850715946e-06, + "loss": 0.5921, + "step": 1304 + }, + { + "epoch": 0.306158357771261, + "grad_norm": 1.3919439552419068, + "learning_rate": 8.764202148013641e-06, + "loss": 0.6436, + "step": 1305 + }, + { + "epoch": 0.30639296187683285, + "grad_norm": 1.1462932005939608, + "learning_rate": 8.761504919261449e-06, + "loss": 0.6637, + "step": 1306 + }, + { + "epoch": 0.3066275659824047, + "grad_norm": 1.1892967885593082, + "learning_rate": 8.758805166269408e-06, + "loss": 0.5987, + "step": 1307 + }, + { + "epoch": 0.30686217008797656, + "grad_norm": 1.1052793782042585, + "learning_rate": 8.756102890849246e-06, + "loss": 0.6807, + "step": 1308 + }, + { + "epoch": 0.30709677419354836, + "grad_norm": 6.883279038259657, + "learning_rate": 8.753398094814382e-06, + "loss": 0.7119, + "step": 1309 + }, + { + "epoch": 0.3073313782991202, + "grad_norm": 1.081980986043162, + "learning_rate": 8.750690779979929e-06, + "loss": 0.5951, + "step": 1310 + }, + { + "epoch": 0.3075659824046921, + "grad_norm": 2.0675410988875154, + "learning_rate": 8.74798094816269e-06, + "loss": 0.6512, + "step": 1311 + }, + { + "epoch": 0.30780058651026393, + "grad_norm": 1.3202120008001563, + "learning_rate": 8.745268601181156e-06, + "loss": 0.6496, + "step": 1312 + }, + { + "epoch": 0.3080351906158358, + "grad_norm": 1.0247827042515312, + "learning_rate": 8.742553740855507e-06, + "loss": 0.6551, + "step": 1313 + }, + { + "epoch": 0.30826979472140764, + "grad_norm": 5.786819309028735, + "learning_rate": 8.739836369007609e-06, + "loss": 0.6285, + "step": 1314 + }, + { + "epoch": 0.3085043988269795, + "grad_norm": 60.92674679982773, + "learning_rate": 8.737116487461012e-06, + "loss": 0.6267, + "step": 1315 + }, + { + "epoch": 0.3087390029325513, + "grad_norm": 1.2774203798381183, + "learning_rate": 8.734394098040958e-06, + "loss": 0.5971, + "step": 1316 + }, + { + "epoch": 0.30897360703812315, + "grad_norm": 2.163502894715537, + "learning_rate": 8.731669202574358e-06, + "loss": 0.6197, + "step": 1317 + }, + { + "epoch": 0.309208211143695, + "grad_norm": 2.152718603403912, + "learning_rate": 8.728941802889816e-06, + "loss": 0.6498, + "step": 1318 + }, + { + "epoch": 0.30944281524926687, + "grad_norm": 1.4893222425762045, + "learning_rate": 8.726211900817615e-06, + "loss": 0.627, + "step": 1319 + }, + { + "epoch": 0.3096774193548387, + "grad_norm": 1.249555665489977, + "learning_rate": 8.723479498189709e-06, + "loss": 0.6245, + "step": 1320 + }, + { + "epoch": 0.3099120234604106, + "grad_norm": 1.1178881942453802, + "learning_rate": 8.720744596839743e-06, + "loss": 0.7129, + "step": 1321 + }, + { + "epoch": 0.3101466275659824, + "grad_norm": 1.3964290043601861, + "learning_rate": 8.718007198603028e-06, + "loss": 0.5847, + "step": 1322 + }, + { + "epoch": 0.31038123167155424, + "grad_norm": 1.6111423346882032, + "learning_rate": 8.715267305316559e-06, + "loss": 0.6642, + "step": 1323 + }, + { + "epoch": 0.3106158357771261, + "grad_norm": 0.9612562831354406, + "learning_rate": 8.712524918818997e-06, + "loss": 0.6563, + "step": 1324 + }, + { + "epoch": 0.31085043988269795, + "grad_norm": 1.1038307346379574, + "learning_rate": 8.70978004095068e-06, + "loss": 0.5907, + "step": 1325 + }, + { + "epoch": 0.3110850439882698, + "grad_norm": 0.766408183617374, + "learning_rate": 8.70703267355362e-06, + "loss": 0.5443, + "step": 1326 + }, + { + "epoch": 0.31131964809384166, + "grad_norm": 1.478909949815166, + "learning_rate": 8.704282818471501e-06, + "loss": 0.6915, + "step": 1327 + }, + { + "epoch": 0.3115542521994135, + "grad_norm": 1.195723486093403, + "learning_rate": 8.701530477549666e-06, + "loss": 0.6195, + "step": 1328 + }, + { + "epoch": 0.3117888563049853, + "grad_norm": 0.7850915533063527, + "learning_rate": 8.698775652635139e-06, + "loss": 0.6113, + "step": 1329 + }, + { + "epoch": 0.3120234604105572, + "grad_norm": 1.3491574479114687, + "learning_rate": 8.696018345576602e-06, + "loss": 0.705, + "step": 1330 + }, + { + "epoch": 0.31225806451612903, + "grad_norm": 1.724935421468019, + "learning_rate": 8.693258558224406e-06, + "loss": 0.6542, + "step": 1331 + }, + { + "epoch": 0.3124926686217009, + "grad_norm": 1.8955182529789312, + "learning_rate": 8.690496292430571e-06, + "loss": 0.6627, + "step": 1332 + }, + { + "epoch": 0.31272727272727274, + "grad_norm": 2.1000158286295925, + "learning_rate": 8.68773155004877e-06, + "loss": 0.6359, + "step": 1333 + }, + { + "epoch": 0.3129618768328446, + "grad_norm": 1.2343201375497233, + "learning_rate": 8.684964332934347e-06, + "loss": 0.63, + "step": 1334 + }, + { + "epoch": 0.3131964809384164, + "grad_norm": 1.1670954778337763, + "learning_rate": 8.682194642944304e-06, + "loss": 0.635, + "step": 1335 + }, + { + "epoch": 0.31343108504398826, + "grad_norm": 2.1456346002897657, + "learning_rate": 8.679422481937297e-06, + "loss": 0.6245, + "step": 1336 + }, + { + "epoch": 0.3136656891495601, + "grad_norm": 1.9460672497958593, + "learning_rate": 8.67664785177365e-06, + "loss": 0.6102, + "step": 1337 + }, + { + "epoch": 0.31390029325513197, + "grad_norm": 1.755780007413827, + "learning_rate": 8.673870754315336e-06, + "loss": 0.6433, + "step": 1338 + }, + { + "epoch": 0.3141348973607038, + "grad_norm": 2.120790636904061, + "learning_rate": 8.671091191425991e-06, + "loss": 0.6888, + "step": 1339 + }, + { + "epoch": 0.3143695014662757, + "grad_norm": 0.8886934532473196, + "learning_rate": 8.668309164970896e-06, + "loss": 0.5888, + "step": 1340 + }, + { + "epoch": 0.3146041055718475, + "grad_norm": 2.4991745759717463, + "learning_rate": 8.665524676816992e-06, + "loss": 0.6203, + "step": 1341 + }, + { + "epoch": 0.31483870967741934, + "grad_norm": 2.598062493726064, + "learning_rate": 8.662737728832873e-06, + "loss": 0.6058, + "step": 1342 + }, + { + "epoch": 0.3150733137829912, + "grad_norm": 0.9442172449383684, + "learning_rate": 8.659948322888778e-06, + "loss": 0.5829, + "step": 1343 + }, + { + "epoch": 0.31530791788856305, + "grad_norm": 2.691296024074173, + "learning_rate": 8.657156460856602e-06, + "loss": 0.6585, + "step": 1344 + }, + { + "epoch": 0.3155425219941349, + "grad_norm": 1.0442090423796575, + "learning_rate": 8.654362144609882e-06, + "loss": 0.6217, + "step": 1345 + }, + { + "epoch": 0.31577712609970676, + "grad_norm": 0.9727296807581846, + "learning_rate": 8.651565376023805e-06, + "loss": 0.6, + "step": 1346 + }, + { + "epoch": 0.3160117302052786, + "grad_norm": 1.550040656766428, + "learning_rate": 8.648766156975207e-06, + "loss": 0.658, + "step": 1347 + }, + { + "epoch": 0.3162463343108504, + "grad_norm": 2.026571473268327, + "learning_rate": 8.645964489342558e-06, + "loss": 0.5898, + "step": 1348 + }, + { + "epoch": 0.3164809384164223, + "grad_norm": 0.9336579973671697, + "learning_rate": 8.643160375005985e-06, + "loss": 0.6419, + "step": 1349 + }, + { + "epoch": 0.31671554252199413, + "grad_norm": 2.1600425115092596, + "learning_rate": 8.640353815847247e-06, + "loss": 0.6264, + "step": 1350 + }, + { + "epoch": 0.316950146627566, + "grad_norm": 1.2313394543528944, + "learning_rate": 8.637544813749747e-06, + "loss": 0.6003, + "step": 1351 + }, + { + "epoch": 0.31718475073313784, + "grad_norm": 1.007341668974774, + "learning_rate": 8.634733370598526e-06, + "loss": 0.6332, + "step": 1352 + }, + { + "epoch": 0.3174193548387097, + "grad_norm": 1.914352211483566, + "learning_rate": 8.631919488280267e-06, + "loss": 0.6323, + "step": 1353 + }, + { + "epoch": 0.3176539589442815, + "grad_norm": 0.92833216095689, + "learning_rate": 8.629103168683284e-06, + "loss": 0.6215, + "step": 1354 + }, + { + "epoch": 0.31788856304985336, + "grad_norm": 1.2857911810777851, + "learning_rate": 8.626284413697532e-06, + "loss": 0.6565, + "step": 1355 + }, + { + "epoch": 0.3181231671554252, + "grad_norm": 1.555815894161518, + "learning_rate": 8.623463225214597e-06, + "loss": 0.6787, + "step": 1356 + }, + { + "epoch": 0.31835777126099707, + "grad_norm": 2.0162124628364015, + "learning_rate": 8.620639605127698e-06, + "loss": 0.6358, + "step": 1357 + }, + { + "epoch": 0.3185923753665689, + "grad_norm": 0.8076909991521062, + "learning_rate": 8.61781355533169e-06, + "loss": 0.6373, + "step": 1358 + }, + { + "epoch": 0.3188269794721408, + "grad_norm": 2.0354027172745, + "learning_rate": 8.61498507772305e-06, + "loss": 0.6122, + "step": 1359 + }, + { + "epoch": 0.31906158357771264, + "grad_norm": 1.245336316860784, + "learning_rate": 8.612154174199896e-06, + "loss": 0.5933, + "step": 1360 + }, + { + "epoch": 0.31929618768328444, + "grad_norm": 3.5947107493839967, + "learning_rate": 8.60932084666196e-06, + "loss": 0.6302, + "step": 1361 + }, + { + "epoch": 0.3195307917888563, + "grad_norm": 3.39452963397432, + "learning_rate": 8.606485097010616e-06, + "loss": 0.6256, + "step": 1362 + }, + { + "epoch": 0.31976539589442815, + "grad_norm": 1.5234498781472734, + "learning_rate": 8.60364692714885e-06, + "loss": 0.6682, + "step": 1363 + }, + { + "epoch": 0.32, + "grad_norm": 1.5146001755726328, + "learning_rate": 8.60080633898128e-06, + "loss": 0.6822, + "step": 1364 + }, + { + "epoch": 0.32023460410557186, + "grad_norm": 2.0777388506099834, + "learning_rate": 8.597963334414146e-06, + "loss": 0.6053, + "step": 1365 + }, + { + "epoch": 0.3204692082111437, + "grad_norm": 2.8714871255271497, + "learning_rate": 8.595117915355304e-06, + "loss": 0.7123, + "step": 1366 + }, + { + "epoch": 0.3207038123167155, + "grad_norm": 0.9002279050923622, + "learning_rate": 8.592270083714238e-06, + "loss": 0.658, + "step": 1367 + }, + { + "epoch": 0.3209384164222874, + "grad_norm": 1.2911358484741762, + "learning_rate": 8.589419841402046e-06, + "loss": 0.6094, + "step": 1368 + }, + { + "epoch": 0.32117302052785923, + "grad_norm": 25.665582483281256, + "learning_rate": 8.586567190331446e-06, + "loss": 0.6069, + "step": 1369 + }, + { + "epoch": 0.3214076246334311, + "grad_norm": 1.537281364347152, + "learning_rate": 8.58371213241677e-06, + "loss": 0.5976, + "step": 1370 + }, + { + "epoch": 0.32164222873900294, + "grad_norm": 1.3845179576829354, + "learning_rate": 8.58085466957397e-06, + "loss": 0.6334, + "step": 1371 + }, + { + "epoch": 0.3218768328445748, + "grad_norm": 1.1386389309429865, + "learning_rate": 8.577994803720605e-06, + "loss": 0.7086, + "step": 1372 + }, + { + "epoch": 0.3221114369501466, + "grad_norm": 1.3254556899668493, + "learning_rate": 8.575132536775854e-06, + "loss": 0.5968, + "step": 1373 + }, + { + "epoch": 0.32234604105571846, + "grad_norm": 1.123212271302889, + "learning_rate": 8.572267870660502e-06, + "loss": 0.6763, + "step": 1374 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 1.930857001417153, + "learning_rate": 8.569400807296944e-06, + "loss": 0.6183, + "step": 1375 + }, + { + "epoch": 0.32281524926686217, + "grad_norm": 2.9025633792142815, + "learning_rate": 8.566531348609186e-06, + "loss": 0.6276, + "step": 1376 + }, + { + "epoch": 0.323049853372434, + "grad_norm": 1.3048918897875166, + "learning_rate": 8.563659496522843e-06, + "loss": 0.6092, + "step": 1377 + }, + { + "epoch": 0.3232844574780059, + "grad_norm": 10.034757630145917, + "learning_rate": 8.560785252965131e-06, + "loss": 0.6447, + "step": 1378 + }, + { + "epoch": 0.32351906158357774, + "grad_norm": 1.0461685899146798, + "learning_rate": 8.557908619864877e-06, + "loss": 0.6641, + "step": 1379 + }, + { + "epoch": 0.32375366568914954, + "grad_norm": 1.786751820197665, + "learning_rate": 8.555029599152504e-06, + "loss": 0.6361, + "step": 1380 + }, + { + "epoch": 0.3239882697947214, + "grad_norm": 1.2934920307732916, + "learning_rate": 8.552148192760047e-06, + "loss": 0.6363, + "step": 1381 + }, + { + "epoch": 0.32422287390029325, + "grad_norm": 1.0223073535206437, + "learning_rate": 8.549264402621133e-06, + "loss": 0.6551, + "step": 1382 + }, + { + "epoch": 0.3244574780058651, + "grad_norm": 3.149917754690008, + "learning_rate": 8.546378230670992e-06, + "loss": 0.5889, + "step": 1383 + }, + { + "epoch": 0.32469208211143696, + "grad_norm": 1.1804970129191603, + "learning_rate": 8.543489678846453e-06, + "loss": 0.6581, + "step": 1384 + }, + { + "epoch": 0.3249266862170088, + "grad_norm": 1.5082960885509378, + "learning_rate": 8.540598749085944e-06, + "loss": 0.6824, + "step": 1385 + }, + { + "epoch": 0.3251612903225806, + "grad_norm": 1.1248000804124532, + "learning_rate": 8.537705443329482e-06, + "loss": 0.6595, + "step": 1386 + }, + { + "epoch": 0.3253958944281525, + "grad_norm": 1.3529893497939975, + "learning_rate": 8.534809763518688e-06, + "loss": 0.643, + "step": 1387 + }, + { + "epoch": 0.32563049853372433, + "grad_norm": 1.4854156645555927, + "learning_rate": 8.531911711596767e-06, + "loss": 0.6337, + "step": 1388 + }, + { + "epoch": 0.3258651026392962, + "grad_norm": 1.4938945083987845, + "learning_rate": 8.529011289508522e-06, + "loss": 0.6729, + "step": 1389 + }, + { + "epoch": 0.32609970674486805, + "grad_norm": 2.454648325683254, + "learning_rate": 8.526108499200344e-06, + "loss": 0.6601, + "step": 1390 + }, + { + "epoch": 0.3263343108504399, + "grad_norm": 3.7530756452385896, + "learning_rate": 8.523203342620212e-06, + "loss": 0.6357, + "step": 1391 + }, + { + "epoch": 0.32656891495601176, + "grad_norm": 1.2082065032805829, + "learning_rate": 8.520295821717697e-06, + "loss": 0.6231, + "step": 1392 + }, + { + "epoch": 0.32680351906158356, + "grad_norm": 1.4691056719546882, + "learning_rate": 8.517385938443955e-06, + "loss": 0.6318, + "step": 1393 + }, + { + "epoch": 0.3270381231671554, + "grad_norm": 1.3820520549209452, + "learning_rate": 8.514473694751723e-06, + "loss": 0.6629, + "step": 1394 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 1.8581106263157248, + "learning_rate": 8.511559092595329e-06, + "loss": 0.5865, + "step": 1395 + }, + { + "epoch": 0.3275073313782991, + "grad_norm": 1.5517032644123216, + "learning_rate": 8.508642133930678e-06, + "loss": 0.6412, + "step": 1396 + }, + { + "epoch": 0.327741935483871, + "grad_norm": 0.9314890362404621, + "learning_rate": 8.50572282071526e-06, + "loss": 0.6346, + "step": 1397 + }, + { + "epoch": 0.32797653958944284, + "grad_norm": 4.83121516223516, + "learning_rate": 8.502801154908142e-06, + "loss": 0.6453, + "step": 1398 + }, + { + "epoch": 0.32821114369501464, + "grad_norm": 1.3559041143950146, + "learning_rate": 8.499877138469973e-06, + "loss": 0.6618, + "step": 1399 + }, + { + "epoch": 0.3284457478005865, + "grad_norm": 4.184967102088001, + "learning_rate": 8.49695077336298e-06, + "loss": 0.6231, + "step": 1400 + }, + { + "epoch": 0.32868035190615835, + "grad_norm": 1.0389706936140033, + "learning_rate": 8.49402206155096e-06, + "loss": 0.6215, + "step": 1401 + }, + { + "epoch": 0.3289149560117302, + "grad_norm": 1.0812713256722006, + "learning_rate": 8.491091004999293e-06, + "loss": 0.6327, + "step": 1402 + }, + { + "epoch": 0.32914956011730206, + "grad_norm": 1.9312825000734952, + "learning_rate": 8.488157605674924e-06, + "loss": 0.5882, + "step": 1403 + }, + { + "epoch": 0.3293841642228739, + "grad_norm": 1.357776340775692, + "learning_rate": 8.485221865546377e-06, + "loss": 0.6269, + "step": 1404 + }, + { + "epoch": 0.3296187683284457, + "grad_norm": 0.9904610631410483, + "learning_rate": 8.482283786583745e-06, + "loss": 0.6261, + "step": 1405 + }, + { + "epoch": 0.3298533724340176, + "grad_norm": 1.3350484411071832, + "learning_rate": 8.479343370758687e-06, + "loss": 0.6327, + "step": 1406 + }, + { + "epoch": 0.33008797653958943, + "grad_norm": 1.0406124091175053, + "learning_rate": 8.476400620044439e-06, + "loss": 0.6286, + "step": 1407 + }, + { + "epoch": 0.3303225806451613, + "grad_norm": 1.2423300727646929, + "learning_rate": 8.47345553641579e-06, + "loss": 0.6257, + "step": 1408 + }, + { + "epoch": 0.33055718475073315, + "grad_norm": 2.5098220016498884, + "learning_rate": 8.470508121849108e-06, + "loss": 0.6509, + "step": 1409 + }, + { + "epoch": 0.330791788856305, + "grad_norm": 1.7740633957474148, + "learning_rate": 8.46755837832232e-06, + "loss": 0.6194, + "step": 1410 + }, + { + "epoch": 0.33102639296187686, + "grad_norm": 1.1350055293410866, + "learning_rate": 8.46460630781491e-06, + "loss": 0.6068, + "step": 1411 + }, + { + "epoch": 0.33126099706744866, + "grad_norm": 0.9013414198812099, + "learning_rate": 8.461651912307937e-06, + "loss": 0.7072, + "step": 1412 + }, + { + "epoch": 0.3314956011730205, + "grad_norm": 1.4451292309717894, + "learning_rate": 8.45869519378401e-06, + "loss": 0.5997, + "step": 1413 + }, + { + "epoch": 0.33173020527859237, + "grad_norm": 1.042894858513527, + "learning_rate": 8.455736154227294e-06, + "loss": 0.6516, + "step": 1414 + }, + { + "epoch": 0.33196480938416423, + "grad_norm": 1.0817335775599264, + "learning_rate": 8.452774795623525e-06, + "loss": 0.5674, + "step": 1415 + }, + { + "epoch": 0.3321994134897361, + "grad_norm": 0.9306528221739203, + "learning_rate": 8.449811119959982e-06, + "loss": 0.5637, + "step": 1416 + }, + { + "epoch": 0.33243401759530794, + "grad_norm": 2.9259924025571524, + "learning_rate": 8.446845129225505e-06, + "loss": 0.6871, + "step": 1417 + }, + { + "epoch": 0.33266862170087974, + "grad_norm": 1.3116069619316164, + "learning_rate": 8.443876825410488e-06, + "loss": 0.6283, + "step": 1418 + }, + { + "epoch": 0.3329032258064516, + "grad_norm": 1.1151155004404432, + "learning_rate": 8.440906210506875e-06, + "loss": 0.6526, + "step": 1419 + }, + { + "epoch": 0.33313782991202345, + "grad_norm": 1.006631697880105, + "learning_rate": 8.437933286508162e-06, + "loss": 0.6675, + "step": 1420 + }, + { + "epoch": 0.3333724340175953, + "grad_norm": 2.812784011260699, + "learning_rate": 8.434958055409395e-06, + "loss": 0.5972, + "step": 1421 + }, + { + "epoch": 0.33360703812316717, + "grad_norm": 1.3126577072563264, + "learning_rate": 8.431980519207168e-06, + "loss": 0.5983, + "step": 1422 + }, + { + "epoch": 0.333841642228739, + "grad_norm": 2.6915182599167955, + "learning_rate": 8.42900067989962e-06, + "loss": 0.6569, + "step": 1423 + }, + { + "epoch": 0.3340762463343109, + "grad_norm": 1.2287047288297095, + "learning_rate": 8.426018539486437e-06, + "loss": 0.6355, + "step": 1424 + }, + { + "epoch": 0.3343108504398827, + "grad_norm": 1.4258173378604266, + "learning_rate": 8.423034099968851e-06, + "loss": 0.6191, + "step": 1425 + }, + { + "epoch": 0.33454545454545453, + "grad_norm": 1.4937674624905826, + "learning_rate": 8.420047363349636e-06, + "loss": 0.6695, + "step": 1426 + }, + { + "epoch": 0.3347800586510264, + "grad_norm": 1.0141852252485175, + "learning_rate": 8.417058331633102e-06, + "loss": 0.681, + "step": 1427 + }, + { + "epoch": 0.33501466275659825, + "grad_norm": 3.0511373158997768, + "learning_rate": 8.414067006825108e-06, + "loss": 0.657, + "step": 1428 + }, + { + "epoch": 0.3352492668621701, + "grad_norm": 5.300541328028981, + "learning_rate": 8.411073390933048e-06, + "loss": 0.5846, + "step": 1429 + }, + { + "epoch": 0.33548387096774196, + "grad_norm": 1.4936228127988724, + "learning_rate": 8.408077485965852e-06, + "loss": 0.6645, + "step": 1430 + }, + { + "epoch": 0.33571847507331376, + "grad_norm": 0.9734727974080785, + "learning_rate": 8.405079293933986e-06, + "loss": 0.6686, + "step": 1431 + }, + { + "epoch": 0.3359530791788856, + "grad_norm": 1.4529755582105603, + "learning_rate": 8.402078816849454e-06, + "loss": 0.6269, + "step": 1432 + }, + { + "epoch": 0.3361876832844575, + "grad_norm": 1.3421411041498725, + "learning_rate": 8.39907605672579e-06, + "loss": 0.6578, + "step": 1433 + }, + { + "epoch": 0.33642228739002933, + "grad_norm": 3.239032135699637, + "learning_rate": 8.396071015578064e-06, + "loss": 0.6525, + "step": 1434 + }, + { + "epoch": 0.3366568914956012, + "grad_norm": 1.9505286799182235, + "learning_rate": 8.393063695422872e-06, + "loss": 0.7155, + "step": 1435 + }, + { + "epoch": 0.33689149560117304, + "grad_norm": 1.6183739710877414, + "learning_rate": 8.390054098278341e-06, + "loss": 0.6401, + "step": 1436 + }, + { + "epoch": 0.33712609970674484, + "grad_norm": 2.4434525002884087, + "learning_rate": 8.38704222616413e-06, + "loss": 0.6546, + "step": 1437 + }, + { + "epoch": 0.3373607038123167, + "grad_norm": 2.9339990370893885, + "learning_rate": 8.384028081101419e-06, + "loss": 0.656, + "step": 1438 + }, + { + "epoch": 0.33759530791788855, + "grad_norm": 1.1336278724835074, + "learning_rate": 8.381011665112918e-06, + "loss": 0.6501, + "step": 1439 + }, + { + "epoch": 0.3378299120234604, + "grad_norm": 3.213719727493025, + "learning_rate": 8.377992980222856e-06, + "loss": 0.6522, + "step": 1440 + }, + { + "epoch": 0.33806451612903227, + "grad_norm": 0.892289646256161, + "learning_rate": 8.374972028456987e-06, + "loss": 0.6515, + "step": 1441 + }, + { + "epoch": 0.3382991202346041, + "grad_norm": 1.226685613357517, + "learning_rate": 8.371948811842589e-06, + "loss": 0.6187, + "step": 1442 + }, + { + "epoch": 0.338533724340176, + "grad_norm": 1.5404695990366168, + "learning_rate": 8.368923332408458e-06, + "loss": 0.6378, + "step": 1443 + }, + { + "epoch": 0.3387683284457478, + "grad_norm": 3.0407503667494975, + "learning_rate": 8.365895592184905e-06, + "loss": 0.6314, + "step": 1444 + }, + { + "epoch": 0.33900293255131964, + "grad_norm": 1.3866293223711708, + "learning_rate": 8.362865593203761e-06, + "loss": 0.6249, + "step": 1445 + }, + { + "epoch": 0.3392375366568915, + "grad_norm": 2.8038931976209054, + "learning_rate": 8.359833337498375e-06, + "loss": 0.6405, + "step": 1446 + }, + { + "epoch": 0.33947214076246335, + "grad_norm": 3.7934641081707556, + "learning_rate": 8.356798827103608e-06, + "loss": 0.5757, + "step": 1447 + }, + { + "epoch": 0.3397067448680352, + "grad_norm": 1.0649556642285551, + "learning_rate": 8.353762064055832e-06, + "loss": 0.618, + "step": 1448 + }, + { + "epoch": 0.33994134897360706, + "grad_norm": 0.9222634812989948, + "learning_rate": 8.350723050392937e-06, + "loss": 0.6281, + "step": 1449 + }, + { + "epoch": 0.34017595307917886, + "grad_norm": 1.5902921327280632, + "learning_rate": 8.347681788154316e-06, + "loss": 0.6306, + "step": 1450 + }, + { + "epoch": 0.3404105571847507, + "grad_norm": 1.1261301785362399, + "learning_rate": 8.344638279380875e-06, + "loss": 0.6863, + "step": 1451 + }, + { + "epoch": 0.3406451612903226, + "grad_norm": 7.724626258384448, + "learning_rate": 8.341592526115028e-06, + "loss": 0.5977, + "step": 1452 + }, + { + "epoch": 0.34087976539589443, + "grad_norm": 1.1253873255951106, + "learning_rate": 8.338544530400693e-06, + "loss": 0.6226, + "step": 1453 + }, + { + "epoch": 0.3411143695014663, + "grad_norm": 2.387143074516115, + "learning_rate": 8.335494294283295e-06, + "loss": 0.6078, + "step": 1454 + }, + { + "epoch": 0.34134897360703814, + "grad_norm": 1.0063514817449035, + "learning_rate": 8.33244181980976e-06, + "loss": 0.6459, + "step": 1455 + }, + { + "epoch": 0.34158357771261, + "grad_norm": 1.4947287816472277, + "learning_rate": 8.329387109028519e-06, + "loss": 0.607, + "step": 1456 + }, + { + "epoch": 0.3418181818181818, + "grad_norm": 1.0172627081928454, + "learning_rate": 8.326330163989501e-06, + "loss": 0.6815, + "step": 1457 + }, + { + "epoch": 0.34205278592375365, + "grad_norm": 2.27189937321112, + "learning_rate": 8.323270986744137e-06, + "loss": 0.6376, + "step": 1458 + }, + { + "epoch": 0.3422873900293255, + "grad_norm": 3.1671117988610327, + "learning_rate": 8.320209579345354e-06, + "loss": 0.6316, + "step": 1459 + }, + { + "epoch": 0.34252199413489737, + "grad_norm": 1.7703890703140879, + "learning_rate": 8.317145943847573e-06, + "loss": 0.6067, + "step": 1460 + }, + { + "epoch": 0.3427565982404692, + "grad_norm": 2.8224082887446182, + "learning_rate": 8.314080082306718e-06, + "loss": 0.6491, + "step": 1461 + }, + { + "epoch": 0.3429912023460411, + "grad_norm": 1.645283338929989, + "learning_rate": 8.3110119967802e-06, + "loss": 0.6197, + "step": 1462 + }, + { + "epoch": 0.3432258064516129, + "grad_norm": 3.3933543977296905, + "learning_rate": 8.307941689326926e-06, + "loss": 0.6727, + "step": 1463 + }, + { + "epoch": 0.34346041055718474, + "grad_norm": 1.1657356603464364, + "learning_rate": 8.30486916200729e-06, + "loss": 0.6484, + "step": 1464 + }, + { + "epoch": 0.3436950146627566, + "grad_norm": 3.8712679730499007, + "learning_rate": 8.301794416883182e-06, + "loss": 0.6499, + "step": 1465 + }, + { + "epoch": 0.34392961876832845, + "grad_norm": 2.070657817367473, + "learning_rate": 8.298717456017975e-06, + "loss": 0.6211, + "step": 1466 + }, + { + "epoch": 0.3441642228739003, + "grad_norm": 1.3603966816757713, + "learning_rate": 8.295638281476529e-06, + "loss": 0.6804, + "step": 1467 + }, + { + "epoch": 0.34439882697947216, + "grad_norm": 1.2553303332731176, + "learning_rate": 8.292556895325195e-06, + "loss": 0.5872, + "step": 1468 + }, + { + "epoch": 0.34463343108504396, + "grad_norm": 1.6785132104367784, + "learning_rate": 8.2894732996318e-06, + "loss": 0.6962, + "step": 1469 + }, + { + "epoch": 0.3448680351906158, + "grad_norm": 2.350729036974717, + "learning_rate": 8.28638749646566e-06, + "loss": 0.7025, + "step": 1470 + }, + { + "epoch": 0.3451026392961877, + "grad_norm": 1.08608432429923, + "learning_rate": 8.28329948789757e-06, + "loss": 0.5597, + "step": 1471 + }, + { + "epoch": 0.34533724340175953, + "grad_norm": 2.5023569034921884, + "learning_rate": 8.28020927599981e-06, + "loss": 0.6566, + "step": 1472 + }, + { + "epoch": 0.3455718475073314, + "grad_norm": 1.803515300768271, + "learning_rate": 8.277116862846127e-06, + "loss": 0.6396, + "step": 1473 + }, + { + "epoch": 0.34580645161290324, + "grad_norm": 1.7119582483094469, + "learning_rate": 8.274022250511755e-06, + "loss": 0.6224, + "step": 1474 + }, + { + "epoch": 0.3460410557184751, + "grad_norm": 1.7639707272829974, + "learning_rate": 8.270925441073402e-06, + "loss": 0.5942, + "step": 1475 + }, + { + "epoch": 0.3462756598240469, + "grad_norm": 1.1373456186692137, + "learning_rate": 8.26782643660925e-06, + "loss": 0.5606, + "step": 1476 + }, + { + "epoch": 0.34651026392961876, + "grad_norm": 3.2584651090339376, + "learning_rate": 8.264725239198952e-06, + "loss": 0.648, + "step": 1477 + }, + { + "epoch": 0.3467448680351906, + "grad_norm": 0.9893722075990332, + "learning_rate": 8.261621850923634e-06, + "loss": 0.6523, + "step": 1478 + }, + { + "epoch": 0.34697947214076247, + "grad_norm": 1.6595170976168387, + "learning_rate": 8.258516273865894e-06, + "loss": 0.6635, + "step": 1479 + }, + { + "epoch": 0.3472140762463343, + "grad_norm": 1.568496810851159, + "learning_rate": 8.255408510109793e-06, + "loss": 0.6594, + "step": 1480 + }, + { + "epoch": 0.3474486803519062, + "grad_norm": 0.9773769390727363, + "learning_rate": 8.252298561740868e-06, + "loss": 0.5897, + "step": 1481 + }, + { + "epoch": 0.347683284457478, + "grad_norm": 4.976287138083255, + "learning_rate": 8.249186430846116e-06, + "loss": 0.6633, + "step": 1482 + }, + { + "epoch": 0.34791788856304984, + "grad_norm": 1.0167181553948246, + "learning_rate": 8.246072119514002e-06, + "loss": 0.6101, + "step": 1483 + }, + { + "epoch": 0.3481524926686217, + "grad_norm": 1.4218757823242925, + "learning_rate": 8.242955629834449e-06, + "loss": 0.6224, + "step": 1484 + }, + { + "epoch": 0.34838709677419355, + "grad_norm": 1.1119203866797485, + "learning_rate": 8.239836963898849e-06, + "loss": 0.623, + "step": 1485 + }, + { + "epoch": 0.3486217008797654, + "grad_norm": 11.640160565456098, + "learning_rate": 8.236716123800048e-06, + "loss": 0.6668, + "step": 1486 + }, + { + "epoch": 0.34885630498533726, + "grad_norm": 1.6163206052113017, + "learning_rate": 8.233593111632354e-06, + "loss": 0.5945, + "step": 1487 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 1.4197707509539244, + "learning_rate": 8.230467929491533e-06, + "loss": 0.6855, + "step": 1488 + }, + { + "epoch": 0.3493255131964809, + "grad_norm": 1.5358864085512611, + "learning_rate": 8.227340579474808e-06, + "loss": 0.6359, + "step": 1489 + }, + { + "epoch": 0.3495601173020528, + "grad_norm": 1.1372195244716454, + "learning_rate": 8.224211063680854e-06, + "loss": 0.6406, + "step": 1490 + }, + { + "epoch": 0.34979472140762463, + "grad_norm": 1.2161043134269158, + "learning_rate": 8.2210793842098e-06, + "loss": 0.6546, + "step": 1491 + }, + { + "epoch": 0.3500293255131965, + "grad_norm": 2.4354306159481762, + "learning_rate": 8.217945543163231e-06, + "loss": 0.6596, + "step": 1492 + }, + { + "epoch": 0.35026392961876834, + "grad_norm": 1.1174853950270742, + "learning_rate": 8.214809542644173e-06, + "loss": 0.6939, + "step": 1493 + }, + { + "epoch": 0.3504985337243402, + "grad_norm": 1.6374231300300062, + "learning_rate": 8.211671384757114e-06, + "loss": 0.6197, + "step": 1494 + }, + { + "epoch": 0.350733137829912, + "grad_norm": 2.7910351323346734, + "learning_rate": 8.20853107160798e-06, + "loss": 0.6833, + "step": 1495 + }, + { + "epoch": 0.35096774193548386, + "grad_norm": 3.969565659824886, + "learning_rate": 8.205388605304147e-06, + "loss": 0.6587, + "step": 1496 + }, + { + "epoch": 0.3512023460410557, + "grad_norm": 1.4218637984942035, + "learning_rate": 8.202243987954435e-06, + "loss": 0.601, + "step": 1497 + }, + { + "epoch": 0.35143695014662757, + "grad_norm": 1.4265802208832241, + "learning_rate": 8.19909722166911e-06, + "loss": 0.6044, + "step": 1498 + }, + { + "epoch": 0.3516715542521994, + "grad_norm": 1.3131746883241227, + "learning_rate": 8.195948308559878e-06, + "loss": 0.6793, + "step": 1499 + }, + { + "epoch": 0.3519061583577713, + "grad_norm": 1.3038815650068798, + "learning_rate": 8.192797250739883e-06, + "loss": 0.6392, + "step": 1500 + }, + { + "epoch": 0.3519061583577713, + "eval_loss": 0.6344643235206604, + "eval_runtime": 25.3359, + "eval_samples_per_second": 21.55, + "eval_steps_per_second": 0.197, + "step": 1500 + }, + { + "epoch": 0.3521407624633431, + "grad_norm": 1.1798503529087203, + "learning_rate": 8.189644050323712e-06, + "loss": 0.657, + "step": 1501 + }, + { + "epoch": 0.35237536656891494, + "grad_norm": 2.438736384051315, + "learning_rate": 8.18648870942739e-06, + "loss": 0.676, + "step": 1502 + }, + { + "epoch": 0.3526099706744868, + "grad_norm": 1.3476462061493253, + "learning_rate": 8.183331230168376e-06, + "loss": 0.6299, + "step": 1503 + }, + { + "epoch": 0.35284457478005865, + "grad_norm": 1.4625706667923106, + "learning_rate": 8.180171614665568e-06, + "loss": 0.7143, + "step": 1504 + }, + { + "epoch": 0.3530791788856305, + "grad_norm": 3.1193412580407145, + "learning_rate": 8.177009865039293e-06, + "loss": 0.6125, + "step": 1505 + }, + { + "epoch": 0.35331378299120236, + "grad_norm": 2.8272425724286316, + "learning_rate": 8.173845983411309e-06, + "loss": 0.6497, + "step": 1506 + }, + { + "epoch": 0.3535483870967742, + "grad_norm": 2.982948695040381, + "learning_rate": 8.170679971904813e-06, + "loss": 0.6559, + "step": 1507 + }, + { + "epoch": 0.353782991202346, + "grad_norm": 9.668237178820712, + "learning_rate": 8.167511832644423e-06, + "loss": 0.6642, + "step": 1508 + }, + { + "epoch": 0.3540175953079179, + "grad_norm": 1.0092042078247905, + "learning_rate": 8.164341567756189e-06, + "loss": 0.6357, + "step": 1509 + }, + { + "epoch": 0.35425219941348973, + "grad_norm": 1.6381656479825193, + "learning_rate": 8.161169179367584e-06, + "loss": 0.6943, + "step": 1510 + }, + { + "epoch": 0.3544868035190616, + "grad_norm": 1.4139566760854536, + "learning_rate": 8.157994669607511e-06, + "loss": 0.6653, + "step": 1511 + }, + { + "epoch": 0.35472140762463344, + "grad_norm": 1.54224733346737, + "learning_rate": 8.154818040606293e-06, + "loss": 0.6183, + "step": 1512 + }, + { + "epoch": 0.3549560117302053, + "grad_norm": 1.1796702378019979, + "learning_rate": 8.151639294495678e-06, + "loss": 0.6607, + "step": 1513 + }, + { + "epoch": 0.3551906158357771, + "grad_norm": 1.1342868420745096, + "learning_rate": 8.148458433408831e-06, + "loss": 0.5884, + "step": 1514 + }, + { + "epoch": 0.35542521994134896, + "grad_norm": 5.80310803368655, + "learning_rate": 8.145275459480337e-06, + "loss": 0.628, + "step": 1515 + }, + { + "epoch": 0.3556598240469208, + "grad_norm": 2.0126247757965974, + "learning_rate": 8.142090374846202e-06, + "loss": 0.6126, + "step": 1516 + }, + { + "epoch": 0.35589442815249267, + "grad_norm": 1.0570035225393455, + "learning_rate": 8.138903181643843e-06, + "loss": 0.6033, + "step": 1517 + }, + { + "epoch": 0.3561290322580645, + "grad_norm": 2.355084585207633, + "learning_rate": 8.135713882012102e-06, + "loss": 0.6349, + "step": 1518 + }, + { + "epoch": 0.3563636363636364, + "grad_norm": 1.1347525348726937, + "learning_rate": 8.13252247809122e-06, + "loss": 0.6547, + "step": 1519 + }, + { + "epoch": 0.35659824046920824, + "grad_norm": 2.9768106128415597, + "learning_rate": 8.129328972022866e-06, + "loss": 0.6146, + "step": 1520 + }, + { + "epoch": 0.35683284457478004, + "grad_norm": 2.2894577730384627, + "learning_rate": 8.126133365950107e-06, + "loss": 0.5932, + "step": 1521 + }, + { + "epoch": 0.3570674486803519, + "grad_norm": 0.8144632935696673, + "learning_rate": 8.122935662017427e-06, + "loss": 0.6158, + "step": 1522 + }, + { + "epoch": 0.35730205278592375, + "grad_norm": 1.5392163258347284, + "learning_rate": 8.11973586237071e-06, + "loss": 0.6243, + "step": 1523 + }, + { + "epoch": 0.3575366568914956, + "grad_norm": 1.374231087459184, + "learning_rate": 8.116533969157256e-06, + "loss": 0.5796, + "step": 1524 + }, + { + "epoch": 0.35777126099706746, + "grad_norm": 1.0180417289839192, + "learning_rate": 8.113329984525765e-06, + "loss": 0.6423, + "step": 1525 + }, + { + "epoch": 0.3580058651026393, + "grad_norm": 4.482669849828746, + "learning_rate": 8.110123910626338e-06, + "loss": 0.6423, + "step": 1526 + }, + { + "epoch": 0.3582404692082111, + "grad_norm": 0.9880813861443274, + "learning_rate": 8.106915749610482e-06, + "loss": 0.6009, + "step": 1527 + }, + { + "epoch": 0.358475073313783, + "grad_norm": 0.9870708942427716, + "learning_rate": 8.103705503631103e-06, + "loss": 0.6134, + "step": 1528 + }, + { + "epoch": 0.35870967741935483, + "grad_norm": 1.014592280258817, + "learning_rate": 8.100493174842509e-06, + "loss": 0.6116, + "step": 1529 + }, + { + "epoch": 0.3589442815249267, + "grad_norm": 4.905569634351141, + "learning_rate": 8.0972787654004e-06, + "loss": 0.6067, + "step": 1530 + }, + { + "epoch": 0.35917888563049855, + "grad_norm": 2.0453894867981592, + "learning_rate": 8.094062277461878e-06, + "loss": 0.568, + "step": 1531 + }, + { + "epoch": 0.3594134897360704, + "grad_norm": 1.1057508767783957, + "learning_rate": 8.090843713185434e-06, + "loss": 0.617, + "step": 1532 + }, + { + "epoch": 0.3596480938416422, + "grad_norm": 1.4426599564301568, + "learning_rate": 8.08762307473096e-06, + "loss": 0.6444, + "step": 1533 + }, + { + "epoch": 0.35988269794721406, + "grad_norm": 1.4925289383756144, + "learning_rate": 8.084400364259733e-06, + "loss": 0.6422, + "step": 1534 + }, + { + "epoch": 0.3601173020527859, + "grad_norm": 1.288728080259212, + "learning_rate": 8.081175583934424e-06, + "loss": 0.6437, + "step": 1535 + }, + { + "epoch": 0.36035190615835777, + "grad_norm": 1.4617074887411599, + "learning_rate": 8.07794873591909e-06, + "loss": 0.6511, + "step": 1536 + }, + { + "epoch": 0.3605865102639296, + "grad_norm": 1.6179956711475658, + "learning_rate": 8.07471982237918e-06, + "loss": 0.6262, + "step": 1537 + }, + { + "epoch": 0.3608211143695015, + "grad_norm": 1.372859733146479, + "learning_rate": 8.071488845481528e-06, + "loss": 0.6267, + "step": 1538 + }, + { + "epoch": 0.36105571847507334, + "grad_norm": 2.163325222536646, + "learning_rate": 8.068255807394347e-06, + "loss": 0.5942, + "step": 1539 + }, + { + "epoch": 0.36129032258064514, + "grad_norm": 3.2258330889492237, + "learning_rate": 8.065020710287241e-06, + "loss": 0.6163, + "step": 1540 + }, + { + "epoch": 0.361524926686217, + "grad_norm": 0.8290572913019899, + "learning_rate": 8.061783556331194e-06, + "loss": 0.5965, + "step": 1541 + }, + { + "epoch": 0.36175953079178885, + "grad_norm": 2.5995923724601924, + "learning_rate": 8.058544347698564e-06, + "loss": 0.6257, + "step": 1542 + }, + { + "epoch": 0.3619941348973607, + "grad_norm": 9.87904279997343, + "learning_rate": 8.055303086563095e-06, + "loss": 0.6675, + "step": 1543 + }, + { + "epoch": 0.36222873900293256, + "grad_norm": 1.6409661285489316, + "learning_rate": 8.052059775099907e-06, + "loss": 0.6791, + "step": 1544 + }, + { + "epoch": 0.3624633431085044, + "grad_norm": 1.4620368708476654, + "learning_rate": 8.048814415485493e-06, + "loss": 0.6084, + "step": 1545 + }, + { + "epoch": 0.3626979472140762, + "grad_norm": 1.2519698876941296, + "learning_rate": 8.045567009897723e-06, + "loss": 0.6483, + "step": 1546 + }, + { + "epoch": 0.3629325513196481, + "grad_norm": 3.564124817180489, + "learning_rate": 8.042317560515839e-06, + "loss": 0.6583, + "step": 1547 + }, + { + "epoch": 0.36316715542521993, + "grad_norm": 3.4346833491965683, + "learning_rate": 8.039066069520455e-06, + "loss": 0.6741, + "step": 1548 + }, + { + "epoch": 0.3634017595307918, + "grad_norm": 1.379115222500377, + "learning_rate": 8.035812539093557e-06, + "loss": 0.6442, + "step": 1549 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 1.9001860505314034, + "learning_rate": 8.032556971418495e-06, + "loss": 0.6556, + "step": 1550 + }, + { + "epoch": 0.3638709677419355, + "grad_norm": 1.991770848217314, + "learning_rate": 8.029299368679988e-06, + "loss": 0.6216, + "step": 1551 + }, + { + "epoch": 0.36410557184750736, + "grad_norm": 2.623481501550087, + "learning_rate": 8.026039733064121e-06, + "loss": 0.5739, + "step": 1552 + }, + { + "epoch": 0.36434017595307916, + "grad_norm": 1.0861583032719995, + "learning_rate": 8.022778066758348e-06, + "loss": 0.6348, + "step": 1553 + }, + { + "epoch": 0.364574780058651, + "grad_norm": 1.1200382803600062, + "learning_rate": 8.019514371951476e-06, + "loss": 0.6584, + "step": 1554 + }, + { + "epoch": 0.36480938416422287, + "grad_norm": 1.4113035236318554, + "learning_rate": 8.016248650833682e-06, + "loss": 0.6396, + "step": 1555 + }, + { + "epoch": 0.36504398826979473, + "grad_norm": 0.9922164964676499, + "learning_rate": 8.012980905596498e-06, + "loss": 0.5767, + "step": 1556 + }, + { + "epoch": 0.3652785923753666, + "grad_norm": 3.00133939894505, + "learning_rate": 8.009711138432816e-06, + "loss": 0.6543, + "step": 1557 + }, + { + "epoch": 0.36551319648093844, + "grad_norm": 2.011321682364616, + "learning_rate": 8.006439351536884e-06, + "loss": 0.6958, + "step": 1558 + }, + { + "epoch": 0.36574780058651024, + "grad_norm": 1.148729790715129, + "learning_rate": 8.003165547104306e-06, + "loss": 0.6776, + "step": 1559 + }, + { + "epoch": 0.3659824046920821, + "grad_norm": 1.355040178949832, + "learning_rate": 7.99988972733204e-06, + "loss": 0.6119, + "step": 1560 + }, + { + "epoch": 0.36621700879765395, + "grad_norm": 1.5592633596308831, + "learning_rate": 7.996611894418395e-06, + "loss": 0.6365, + "step": 1561 + }, + { + "epoch": 0.3664516129032258, + "grad_norm": 1.112529982670453, + "learning_rate": 7.993332050563035e-06, + "loss": 0.6412, + "step": 1562 + }, + { + "epoch": 0.36668621700879767, + "grad_norm": 2.148567458438155, + "learning_rate": 7.99005019796697e-06, + "loss": 0.6052, + "step": 1563 + }, + { + "epoch": 0.3669208211143695, + "grad_norm": 2.4879674832876835, + "learning_rate": 7.986766338832557e-06, + "loss": 0.644, + "step": 1564 + }, + { + "epoch": 0.3671554252199413, + "grad_norm": 1.1792942166911304, + "learning_rate": 7.9834804753635e-06, + "loss": 0.65, + "step": 1565 + }, + { + "epoch": 0.3673900293255132, + "grad_norm": 1.9843919849905691, + "learning_rate": 7.980192609764853e-06, + "loss": 0.6528, + "step": 1566 + }, + { + "epoch": 0.36762463343108503, + "grad_norm": 1.095690126906237, + "learning_rate": 7.976902744243009e-06, + "loss": 0.6004, + "step": 1567 + }, + { + "epoch": 0.3678592375366569, + "grad_norm": 2.2214188096716363, + "learning_rate": 7.973610881005702e-06, + "loss": 0.6527, + "step": 1568 + }, + { + "epoch": 0.36809384164222875, + "grad_norm": 1.1484385025704285, + "learning_rate": 7.97031702226201e-06, + "loss": 0.6095, + "step": 1569 + }, + { + "epoch": 0.3683284457478006, + "grad_norm": 0.9056842858340077, + "learning_rate": 7.96702117022235e-06, + "loss": 0.6455, + "step": 1570 + }, + { + "epoch": 0.36856304985337246, + "grad_norm": 1.2334406714572204, + "learning_rate": 7.963723327098472e-06, + "loss": 0.588, + "step": 1571 + }, + { + "epoch": 0.36879765395894426, + "grad_norm": 2.2468236956839895, + "learning_rate": 7.960423495103467e-06, + "loss": 0.6049, + "step": 1572 + }, + { + "epoch": 0.3690322580645161, + "grad_norm": 1.0812342497919107, + "learning_rate": 7.95712167645176e-06, + "loss": 0.6584, + "step": 1573 + }, + { + "epoch": 0.369266862170088, + "grad_norm": 1.4252390275384497, + "learning_rate": 7.953817873359107e-06, + "loss": 0.6417, + "step": 1574 + }, + { + "epoch": 0.36950146627565983, + "grad_norm": 2.5435387288312916, + "learning_rate": 7.950512088042599e-06, + "loss": 0.6015, + "step": 1575 + }, + { + "epoch": 0.3697360703812317, + "grad_norm": 4.8083615188042375, + "learning_rate": 7.947204322720654e-06, + "loss": 0.6651, + "step": 1576 + }, + { + "epoch": 0.36997067448680354, + "grad_norm": 1.8753561698859949, + "learning_rate": 7.94389457961302e-06, + "loss": 0.6538, + "step": 1577 + }, + { + "epoch": 0.37020527859237534, + "grad_norm": 1.808277657054618, + "learning_rate": 7.940582860940771e-06, + "loss": 0.6536, + "step": 1578 + }, + { + "epoch": 0.3704398826979472, + "grad_norm": 1.285649125514911, + "learning_rate": 7.937269168926311e-06, + "loss": 0.611, + "step": 1579 + }, + { + "epoch": 0.37067448680351905, + "grad_norm": 1.0337780987198115, + "learning_rate": 7.933953505793363e-06, + "loss": 0.6397, + "step": 1580 + }, + { + "epoch": 0.3709090909090909, + "grad_norm": 1.4439969739824046, + "learning_rate": 7.930635873766976e-06, + "loss": 0.6172, + "step": 1581 + }, + { + "epoch": 0.37114369501466277, + "grad_norm": 3.425596650950812, + "learning_rate": 7.92731627507352e-06, + "loss": 0.6754, + "step": 1582 + }, + { + "epoch": 0.3713782991202346, + "grad_norm": 1.1319720004327498, + "learning_rate": 7.923994711940682e-06, + "loss": 0.6371, + "step": 1583 + }, + { + "epoch": 0.3716129032258065, + "grad_norm": 1.943792921836966, + "learning_rate": 7.92067118659747e-06, + "loss": 0.6757, + "step": 1584 + }, + { + "epoch": 0.3718475073313783, + "grad_norm": 1.1546751688129961, + "learning_rate": 7.917345701274208e-06, + "loss": 0.6136, + "step": 1585 + }, + { + "epoch": 0.37208211143695014, + "grad_norm": 1.341873182140012, + "learning_rate": 7.914018258202536e-06, + "loss": 0.6147, + "step": 1586 + }, + { + "epoch": 0.372316715542522, + "grad_norm": 1.8473105605190465, + "learning_rate": 7.910688859615407e-06, + "loss": 0.6845, + "step": 1587 + }, + { + "epoch": 0.37255131964809385, + "grad_norm": 1.1531135465554283, + "learning_rate": 7.907357507747087e-06, + "loss": 0.6211, + "step": 1588 + }, + { + "epoch": 0.3727859237536657, + "grad_norm": 1.4230491981622588, + "learning_rate": 7.904024204833151e-06, + "loss": 0.6462, + "step": 1589 + }, + { + "epoch": 0.37302052785923756, + "grad_norm": 1.0823600490098797, + "learning_rate": 7.900688953110484e-06, + "loss": 0.7231, + "step": 1590 + }, + { + "epoch": 0.37325513196480936, + "grad_norm": 2.3049618171224924, + "learning_rate": 7.897351754817283e-06, + "loss": 0.5903, + "step": 1591 + }, + { + "epoch": 0.3734897360703812, + "grad_norm": 2.3037301234266225, + "learning_rate": 7.894012612193044e-06, + "loss": 0.6455, + "step": 1592 + }, + { + "epoch": 0.3737243401759531, + "grad_norm": 1.4791819697214155, + "learning_rate": 7.890671527478575e-06, + "loss": 0.6001, + "step": 1593 + }, + { + "epoch": 0.37395894428152493, + "grad_norm": 7.0775849390416905, + "learning_rate": 7.887328502915978e-06, + "loss": 0.6076, + "step": 1594 + }, + { + "epoch": 0.3741935483870968, + "grad_norm": 0.9013415258414034, + "learning_rate": 7.883983540748667e-06, + "loss": 0.6262, + "step": 1595 + }, + { + "epoch": 0.37442815249266864, + "grad_norm": 1.5643594979656377, + "learning_rate": 7.880636643221352e-06, + "loss": 0.5712, + "step": 1596 + }, + { + "epoch": 0.37466275659824044, + "grad_norm": 0.9350115266763178, + "learning_rate": 7.877287812580039e-06, + "loss": 0.6307, + "step": 1597 + }, + { + "epoch": 0.3748973607038123, + "grad_norm": 1.4591698399746185, + "learning_rate": 7.873937051072037e-06, + "loss": 0.6154, + "step": 1598 + }, + { + "epoch": 0.37513196480938416, + "grad_norm": 4.371467307462957, + "learning_rate": 7.870584360945942e-06, + "loss": 0.6159, + "step": 1599 + }, + { + "epoch": 0.375366568914956, + "grad_norm": 0.9708133122154972, + "learning_rate": 7.867229744451656e-06, + "loss": 0.6283, + "step": 1600 + }, + { + "epoch": 0.37560117302052787, + "grad_norm": 6.681577787074119, + "learning_rate": 7.863873203840361e-06, + "loss": 0.5997, + "step": 1601 + }, + { + "epoch": 0.3758357771260997, + "grad_norm": 1.1144910197496753, + "learning_rate": 7.860514741364544e-06, + "loss": 0.6308, + "step": 1602 + }, + { + "epoch": 0.3760703812316716, + "grad_norm": 1.0968215471761782, + "learning_rate": 7.857154359277972e-06, + "loss": 0.65, + "step": 1603 + }, + { + "epoch": 0.3763049853372434, + "grad_norm": 1.1814685587763383, + "learning_rate": 7.853792059835699e-06, + "loss": 0.6059, + "step": 1604 + }, + { + "epoch": 0.37653958944281524, + "grad_norm": 1.4699600186850428, + "learning_rate": 7.850427845294072e-06, + "loss": 0.6263, + "step": 1605 + }, + { + "epoch": 0.3767741935483871, + "grad_norm": 2.9306878918291726, + "learning_rate": 7.847061717910721e-06, + "loss": 0.5915, + "step": 1606 + }, + { + "epoch": 0.37700879765395895, + "grad_norm": 2.4936041636247546, + "learning_rate": 7.84369367994456e-06, + "loss": 0.6232, + "step": 1607 + }, + { + "epoch": 0.3772434017595308, + "grad_norm": 1.1124911836527542, + "learning_rate": 7.84032373365578e-06, + "loss": 0.6242, + "step": 1608 + }, + { + "epoch": 0.37747800586510266, + "grad_norm": 1.8757080299496558, + "learning_rate": 7.836951881305861e-06, + "loss": 0.6032, + "step": 1609 + }, + { + "epoch": 0.37771260997067446, + "grad_norm": 1.382837018463846, + "learning_rate": 7.833578125157557e-06, + "loss": 0.6367, + "step": 1610 + }, + { + "epoch": 0.3779472140762463, + "grad_norm": 1.4112673324084646, + "learning_rate": 7.830202467474899e-06, + "loss": 0.598, + "step": 1611 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 1.531733847545577, + "learning_rate": 7.826824910523198e-06, + "loss": 0.6557, + "step": 1612 + }, + { + "epoch": 0.37841642228739003, + "grad_norm": 0.9857359431378977, + "learning_rate": 7.823445456569036e-06, + "loss": 0.6731, + "step": 1613 + }, + { + "epoch": 0.3786510263929619, + "grad_norm": 1.201691042717365, + "learning_rate": 7.820064107880269e-06, + "loss": 0.6205, + "step": 1614 + }, + { + "epoch": 0.37888563049853374, + "grad_norm": 1.2901505532607962, + "learning_rate": 7.816680866726026e-06, + "loss": 0.6209, + "step": 1615 + }, + { + "epoch": 0.3791202346041056, + "grad_norm": 2.330451785881743, + "learning_rate": 7.813295735376703e-06, + "loss": 0.6262, + "step": 1616 + }, + { + "epoch": 0.3793548387096774, + "grad_norm": 1.6877055527703941, + "learning_rate": 7.809908716103966e-06, + "loss": 0.5826, + "step": 1617 + }, + { + "epoch": 0.37958944281524926, + "grad_norm": 1.138528263641915, + "learning_rate": 7.80651981118075e-06, + "loss": 0.6413, + "step": 1618 + }, + { + "epoch": 0.3798240469208211, + "grad_norm": 1.2468604622779627, + "learning_rate": 7.803129022881253e-06, + "loss": 0.6271, + "step": 1619 + }, + { + "epoch": 0.38005865102639297, + "grad_norm": 5.764192010514738, + "learning_rate": 7.799736353480936e-06, + "loss": 0.6412, + "step": 1620 + }, + { + "epoch": 0.3802932551319648, + "grad_norm": 2.198130873342226, + "learning_rate": 7.796341805256526e-06, + "loss": 0.647, + "step": 1621 + }, + { + "epoch": 0.3805278592375367, + "grad_norm": 1.3737147381989245, + "learning_rate": 7.792945380486005e-06, + "loss": 0.6073, + "step": 1622 + }, + { + "epoch": 0.3807624633431085, + "grad_norm": 1.3465757089659696, + "learning_rate": 7.789547081448622e-06, + "loss": 0.5948, + "step": 1623 + }, + { + "epoch": 0.38099706744868034, + "grad_norm": 2.4384398498095523, + "learning_rate": 7.786146910424877e-06, + "loss": 0.6782, + "step": 1624 + }, + { + "epoch": 0.3812316715542522, + "grad_norm": 1.0005342089499756, + "learning_rate": 7.78274486969653e-06, + "loss": 0.6777, + "step": 1625 + }, + { + "epoch": 0.38146627565982405, + "grad_norm": 1.0195375728278713, + "learning_rate": 7.779340961546594e-06, + "loss": 0.6448, + "step": 1626 + }, + { + "epoch": 0.3817008797653959, + "grad_norm": 4.2991384194269715, + "learning_rate": 7.775935188259336e-06, + "loss": 0.6258, + "step": 1627 + }, + { + "epoch": 0.38193548387096776, + "grad_norm": 3.1122548758651725, + "learning_rate": 7.772527552120274e-06, + "loss": 0.5896, + "step": 1628 + }, + { + "epoch": 0.38217008797653956, + "grad_norm": 1.1934580454028738, + "learning_rate": 7.769118055416176e-06, + "loss": 0.6182, + "step": 1629 + }, + { + "epoch": 0.3824046920821114, + "grad_norm": 1.1892450568789514, + "learning_rate": 7.76570670043506e-06, + "loss": 0.6554, + "step": 1630 + }, + { + "epoch": 0.3826392961876833, + "grad_norm": 1.7316044437097646, + "learning_rate": 7.76229348946619e-06, + "loss": 0.5997, + "step": 1631 + }, + { + "epoch": 0.38287390029325513, + "grad_norm": 4.740697767650133, + "learning_rate": 7.758878424800079e-06, + "loss": 0.656, + "step": 1632 + }, + { + "epoch": 0.383108504398827, + "grad_norm": 2.197652151884163, + "learning_rate": 7.755461508728473e-06, + "loss": 0.6249, + "step": 1633 + }, + { + "epoch": 0.38334310850439884, + "grad_norm": 2.0028422669624204, + "learning_rate": 7.752042743544372e-06, + "loss": 0.6537, + "step": 1634 + }, + { + "epoch": 0.3835777126099707, + "grad_norm": 1.4258853071807955, + "learning_rate": 7.748622131542015e-06, + "loss": 0.6232, + "step": 1635 + }, + { + "epoch": 0.3838123167155425, + "grad_norm": 1.1532078913814092, + "learning_rate": 7.745199675016877e-06, + "loss": 0.6394, + "step": 1636 + }, + { + "epoch": 0.38404692082111436, + "grad_norm": 1.5845757651003243, + "learning_rate": 7.741775376265668e-06, + "loss": 0.6369, + "step": 1637 + }, + { + "epoch": 0.3842815249266862, + "grad_norm": 1.6104569009493135, + "learning_rate": 7.738349237586344e-06, + "loss": 0.6562, + "step": 1638 + }, + { + "epoch": 0.38451612903225807, + "grad_norm": 1.9615611064832639, + "learning_rate": 7.734921261278087e-06, + "loss": 0.6212, + "step": 1639 + }, + { + "epoch": 0.3847507331378299, + "grad_norm": 0.9429567174903976, + "learning_rate": 7.731491449641315e-06, + "loss": 0.6223, + "step": 1640 + }, + { + "epoch": 0.3849853372434018, + "grad_norm": 1.8340961559064877, + "learning_rate": 7.72805980497768e-06, + "loss": 0.6296, + "step": 1641 + }, + { + "epoch": 0.3852199413489736, + "grad_norm": 1.1021759982037063, + "learning_rate": 7.724626329590059e-06, + "loss": 0.5946, + "step": 1642 + }, + { + "epoch": 0.38545454545454544, + "grad_norm": 1.4884445571627638, + "learning_rate": 7.721191025782563e-06, + "loss": 0.6015, + "step": 1643 + }, + { + "epoch": 0.3856891495601173, + "grad_norm": 3.226927547681297, + "learning_rate": 7.717753895860526e-06, + "loss": 0.6333, + "step": 1644 + }, + { + "epoch": 0.38592375366568915, + "grad_norm": 2.472600552120965, + "learning_rate": 7.71431494213051e-06, + "loss": 0.6093, + "step": 1645 + }, + { + "epoch": 0.386158357771261, + "grad_norm": 10.70624866986832, + "learning_rate": 7.7108741669003e-06, + "loss": 0.6136, + "step": 1646 + }, + { + "epoch": 0.38639296187683286, + "grad_norm": 1.2892356614800278, + "learning_rate": 7.707431572478901e-06, + "loss": 0.6365, + "step": 1647 + }, + { + "epoch": 0.3866275659824047, + "grad_norm": 6.168935108805046, + "learning_rate": 7.703987161176545e-06, + "loss": 0.6352, + "step": 1648 + }, + { + "epoch": 0.3868621700879765, + "grad_norm": 3.8592196027463257, + "learning_rate": 7.700540935304675e-06, + "loss": 0.6521, + "step": 1649 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.9715944124891208, + "learning_rate": 7.697092897175957e-06, + "loss": 0.6265, + "step": 1650 + }, + { + "epoch": 0.38733137829912023, + "grad_norm": 2.5991287683663247, + "learning_rate": 7.693643049104273e-06, + "loss": 0.5708, + "step": 1651 + }, + { + "epoch": 0.3875659824046921, + "grad_norm": 1.9448631815409265, + "learning_rate": 7.69019139340472e-06, + "loss": 0.6598, + "step": 1652 + }, + { + "epoch": 0.38780058651026394, + "grad_norm": 1.6160675968543425, + "learning_rate": 7.686737932393606e-06, + "loss": 0.7182, + "step": 1653 + }, + { + "epoch": 0.3880351906158358, + "grad_norm": 2.391412577308862, + "learning_rate": 7.68328266838845e-06, + "loss": 0.5988, + "step": 1654 + }, + { + "epoch": 0.3882697947214076, + "grad_norm": 3.369471487906246, + "learning_rate": 7.679825603707982e-06, + "loss": 0.6186, + "step": 1655 + }, + { + "epoch": 0.38850439882697946, + "grad_norm": 1.7192333949644023, + "learning_rate": 7.676366740672142e-06, + "loss": 0.6218, + "step": 1656 + }, + { + "epoch": 0.3887390029325513, + "grad_norm": 2.3388067836375774, + "learning_rate": 7.672906081602074e-06, + "loss": 0.709, + "step": 1657 + }, + { + "epoch": 0.38897360703812317, + "grad_norm": 1.4123309427471646, + "learning_rate": 7.66944362882013e-06, + "loss": 0.6599, + "step": 1658 + }, + { + "epoch": 0.389208211143695, + "grad_norm": 1.213105225305302, + "learning_rate": 7.665979384649864e-06, + "loss": 0.6336, + "step": 1659 + }, + { + "epoch": 0.3894428152492669, + "grad_norm": 1.873253318955163, + "learning_rate": 7.662513351416034e-06, + "loss": 0.6167, + "step": 1660 + }, + { + "epoch": 0.3896774193548387, + "grad_norm": 1.4229359040353953, + "learning_rate": 7.659045531444596e-06, + "loss": 0.6538, + "step": 1661 + }, + { + "epoch": 0.38991202346041054, + "grad_norm": 3.2815731166981763, + "learning_rate": 7.655575927062704e-06, + "loss": 0.6587, + "step": 1662 + }, + { + "epoch": 0.3901466275659824, + "grad_norm": 1.5959250089933368, + "learning_rate": 7.652104540598712e-06, + "loss": 0.6943, + "step": 1663 + }, + { + "epoch": 0.39038123167155425, + "grad_norm": 1.4753200296899938, + "learning_rate": 7.648631374382174e-06, + "loss": 0.6219, + "step": 1664 + }, + { + "epoch": 0.3906158357771261, + "grad_norm": 1.1684194218999984, + "learning_rate": 7.64515643074383e-06, + "loss": 0.6596, + "step": 1665 + }, + { + "epoch": 0.39085043988269796, + "grad_norm": 3.957667986106087, + "learning_rate": 7.641679712015617e-06, + "loss": 0.6477, + "step": 1666 + }, + { + "epoch": 0.3910850439882698, + "grad_norm": 1.4101555559607137, + "learning_rate": 7.638201220530664e-06, + "loss": 0.6176, + "step": 1667 + }, + { + "epoch": 0.3913196480938416, + "grad_norm": 2.0687281157254707, + "learning_rate": 7.634720958623287e-06, + "loss": 0.6814, + "step": 1668 + }, + { + "epoch": 0.3915542521994135, + "grad_norm": 1.7861152334922592, + "learning_rate": 7.631238928628994e-06, + "loss": 0.6347, + "step": 1669 + }, + { + "epoch": 0.39178885630498533, + "grad_norm": 0.8241628931940701, + "learning_rate": 7.6277551328844755e-06, + "loss": 0.562, + "step": 1670 + }, + { + "epoch": 0.3920234604105572, + "grad_norm": 6.179110968857805, + "learning_rate": 7.6242695737276086e-06, + "loss": 0.6288, + "step": 1671 + }, + { + "epoch": 0.39225806451612905, + "grad_norm": 1.6377758236012427, + "learning_rate": 7.620782253497453e-06, + "loss": 0.6214, + "step": 1672 + }, + { + "epoch": 0.3924926686217009, + "grad_norm": 1.4804539440002633, + "learning_rate": 7.617293174534252e-06, + "loss": 0.6045, + "step": 1673 + }, + { + "epoch": 0.3927272727272727, + "grad_norm": 1.1820224477276267, + "learning_rate": 7.613802339179428e-06, + "loss": 0.6143, + "step": 1674 + }, + { + "epoch": 0.39296187683284456, + "grad_norm": 1.6663101843072536, + "learning_rate": 7.610309749775583e-06, + "loss": 0.6225, + "step": 1675 + }, + { + "epoch": 0.3931964809384164, + "grad_norm": 1.2223395736441116, + "learning_rate": 7.606815408666493e-06, + "loss": 0.5776, + "step": 1676 + }, + { + "epoch": 0.39343108504398827, + "grad_norm": 6.289964741746519, + "learning_rate": 7.6033193181971136e-06, + "loss": 0.7023, + "step": 1677 + }, + { + "epoch": 0.3936656891495601, + "grad_norm": 1.0330790797924478, + "learning_rate": 7.599821480713571e-06, + "loss": 0.6286, + "step": 1678 + }, + { + "epoch": 0.393900293255132, + "grad_norm": 1.89908810726443, + "learning_rate": 7.596321898563167e-06, + "loss": 0.6415, + "step": 1679 + }, + { + "epoch": 0.39413489736070384, + "grad_norm": 4.35097376095719, + "learning_rate": 7.59282057409437e-06, + "loss": 0.5727, + "step": 1680 + }, + { + "epoch": 0.39436950146627564, + "grad_norm": 2.2722518194853736, + "learning_rate": 7.589317509656824e-06, + "loss": 0.6787, + "step": 1681 + }, + { + "epoch": 0.3946041055718475, + "grad_norm": 1.0161605344772082, + "learning_rate": 7.585812707601333e-06, + "loss": 0.6062, + "step": 1682 + }, + { + "epoch": 0.39483870967741935, + "grad_norm": 1.1379485114464876, + "learning_rate": 7.582306170279873e-06, + "loss": 0.6193, + "step": 1683 + }, + { + "epoch": 0.3950733137829912, + "grad_norm": 5.590600647341276, + "learning_rate": 7.57879790004558e-06, + "loss": 0.6291, + "step": 1684 + }, + { + "epoch": 0.39530791788856307, + "grad_norm": 1.2093191931980196, + "learning_rate": 7.575287899252759e-06, + "loss": 0.6698, + "step": 1685 + }, + { + "epoch": 0.3955425219941349, + "grad_norm": 0.7656633162018557, + "learning_rate": 7.571776170256869e-06, + "loss": 0.5728, + "step": 1686 + }, + { + "epoch": 0.3957771260997067, + "grad_norm": 1.3366721896457685, + "learning_rate": 7.568262715414534e-06, + "loss": 0.6556, + "step": 1687 + }, + { + "epoch": 0.3960117302052786, + "grad_norm": 1.8218187634970278, + "learning_rate": 7.564747537083533e-06, + "loss": 0.6241, + "step": 1688 + }, + { + "epoch": 0.39624633431085043, + "grad_norm": 1.2066453474454495, + "learning_rate": 7.561230637622805e-06, + "loss": 0.6306, + "step": 1689 + }, + { + "epoch": 0.3964809384164223, + "grad_norm": 1.7231063124864543, + "learning_rate": 7.5577120193924415e-06, + "loss": 0.6729, + "step": 1690 + }, + { + "epoch": 0.39671554252199415, + "grad_norm": 0.8481410402095703, + "learning_rate": 7.554191684753687e-06, + "loss": 0.7079, + "step": 1691 + }, + { + "epoch": 0.396950146627566, + "grad_norm": 1.1626186427349046, + "learning_rate": 7.550669636068939e-06, + "loss": 0.6614, + "step": 1692 + }, + { + "epoch": 0.3971847507331378, + "grad_norm": 1.0690244274077698, + "learning_rate": 7.547145875701744e-06, + "loss": 0.6198, + "step": 1693 + }, + { + "epoch": 0.39741935483870966, + "grad_norm": 1.0249960620447252, + "learning_rate": 7.543620406016801e-06, + "loss": 0.5707, + "step": 1694 + }, + { + "epoch": 0.3976539589442815, + "grad_norm": 19.10349514928793, + "learning_rate": 7.540093229379951e-06, + "loss": 0.656, + "step": 1695 + }, + { + "epoch": 0.39788856304985337, + "grad_norm": 2.6427431748274235, + "learning_rate": 7.536564348158184e-06, + "loss": 0.676, + "step": 1696 + }, + { + "epoch": 0.39812316715542523, + "grad_norm": 1.0658332220420659, + "learning_rate": 7.5330337647196305e-06, + "loss": 0.6751, + "step": 1697 + }, + { + "epoch": 0.3983577712609971, + "grad_norm": 1.2692894945661328, + "learning_rate": 7.52950148143357e-06, + "loss": 0.6647, + "step": 1698 + }, + { + "epoch": 0.39859237536656894, + "grad_norm": 1.66387099040758, + "learning_rate": 7.525967500670413e-06, + "loss": 0.6346, + "step": 1699 + }, + { + "epoch": 0.39882697947214074, + "grad_norm": 1.3709588863996383, + "learning_rate": 7.522431824801717e-06, + "loss": 0.6329, + "step": 1700 + }, + { + "epoch": 0.3990615835777126, + "grad_norm": 3.474680034045354, + "learning_rate": 7.5188944562001746e-06, + "loss": 0.5926, + "step": 1701 + }, + { + "epoch": 0.39929618768328445, + "grad_norm": 1.1517654120841698, + "learning_rate": 7.515355397239613e-06, + "loss": 0.5686, + "step": 1702 + }, + { + "epoch": 0.3995307917888563, + "grad_norm": 5.005503902366467, + "learning_rate": 7.511814650294994e-06, + "loss": 0.5656, + "step": 1703 + }, + { + "epoch": 0.39976539589442817, + "grad_norm": 2.7049150211166135, + "learning_rate": 7.508272217742416e-06, + "loss": 0.6121, + "step": 1704 + }, + { + "epoch": 0.4, + "grad_norm": 1.9908828597574337, + "learning_rate": 7.504728101959103e-06, + "loss": 0.6302, + "step": 1705 + }, + { + "epoch": 0.4002346041055718, + "grad_norm": 0.919251116851586, + "learning_rate": 7.5011823053234115e-06, + "loss": 0.6456, + "step": 1706 + }, + { + "epoch": 0.4004692082111437, + "grad_norm": 1.4389194955101958, + "learning_rate": 7.497634830214826e-06, + "loss": 0.5995, + "step": 1707 + }, + { + "epoch": 0.40070381231671554, + "grad_norm": 5.164560163062019, + "learning_rate": 7.494085679013959e-06, + "loss": 0.5942, + "step": 1708 + }, + { + "epoch": 0.4009384164222874, + "grad_norm": 1.1325687584621202, + "learning_rate": 7.490534854102543e-06, + "loss": 0.6018, + "step": 1709 + }, + { + "epoch": 0.40117302052785925, + "grad_norm": 0.7988283948081473, + "learning_rate": 7.486982357863435e-06, + "loss": 0.6365, + "step": 1710 + }, + { + "epoch": 0.4014076246334311, + "grad_norm": 1.75834369524383, + "learning_rate": 7.48342819268062e-06, + "loss": 0.6335, + "step": 1711 + }, + { + "epoch": 0.40164222873900296, + "grad_norm": 1.1967795275195026, + "learning_rate": 7.479872360939193e-06, + "loss": 0.5808, + "step": 1712 + }, + { + "epoch": 0.40187683284457476, + "grad_norm": 1.8206629863153243, + "learning_rate": 7.476314865025376e-06, + "loss": 0.6394, + "step": 1713 + }, + { + "epoch": 0.4021114369501466, + "grad_norm": 1.4687031946397808, + "learning_rate": 7.472755707326503e-06, + "loss": 0.5859, + "step": 1714 + }, + { + "epoch": 0.4023460410557185, + "grad_norm": 1.3555501632175697, + "learning_rate": 7.469194890231022e-06, + "loss": 0.6619, + "step": 1715 + }, + { + "epoch": 0.40258064516129033, + "grad_norm": 0.887119286137143, + "learning_rate": 7.465632416128501e-06, + "loss": 0.6456, + "step": 1716 + }, + { + "epoch": 0.4028152492668622, + "grad_norm": 1.309577536114152, + "learning_rate": 7.462068287409614e-06, + "loss": 0.6242, + "step": 1717 + }, + { + "epoch": 0.40304985337243404, + "grad_norm": 1.914591925810589, + "learning_rate": 7.458502506466146e-06, + "loss": 0.6502, + "step": 1718 + }, + { + "epoch": 0.40328445747800584, + "grad_norm": 1.277094433107499, + "learning_rate": 7.454935075690994e-06, + "loss": 0.6106, + "step": 1719 + }, + { + "epoch": 0.4035190615835777, + "grad_norm": 22.012098844809557, + "learning_rate": 7.451365997478159e-06, + "loss": 0.5783, + "step": 1720 + }, + { + "epoch": 0.40375366568914955, + "grad_norm": 1.7601628159246012, + "learning_rate": 7.44779527422275e-06, + "loss": 0.6083, + "step": 1721 + }, + { + "epoch": 0.4039882697947214, + "grad_norm": 8.431429472241842, + "learning_rate": 7.444222908320976e-06, + "loss": 0.6239, + "step": 1722 + }, + { + "epoch": 0.40422287390029327, + "grad_norm": 2.78628788759552, + "learning_rate": 7.440648902170153e-06, + "loss": 0.6533, + "step": 1723 + }, + { + "epoch": 0.4044574780058651, + "grad_norm": 1.2379131876459923, + "learning_rate": 7.4370732581686965e-06, + "loss": 0.6407, + "step": 1724 + }, + { + "epoch": 0.4046920821114369, + "grad_norm": 1.6940358989013258, + "learning_rate": 7.433495978716119e-06, + "loss": 0.625, + "step": 1725 + }, + { + "epoch": 0.4049266862170088, + "grad_norm": 78.39486316367483, + "learning_rate": 7.42991706621303e-06, + "loss": 0.6083, + "step": 1726 + }, + { + "epoch": 0.40516129032258064, + "grad_norm": 2.4661684430165214, + "learning_rate": 7.426336523061141e-06, + "loss": 0.7001, + "step": 1727 + }, + { + "epoch": 0.4053958944281525, + "grad_norm": 3.1100131190120863, + "learning_rate": 7.422754351663252e-06, + "loss": 0.6638, + "step": 1728 + }, + { + "epoch": 0.40563049853372435, + "grad_norm": 2.2924349765517467, + "learning_rate": 7.419170554423255e-06, + "loss": 0.6654, + "step": 1729 + }, + { + "epoch": 0.4058651026392962, + "grad_norm": 2.6430893925925685, + "learning_rate": 7.415585133746137e-06, + "loss": 0.5866, + "step": 1730 + }, + { + "epoch": 0.40609970674486806, + "grad_norm": 1.14584465992698, + "learning_rate": 7.411998092037973e-06, + "loss": 0.6111, + "step": 1731 + }, + { + "epoch": 0.40633431085043986, + "grad_norm": 1.631849211332541, + "learning_rate": 7.408409431705924e-06, + "loss": 0.5961, + "step": 1732 + }, + { + "epoch": 0.4065689149560117, + "grad_norm": 1.2145397669954725, + "learning_rate": 7.40481915515824e-06, + "loss": 0.6909, + "step": 1733 + }, + { + "epoch": 0.4068035190615836, + "grad_norm": 1.5507848021981117, + "learning_rate": 7.401227264804254e-06, + "loss": 0.6403, + "step": 1734 + }, + { + "epoch": 0.40703812316715543, + "grad_norm": 2.0558491775949235, + "learning_rate": 7.397633763054384e-06, + "loss": 0.6264, + "step": 1735 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 2.551712659314444, + "learning_rate": 7.3940386523201224e-06, + "loss": 0.6564, + "step": 1736 + }, + { + "epoch": 0.40750733137829914, + "grad_norm": 1.5990076488320568, + "learning_rate": 7.390441935014052e-06, + "loss": 0.6421, + "step": 1737 + }, + { + "epoch": 0.40774193548387094, + "grad_norm": 1.6883610257539423, + "learning_rate": 7.386843613549828e-06, + "loss": 0.599, + "step": 1738 + }, + { + "epoch": 0.4079765395894428, + "grad_norm": 3.512645700209129, + "learning_rate": 7.383243690342179e-06, + "loss": 0.6161, + "step": 1739 + }, + { + "epoch": 0.40821114369501466, + "grad_norm": 1.10750732274883, + "learning_rate": 7.379642167806913e-06, + "loss": 0.5812, + "step": 1740 + }, + { + "epoch": 0.4084457478005865, + "grad_norm": 3.480727119118568, + "learning_rate": 7.3760390483609135e-06, + "loss": 0.6216, + "step": 1741 + }, + { + "epoch": 0.40868035190615837, + "grad_norm": 1.276969992900176, + "learning_rate": 7.372434334422128e-06, + "loss": 0.6176, + "step": 1742 + }, + { + "epoch": 0.4089149560117302, + "grad_norm": 1.2086539896123187, + "learning_rate": 7.368828028409581e-06, + "loss": 0.5926, + "step": 1743 + }, + { + "epoch": 0.4091495601173021, + "grad_norm": 1.4121012202709036, + "learning_rate": 7.365220132743363e-06, + "loss": 0.6611, + "step": 1744 + }, + { + "epoch": 0.4093841642228739, + "grad_norm": 1.195850026147483, + "learning_rate": 7.361610649844628e-06, + "loss": 0.6325, + "step": 1745 + }, + { + "epoch": 0.40961876832844574, + "grad_norm": 1.502712220490566, + "learning_rate": 7.357999582135601e-06, + "loss": 0.5779, + "step": 1746 + }, + { + "epoch": 0.4098533724340176, + "grad_norm": 1.6392808213341148, + "learning_rate": 7.354386932039566e-06, + "loss": 0.614, + "step": 1747 + }, + { + "epoch": 0.41008797653958945, + "grad_norm": 1.0440133915247312, + "learning_rate": 7.350772701980873e-06, + "loss": 0.6974, + "step": 1748 + }, + { + "epoch": 0.4103225806451613, + "grad_norm": 0.86568552994266, + "learning_rate": 7.347156894384926e-06, + "loss": 0.6348, + "step": 1749 + }, + { + "epoch": 0.41055718475073316, + "grad_norm": 4.2340316305038, + "learning_rate": 7.343539511678195e-06, + "loss": 0.6565, + "step": 1750 + }, + { + "epoch": 0.41079178885630496, + "grad_norm": 5.570019801840745, + "learning_rate": 7.339920556288202e-06, + "loss": 0.6476, + "step": 1751 + }, + { + "epoch": 0.4110263929618768, + "grad_norm": 0.9975936348153364, + "learning_rate": 7.336300030643526e-06, + "loss": 0.6302, + "step": 1752 + }, + { + "epoch": 0.4112609970674487, + "grad_norm": 1.1040732239633317, + "learning_rate": 7.3326779371738e-06, + "loss": 0.6035, + "step": 1753 + }, + { + "epoch": 0.41149560117302053, + "grad_norm": 2.047008469580855, + "learning_rate": 7.3290542783097085e-06, + "loss": 0.5845, + "step": 1754 + }, + { + "epoch": 0.4117302052785924, + "grad_norm": 1.161806113312015, + "learning_rate": 7.325429056482986e-06, + "loss": 0.7006, + "step": 1755 + }, + { + "epoch": 0.41196480938416424, + "grad_norm": 1.353184403882004, + "learning_rate": 7.321802274126418e-06, + "loss": 0.6609, + "step": 1756 + }, + { + "epoch": 0.41219941348973604, + "grad_norm": 9.1821190062482, + "learning_rate": 7.318173933673836e-06, + "loss": 0.6799, + "step": 1757 + }, + { + "epoch": 0.4124340175953079, + "grad_norm": 1.057692012937856, + "learning_rate": 7.314544037560115e-06, + "loss": 0.6297, + "step": 1758 + }, + { + "epoch": 0.41266862170087976, + "grad_norm": 0.9493457381189597, + "learning_rate": 7.3109125882211765e-06, + "loss": 0.5545, + "step": 1759 + }, + { + "epoch": 0.4129032258064516, + "grad_norm": 0.8696548362478198, + "learning_rate": 7.3072795880939855e-06, + "loss": 0.6757, + "step": 1760 + }, + { + "epoch": 0.41313782991202347, + "grad_norm": 1.800753236041734, + "learning_rate": 7.303645039616544e-06, + "loss": 0.6521, + "step": 1761 + }, + { + "epoch": 0.4133724340175953, + "grad_norm": 1.0588304341242767, + "learning_rate": 7.300008945227895e-06, + "loss": 0.5957, + "step": 1762 + }, + { + "epoch": 0.4136070381231672, + "grad_norm": 2.683854835153098, + "learning_rate": 7.29637130736812e-06, + "loss": 0.6379, + "step": 1763 + }, + { + "epoch": 0.413841642228739, + "grad_norm": 1.5219693335922593, + "learning_rate": 7.292732128478336e-06, + "loss": 0.6232, + "step": 1764 + }, + { + "epoch": 0.41407624633431084, + "grad_norm": 1.1486892255099441, + "learning_rate": 7.289091411000691e-06, + "loss": 0.6536, + "step": 1765 + }, + { + "epoch": 0.4143108504398827, + "grad_norm": 1.0375978962914074, + "learning_rate": 7.285449157378367e-06, + "loss": 0.6259, + "step": 1766 + }, + { + "epoch": 0.41454545454545455, + "grad_norm": 0.9513297266866592, + "learning_rate": 7.2818053700555816e-06, + "loss": 0.6523, + "step": 1767 + }, + { + "epoch": 0.4147800586510264, + "grad_norm": 1.036511892569362, + "learning_rate": 7.278160051477574e-06, + "loss": 0.6158, + "step": 1768 + }, + { + "epoch": 0.41501466275659826, + "grad_norm": 1.6331785566147172, + "learning_rate": 7.274513204090618e-06, + "loss": 0.6815, + "step": 1769 + }, + { + "epoch": 0.41524926686217006, + "grad_norm": 1.424146712111036, + "learning_rate": 7.270864830342006e-06, + "loss": 0.6556, + "step": 1770 + }, + { + "epoch": 0.4154838709677419, + "grad_norm": 2.2628392752024844, + "learning_rate": 7.267214932680062e-06, + "loss": 0.5884, + "step": 1771 + }, + { + "epoch": 0.4157184750733138, + "grad_norm": 2.5412924077674792, + "learning_rate": 7.263563513554127e-06, + "loss": 0.613, + "step": 1772 + }, + { + "epoch": 0.41595307917888563, + "grad_norm": 7.339083716560217, + "learning_rate": 7.259910575414569e-06, + "loss": 0.6219, + "step": 1773 + }, + { + "epoch": 0.4161876832844575, + "grad_norm": 0.8028217769264021, + "learning_rate": 7.256256120712768e-06, + "loss": 0.6696, + "step": 1774 + }, + { + "epoch": 0.41642228739002934, + "grad_norm": 1.1624684075460954, + "learning_rate": 7.252600151901127e-06, + "loss": 0.6268, + "step": 1775 + }, + { + "epoch": 0.4166568914956012, + "grad_norm": 1.7903872594102144, + "learning_rate": 7.248942671433064e-06, + "loss": 0.6555, + "step": 1776 + }, + { + "epoch": 0.416891495601173, + "grad_norm": 1.1582237563805369, + "learning_rate": 7.24528368176301e-06, + "loss": 0.6252, + "step": 1777 + }, + { + "epoch": 0.41712609970674486, + "grad_norm": 1.0337615573317804, + "learning_rate": 7.241623185346409e-06, + "loss": 0.5977, + "step": 1778 + }, + { + "epoch": 0.4173607038123167, + "grad_norm": 1.7127157102460435, + "learning_rate": 7.237961184639719e-06, + "loss": 0.6615, + "step": 1779 + }, + { + "epoch": 0.41759530791788857, + "grad_norm": 3.469324251654291, + "learning_rate": 7.234297682100404e-06, + "loss": 0.631, + "step": 1780 + }, + { + "epoch": 0.4178299120234604, + "grad_norm": 0.9311411157448523, + "learning_rate": 7.23063268018694e-06, + "loss": 0.6595, + "step": 1781 + }, + { + "epoch": 0.4180645161290323, + "grad_norm": 1.0342309935883347, + "learning_rate": 7.2269661813588e-06, + "loss": 0.615, + "step": 1782 + }, + { + "epoch": 0.4182991202346041, + "grad_norm": 3.127875807541969, + "learning_rate": 7.223298188076475e-06, + "loss": 0.6298, + "step": 1783 + }, + { + "epoch": 0.41853372434017594, + "grad_norm": 1.2357145994413388, + "learning_rate": 7.219628702801448e-06, + "loss": 0.6833, + "step": 1784 + }, + { + "epoch": 0.4187683284457478, + "grad_norm": 4.844067072728044, + "learning_rate": 7.215957727996208e-06, + "loss": 0.5995, + "step": 1785 + }, + { + "epoch": 0.41900293255131965, + "grad_norm": 3.37805068344963, + "learning_rate": 7.212285266124241e-06, + "loss": 0.5597, + "step": 1786 + }, + { + "epoch": 0.4192375366568915, + "grad_norm": 1.9953706043378174, + "learning_rate": 7.208611319650036e-06, + "loss": 0.6702, + "step": 1787 + }, + { + "epoch": 0.41947214076246336, + "grad_norm": 1.108643586188879, + "learning_rate": 7.204935891039071e-06, + "loss": 0.6511, + "step": 1788 + }, + { + "epoch": 0.41970674486803516, + "grad_norm": 1.3078284617761626, + "learning_rate": 7.201258982757827e-06, + "loss": 0.6233, + "step": 1789 + }, + { + "epoch": 0.419941348973607, + "grad_norm": 1.2135587701586805, + "learning_rate": 7.197580597273769e-06, + "loss": 0.6245, + "step": 1790 + }, + { + "epoch": 0.4201759530791789, + "grad_norm": 1.6408345206379327, + "learning_rate": 7.1939007370553615e-06, + "loss": 0.6422, + "step": 1791 + }, + { + "epoch": 0.42041055718475073, + "grad_norm": 2.0834935401659784, + "learning_rate": 7.1902194045720495e-06, + "loss": 0.6616, + "step": 1792 + }, + { + "epoch": 0.4206451612903226, + "grad_norm": 2.0433575326148974, + "learning_rate": 7.186536602294278e-06, + "loss": 0.6567, + "step": 1793 + }, + { + "epoch": 0.42087976539589445, + "grad_norm": 5.513189436713361, + "learning_rate": 7.182852332693467e-06, + "loss": 0.6242, + "step": 1794 + }, + { + "epoch": 0.4211143695014663, + "grad_norm": 1.061612820536488, + "learning_rate": 7.179166598242028e-06, + "loss": 0.6371, + "step": 1795 + }, + { + "epoch": 0.4213489736070381, + "grad_norm": 0.832673904971619, + "learning_rate": 7.175479401413353e-06, + "loss": 0.6085, + "step": 1796 + }, + { + "epoch": 0.42158357771260996, + "grad_norm": 4.1329770556502154, + "learning_rate": 7.171790744681816e-06, + "loss": 0.679, + "step": 1797 + }, + { + "epoch": 0.4218181818181818, + "grad_norm": 1.5563224417692414, + "learning_rate": 7.168100630522769e-06, + "loss": 0.6288, + "step": 1798 + }, + { + "epoch": 0.42205278592375367, + "grad_norm": 1.5160883197367536, + "learning_rate": 7.164409061412545e-06, + "loss": 0.6729, + "step": 1799 + }, + { + "epoch": 0.4222873900293255, + "grad_norm": 0.9237689283235428, + "learning_rate": 7.1607160398284534e-06, + "loss": 0.6053, + "step": 1800 + }, + { + "epoch": 0.4225219941348974, + "grad_norm": 1.5504678306160298, + "learning_rate": 7.1570215682487724e-06, + "loss": 0.6143, + "step": 1801 + }, + { + "epoch": 0.4227565982404692, + "grad_norm": 1.9120918187104634, + "learning_rate": 7.153325649152762e-06, + "loss": 0.5863, + "step": 1802 + }, + { + "epoch": 0.42299120234604104, + "grad_norm": 1.062769322777713, + "learning_rate": 7.149628285020647e-06, + "loss": 0.6312, + "step": 1803 + }, + { + "epoch": 0.4232258064516129, + "grad_norm": 1.5545749260716493, + "learning_rate": 7.145929478333626e-06, + "loss": 0.662, + "step": 1804 + }, + { + "epoch": 0.42346041055718475, + "grad_norm": 1.3440830970119046, + "learning_rate": 7.14222923157386e-06, + "loss": 0.6323, + "step": 1805 + }, + { + "epoch": 0.4236950146627566, + "grad_norm": 1.167036885068974, + "learning_rate": 7.138527547224485e-06, + "loss": 0.6267, + "step": 1806 + }, + { + "epoch": 0.42392961876832846, + "grad_norm": 1.0240846072595606, + "learning_rate": 7.134824427769594e-06, + "loss": 0.6067, + "step": 1807 + }, + { + "epoch": 0.4241642228739003, + "grad_norm": 9.220119696794047, + "learning_rate": 7.131119875694246e-06, + "loss": 0.7045, + "step": 1808 + }, + { + "epoch": 0.4243988269794721, + "grad_norm": 3.2151891902847325, + "learning_rate": 7.127413893484463e-06, + "loss": 0.6602, + "step": 1809 + }, + { + "epoch": 0.424633431085044, + "grad_norm": 4.2408333696511376, + "learning_rate": 7.123706483627225e-06, + "loss": 0.6672, + "step": 1810 + }, + { + "epoch": 0.42486803519061583, + "grad_norm": 1.9624585179056808, + "learning_rate": 7.119997648610468e-06, + "loss": 0.5959, + "step": 1811 + }, + { + "epoch": 0.4251026392961877, + "grad_norm": 4.19021221348873, + "learning_rate": 7.116287390923089e-06, + "loss": 0.6691, + "step": 1812 + }, + { + "epoch": 0.42533724340175955, + "grad_norm": 0.8446859960795003, + "learning_rate": 7.112575713054937e-06, + "loss": 0.5998, + "step": 1813 + }, + { + "epoch": 0.4255718475073314, + "grad_norm": 1.9035935400381916, + "learning_rate": 7.108862617496812e-06, + "loss": 0.6158, + "step": 1814 + }, + { + "epoch": 0.4258064516129032, + "grad_norm": 0.8992856659183355, + "learning_rate": 7.105148106740469e-06, + "loss": 0.6244, + "step": 1815 + }, + { + "epoch": 0.42604105571847506, + "grad_norm": 1.2487981931558807, + "learning_rate": 7.1014321832786135e-06, + "loss": 0.635, + "step": 1816 + }, + { + "epoch": 0.4262756598240469, + "grad_norm": 1.7941136467653054, + "learning_rate": 7.097714849604894e-06, + "loss": 0.6618, + "step": 1817 + }, + { + "epoch": 0.42651026392961877, + "grad_norm": 2.338348486538766, + "learning_rate": 7.09399610821391e-06, + "loss": 0.6712, + "step": 1818 + }, + { + "epoch": 0.4267448680351906, + "grad_norm": 1.1272183144060923, + "learning_rate": 7.090275961601203e-06, + "loss": 0.5519, + "step": 1819 + }, + { + "epoch": 0.4269794721407625, + "grad_norm": 1.2745355777155003, + "learning_rate": 7.086554412263259e-06, + "loss": 0.6893, + "step": 1820 + }, + { + "epoch": 0.4272140762463343, + "grad_norm": 3.4926617128552557, + "learning_rate": 7.082831462697505e-06, + "loss": 0.5682, + "step": 1821 + }, + { + "epoch": 0.42744868035190614, + "grad_norm": 4.027086586041726, + "learning_rate": 7.079107115402306e-06, + "loss": 0.6657, + "step": 1822 + }, + { + "epoch": 0.427683284457478, + "grad_norm": 1.3598308965958918, + "learning_rate": 7.075381372876969e-06, + "loss": 0.6063, + "step": 1823 + }, + { + "epoch": 0.42791788856304985, + "grad_norm": 1.8096930022472564, + "learning_rate": 7.07165423762173e-06, + "loss": 0.6326, + "step": 1824 + }, + { + "epoch": 0.4281524926686217, + "grad_norm": 1.7350170547741561, + "learning_rate": 7.0679257121377706e-06, + "loss": 0.6305, + "step": 1825 + }, + { + "epoch": 0.42838709677419357, + "grad_norm": 1.6282545357478986, + "learning_rate": 7.0641957989271956e-06, + "loss": 0.577, + "step": 1826 + }, + { + "epoch": 0.4286217008797654, + "grad_norm": 2.3377677378479005, + "learning_rate": 7.060464500493044e-06, + "loss": 0.6412, + "step": 1827 + }, + { + "epoch": 0.4288563049853372, + "grad_norm": 1.8744231423038658, + "learning_rate": 7.056731819339287e-06, + "loss": 0.6191, + "step": 1828 + }, + { + "epoch": 0.4290909090909091, + "grad_norm": 1.7219629262070664, + "learning_rate": 7.0529977579708195e-06, + "loss": 0.6175, + "step": 1829 + }, + { + "epoch": 0.42932551319648093, + "grad_norm": 0.8982095927587648, + "learning_rate": 7.049262318893466e-06, + "loss": 0.6053, + "step": 1830 + }, + { + "epoch": 0.4295601173020528, + "grad_norm": 6.44110777874587, + "learning_rate": 7.0455255046139734e-06, + "loss": 0.5827, + "step": 1831 + }, + { + "epoch": 0.42979472140762465, + "grad_norm": 1.5661883166939536, + "learning_rate": 7.041787317640014e-06, + "loss": 0.6156, + "step": 1832 + }, + { + "epoch": 0.4300293255131965, + "grad_norm": 1.2968226800114315, + "learning_rate": 7.0380477604801786e-06, + "loss": 0.592, + "step": 1833 + }, + { + "epoch": 0.4302639296187683, + "grad_norm": 1.1718379659905511, + "learning_rate": 7.034306835643978e-06, + "loss": 0.6717, + "step": 1834 + }, + { + "epoch": 0.43049853372434016, + "grad_norm": 15.81144946578598, + "learning_rate": 7.030564545641842e-06, + "loss": 0.6009, + "step": 1835 + }, + { + "epoch": 0.430733137829912, + "grad_norm": 1.4409313698600532, + "learning_rate": 7.0268208929851164e-06, + "loss": 0.5781, + "step": 1836 + }, + { + "epoch": 0.4309677419354839, + "grad_norm": 9.964109243112334, + "learning_rate": 7.023075880186061e-06, + "loss": 0.6207, + "step": 1837 + }, + { + "epoch": 0.43120234604105573, + "grad_norm": 1.27302283818698, + "learning_rate": 7.019329509757845e-06, + "loss": 0.6312, + "step": 1838 + }, + { + "epoch": 0.4314369501466276, + "grad_norm": 1.7943943144512091, + "learning_rate": 7.015581784214557e-06, + "loss": 0.5983, + "step": 1839 + }, + { + "epoch": 0.43167155425219944, + "grad_norm": 1.008068563518117, + "learning_rate": 7.0118327060711865e-06, + "loss": 0.5751, + "step": 1840 + }, + { + "epoch": 0.43190615835777124, + "grad_norm": 1.0117839893130047, + "learning_rate": 7.0080822778436345e-06, + "loss": 0.6601, + "step": 1841 + }, + { + "epoch": 0.4321407624633431, + "grad_norm": 0.8513872014823226, + "learning_rate": 7.004330502048708e-06, + "loss": 0.6341, + "step": 1842 + }, + { + "epoch": 0.43237536656891495, + "grad_norm": 1.1295110913790627, + "learning_rate": 7.000577381204119e-06, + "loss": 0.6495, + "step": 1843 + }, + { + "epoch": 0.4326099706744868, + "grad_norm": 2.159755829167205, + "learning_rate": 6.9968229178284775e-06, + "loss": 0.641, + "step": 1844 + }, + { + "epoch": 0.43284457478005867, + "grad_norm": 1.2883920798235482, + "learning_rate": 6.993067114441303e-06, + "loss": 0.6853, + "step": 1845 + }, + { + "epoch": 0.4330791788856305, + "grad_norm": 1.592180880850343, + "learning_rate": 6.989309973563006e-06, + "loss": 0.585, + "step": 1846 + }, + { + "epoch": 0.4333137829912023, + "grad_norm": 1.761137530752373, + "learning_rate": 6.985551497714898e-06, + "loss": 0.6437, + "step": 1847 + }, + { + "epoch": 0.4335483870967742, + "grad_norm": 1.542809103262987, + "learning_rate": 6.981791689419187e-06, + "loss": 0.5708, + "step": 1848 + }, + { + "epoch": 0.43378299120234604, + "grad_norm": 1.336389712715548, + "learning_rate": 6.978030551198974e-06, + "loss": 0.5904, + "step": 1849 + }, + { + "epoch": 0.4340175953079179, + "grad_norm": 0.9220463053323381, + "learning_rate": 6.974268085578252e-06, + "loss": 0.5911, + "step": 1850 + }, + { + "epoch": 0.43425219941348975, + "grad_norm": 0.9193301306151851, + "learning_rate": 6.970504295081907e-06, + "loss": 0.6083, + "step": 1851 + }, + { + "epoch": 0.4344868035190616, + "grad_norm": 1.7350690801122728, + "learning_rate": 6.966739182235711e-06, + "loss": 0.6572, + "step": 1852 + }, + { + "epoch": 0.4347214076246334, + "grad_norm": 0.8732511062010674, + "learning_rate": 6.9629727495663265e-06, + "loss": 0.6557, + "step": 1853 + }, + { + "epoch": 0.43495601173020526, + "grad_norm": 1.5657251389423612, + "learning_rate": 6.959204999601299e-06, + "loss": 0.6372, + "step": 1854 + }, + { + "epoch": 0.4351906158357771, + "grad_norm": 1.4136590535535574, + "learning_rate": 6.955435934869059e-06, + "loss": 0.6115, + "step": 1855 + }, + { + "epoch": 0.435425219941349, + "grad_norm": 7.720537086803013, + "learning_rate": 6.951665557898923e-06, + "loss": 0.6, + "step": 1856 + }, + { + "epoch": 0.43565982404692083, + "grad_norm": 1.275708259139756, + "learning_rate": 6.9478938712210785e-06, + "loss": 0.6539, + "step": 1857 + }, + { + "epoch": 0.4358944281524927, + "grad_norm": 0.9695794688862226, + "learning_rate": 6.944120877366605e-06, + "loss": 0.6149, + "step": 1858 + }, + { + "epoch": 0.43612903225806454, + "grad_norm": 1.4441436339665144, + "learning_rate": 6.940346578867446e-06, + "loss": 0.684, + "step": 1859 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 1.5328480561291888, + "learning_rate": 6.936570978256431e-06, + "loss": 0.6793, + "step": 1860 + }, + { + "epoch": 0.4365982404692082, + "grad_norm": 1.0910453658059502, + "learning_rate": 6.932794078067255e-06, + "loss": 0.6445, + "step": 1861 + }, + { + "epoch": 0.43683284457478005, + "grad_norm": 1.4701931397340777, + "learning_rate": 6.929015880834495e-06, + "loss": 0.6575, + "step": 1862 + }, + { + "epoch": 0.4370674486803519, + "grad_norm": 2.09698745681287, + "learning_rate": 6.925236389093588e-06, + "loss": 0.6485, + "step": 1863 + }, + { + "epoch": 0.43730205278592377, + "grad_norm": 1.400083097458855, + "learning_rate": 6.921455605380846e-06, + "loss": 0.6558, + "step": 1864 + }, + { + "epoch": 0.4375366568914956, + "grad_norm": 1.6664198387372906, + "learning_rate": 6.917673532233446e-06, + "loss": 0.5903, + "step": 1865 + }, + { + "epoch": 0.4377712609970674, + "grad_norm": 1.4324783819949398, + "learning_rate": 6.91389017218943e-06, + "loss": 0.6542, + "step": 1866 + }, + { + "epoch": 0.4380058651026393, + "grad_norm": 1.1126814108154655, + "learning_rate": 6.910105527787704e-06, + "loss": 0.6752, + "step": 1867 + }, + { + "epoch": 0.43824046920821114, + "grad_norm": 1.4189156561800234, + "learning_rate": 6.906319601568039e-06, + "loss": 0.5561, + "step": 1868 + }, + { + "epoch": 0.438475073313783, + "grad_norm": 1.2961684619205245, + "learning_rate": 6.90253239607106e-06, + "loss": 0.6477, + "step": 1869 + }, + { + "epoch": 0.43870967741935485, + "grad_norm": 1.8044032786362236, + "learning_rate": 6.8987439138382535e-06, + "loss": 0.6138, + "step": 1870 + }, + { + "epoch": 0.4389442815249267, + "grad_norm": 1.2435404874519989, + "learning_rate": 6.894954157411966e-06, + "loss": 0.5673, + "step": 1871 + }, + { + "epoch": 0.43917888563049856, + "grad_norm": 3.916524373943254, + "learning_rate": 6.891163129335397e-06, + "loss": 0.6542, + "step": 1872 + }, + { + "epoch": 0.43941348973607036, + "grad_norm": 1.5737588998343983, + "learning_rate": 6.8873708321525925e-06, + "loss": 0.574, + "step": 1873 + }, + { + "epoch": 0.4396480938416422, + "grad_norm": 1.6666671112739668, + "learning_rate": 6.883577268408461e-06, + "loss": 0.6151, + "step": 1874 + }, + { + "epoch": 0.4398826979472141, + "grad_norm": 1.1243560789780351, + "learning_rate": 6.879782440648755e-06, + "loss": 0.6281, + "step": 1875 + }, + { + "epoch": 0.44011730205278593, + "grad_norm": 0.8403049964959327, + "learning_rate": 6.875986351420077e-06, + "loss": 0.6382, + "step": 1876 + }, + { + "epoch": 0.4403519061583578, + "grad_norm": 4.859535510978899, + "learning_rate": 6.872189003269873e-06, + "loss": 0.6218, + "step": 1877 + }, + { + "epoch": 0.44058651026392964, + "grad_norm": 1.719034341935976, + "learning_rate": 6.86839039874644e-06, + "loss": 0.5686, + "step": 1878 + }, + { + "epoch": 0.44082111436950144, + "grad_norm": 2.946347376780906, + "learning_rate": 6.86459054039891e-06, + "loss": 0.62, + "step": 1879 + }, + { + "epoch": 0.4410557184750733, + "grad_norm": 1.4181908330571877, + "learning_rate": 6.860789430777265e-06, + "loss": 0.626, + "step": 1880 + }, + { + "epoch": 0.44129032258064516, + "grad_norm": 1.2326386214731289, + "learning_rate": 6.856987072432319e-06, + "loss": 0.624, + "step": 1881 + }, + { + "epoch": 0.441524926686217, + "grad_norm": 5.601743472600724, + "learning_rate": 6.853183467915731e-06, + "loss": 0.6212, + "step": 1882 + }, + { + "epoch": 0.44175953079178887, + "grad_norm": 1.0190872995517646, + "learning_rate": 6.849378619779989e-06, + "loss": 0.6313, + "step": 1883 + }, + { + "epoch": 0.4419941348973607, + "grad_norm": 1.2085972895816905, + "learning_rate": 6.845572530578421e-06, + "loss": 0.651, + "step": 1884 + }, + { + "epoch": 0.4422287390029325, + "grad_norm": 2.0497945874809655, + "learning_rate": 6.8417652028651875e-06, + "loss": 0.5852, + "step": 1885 + }, + { + "epoch": 0.4424633431085044, + "grad_norm": 4.2798005446657275, + "learning_rate": 6.837956639195275e-06, + "loss": 0.6515, + "step": 1886 + }, + { + "epoch": 0.44269794721407624, + "grad_norm": 1.4509746558933565, + "learning_rate": 6.834146842124507e-06, + "loss": 0.5608, + "step": 1887 + }, + { + "epoch": 0.4429325513196481, + "grad_norm": 1.1763189917933676, + "learning_rate": 6.830335814209527e-06, + "loss": 0.6106, + "step": 1888 + }, + { + "epoch": 0.44316715542521995, + "grad_norm": 1.4659870368579202, + "learning_rate": 6.826523558007814e-06, + "loss": 0.6492, + "step": 1889 + }, + { + "epoch": 0.4434017595307918, + "grad_norm": 2.2671677086421096, + "learning_rate": 6.822710076077659e-06, + "loss": 0.6662, + "step": 1890 + }, + { + "epoch": 0.44363636363636366, + "grad_norm": 1.7039536255816103, + "learning_rate": 6.818895370978188e-06, + "loss": 0.6008, + "step": 1891 + }, + { + "epoch": 0.44387096774193546, + "grad_norm": 1.6415167246381555, + "learning_rate": 6.815079445269338e-06, + "loss": 0.6233, + "step": 1892 + }, + { + "epoch": 0.4441055718475073, + "grad_norm": 1.338188601406221, + "learning_rate": 6.8112623015118695e-06, + "loss": 0.6031, + "step": 1893 + }, + { + "epoch": 0.4443401759530792, + "grad_norm": 2.6741621302652048, + "learning_rate": 6.807443942267362e-06, + "loss": 0.6411, + "step": 1894 + }, + { + "epoch": 0.44457478005865103, + "grad_norm": 1.5318056407523253, + "learning_rate": 6.803624370098209e-06, + "loss": 0.6743, + "step": 1895 + }, + { + "epoch": 0.4448093841642229, + "grad_norm": 1.1799940735591201, + "learning_rate": 6.799803587567615e-06, + "loss": 0.6484, + "step": 1896 + }, + { + "epoch": 0.44504398826979474, + "grad_norm": 1.571435534048894, + "learning_rate": 6.795981597239599e-06, + "loss": 0.6537, + "step": 1897 + }, + { + "epoch": 0.44527859237536654, + "grad_norm": 1.1736703928865846, + "learning_rate": 6.7921584016789945e-06, + "loss": 0.6344, + "step": 1898 + }, + { + "epoch": 0.4455131964809384, + "grad_norm": 1.0637970914899377, + "learning_rate": 6.788334003451438e-06, + "loss": 0.6682, + "step": 1899 + }, + { + "epoch": 0.44574780058651026, + "grad_norm": 1.653129961619469, + "learning_rate": 6.784508405123375e-06, + "loss": 0.5866, + "step": 1900 + }, + { + "epoch": 0.4459824046920821, + "grad_norm": 5.0057267635873455, + "learning_rate": 6.780681609262057e-06, + "loss": 0.6171, + "step": 1901 + }, + { + "epoch": 0.44621700879765397, + "grad_norm": 2.2243428911712737, + "learning_rate": 6.77685361843554e-06, + "loss": 0.6155, + "step": 1902 + }, + { + "epoch": 0.4464516129032258, + "grad_norm": 1.963880510484772, + "learning_rate": 6.773024435212678e-06, + "loss": 0.5808, + "step": 1903 + }, + { + "epoch": 0.4466862170087977, + "grad_norm": 1.693426729557211, + "learning_rate": 6.769194062163131e-06, + "loss": 0.6428, + "step": 1904 + }, + { + "epoch": 0.4469208211143695, + "grad_norm": 1.5342204357267053, + "learning_rate": 6.765362501857353e-06, + "loss": 0.6314, + "step": 1905 + }, + { + "epoch": 0.44715542521994134, + "grad_norm": 1.2381517597715204, + "learning_rate": 6.761529756866594e-06, + "loss": 0.6471, + "step": 1906 + }, + { + "epoch": 0.4473900293255132, + "grad_norm": 1.1549886641994709, + "learning_rate": 6.757695829762904e-06, + "loss": 0.6586, + "step": 1907 + }, + { + "epoch": 0.44762463343108505, + "grad_norm": 1.2646696572098253, + "learning_rate": 6.753860723119122e-06, + "loss": 0.647, + "step": 1908 + }, + { + "epoch": 0.4478592375366569, + "grad_norm": 0.8025274153279193, + "learning_rate": 6.75002443950888e-06, + "loss": 0.5708, + "step": 1909 + }, + { + "epoch": 0.44809384164222876, + "grad_norm": 1.4513313803057928, + "learning_rate": 6.746186981506599e-06, + "loss": 0.6032, + "step": 1910 + }, + { + "epoch": 0.44832844574780056, + "grad_norm": 2.9255562450471393, + "learning_rate": 6.742348351687491e-06, + "loss": 0.6325, + "step": 1911 + }, + { + "epoch": 0.4485630498533724, + "grad_norm": 1.3300259933782252, + "learning_rate": 6.73850855262755e-06, + "loss": 0.6635, + "step": 1912 + }, + { + "epoch": 0.4487976539589443, + "grad_norm": 1.3287364286680272, + "learning_rate": 6.734667586903557e-06, + "loss": 0.5662, + "step": 1913 + }, + { + "epoch": 0.44903225806451613, + "grad_norm": 0.996842436056555, + "learning_rate": 6.730825457093077e-06, + "loss": 0.571, + "step": 1914 + }, + { + "epoch": 0.449266862170088, + "grad_norm": 1.4331781076351444, + "learning_rate": 6.726982165774455e-06, + "loss": 0.6802, + "step": 1915 + }, + { + "epoch": 0.44950146627565984, + "grad_norm": 1.5706689804414935, + "learning_rate": 6.7231377155268144e-06, + "loss": 0.6271, + "step": 1916 + }, + { + "epoch": 0.44973607038123165, + "grad_norm": 2.0970147647815707, + "learning_rate": 6.7192921089300565e-06, + "loss": 0.6063, + "step": 1917 + }, + { + "epoch": 0.4499706744868035, + "grad_norm": 1.3010993871155538, + "learning_rate": 6.715445348564863e-06, + "loss": 0.6537, + "step": 1918 + }, + { + "epoch": 0.45020527859237536, + "grad_norm": 1.324428395427468, + "learning_rate": 6.711597437012683e-06, + "loss": 0.6545, + "step": 1919 + }, + { + "epoch": 0.4504398826979472, + "grad_norm": 1.570637482520361, + "learning_rate": 6.707748376855741e-06, + "loss": 0.642, + "step": 1920 + }, + { + "epoch": 0.45067448680351907, + "grad_norm": 1.3174788360455836, + "learning_rate": 6.703898170677034e-06, + "loss": 0.6057, + "step": 1921 + }, + { + "epoch": 0.4509090909090909, + "grad_norm": 1.8110082415582873, + "learning_rate": 6.700046821060328e-06, + "loss": 0.6131, + "step": 1922 + }, + { + "epoch": 0.4511436950146628, + "grad_norm": 1.4819775481519541, + "learning_rate": 6.6961943305901515e-06, + "loss": 0.5847, + "step": 1923 + }, + { + "epoch": 0.4513782991202346, + "grad_norm": 1.4897821818625845, + "learning_rate": 6.692340701851807e-06, + "loss": 0.613, + "step": 1924 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 1.5774010639815104, + "learning_rate": 6.688485937431353e-06, + "loss": 0.5808, + "step": 1925 + }, + { + "epoch": 0.4518475073313783, + "grad_norm": 2.1623584390962485, + "learning_rate": 6.684630039915612e-06, + "loss": 0.5834, + "step": 1926 + }, + { + "epoch": 0.45208211143695015, + "grad_norm": 2.460836104660879, + "learning_rate": 6.6807730118921706e-06, + "loss": 0.6964, + "step": 1927 + }, + { + "epoch": 0.452316715542522, + "grad_norm": 1.3067138443317345, + "learning_rate": 6.676914855949372e-06, + "loss": 0.6404, + "step": 1928 + }, + { + "epoch": 0.45255131964809386, + "grad_norm": 2.6420519994884537, + "learning_rate": 6.673055574676314e-06, + "loss": 0.6682, + "step": 1929 + }, + { + "epoch": 0.45278592375366566, + "grad_norm": 2.717267506649316, + "learning_rate": 6.669195170662851e-06, + "loss": 0.6462, + "step": 1930 + }, + { + "epoch": 0.4530205278592375, + "grad_norm": 1.1087273171939926, + "learning_rate": 6.6653336464995926e-06, + "loss": 0.6379, + "step": 1931 + }, + { + "epoch": 0.4532551319648094, + "grad_norm": 4.247345100435981, + "learning_rate": 6.661471004777899e-06, + "loss": 0.6277, + "step": 1932 + }, + { + "epoch": 0.45348973607038123, + "grad_norm": 2.1968507584563413, + "learning_rate": 6.657607248089879e-06, + "loss": 0.6248, + "step": 1933 + }, + { + "epoch": 0.4537243401759531, + "grad_norm": 2.9111655451005336, + "learning_rate": 6.653742379028389e-06, + "loss": 0.6352, + "step": 1934 + }, + { + "epoch": 0.45395894428152495, + "grad_norm": 1.7338832754085414, + "learning_rate": 6.64987640018704e-06, + "loss": 0.6329, + "step": 1935 + }, + { + "epoch": 0.4541935483870968, + "grad_norm": 0.9310750333439333, + "learning_rate": 6.646009314160173e-06, + "loss": 0.6333, + "step": 1936 + }, + { + "epoch": 0.4544281524926686, + "grad_norm": 3.667141975361051, + "learning_rate": 6.642141123542887e-06, + "loss": 0.6196, + "step": 1937 + }, + { + "epoch": 0.45466275659824046, + "grad_norm": 1.641892619971768, + "learning_rate": 6.638271830931011e-06, + "loss": 0.6684, + "step": 1938 + }, + { + "epoch": 0.4548973607038123, + "grad_norm": 1.3234013033641583, + "learning_rate": 6.634401438921122e-06, + "loss": 0.6746, + "step": 1939 + }, + { + "epoch": 0.45513196480938417, + "grad_norm": 1.0470469732467738, + "learning_rate": 6.630529950110528e-06, + "loss": 0.7021, + "step": 1940 + }, + { + "epoch": 0.455366568914956, + "grad_norm": 0.9471030586762847, + "learning_rate": 6.626657367097278e-06, + "loss": 0.6298, + "step": 1941 + }, + { + "epoch": 0.4556011730205279, + "grad_norm": 0.8134550564494365, + "learning_rate": 6.62278369248015e-06, + "loss": 0.6649, + "step": 1942 + }, + { + "epoch": 0.4558357771260997, + "grad_norm": 1.9352155842283765, + "learning_rate": 6.618908928858663e-06, + "loss": 0.6156, + "step": 1943 + }, + { + "epoch": 0.45607038123167154, + "grad_norm": 2.980139483293199, + "learning_rate": 6.615033078833058e-06, + "loss": 0.6108, + "step": 1944 + }, + { + "epoch": 0.4563049853372434, + "grad_norm": 1.1365315130543463, + "learning_rate": 6.611156145004308e-06, + "loss": 0.6523, + "step": 1945 + }, + { + "epoch": 0.45653958944281525, + "grad_norm": 11.077330773976612, + "learning_rate": 6.607278129974115e-06, + "loss": 0.6289, + "step": 1946 + }, + { + "epoch": 0.4567741935483871, + "grad_norm": 2.329158201272649, + "learning_rate": 6.603399036344906e-06, + "loss": 0.7082, + "step": 1947 + }, + { + "epoch": 0.45700879765395896, + "grad_norm": 2.0870534137036825, + "learning_rate": 6.599518866719832e-06, + "loss": 0.6364, + "step": 1948 + }, + { + "epoch": 0.45724340175953077, + "grad_norm": 2.263397559926266, + "learning_rate": 6.595637623702763e-06, + "loss": 0.5984, + "step": 1949 + }, + { + "epoch": 0.4574780058651026, + "grad_norm": 1.6725044388192607, + "learning_rate": 6.591755309898293e-06, + "loss": 0.5779, + "step": 1950 + }, + { + "epoch": 0.4577126099706745, + "grad_norm": 2.2161112880528253, + "learning_rate": 6.587871927911736e-06, + "loss": 0.6248, + "step": 1951 + }, + { + "epoch": 0.45794721407624633, + "grad_norm": 1.2433596954618322, + "learning_rate": 6.583987480349114e-06, + "loss": 0.6325, + "step": 1952 + }, + { + "epoch": 0.4581818181818182, + "grad_norm": 1.2299981012283725, + "learning_rate": 6.580101969817176e-06, + "loss": 0.6154, + "step": 1953 + }, + { + "epoch": 0.45841642228739005, + "grad_norm": 1.5773499642534672, + "learning_rate": 6.576215398923374e-06, + "loss": 0.6567, + "step": 1954 + }, + { + "epoch": 0.4586510263929619, + "grad_norm": 0.9403302159770113, + "learning_rate": 6.5723277702758815e-06, + "loss": 0.6432, + "step": 1955 + }, + { + "epoch": 0.4588856304985337, + "grad_norm": 1.128632972579098, + "learning_rate": 6.568439086483571e-06, + "loss": 0.6316, + "step": 1956 + }, + { + "epoch": 0.45912023460410556, + "grad_norm": 1.2429754863244453, + "learning_rate": 6.564549350156031e-06, + "loss": 0.681, + "step": 1957 + }, + { + "epoch": 0.4593548387096774, + "grad_norm": 1.355363076347527, + "learning_rate": 6.560658563903552e-06, + "loss": 0.6611, + "step": 1958 + }, + { + "epoch": 0.45958944281524927, + "grad_norm": 1.4509475026824954, + "learning_rate": 6.556766730337134e-06, + "loss": 0.6673, + "step": 1959 + }, + { + "epoch": 0.45982404692082113, + "grad_norm": 1.5079323246745762, + "learning_rate": 6.552873852068477e-06, + "loss": 0.608, + "step": 1960 + }, + { + "epoch": 0.460058651026393, + "grad_norm": 1.4692177513176758, + "learning_rate": 6.548979931709977e-06, + "loss": 0.6185, + "step": 1961 + }, + { + "epoch": 0.4602932551319648, + "grad_norm": 0.9131837214988069, + "learning_rate": 6.545084971874738e-06, + "loss": 0.6479, + "step": 1962 + }, + { + "epoch": 0.46052785923753664, + "grad_norm": 1.1917507594725938, + "learning_rate": 6.541188975176557e-06, + "loss": 0.5956, + "step": 1963 + }, + { + "epoch": 0.4607624633431085, + "grad_norm": 1.1443475410357604, + "learning_rate": 6.537291944229928e-06, + "loss": 0.6449, + "step": 1964 + }, + { + "epoch": 0.46099706744868035, + "grad_norm": 3.0987968141202886, + "learning_rate": 6.533393881650035e-06, + "loss": 0.6057, + "step": 1965 + }, + { + "epoch": 0.4612316715542522, + "grad_norm": 0.9226564047019269, + "learning_rate": 6.529494790052764e-06, + "loss": 0.588, + "step": 1966 + }, + { + "epoch": 0.46146627565982407, + "grad_norm": 1.020077010181788, + "learning_rate": 6.52559467205468e-06, + "loss": 0.6508, + "step": 1967 + }, + { + "epoch": 0.4617008797653959, + "grad_norm": 1.1013146406559506, + "learning_rate": 6.521693530273046e-06, + "loss": 0.6576, + "step": 1968 + }, + { + "epoch": 0.4619354838709677, + "grad_norm": 2.1982949825750056, + "learning_rate": 6.517791367325803e-06, + "loss": 0.6539, + "step": 1969 + }, + { + "epoch": 0.4621700879765396, + "grad_norm": 1.245381337261458, + "learning_rate": 6.513888185831589e-06, + "loss": 0.6192, + "step": 1970 + }, + { + "epoch": 0.46240469208211143, + "grad_norm": 1.6278032653959793, + "learning_rate": 6.509983988409715e-06, + "loss": 0.6307, + "step": 1971 + }, + { + "epoch": 0.4626392961876833, + "grad_norm": 0.9983515455059903, + "learning_rate": 6.506078777680178e-06, + "loss": 0.5629, + "step": 1972 + }, + { + "epoch": 0.46287390029325515, + "grad_norm": 0.9447709125469961, + "learning_rate": 6.502172556263656e-06, + "loss": 0.6282, + "step": 1973 + }, + { + "epoch": 0.463108504398827, + "grad_norm": 1.679620294931306, + "learning_rate": 6.4982653267815055e-06, + "loss": 0.6206, + "step": 1974 + }, + { + "epoch": 0.4633431085043988, + "grad_norm": 0.9613639374971835, + "learning_rate": 6.494357091855752e-06, + "loss": 0.6221, + "step": 1975 + }, + { + "epoch": 0.46357771260997066, + "grad_norm": 2.238864881041864, + "learning_rate": 6.490447854109109e-06, + "loss": 0.6239, + "step": 1976 + }, + { + "epoch": 0.4638123167155425, + "grad_norm": 1.6213560232591244, + "learning_rate": 6.486537616164951e-06, + "loss": 0.6991, + "step": 1977 + }, + { + "epoch": 0.4640469208211144, + "grad_norm": 6.759059559850818, + "learning_rate": 6.48262638064733e-06, + "loss": 0.6261, + "step": 1978 + }, + { + "epoch": 0.46428152492668623, + "grad_norm": 2.1463681214030945, + "learning_rate": 6.478714150180964e-06, + "loss": 0.5846, + "step": 1979 + }, + { + "epoch": 0.4645161290322581, + "grad_norm": 1.7239668052266666, + "learning_rate": 6.4748009273912436e-06, + "loss": 0.6363, + "step": 1980 + }, + { + "epoch": 0.4647507331378299, + "grad_norm": 1.6135701667408295, + "learning_rate": 6.470886714904221e-06, + "loss": 0.6749, + "step": 1981 + }, + { + "epoch": 0.46498533724340174, + "grad_norm": 8.984122051071271, + "learning_rate": 6.466971515346612e-06, + "loss": 0.6738, + "step": 1982 + }, + { + "epoch": 0.4652199413489736, + "grad_norm": 0.812893794337249, + "learning_rate": 6.4630553313457985e-06, + "loss": 0.6484, + "step": 1983 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 2.7612121859292578, + "learning_rate": 6.4591381655298225e-06, + "loss": 0.6734, + "step": 1984 + }, + { + "epoch": 0.4656891495601173, + "grad_norm": 0.8238126963025542, + "learning_rate": 6.4552200205273795e-06, + "loss": 0.6092, + "step": 1985 + }, + { + "epoch": 0.46592375366568917, + "grad_norm": 1.2316750855753464, + "learning_rate": 6.451300898967829e-06, + "loss": 0.6208, + "step": 1986 + }, + { + "epoch": 0.466158357771261, + "grad_norm": 1.702594986920346, + "learning_rate": 6.4473808034811836e-06, + "loss": 0.6237, + "step": 1987 + }, + { + "epoch": 0.4663929618768328, + "grad_norm": 2.9301433173037084, + "learning_rate": 6.443459736698106e-06, + "loss": 0.6545, + "step": 1988 + }, + { + "epoch": 0.4666275659824047, + "grad_norm": 3.0236575786668065, + "learning_rate": 6.439537701249915e-06, + "loss": 0.6496, + "step": 1989 + }, + { + "epoch": 0.46686217008797654, + "grad_norm": 1.1340598218212603, + "learning_rate": 6.435614699768577e-06, + "loss": 0.5931, + "step": 1990 + }, + { + "epoch": 0.4670967741935484, + "grad_norm": 1.1931136346109332, + "learning_rate": 6.431690734886711e-06, + "loss": 0.6094, + "step": 1991 + }, + { + "epoch": 0.46733137829912025, + "grad_norm": 1.0004037116472506, + "learning_rate": 6.427765809237574e-06, + "loss": 0.6651, + "step": 1992 + }, + { + "epoch": 0.4675659824046921, + "grad_norm": 1.8775423558202449, + "learning_rate": 6.423839925455078e-06, + "loss": 0.6376, + "step": 1993 + }, + { + "epoch": 0.4678005865102639, + "grad_norm": 4.252068076887891, + "learning_rate": 6.419913086173769e-06, + "loss": 0.608, + "step": 1994 + }, + { + "epoch": 0.46803519061583576, + "grad_norm": 1.392209392136907, + "learning_rate": 6.415985294028839e-06, + "loss": 0.6179, + "step": 1995 + }, + { + "epoch": 0.4682697947214076, + "grad_norm": 0.8941148909710063, + "learning_rate": 6.412056551656121e-06, + "loss": 0.5984, + "step": 1996 + }, + { + "epoch": 0.4685043988269795, + "grad_norm": 1.5181958453031448, + "learning_rate": 6.40812686169208e-06, + "loss": 0.644, + "step": 1997 + }, + { + "epoch": 0.46873900293255133, + "grad_norm": 1.5176366895207596, + "learning_rate": 6.40419622677382e-06, + "loss": 0.6205, + "step": 1998 + }, + { + "epoch": 0.4689736070381232, + "grad_norm": 1.0608470980433227, + "learning_rate": 6.400264649539083e-06, + "loss": 0.621, + "step": 1999 + }, + { + "epoch": 0.46920821114369504, + "grad_norm": 1.4891152428412084, + "learning_rate": 6.396332132626235e-06, + "loss": 0.6087, + "step": 2000 + }, + { + "epoch": 0.46920821114369504, + "eval_loss": 0.6271416544914246, + "eval_runtime": 34.309, + "eval_samples_per_second": 15.914, + "eval_steps_per_second": 0.146, + "step": 2000 + }, + { + "epoch": 0.46944281524926684, + "grad_norm": 1.323818856110043, + "learning_rate": 6.392398678674281e-06, + "loss": 0.6231, + "step": 2001 + }, + { + "epoch": 0.4696774193548387, + "grad_norm": 1.403156838611236, + "learning_rate": 6.388464290322848e-06, + "loss": 0.5927, + "step": 2002 + }, + { + "epoch": 0.46991202346041056, + "grad_norm": 1.8337101115881043, + "learning_rate": 6.384528970212196e-06, + "loss": 0.6724, + "step": 2003 + }, + { + "epoch": 0.4701466275659824, + "grad_norm": 1.4523158804398866, + "learning_rate": 6.380592720983203e-06, + "loss": 0.6227, + "step": 2004 + }, + { + "epoch": 0.47038123167155427, + "grad_norm": 1.1814001637036708, + "learning_rate": 6.376655545277379e-06, + "loss": 0.6321, + "step": 2005 + }, + { + "epoch": 0.4706158357771261, + "grad_norm": 1.5839664998501093, + "learning_rate": 6.3727174457368505e-06, + "loss": 0.6386, + "step": 2006 + }, + { + "epoch": 0.4708504398826979, + "grad_norm": 1.263564175177129, + "learning_rate": 6.368778425004365e-06, + "loss": 0.6571, + "step": 2007 + }, + { + "epoch": 0.4710850439882698, + "grad_norm": 1.1036086210231406, + "learning_rate": 6.364838485723286e-06, + "loss": 0.6061, + "step": 2008 + }, + { + "epoch": 0.47131964809384164, + "grad_norm": 1.690351485184434, + "learning_rate": 6.360897630537599e-06, + "loss": 0.6111, + "step": 2009 + }, + { + "epoch": 0.4715542521994135, + "grad_norm": 1.6972648310695901, + "learning_rate": 6.3569558620918985e-06, + "loss": 0.6623, + "step": 2010 + }, + { + "epoch": 0.47178885630498535, + "grad_norm": 1.4296033586068853, + "learning_rate": 6.353013183031394e-06, + "loss": 0.6697, + "step": 2011 + }, + { + "epoch": 0.4720234604105572, + "grad_norm": 1.8778101754810188, + "learning_rate": 6.349069596001908e-06, + "loss": 0.6654, + "step": 2012 + }, + { + "epoch": 0.472258064516129, + "grad_norm": 1.1484421913508873, + "learning_rate": 6.34512510364987e-06, + "loss": 0.6109, + "step": 2013 + }, + { + "epoch": 0.47249266862170086, + "grad_norm": 1.2213269093026615, + "learning_rate": 6.341179708622315e-06, + "loss": 0.6094, + "step": 2014 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 1.1797986510112424, + "learning_rate": 6.337233413566889e-06, + "loss": 0.6249, + "step": 2015 + }, + { + "epoch": 0.4729618768328446, + "grad_norm": 2.0379696555636713, + "learning_rate": 6.3332862211318406e-06, + "loss": 0.6219, + "step": 2016 + }, + { + "epoch": 0.47319648093841643, + "grad_norm": 0.8989640094613222, + "learning_rate": 6.3293381339660155e-06, + "loss": 0.6878, + "step": 2017 + }, + { + "epoch": 0.4734310850439883, + "grad_norm": 2.6534551632356376, + "learning_rate": 6.325389154718865e-06, + "loss": 0.6281, + "step": 2018 + }, + { + "epoch": 0.47366568914956014, + "grad_norm": 1.0670968163009993, + "learning_rate": 6.32143928604044e-06, + "loss": 0.6391, + "step": 2019 + }, + { + "epoch": 0.47390029325513194, + "grad_norm": 1.2520887385389352, + "learning_rate": 6.317488530581384e-06, + "loss": 0.5974, + "step": 2020 + }, + { + "epoch": 0.4741348973607038, + "grad_norm": 2.1016639103309633, + "learning_rate": 6.313536890992935e-06, + "loss": 0.6931, + "step": 2021 + }, + { + "epoch": 0.47436950146627566, + "grad_norm": 0.8221569405185536, + "learning_rate": 6.309584369926932e-06, + "loss": 0.6261, + "step": 2022 + }, + { + "epoch": 0.4746041055718475, + "grad_norm": 1.0469243110888673, + "learning_rate": 6.305630970035796e-06, + "loss": 0.5932, + "step": 2023 + }, + { + "epoch": 0.47483870967741937, + "grad_norm": 1.2193409312969328, + "learning_rate": 6.301676693972547e-06, + "loss": 0.6393, + "step": 2024 + }, + { + "epoch": 0.4750733137829912, + "grad_norm": 1.0060176975182853, + "learning_rate": 6.297721544390782e-06, + "loss": 0.5951, + "step": 2025 + }, + { + "epoch": 0.475307917888563, + "grad_norm": 1.2805031802136542, + "learning_rate": 6.293765523944697e-06, + "loss": 0.6822, + "step": 2026 + }, + { + "epoch": 0.4755425219941349, + "grad_norm": 1.5525533878292266, + "learning_rate": 6.28980863528906e-06, + "loss": 0.7123, + "step": 2027 + }, + { + "epoch": 0.47577712609970674, + "grad_norm": 1.7537531970959022, + "learning_rate": 6.285850881079229e-06, + "loss": 0.6235, + "step": 2028 + }, + { + "epoch": 0.4760117302052786, + "grad_norm": 2.624656138701152, + "learning_rate": 6.281892263971143e-06, + "loss": 0.6145, + "step": 2029 + }, + { + "epoch": 0.47624633431085045, + "grad_norm": 1.2092601576569733, + "learning_rate": 6.277932786621318e-06, + "loss": 0.6081, + "step": 2030 + }, + { + "epoch": 0.4764809384164223, + "grad_norm": 1.4072005358178026, + "learning_rate": 6.273972451686845e-06, + "loss": 0.6363, + "step": 2031 + }, + { + "epoch": 0.47671554252199416, + "grad_norm": 1.0231130407716105, + "learning_rate": 6.270011261825396e-06, + "loss": 0.61, + "step": 2032 + }, + { + "epoch": 0.47695014662756596, + "grad_norm": 1.4865705604934518, + "learning_rate": 6.266049219695211e-06, + "loss": 0.6918, + "step": 2033 + }, + { + "epoch": 0.4771847507331378, + "grad_norm": 2.200147147581259, + "learning_rate": 6.262086327955108e-06, + "loss": 0.6011, + "step": 2034 + }, + { + "epoch": 0.4774193548387097, + "grad_norm": 1.1387023311455957, + "learning_rate": 6.258122589264469e-06, + "loss": 0.6083, + "step": 2035 + }, + { + "epoch": 0.47765395894428153, + "grad_norm": 1.4672611309047097, + "learning_rate": 6.25415800628325e-06, + "loss": 0.5993, + "step": 2036 + }, + { + "epoch": 0.4778885630498534, + "grad_norm": 1.3735125227353329, + "learning_rate": 6.250192581671968e-06, + "loss": 0.5535, + "step": 2037 + }, + { + "epoch": 0.47812316715542524, + "grad_norm": 1.0229076024186408, + "learning_rate": 6.246226318091708e-06, + "loss": 0.657, + "step": 2038 + }, + { + "epoch": 0.47835777126099704, + "grad_norm": 1.1284116271169373, + "learning_rate": 6.24225921820412e-06, + "loss": 0.6119, + "step": 2039 + }, + { + "epoch": 0.4785923753665689, + "grad_norm": 1.018314898840206, + "learning_rate": 6.238291284671409e-06, + "loss": 0.6873, + "step": 2040 + }, + { + "epoch": 0.47882697947214076, + "grad_norm": 1.2858736025682194, + "learning_rate": 6.234322520156347e-06, + "loss": 0.5724, + "step": 2041 + }, + { + "epoch": 0.4790615835777126, + "grad_norm": 2.88091820997555, + "learning_rate": 6.2303529273222585e-06, + "loss": 0.6074, + "step": 2042 + }, + { + "epoch": 0.47929618768328447, + "grad_norm": 3.0003708776457056, + "learning_rate": 6.2263825088330254e-06, + "loss": 0.6572, + "step": 2043 + }, + { + "epoch": 0.4795307917888563, + "grad_norm": 2.9239657263409526, + "learning_rate": 6.222411267353081e-06, + "loss": 0.5966, + "step": 2044 + }, + { + "epoch": 0.4797653958944281, + "grad_norm": 1.0877362072010932, + "learning_rate": 6.218439205547418e-06, + "loss": 0.6138, + "step": 2045 + }, + { + "epoch": 0.48, + "grad_norm": 1.8093569813862858, + "learning_rate": 6.214466326081573e-06, + "loss": 0.6149, + "step": 2046 + }, + { + "epoch": 0.48023460410557184, + "grad_norm": 1.491766211761286, + "learning_rate": 6.210492631621632e-06, + "loss": 0.6524, + "step": 2047 + }, + { + "epoch": 0.4804692082111437, + "grad_norm": 1.0604179495136983, + "learning_rate": 6.206518124834231e-06, + "loss": 0.6317, + "step": 2048 + }, + { + "epoch": 0.48070381231671555, + "grad_norm": 0.7762572817944767, + "learning_rate": 6.202542808386549e-06, + "loss": 0.6077, + "step": 2049 + }, + { + "epoch": 0.4809384164222874, + "grad_norm": 2.583739623293477, + "learning_rate": 6.198566684946308e-06, + "loss": 0.6083, + "step": 2050 + }, + { + "epoch": 0.48117302052785926, + "grad_norm": 1.5067311705327708, + "learning_rate": 6.194589757181772e-06, + "loss": 0.6363, + "step": 2051 + }, + { + "epoch": 0.48140762463343106, + "grad_norm": 0.9814158342670776, + "learning_rate": 6.190612027761748e-06, + "loss": 0.6015, + "step": 2052 + }, + { + "epoch": 0.4816422287390029, + "grad_norm": 0.9679059478670816, + "learning_rate": 6.186633499355576e-06, + "loss": 0.5667, + "step": 2053 + }, + { + "epoch": 0.4818768328445748, + "grad_norm": 1.2641337429134951, + "learning_rate": 6.182654174633132e-06, + "loss": 0.6625, + "step": 2054 + }, + { + "epoch": 0.48211143695014663, + "grad_norm": 1.9502971263233753, + "learning_rate": 6.178674056264831e-06, + "loss": 0.6115, + "step": 2055 + }, + { + "epoch": 0.4823460410557185, + "grad_norm": 1.4368040195448697, + "learning_rate": 6.174693146921618e-06, + "loss": 0.6746, + "step": 2056 + }, + { + "epoch": 0.48258064516129034, + "grad_norm": 2.524271662293183, + "learning_rate": 6.17071144927497e-06, + "loss": 0.5983, + "step": 2057 + }, + { + "epoch": 0.48281524926686215, + "grad_norm": 1.0766715766159713, + "learning_rate": 6.16672896599689e-06, + "loss": 0.6108, + "step": 2058 + }, + { + "epoch": 0.483049853372434, + "grad_norm": 6.1316689147008745, + "learning_rate": 6.162745699759911e-06, + "loss": 0.6344, + "step": 2059 + }, + { + "epoch": 0.48328445747800586, + "grad_norm": 1.3924286006700812, + "learning_rate": 6.158761653237091e-06, + "loss": 0.6685, + "step": 2060 + }, + { + "epoch": 0.4835190615835777, + "grad_norm": 1.7880709858752692, + "learning_rate": 6.15477682910201e-06, + "loss": 0.5301, + "step": 2061 + }, + { + "epoch": 0.48375366568914957, + "grad_norm": 2.1616777292211107, + "learning_rate": 6.150791230028772e-06, + "loss": 0.626, + "step": 2062 + }, + { + "epoch": 0.4839882697947214, + "grad_norm": 1.494502195180896, + "learning_rate": 6.146804858692001e-06, + "loss": 0.6205, + "step": 2063 + }, + { + "epoch": 0.4842228739002933, + "grad_norm": 1.0405545660119742, + "learning_rate": 6.142817717766837e-06, + "loss": 0.5933, + "step": 2064 + }, + { + "epoch": 0.4844574780058651, + "grad_norm": 4.123818420590978, + "learning_rate": 6.138829809928938e-06, + "loss": 0.6474, + "step": 2065 + }, + { + "epoch": 0.48469208211143694, + "grad_norm": 1.6130108710560063, + "learning_rate": 6.1348411378544756e-06, + "loss": 0.6335, + "step": 2066 + }, + { + "epoch": 0.4849266862170088, + "grad_norm": 3.5519518159682733, + "learning_rate": 6.130851704220137e-06, + "loss": 0.6165, + "step": 2067 + }, + { + "epoch": 0.48516129032258065, + "grad_norm": 1.1605141693731862, + "learning_rate": 6.126861511703119e-06, + "loss": 0.6823, + "step": 2068 + }, + { + "epoch": 0.4853958944281525, + "grad_norm": 1.4032447915593402, + "learning_rate": 6.122870562981125e-06, + "loss": 0.5898, + "step": 2069 + }, + { + "epoch": 0.48563049853372436, + "grad_norm": 2.762947182978864, + "learning_rate": 6.118878860732369e-06, + "loss": 0.5631, + "step": 2070 + }, + { + "epoch": 0.48586510263929616, + "grad_norm": 1.3411722921723264, + "learning_rate": 6.1148864076355695e-06, + "loss": 0.6272, + "step": 2071 + }, + { + "epoch": 0.486099706744868, + "grad_norm": 1.439123735665396, + "learning_rate": 6.110893206369951e-06, + "loss": 0.5621, + "step": 2072 + }, + { + "epoch": 0.4863343108504399, + "grad_norm": 81.63147080931444, + "learning_rate": 6.1068992596152355e-06, + "loss": 0.619, + "step": 2073 + }, + { + "epoch": 0.48656891495601173, + "grad_norm": 1.4230195049501757, + "learning_rate": 6.102904570051649e-06, + "loss": 0.5931, + "step": 2074 + }, + { + "epoch": 0.4868035190615836, + "grad_norm": 2.1737217123008636, + "learning_rate": 6.098909140359917e-06, + "loss": 0.5828, + "step": 2075 + }, + { + "epoch": 0.48703812316715545, + "grad_norm": 2.329390791042317, + "learning_rate": 6.094912973221256e-06, + "loss": 0.6416, + "step": 2076 + }, + { + "epoch": 0.48727272727272725, + "grad_norm": 2.4877429915990135, + "learning_rate": 6.090916071317383e-06, + "loss": 0.6558, + "step": 2077 + }, + { + "epoch": 0.4875073313782991, + "grad_norm": 1.7686598894141479, + "learning_rate": 6.086918437330508e-06, + "loss": 0.6116, + "step": 2078 + }, + { + "epoch": 0.48774193548387096, + "grad_norm": 1.1245664056743825, + "learning_rate": 6.082920073943328e-06, + "loss": 0.6295, + "step": 2079 + }, + { + "epoch": 0.4879765395894428, + "grad_norm": 1.050651182314869, + "learning_rate": 6.078920983839032e-06, + "loss": 0.6205, + "step": 2080 + }, + { + "epoch": 0.48821114369501467, + "grad_norm": 0.9879313298149861, + "learning_rate": 6.074921169701296e-06, + "loss": 0.5395, + "step": 2081 + }, + { + "epoch": 0.4884457478005865, + "grad_norm": 1.681688526777483, + "learning_rate": 6.070920634214286e-06, + "loss": 0.6156, + "step": 2082 + }, + { + "epoch": 0.4886803519061584, + "grad_norm": 1.2333418581645228, + "learning_rate": 6.066919380062644e-06, + "loss": 0.5937, + "step": 2083 + }, + { + "epoch": 0.4889149560117302, + "grad_norm": 1.1003068376854757, + "learning_rate": 6.0629174099315e-06, + "loss": 0.6386, + "step": 2084 + }, + { + "epoch": 0.48914956011730204, + "grad_norm": 1.4249697023558556, + "learning_rate": 6.0589147265064655e-06, + "loss": 0.6249, + "step": 2085 + }, + { + "epoch": 0.4893841642228739, + "grad_norm": 1.2707201307018698, + "learning_rate": 6.054911332473627e-06, + "loss": 0.6106, + "step": 2086 + }, + { + "epoch": 0.48961876832844575, + "grad_norm": 1.1218073590723743, + "learning_rate": 6.050907230519547e-06, + "loss": 0.6174, + "step": 2087 + }, + { + "epoch": 0.4898533724340176, + "grad_norm": 1.218911458915031, + "learning_rate": 6.046902423331271e-06, + "loss": 0.677, + "step": 2088 + }, + { + "epoch": 0.49008797653958946, + "grad_norm": 1.1170171887739733, + "learning_rate": 6.0428969135963064e-06, + "loss": 0.6809, + "step": 2089 + }, + { + "epoch": 0.49032258064516127, + "grad_norm": 1.4478340510895522, + "learning_rate": 6.038890704002641e-06, + "loss": 0.6629, + "step": 2090 + }, + { + "epoch": 0.4905571847507331, + "grad_norm": 2.3220845394586704, + "learning_rate": 6.0348837972387295e-06, + "loss": 0.7104, + "step": 2091 + }, + { + "epoch": 0.490791788856305, + "grad_norm": 1.4967703459157093, + "learning_rate": 6.030876195993491e-06, + "loss": 0.5655, + "step": 2092 + }, + { + "epoch": 0.49102639296187683, + "grad_norm": 3.1962387851332315, + "learning_rate": 6.026867902956317e-06, + "loss": 0.6309, + "step": 2093 + }, + { + "epoch": 0.4912609970674487, + "grad_norm": 4.875947490971553, + "learning_rate": 6.022858920817056e-06, + "loss": 0.6764, + "step": 2094 + }, + { + "epoch": 0.49149560117302055, + "grad_norm": 2.3529662411181818, + "learning_rate": 6.018849252266028e-06, + "loss": 0.6279, + "step": 2095 + }, + { + "epoch": 0.4917302052785924, + "grad_norm": 0.8877451951374036, + "learning_rate": 6.014838899994004e-06, + "loss": 0.6, + "step": 2096 + }, + { + "epoch": 0.4919648093841642, + "grad_norm": 1.0256533418763865, + "learning_rate": 6.010827866692219e-06, + "loss": 0.6372, + "step": 2097 + }, + { + "epoch": 0.49219941348973606, + "grad_norm": 2.645472715766187, + "learning_rate": 6.006816155052366e-06, + "loss": 0.6368, + "step": 2098 + }, + { + "epoch": 0.4924340175953079, + "grad_norm": 1.1803660965278748, + "learning_rate": 6.002803767766592e-06, + "loss": 0.5749, + "step": 2099 + }, + { + "epoch": 0.49266862170087977, + "grad_norm": 2.153341937182534, + "learning_rate": 5.9987907075274935e-06, + "loss": 0.5923, + "step": 2100 + }, + { + "epoch": 0.49290322580645163, + "grad_norm": 1.5279385287985987, + "learning_rate": 5.994776977028126e-06, + "loss": 0.5865, + "step": 2101 + }, + { + "epoch": 0.4931378299120235, + "grad_norm": 1.0401541582219007, + "learning_rate": 5.990762578961988e-06, + "loss": 0.6482, + "step": 2102 + }, + { + "epoch": 0.4933724340175953, + "grad_norm": 1.3610873882619805, + "learning_rate": 5.986747516023031e-06, + "loss": 0.6234, + "step": 2103 + }, + { + "epoch": 0.49360703812316714, + "grad_norm": 1.0965898966482213, + "learning_rate": 5.9827317909056474e-06, + "loss": 0.6119, + "step": 2104 + }, + { + "epoch": 0.493841642228739, + "grad_norm": 0.8575468956558576, + "learning_rate": 5.9787154063046805e-06, + "loss": 0.5901, + "step": 2105 + }, + { + "epoch": 0.49407624633431085, + "grad_norm": 1.7014611619920734, + "learning_rate": 5.97469836491541e-06, + "loss": 0.6585, + "step": 2106 + }, + { + "epoch": 0.4943108504398827, + "grad_norm": 1.4593600436030598, + "learning_rate": 5.970680669433561e-06, + "loss": 0.6381, + "step": 2107 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 1.2626003058937378, + "learning_rate": 5.966662322555295e-06, + "loss": 0.6838, + "step": 2108 + }, + { + "epoch": 0.49478005865102637, + "grad_norm": 1.5822142751717314, + "learning_rate": 5.962643326977211e-06, + "loss": 0.64, + "step": 2109 + }, + { + "epoch": 0.4950146627565982, + "grad_norm": 1.01398619503039, + "learning_rate": 5.958623685396342e-06, + "loss": 0.6409, + "step": 2110 + }, + { + "epoch": 0.4952492668621701, + "grad_norm": 1.3938547079745711, + "learning_rate": 5.95460340051016e-06, + "loss": 0.6396, + "step": 2111 + }, + { + "epoch": 0.49548387096774194, + "grad_norm": 0.9483622057470202, + "learning_rate": 5.950582475016561e-06, + "loss": 0.6118, + "step": 2112 + }, + { + "epoch": 0.4957184750733138, + "grad_norm": 1.4305872444016032, + "learning_rate": 5.946560911613877e-06, + "loss": 0.6435, + "step": 2113 + }, + { + "epoch": 0.49595307917888565, + "grad_norm": 2.1265470222810574, + "learning_rate": 5.9425387130008635e-06, + "loss": 0.6321, + "step": 2114 + }, + { + "epoch": 0.4961876832844575, + "grad_norm": 1.3248876237644547, + "learning_rate": 5.938515881876708e-06, + "loss": 0.6395, + "step": 2115 + }, + { + "epoch": 0.4964222873900293, + "grad_norm": 1.4061732055086722, + "learning_rate": 5.934492420941017e-06, + "loss": 0.6115, + "step": 2116 + }, + { + "epoch": 0.49665689149560116, + "grad_norm": 2.4947567464829548, + "learning_rate": 5.930468332893821e-06, + "loss": 0.6783, + "step": 2117 + }, + { + "epoch": 0.496891495601173, + "grad_norm": 1.121399487685498, + "learning_rate": 5.926443620435572e-06, + "loss": 0.6726, + "step": 2118 + }, + { + "epoch": 0.4971260997067449, + "grad_norm": 1.5613413013974176, + "learning_rate": 5.922418286267143e-06, + "loss": 0.6105, + "step": 2119 + }, + { + "epoch": 0.49736070381231673, + "grad_norm": 1.0832686155216917, + "learning_rate": 5.918392333089819e-06, + "loss": 0.6815, + "step": 2120 + }, + { + "epoch": 0.4975953079178886, + "grad_norm": 1.0810582259921746, + "learning_rate": 5.914365763605308e-06, + "loss": 0.6054, + "step": 2121 + }, + { + "epoch": 0.4978299120234604, + "grad_norm": 1.199910778135771, + "learning_rate": 5.910338580515723e-06, + "loss": 0.6353, + "step": 2122 + }, + { + "epoch": 0.49806451612903224, + "grad_norm": 1.4493540128067932, + "learning_rate": 5.906310786523594e-06, + "loss": 0.6516, + "step": 2123 + }, + { + "epoch": 0.4982991202346041, + "grad_norm": 1.663545592915166, + "learning_rate": 5.9022823843318614e-06, + "loss": 0.6498, + "step": 2124 + }, + { + "epoch": 0.49853372434017595, + "grad_norm": 2.7499053506077575, + "learning_rate": 5.898253376643871e-06, + "loss": 0.6302, + "step": 2125 + }, + { + "epoch": 0.4987683284457478, + "grad_norm": 2.3568726196225644, + "learning_rate": 5.894223766163377e-06, + "loss": 0.6074, + "step": 2126 + }, + { + "epoch": 0.49900293255131967, + "grad_norm": 1.2555315010985002, + "learning_rate": 5.890193555594537e-06, + "loss": 0.6101, + "step": 2127 + }, + { + "epoch": 0.4992375366568915, + "grad_norm": 1.1208816754811883, + "learning_rate": 5.886162747641912e-06, + "loss": 0.6378, + "step": 2128 + }, + { + "epoch": 0.4994721407624633, + "grad_norm": 0.9039331724175373, + "learning_rate": 5.882131345010462e-06, + "loss": 0.6231, + "step": 2129 + }, + { + "epoch": 0.4997067448680352, + "grad_norm": 1.2402817895612375, + "learning_rate": 5.878099350405551e-06, + "loss": 0.6197, + "step": 2130 + }, + { + "epoch": 0.49994134897360704, + "grad_norm": 5.463467280235245, + "learning_rate": 5.874066766532932e-06, + "loss": 0.5755, + "step": 2131 + }, + { + "epoch": 0.5001759530791788, + "grad_norm": 1.2976140282253599, + "learning_rate": 5.870033596098763e-06, + "loss": 0.6427, + "step": 2132 + }, + { + "epoch": 0.5004105571847507, + "grad_norm": 2.5624849206112397, + "learning_rate": 5.8659998418095865e-06, + "loss": 0.5447, + "step": 2133 + }, + { + "epoch": 0.5006451612903225, + "grad_norm": 1.8390436025804224, + "learning_rate": 5.8619655063723455e-06, + "loss": 0.6494, + "step": 2134 + }, + { + "epoch": 0.5008797653958944, + "grad_norm": 2.1348523946917055, + "learning_rate": 5.857930592494366e-06, + "loss": 0.5987, + "step": 2135 + }, + { + "epoch": 0.5011143695014663, + "grad_norm": 1.3960883090503784, + "learning_rate": 5.853895102883366e-06, + "loss": 0.6984, + "step": 2136 + }, + { + "epoch": 0.5013489736070381, + "grad_norm": 1.5349738414978487, + "learning_rate": 5.849859040247447e-06, + "loss": 0.596, + "step": 2137 + }, + { + "epoch": 0.50158357771261, + "grad_norm": 1.9655076525930564, + "learning_rate": 5.845822407295101e-06, + "loss": 0.6229, + "step": 2138 + }, + { + "epoch": 0.5018181818181818, + "grad_norm": 1.3420007788533819, + "learning_rate": 5.841785206735192e-06, + "loss": 0.6193, + "step": 2139 + }, + { + "epoch": 0.5020527859237537, + "grad_norm": 8.117371270109627, + "learning_rate": 5.8377474412769786e-06, + "loss": 0.5652, + "step": 2140 + }, + { + "epoch": 0.5022873900293255, + "grad_norm": 1.561001686558175, + "learning_rate": 5.833709113630086e-06, + "loss": 0.6372, + "step": 2141 + }, + { + "epoch": 0.5025219941348974, + "grad_norm": 2.092024563321373, + "learning_rate": 5.829670226504525e-06, + "loss": 0.5863, + "step": 2142 + }, + { + "epoch": 0.5027565982404693, + "grad_norm": 10.800969441708352, + "learning_rate": 5.825630782610676e-06, + "loss": 0.6496, + "step": 2143 + }, + { + "epoch": 0.5029912023460411, + "grad_norm": 1.426248979743345, + "learning_rate": 5.821590784659298e-06, + "loss": 0.6597, + "step": 2144 + }, + { + "epoch": 0.5032258064516129, + "grad_norm": 1.2249297694081969, + "learning_rate": 5.817550235361519e-06, + "loss": 0.607, + "step": 2145 + }, + { + "epoch": 0.5034604105571847, + "grad_norm": 1.1137700468619252, + "learning_rate": 5.8135091374288374e-06, + "loss": 0.6101, + "step": 2146 + }, + { + "epoch": 0.5036950146627566, + "grad_norm": 1.4303501400443885, + "learning_rate": 5.809467493573122e-06, + "loss": 0.6401, + "step": 2147 + }, + { + "epoch": 0.5039296187683284, + "grad_norm": 18.40844197854649, + "learning_rate": 5.805425306506602e-06, + "loss": 0.5962, + "step": 2148 + }, + { + "epoch": 0.5041642228739003, + "grad_norm": 1.1199426720492256, + "learning_rate": 5.80138257894188e-06, + "loss": 0.6741, + "step": 2149 + }, + { + "epoch": 0.5043988269794721, + "grad_norm": 1.3560101892234653, + "learning_rate": 5.797339313591912e-06, + "loss": 0.6349, + "step": 2150 + }, + { + "epoch": 0.504633431085044, + "grad_norm": 1.2811267233227819, + "learning_rate": 5.7932955131700234e-06, + "loss": 0.6083, + "step": 2151 + }, + { + "epoch": 0.5048680351906158, + "grad_norm": 1.9114263951195682, + "learning_rate": 5.78925118038989e-06, + "loss": 0.5744, + "step": 2152 + }, + { + "epoch": 0.5051026392961877, + "grad_norm": 1.1503305896877258, + "learning_rate": 5.785206317965554e-06, + "loss": 0.6615, + "step": 2153 + }, + { + "epoch": 0.5053372434017596, + "grad_norm": 1.0972389798714104, + "learning_rate": 5.781160928611405e-06, + "loss": 0.5918, + "step": 2154 + }, + { + "epoch": 0.5055718475073314, + "grad_norm": 2.011327435876117, + "learning_rate": 5.777115015042192e-06, + "loss": 0.611, + "step": 2155 + }, + { + "epoch": 0.5058064516129033, + "grad_norm": 1.1611929902591391, + "learning_rate": 5.773068579973012e-06, + "loss": 0.6294, + "step": 2156 + }, + { + "epoch": 0.506041055718475, + "grad_norm": 1.7643719135611815, + "learning_rate": 5.769021626119314e-06, + "loss": 0.6624, + "step": 2157 + }, + { + "epoch": 0.5062756598240469, + "grad_norm": 20.636142531088044, + "learning_rate": 5.764974156196895e-06, + "loss": 0.6737, + "step": 2158 + }, + { + "epoch": 0.5065102639296187, + "grad_norm": 1.247097777242137, + "learning_rate": 5.760926172921897e-06, + "loss": 0.6894, + "step": 2159 + }, + { + "epoch": 0.5067448680351906, + "grad_norm": 1.8066188779463952, + "learning_rate": 5.756877679010808e-06, + "loss": 0.6224, + "step": 2160 + }, + { + "epoch": 0.5069794721407624, + "grad_norm": 2.330999880939858, + "learning_rate": 5.752828677180459e-06, + "loss": 0.6269, + "step": 2161 + }, + { + "epoch": 0.5072140762463343, + "grad_norm": 1.1600168769235768, + "learning_rate": 5.748779170148018e-06, + "loss": 0.6612, + "step": 2162 + }, + { + "epoch": 0.5074486803519062, + "grad_norm": 2.542947949985644, + "learning_rate": 5.744729160630998e-06, + "loss": 0.6315, + "step": 2163 + }, + { + "epoch": 0.507683284457478, + "grad_norm": 0.9921932303454776, + "learning_rate": 5.740678651347244e-06, + "loss": 0.5856, + "step": 2164 + }, + { + "epoch": 0.5079178885630499, + "grad_norm": 2.9383136063362496, + "learning_rate": 5.736627645014941e-06, + "loss": 0.6361, + "step": 2165 + }, + { + "epoch": 0.5081524926686217, + "grad_norm": 1.5761759179803847, + "learning_rate": 5.732576144352603e-06, + "loss": 0.6016, + "step": 2166 + }, + { + "epoch": 0.5083870967741936, + "grad_norm": 1.4922721065637146, + "learning_rate": 5.728524152079078e-06, + "loss": 0.6215, + "step": 2167 + }, + { + "epoch": 0.5086217008797654, + "grad_norm": 1.0309400868013936, + "learning_rate": 5.724471670913545e-06, + "loss": 0.6037, + "step": 2168 + }, + { + "epoch": 0.5088563049853373, + "grad_norm": 9.738469427385654, + "learning_rate": 5.720418703575507e-06, + "loss": 0.6541, + "step": 2169 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 1.701836346894189, + "learning_rate": 5.716365252784799e-06, + "loss": 0.6199, + "step": 2170 + }, + { + "epoch": 0.5093255131964809, + "grad_norm": 1.1885589396843215, + "learning_rate": 5.7123113212615776e-06, + "loss": 0.6227, + "step": 2171 + }, + { + "epoch": 0.5095601173020528, + "grad_norm": 1.0427266253586323, + "learning_rate": 5.70825691172632e-06, + "loss": 0.6218, + "step": 2172 + }, + { + "epoch": 0.5097947214076246, + "grad_norm": 0.9257854703993581, + "learning_rate": 5.704202026899827e-06, + "loss": 0.5982, + "step": 2173 + }, + { + "epoch": 0.5100293255131965, + "grad_norm": 1.4673358268015058, + "learning_rate": 5.700146669503217e-06, + "loss": 0.5778, + "step": 2174 + }, + { + "epoch": 0.5102639296187683, + "grad_norm": 1.6198375985508418, + "learning_rate": 5.696090842257925e-06, + "loss": 0.6297, + "step": 2175 + }, + { + "epoch": 0.5104985337243402, + "grad_norm": 1.339436711188381, + "learning_rate": 5.692034547885705e-06, + "loss": 0.6198, + "step": 2176 + }, + { + "epoch": 0.510733137829912, + "grad_norm": 1.1579018561803922, + "learning_rate": 5.6879777891086195e-06, + "loss": 0.6565, + "step": 2177 + }, + { + "epoch": 0.5109677419354839, + "grad_norm": 1.5933279101788016, + "learning_rate": 5.6839205686490474e-06, + "loss": 0.6677, + "step": 2178 + }, + { + "epoch": 0.5112023460410557, + "grad_norm": 1.3983475973423523, + "learning_rate": 5.6798628892296704e-06, + "loss": 0.6398, + "step": 2179 + }, + { + "epoch": 0.5114369501466276, + "grad_norm": 1.9172780350428003, + "learning_rate": 5.675804753573488e-06, + "loss": 0.6646, + "step": 2180 + }, + { + "epoch": 0.5116715542521995, + "grad_norm": 1.926769896048311, + "learning_rate": 5.6717461644037954e-06, + "loss": 0.6587, + "step": 2181 + }, + { + "epoch": 0.5119061583577713, + "grad_norm": 1.060879830734715, + "learning_rate": 5.667687124444201e-06, + "loss": 0.6258, + "step": 2182 + }, + { + "epoch": 0.5121407624633431, + "grad_norm": 1.9436882055521738, + "learning_rate": 5.663627636418611e-06, + "loss": 0.6141, + "step": 2183 + }, + { + "epoch": 0.5123753665689149, + "grad_norm": 1.3446154026689008, + "learning_rate": 5.659567703051233e-06, + "loss": 0.5471, + "step": 2184 + }, + { + "epoch": 0.5126099706744868, + "grad_norm": 3.7830776679121305, + "learning_rate": 5.655507327066573e-06, + "loss": 0.5606, + "step": 2185 + }, + { + "epoch": 0.5128445747800586, + "grad_norm": 1.5657620918205266, + "learning_rate": 5.651446511189438e-06, + "loss": 0.6554, + "step": 2186 + }, + { + "epoch": 0.5130791788856305, + "grad_norm": 1.756775187657003, + "learning_rate": 5.647385258144924e-06, + "loss": 0.6211, + "step": 2187 + }, + { + "epoch": 0.5133137829912023, + "grad_norm": 1.1675671400970369, + "learning_rate": 5.6433235706584245e-06, + "loss": 0.5962, + "step": 2188 + }, + { + "epoch": 0.5135483870967742, + "grad_norm": 3.6325308426101017, + "learning_rate": 5.639261451455621e-06, + "loss": 0.6199, + "step": 2189 + }, + { + "epoch": 0.513782991202346, + "grad_norm": 0.9891117588749072, + "learning_rate": 5.635198903262491e-06, + "loss": 0.6064, + "step": 2190 + }, + { + "epoch": 0.5140175953079179, + "grad_norm": 1.1258694136256737, + "learning_rate": 5.631135928805294e-06, + "loss": 0.587, + "step": 2191 + }, + { + "epoch": 0.5142521994134898, + "grad_norm": 1.9338927962022219, + "learning_rate": 5.627072530810577e-06, + "loss": 0.6229, + "step": 2192 + }, + { + "epoch": 0.5144868035190616, + "grad_norm": 0.882848012078404, + "learning_rate": 5.6230087120051724e-06, + "loss": 0.5634, + "step": 2193 + }, + { + "epoch": 0.5147214076246335, + "grad_norm": 1.3356355298231395, + "learning_rate": 5.6189444751161945e-06, + "loss": 0.6302, + "step": 2194 + }, + { + "epoch": 0.5149560117302053, + "grad_norm": 1.9578192368455911, + "learning_rate": 5.614879822871039e-06, + "loss": 0.6136, + "step": 2195 + }, + { + "epoch": 0.5151906158357771, + "grad_norm": 1.4082633999462215, + "learning_rate": 5.610814757997377e-06, + "loss": 0.5932, + "step": 2196 + }, + { + "epoch": 0.5154252199413489, + "grad_norm": 0.8495768077701695, + "learning_rate": 5.60674928322316e-06, + "loss": 0.6419, + "step": 2197 + }, + { + "epoch": 0.5156598240469208, + "grad_norm": 1.633680357289323, + "learning_rate": 5.6026834012766155e-06, + "loss": 0.6364, + "step": 2198 + }, + { + "epoch": 0.5158944281524926, + "grad_norm": 0.7862901238288248, + "learning_rate": 5.59861711488624e-06, + "loss": 0.6226, + "step": 2199 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 1.8803043183879258, + "learning_rate": 5.594550426780804e-06, + "loss": 0.6542, + "step": 2200 + }, + { + "epoch": 0.5163636363636364, + "grad_norm": 2.771889576398019, + "learning_rate": 5.590483339689346e-06, + "loss": 0.6825, + "step": 2201 + }, + { + "epoch": 0.5165982404692082, + "grad_norm": 1.0003813650951727, + "learning_rate": 5.586415856341175e-06, + "loss": 0.5556, + "step": 2202 + }, + { + "epoch": 0.5168328445747801, + "grad_norm": 1.885413679762788, + "learning_rate": 5.582347979465864e-06, + "loss": 0.6098, + "step": 2203 + }, + { + "epoch": 0.5170674486803519, + "grad_norm": 0.8870621895639946, + "learning_rate": 5.578279711793251e-06, + "loss": 0.6223, + "step": 2204 + }, + { + "epoch": 0.5173020527859238, + "grad_norm": 5.416718963422954, + "learning_rate": 5.574211056053434e-06, + "loss": 0.6284, + "step": 2205 + }, + { + "epoch": 0.5175366568914956, + "grad_norm": 1.1596375934628205, + "learning_rate": 5.570142014976773e-06, + "loss": 0.6213, + "step": 2206 + }, + { + "epoch": 0.5177712609970675, + "grad_norm": 1.8189386028411145, + "learning_rate": 5.566072591293892e-06, + "loss": 0.616, + "step": 2207 + }, + { + "epoch": 0.5180058651026394, + "grad_norm": 0.9619387015495638, + "learning_rate": 5.562002787735658e-06, + "loss": 0.6506, + "step": 2208 + }, + { + "epoch": 0.5182404692082111, + "grad_norm": 0.9584281818523113, + "learning_rate": 5.557932607033207e-06, + "loss": 0.5707, + "step": 2209 + }, + { + "epoch": 0.518475073313783, + "grad_norm": 2.3093845761194434, + "learning_rate": 5.553862051917922e-06, + "loss": 0.6048, + "step": 2210 + }, + { + "epoch": 0.5187096774193548, + "grad_norm": 0.9421914922940519, + "learning_rate": 5.549791125121435e-06, + "loss": 0.6103, + "step": 2211 + }, + { + "epoch": 0.5189442815249267, + "grad_norm": 2.7344370847521886, + "learning_rate": 5.545719829375633e-06, + "loss": 0.5971, + "step": 2212 + }, + { + "epoch": 0.5191788856304985, + "grad_norm": 1.4049621935973984, + "learning_rate": 5.541648167412648e-06, + "loss": 0.697, + "step": 2213 + }, + { + "epoch": 0.5194134897360704, + "grad_norm": 1.8334813413280062, + "learning_rate": 5.537576141964854e-06, + "loss": 0.6194, + "step": 2214 + }, + { + "epoch": 0.5196480938416422, + "grad_norm": 1.2094605468362534, + "learning_rate": 5.5335037557648765e-06, + "loss": 0.6261, + "step": 2215 + }, + { + "epoch": 0.5198826979472141, + "grad_norm": 0.7978972605979516, + "learning_rate": 5.5294310115455755e-06, + "loss": 0.6063, + "step": 2216 + }, + { + "epoch": 0.520117302052786, + "grad_norm": 1.4040737386700064, + "learning_rate": 5.525357912040058e-06, + "loss": 0.5976, + "step": 2217 + }, + { + "epoch": 0.5203519061583578, + "grad_norm": 1.3325233002743604, + "learning_rate": 5.521284459981662e-06, + "loss": 0.5753, + "step": 2218 + }, + { + "epoch": 0.5205865102639297, + "grad_norm": 4.882551507518323, + "learning_rate": 5.5172106581039705e-06, + "loss": 0.618, + "step": 2219 + }, + { + "epoch": 0.5208211143695015, + "grad_norm": 0.7895624670833509, + "learning_rate": 5.5131365091407955e-06, + "loss": 0.6401, + "step": 2220 + }, + { + "epoch": 0.5210557184750733, + "grad_norm": 1.1171434818172954, + "learning_rate": 5.5090620158261835e-06, + "loss": 0.6545, + "step": 2221 + }, + { + "epoch": 0.5212903225806451, + "grad_norm": 1.856141293153569, + "learning_rate": 5.504987180894411e-06, + "loss": 0.663, + "step": 2222 + }, + { + "epoch": 0.521524926686217, + "grad_norm": 1.374983758999393, + "learning_rate": 5.500912007079987e-06, + "loss": 0.6079, + "step": 2223 + }, + { + "epoch": 0.5217595307917888, + "grad_norm": 1.8524416969913131, + "learning_rate": 5.496836497117642e-06, + "loss": 0.6345, + "step": 2224 + }, + { + "epoch": 0.5219941348973607, + "grad_norm": 1.0130769940812943, + "learning_rate": 5.492760653742339e-06, + "loss": 0.6545, + "step": 2225 + }, + { + "epoch": 0.5222287390029325, + "grad_norm": 4.3285463762841365, + "learning_rate": 5.4886844796892635e-06, + "loss": 0.6231, + "step": 2226 + }, + { + "epoch": 0.5224633431085044, + "grad_norm": 2.9862132184586088, + "learning_rate": 5.4846079776938144e-06, + "loss": 0.6299, + "step": 2227 + }, + { + "epoch": 0.5226979472140763, + "grad_norm": 1.893331135968169, + "learning_rate": 5.480531150491622e-06, + "loss": 0.569, + "step": 2228 + }, + { + "epoch": 0.5229325513196481, + "grad_norm": 2.463041300453357, + "learning_rate": 5.4764540008185294e-06, + "loss": 0.6277, + "step": 2229 + }, + { + "epoch": 0.52316715542522, + "grad_norm": 2.316814928454367, + "learning_rate": 5.472376531410597e-06, + "loss": 0.6186, + "step": 2230 + }, + { + "epoch": 0.5234017595307918, + "grad_norm": 3.8991222865824504, + "learning_rate": 5.468298745004096e-06, + "loss": 0.6329, + "step": 2231 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 1.3242630714170949, + "learning_rate": 5.464220644335518e-06, + "loss": 0.6249, + "step": 2232 + }, + { + "epoch": 0.5238709677419355, + "grad_norm": 0.7227513969002327, + "learning_rate": 5.460142232141561e-06, + "loss": 0.6156, + "step": 2233 + }, + { + "epoch": 0.5241055718475073, + "grad_norm": 1.1764152539151682, + "learning_rate": 5.45606351115913e-06, + "loss": 0.6911, + "step": 2234 + }, + { + "epoch": 0.5243401759530791, + "grad_norm": 0.9231453226180013, + "learning_rate": 5.451984484125341e-06, + "loss": 0.6387, + "step": 2235 + }, + { + "epoch": 0.524574780058651, + "grad_norm": 1.4423818958906918, + "learning_rate": 5.447905153777515e-06, + "loss": 0.6028, + "step": 2236 + }, + { + "epoch": 0.5248093841642228, + "grad_norm": 1.252361802772466, + "learning_rate": 5.4438255228531734e-06, + "loss": 0.5842, + "step": 2237 + }, + { + "epoch": 0.5250439882697947, + "grad_norm": 4.381022046934695, + "learning_rate": 5.439745594090042e-06, + "loss": 0.6338, + "step": 2238 + }, + { + "epoch": 0.5252785923753666, + "grad_norm": 1.3650915934618688, + "learning_rate": 5.4356653702260475e-06, + "loss": 0.6434, + "step": 2239 + }, + { + "epoch": 0.5255131964809384, + "grad_norm": 0.9064496023863303, + "learning_rate": 5.431584853999312e-06, + "loss": 0.5568, + "step": 2240 + }, + { + "epoch": 0.5257478005865103, + "grad_norm": 2.008849151658651, + "learning_rate": 5.427504048148153e-06, + "loss": 0.6, + "step": 2241 + }, + { + "epoch": 0.5259824046920821, + "grad_norm": 2.564532321909153, + "learning_rate": 5.423422955411087e-06, + "loss": 0.6427, + "step": 2242 + }, + { + "epoch": 0.526217008797654, + "grad_norm": 2.5285727151936572, + "learning_rate": 5.41934157852682e-06, + "loss": 0.5802, + "step": 2243 + }, + { + "epoch": 0.5264516129032258, + "grad_norm": 1.0149743248092284, + "learning_rate": 5.415259920234247e-06, + "loss": 0.6196, + "step": 2244 + }, + { + "epoch": 0.5266862170087977, + "grad_norm": 0.970858090329897, + "learning_rate": 5.411177983272454e-06, + "loss": 0.6638, + "step": 2245 + }, + { + "epoch": 0.5269208211143696, + "grad_norm": 1.221113048601327, + "learning_rate": 5.407095770380717e-06, + "loss": 0.671, + "step": 2246 + }, + { + "epoch": 0.5271554252199413, + "grad_norm": 1.5190796071583499, + "learning_rate": 5.40301328429849e-06, + "loss": 0.6139, + "step": 2247 + }, + { + "epoch": 0.5273900293255132, + "grad_norm": 1.7030575936840922, + "learning_rate": 5.398930527765416e-06, + "loss": 0.6286, + "step": 2248 + }, + { + "epoch": 0.527624633431085, + "grad_norm": 1.9951172534871002, + "learning_rate": 5.394847503521316e-06, + "loss": 0.6786, + "step": 2249 + }, + { + "epoch": 0.5278592375366569, + "grad_norm": 1.5364603846571465, + "learning_rate": 5.3907642143061974e-06, + "loss": 0.6772, + "step": 2250 + }, + { + "epoch": 0.5280938416422287, + "grad_norm": 1.5308755333089372, + "learning_rate": 5.386680662860234e-06, + "loss": 0.6152, + "step": 2251 + }, + { + "epoch": 0.5283284457478006, + "grad_norm": 1.5489129645290929, + "learning_rate": 5.382596851923786e-06, + "loss": 0.6646, + "step": 2252 + }, + { + "epoch": 0.5285630498533724, + "grad_norm": 1.2992198648344988, + "learning_rate": 5.378512784237382e-06, + "loss": 0.5925, + "step": 2253 + }, + { + "epoch": 0.5287976539589443, + "grad_norm": 1.2879178518454926, + "learning_rate": 5.374428462541726e-06, + "loss": 0.6355, + "step": 2254 + }, + { + "epoch": 0.5290322580645161, + "grad_norm": 2.227818265359871, + "learning_rate": 5.37034388957769e-06, + "loss": 0.6364, + "step": 2255 + }, + { + "epoch": 0.529266862170088, + "grad_norm": 0.9408682021543482, + "learning_rate": 5.366259068086316e-06, + "loss": 0.5764, + "step": 2256 + }, + { + "epoch": 0.5295014662756599, + "grad_norm": 1.5853926177483033, + "learning_rate": 5.362174000808813e-06, + "loss": 0.6645, + "step": 2257 + }, + { + "epoch": 0.5297360703812317, + "grad_norm": 2.7689887231795396, + "learning_rate": 5.3580886904865525e-06, + "loss": 0.6073, + "step": 2258 + }, + { + "epoch": 0.5299706744868036, + "grad_norm": 1.0482328044058897, + "learning_rate": 5.354003139861075e-06, + "loss": 0.6727, + "step": 2259 + }, + { + "epoch": 0.5302052785923753, + "grad_norm": 1.0236839529278232, + "learning_rate": 5.349917351674073e-06, + "loss": 0.6187, + "step": 2260 + }, + { + "epoch": 0.5304398826979472, + "grad_norm": 1.0129900103232927, + "learning_rate": 5.345831328667408e-06, + "loss": 0.5774, + "step": 2261 + }, + { + "epoch": 0.530674486803519, + "grad_norm": 1.0846671484043156, + "learning_rate": 5.341745073583092e-06, + "loss": 0.6425, + "step": 2262 + }, + { + "epoch": 0.5309090909090909, + "grad_norm": 1.9163706435576997, + "learning_rate": 5.3376585891633e-06, + "loss": 0.5842, + "step": 2263 + }, + { + "epoch": 0.5311436950146627, + "grad_norm": 1.5106485398321852, + "learning_rate": 5.333571878150349e-06, + "loss": 0.6318, + "step": 2264 + }, + { + "epoch": 0.5313782991202346, + "grad_norm": 1.4955458517030962, + "learning_rate": 5.329484943286721e-06, + "loss": 0.6589, + "step": 2265 + }, + { + "epoch": 0.5316129032258065, + "grad_norm": 1.5417351005640656, + "learning_rate": 5.325397787315042e-06, + "loss": 0.6079, + "step": 2266 + }, + { + "epoch": 0.5318475073313783, + "grad_norm": 1.1147049544994247, + "learning_rate": 5.321310412978087e-06, + "loss": 0.6736, + "step": 2267 + }, + { + "epoch": 0.5320821114369502, + "grad_norm": 0.9691010115156603, + "learning_rate": 5.317222823018775e-06, + "loss": 0.6478, + "step": 2268 + }, + { + "epoch": 0.532316715542522, + "grad_norm": 1.720804059010316, + "learning_rate": 5.313135020180177e-06, + "loss": 0.6839, + "step": 2269 + }, + { + "epoch": 0.5325513196480939, + "grad_norm": 1.997084001822593, + "learning_rate": 5.309047007205498e-06, + "loss": 0.647, + "step": 2270 + }, + { + "epoch": 0.5327859237536657, + "grad_norm": 3.1830546212101867, + "learning_rate": 5.30495878683809e-06, + "loss": 0.6638, + "step": 2271 + }, + { + "epoch": 0.5330205278592376, + "grad_norm": 1.1523445703582846, + "learning_rate": 5.300870361821442e-06, + "loss": 0.6396, + "step": 2272 + }, + { + "epoch": 0.5332551319648093, + "grad_norm": 1.8802046275179825, + "learning_rate": 5.296781734899182e-06, + "loss": 0.6272, + "step": 2273 + }, + { + "epoch": 0.5334897360703812, + "grad_norm": 1.6473165116582242, + "learning_rate": 5.292692908815069e-06, + "loss": 0.6731, + "step": 2274 + }, + { + "epoch": 0.533724340175953, + "grad_norm": 1.2678460151946527, + "learning_rate": 5.288603886313003e-06, + "loss": 0.6138, + "step": 2275 + }, + { + "epoch": 0.5339589442815249, + "grad_norm": 2.716014175519501, + "learning_rate": 5.2845146701370085e-06, + "loss": 0.6514, + "step": 2276 + }, + { + "epoch": 0.5341935483870968, + "grad_norm": 1.9686345263377154, + "learning_rate": 5.280425263031245e-06, + "loss": 0.6167, + "step": 2277 + }, + { + "epoch": 0.5344281524926686, + "grad_norm": 1.2309264732837615, + "learning_rate": 5.276335667739998e-06, + "loss": 0.6013, + "step": 2278 + }, + { + "epoch": 0.5346627565982405, + "grad_norm": 1.1053218192979433, + "learning_rate": 5.272245887007678e-06, + "loss": 0.6811, + "step": 2279 + }, + { + "epoch": 0.5348973607038123, + "grad_norm": 1.0822971676051554, + "learning_rate": 5.268155923578822e-06, + "loss": 0.5927, + "step": 2280 + }, + { + "epoch": 0.5351319648093842, + "grad_norm": 3.3062833457443386, + "learning_rate": 5.264065780198089e-06, + "loss": 0.6314, + "step": 2281 + }, + { + "epoch": 0.535366568914956, + "grad_norm": 0.9575935721960738, + "learning_rate": 5.2599754596102615e-06, + "loss": 0.6794, + "step": 2282 + }, + { + "epoch": 0.5356011730205279, + "grad_norm": 2.7721463574049956, + "learning_rate": 5.255884964560235e-06, + "loss": 0.6081, + "step": 2283 + }, + { + "epoch": 0.5358357771260998, + "grad_norm": 2.592514843146242, + "learning_rate": 5.251794297793027e-06, + "loss": 0.6182, + "step": 2284 + }, + { + "epoch": 0.5360703812316715, + "grad_norm": 1.3936995820236064, + "learning_rate": 5.247703462053767e-06, + "loss": 0.6374, + "step": 2285 + }, + { + "epoch": 0.5363049853372434, + "grad_norm": 15.214073236526824, + "learning_rate": 5.243612460087703e-06, + "loss": 0.6459, + "step": 2286 + }, + { + "epoch": 0.5365395894428152, + "grad_norm": 1.230089585822257, + "learning_rate": 5.239521294640185e-06, + "loss": 0.5995, + "step": 2287 + }, + { + "epoch": 0.5367741935483871, + "grad_norm": 1.3390750104505194, + "learning_rate": 5.235429968456686e-06, + "loss": 0.6388, + "step": 2288 + }, + { + "epoch": 0.5370087976539589, + "grad_norm": 2.2923778420764647, + "learning_rate": 5.231338484282774e-06, + "loss": 0.6608, + "step": 2289 + }, + { + "epoch": 0.5372434017595308, + "grad_norm": 1.3427084443327424, + "learning_rate": 5.227246844864131e-06, + "loss": 0.6616, + "step": 2290 + }, + { + "epoch": 0.5374780058651026, + "grad_norm": 1.479501580983395, + "learning_rate": 5.22315505294654e-06, + "loss": 0.6313, + "step": 2291 + }, + { + "epoch": 0.5377126099706745, + "grad_norm": 1.0572575657370458, + "learning_rate": 5.219063111275889e-06, + "loss": 0.5879, + "step": 2292 + }, + { + "epoch": 0.5379472140762463, + "grad_norm": 1.739928364512639, + "learning_rate": 5.214971022598162e-06, + "loss": 0.6682, + "step": 2293 + }, + { + "epoch": 0.5381818181818182, + "grad_norm": 1.437674607065896, + "learning_rate": 5.210878789659445e-06, + "loss": 0.6549, + "step": 2294 + }, + { + "epoch": 0.5384164222873901, + "grad_norm": 1.7351018511658245, + "learning_rate": 5.206786415205921e-06, + "loss": 0.6323, + "step": 2295 + }, + { + "epoch": 0.5386510263929619, + "grad_norm": 1.3106284147500318, + "learning_rate": 5.202693901983868e-06, + "loss": 0.5961, + "step": 2296 + }, + { + "epoch": 0.5388856304985338, + "grad_norm": 2.530769324981352, + "learning_rate": 5.1986012527396524e-06, + "loss": 0.6121, + "step": 2297 + }, + { + "epoch": 0.5391202346041055, + "grad_norm": 1.9709138346894095, + "learning_rate": 5.194508470219739e-06, + "loss": 0.6177, + "step": 2298 + }, + { + "epoch": 0.5393548387096774, + "grad_norm": 1.5527505505893144, + "learning_rate": 5.190415557170678e-06, + "loss": 0.6227, + "step": 2299 + }, + { + "epoch": 0.5395894428152492, + "grad_norm": 1.1474346639859412, + "learning_rate": 5.186322516339108e-06, + "loss": 0.5706, + "step": 2300 + }, + { + "epoch": 0.5398240469208211, + "grad_norm": 2.047406907631492, + "learning_rate": 5.1822293504717525e-06, + "loss": 0.6194, + "step": 2301 + }, + { + "epoch": 0.5400586510263929, + "grad_norm": 1.04144786967005, + "learning_rate": 5.178136062315421e-06, + "loss": 0.6008, + "step": 2302 + }, + { + "epoch": 0.5402932551319648, + "grad_norm": 1.3444351715872211, + "learning_rate": 5.174042654617001e-06, + "loss": 0.6392, + "step": 2303 + }, + { + "epoch": 0.5405278592375367, + "grad_norm": 1.4815625258755283, + "learning_rate": 5.169949130123465e-06, + "loss": 0.641, + "step": 2304 + }, + { + "epoch": 0.5407624633431085, + "grad_norm": 1.77251076274484, + "learning_rate": 5.165855491581863e-06, + "loss": 0.5752, + "step": 2305 + }, + { + "epoch": 0.5409970674486804, + "grad_norm": 1.4646850211091804, + "learning_rate": 5.161761741739319e-06, + "loss": 0.6036, + "step": 2306 + }, + { + "epoch": 0.5412316715542522, + "grad_norm": 2.4059610900230135, + "learning_rate": 5.157667883343031e-06, + "loss": 0.5928, + "step": 2307 + }, + { + "epoch": 0.5414662756598241, + "grad_norm": 1.247773585715356, + "learning_rate": 5.153573919140275e-06, + "loss": 0.6137, + "step": 2308 + }, + { + "epoch": 0.5417008797653959, + "grad_norm": 1.3846254026394906, + "learning_rate": 5.149479851878393e-06, + "loss": 0.6126, + "step": 2309 + }, + { + "epoch": 0.5419354838709678, + "grad_norm": 0.8360359792043676, + "learning_rate": 5.145385684304799e-06, + "loss": 0.6029, + "step": 2310 + }, + { + "epoch": 0.5421700879765395, + "grad_norm": 2.6140391672325602, + "learning_rate": 5.141291419166974e-06, + "loss": 0.6147, + "step": 2311 + }, + { + "epoch": 0.5424046920821114, + "grad_norm": 1.3117410475337958, + "learning_rate": 5.137197059212463e-06, + "loss": 0.5693, + "step": 2312 + }, + { + "epoch": 0.5426392961876833, + "grad_norm": 3.6763496348206948, + "learning_rate": 5.133102607188875e-06, + "loss": 0.6716, + "step": 2313 + }, + { + "epoch": 0.5428739002932551, + "grad_norm": 2.718896859240161, + "learning_rate": 5.1290080658438814e-06, + "loss": 0.5728, + "step": 2314 + }, + { + "epoch": 0.543108504398827, + "grad_norm": 1.2701500246992494, + "learning_rate": 5.124913437925215e-06, + "loss": 0.5884, + "step": 2315 + }, + { + "epoch": 0.5433431085043988, + "grad_norm": 1.079743766066829, + "learning_rate": 5.120818726180662e-06, + "loss": 0.5784, + "step": 2316 + }, + { + "epoch": 0.5435777126099707, + "grad_norm": 1.2813886474414964, + "learning_rate": 5.116723933358071e-06, + "loss": 0.6133, + "step": 2317 + }, + { + "epoch": 0.5438123167155425, + "grad_norm": 1.5789949106389531, + "learning_rate": 5.112629062205341e-06, + "loss": 0.5909, + "step": 2318 + }, + { + "epoch": 0.5440469208211144, + "grad_norm": 1.4241228759151066, + "learning_rate": 5.108534115470424e-06, + "loss": 0.6201, + "step": 2319 + }, + { + "epoch": 0.5442815249266862, + "grad_norm": 1.421847608917025, + "learning_rate": 5.1044390959013225e-06, + "loss": 0.6455, + "step": 2320 + }, + { + "epoch": 0.5445161290322581, + "grad_norm": 2.5473408353621596, + "learning_rate": 5.100344006246092e-06, + "loss": 0.6531, + "step": 2321 + }, + { + "epoch": 0.54475073313783, + "grad_norm": 4.869648761811444, + "learning_rate": 5.0962488492528276e-06, + "loss": 0.6747, + "step": 2322 + }, + { + "epoch": 0.5449853372434018, + "grad_norm": 1.044340096197161, + "learning_rate": 5.092153627669675e-06, + "loss": 0.6818, + "step": 2323 + }, + { + "epoch": 0.5452199413489736, + "grad_norm": 1.1285358434215709, + "learning_rate": 5.088058344244823e-06, + "loss": 0.5337, + "step": 2324 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 1.0293245804494093, + "learning_rate": 5.083963001726501e-06, + "loss": 0.6231, + "step": 2325 + }, + { + "epoch": 0.5456891495601173, + "grad_norm": 7.794166682885448, + "learning_rate": 5.079867602862974e-06, + "loss": 0.6135, + "step": 2326 + }, + { + "epoch": 0.5459237536656891, + "grad_norm": 1.187300247165658, + "learning_rate": 5.075772150402555e-06, + "loss": 0.6588, + "step": 2327 + }, + { + "epoch": 0.546158357771261, + "grad_norm": 1.1393919289225325, + "learning_rate": 5.071676647093581e-06, + "loss": 0.6286, + "step": 2328 + }, + { + "epoch": 0.5463929618768328, + "grad_norm": 3.7156917470133535, + "learning_rate": 5.0675810956844315e-06, + "loss": 0.5989, + "step": 2329 + }, + { + "epoch": 0.5466275659824047, + "grad_norm": 1.3848509997731784, + "learning_rate": 5.0634854989235145e-06, + "loss": 0.6482, + "step": 2330 + }, + { + "epoch": 0.5468621700879766, + "grad_norm": 1.7479155187976059, + "learning_rate": 5.059389859559271e-06, + "loss": 0.6185, + "step": 2331 + }, + { + "epoch": 0.5470967741935484, + "grad_norm": 2.3408468567460736, + "learning_rate": 5.055294180340167e-06, + "loss": 0.6286, + "step": 2332 + }, + { + "epoch": 0.5473313782991203, + "grad_norm": 1.7450838683077203, + "learning_rate": 5.051198464014698e-06, + "loss": 0.6586, + "step": 2333 + }, + { + "epoch": 0.5475659824046921, + "grad_norm": 1.0214958871229616, + "learning_rate": 5.047102713331387e-06, + "loss": 0.6728, + "step": 2334 + }, + { + "epoch": 0.547800586510264, + "grad_norm": 1.0866483702890142, + "learning_rate": 5.04300693103877e-06, + "loss": 0.5837, + "step": 2335 + }, + { + "epoch": 0.5480351906158358, + "grad_norm": 1.6318482107644605, + "learning_rate": 5.038911119885416e-06, + "loss": 0.6479, + "step": 2336 + }, + { + "epoch": 0.5482697947214076, + "grad_norm": 1.3678399546457385, + "learning_rate": 5.034815282619907e-06, + "loss": 0.6184, + "step": 2337 + }, + { + "epoch": 0.5485043988269794, + "grad_norm": 6.6632809204162, + "learning_rate": 5.030719421990845e-06, + "loss": 0.6383, + "step": 2338 + }, + { + "epoch": 0.5487390029325513, + "grad_norm": 1.020276110373152, + "learning_rate": 5.026623540746842e-06, + "loss": 0.562, + "step": 2339 + }, + { + "epoch": 0.5489736070381231, + "grad_norm": 2.23704785312758, + "learning_rate": 5.0225276416365355e-06, + "loss": 0.5807, + "step": 2340 + }, + { + "epoch": 0.549208211143695, + "grad_norm": 1.3090584423433533, + "learning_rate": 5.018431727408563e-06, + "loss": 0.5913, + "step": 2341 + }, + { + "epoch": 0.5494428152492669, + "grad_norm": 1.1215291210455687, + "learning_rate": 5.0143358008115785e-06, + "loss": 0.5933, + "step": 2342 + }, + { + "epoch": 0.5496774193548387, + "grad_norm": 1.3851313663648712, + "learning_rate": 5.010239864594241e-06, + "loss": 0.6181, + "step": 2343 + }, + { + "epoch": 0.5499120234604106, + "grad_norm": 2.5214427901534107, + "learning_rate": 5.00614392150522e-06, + "loss": 0.5563, + "step": 2344 + }, + { + "epoch": 0.5501466275659824, + "grad_norm": 1.6969333520157845, + "learning_rate": 5.002047974293186e-06, + "loss": 0.61, + "step": 2345 + }, + { + "epoch": 0.5503812316715543, + "grad_norm": 0.9065597378911432, + "learning_rate": 4.997952025706816e-06, + "loss": 0.6154, + "step": 2346 + }, + { + "epoch": 0.5506158357771261, + "grad_norm": 3.403546478196266, + "learning_rate": 4.9938560784947815e-06, + "loss": 0.6081, + "step": 2347 + }, + { + "epoch": 0.550850439882698, + "grad_norm": 1.522859665413107, + "learning_rate": 4.98976013540576e-06, + "loss": 0.6104, + "step": 2348 + }, + { + "epoch": 0.5510850439882697, + "grad_norm": 2.2807731653683274, + "learning_rate": 4.985664199188424e-06, + "loss": 0.5902, + "step": 2349 + }, + { + "epoch": 0.5513196480938416, + "grad_norm": 1.996223618876685, + "learning_rate": 4.981568272591439e-06, + "loss": 0.6297, + "step": 2350 + }, + { + "epoch": 0.5515542521994135, + "grad_norm": 4.8740727246594755, + "learning_rate": 4.977472358363466e-06, + "loss": 0.648, + "step": 2351 + }, + { + "epoch": 0.5517888563049853, + "grad_norm": 1.930770244826502, + "learning_rate": 4.9733764592531595e-06, + "loss": 0.6538, + "step": 2352 + }, + { + "epoch": 0.5520234604105572, + "grad_norm": 1.7525721701911414, + "learning_rate": 4.969280578009157e-06, + "loss": 0.6159, + "step": 2353 + }, + { + "epoch": 0.552258064516129, + "grad_norm": 1.635051903248449, + "learning_rate": 4.965184717380094e-06, + "loss": 0.623, + "step": 2354 + }, + { + "epoch": 0.5524926686217009, + "grad_norm": 1.253187755847933, + "learning_rate": 4.961088880114586e-06, + "loss": 0.5675, + "step": 2355 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 2.9737604974053533, + "learning_rate": 4.956993068961231e-06, + "loss": 0.6619, + "step": 2356 + }, + { + "epoch": 0.5529618768328446, + "grad_norm": 2.22359482931915, + "learning_rate": 4.952897286668616e-06, + "loss": 0.6697, + "step": 2357 + }, + { + "epoch": 0.5531964809384164, + "grad_norm": 1.7212673210837532, + "learning_rate": 4.948801535985303e-06, + "loss": 0.6135, + "step": 2358 + }, + { + "epoch": 0.5534310850439883, + "grad_norm": 1.175532897757222, + "learning_rate": 4.944705819659834e-06, + "loss": 0.616, + "step": 2359 + }, + { + "epoch": 0.5536656891495602, + "grad_norm": 2.3673754525350335, + "learning_rate": 4.940610140440731e-06, + "loss": 0.5772, + "step": 2360 + }, + { + "epoch": 0.553900293255132, + "grad_norm": 6.049807078263912, + "learning_rate": 4.936514501076486e-06, + "loss": 0.5753, + "step": 2361 + }, + { + "epoch": 0.5541348973607038, + "grad_norm": 1.6170123243691585, + "learning_rate": 4.932418904315569e-06, + "loss": 0.6588, + "step": 2362 + }, + { + "epoch": 0.5543695014662756, + "grad_norm": 1.329215654069329, + "learning_rate": 4.928323352906421e-06, + "loss": 0.6389, + "step": 2363 + }, + { + "epoch": 0.5546041055718475, + "grad_norm": 1.2327922877038138, + "learning_rate": 4.924227849597447e-06, + "loss": 0.5687, + "step": 2364 + }, + { + "epoch": 0.5548387096774193, + "grad_norm": 1.376394351500655, + "learning_rate": 4.9201323971370264e-06, + "loss": 0.622, + "step": 2365 + }, + { + "epoch": 0.5550733137829912, + "grad_norm": 1.1401847022392284, + "learning_rate": 4.9160369982735025e-06, + "loss": 0.6096, + "step": 2366 + }, + { + "epoch": 0.555307917888563, + "grad_norm": 1.6061812093808758, + "learning_rate": 4.911941655755179e-06, + "loss": 0.6188, + "step": 2367 + }, + { + "epoch": 0.5555425219941349, + "grad_norm": 1.320887601449324, + "learning_rate": 4.907846372330326e-06, + "loss": 0.613, + "step": 2368 + }, + { + "epoch": 0.5557771260997068, + "grad_norm": 1.8809095814675294, + "learning_rate": 4.903751150747176e-06, + "loss": 0.6148, + "step": 2369 + }, + { + "epoch": 0.5560117302052786, + "grad_norm": 0.9737004625936243, + "learning_rate": 4.89965599375391e-06, + "loss": 0.5631, + "step": 2370 + }, + { + "epoch": 0.5562463343108505, + "grad_norm": 2.4030696249831283, + "learning_rate": 4.895560904098678e-06, + "loss": 0.6168, + "step": 2371 + }, + { + "epoch": 0.5564809384164223, + "grad_norm": 1.448894327844498, + "learning_rate": 4.8914658845295786e-06, + "loss": 0.645, + "step": 2372 + }, + { + "epoch": 0.5567155425219942, + "grad_norm": 1.636351851164784, + "learning_rate": 4.887370937794661e-06, + "loss": 0.5676, + "step": 2373 + }, + { + "epoch": 0.556950146627566, + "grad_norm": 3.0144010208779095, + "learning_rate": 4.8832760666419295e-06, + "loss": 0.5601, + "step": 2374 + }, + { + "epoch": 0.5571847507331378, + "grad_norm": 1.5360350101568192, + "learning_rate": 4.87918127381934e-06, + "loss": 0.6082, + "step": 2375 + }, + { + "epoch": 0.5574193548387096, + "grad_norm": 1.4622800973510324, + "learning_rate": 4.8750865620747865e-06, + "loss": 0.5936, + "step": 2376 + }, + { + "epoch": 0.5576539589442815, + "grad_norm": 1.3974336295850467, + "learning_rate": 4.870991934156119e-06, + "loss": 0.5887, + "step": 2377 + }, + { + "epoch": 0.5578885630498533, + "grad_norm": 1.1946185311544872, + "learning_rate": 4.866897392811127e-06, + "loss": 0.6253, + "step": 2378 + }, + { + "epoch": 0.5581231671554252, + "grad_norm": 1.1308622762551268, + "learning_rate": 4.8628029407875385e-06, + "loss": 0.6541, + "step": 2379 + }, + { + "epoch": 0.5583577712609971, + "grad_norm": 0.9214548517449351, + "learning_rate": 4.858708580833027e-06, + "loss": 0.6298, + "step": 2380 + }, + { + "epoch": 0.5585923753665689, + "grad_norm": 1.2700773667255392, + "learning_rate": 4.854614315695203e-06, + "loss": 0.6025, + "step": 2381 + }, + { + "epoch": 0.5588269794721408, + "grad_norm": 1.9741834869439465, + "learning_rate": 4.850520148121609e-06, + "loss": 0.5761, + "step": 2382 + }, + { + "epoch": 0.5590615835777126, + "grad_norm": 1.7369595965255913, + "learning_rate": 4.846426080859728e-06, + "loss": 0.6152, + "step": 2383 + }, + { + "epoch": 0.5592961876832845, + "grad_norm": 2.2043774595925543, + "learning_rate": 4.84233211665697e-06, + "loss": 0.6177, + "step": 2384 + }, + { + "epoch": 0.5595307917888563, + "grad_norm": 1.131176231870637, + "learning_rate": 4.838238258260684e-06, + "loss": 0.6464, + "step": 2385 + }, + { + "epoch": 0.5597653958944282, + "grad_norm": 1.2273789428672568, + "learning_rate": 4.83414450841814e-06, + "loss": 0.6084, + "step": 2386 + }, + { + "epoch": 0.56, + "grad_norm": 1.0192662099118455, + "learning_rate": 4.830050869876536e-06, + "loss": 0.6207, + "step": 2387 + }, + { + "epoch": 0.5602346041055718, + "grad_norm": 1.649900195779437, + "learning_rate": 4.825957345383e-06, + "loss": 0.6458, + "step": 2388 + }, + { + "epoch": 0.5604692082111437, + "grad_norm": 1.7812859104244483, + "learning_rate": 4.8218639376845825e-06, + "loss": 0.6388, + "step": 2389 + }, + { + "epoch": 0.5607038123167155, + "grad_norm": 1.7056418402379854, + "learning_rate": 4.817770649528249e-06, + "loss": 0.5979, + "step": 2390 + }, + { + "epoch": 0.5609384164222874, + "grad_norm": 2.759126474933544, + "learning_rate": 4.813677483660892e-06, + "loss": 0.6573, + "step": 2391 + }, + { + "epoch": 0.5611730205278592, + "grad_norm": 1.3796494768679621, + "learning_rate": 4.809584442829323e-06, + "loss": 0.5753, + "step": 2392 + }, + { + "epoch": 0.5614076246334311, + "grad_norm": 2.225836720974062, + "learning_rate": 4.805491529780261e-06, + "loss": 0.5676, + "step": 2393 + }, + { + "epoch": 0.5616422287390029, + "grad_norm": 4.8603603742046, + "learning_rate": 4.801398747260348e-06, + "loss": 0.576, + "step": 2394 + }, + { + "epoch": 0.5618768328445748, + "grad_norm": 2.728295412602604, + "learning_rate": 4.797306098016135e-06, + "loss": 0.6185, + "step": 2395 + }, + { + "epoch": 0.5621114369501466, + "grad_norm": 1.9233726850880661, + "learning_rate": 4.79321358479408e-06, + "loss": 0.6146, + "step": 2396 + }, + { + "epoch": 0.5623460410557185, + "grad_norm": 1.7933146896764474, + "learning_rate": 4.789121210340555e-06, + "loss": 0.6069, + "step": 2397 + }, + { + "epoch": 0.5625806451612904, + "grad_norm": 0.9867527993249323, + "learning_rate": 4.78502897740184e-06, + "loss": 0.5888, + "step": 2398 + }, + { + "epoch": 0.5628152492668622, + "grad_norm": 1.964434092692377, + "learning_rate": 4.780936888724113e-06, + "loss": 0.6451, + "step": 2399 + }, + { + "epoch": 0.5630498533724341, + "grad_norm": 7.884498287566548, + "learning_rate": 4.77684494705346e-06, + "loss": 0.6011, + "step": 2400 + }, + { + "epoch": 0.5632844574780058, + "grad_norm": 2.5780682514079545, + "learning_rate": 4.772753155135871e-06, + "loss": 0.6058, + "step": 2401 + }, + { + "epoch": 0.5635190615835777, + "grad_norm": 2.179264908491652, + "learning_rate": 4.768661515717227e-06, + "loss": 0.6801, + "step": 2402 + }, + { + "epoch": 0.5637536656891495, + "grad_norm": 1.3226549993329324, + "learning_rate": 4.764570031543316e-06, + "loss": 0.6316, + "step": 2403 + }, + { + "epoch": 0.5639882697947214, + "grad_norm": 1.359005961369105, + "learning_rate": 4.760478705359816e-06, + "loss": 0.6121, + "step": 2404 + }, + { + "epoch": 0.5642228739002932, + "grad_norm": 1.6349963956013602, + "learning_rate": 4.756387539912299e-06, + "loss": 0.6454, + "step": 2405 + }, + { + "epoch": 0.5644574780058651, + "grad_norm": 1.8640311979755861, + "learning_rate": 4.752296537946236e-06, + "loss": 0.6086, + "step": 2406 + }, + { + "epoch": 0.564692082111437, + "grad_norm": 2.5519420011751976, + "learning_rate": 4.748205702206975e-06, + "loss": 0.6204, + "step": 2407 + }, + { + "epoch": 0.5649266862170088, + "grad_norm": 4.836370764245565, + "learning_rate": 4.744115035439766e-06, + "loss": 0.6785, + "step": 2408 + }, + { + "epoch": 0.5651612903225807, + "grad_norm": 2.0737177128369084, + "learning_rate": 4.740024540389741e-06, + "loss": 0.6272, + "step": 2409 + }, + { + "epoch": 0.5653958944281525, + "grad_norm": 0.9540299827895479, + "learning_rate": 4.735934219801912e-06, + "loss": 0.5643, + "step": 2410 + }, + { + "epoch": 0.5656304985337244, + "grad_norm": 1.1685301518053146, + "learning_rate": 4.73184407642118e-06, + "loss": 0.6396, + "step": 2411 + }, + { + "epoch": 0.5658651026392962, + "grad_norm": 3.8313120205410023, + "learning_rate": 4.727754112992326e-06, + "loss": 0.5677, + "step": 2412 + }, + { + "epoch": 0.566099706744868, + "grad_norm": 1.2533957183595168, + "learning_rate": 4.723664332260004e-06, + "loss": 0.6528, + "step": 2413 + }, + { + "epoch": 0.5663343108504398, + "grad_norm": 1.9385572074533628, + "learning_rate": 4.719574736968755e-06, + "loss": 0.5795, + "step": 2414 + }, + { + "epoch": 0.5665689149560117, + "grad_norm": 1.4613315996424439, + "learning_rate": 4.715485329862992e-06, + "loss": 0.7028, + "step": 2415 + }, + { + "epoch": 0.5668035190615835, + "grad_norm": 7.705662308862915, + "learning_rate": 4.711396113686998e-06, + "loss": 0.6981, + "step": 2416 + }, + { + "epoch": 0.5670381231671554, + "grad_norm": 1.35730909069504, + "learning_rate": 4.707307091184931e-06, + "loss": 0.6128, + "step": 2417 + }, + { + "epoch": 0.5672727272727273, + "grad_norm": 1.0543319364793236, + "learning_rate": 4.7032182651008204e-06, + "loss": 0.6413, + "step": 2418 + }, + { + "epoch": 0.5675073313782991, + "grad_norm": 1.3648126645029455, + "learning_rate": 4.6991296381785585e-06, + "loss": 0.6146, + "step": 2419 + }, + { + "epoch": 0.567741935483871, + "grad_norm": 1.0596246737273716, + "learning_rate": 4.69504121316191e-06, + "loss": 0.6238, + "step": 2420 + }, + { + "epoch": 0.5679765395894428, + "grad_norm": 2.954131782774614, + "learning_rate": 4.690952992794504e-06, + "loss": 0.6482, + "step": 2421 + }, + { + "epoch": 0.5682111436950147, + "grad_norm": 1.644712519006594, + "learning_rate": 4.686864979819825e-06, + "loss": 0.6336, + "step": 2422 + }, + { + "epoch": 0.5684457478005865, + "grad_norm": 0.9851063697756272, + "learning_rate": 4.682777176981225e-06, + "loss": 0.595, + "step": 2423 + }, + { + "epoch": 0.5686803519061584, + "grad_norm": 1.9024601181916914, + "learning_rate": 4.6786895870219155e-06, + "loss": 0.6239, + "step": 2424 + }, + { + "epoch": 0.5689149560117303, + "grad_norm": 1.9274176199318562, + "learning_rate": 4.674602212684959e-06, + "loss": 0.626, + "step": 2425 + }, + { + "epoch": 0.569149560117302, + "grad_norm": 1.4662092880540012, + "learning_rate": 4.670515056713279e-06, + "loss": 0.5909, + "step": 2426 + }, + { + "epoch": 0.5693841642228739, + "grad_norm": 1.0391261342097122, + "learning_rate": 4.666428121849653e-06, + "loss": 0.6, + "step": 2427 + }, + { + "epoch": 0.5696187683284457, + "grad_norm": 1.191446619321251, + "learning_rate": 4.662341410836703e-06, + "loss": 0.6285, + "step": 2428 + }, + { + "epoch": 0.5698533724340176, + "grad_norm": 1.4999105564002244, + "learning_rate": 4.658254926416911e-06, + "loss": 0.62, + "step": 2429 + }, + { + "epoch": 0.5700879765395894, + "grad_norm": 1.7234983917641036, + "learning_rate": 4.654168671332594e-06, + "loss": 0.6239, + "step": 2430 + }, + { + "epoch": 0.5703225806451613, + "grad_norm": 1.0770283119340396, + "learning_rate": 4.6500826483259284e-06, + "loss": 0.6065, + "step": 2431 + }, + { + "epoch": 0.5705571847507331, + "grad_norm": 1.2128209828917635, + "learning_rate": 4.645996860138928e-06, + "loss": 0.6046, + "step": 2432 + }, + { + "epoch": 0.570791788856305, + "grad_norm": 2.0413579361061887, + "learning_rate": 4.641911309513449e-06, + "loss": 0.636, + "step": 2433 + }, + { + "epoch": 0.5710263929618768, + "grad_norm": 0.8811953036824909, + "learning_rate": 4.637825999191189e-06, + "loss": 0.6787, + "step": 2434 + }, + { + "epoch": 0.5712609970674487, + "grad_norm": 1.426557418556202, + "learning_rate": 4.6337409319136865e-06, + "loss": 0.6398, + "step": 2435 + }, + { + "epoch": 0.5714956011730206, + "grad_norm": 1.380859345024999, + "learning_rate": 4.629656110422311e-06, + "loss": 0.6174, + "step": 2436 + }, + { + "epoch": 0.5717302052785924, + "grad_norm": 1.1169157694598355, + "learning_rate": 4.625571537458275e-06, + "loss": 0.6291, + "step": 2437 + }, + { + "epoch": 0.5719648093841643, + "grad_norm": 1.2417537193839552, + "learning_rate": 4.621487215762619e-06, + "loss": 0.6316, + "step": 2438 + }, + { + "epoch": 0.572199413489736, + "grad_norm": 1.4795045752184326, + "learning_rate": 4.617403148076215e-06, + "loss": 0.5826, + "step": 2439 + }, + { + "epoch": 0.5724340175953079, + "grad_norm": 0.8933352430022446, + "learning_rate": 4.613319337139767e-06, + "loss": 0.6526, + "step": 2440 + }, + { + "epoch": 0.5726686217008797, + "grad_norm": 1.324711975821137, + "learning_rate": 4.609235785693805e-06, + "loss": 0.5706, + "step": 2441 + }, + { + "epoch": 0.5729032258064516, + "grad_norm": 1.1085588036263778, + "learning_rate": 4.605152496478685e-06, + "loss": 0.6657, + "step": 2442 + }, + { + "epoch": 0.5731378299120234, + "grad_norm": 1.0288656736675692, + "learning_rate": 4.601069472234584e-06, + "loss": 0.5887, + "step": 2443 + }, + { + "epoch": 0.5733724340175953, + "grad_norm": 1.1064627616767901, + "learning_rate": 4.596986715701511e-06, + "loss": 0.6392, + "step": 2444 + }, + { + "epoch": 0.5736070381231672, + "grad_norm": 7.134453408230588, + "learning_rate": 4.592904229619284e-06, + "loss": 0.5801, + "step": 2445 + }, + { + "epoch": 0.573841642228739, + "grad_norm": 3.627498858661479, + "learning_rate": 4.588822016727546e-06, + "loss": 0.6651, + "step": 2446 + }, + { + "epoch": 0.5740762463343109, + "grad_norm": 2.127464440263173, + "learning_rate": 4.584740079765755e-06, + "loss": 0.6006, + "step": 2447 + }, + { + "epoch": 0.5743108504398827, + "grad_norm": 2.535380292056328, + "learning_rate": 4.580658421473182e-06, + "loss": 0.6363, + "step": 2448 + }, + { + "epoch": 0.5745454545454546, + "grad_norm": 1.263636915924277, + "learning_rate": 4.576577044588912e-06, + "loss": 0.5691, + "step": 2449 + }, + { + "epoch": 0.5747800586510264, + "grad_norm": 3.8953432680279536, + "learning_rate": 4.572495951851848e-06, + "loss": 0.6474, + "step": 2450 + }, + { + "epoch": 0.5750146627565983, + "grad_norm": 3.449525943312299, + "learning_rate": 4.568415146000689e-06, + "loss": 0.5945, + "step": 2451 + }, + { + "epoch": 0.57524926686217, + "grad_norm": 0.9085189847225632, + "learning_rate": 4.564334629773956e-06, + "loss": 0.5878, + "step": 2452 + }, + { + "epoch": 0.5754838709677419, + "grad_norm": 1.4430234876118253, + "learning_rate": 4.560254405909959e-06, + "loss": 0.5796, + "step": 2453 + }, + { + "epoch": 0.5757184750733138, + "grad_norm": 0.9715140944677086, + "learning_rate": 4.556174477146828e-06, + "loss": 0.6414, + "step": 2454 + }, + { + "epoch": 0.5759530791788856, + "grad_norm": 1.3802544914953896, + "learning_rate": 4.552094846222487e-06, + "loss": 0.6364, + "step": 2455 + }, + { + "epoch": 0.5761876832844575, + "grad_norm": 14.50739306268874, + "learning_rate": 4.548015515874661e-06, + "loss": 0.6201, + "step": 2456 + }, + { + "epoch": 0.5764222873900293, + "grad_norm": 1.3123194340321647, + "learning_rate": 4.543936488840871e-06, + "loss": 0.6007, + "step": 2457 + }, + { + "epoch": 0.5766568914956012, + "grad_norm": 1.5720636883531176, + "learning_rate": 4.539857767858442e-06, + "loss": 0.6532, + "step": 2458 + }, + { + "epoch": 0.576891495601173, + "grad_norm": 6.814471147180956, + "learning_rate": 4.5357793556644825e-06, + "loss": 0.6058, + "step": 2459 + }, + { + "epoch": 0.5771260997067449, + "grad_norm": 2.249498514114243, + "learning_rate": 4.531701254995905e-06, + "loss": 0.6017, + "step": 2460 + }, + { + "epoch": 0.5773607038123167, + "grad_norm": 2.5722080365660474, + "learning_rate": 4.527623468589406e-06, + "loss": 0.5677, + "step": 2461 + }, + { + "epoch": 0.5775953079178886, + "grad_norm": 1.3189916081527315, + "learning_rate": 4.523545999181472e-06, + "loss": 0.5812, + "step": 2462 + }, + { + "epoch": 0.5778299120234605, + "grad_norm": 1.6996214839000159, + "learning_rate": 4.519468849508379e-06, + "loss": 0.6036, + "step": 2463 + }, + { + "epoch": 0.5780645161290323, + "grad_norm": 0.9931281950875175, + "learning_rate": 4.515392022306187e-06, + "loss": 0.5785, + "step": 2464 + }, + { + "epoch": 0.5782991202346041, + "grad_norm": 1.4534131665761378, + "learning_rate": 4.511315520310738e-06, + "loss": 0.6852, + "step": 2465 + }, + { + "epoch": 0.5785337243401759, + "grad_norm": 0.9868659889697355, + "learning_rate": 4.50723934625766e-06, + "loss": 0.6259, + "step": 2466 + }, + { + "epoch": 0.5787683284457478, + "grad_norm": 2.6107327136802203, + "learning_rate": 4.503163502882359e-06, + "loss": 0.628, + "step": 2467 + }, + { + "epoch": 0.5790029325513196, + "grad_norm": 1.0088657547611688, + "learning_rate": 4.4990879929200145e-06, + "loss": 0.6573, + "step": 2468 + }, + { + "epoch": 0.5792375366568915, + "grad_norm": 1.453365422227254, + "learning_rate": 4.49501281910559e-06, + "loss": 0.6703, + "step": 2469 + }, + { + "epoch": 0.5794721407624633, + "grad_norm": 0.8819630378676603, + "learning_rate": 4.490937984173818e-06, + "loss": 0.594, + "step": 2470 + }, + { + "epoch": 0.5797067448680352, + "grad_norm": 4.032052328443642, + "learning_rate": 4.486863490859205e-06, + "loss": 0.614, + "step": 2471 + }, + { + "epoch": 0.579941348973607, + "grad_norm": 1.6205609214614662, + "learning_rate": 4.482789341896029e-06, + "loss": 0.6446, + "step": 2472 + }, + { + "epoch": 0.5801759530791789, + "grad_norm": 1.7741084960213596, + "learning_rate": 4.47871554001834e-06, + "loss": 0.6264, + "step": 2473 + }, + { + "epoch": 0.5804105571847508, + "grad_norm": 3.095282512692249, + "learning_rate": 4.474642087959944e-06, + "loss": 0.6742, + "step": 2474 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 1.165975209316844, + "learning_rate": 4.470568988454426e-06, + "loss": 0.6051, + "step": 2475 + }, + { + "epoch": 0.5808797653958945, + "grad_norm": 6.365287398706099, + "learning_rate": 4.466496244235126e-06, + "loss": 0.5462, + "step": 2476 + }, + { + "epoch": 0.5811143695014662, + "grad_norm": 1.0096029586962736, + "learning_rate": 4.4624238580351466e-06, + "loss": 0.6374, + "step": 2477 + }, + { + "epoch": 0.5813489736070381, + "grad_norm": 1.198192719480048, + "learning_rate": 4.458351832587354e-06, + "loss": 0.6622, + "step": 2478 + }, + { + "epoch": 0.5815835777126099, + "grad_norm": 1.669480946734029, + "learning_rate": 4.454280170624368e-06, + "loss": 0.613, + "step": 2479 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 1.0721974900794962, + "learning_rate": 4.4502088748785646e-06, + "loss": 0.5943, + "step": 2480 + }, + { + "epoch": 0.5820527859237536, + "grad_norm": 0.7996800998165046, + "learning_rate": 4.446137948082081e-06, + "loss": 0.638, + "step": 2481 + }, + { + "epoch": 0.5822873900293255, + "grad_norm": 1.1849513327296384, + "learning_rate": 4.442067392966794e-06, + "loss": 0.5961, + "step": 2482 + }, + { + "epoch": 0.5825219941348974, + "grad_norm": 2.3522109536517175, + "learning_rate": 4.437997212264343e-06, + "loss": 0.6678, + "step": 2483 + }, + { + "epoch": 0.5827565982404692, + "grad_norm": 1.2178400640964204, + "learning_rate": 4.4339274087061116e-06, + "loss": 0.6717, + "step": 2484 + }, + { + "epoch": 0.5829912023460411, + "grad_norm": 2.640290682903799, + "learning_rate": 4.4298579850232274e-06, + "loss": 0.6418, + "step": 2485 + }, + { + "epoch": 0.5832258064516129, + "grad_norm": 10.345703317890523, + "learning_rate": 4.425788943946567e-06, + "loss": 0.6121, + "step": 2486 + }, + { + "epoch": 0.5834604105571848, + "grad_norm": 6.764934192433235, + "learning_rate": 4.421720288206752e-06, + "loss": 0.6012, + "step": 2487 + }, + { + "epoch": 0.5836950146627566, + "grad_norm": 0.9387854031509927, + "learning_rate": 4.4176520205341365e-06, + "loss": 0.609, + "step": 2488 + }, + { + "epoch": 0.5839296187683285, + "grad_norm": 0.9911978936260616, + "learning_rate": 4.413584143658826e-06, + "loss": 0.6379, + "step": 2489 + }, + { + "epoch": 0.5841642228739002, + "grad_norm": 1.366898656153631, + "learning_rate": 4.409516660310655e-06, + "loss": 0.6191, + "step": 2490 + }, + { + "epoch": 0.5843988269794721, + "grad_norm": 1.0545558254205785, + "learning_rate": 4.405449573219198e-06, + "loss": 0.6498, + "step": 2491 + }, + { + "epoch": 0.584633431085044, + "grad_norm": 3.351501696016705, + "learning_rate": 4.401382885113761e-06, + "loss": 0.635, + "step": 2492 + }, + { + "epoch": 0.5848680351906158, + "grad_norm": 1.1648565999529112, + "learning_rate": 4.397316598723385e-06, + "loss": 0.5701, + "step": 2493 + }, + { + "epoch": 0.5851026392961877, + "grad_norm": 1.1896548676159913, + "learning_rate": 4.3932507167768405e-06, + "loss": 0.6399, + "step": 2494 + }, + { + "epoch": 0.5853372434017595, + "grad_norm": 1.519452926276841, + "learning_rate": 4.389185242002622e-06, + "loss": 0.6113, + "step": 2495 + }, + { + "epoch": 0.5855718475073314, + "grad_norm": 1.4964823361045552, + "learning_rate": 4.385120177128963e-06, + "loss": 0.612, + "step": 2496 + }, + { + "epoch": 0.5858064516129032, + "grad_norm": 1.0272462888998013, + "learning_rate": 4.381055524883806e-06, + "loss": 0.607, + "step": 2497 + }, + { + "epoch": 0.5860410557184751, + "grad_norm": 2.199327762237035, + "learning_rate": 4.376991287994829e-06, + "loss": 0.6191, + "step": 2498 + }, + { + "epoch": 0.586275659824047, + "grad_norm": 1.0706737172415943, + "learning_rate": 4.372927469189425e-06, + "loss": 0.6234, + "step": 2499 + }, + { + "epoch": 0.5865102639296188, + "grad_norm": 1.3477826341717938, + "learning_rate": 4.368864071194709e-06, + "loss": 0.6277, + "step": 2500 + }, + { + "epoch": 0.5865102639296188, + "eval_loss": 0.6218044757843018, + "eval_runtime": 38.7474, + "eval_samples_per_second": 14.091, + "eval_steps_per_second": 0.129, + "step": 2500 + }, + { + "epoch": 0.5867448680351907, + "grad_norm": 1.43454169755497, + "learning_rate": 4.3648010967375114e-06, + "loss": 0.675, + "step": 2501 + }, + { + "epoch": 0.5869794721407625, + "grad_norm": 1.6132885857656545, + "learning_rate": 4.3607385485443805e-06, + "loss": 0.6075, + "step": 2502 + }, + { + "epoch": 0.5872140762463343, + "grad_norm": 1.9554429793995742, + "learning_rate": 4.356676429341577e-06, + "loss": 0.6351, + "step": 2503 + }, + { + "epoch": 0.5874486803519061, + "grad_norm": 0.9729149313289474, + "learning_rate": 4.352614741855079e-06, + "loss": 0.6028, + "step": 2504 + }, + { + "epoch": 0.587683284457478, + "grad_norm": 1.1783962682160853, + "learning_rate": 4.348553488810563e-06, + "loss": 0.6051, + "step": 2505 + }, + { + "epoch": 0.5879178885630498, + "grad_norm": 0.8412586023405613, + "learning_rate": 4.344492672933427e-06, + "loss": 0.623, + "step": 2506 + }, + { + "epoch": 0.5881524926686217, + "grad_norm": 8.623322800244942, + "learning_rate": 4.340432296948769e-06, + "loss": 0.6421, + "step": 2507 + }, + { + "epoch": 0.5883870967741935, + "grad_norm": 1.6358070714325266, + "learning_rate": 4.336372363581391e-06, + "loss": 0.6101, + "step": 2508 + }, + { + "epoch": 0.5886217008797654, + "grad_norm": 0.9504336081958826, + "learning_rate": 4.3323128755557995e-06, + "loss": 0.6325, + "step": 2509 + }, + { + "epoch": 0.5888563049853373, + "grad_norm": 4.554846572918646, + "learning_rate": 4.328253835596207e-06, + "loss": 0.6629, + "step": 2510 + }, + { + "epoch": 0.5890909090909091, + "grad_norm": 1.0740787205585829, + "learning_rate": 4.324195246426515e-06, + "loss": 0.6084, + "step": 2511 + }, + { + "epoch": 0.589325513196481, + "grad_norm": 1.8572210056231553, + "learning_rate": 4.32013711077033e-06, + "loss": 0.6289, + "step": 2512 + }, + { + "epoch": 0.5895601173020528, + "grad_norm": 1.1658183387450498, + "learning_rate": 4.316079431350956e-06, + "loss": 0.5882, + "step": 2513 + }, + { + "epoch": 0.5897947214076247, + "grad_norm": 1.2400853161647338, + "learning_rate": 4.312022210891381e-06, + "loss": 0.6151, + "step": 2514 + }, + { + "epoch": 0.5900293255131965, + "grad_norm": 1.0979577190626502, + "learning_rate": 4.307965452114296e-06, + "loss": 0.5982, + "step": 2515 + }, + { + "epoch": 0.5902639296187683, + "grad_norm": 30.268023563133838, + "learning_rate": 4.303909157742077e-06, + "loss": 0.5786, + "step": 2516 + }, + { + "epoch": 0.5904985337243401, + "grad_norm": 3.0804218435920703, + "learning_rate": 4.299853330496784e-06, + "loss": 0.6121, + "step": 2517 + }, + { + "epoch": 0.590733137829912, + "grad_norm": 1.7700600677022102, + "learning_rate": 4.295797973100174e-06, + "loss": 0.5792, + "step": 2518 + }, + { + "epoch": 0.5909677419354838, + "grad_norm": 1.4165185996015008, + "learning_rate": 4.291743088273681e-06, + "loss": 0.6553, + "step": 2519 + }, + { + "epoch": 0.5912023460410557, + "grad_norm": 1.1984550821123752, + "learning_rate": 4.287688678738423e-06, + "loss": 0.6573, + "step": 2520 + }, + { + "epoch": 0.5914369501466276, + "grad_norm": 1.4132918562457883, + "learning_rate": 4.283634747215202e-06, + "loss": 0.5815, + "step": 2521 + }, + { + "epoch": 0.5916715542521994, + "grad_norm": 3.449582328145338, + "learning_rate": 4.2795812964244935e-06, + "loss": 0.661, + "step": 2522 + }, + { + "epoch": 0.5919061583577713, + "grad_norm": 1.2157965402261237, + "learning_rate": 4.275528329086457e-06, + "loss": 0.6723, + "step": 2523 + }, + { + "epoch": 0.5921407624633431, + "grad_norm": 1.1385200395220776, + "learning_rate": 4.271475847920924e-06, + "loss": 0.6041, + "step": 2524 + }, + { + "epoch": 0.592375366568915, + "grad_norm": 1.9621492288963664, + "learning_rate": 4.267423855647399e-06, + "loss": 0.6661, + "step": 2525 + }, + { + "epoch": 0.5926099706744868, + "grad_norm": 1.4433105370089916, + "learning_rate": 4.2633723549850595e-06, + "loss": 0.6671, + "step": 2526 + }, + { + "epoch": 0.5928445747800587, + "grad_norm": 4.715698963848849, + "learning_rate": 4.259321348652757e-06, + "loss": 0.6528, + "step": 2527 + }, + { + "epoch": 0.5930791788856306, + "grad_norm": 1.820896037731156, + "learning_rate": 4.2552708393690035e-06, + "loss": 0.692, + "step": 2528 + }, + { + "epoch": 0.5933137829912023, + "grad_norm": 1.3980967432063924, + "learning_rate": 4.251220829851983e-06, + "loss": 0.5905, + "step": 2529 + }, + { + "epoch": 0.5935483870967742, + "grad_norm": 1.4592183597296384, + "learning_rate": 4.247171322819544e-06, + "loss": 0.6232, + "step": 2530 + }, + { + "epoch": 0.593782991202346, + "grad_norm": 1.6652890905059732, + "learning_rate": 4.243122320989194e-06, + "loss": 0.5506, + "step": 2531 + }, + { + "epoch": 0.5940175953079179, + "grad_norm": 0.890755420015173, + "learning_rate": 4.239073827078103e-06, + "loss": 0.6309, + "step": 2532 + }, + { + "epoch": 0.5942521994134897, + "grad_norm": 2.132100196010778, + "learning_rate": 4.2350258438031075e-06, + "loss": 0.6081, + "step": 2533 + }, + { + "epoch": 0.5944868035190616, + "grad_norm": 1.3237668202449755, + "learning_rate": 4.230978373880687e-06, + "loss": 0.5812, + "step": 2534 + }, + { + "epoch": 0.5947214076246334, + "grad_norm": 1.1462967036815435, + "learning_rate": 4.226931420026989e-06, + "loss": 0.5801, + "step": 2535 + }, + { + "epoch": 0.5949560117302053, + "grad_norm": 3.0768367275227977, + "learning_rate": 4.22288498495781e-06, + "loss": 0.6285, + "step": 2536 + }, + { + "epoch": 0.5951906158357771, + "grad_norm": 0.9785169841442185, + "learning_rate": 4.218839071388597e-06, + "loss": 0.6169, + "step": 2537 + }, + { + "epoch": 0.595425219941349, + "grad_norm": 1.561503407894026, + "learning_rate": 4.2147936820344484e-06, + "loss": 0.6483, + "step": 2538 + }, + { + "epoch": 0.5956598240469209, + "grad_norm": 2.081519039084864, + "learning_rate": 4.210748819610112e-06, + "loss": 0.6184, + "step": 2539 + }, + { + "epoch": 0.5958944281524927, + "grad_norm": 1.4292431629296551, + "learning_rate": 4.206704486829978e-06, + "loss": 0.6574, + "step": 2540 + }, + { + "epoch": 0.5961290322580645, + "grad_norm": 1.744841612567776, + "learning_rate": 4.202660686408088e-06, + "loss": 0.6214, + "step": 2541 + }, + { + "epoch": 0.5963636363636363, + "grad_norm": 1.1919779137620403, + "learning_rate": 4.198617421058122e-06, + "loss": 0.6328, + "step": 2542 + }, + { + "epoch": 0.5965982404692082, + "grad_norm": 4.436466065782175, + "learning_rate": 4.194574693493398e-06, + "loss": 0.6622, + "step": 2543 + }, + { + "epoch": 0.59683284457478, + "grad_norm": 3.042941235668859, + "learning_rate": 4.19053250642688e-06, + "loss": 0.6053, + "step": 2544 + }, + { + "epoch": 0.5970674486803519, + "grad_norm": 0.767431961578569, + "learning_rate": 4.186490862571164e-06, + "loss": 0.5862, + "step": 2545 + }, + { + "epoch": 0.5973020527859237, + "grad_norm": 2.2513791519685866, + "learning_rate": 4.182449764638483e-06, + "loss": 0.5453, + "step": 2546 + }, + { + "epoch": 0.5975366568914956, + "grad_norm": 1.3737399402284083, + "learning_rate": 4.178409215340704e-06, + "loss": 0.5814, + "step": 2547 + }, + { + "epoch": 0.5977712609970675, + "grad_norm": 1.055379340574401, + "learning_rate": 4.174369217389326e-06, + "loss": 0.613, + "step": 2548 + }, + { + "epoch": 0.5980058651026393, + "grad_norm": 1.2827478647327584, + "learning_rate": 4.170329773495477e-06, + "loss": 0.6532, + "step": 2549 + }, + { + "epoch": 0.5982404692082112, + "grad_norm": 1.4526147471149213, + "learning_rate": 4.166290886369916e-06, + "loss": 0.6089, + "step": 2550 + }, + { + "epoch": 0.598475073313783, + "grad_norm": 1.2719172270318275, + "learning_rate": 4.162252558723023e-06, + "loss": 0.6736, + "step": 2551 + }, + { + "epoch": 0.5987096774193549, + "grad_norm": 1.3665798570895367, + "learning_rate": 4.158214793264808e-06, + "loss": 0.5972, + "step": 2552 + }, + { + "epoch": 0.5989442815249267, + "grad_norm": 0.9995262318034842, + "learning_rate": 4.154177592704902e-06, + "loss": 0.6147, + "step": 2553 + }, + { + "epoch": 0.5991788856304985, + "grad_norm": 1.3115689334601, + "learning_rate": 4.1501409597525535e-06, + "loss": 0.6099, + "step": 2554 + }, + { + "epoch": 0.5994134897360703, + "grad_norm": 2.5755184545949765, + "learning_rate": 4.146104897116634e-06, + "loss": 0.589, + "step": 2555 + }, + { + "epoch": 0.5996480938416422, + "grad_norm": 1.188197608601526, + "learning_rate": 4.1420694075056365e-06, + "loss": 0.6492, + "step": 2556 + }, + { + "epoch": 0.599882697947214, + "grad_norm": 1.7042100888826746, + "learning_rate": 4.138034493627655e-06, + "loss": 0.5418, + "step": 2557 + }, + { + "epoch": 0.6001173020527859, + "grad_norm": 0.8867087212357306, + "learning_rate": 4.1340001581904135e-06, + "loss": 0.626, + "step": 2558 + }, + { + "epoch": 0.6003519061583578, + "grad_norm": 2.8658080054154085, + "learning_rate": 4.1299664039012384e-06, + "loss": 0.5942, + "step": 2559 + }, + { + "epoch": 0.6005865102639296, + "grad_norm": 1.2862082960348973, + "learning_rate": 4.125933233467069e-06, + "loss": 0.6224, + "step": 2560 + }, + { + "epoch": 0.6008211143695015, + "grad_norm": 1.1332587290115137, + "learning_rate": 4.12190064959445e-06, + "loss": 0.589, + "step": 2561 + }, + { + "epoch": 0.6010557184750733, + "grad_norm": 1.536860711564503, + "learning_rate": 4.1178686549895395e-06, + "loss": 0.6411, + "step": 2562 + }, + { + "epoch": 0.6012903225806452, + "grad_norm": 4.299445647549985, + "learning_rate": 4.113837252358089e-06, + "loss": 0.6219, + "step": 2563 + }, + { + "epoch": 0.601524926686217, + "grad_norm": 1.138103697486042, + "learning_rate": 4.109806444405463e-06, + "loss": 0.6009, + "step": 2564 + }, + { + "epoch": 0.6017595307917889, + "grad_norm": 0.9390309637600313, + "learning_rate": 4.105776233836624e-06, + "loss": 0.6466, + "step": 2565 + }, + { + "epoch": 0.6019941348973608, + "grad_norm": 1.1217142386900871, + "learning_rate": 4.10174662335613e-06, + "loss": 0.6242, + "step": 2566 + }, + { + "epoch": 0.6022287390029325, + "grad_norm": 19.912152061851284, + "learning_rate": 4.097717615668139e-06, + "loss": 0.6174, + "step": 2567 + }, + { + "epoch": 0.6024633431085044, + "grad_norm": 1.6585648907036556, + "learning_rate": 4.093689213476408e-06, + "loss": 0.6203, + "step": 2568 + }, + { + "epoch": 0.6026979472140762, + "grad_norm": 1.2833892554078743, + "learning_rate": 4.089661419484279e-06, + "loss": 0.5992, + "step": 2569 + }, + { + "epoch": 0.6029325513196481, + "grad_norm": 0.7427174733322811, + "learning_rate": 4.085634236394696e-06, + "loss": 0.6039, + "step": 2570 + }, + { + "epoch": 0.6031671554252199, + "grad_norm": 1.9360940074646746, + "learning_rate": 4.0816076669101815e-06, + "loss": 0.6306, + "step": 2571 + }, + { + "epoch": 0.6034017595307918, + "grad_norm": 1.0696471538576655, + "learning_rate": 4.077581713732859e-06, + "loss": 0.6405, + "step": 2572 + }, + { + "epoch": 0.6036363636363636, + "grad_norm": 1.206946854193182, + "learning_rate": 4.073556379564429e-06, + "loss": 0.628, + "step": 2573 + }, + { + "epoch": 0.6038709677419355, + "grad_norm": 3.042576436469917, + "learning_rate": 4.069531667106181e-06, + "loss": 0.5874, + "step": 2574 + }, + { + "epoch": 0.6041055718475073, + "grad_norm": 2.3226578612064097, + "learning_rate": 4.065507579058984e-06, + "loss": 0.566, + "step": 2575 + }, + { + "epoch": 0.6043401759530792, + "grad_norm": 1.205886716802571, + "learning_rate": 4.0614841181232935e-06, + "loss": 0.6125, + "step": 2576 + }, + { + "epoch": 0.6045747800586511, + "grad_norm": 1.7443724384894912, + "learning_rate": 4.057461286999137e-06, + "loss": 0.6886, + "step": 2577 + }, + { + "epoch": 0.6048093841642229, + "grad_norm": 0.9805606507048945, + "learning_rate": 4.053439088386124e-06, + "loss": 0.6146, + "step": 2578 + }, + { + "epoch": 0.6050439882697948, + "grad_norm": 1.211593514855346, + "learning_rate": 4.04941752498344e-06, + "loss": 0.5793, + "step": 2579 + }, + { + "epoch": 0.6052785923753665, + "grad_norm": 1.4647559318014667, + "learning_rate": 4.045396599489841e-06, + "loss": 0.6786, + "step": 2580 + }, + { + "epoch": 0.6055131964809384, + "grad_norm": 1.40486952361013, + "learning_rate": 4.041376314603659e-06, + "loss": 0.5964, + "step": 2581 + }, + { + "epoch": 0.6057478005865102, + "grad_norm": 0.8290983501422904, + "learning_rate": 4.037356673022792e-06, + "loss": 0.6164, + "step": 2582 + }, + { + "epoch": 0.6059824046920821, + "grad_norm": 1.5157273495726928, + "learning_rate": 4.033337677444708e-06, + "loss": 0.6199, + "step": 2583 + }, + { + "epoch": 0.606217008797654, + "grad_norm": 2.9508326712613395, + "learning_rate": 4.02931933056644e-06, + "loss": 0.6085, + "step": 2584 + }, + { + "epoch": 0.6064516129032258, + "grad_norm": 1.0947176978306004, + "learning_rate": 4.025301635084592e-06, + "loss": 0.6444, + "step": 2585 + }, + { + "epoch": 0.6066862170087977, + "grad_norm": 2.5767801974845717, + "learning_rate": 4.021284593695321e-06, + "loss": 0.6482, + "step": 2586 + }, + { + "epoch": 0.6069208211143695, + "grad_norm": 1.3068368441858098, + "learning_rate": 4.017268209094353e-06, + "loss": 0.6913, + "step": 2587 + }, + { + "epoch": 0.6071554252199414, + "grad_norm": 2.755890994447317, + "learning_rate": 4.013252483976972e-06, + "loss": 0.6012, + "step": 2588 + }, + { + "epoch": 0.6073900293255132, + "grad_norm": 1.2940935368296023, + "learning_rate": 4.009237421038013e-06, + "loss": 0.6162, + "step": 2589 + }, + { + "epoch": 0.6076246334310851, + "grad_norm": 1.5107803665701236, + "learning_rate": 4.0052230229718755e-06, + "loss": 0.6479, + "step": 2590 + }, + { + "epoch": 0.6078592375366569, + "grad_norm": 0.9096356154205831, + "learning_rate": 4.001209292472507e-06, + "loss": 0.6229, + "step": 2591 + }, + { + "epoch": 0.6080938416422288, + "grad_norm": 1.6646603414639927, + "learning_rate": 3.997196232233409e-06, + "loss": 0.6291, + "step": 2592 + }, + { + "epoch": 0.6083284457478005, + "grad_norm": 1.2012478773424764, + "learning_rate": 3.9931838449476355e-06, + "loss": 0.6158, + "step": 2593 + }, + { + "epoch": 0.6085630498533724, + "grad_norm": 3.208605929111589, + "learning_rate": 3.989172133307782e-06, + "loss": 0.6519, + "step": 2594 + }, + { + "epoch": 0.6087976539589443, + "grad_norm": 0.9172720961577118, + "learning_rate": 3.985161100005997e-06, + "loss": 0.5433, + "step": 2595 + }, + { + "epoch": 0.6090322580645161, + "grad_norm": 2.5108512772562976, + "learning_rate": 3.981150747733975e-06, + "loss": 0.6582, + "step": 2596 + }, + { + "epoch": 0.609266862170088, + "grad_norm": 1.74396930458251, + "learning_rate": 3.977141079182945e-06, + "loss": 0.6168, + "step": 2597 + }, + { + "epoch": 0.6095014662756598, + "grad_norm": 1.3166982183851292, + "learning_rate": 3.973132097043685e-06, + "loss": 0.6899, + "step": 2598 + }, + { + "epoch": 0.6097360703812317, + "grad_norm": 1.2046832346059244, + "learning_rate": 3.9691238040065105e-06, + "loss": 0.6051, + "step": 2599 + }, + { + "epoch": 0.6099706744868035, + "grad_norm": 0.9358456079586567, + "learning_rate": 3.965116202761271e-06, + "loss": 0.625, + "step": 2600 + }, + { + "epoch": 0.6102052785923754, + "grad_norm": 1.4680245628396122, + "learning_rate": 3.961109295997359e-06, + "loss": 0.5869, + "step": 2601 + }, + { + "epoch": 0.6104398826979472, + "grad_norm": 2.2864639472629413, + "learning_rate": 3.957103086403694e-06, + "loss": 0.6568, + "step": 2602 + }, + { + "epoch": 0.6106744868035191, + "grad_norm": 1.3100229042611278, + "learning_rate": 3.95309757666873e-06, + "loss": 0.6953, + "step": 2603 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 1.589692790494812, + "learning_rate": 3.949092769480453e-06, + "loss": 0.5994, + "step": 2604 + }, + { + "epoch": 0.6111436950146627, + "grad_norm": 2.239815470993383, + "learning_rate": 3.945088667526375e-06, + "loss": 0.5607, + "step": 2605 + }, + { + "epoch": 0.6113782991202346, + "grad_norm": 1.546349756698268, + "learning_rate": 3.941085273493536e-06, + "loss": 0.6097, + "step": 2606 + }, + { + "epoch": 0.6116129032258064, + "grad_norm": 3.09397743068729, + "learning_rate": 3.9370825900685e-06, + "loss": 0.7011, + "step": 2607 + }, + { + "epoch": 0.6118475073313783, + "grad_norm": 1.585717896104532, + "learning_rate": 3.9330806199373595e-06, + "loss": 0.5706, + "step": 2608 + }, + { + "epoch": 0.6120821114369501, + "grad_norm": 1.400585147618227, + "learning_rate": 3.929079365785716e-06, + "loss": 0.6331, + "step": 2609 + }, + { + "epoch": 0.612316715542522, + "grad_norm": 1.2322264495654516, + "learning_rate": 3.925078830298704e-06, + "loss": 0.6274, + "step": 2610 + }, + { + "epoch": 0.6125513196480938, + "grad_norm": 1.4124526796440844, + "learning_rate": 3.92107901616097e-06, + "loss": 0.6103, + "step": 2611 + }, + { + "epoch": 0.6127859237536657, + "grad_norm": 1.5300209305905332, + "learning_rate": 3.917079926056674e-06, + "loss": 0.6342, + "step": 2612 + }, + { + "epoch": 0.6130205278592376, + "grad_norm": 1.2017607539252948, + "learning_rate": 3.913081562669492e-06, + "loss": 0.584, + "step": 2613 + }, + { + "epoch": 0.6132551319648094, + "grad_norm": 1.171386657241352, + "learning_rate": 3.9090839286826185e-06, + "loss": 0.611, + "step": 2614 + }, + { + "epoch": 0.6134897360703813, + "grad_norm": 1.175493331648172, + "learning_rate": 3.9050870267787446e-06, + "loss": 0.6258, + "step": 2615 + }, + { + "epoch": 0.6137243401759531, + "grad_norm": 0.8381430034876453, + "learning_rate": 3.9010908596400865e-06, + "loss": 0.6404, + "step": 2616 + }, + { + "epoch": 0.613958944281525, + "grad_norm": 1.3056645293670937, + "learning_rate": 3.897095429948352e-06, + "loss": 0.6062, + "step": 2617 + }, + { + "epoch": 0.6141935483870967, + "grad_norm": 0.9232357664815262, + "learning_rate": 3.893100740384766e-06, + "loss": 0.5713, + "step": 2618 + }, + { + "epoch": 0.6144281524926686, + "grad_norm": 1.9520853215963634, + "learning_rate": 3.889106793630052e-06, + "loss": 0.6338, + "step": 2619 + }, + { + "epoch": 0.6146627565982404, + "grad_norm": 1.4988606630842436, + "learning_rate": 3.885113592364432e-06, + "loss": 0.6179, + "step": 2620 + }, + { + "epoch": 0.6148973607038123, + "grad_norm": 1.465478839792279, + "learning_rate": 3.881121139267632e-06, + "loss": 0.5571, + "step": 2621 + }, + { + "epoch": 0.6151319648093841, + "grad_norm": 1.37075089168491, + "learning_rate": 3.8771294370188775e-06, + "loss": 0.6586, + "step": 2622 + }, + { + "epoch": 0.615366568914956, + "grad_norm": 1.1861832507325312, + "learning_rate": 3.873138488296883e-06, + "loss": 0.5717, + "step": 2623 + }, + { + "epoch": 0.6156011730205279, + "grad_norm": 1.0975220488700843, + "learning_rate": 3.8691482957798636e-06, + "loss": 0.6611, + "step": 2624 + }, + { + "epoch": 0.6158357771260997, + "grad_norm": 0.9290249703287269, + "learning_rate": 3.865158862145525e-06, + "loss": 0.5995, + "step": 2625 + }, + { + "epoch": 0.6160703812316716, + "grad_norm": 0.9140054772375495, + "learning_rate": 3.861170190071064e-06, + "loss": 0.6243, + "step": 2626 + }, + { + "epoch": 0.6163049853372434, + "grad_norm": 1.2375783218924745, + "learning_rate": 3.8571822822331644e-06, + "loss": 0.6148, + "step": 2627 + }, + { + "epoch": 0.6165395894428153, + "grad_norm": 5.314375888137883, + "learning_rate": 3.853195141308001e-06, + "loss": 0.6104, + "step": 2628 + }, + { + "epoch": 0.6167741935483871, + "grad_norm": 2.2260423031861105, + "learning_rate": 3.8492087699712294e-06, + "loss": 0.6345, + "step": 2629 + }, + { + "epoch": 0.617008797653959, + "grad_norm": 1.2388179244531388, + "learning_rate": 3.8452231708979905e-06, + "loss": 0.6356, + "step": 2630 + }, + { + "epoch": 0.6172434017595307, + "grad_norm": 1.474997299256861, + "learning_rate": 3.8412383467629104e-06, + "loss": 0.6138, + "step": 2631 + }, + { + "epoch": 0.6174780058651026, + "grad_norm": 1.4157998917734198, + "learning_rate": 3.83725430024009e-06, + "loss": 0.5826, + "step": 2632 + }, + { + "epoch": 0.6177126099706745, + "grad_norm": 1.086677811735564, + "learning_rate": 3.833271034003111e-06, + "loss": 0.6335, + "step": 2633 + }, + { + "epoch": 0.6179472140762463, + "grad_norm": 0.9606402467854953, + "learning_rate": 3.8292885507250315e-06, + "loss": 0.6142, + "step": 2634 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 1.029850892655021, + "learning_rate": 3.8253068530783825e-06, + "loss": 0.6191, + "step": 2635 + }, + { + "epoch": 0.61841642228739, + "grad_norm": 2.030545771516275, + "learning_rate": 3.8213259437351695e-06, + "loss": 0.5957, + "step": 2636 + }, + { + "epoch": 0.6186510263929619, + "grad_norm": 0.8472838413106262, + "learning_rate": 3.817345825366871e-06, + "loss": 0.5828, + "step": 2637 + }, + { + "epoch": 0.6188856304985337, + "grad_norm": 4.825714667726455, + "learning_rate": 3.813366500644426e-06, + "loss": 0.5783, + "step": 2638 + }, + { + "epoch": 0.6191202346041056, + "grad_norm": 3.2736180582520293, + "learning_rate": 3.809387972238255e-06, + "loss": 0.6472, + "step": 2639 + }, + { + "epoch": 0.6193548387096774, + "grad_norm": 2.0847739365669073, + "learning_rate": 3.8054102428182284e-06, + "loss": 0.5926, + "step": 2640 + }, + { + "epoch": 0.6195894428152493, + "grad_norm": 2.13526093103931, + "learning_rate": 3.801433315053693e-06, + "loss": 0.6315, + "step": 2641 + }, + { + "epoch": 0.6198240469208212, + "grad_norm": 1.25259446518174, + "learning_rate": 3.7974571916134533e-06, + "loss": 0.6209, + "step": 2642 + }, + { + "epoch": 0.620058651026393, + "grad_norm": 1.5747100024152387, + "learning_rate": 3.793481875165771e-06, + "loss": 0.6398, + "step": 2643 + }, + { + "epoch": 0.6202932551319648, + "grad_norm": 0.9601714494423652, + "learning_rate": 3.7895073683783683e-06, + "loss": 0.6226, + "step": 2644 + }, + { + "epoch": 0.6205278592375366, + "grad_norm": 1.506069025950793, + "learning_rate": 3.7855336739184303e-06, + "loss": 0.611, + "step": 2645 + }, + { + "epoch": 0.6207624633431085, + "grad_norm": 1.1899295416433049, + "learning_rate": 3.7815607944525826e-06, + "loss": 0.5866, + "step": 2646 + }, + { + "epoch": 0.6209970674486803, + "grad_norm": 1.8749969036584033, + "learning_rate": 3.777588732646919e-06, + "loss": 0.596, + "step": 2647 + }, + { + "epoch": 0.6212316715542522, + "grad_norm": 1.792002552963097, + "learning_rate": 3.7736174911669775e-06, + "loss": 0.5952, + "step": 2648 + }, + { + "epoch": 0.621466275659824, + "grad_norm": 1.0404897387789034, + "learning_rate": 3.7696470726777436e-06, + "loss": 0.6029, + "step": 2649 + }, + { + "epoch": 0.6217008797653959, + "grad_norm": 1.7960891351545873, + "learning_rate": 3.7656774798436545e-06, + "loss": 0.6562, + "step": 2650 + }, + { + "epoch": 0.6219354838709678, + "grad_norm": 1.6596318518513833, + "learning_rate": 3.761708715328593e-06, + "loss": 0.611, + "step": 2651 + }, + { + "epoch": 0.6221700879765396, + "grad_norm": 0.9896201383882054, + "learning_rate": 3.7577407817958815e-06, + "loss": 0.6461, + "step": 2652 + }, + { + "epoch": 0.6224046920821115, + "grad_norm": 1.739289988297414, + "learning_rate": 3.7537736819082926e-06, + "loss": 0.6074, + "step": 2653 + }, + { + "epoch": 0.6226392961876833, + "grad_norm": 1.1191647740135733, + "learning_rate": 3.7498074183280344e-06, + "loss": 0.6547, + "step": 2654 + }, + { + "epoch": 0.6228739002932552, + "grad_norm": 1.874952600529361, + "learning_rate": 3.7458419937167516e-06, + "loss": 0.6325, + "step": 2655 + }, + { + "epoch": 0.623108504398827, + "grad_norm": 2.979617394432184, + "learning_rate": 3.7418774107355318e-06, + "loss": 0.6199, + "step": 2656 + }, + { + "epoch": 0.6233431085043988, + "grad_norm": 1.1896527781829702, + "learning_rate": 3.737913672044894e-06, + "loss": 0.6108, + "step": 2657 + }, + { + "epoch": 0.6235777126099706, + "grad_norm": 0.8829346383219321, + "learning_rate": 3.73395078030479e-06, + "loss": 0.5705, + "step": 2658 + }, + { + "epoch": 0.6238123167155425, + "grad_norm": 0.7062082275621552, + "learning_rate": 3.7299887381746047e-06, + "loss": 0.5735, + "step": 2659 + }, + { + "epoch": 0.6240469208211143, + "grad_norm": 1.221847445565343, + "learning_rate": 3.726027548313157e-06, + "loss": 0.597, + "step": 2660 + }, + { + "epoch": 0.6242815249266862, + "grad_norm": 2.0902160378726014, + "learning_rate": 3.722067213378683e-06, + "loss": 0.6836, + "step": 2661 + }, + { + "epoch": 0.6245161290322581, + "grad_norm": 1.377813196198677, + "learning_rate": 3.718107736028858e-06, + "loss": 0.6921, + "step": 2662 + }, + { + "epoch": 0.6247507331378299, + "grad_norm": 2.4337369135631692, + "learning_rate": 3.7141491189207716e-06, + "loss": 0.6248, + "step": 2663 + }, + { + "epoch": 0.6249853372434018, + "grad_norm": 1.2767180243430922, + "learning_rate": 3.7101913647109415e-06, + "loss": 0.6024, + "step": 2664 + }, + { + "epoch": 0.6252199413489736, + "grad_norm": 1.82351491486182, + "learning_rate": 3.706234476055306e-06, + "loss": 0.6211, + "step": 2665 + }, + { + "epoch": 0.6254545454545455, + "grad_norm": 1.0719908652535732, + "learning_rate": 3.702278455609219e-06, + "loss": 0.6546, + "step": 2666 + }, + { + "epoch": 0.6256891495601173, + "grad_norm": 1.243280748149877, + "learning_rate": 3.6983233060274543e-06, + "loss": 0.6106, + "step": 2667 + }, + { + "epoch": 0.6259237536656892, + "grad_norm": 2.1666429604577084, + "learning_rate": 3.6943690299642055e-06, + "loss": 0.6072, + "step": 2668 + }, + { + "epoch": 0.6261583577712609, + "grad_norm": 0.917414355692659, + "learning_rate": 3.6904156300730697e-06, + "loss": 0.6379, + "step": 2669 + }, + { + "epoch": 0.6263929618768328, + "grad_norm": 1.0762156591178333, + "learning_rate": 3.6864631090070656e-06, + "loss": 0.6422, + "step": 2670 + }, + { + "epoch": 0.6266275659824047, + "grad_norm": 0.9453096475901798, + "learning_rate": 3.6825114694186192e-06, + "loss": 0.5849, + "step": 2671 + }, + { + "epoch": 0.6268621700879765, + "grad_norm": 1.5752552955005232, + "learning_rate": 3.6785607139595625e-06, + "loss": 0.6535, + "step": 2672 + }, + { + "epoch": 0.6270967741935484, + "grad_norm": 1.6758441060756903, + "learning_rate": 3.6746108452811347e-06, + "loss": 0.6612, + "step": 2673 + }, + { + "epoch": 0.6273313782991202, + "grad_norm": 1.7082489282247946, + "learning_rate": 3.670661866033988e-06, + "loss": 0.5804, + "step": 2674 + }, + { + "epoch": 0.6275659824046921, + "grad_norm": 0.9346939807372437, + "learning_rate": 3.666713778868161e-06, + "loss": 0.6119, + "step": 2675 + }, + { + "epoch": 0.6278005865102639, + "grad_norm": 1.3277551451666765, + "learning_rate": 3.662766586433111e-06, + "loss": 0.6514, + "step": 2676 + }, + { + "epoch": 0.6280351906158358, + "grad_norm": 2.6067439559867163, + "learning_rate": 3.6588202913776865e-06, + "loss": 0.6681, + "step": 2677 + }, + { + "epoch": 0.6282697947214076, + "grad_norm": 3.3067083631174516, + "learning_rate": 3.6548748963501324e-06, + "loss": 0.5825, + "step": 2678 + }, + { + "epoch": 0.6285043988269795, + "grad_norm": 2.4946271815287755, + "learning_rate": 3.650930403998093e-06, + "loss": 0.6368, + "step": 2679 + }, + { + "epoch": 0.6287390029325514, + "grad_norm": 3.530727381232086, + "learning_rate": 3.646986816968608e-06, + "loss": 0.6473, + "step": 2680 + }, + { + "epoch": 0.6289736070381232, + "grad_norm": 1.2456120856690516, + "learning_rate": 3.643044137908104e-06, + "loss": 0.5802, + "step": 2681 + }, + { + "epoch": 0.629208211143695, + "grad_norm": 1.5535352093408097, + "learning_rate": 3.6391023694624017e-06, + "loss": 0.638, + "step": 2682 + }, + { + "epoch": 0.6294428152492668, + "grad_norm": 3.621896711457749, + "learning_rate": 3.635161514276715e-06, + "loss": 0.5986, + "step": 2683 + }, + { + "epoch": 0.6296774193548387, + "grad_norm": 1.2081206262423436, + "learning_rate": 3.631221574995637e-06, + "loss": 0.6501, + "step": 2684 + }, + { + "epoch": 0.6299120234604105, + "grad_norm": 1.6672839582985357, + "learning_rate": 3.6272825542631507e-06, + "loss": 0.5878, + "step": 2685 + }, + { + "epoch": 0.6301466275659824, + "grad_norm": 10.538271916312206, + "learning_rate": 3.6233444547226214e-06, + "loss": 0.663, + "step": 2686 + }, + { + "epoch": 0.6303812316715542, + "grad_norm": 1.0090407633007583, + "learning_rate": 3.6194072790167978e-06, + "loss": 0.5989, + "step": 2687 + }, + { + "epoch": 0.6306158357771261, + "grad_norm": 1.3637371746508158, + "learning_rate": 3.615471029787807e-06, + "loss": 0.6095, + "step": 2688 + }, + { + "epoch": 0.630850439882698, + "grad_norm": 0.8850365791866931, + "learning_rate": 3.6115357096771536e-06, + "loss": 0.5444, + "step": 2689 + }, + { + "epoch": 0.6310850439882698, + "grad_norm": 1.0810936007043943, + "learning_rate": 3.6076013213257198e-06, + "loss": 0.5888, + "step": 2690 + }, + { + "epoch": 0.6313196480938417, + "grad_norm": 1.2337870138836586, + "learning_rate": 3.6036678673737668e-06, + "loss": 0.5531, + "step": 2691 + }, + { + "epoch": 0.6315542521994135, + "grad_norm": 1.0824233995302337, + "learning_rate": 3.599735350460919e-06, + "loss": 0.6177, + "step": 2692 + }, + { + "epoch": 0.6317888563049854, + "grad_norm": 1.223039780177105, + "learning_rate": 3.5958037732261806e-06, + "loss": 0.5734, + "step": 2693 + }, + { + "epoch": 0.6320234604105572, + "grad_norm": 1.2296995133189876, + "learning_rate": 3.5918731383079224e-06, + "loss": 0.5612, + "step": 2694 + }, + { + "epoch": 0.632258064516129, + "grad_norm": 1.06663436020491, + "learning_rate": 3.5879434483438814e-06, + "loss": 0.6859, + "step": 2695 + }, + { + "epoch": 0.6324926686217008, + "grad_norm": 1.1289573518711966, + "learning_rate": 3.5840147059711603e-06, + "loss": 0.6033, + "step": 2696 + }, + { + "epoch": 0.6327272727272727, + "grad_norm": 2.1740518436103193, + "learning_rate": 3.5800869138262333e-06, + "loss": 0.5861, + "step": 2697 + }, + { + "epoch": 0.6329618768328445, + "grad_norm": 1.127612472110948, + "learning_rate": 3.5761600745449234e-06, + "loss": 0.6525, + "step": 2698 + }, + { + "epoch": 0.6331964809384164, + "grad_norm": 1.1748248943049848, + "learning_rate": 3.572234190762426e-06, + "loss": 0.6545, + "step": 2699 + }, + { + "epoch": 0.6334310850439883, + "grad_norm": 1.1353027900528645, + "learning_rate": 3.568309265113291e-06, + "loss": 0.6244, + "step": 2700 + }, + { + "epoch": 0.6336656891495601, + "grad_norm": 1.7665864864207614, + "learning_rate": 3.564385300231423e-06, + "loss": 0.6311, + "step": 2701 + }, + { + "epoch": 0.633900293255132, + "grad_norm": 1.3127731536976572, + "learning_rate": 3.5604622987500857e-06, + "loss": 0.6428, + "step": 2702 + }, + { + "epoch": 0.6341348973607038, + "grad_norm": 0.993273922561666, + "learning_rate": 3.5565402633018963e-06, + "loss": 0.6029, + "step": 2703 + }, + { + "epoch": 0.6343695014662757, + "grad_norm": 0.9025340222281976, + "learning_rate": 3.5526191965188173e-06, + "loss": 0.6261, + "step": 2704 + }, + { + "epoch": 0.6346041055718475, + "grad_norm": 1.4845181882160094, + "learning_rate": 3.5486991010321703e-06, + "loss": 0.644, + "step": 2705 + }, + { + "epoch": 0.6348387096774194, + "grad_norm": 1.4027983004146045, + "learning_rate": 3.5447799794726213e-06, + "loss": 0.6432, + "step": 2706 + }, + { + "epoch": 0.6350733137829913, + "grad_norm": 1.2481329067569296, + "learning_rate": 3.5408618344701783e-06, + "loss": 0.6242, + "step": 2707 + }, + { + "epoch": 0.635307917888563, + "grad_norm": 0.9977286075070906, + "learning_rate": 3.536944668654202e-06, + "loss": 0.6417, + "step": 2708 + }, + { + "epoch": 0.6355425219941349, + "grad_norm": 1.4098047628922843, + "learning_rate": 3.5330284846533892e-06, + "loss": 0.6009, + "step": 2709 + }, + { + "epoch": 0.6357771260997067, + "grad_norm": 3.4440858861484824, + "learning_rate": 3.529113285095781e-06, + "loss": 0.6783, + "step": 2710 + }, + { + "epoch": 0.6360117302052786, + "grad_norm": 1.1546959945770436, + "learning_rate": 3.525199072608758e-06, + "loss": 0.5936, + "step": 2711 + }, + { + "epoch": 0.6362463343108504, + "grad_norm": 1.9316549570873787, + "learning_rate": 3.521285849819037e-06, + "loss": 0.6582, + "step": 2712 + }, + { + "epoch": 0.6364809384164223, + "grad_norm": 1.4124539147742596, + "learning_rate": 3.517373619352672e-06, + "loss": 0.6443, + "step": 2713 + }, + { + "epoch": 0.6367155425219941, + "grad_norm": 1.5754658740131884, + "learning_rate": 3.513462383835051e-06, + "loss": 0.6192, + "step": 2714 + }, + { + "epoch": 0.636950146627566, + "grad_norm": 4.357016819681828, + "learning_rate": 3.5095521458908933e-06, + "loss": 0.6045, + "step": 2715 + }, + { + "epoch": 0.6371847507331379, + "grad_norm": 1.332128586662064, + "learning_rate": 3.505642908144248e-06, + "loss": 0.5887, + "step": 2716 + }, + { + "epoch": 0.6374193548387097, + "grad_norm": 1.1025566333002628, + "learning_rate": 3.5017346732184974e-06, + "loss": 0.6358, + "step": 2717 + }, + { + "epoch": 0.6376539589442816, + "grad_norm": 1.025648018635841, + "learning_rate": 3.4978274437363447e-06, + "loss": 0.5679, + "step": 2718 + }, + { + "epoch": 0.6378885630498534, + "grad_norm": 7.6139405115353656, + "learning_rate": 3.4939212223198217e-06, + "loss": 0.6501, + "step": 2719 + }, + { + "epoch": 0.6381231671554253, + "grad_norm": 1.7807089617714034, + "learning_rate": 3.490016011590287e-06, + "loss": 0.5845, + "step": 2720 + }, + { + "epoch": 0.638357771260997, + "grad_norm": 0.6796377890963405, + "learning_rate": 3.486111814168412e-06, + "loss": 0.6334, + "step": 2721 + }, + { + "epoch": 0.6385923753665689, + "grad_norm": 2.3100830732035416, + "learning_rate": 3.4822086326741966e-06, + "loss": 0.6216, + "step": 2722 + }, + { + "epoch": 0.6388269794721407, + "grad_norm": 1.066199556390954, + "learning_rate": 3.478306469726957e-06, + "loss": 0.6064, + "step": 2723 + }, + { + "epoch": 0.6390615835777126, + "grad_norm": 2.4176759794341476, + "learning_rate": 3.4744053279453215e-06, + "loss": 0.624, + "step": 2724 + }, + { + "epoch": 0.6392961876832844, + "grad_norm": 1.4245791185101886, + "learning_rate": 3.470505209947238e-06, + "loss": 0.6554, + "step": 2725 + }, + { + "epoch": 0.6395307917888563, + "grad_norm": 2.2478579927150144, + "learning_rate": 3.4666061183499664e-06, + "loss": 0.6072, + "step": 2726 + }, + { + "epoch": 0.6397653958944282, + "grad_norm": 42.51034965314345, + "learning_rate": 3.4627080557700745e-06, + "loss": 0.5753, + "step": 2727 + }, + { + "epoch": 0.64, + "grad_norm": 1.0267583118908696, + "learning_rate": 3.458811024823444e-06, + "loss": 0.6259, + "step": 2728 + }, + { + "epoch": 0.6402346041055719, + "grad_norm": 2.0496655540758906, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.6206, + "step": 2729 + }, + { + "epoch": 0.6404692082111437, + "grad_norm": 4.606048010629496, + "learning_rate": 3.451020068290024e-06, + "loss": 0.6405, + "step": 2730 + }, + { + "epoch": 0.6407038123167156, + "grad_norm": 7.1661831612698474, + "learning_rate": 3.4471261479315254e-06, + "loss": 0.5937, + "step": 2731 + }, + { + "epoch": 0.6409384164222874, + "grad_norm": 1.364196386868124, + "learning_rate": 3.4432332696628666e-06, + "loss": 0.6627, + "step": 2732 + }, + { + "epoch": 0.6411730205278592, + "grad_norm": 1.3018487040503812, + "learning_rate": 3.439341436096449e-06, + "loss": 0.5911, + "step": 2733 + }, + { + "epoch": 0.641407624633431, + "grad_norm": 2.5308607910073873, + "learning_rate": 3.435450649843972e-06, + "loss": 0.6302, + "step": 2734 + }, + { + "epoch": 0.6416422287390029, + "grad_norm": 0.9991064818881716, + "learning_rate": 3.4315609135164308e-06, + "loss": 0.6248, + "step": 2735 + }, + { + "epoch": 0.6418768328445748, + "grad_norm": 0.9548112508106517, + "learning_rate": 3.4276722297241206e-06, + "loss": 0.6133, + "step": 2736 + }, + { + "epoch": 0.6421114369501466, + "grad_norm": 0.9423465003312195, + "learning_rate": 3.423784601076627e-06, + "loss": 0.6046, + "step": 2737 + }, + { + "epoch": 0.6423460410557185, + "grad_norm": 1.7320553071311917, + "learning_rate": 3.4198980301828256e-06, + "loss": 0.6002, + "step": 2738 + }, + { + "epoch": 0.6425806451612903, + "grad_norm": 1.4341561810044723, + "learning_rate": 3.4160125196508866e-06, + "loss": 0.643, + "step": 2739 + }, + { + "epoch": 0.6428152492668622, + "grad_norm": 1.04030864388395, + "learning_rate": 3.4121280720882664e-06, + "loss": 0.6217, + "step": 2740 + }, + { + "epoch": 0.643049853372434, + "grad_norm": 1.1039159426036667, + "learning_rate": 3.4082446901017074e-06, + "loss": 0.64, + "step": 2741 + }, + { + "epoch": 0.6432844574780059, + "grad_norm": 1.6120862915442005, + "learning_rate": 3.4043623762972373e-06, + "loss": 0.6533, + "step": 2742 + }, + { + "epoch": 0.6435190615835777, + "grad_norm": 1.2605836948366589, + "learning_rate": 3.400481133280171e-06, + "loss": 0.6339, + "step": 2743 + }, + { + "epoch": 0.6437536656891496, + "grad_norm": 1.2836432076726034, + "learning_rate": 3.3966009636550944e-06, + "loss": 0.6091, + "step": 2744 + }, + { + "epoch": 0.6439882697947215, + "grad_norm": 2.0565431344091625, + "learning_rate": 3.392721870025886e-06, + "loss": 0.6201, + "step": 2745 + }, + { + "epoch": 0.6442228739002932, + "grad_norm": 1.0085068623908457, + "learning_rate": 3.388843854995694e-06, + "loss": 0.58, + "step": 2746 + }, + { + "epoch": 0.6444574780058651, + "grad_norm": 1.1703206393983352, + "learning_rate": 3.3849669211669446e-06, + "loss": 0.5855, + "step": 2747 + }, + { + "epoch": 0.6446920821114369, + "grad_norm": 1.5128832145377682, + "learning_rate": 3.3810910711413376e-06, + "loss": 0.6566, + "step": 2748 + }, + { + "epoch": 0.6449266862170088, + "grad_norm": 1.0285692439770233, + "learning_rate": 3.3772163075198512e-06, + "loss": 0.5741, + "step": 2749 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 1.8756765771691108, + "learning_rate": 3.373342632902723e-06, + "loss": 0.6326, + "step": 2750 + }, + { + "epoch": 0.6453958944281525, + "grad_norm": 8.108637468473399, + "learning_rate": 3.369470049889473e-06, + "loss": 0.5747, + "step": 2751 + }, + { + "epoch": 0.6456304985337243, + "grad_norm": 1.5276067869363399, + "learning_rate": 3.36559856107888e-06, + "loss": 0.6272, + "step": 2752 + }, + { + "epoch": 0.6458651026392962, + "grad_norm": 2.293138628477277, + "learning_rate": 3.3617281690689895e-06, + "loss": 0.6686, + "step": 2753 + }, + { + "epoch": 0.646099706744868, + "grad_norm": 0.8556954037490899, + "learning_rate": 3.3578588764571144e-06, + "loss": 0.6563, + "step": 2754 + }, + { + "epoch": 0.6463343108504399, + "grad_norm": 1.614328853113237, + "learning_rate": 3.3539906858398285e-06, + "loss": 0.6094, + "step": 2755 + }, + { + "epoch": 0.6465689149560118, + "grad_norm": 1.047880166499049, + "learning_rate": 3.3501235998129624e-06, + "loss": 0.5949, + "step": 2756 + }, + { + "epoch": 0.6468035190615836, + "grad_norm": 1.226481161791258, + "learning_rate": 3.3462576209716125e-06, + "loss": 0.6267, + "step": 2757 + }, + { + "epoch": 0.6470381231671555, + "grad_norm": 1.3843091369275025, + "learning_rate": 3.342392751910123e-06, + "loss": 0.6006, + "step": 2758 + }, + { + "epoch": 0.6472727272727272, + "grad_norm": 1.2495802125022213, + "learning_rate": 3.3385289952221023e-06, + "loss": 0.6683, + "step": 2759 + }, + { + "epoch": 0.6475073313782991, + "grad_norm": 0.9728160026784236, + "learning_rate": 3.3346663535004087e-06, + "loss": 0.579, + "step": 2760 + }, + { + "epoch": 0.6477419354838709, + "grad_norm": 1.0065823588235314, + "learning_rate": 3.3308048293371504e-06, + "loss": 0.6237, + "step": 2761 + }, + { + "epoch": 0.6479765395894428, + "grad_norm": 1.7313388752219214, + "learning_rate": 3.3269444253236877e-06, + "loss": 0.6416, + "step": 2762 + }, + { + "epoch": 0.6482111436950146, + "grad_norm": 2.9632307552546715, + "learning_rate": 3.32308514405063e-06, + "loss": 0.6369, + "step": 2763 + }, + { + "epoch": 0.6484457478005865, + "grad_norm": 1.5046527093512325, + "learning_rate": 3.3192269881078302e-06, + "loss": 0.644, + "step": 2764 + }, + { + "epoch": 0.6486803519061584, + "grad_norm": 1.7642242622972442, + "learning_rate": 3.315369960084388e-06, + "loss": 0.5786, + "step": 2765 + }, + { + "epoch": 0.6489149560117302, + "grad_norm": 1.3994481817732896, + "learning_rate": 3.311514062568649e-06, + "loss": 0.6897, + "step": 2766 + }, + { + "epoch": 0.6491495601173021, + "grad_norm": 1.017016404276696, + "learning_rate": 3.307659298148195e-06, + "loss": 0.6394, + "step": 2767 + }, + { + "epoch": 0.6493841642228739, + "grad_norm": 1.0081237138282781, + "learning_rate": 3.3038056694098485e-06, + "loss": 0.6467, + "step": 2768 + }, + { + "epoch": 0.6496187683284458, + "grad_norm": 6.285559513489754, + "learning_rate": 3.2999531789396746e-06, + "loss": 0.6673, + "step": 2769 + }, + { + "epoch": 0.6498533724340176, + "grad_norm": 1.9095115053630256, + "learning_rate": 3.2961018293229676e-06, + "loss": 0.6369, + "step": 2770 + }, + { + "epoch": 0.6500879765395895, + "grad_norm": 1.3376844881925287, + "learning_rate": 3.2922516231442605e-06, + "loss": 0.628, + "step": 2771 + }, + { + "epoch": 0.6503225806451612, + "grad_norm": 1.5111841203473888, + "learning_rate": 3.2884025629873206e-06, + "loss": 0.5885, + "step": 2772 + }, + { + "epoch": 0.6505571847507331, + "grad_norm": 1.7208948910426636, + "learning_rate": 3.2845546514351384e-06, + "loss": 0.6226, + "step": 2773 + }, + { + "epoch": 0.650791788856305, + "grad_norm": 16.711657319988067, + "learning_rate": 3.280707891069943e-06, + "loss": 0.573, + "step": 2774 + }, + { + "epoch": 0.6510263929618768, + "grad_norm": 0.895072769673571, + "learning_rate": 3.276862284473187e-06, + "loss": 0.6127, + "step": 2775 + }, + { + "epoch": 0.6512609970674487, + "grad_norm": 1.3120849452846568, + "learning_rate": 3.2730178342255458e-06, + "loss": 0.594, + "step": 2776 + }, + { + "epoch": 0.6514956011730205, + "grad_norm": 1.4245611380007643, + "learning_rate": 3.2691745429069234e-06, + "loss": 0.6032, + "step": 2777 + }, + { + "epoch": 0.6517302052785924, + "grad_norm": 0.9659806790106332, + "learning_rate": 3.265332413096444e-06, + "loss": 0.513, + "step": 2778 + }, + { + "epoch": 0.6519648093841642, + "grad_norm": 2.3951227318079167, + "learning_rate": 3.2614914473724506e-06, + "loss": 0.6545, + "step": 2779 + }, + { + "epoch": 0.6521994134897361, + "grad_norm": 0.9086189715822532, + "learning_rate": 3.257651648312512e-06, + "loss": 0.6551, + "step": 2780 + }, + { + "epoch": 0.652434017595308, + "grad_norm": 1.7055376227017278, + "learning_rate": 3.253813018493402e-06, + "loss": 0.6354, + "step": 2781 + }, + { + "epoch": 0.6526686217008798, + "grad_norm": 1.9206882119330737, + "learning_rate": 3.249975560491121e-06, + "loss": 0.6246, + "step": 2782 + }, + { + "epoch": 0.6529032258064517, + "grad_norm": 1.1316882930673104, + "learning_rate": 3.24613927688088e-06, + "loss": 0.6016, + "step": 2783 + }, + { + "epoch": 0.6531378299120235, + "grad_norm": 1.11577630879089, + "learning_rate": 3.2423041702370976e-06, + "loss": 0.5841, + "step": 2784 + }, + { + "epoch": 0.6533724340175953, + "grad_norm": 1.0826065702956267, + "learning_rate": 3.238470243133407e-06, + "loss": 0.6303, + "step": 2785 + }, + { + "epoch": 0.6536070381231671, + "grad_norm": 3.4741203170593296, + "learning_rate": 3.23463749814265e-06, + "loss": 0.6435, + "step": 2786 + }, + { + "epoch": 0.653841642228739, + "grad_norm": 4.310514045761198, + "learning_rate": 3.2308059378368693e-06, + "loss": 0.6219, + "step": 2787 + }, + { + "epoch": 0.6540762463343108, + "grad_norm": 1.2706772062017948, + "learning_rate": 3.226975564787322e-06, + "loss": 0.6163, + "step": 2788 + }, + { + "epoch": 0.6543108504398827, + "grad_norm": 3.0178359748669195, + "learning_rate": 3.2231463815644616e-06, + "loss": 0.6427, + "step": 2789 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 2.2186249513083265, + "learning_rate": 3.219318390737944e-06, + "loss": 0.576, + "step": 2790 + }, + { + "epoch": 0.6547800586510264, + "grad_norm": 4.714384982564151, + "learning_rate": 3.2154915948766263e-06, + "loss": 0.624, + "step": 2791 + }, + { + "epoch": 0.6550146627565983, + "grad_norm": 0.940786148661298, + "learning_rate": 3.211665996548564e-06, + "loss": 0.6327, + "step": 2792 + }, + { + "epoch": 0.6552492668621701, + "grad_norm": 3.598906368613139, + "learning_rate": 3.207841598321007e-06, + "loss": 0.6062, + "step": 2793 + }, + { + "epoch": 0.655483870967742, + "grad_norm": 2.230046756241626, + "learning_rate": 3.2040184027604003e-06, + "loss": 0.6567, + "step": 2794 + }, + { + "epoch": 0.6557184750733138, + "grad_norm": 3.3323044330215237, + "learning_rate": 3.2001964124323883e-06, + "loss": 0.638, + "step": 2795 + }, + { + "epoch": 0.6559530791788857, + "grad_norm": 1.4963999977115132, + "learning_rate": 3.1963756299017924e-06, + "loss": 0.5887, + "step": 2796 + }, + { + "epoch": 0.6561876832844574, + "grad_norm": 1.1389768169399515, + "learning_rate": 3.192556057732638e-06, + "loss": 0.6353, + "step": 2797 + }, + { + "epoch": 0.6564222873900293, + "grad_norm": 0.9912206816074209, + "learning_rate": 3.1887376984881313e-06, + "loss": 0.6657, + "step": 2798 + }, + { + "epoch": 0.6566568914956011, + "grad_norm": 1.3764734320177685, + "learning_rate": 3.1849205547306638e-06, + "loss": 0.6573, + "step": 2799 + }, + { + "epoch": 0.656891495601173, + "grad_norm": 1.7392991946176513, + "learning_rate": 3.1811046290218125e-06, + "loss": 0.6301, + "step": 2800 + }, + { + "epoch": 0.6571260997067448, + "grad_norm": 1.7459654559653102, + "learning_rate": 3.177289923922342e-06, + "loss": 0.5983, + "step": 2801 + }, + { + "epoch": 0.6573607038123167, + "grad_norm": 1.503585318798611, + "learning_rate": 3.173476441992187e-06, + "loss": 0.5794, + "step": 2802 + }, + { + "epoch": 0.6575953079178886, + "grad_norm": 1.7370600512456607, + "learning_rate": 3.1696641857904743e-06, + "loss": 0.6309, + "step": 2803 + }, + { + "epoch": 0.6578299120234604, + "grad_norm": 4.265726896789798, + "learning_rate": 3.165853157875495e-06, + "loss": 0.6069, + "step": 2804 + }, + { + "epoch": 0.6580645161290323, + "grad_norm": 1.220764530339127, + "learning_rate": 3.162043360804726e-06, + "loss": 0.5676, + "step": 2805 + }, + { + "epoch": 0.6582991202346041, + "grad_norm": 0.9774335610640675, + "learning_rate": 3.158234797134815e-06, + "loss": 0.6017, + "step": 2806 + }, + { + "epoch": 0.658533724340176, + "grad_norm": 1.185068209023832, + "learning_rate": 3.1544274694215804e-06, + "loss": 0.6278, + "step": 2807 + }, + { + "epoch": 0.6587683284457478, + "grad_norm": 1.2496617926932865, + "learning_rate": 3.1506213802200113e-06, + "loss": 0.6298, + "step": 2808 + }, + { + "epoch": 0.6590029325513197, + "grad_norm": 0.9430005699055659, + "learning_rate": 3.1468165320842712e-06, + "loss": 0.6327, + "step": 2809 + }, + { + "epoch": 0.6592375366568914, + "grad_norm": 1.4647903486974598, + "learning_rate": 3.1430129275676813e-06, + "loss": 0.5568, + "step": 2810 + }, + { + "epoch": 0.6594721407624633, + "grad_norm": 0.9140100955216531, + "learning_rate": 3.139210569222736e-06, + "loss": 0.6392, + "step": 2811 + }, + { + "epoch": 0.6597067448680352, + "grad_norm": 3.573415723502299, + "learning_rate": 3.1354094596010908e-06, + "loss": 0.6562, + "step": 2812 + }, + { + "epoch": 0.659941348973607, + "grad_norm": 1.3931891053320853, + "learning_rate": 3.1316096012535625e-06, + "loss": 0.6509, + "step": 2813 + }, + { + "epoch": 0.6601759530791789, + "grad_norm": 1.128743321592625, + "learning_rate": 3.127810996730128e-06, + "loss": 0.6702, + "step": 2814 + }, + { + "epoch": 0.6604105571847507, + "grad_norm": 1.255563636699726, + "learning_rate": 3.1240136485799254e-06, + "loss": 0.5907, + "step": 2815 + }, + { + "epoch": 0.6606451612903226, + "grad_norm": 0.9502285457823598, + "learning_rate": 3.120217559351247e-06, + "loss": 0.6267, + "step": 2816 + }, + { + "epoch": 0.6608797653958944, + "grad_norm": 1.3567066440983155, + "learning_rate": 3.1164227315915398e-06, + "loss": 0.5449, + "step": 2817 + }, + { + "epoch": 0.6611143695014663, + "grad_norm": 1.279395443678301, + "learning_rate": 3.112629167847409e-06, + "loss": 0.6256, + "step": 2818 + }, + { + "epoch": 0.6613489736070381, + "grad_norm": 1.3699719026303925, + "learning_rate": 3.1088368706646057e-06, + "loss": 0.6479, + "step": 2819 + }, + { + "epoch": 0.66158357771261, + "grad_norm": 2.792818448408116, + "learning_rate": 3.1050458425880335e-06, + "loss": 0.5395, + "step": 2820 + }, + { + "epoch": 0.6618181818181819, + "grad_norm": 1.5848622001133197, + "learning_rate": 3.101256086161747e-06, + "loss": 0.6829, + "step": 2821 + }, + { + "epoch": 0.6620527859237537, + "grad_norm": 0.9511167105997375, + "learning_rate": 3.0974676039289418e-06, + "loss": 0.6108, + "step": 2822 + }, + { + "epoch": 0.6622873900293255, + "grad_norm": 23.345564151938994, + "learning_rate": 3.0936803984319618e-06, + "loss": 0.6289, + "step": 2823 + }, + { + "epoch": 0.6625219941348973, + "grad_norm": 1.8392029992996093, + "learning_rate": 3.089894472212297e-06, + "loss": 0.6412, + "step": 2824 + }, + { + "epoch": 0.6627565982404692, + "grad_norm": 1.3812046493943864, + "learning_rate": 3.0861098278105713e-06, + "loss": 0.6064, + "step": 2825 + }, + { + "epoch": 0.662991202346041, + "grad_norm": 1.456229608860474, + "learning_rate": 3.0823264677665573e-06, + "loss": 0.6365, + "step": 2826 + }, + { + "epoch": 0.6632258064516129, + "grad_norm": 1.4220174379325206, + "learning_rate": 3.078544394619155e-06, + "loss": 0.5967, + "step": 2827 + }, + { + "epoch": 0.6634604105571847, + "grad_norm": 0.9232152751664393, + "learning_rate": 3.0747636109064126e-06, + "loss": 0.6265, + "step": 2828 + }, + { + "epoch": 0.6636950146627566, + "grad_norm": 1.674191387003085, + "learning_rate": 3.0709841191655064e-06, + "loss": 0.6271, + "step": 2829 + }, + { + "epoch": 0.6639296187683285, + "grad_norm": 1.4379811489781062, + "learning_rate": 3.0672059219327453e-06, + "loss": 0.6451, + "step": 2830 + }, + { + "epoch": 0.6641642228739003, + "grad_norm": 2.3144628414571033, + "learning_rate": 3.0634290217435703e-06, + "loss": 0.6097, + "step": 2831 + }, + { + "epoch": 0.6643988269794722, + "grad_norm": 2.348917719491438, + "learning_rate": 3.059653421132557e-06, + "loss": 0.5946, + "step": 2832 + }, + { + "epoch": 0.664633431085044, + "grad_norm": 1.3169704591372087, + "learning_rate": 3.0558791226333974e-06, + "loss": 0.5785, + "step": 2833 + }, + { + "epoch": 0.6648680351906159, + "grad_norm": 1.109645045604988, + "learning_rate": 3.0521061287789215e-06, + "loss": 0.6101, + "step": 2834 + }, + { + "epoch": 0.6651026392961877, + "grad_norm": 1.332282345179443, + "learning_rate": 3.0483344421010797e-06, + "loss": 0.6328, + "step": 2835 + }, + { + "epoch": 0.6653372434017595, + "grad_norm": 1.5215806892713624, + "learning_rate": 3.0445640651309416e-06, + "loss": 0.5636, + "step": 2836 + }, + { + "epoch": 0.6655718475073313, + "grad_norm": 0.759584834471881, + "learning_rate": 3.0407950003987025e-06, + "loss": 0.6954, + "step": 2837 + }, + { + "epoch": 0.6658064516129032, + "grad_norm": 2.274171222583829, + "learning_rate": 3.037027250433676e-06, + "loss": 0.6559, + "step": 2838 + }, + { + "epoch": 0.666041055718475, + "grad_norm": 1.511166352592508, + "learning_rate": 3.0332608177642896e-06, + "loss": 0.6084, + "step": 2839 + }, + { + "epoch": 0.6662756598240469, + "grad_norm": 1.6393585855755946, + "learning_rate": 3.029495704918094e-06, + "loss": 0.627, + "step": 2840 + }, + { + "epoch": 0.6665102639296188, + "grad_norm": 1.328829377128736, + "learning_rate": 3.0257319144217488e-06, + "loss": 0.6695, + "step": 2841 + }, + { + "epoch": 0.6667448680351906, + "grad_norm": 1.1493865460633572, + "learning_rate": 3.021969448801028e-06, + "loss": 0.6438, + "step": 2842 + }, + { + "epoch": 0.6669794721407625, + "grad_norm": 1.5317020059344715, + "learning_rate": 3.0182083105808135e-06, + "loss": 0.6045, + "step": 2843 + }, + { + "epoch": 0.6672140762463343, + "grad_norm": 1.590973695060672, + "learning_rate": 3.0144485022851035e-06, + "loss": 0.6542, + "step": 2844 + }, + { + "epoch": 0.6674486803519062, + "grad_norm": 1.5991926674214896, + "learning_rate": 3.0106900264369956e-06, + "loss": 0.6055, + "step": 2845 + }, + { + "epoch": 0.667683284457478, + "grad_norm": 1.0388666239291835, + "learning_rate": 3.006932885558697e-06, + "loss": 0.585, + "step": 2846 + }, + { + "epoch": 0.6679178885630499, + "grad_norm": 1.3609644532751701, + "learning_rate": 3.0031770821715233e-06, + "loss": 0.6355, + "step": 2847 + }, + { + "epoch": 0.6681524926686218, + "grad_norm": 10.985182746296877, + "learning_rate": 2.9994226187958824e-06, + "loss": 0.7029, + "step": 2848 + }, + { + "epoch": 0.6683870967741935, + "grad_norm": 3.1268044104649353, + "learning_rate": 2.9956694979512936e-06, + "loss": 0.6416, + "step": 2849 + }, + { + "epoch": 0.6686217008797654, + "grad_norm": 1.1755091839315208, + "learning_rate": 2.991917722156367e-06, + "loss": 0.6039, + "step": 2850 + }, + { + "epoch": 0.6688563049853372, + "grad_norm": 2.2426166787988757, + "learning_rate": 2.988167293928815e-06, + "loss": 0.637, + "step": 2851 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 1.272917129366812, + "learning_rate": 2.984418215785445e-06, + "loss": 0.6057, + "step": 2852 + }, + { + "epoch": 0.6693255131964809, + "grad_norm": 1.3658970750260513, + "learning_rate": 2.9806704902421557e-06, + "loss": 0.585, + "step": 2853 + }, + { + "epoch": 0.6695601173020528, + "grad_norm": 3.979620433534735, + "learning_rate": 2.9769241198139404e-06, + "loss": 0.6345, + "step": 2854 + }, + { + "epoch": 0.6697947214076246, + "grad_norm": 1.1443883545732938, + "learning_rate": 2.9731791070148856e-06, + "loss": 0.6229, + "step": 2855 + }, + { + "epoch": 0.6700293255131965, + "grad_norm": 1.6220656787609486, + "learning_rate": 2.969435454358158e-06, + "loss": 0.6288, + "step": 2856 + }, + { + "epoch": 0.6702639296187684, + "grad_norm": 1.340768995873484, + "learning_rate": 2.965693164356023e-06, + "loss": 0.6359, + "step": 2857 + }, + { + "epoch": 0.6704985337243402, + "grad_norm": 1.7545770470136055, + "learning_rate": 2.961952239519823e-06, + "loss": 0.5351, + "step": 2858 + }, + { + "epoch": 0.6707331378299121, + "grad_norm": 1.2654033268936953, + "learning_rate": 2.9582126823599877e-06, + "loss": 0.6208, + "step": 2859 + }, + { + "epoch": 0.6709677419354839, + "grad_norm": 1.1272607784509288, + "learning_rate": 2.9544744953860265e-06, + "loss": 0.6451, + "step": 2860 + }, + { + "epoch": 0.6712023460410557, + "grad_norm": 0.9460218742456779, + "learning_rate": 2.950737681106537e-06, + "loss": 0.5644, + "step": 2861 + }, + { + "epoch": 0.6714369501466275, + "grad_norm": 1.8527524541962235, + "learning_rate": 2.947002242029182e-06, + "loss": 0.6979, + "step": 2862 + }, + { + "epoch": 0.6716715542521994, + "grad_norm": 0.818960409812016, + "learning_rate": 2.9432681806607145e-06, + "loss": 0.633, + "step": 2863 + }, + { + "epoch": 0.6719061583577712, + "grad_norm": 6.587958564986356, + "learning_rate": 2.9395354995069574e-06, + "loss": 0.6928, + "step": 2864 + }, + { + "epoch": 0.6721407624633431, + "grad_norm": 1.6530245205627356, + "learning_rate": 2.935804201072806e-06, + "loss": 0.6065, + "step": 2865 + }, + { + "epoch": 0.672375366568915, + "grad_norm": 1.39135578014723, + "learning_rate": 2.9320742878622303e-06, + "loss": 0.6615, + "step": 2866 + }, + { + "epoch": 0.6726099706744868, + "grad_norm": 1.1184181775851425, + "learning_rate": 2.9283457623782707e-06, + "loss": 0.6423, + "step": 2867 + }, + { + "epoch": 0.6728445747800587, + "grad_norm": 1.158912499957988, + "learning_rate": 2.9246186271230335e-06, + "loss": 0.6295, + "step": 2868 + }, + { + "epoch": 0.6730791788856305, + "grad_norm": 1.6279079724630416, + "learning_rate": 2.9208928845976947e-06, + "loss": 0.5959, + "step": 2869 + }, + { + "epoch": 0.6733137829912024, + "grad_norm": 1.705259226439366, + "learning_rate": 2.917168537302497e-06, + "loss": 0.6354, + "step": 2870 + }, + { + "epoch": 0.6735483870967742, + "grad_norm": 1.1830061393698021, + "learning_rate": 2.913445587736742e-06, + "loss": 0.688, + "step": 2871 + }, + { + "epoch": 0.6737829912023461, + "grad_norm": 1.1272705513657213, + "learning_rate": 2.9097240383988e-06, + "loss": 0.5818, + "step": 2872 + }, + { + "epoch": 0.6740175953079179, + "grad_norm": 1.0288641731279962, + "learning_rate": 2.9060038917860928e-06, + "loss": 0.6086, + "step": 2873 + }, + { + "epoch": 0.6742521994134897, + "grad_norm": 1.464748073480879, + "learning_rate": 2.902285150395106e-06, + "loss": 0.5965, + "step": 2874 + }, + { + "epoch": 0.6744868035190615, + "grad_norm": 12.020052613752277, + "learning_rate": 2.898567816721389e-06, + "loss": 0.6194, + "step": 2875 + }, + { + "epoch": 0.6747214076246334, + "grad_norm": 3.604009329849857, + "learning_rate": 2.894851893259532e-06, + "loss": 0.6338, + "step": 2876 + }, + { + "epoch": 0.6749560117302053, + "grad_norm": 1.6719245844127935, + "learning_rate": 2.8911373825031897e-06, + "loss": 0.6333, + "step": 2877 + }, + { + "epoch": 0.6751906158357771, + "grad_norm": 1.0984583224202944, + "learning_rate": 2.8874242869450655e-06, + "loss": 0.5903, + "step": 2878 + }, + { + "epoch": 0.675425219941349, + "grad_norm": 0.9577017499977273, + "learning_rate": 2.8837126090769118e-06, + "loss": 0.5841, + "step": 2879 + }, + { + "epoch": 0.6756598240469208, + "grad_norm": 1.1865292120559294, + "learning_rate": 2.8800023513895324e-06, + "loss": 0.5805, + "step": 2880 + }, + { + "epoch": 0.6758944281524927, + "grad_norm": 2.230851870372906, + "learning_rate": 2.876293516372778e-06, + "loss": 0.5917, + "step": 2881 + }, + { + "epoch": 0.6761290322580645, + "grad_norm": 0.9735266374200137, + "learning_rate": 2.872586106515537e-06, + "loss": 0.6052, + "step": 2882 + }, + { + "epoch": 0.6763636363636364, + "grad_norm": 1.839648421209005, + "learning_rate": 2.8688801243057536e-06, + "loss": 0.577, + "step": 2883 + }, + { + "epoch": 0.6765982404692082, + "grad_norm": 1.2510107958058816, + "learning_rate": 2.8651755722304082e-06, + "loss": 0.7087, + "step": 2884 + }, + { + "epoch": 0.6768328445747801, + "grad_norm": 2.1524909106565837, + "learning_rate": 2.8614724527755168e-06, + "loss": 0.6245, + "step": 2885 + }, + { + "epoch": 0.677067448680352, + "grad_norm": 1.8904527779746534, + "learning_rate": 2.857770768426139e-06, + "loss": 0.5994, + "step": 2886 + }, + { + "epoch": 0.6773020527859237, + "grad_norm": 2.065524797125004, + "learning_rate": 2.854070521666376e-06, + "loss": 0.6339, + "step": 2887 + }, + { + "epoch": 0.6775366568914956, + "grad_norm": 0.9496542794903673, + "learning_rate": 2.8503717149793543e-06, + "loss": 0.5921, + "step": 2888 + }, + { + "epoch": 0.6777712609970674, + "grad_norm": 1.4592674331186697, + "learning_rate": 2.846674350847239e-06, + "loss": 0.6341, + "step": 2889 + }, + { + "epoch": 0.6780058651026393, + "grad_norm": 1.6848807945445805, + "learning_rate": 2.8429784317512275e-06, + "loss": 0.6479, + "step": 2890 + }, + { + "epoch": 0.6782404692082111, + "grad_norm": 2.0942540269295793, + "learning_rate": 2.8392839601715482e-06, + "loss": 0.5938, + "step": 2891 + }, + { + "epoch": 0.678475073313783, + "grad_norm": 1.2271050050156875, + "learning_rate": 2.8355909385874546e-06, + "loss": 0.546, + "step": 2892 + }, + { + "epoch": 0.6787096774193548, + "grad_norm": 4.204770776951658, + "learning_rate": 2.831899369477233e-06, + "loss": 0.6386, + "step": 2893 + }, + { + "epoch": 0.6789442815249267, + "grad_norm": 2.0165045411018014, + "learning_rate": 2.8282092553181863e-06, + "loss": 0.5919, + "step": 2894 + }, + { + "epoch": 0.6791788856304986, + "grad_norm": 1.011918455921512, + "learning_rate": 2.8245205985866467e-06, + "loss": 0.5626, + "step": 2895 + }, + { + "epoch": 0.6794134897360704, + "grad_norm": 0.9938648578959085, + "learning_rate": 2.8208334017579735e-06, + "loss": 0.6657, + "step": 2896 + }, + { + "epoch": 0.6796480938416423, + "grad_norm": 1.1786828064949448, + "learning_rate": 2.8171476673065343e-06, + "loss": 0.6132, + "step": 2897 + }, + { + "epoch": 0.6798826979472141, + "grad_norm": 3.0588265278017173, + "learning_rate": 2.8134633977057236e-06, + "loss": 0.6338, + "step": 2898 + }, + { + "epoch": 0.680117302052786, + "grad_norm": 1.4664012215829267, + "learning_rate": 2.8097805954279505e-06, + "loss": 0.6034, + "step": 2899 + }, + { + "epoch": 0.6803519061583577, + "grad_norm": 0.992836254764791, + "learning_rate": 2.8060992629446406e-06, + "loss": 0.6164, + "step": 2900 + }, + { + "epoch": 0.6805865102639296, + "grad_norm": 1.3595756537761916, + "learning_rate": 2.802419402726233e-06, + "loss": 0.687, + "step": 2901 + }, + { + "epoch": 0.6808211143695014, + "grad_norm": 1.098305893354896, + "learning_rate": 2.7987410172421756e-06, + "loss": 0.6327, + "step": 2902 + }, + { + "epoch": 0.6810557184750733, + "grad_norm": 2.0540377571088806, + "learning_rate": 2.7950641089609275e-06, + "loss": 0.6314, + "step": 2903 + }, + { + "epoch": 0.6812903225806451, + "grad_norm": 1.1741171050623673, + "learning_rate": 2.7913886803499657e-06, + "loss": 0.5887, + "step": 2904 + }, + { + "epoch": 0.681524926686217, + "grad_norm": 0.9085521953652322, + "learning_rate": 2.7877147338757605e-06, + "loss": 0.6212, + "step": 2905 + }, + { + "epoch": 0.6817595307917889, + "grad_norm": 1.0193622872667523, + "learning_rate": 2.7840422720037943e-06, + "loss": 0.6166, + "step": 2906 + }, + { + "epoch": 0.6819941348973607, + "grad_norm": 1.3384204790400827, + "learning_rate": 2.7803712971985535e-06, + "loss": 0.6386, + "step": 2907 + }, + { + "epoch": 0.6822287390029326, + "grad_norm": 1.0633549432371698, + "learning_rate": 2.7767018119235263e-06, + "loss": 0.6078, + "step": 2908 + }, + { + "epoch": 0.6824633431085044, + "grad_norm": 2.2193333192961524, + "learning_rate": 2.7730338186411997e-06, + "loss": 0.6392, + "step": 2909 + }, + { + "epoch": 0.6826979472140763, + "grad_norm": 2.4172875239478633, + "learning_rate": 2.769367319813064e-06, + "loss": 0.6209, + "step": 2910 + }, + { + "epoch": 0.6829325513196481, + "grad_norm": 1.0609735734651438, + "learning_rate": 2.7657023178995955e-06, + "loss": 0.6235, + "step": 2911 + }, + { + "epoch": 0.68316715542522, + "grad_norm": 2.658231085240406, + "learning_rate": 2.7620388153602807e-06, + "loss": 0.5978, + "step": 2912 + }, + { + "epoch": 0.6834017595307917, + "grad_norm": 1.4268951686909808, + "learning_rate": 2.7583768146535927e-06, + "loss": 0.6674, + "step": 2913 + }, + { + "epoch": 0.6836363636363636, + "grad_norm": 0.9821828639937361, + "learning_rate": 2.754716318236993e-06, + "loss": 0.6747, + "step": 2914 + }, + { + "epoch": 0.6838709677419355, + "grad_norm": 1.6885672688129323, + "learning_rate": 2.7510573285669382e-06, + "loss": 0.6205, + "step": 2915 + }, + { + "epoch": 0.6841055718475073, + "grad_norm": 1.5084964667460745, + "learning_rate": 2.7473998480988746e-06, + "loss": 0.6012, + "step": 2916 + }, + { + "epoch": 0.6843401759530792, + "grad_norm": 0.9368814430221031, + "learning_rate": 2.7437438792872332e-06, + "loss": 0.6208, + "step": 2917 + }, + { + "epoch": 0.684574780058651, + "grad_norm": 3.8251928370956545, + "learning_rate": 2.7400894245854327e-06, + "loss": 0.599, + "step": 2918 + }, + { + "epoch": 0.6848093841642229, + "grad_norm": 0.9841376000436076, + "learning_rate": 2.7364364864458724e-06, + "loss": 0.6451, + "step": 2919 + }, + { + "epoch": 0.6850439882697947, + "grad_norm": 1.1442021678808518, + "learning_rate": 2.7327850673199384e-06, + "loss": 0.6777, + "step": 2920 + }, + { + "epoch": 0.6852785923753666, + "grad_norm": 1.7873515601018037, + "learning_rate": 2.7291351696579966e-06, + "loss": 0.615, + "step": 2921 + }, + { + "epoch": 0.6855131964809384, + "grad_norm": 1.1203503910361199, + "learning_rate": 2.7254867959093856e-06, + "loss": 0.5835, + "step": 2922 + }, + { + "epoch": 0.6857478005865103, + "grad_norm": 1.5265436819101847, + "learning_rate": 2.721839948522428e-06, + "loss": 0.5801, + "step": 2923 + }, + { + "epoch": 0.6859824046920822, + "grad_norm": 1.2311245489074993, + "learning_rate": 2.7181946299444205e-06, + "loss": 0.6397, + "step": 2924 + }, + { + "epoch": 0.6862170087976539, + "grad_norm": 0.9843243705694885, + "learning_rate": 2.7145508426216345e-06, + "loss": 0.6595, + "step": 2925 + }, + { + "epoch": 0.6864516129032258, + "grad_norm": 2.3092268531811233, + "learning_rate": 2.7109085889993112e-06, + "loss": 0.6262, + "step": 2926 + }, + { + "epoch": 0.6866862170087976, + "grad_norm": 1.80595676554796, + "learning_rate": 2.707267871521666e-06, + "loss": 0.6001, + "step": 2927 + }, + { + "epoch": 0.6869208211143695, + "grad_norm": 1.1348891790325555, + "learning_rate": 2.70362869263188e-06, + "loss": 0.615, + "step": 2928 + }, + { + "epoch": 0.6871554252199413, + "grad_norm": 2.263449903706541, + "learning_rate": 2.699991054772105e-06, + "loss": 0.6215, + "step": 2929 + }, + { + "epoch": 0.6873900293255132, + "grad_norm": 1.3392135503572136, + "learning_rate": 2.6963549603834584e-06, + "loss": 0.6426, + "step": 2930 + }, + { + "epoch": 0.687624633431085, + "grad_norm": 1.9645893704674489, + "learning_rate": 2.692720411906017e-06, + "loss": 0.5612, + "step": 2931 + }, + { + "epoch": 0.6878592375366569, + "grad_norm": 1.194251282533319, + "learning_rate": 2.689087411778823e-06, + "loss": 0.5922, + "step": 2932 + }, + { + "epoch": 0.6880938416422288, + "grad_norm": 1.7386890174380603, + "learning_rate": 2.6854559624398867e-06, + "loss": 0.6513, + "step": 2933 + }, + { + "epoch": 0.6883284457478006, + "grad_norm": 1.2575489656956949, + "learning_rate": 2.681826066326166e-06, + "loss": 0.6096, + "step": 2934 + }, + { + "epoch": 0.6885630498533725, + "grad_norm": 1.6346961341634954, + "learning_rate": 2.6781977258735825e-06, + "loss": 0.622, + "step": 2935 + }, + { + "epoch": 0.6887976539589443, + "grad_norm": 0.859684406970139, + "learning_rate": 2.674570943517014e-06, + "loss": 0.6752, + "step": 2936 + }, + { + "epoch": 0.6890322580645162, + "grad_norm": 1.0660195697369763, + "learning_rate": 2.6709457216902923e-06, + "loss": 0.6744, + "step": 2937 + }, + { + "epoch": 0.6892668621700879, + "grad_norm": 7.324120026228176, + "learning_rate": 2.6673220628262002e-06, + "loss": 0.6128, + "step": 2938 + }, + { + "epoch": 0.6895014662756598, + "grad_norm": 1.0028191550126604, + "learning_rate": 2.6636999693564757e-06, + "loss": 0.6286, + "step": 2939 + }, + { + "epoch": 0.6897360703812316, + "grad_norm": 6.330141781990575, + "learning_rate": 2.6600794437117984e-06, + "loss": 0.674, + "step": 2940 + }, + { + "epoch": 0.6899706744868035, + "grad_norm": 2.8688801373319834, + "learning_rate": 2.656460488321805e-06, + "loss": 0.6466, + "step": 2941 + }, + { + "epoch": 0.6902052785923753, + "grad_norm": 1.0042272689014873, + "learning_rate": 2.6528431056150753e-06, + "loss": 0.6179, + "step": 2942 + }, + { + "epoch": 0.6904398826979472, + "grad_norm": 0.884456076476593, + "learning_rate": 2.649227298019129e-06, + "loss": 0.632, + "step": 2943 + }, + { + "epoch": 0.6906744868035191, + "grad_norm": 1.1733317012689413, + "learning_rate": 2.645613067960435e-06, + "loss": 0.6153, + "step": 2944 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 1.2068306093086028, + "learning_rate": 2.642000417864401e-06, + "loss": 0.5545, + "step": 2945 + }, + { + "epoch": 0.6911436950146628, + "grad_norm": 1.0984644873764033, + "learning_rate": 2.638389350155373e-06, + "loss": 0.5988, + "step": 2946 + }, + { + "epoch": 0.6913782991202346, + "grad_norm": 1.5260598428986425, + "learning_rate": 2.6347798672566405e-06, + "loss": 0.6232, + "step": 2947 + }, + { + "epoch": 0.6916129032258065, + "grad_norm": 0.9801830003751519, + "learning_rate": 2.631171971590419e-06, + "loss": 0.6082, + "step": 2948 + }, + { + "epoch": 0.6918475073313783, + "grad_norm": 2.187568539878525, + "learning_rate": 2.627565665577872e-06, + "loss": 0.648, + "step": 2949 + }, + { + "epoch": 0.6920821114369502, + "grad_norm": 1.2116407120973633, + "learning_rate": 2.6239609516390886e-06, + "loss": 0.5748, + "step": 2950 + }, + { + "epoch": 0.6923167155425219, + "grad_norm": 1.058164460232869, + "learning_rate": 2.620357832193088e-06, + "loss": 0.6096, + "step": 2951 + }, + { + "epoch": 0.6925513196480938, + "grad_norm": 1.4310036079844117, + "learning_rate": 2.616756309657823e-06, + "loss": 0.6508, + "step": 2952 + }, + { + "epoch": 0.6927859237536657, + "grad_norm": 0.8324497438810345, + "learning_rate": 2.613156386450174e-06, + "loss": 0.6373, + "step": 2953 + }, + { + "epoch": 0.6930205278592375, + "grad_norm": 1.4335451407998134, + "learning_rate": 2.6095580649859488e-06, + "loss": 0.5867, + "step": 2954 + }, + { + "epoch": 0.6932551319648094, + "grad_norm": 1.8180013742308017, + "learning_rate": 2.6059613476798784e-06, + "loss": 0.6112, + "step": 2955 + }, + { + "epoch": 0.6934897360703812, + "grad_norm": 3.1499798966921913, + "learning_rate": 2.6023662369456206e-06, + "loss": 0.5904, + "step": 2956 + }, + { + "epoch": 0.6937243401759531, + "grad_norm": 2.1569673925272737, + "learning_rate": 2.598772735195746e-06, + "loss": 0.5609, + "step": 2957 + }, + { + "epoch": 0.6939589442815249, + "grad_norm": 1.1310822575355794, + "learning_rate": 2.5951808448417603e-06, + "loss": 0.6575, + "step": 2958 + }, + { + "epoch": 0.6941935483870968, + "grad_norm": 1.1641509734349609, + "learning_rate": 2.591590568294078e-06, + "loss": 0.5965, + "step": 2959 + }, + { + "epoch": 0.6944281524926686, + "grad_norm": 1.062059412976598, + "learning_rate": 2.5880019079620294e-06, + "loss": 0.6265, + "step": 2960 + }, + { + "epoch": 0.6946627565982405, + "grad_norm": 1.3379517850880083, + "learning_rate": 2.5844148662538627e-06, + "loss": 0.6738, + "step": 2961 + }, + { + "epoch": 0.6948973607038124, + "grad_norm": 2.594687775988743, + "learning_rate": 2.5808294455767468e-06, + "loss": 0.6698, + "step": 2962 + }, + { + "epoch": 0.6951319648093842, + "grad_norm": 3.255937518305223, + "learning_rate": 2.57724564833675e-06, + "loss": 0.5931, + "step": 2963 + }, + { + "epoch": 0.695366568914956, + "grad_norm": 2.5251109529175224, + "learning_rate": 2.573663476938859e-06, + "loss": 0.623, + "step": 2964 + }, + { + "epoch": 0.6956011730205278, + "grad_norm": 1.3058098673928853, + "learning_rate": 2.57008293378697e-06, + "loss": 0.6892, + "step": 2965 + }, + { + "epoch": 0.6958357771260997, + "grad_norm": 1.0865533957493687, + "learning_rate": 2.5665040212838822e-06, + "loss": 0.5507, + "step": 2966 + }, + { + "epoch": 0.6960703812316715, + "grad_norm": 0.9426206025996563, + "learning_rate": 2.5629267418313065e-06, + "loss": 0.608, + "step": 2967 + }, + { + "epoch": 0.6963049853372434, + "grad_norm": 2.1931147294957376, + "learning_rate": 2.5593510978298487e-06, + "loss": 0.6676, + "step": 2968 + }, + { + "epoch": 0.6965395894428152, + "grad_norm": 2.713450087881679, + "learning_rate": 2.555777091679024e-06, + "loss": 0.6456, + "step": 2969 + }, + { + "epoch": 0.6967741935483871, + "grad_norm": 1.2620988976737808, + "learning_rate": 2.552204725777253e-06, + "loss": 0.6396, + "step": 2970 + }, + { + "epoch": 0.697008797653959, + "grad_norm": 1.1781007934007224, + "learning_rate": 2.5486340025218425e-06, + "loss": 0.5823, + "step": 2971 + }, + { + "epoch": 0.6972434017595308, + "grad_norm": 5.135887486520184, + "learning_rate": 2.5450649243090076e-06, + "loss": 0.6487, + "step": 2972 + }, + { + "epoch": 0.6974780058651027, + "grad_norm": 2.0839686381113016, + "learning_rate": 2.5414974935338555e-06, + "loss": 0.6321, + "step": 2973 + }, + { + "epoch": 0.6977126099706745, + "grad_norm": 1.3161281018417577, + "learning_rate": 2.5379317125903876e-06, + "loss": 0.5946, + "step": 2974 + }, + { + "epoch": 0.6979472140762464, + "grad_norm": 0.8018410432530535, + "learning_rate": 2.5343675838715e-06, + "loss": 0.6257, + "step": 2975 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 2.0506368128493393, + "learning_rate": 2.53080510976898e-06, + "loss": 0.5974, + "step": 2976 + }, + { + "epoch": 0.69841642228739, + "grad_norm": 6.629086288287914, + "learning_rate": 2.5272442926735e-06, + "loss": 0.6418, + "step": 2977 + }, + { + "epoch": 0.6986510263929618, + "grad_norm": 1.2259407613073565, + "learning_rate": 2.5236851349746242e-06, + "loss": 0.5896, + "step": 2978 + }, + { + "epoch": 0.6988856304985337, + "grad_norm": 2.028341343535244, + "learning_rate": 2.5201276390608087e-06, + "loss": 0.6621, + "step": 2979 + }, + { + "epoch": 0.6991202346041056, + "grad_norm": 2.619611758559292, + "learning_rate": 2.5165718073193823e-06, + "loss": 0.6777, + "step": 2980 + }, + { + "epoch": 0.6993548387096774, + "grad_norm": 2.5583207750689043, + "learning_rate": 2.513017642136566e-06, + "loss": 0.6431, + "step": 2981 + }, + { + "epoch": 0.6995894428152493, + "grad_norm": 1.3158747470507823, + "learning_rate": 2.5094651458974593e-06, + "loss": 0.643, + "step": 2982 + }, + { + "epoch": 0.6998240469208211, + "grad_norm": 0.9359562415514225, + "learning_rate": 2.5059143209860428e-06, + "loss": 0.5798, + "step": 2983 + }, + { + "epoch": 0.700058651026393, + "grad_norm": 1.0087867885845683, + "learning_rate": 2.5023651697851734e-06, + "loss": 0.6573, + "step": 2984 + }, + { + "epoch": 0.7002932551319648, + "grad_norm": 1.2602193218941862, + "learning_rate": 2.4988176946765906e-06, + "loss": 0.6273, + "step": 2985 + }, + { + "epoch": 0.7005278592375367, + "grad_norm": 1.1527791507490224, + "learning_rate": 2.4952718980408973e-06, + "loss": 0.7004, + "step": 2986 + }, + { + "epoch": 0.7007624633431085, + "grad_norm": 1.8359600994002145, + "learning_rate": 2.491727782257584e-06, + "loss": 0.6263, + "step": 2987 + }, + { + "epoch": 0.7009970674486804, + "grad_norm": 1.0763423476582339, + "learning_rate": 2.4881853497050074e-06, + "loss": 0.6284, + "step": 2988 + }, + { + "epoch": 0.7012316715542521, + "grad_norm": 5.243668554983986, + "learning_rate": 2.4846446027603892e-06, + "loss": 0.5597, + "step": 2989 + }, + { + "epoch": 0.701466275659824, + "grad_norm": 4.7044154282587005, + "learning_rate": 2.481105543799827e-06, + "loss": 0.6417, + "step": 2990 + }, + { + "epoch": 0.7017008797653959, + "grad_norm": 0.8525839737743981, + "learning_rate": 2.4775681751982837e-06, + "loss": 0.668, + "step": 2991 + }, + { + "epoch": 0.7019354838709677, + "grad_norm": 1.6191931203466026, + "learning_rate": 2.474032499329588e-06, + "loss": 0.6046, + "step": 2992 + }, + { + "epoch": 0.7021700879765396, + "grad_norm": 2.6330047781071055, + "learning_rate": 2.470498518566433e-06, + "loss": 0.5948, + "step": 2993 + }, + { + "epoch": 0.7024046920821114, + "grad_norm": 1.1482290058274915, + "learning_rate": 2.466966235280369e-06, + "loss": 0.6493, + "step": 2994 + }, + { + "epoch": 0.7026392961876833, + "grad_norm": 1.9539406976269607, + "learning_rate": 2.4634356518418167e-06, + "loss": 0.6036, + "step": 2995 + }, + { + "epoch": 0.7028739002932551, + "grad_norm": 1.546517172870452, + "learning_rate": 2.459906770620051e-06, + "loss": 0.5652, + "step": 2996 + }, + { + "epoch": 0.703108504398827, + "grad_norm": 0.9728200086431372, + "learning_rate": 2.4563795939832014e-06, + "loss": 0.5967, + "step": 2997 + }, + { + "epoch": 0.7033431085043989, + "grad_norm": 1.1543715824692742, + "learning_rate": 2.452854124298257e-06, + "loss": 0.5984, + "step": 2998 + }, + { + "epoch": 0.7035777126099707, + "grad_norm": 1.5598700116312605, + "learning_rate": 2.4493303639310634e-06, + "loss": 0.5613, + "step": 2999 + }, + { + "epoch": 0.7038123167155426, + "grad_norm": 1.0808904679380773, + "learning_rate": 2.445808315246315e-06, + "loss": 0.6377, + "step": 3000 + }, + { + "epoch": 0.7038123167155426, + "eval_loss": 0.618014931678772, + "eval_runtime": 39.8688, + "eval_samples_per_second": 13.695, + "eval_steps_per_second": 0.125, + "step": 3000 + }, + { + "epoch": 0.7040469208211144, + "grad_norm": 1.0083436032813202, + "learning_rate": 2.44228798060756e-06, + "loss": 0.5637, + "step": 3001 + }, + { + "epoch": 0.7042815249266862, + "grad_norm": 1.551336060663135, + "learning_rate": 2.438769362377196e-06, + "loss": 0.6397, + "step": 3002 + }, + { + "epoch": 0.704516129032258, + "grad_norm": 2.0623084373347162, + "learning_rate": 2.435252462916467e-06, + "loss": 0.5864, + "step": 3003 + }, + { + "epoch": 0.7047507331378299, + "grad_norm": 1.460880689822133, + "learning_rate": 2.4317372845854663e-06, + "loss": 0.6128, + "step": 3004 + }, + { + "epoch": 0.7049853372434017, + "grad_norm": 0.7829634791946768, + "learning_rate": 2.4282238297431328e-06, + "loss": 0.6119, + "step": 3005 + }, + { + "epoch": 0.7052199413489736, + "grad_norm": 3.4110934486558393, + "learning_rate": 2.424712100747243e-06, + "loss": 0.5944, + "step": 3006 + }, + { + "epoch": 0.7054545454545454, + "grad_norm": 4.504269536036809, + "learning_rate": 2.4212020999544184e-06, + "loss": 0.5989, + "step": 3007 + }, + { + "epoch": 0.7056891495601173, + "grad_norm": 2.9123036593108784, + "learning_rate": 2.4176938297201285e-06, + "loss": 0.6156, + "step": 3008 + }, + { + "epoch": 0.7059237536656892, + "grad_norm": 1.6082724379510858, + "learning_rate": 2.414187292398668e-06, + "loss": 0.5904, + "step": 3009 + }, + { + "epoch": 0.706158357771261, + "grad_norm": 1.003507769067074, + "learning_rate": 2.4106824903431774e-06, + "loss": 0.6328, + "step": 3010 + }, + { + "epoch": 0.7063929618768329, + "grad_norm": 0.8252337540952676, + "learning_rate": 2.40717942590563e-06, + "loss": 0.6094, + "step": 3011 + }, + { + "epoch": 0.7066275659824047, + "grad_norm": 1.664789198568701, + "learning_rate": 2.4036781014368344e-06, + "loss": 0.6058, + "step": 3012 + }, + { + "epoch": 0.7068621700879766, + "grad_norm": 0.94550532740909, + "learning_rate": 2.4001785192864314e-06, + "loss": 0.6074, + "step": 3013 + }, + { + "epoch": 0.7070967741935484, + "grad_norm": 0.7776157157972251, + "learning_rate": 2.3966806818028894e-06, + "loss": 0.6772, + "step": 3014 + }, + { + "epoch": 0.7073313782991202, + "grad_norm": 1.376934742327379, + "learning_rate": 2.3931845913335073e-06, + "loss": 0.6232, + "step": 3015 + }, + { + "epoch": 0.707565982404692, + "grad_norm": 1.2813169239858986, + "learning_rate": 2.3896902502244194e-06, + "loss": 0.6091, + "step": 3016 + }, + { + "epoch": 0.7078005865102639, + "grad_norm": 1.313302381723937, + "learning_rate": 2.3861976608205734e-06, + "loss": 0.6414, + "step": 3017 + }, + { + "epoch": 0.7080351906158358, + "grad_norm": 2.2660289115882866, + "learning_rate": 2.3827068254657493e-06, + "loss": 0.6105, + "step": 3018 + }, + { + "epoch": 0.7082697947214076, + "grad_norm": 1.2436694682572658, + "learning_rate": 2.3792177465025483e-06, + "loss": 0.6077, + "step": 3019 + }, + { + "epoch": 0.7085043988269795, + "grad_norm": 1.2934200363317827, + "learning_rate": 2.375730426272393e-06, + "loss": 0.6443, + "step": 3020 + }, + { + "epoch": 0.7087390029325513, + "grad_norm": 1.3226873193959154, + "learning_rate": 2.3722448671155253e-06, + "loss": 0.5967, + "step": 3021 + }, + { + "epoch": 0.7089736070381232, + "grad_norm": 1.4469709024801236, + "learning_rate": 2.368761071371008e-06, + "loss": 0.63, + "step": 3022 + }, + { + "epoch": 0.709208211143695, + "grad_norm": 1.691638793475797, + "learning_rate": 2.3652790413767125e-06, + "loss": 0.5817, + "step": 3023 + }, + { + "epoch": 0.7094428152492669, + "grad_norm": 1.283648412545291, + "learning_rate": 2.3617987794693358e-06, + "loss": 0.6422, + "step": 3024 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 1.2682867575516177, + "learning_rate": 2.3583202879843844e-06, + "loss": 0.6562, + "step": 3025 + }, + { + "epoch": 0.7099120234604106, + "grad_norm": 1.5039338883580333, + "learning_rate": 2.354843569256172e-06, + "loss": 0.6024, + "step": 3026 + }, + { + "epoch": 0.7101466275659825, + "grad_norm": 1.1283712881073278, + "learning_rate": 2.3513686256178276e-06, + "loss": 0.62, + "step": 3027 + }, + { + "epoch": 0.7103812316715542, + "grad_norm": 1.5069041246396937, + "learning_rate": 2.3478954594012884e-06, + "loss": 0.6182, + "step": 3028 + }, + { + "epoch": 0.7106158357771261, + "grad_norm": 1.3222552024129546, + "learning_rate": 2.344424072937298e-06, + "loss": 0.6445, + "step": 3029 + }, + { + "epoch": 0.7108504398826979, + "grad_norm": 1.649292393317922, + "learning_rate": 2.3409544685554064e-06, + "loss": 0.6197, + "step": 3030 + }, + { + "epoch": 0.7110850439882698, + "grad_norm": 1.1890982073103622, + "learning_rate": 2.3374866485839664e-06, + "loss": 0.5987, + "step": 3031 + }, + { + "epoch": 0.7113196480938416, + "grad_norm": 1.2831891994843283, + "learning_rate": 2.3340206153501354e-06, + "loss": 0.6385, + "step": 3032 + }, + { + "epoch": 0.7115542521994135, + "grad_norm": 1.1886954278865602, + "learning_rate": 2.3305563711798696e-06, + "loss": 0.5843, + "step": 3033 + }, + { + "epoch": 0.7117888563049853, + "grad_norm": 1.1703464338328116, + "learning_rate": 2.327093918397928e-06, + "loss": 0.6114, + "step": 3034 + }, + { + "epoch": 0.7120234604105572, + "grad_norm": 1.3578841915745736, + "learning_rate": 2.3236332593278603e-06, + "loss": 0.6142, + "step": 3035 + }, + { + "epoch": 0.712258064516129, + "grad_norm": 1.211142185636031, + "learning_rate": 2.3201743962920202e-06, + "loss": 0.6294, + "step": 3036 + }, + { + "epoch": 0.7124926686217009, + "grad_norm": 1.084284996182101, + "learning_rate": 2.3167173316115528e-06, + "loss": 0.6227, + "step": 3037 + }, + { + "epoch": 0.7127272727272728, + "grad_norm": 1.0898888210111246, + "learning_rate": 2.313262067606396e-06, + "loss": 0.557, + "step": 3038 + }, + { + "epoch": 0.7129618768328446, + "grad_norm": 1.0564936899031456, + "learning_rate": 2.3098086065952808e-06, + "loss": 0.5943, + "step": 3039 + }, + { + "epoch": 0.7131964809384165, + "grad_norm": 1.9814185949779952, + "learning_rate": 2.3063569508957267e-06, + "loss": 0.6005, + "step": 3040 + }, + { + "epoch": 0.7134310850439882, + "grad_norm": 2.386733138868949, + "learning_rate": 2.3029071028240434e-06, + "loss": 0.6237, + "step": 3041 + }, + { + "epoch": 0.7136656891495601, + "grad_norm": 3.276239044924214, + "learning_rate": 2.299459064695328e-06, + "loss": 0.6184, + "step": 3042 + }, + { + "epoch": 0.7139002932551319, + "grad_norm": 1.5829964122673519, + "learning_rate": 2.2960128388234582e-06, + "loss": 0.6109, + "step": 3043 + }, + { + "epoch": 0.7141348973607038, + "grad_norm": 0.9406077157147652, + "learning_rate": 2.292568427521098e-06, + "loss": 0.6473, + "step": 3044 + }, + { + "epoch": 0.7143695014662756, + "grad_norm": 1.9567918321818636, + "learning_rate": 2.289125833099702e-06, + "loss": 0.5906, + "step": 3045 + }, + { + "epoch": 0.7146041055718475, + "grad_norm": 1.0668767104284897, + "learning_rate": 2.2856850578694906e-06, + "loss": 0.5668, + "step": 3046 + }, + { + "epoch": 0.7148387096774194, + "grad_norm": 1.1161683312619683, + "learning_rate": 2.2822461041394745e-06, + "loss": 0.6033, + "step": 3047 + }, + { + "epoch": 0.7150733137829912, + "grad_norm": 3.070349541073239, + "learning_rate": 2.2788089742174374e-06, + "loss": 0.5884, + "step": 3048 + }, + { + "epoch": 0.7153079178885631, + "grad_norm": 1.18434704318199, + "learning_rate": 2.2753736704099418e-06, + "loss": 0.5763, + "step": 3049 + }, + { + "epoch": 0.7155425219941349, + "grad_norm": 1.4469610247277611, + "learning_rate": 2.2719401950223215e-06, + "loss": 0.6348, + "step": 3050 + }, + { + "epoch": 0.7157771260997068, + "grad_norm": 2.0429440492187587, + "learning_rate": 2.268508550358687e-06, + "loss": 0.6096, + "step": 3051 + }, + { + "epoch": 0.7160117302052786, + "grad_norm": 1.1376126933156385, + "learning_rate": 2.2650787387219138e-06, + "loss": 0.6232, + "step": 3052 + }, + { + "epoch": 0.7162463343108504, + "grad_norm": 1.0273333553927704, + "learning_rate": 2.2616507624136564e-06, + "loss": 0.6217, + "step": 3053 + }, + { + "epoch": 0.7164809384164222, + "grad_norm": 0.9998438699287782, + "learning_rate": 2.258224623734333e-06, + "loss": 0.6015, + "step": 3054 + }, + { + "epoch": 0.7167155425219941, + "grad_norm": 0.9329147141061209, + "learning_rate": 2.2548003249831256e-06, + "loss": 0.5317, + "step": 3055 + }, + { + "epoch": 0.716950146627566, + "grad_norm": 0.8633350498749837, + "learning_rate": 2.2513778684579864e-06, + "loss": 0.6328, + "step": 3056 + }, + { + "epoch": 0.7171847507331378, + "grad_norm": 2.210007930230921, + "learning_rate": 2.2479572564556286e-06, + "loss": 0.6384, + "step": 3057 + }, + { + "epoch": 0.7174193548387097, + "grad_norm": 1.200148315167745, + "learning_rate": 2.2445384912715285e-06, + "loss": 0.6012, + "step": 3058 + }, + { + "epoch": 0.7176539589442815, + "grad_norm": 1.6038881013054826, + "learning_rate": 2.241121575199925e-06, + "loss": 0.5895, + "step": 3059 + }, + { + "epoch": 0.7178885630498534, + "grad_norm": 1.0959333665107625, + "learning_rate": 2.2377065105338106e-06, + "loss": 0.6262, + "step": 3060 + }, + { + "epoch": 0.7181231671554252, + "grad_norm": 0.8947350604959121, + "learning_rate": 2.2342932995649395e-06, + "loss": 0.6071, + "step": 3061 + }, + { + "epoch": 0.7183577712609971, + "grad_norm": 10.184755961967252, + "learning_rate": 2.2308819445838258e-06, + "loss": 0.6115, + "step": 3062 + }, + { + "epoch": 0.718592375366569, + "grad_norm": 0.8963815777111561, + "learning_rate": 2.2274724478797284e-06, + "loss": 0.6656, + "step": 3063 + }, + { + "epoch": 0.7188269794721408, + "grad_norm": 1.6515649916986137, + "learning_rate": 2.224064811740666e-06, + "loss": 0.6202, + "step": 3064 + }, + { + "epoch": 0.7190615835777127, + "grad_norm": 1.046955887754639, + "learning_rate": 2.220659038453408e-06, + "loss": 0.6358, + "step": 3065 + }, + { + "epoch": 0.7192961876832844, + "grad_norm": 6.840520870058273, + "learning_rate": 2.217255130303471e-06, + "loss": 0.5781, + "step": 3066 + }, + { + "epoch": 0.7195307917888563, + "grad_norm": 0.7073585759640614, + "learning_rate": 2.2138530895751236e-06, + "loss": 0.5675, + "step": 3067 + }, + { + "epoch": 0.7197653958944281, + "grad_norm": 0.8842575469228101, + "learning_rate": 2.2104529185513807e-06, + "loss": 0.6518, + "step": 3068 + }, + { + "epoch": 0.72, + "grad_norm": 2.3747487052793663, + "learning_rate": 2.207054619513995e-06, + "loss": 0.6211, + "step": 3069 + }, + { + "epoch": 0.7202346041055718, + "grad_norm": 4.562252425973848, + "learning_rate": 2.2036581947434755e-06, + "loss": 0.585, + "step": 3070 + }, + { + "epoch": 0.7204692082111437, + "grad_norm": 1.3957950102548555, + "learning_rate": 2.2002636465190656e-06, + "loss": 0.6095, + "step": 3071 + }, + { + "epoch": 0.7207038123167155, + "grad_norm": 1.0275054610921959, + "learning_rate": 2.196870977118749e-06, + "loss": 0.5958, + "step": 3072 + }, + { + "epoch": 0.7209384164222874, + "grad_norm": 2.3443541466146143, + "learning_rate": 2.19348018881925e-06, + "loss": 0.637, + "step": 3073 + }, + { + "epoch": 0.7211730205278593, + "grad_norm": 1.0533846665061215, + "learning_rate": 2.1900912838960353e-06, + "loss": 0.5426, + "step": 3074 + }, + { + "epoch": 0.7214076246334311, + "grad_norm": 1.6562542886649325, + "learning_rate": 2.186704264623299e-06, + "loss": 0.6025, + "step": 3075 + }, + { + "epoch": 0.721642228739003, + "grad_norm": 0.9924826617798908, + "learning_rate": 2.183319133273976e-06, + "loss": 0.6394, + "step": 3076 + }, + { + "epoch": 0.7218768328445748, + "grad_norm": 4.865792980566119, + "learning_rate": 2.179935892119732e-06, + "loss": 0.6339, + "step": 3077 + }, + { + "epoch": 0.7221114369501467, + "grad_norm": 1.0134198127887246, + "learning_rate": 2.176554543430965e-06, + "loss": 0.5715, + "step": 3078 + }, + { + "epoch": 0.7223460410557184, + "grad_norm": 1.4089052269905165, + "learning_rate": 2.173175089476803e-06, + "loss": 0.5746, + "step": 3079 + }, + { + "epoch": 0.7225806451612903, + "grad_norm": 1.008940549715216, + "learning_rate": 2.169797532525103e-06, + "loss": 0.6386, + "step": 3080 + }, + { + "epoch": 0.7228152492668621, + "grad_norm": 0.8327562239505816, + "learning_rate": 2.166421874842446e-06, + "loss": 0.6307, + "step": 3081 + }, + { + "epoch": 0.723049853372434, + "grad_norm": 1.544075456904035, + "learning_rate": 2.1630481186941394e-06, + "loss": 0.5737, + "step": 3082 + }, + { + "epoch": 0.7232844574780058, + "grad_norm": 1.481794493564964, + "learning_rate": 2.159676266344222e-06, + "loss": 0.6036, + "step": 3083 + }, + { + "epoch": 0.7235190615835777, + "grad_norm": 0.8996551378170596, + "learning_rate": 2.156306320055443e-06, + "loss": 0.5978, + "step": 3084 + }, + { + "epoch": 0.7237536656891496, + "grad_norm": 0.755147395183249, + "learning_rate": 2.15293828208928e-06, + "loss": 0.642, + "step": 3085 + }, + { + "epoch": 0.7239882697947214, + "grad_norm": 1.5027580035435841, + "learning_rate": 2.149572154705929e-06, + "loss": 0.592, + "step": 3086 + }, + { + "epoch": 0.7242228739002933, + "grad_norm": 3.2543859736341347, + "learning_rate": 2.1462079401643023e-06, + "loss": 0.6052, + "step": 3087 + }, + { + "epoch": 0.7244574780058651, + "grad_norm": 1.6318345814434447, + "learning_rate": 2.1428456407220317e-06, + "loss": 0.5663, + "step": 3088 + }, + { + "epoch": 0.724692082111437, + "grad_norm": 0.9578170186463885, + "learning_rate": 2.1394852586354574e-06, + "loss": 0.6225, + "step": 3089 + }, + { + "epoch": 0.7249266862170088, + "grad_norm": 0.883183190557881, + "learning_rate": 2.1361267961596377e-06, + "loss": 0.6094, + "step": 3090 + }, + { + "epoch": 0.7251612903225807, + "grad_norm": 1.2821911719857417, + "learning_rate": 2.132770255548347e-06, + "loss": 0.6445, + "step": 3091 + }, + { + "epoch": 0.7253958944281524, + "grad_norm": 1.2509694667357942, + "learning_rate": 2.1294156390540597e-06, + "loss": 0.6104, + "step": 3092 + }, + { + "epoch": 0.7256304985337243, + "grad_norm": 1.0663919817396155, + "learning_rate": 2.1260629489279662e-06, + "loss": 0.6476, + "step": 3093 + }, + { + "epoch": 0.7258651026392962, + "grad_norm": 1.898001441221721, + "learning_rate": 2.1227121874199626e-06, + "loss": 0.6382, + "step": 3094 + }, + { + "epoch": 0.726099706744868, + "grad_norm": 4.713467343013875, + "learning_rate": 2.119363356778649e-06, + "loss": 0.6383, + "step": 3095 + }, + { + "epoch": 0.7263343108504399, + "grad_norm": 0.9249120017017618, + "learning_rate": 2.116016459251333e-06, + "loss": 0.662, + "step": 3096 + }, + { + "epoch": 0.7265689149560117, + "grad_norm": 1.1245590424039145, + "learning_rate": 2.112671497084024e-06, + "loss": 0.5678, + "step": 3097 + }, + { + "epoch": 0.7268035190615836, + "grad_norm": 3.5769692477007946, + "learning_rate": 2.109328472521427e-06, + "loss": 0.6488, + "step": 3098 + }, + { + "epoch": 0.7270381231671554, + "grad_norm": 4.387371038068046, + "learning_rate": 2.105987387806956e-06, + "loss": 0.5981, + "step": 3099 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.8345824137163759, + "learning_rate": 2.1026482451827185e-06, + "loss": 0.5947, + "step": 3100 + }, + { + "epoch": 0.7275073313782991, + "grad_norm": 1.2060890079193303, + "learning_rate": 2.0993110468895167e-06, + "loss": 0.6008, + "step": 3101 + }, + { + "epoch": 0.727741935483871, + "grad_norm": 1.8075769290329984, + "learning_rate": 2.095975795166851e-06, + "loss": 0.5954, + "step": 3102 + }, + { + "epoch": 0.7279765395894429, + "grad_norm": 1.1759828909931278, + "learning_rate": 2.092642492252915e-06, + "loss": 0.598, + "step": 3103 + }, + { + "epoch": 0.7282111436950147, + "grad_norm": 1.268712882941752, + "learning_rate": 2.0893111403845947e-06, + "loss": 0.5805, + "step": 3104 + }, + { + "epoch": 0.7284457478005865, + "grad_norm": 1.198108753238446, + "learning_rate": 2.085981741797465e-06, + "loss": 0.5865, + "step": 3105 + }, + { + "epoch": 0.7286803519061583, + "grad_norm": 1.508717280905453, + "learning_rate": 2.082654298725793e-06, + "loss": 0.6283, + "step": 3106 + }, + { + "epoch": 0.7289149560117302, + "grad_norm": 1.6247561744111116, + "learning_rate": 2.079328813402531e-06, + "loss": 0.6365, + "step": 3107 + }, + { + "epoch": 0.729149560117302, + "grad_norm": 1.060366855349647, + "learning_rate": 2.0760052880593213e-06, + "loss": 0.6289, + "step": 3108 + }, + { + "epoch": 0.7293841642228739, + "grad_norm": 1.8964263061112259, + "learning_rate": 2.0726837249264824e-06, + "loss": 0.5458, + "step": 3109 + }, + { + "epoch": 0.7296187683284457, + "grad_norm": 2.249605547258256, + "learning_rate": 2.069364126233025e-06, + "loss": 0.6055, + "step": 3110 + }, + { + "epoch": 0.7298533724340176, + "grad_norm": 1.5909784767012003, + "learning_rate": 2.0660464942066376e-06, + "loss": 0.6011, + "step": 3111 + }, + { + "epoch": 0.7300879765395895, + "grad_norm": 1.5043636812013932, + "learning_rate": 2.06273083107369e-06, + "loss": 0.5516, + "step": 3112 + }, + { + "epoch": 0.7303225806451613, + "grad_norm": 1.4742416464015022, + "learning_rate": 2.0594171390592294e-06, + "loss": 0.571, + "step": 3113 + }, + { + "epoch": 0.7305571847507332, + "grad_norm": 3.8632603210786214, + "learning_rate": 2.0561054203869813e-06, + "loss": 0.6107, + "step": 3114 + }, + { + "epoch": 0.730791788856305, + "grad_norm": 2.841471697964061, + "learning_rate": 2.052795677279347e-06, + "loss": 0.6255, + "step": 3115 + }, + { + "epoch": 0.7310263929618769, + "grad_norm": 1.6206507011890372, + "learning_rate": 2.0494879119574017e-06, + "loss": 0.5709, + "step": 3116 + }, + { + "epoch": 0.7312609970674486, + "grad_norm": 0.9339932762109435, + "learning_rate": 2.046182126640895e-06, + "loss": 0.6723, + "step": 3117 + }, + { + "epoch": 0.7314956011730205, + "grad_norm": 1.8009934526165545, + "learning_rate": 2.0428783235482423e-06, + "loss": 0.6217, + "step": 3118 + }, + { + "epoch": 0.7317302052785923, + "grad_norm": 2.5312349835021952, + "learning_rate": 2.039576504896534e-06, + "loss": 0.6169, + "step": 3119 + }, + { + "epoch": 0.7319648093841642, + "grad_norm": 0.9887613242226666, + "learning_rate": 2.036276672901531e-06, + "loss": 0.6332, + "step": 3120 + }, + { + "epoch": 0.732199413489736, + "grad_norm": 1.678550249720782, + "learning_rate": 2.0329788297776525e-06, + "loss": 0.6192, + "step": 3121 + }, + { + "epoch": 0.7324340175953079, + "grad_norm": 1.1955767796139254, + "learning_rate": 2.0296829777379912e-06, + "loss": 0.6095, + "step": 3122 + }, + { + "epoch": 0.7326686217008798, + "grad_norm": 3.0385761272879916, + "learning_rate": 2.0263891189942993e-06, + "loss": 0.617, + "step": 3123 + }, + { + "epoch": 0.7329032258064516, + "grad_norm": 1.4161970808512983, + "learning_rate": 2.023097255756992e-06, + "loss": 0.6094, + "step": 3124 + }, + { + "epoch": 0.7331378299120235, + "grad_norm": 0.9458671409511216, + "learning_rate": 2.0198073902351474e-06, + "loss": 0.5817, + "step": 3125 + }, + { + "epoch": 0.7333724340175953, + "grad_norm": 3.385699080239693, + "learning_rate": 2.0165195246365014e-06, + "loss": 0.5997, + "step": 3126 + }, + { + "epoch": 0.7336070381231672, + "grad_norm": 2.0482022284730346, + "learning_rate": 2.013233661167444e-06, + "loss": 0.6438, + "step": 3127 + }, + { + "epoch": 0.733841642228739, + "grad_norm": 1.1718917100723913, + "learning_rate": 2.0099498020330305e-06, + "loss": 0.6445, + "step": 3128 + }, + { + "epoch": 0.7340762463343109, + "grad_norm": 1.0495146094318377, + "learning_rate": 2.0066679494369664e-06, + "loss": 0.6037, + "step": 3129 + }, + { + "epoch": 0.7343108504398826, + "grad_norm": 0.9820637177008168, + "learning_rate": 2.0033881055816055e-06, + "loss": 0.6352, + "step": 3130 + }, + { + "epoch": 0.7345454545454545, + "grad_norm": 1.5459291010794618, + "learning_rate": 2.000110272667962e-06, + "loss": 0.5882, + "step": 3131 + }, + { + "epoch": 0.7347800586510264, + "grad_norm": 0.9100604994307435, + "learning_rate": 1.996834452895695e-06, + "loss": 0.6138, + "step": 3132 + }, + { + "epoch": 0.7350146627565982, + "grad_norm": 1.5098249251636204, + "learning_rate": 1.993560648463117e-06, + "loss": 0.6248, + "step": 3133 + }, + { + "epoch": 0.7352492668621701, + "grad_norm": 0.9405517757113439, + "learning_rate": 1.9902888615671868e-06, + "loss": 0.5962, + "step": 3134 + }, + { + "epoch": 0.7354838709677419, + "grad_norm": 2.171708657575388, + "learning_rate": 1.987019094403502e-06, + "loss": 0.5975, + "step": 3135 + }, + { + "epoch": 0.7357184750733138, + "grad_norm": 0.8737450754598194, + "learning_rate": 1.983751349166318e-06, + "loss": 0.6323, + "step": 3136 + }, + { + "epoch": 0.7359530791788856, + "grad_norm": 1.2820614570051516, + "learning_rate": 1.980485628048525e-06, + "loss": 0.6098, + "step": 3137 + }, + { + "epoch": 0.7361876832844575, + "grad_norm": 4.415409656406048, + "learning_rate": 1.9772219332416544e-06, + "loss": 0.6305, + "step": 3138 + }, + { + "epoch": 0.7364222873900294, + "grad_norm": 1.172455834608719, + "learning_rate": 1.9739602669358796e-06, + "loss": 0.5922, + "step": 3139 + }, + { + "epoch": 0.7366568914956012, + "grad_norm": 0.9686245660583657, + "learning_rate": 1.970700631320014e-06, + "loss": 0.6243, + "step": 3140 + }, + { + "epoch": 0.7368914956011731, + "grad_norm": 0.9026122089982698, + "learning_rate": 1.967443028581507e-06, + "loss": 0.6328, + "step": 3141 + }, + { + "epoch": 0.7371260997067449, + "grad_norm": 1.908394707423746, + "learning_rate": 1.9641874609064443e-06, + "loss": 0.6088, + "step": 3142 + }, + { + "epoch": 0.7373607038123167, + "grad_norm": 1.2418215891782105, + "learning_rate": 1.960933930479545e-06, + "loss": 0.6463, + "step": 3143 + }, + { + "epoch": 0.7375953079178885, + "grad_norm": 0.8779404299481255, + "learning_rate": 1.957682439484161e-06, + "loss": 0.6391, + "step": 3144 + }, + { + "epoch": 0.7378299120234604, + "grad_norm": 6.762926930788082, + "learning_rate": 1.9544329901022774e-06, + "loss": 0.5739, + "step": 3145 + }, + { + "epoch": 0.7380645161290322, + "grad_norm": 1.1959431410962325, + "learning_rate": 1.951185584514509e-06, + "loss": 0.6349, + "step": 3146 + }, + { + "epoch": 0.7382991202346041, + "grad_norm": 5.181848344572192, + "learning_rate": 1.947940224900095e-06, + "loss": 0.6058, + "step": 3147 + }, + { + "epoch": 0.738533724340176, + "grad_norm": 1.1024974625091453, + "learning_rate": 1.944696913436905e-06, + "loss": 0.665, + "step": 3148 + }, + { + "epoch": 0.7387683284457478, + "grad_norm": 1.3799331790467135, + "learning_rate": 1.941455652301438e-06, + "loss": 0.5731, + "step": 3149 + }, + { + "epoch": 0.7390029325513197, + "grad_norm": 1.5318353909593199, + "learning_rate": 1.9382164436688083e-06, + "loss": 0.6505, + "step": 3150 + }, + { + "epoch": 0.7392375366568915, + "grad_norm": 1.7414709924900138, + "learning_rate": 1.934979289712759e-06, + "loss": 0.6059, + "step": 3151 + }, + { + "epoch": 0.7394721407624634, + "grad_norm": 2.435808441135469, + "learning_rate": 1.9317441926056534e-06, + "loss": 0.5639, + "step": 3152 + }, + { + "epoch": 0.7397067448680352, + "grad_norm": 1.3984171255855322, + "learning_rate": 1.928511154518473e-06, + "loss": 0.6034, + "step": 3153 + }, + { + "epoch": 0.7399413489736071, + "grad_norm": 1.5570283468737232, + "learning_rate": 1.9252801776208214e-06, + "loss": 0.6545, + "step": 3154 + }, + { + "epoch": 0.7401759530791789, + "grad_norm": 1.7027751874831438, + "learning_rate": 1.922051264080912e-06, + "loss": 0.6166, + "step": 3155 + }, + { + "epoch": 0.7404105571847507, + "grad_norm": 0.892403316284046, + "learning_rate": 1.9188244160655767e-06, + "loss": 0.6034, + "step": 3156 + }, + { + "epoch": 0.7406451612903225, + "grad_norm": 1.080148125578477, + "learning_rate": 1.9155996357402682e-06, + "loss": 0.5889, + "step": 3157 + }, + { + "epoch": 0.7408797653958944, + "grad_norm": 1.0278013041046992, + "learning_rate": 1.912376925269041e-06, + "loss": 0.6308, + "step": 3158 + }, + { + "epoch": 0.7411143695014663, + "grad_norm": 1.474904404158769, + "learning_rate": 1.9091562868145665e-06, + "loss": 0.5791, + "step": 3159 + }, + { + "epoch": 0.7413489736070381, + "grad_norm": 0.8000659816307549, + "learning_rate": 1.9059377225381237e-06, + "loss": 0.604, + "step": 3160 + }, + { + "epoch": 0.74158357771261, + "grad_norm": 1.0811011568216184, + "learning_rate": 1.9027212345996004e-06, + "loss": 0.5715, + "step": 3161 + }, + { + "epoch": 0.7418181818181818, + "grad_norm": 1.351120093416797, + "learning_rate": 1.8995068251574917e-06, + "loss": 0.6177, + "step": 3162 + }, + { + "epoch": 0.7420527859237537, + "grad_norm": 2.1420836870860547, + "learning_rate": 1.8962944963688984e-06, + "loss": 0.6011, + "step": 3163 + }, + { + "epoch": 0.7422873900293255, + "grad_norm": 0.9624553925494452, + "learning_rate": 1.8930842503895203e-06, + "loss": 0.583, + "step": 3164 + }, + { + "epoch": 0.7425219941348974, + "grad_norm": 1.2035760958104174, + "learning_rate": 1.889876089373663e-06, + "loss": 0.6134, + "step": 3165 + }, + { + "epoch": 0.7427565982404692, + "grad_norm": 11.10356395089785, + "learning_rate": 1.8866700154742373e-06, + "loss": 0.5597, + "step": 3166 + }, + { + "epoch": 0.7429912023460411, + "grad_norm": 1.3005055557435543, + "learning_rate": 1.8834660308427455e-06, + "loss": 0.587, + "step": 3167 + }, + { + "epoch": 0.743225806451613, + "grad_norm": 0.8253032565548221, + "learning_rate": 1.8802641376292913e-06, + "loss": 0.5997, + "step": 3168 + }, + { + "epoch": 0.7434604105571847, + "grad_norm": 1.1465569201298969, + "learning_rate": 1.8770643379825753e-06, + "loss": 0.5903, + "step": 3169 + }, + { + "epoch": 0.7436950146627566, + "grad_norm": 1.6163983338351435, + "learning_rate": 1.873866634049894e-06, + "loss": 0.5688, + "step": 3170 + }, + { + "epoch": 0.7439296187683284, + "grad_norm": 1.8421419088577964, + "learning_rate": 1.8706710279771344e-06, + "loss": 0.6198, + "step": 3171 + }, + { + "epoch": 0.7441642228739003, + "grad_norm": 1.1628522233856817, + "learning_rate": 1.867477521908781e-06, + "loss": 0.6059, + "step": 3172 + }, + { + "epoch": 0.7443988269794721, + "grad_norm": 1.1566810176453983, + "learning_rate": 1.8642861179878996e-06, + "loss": 0.6138, + "step": 3173 + }, + { + "epoch": 0.744633431085044, + "grad_norm": 1.7677356551185617, + "learning_rate": 1.8610968183561568e-06, + "loss": 0.6072, + "step": 3174 + }, + { + "epoch": 0.7448680351906158, + "grad_norm": 3.209839176190197, + "learning_rate": 1.8579096251538014e-06, + "loss": 0.5988, + "step": 3175 + }, + { + "epoch": 0.7451026392961877, + "grad_norm": 2.7197831295626513, + "learning_rate": 1.8547245405196656e-06, + "loss": 0.6332, + "step": 3176 + }, + { + "epoch": 0.7453372434017596, + "grad_norm": 1.1208861821326568, + "learning_rate": 1.8515415665911718e-06, + "loss": 0.6438, + "step": 3177 + }, + { + "epoch": 0.7455718475073314, + "grad_norm": 1.1706103880893302, + "learning_rate": 1.8483607055043234e-06, + "loss": 0.6166, + "step": 3178 + }, + { + "epoch": 0.7458064516129033, + "grad_norm": 1.047508343781435, + "learning_rate": 1.845181959393707e-06, + "loss": 0.658, + "step": 3179 + }, + { + "epoch": 0.7460410557184751, + "grad_norm": 1.4765537772475474, + "learning_rate": 1.8420053303924907e-06, + "loss": 0.6553, + "step": 3180 + }, + { + "epoch": 0.7462756598240469, + "grad_norm": 1.4285242167071128, + "learning_rate": 1.8388308206324163e-06, + "loss": 0.628, + "step": 3181 + }, + { + "epoch": 0.7465102639296187, + "grad_norm": 1.5413870290351732, + "learning_rate": 1.835658432243812e-06, + "loss": 0.6168, + "step": 3182 + }, + { + "epoch": 0.7467448680351906, + "grad_norm": 1.162157085894512, + "learning_rate": 1.832488167355579e-06, + "loss": 0.6126, + "step": 3183 + }, + { + "epoch": 0.7469794721407624, + "grad_norm": 1.870748574842299, + "learning_rate": 1.829320028095189e-06, + "loss": 0.6164, + "step": 3184 + }, + { + "epoch": 0.7472140762463343, + "grad_norm": 3.5789738134555473, + "learning_rate": 1.8261540165886922e-06, + "loss": 0.6272, + "step": 3185 + }, + { + "epoch": 0.7474486803519061, + "grad_norm": 1.2668968034268808, + "learning_rate": 1.8229901349607094e-06, + "loss": 0.5763, + "step": 3186 + }, + { + "epoch": 0.747683284457478, + "grad_norm": 1.439955611019166, + "learning_rate": 1.8198283853344328e-06, + "loss": 0.626, + "step": 3187 + }, + { + "epoch": 0.7479178885630499, + "grad_norm": 4.236895960995736, + "learning_rate": 1.8166687698316237e-06, + "loss": 0.6326, + "step": 3188 + }, + { + "epoch": 0.7481524926686217, + "grad_norm": 0.9687410433379223, + "learning_rate": 1.8135112905726105e-06, + "loss": 0.6125, + "step": 3189 + }, + { + "epoch": 0.7483870967741936, + "grad_norm": 1.4870595340848511, + "learning_rate": 1.810355949676289e-06, + "loss": 0.6338, + "step": 3190 + }, + { + "epoch": 0.7486217008797654, + "grad_norm": 1.1158536244934172, + "learning_rate": 1.8072027492601185e-06, + "loss": 0.595, + "step": 3191 + }, + { + "epoch": 0.7488563049853373, + "grad_norm": 1.2743991860418764, + "learning_rate": 1.804051691440125e-06, + "loss": 0.592, + "step": 3192 + }, + { + "epoch": 0.7490909090909091, + "grad_norm": 0.9837471161778617, + "learning_rate": 1.8009027783308914e-06, + "loss": 0.6272, + "step": 3193 + }, + { + "epoch": 0.7493255131964809, + "grad_norm": 1.2492660965151676, + "learning_rate": 1.7977560120455645e-06, + "loss": 0.5991, + "step": 3194 + }, + { + "epoch": 0.7495601173020527, + "grad_norm": 1.1936748068108065, + "learning_rate": 1.794611394695855e-06, + "loss": 0.6599, + "step": 3195 + }, + { + "epoch": 0.7497947214076246, + "grad_norm": 1.6810118197692079, + "learning_rate": 1.7914689283920217e-06, + "loss": 0.6005, + "step": 3196 + }, + { + "epoch": 0.7500293255131965, + "grad_norm": 1.5887747861032746, + "learning_rate": 1.7883286152428876e-06, + "loss": 0.5649, + "step": 3197 + }, + { + "epoch": 0.7502639296187683, + "grad_norm": 1.8736086111526098, + "learning_rate": 1.7851904573558276e-06, + "loss": 0.6145, + "step": 3198 + }, + { + "epoch": 0.7504985337243402, + "grad_norm": 1.2735683110887916, + "learning_rate": 1.7820544568367714e-06, + "loss": 0.5978, + "step": 3199 + }, + { + "epoch": 0.750733137829912, + "grad_norm": 1.2712134023945538, + "learning_rate": 1.7789206157902023e-06, + "loss": 0.5872, + "step": 3200 + }, + { + "epoch": 0.7509677419354839, + "grad_norm": 1.2131521657913562, + "learning_rate": 1.7757889363191484e-06, + "loss": 0.6117, + "step": 3201 + }, + { + "epoch": 0.7512023460410557, + "grad_norm": 1.4220997200621723, + "learning_rate": 1.7726594205251917e-06, + "loss": 0.5951, + "step": 3202 + }, + { + "epoch": 0.7514369501466276, + "grad_norm": 1.4570245317132258, + "learning_rate": 1.7695320705084678e-06, + "loss": 0.6275, + "step": 3203 + }, + { + "epoch": 0.7516715542521994, + "grad_norm": 3.1706261340018926, + "learning_rate": 1.7664068883676478e-06, + "loss": 0.5926, + "step": 3204 + }, + { + "epoch": 0.7519061583577713, + "grad_norm": 1.6096835212893639, + "learning_rate": 1.7632838761999539e-06, + "loss": 0.6522, + "step": 3205 + }, + { + "epoch": 0.7521407624633432, + "grad_norm": 2.065351607750116, + "learning_rate": 1.760163036101153e-06, + "loss": 0.5881, + "step": 3206 + }, + { + "epoch": 0.7523753665689149, + "grad_norm": 1.1357256647355625, + "learning_rate": 1.7570443701655514e-06, + "loss": 0.6865, + "step": 3207 + }, + { + "epoch": 0.7526099706744868, + "grad_norm": 1.777097383952137, + "learning_rate": 1.7539278804859993e-06, + "loss": 0.5925, + "step": 3208 + }, + { + "epoch": 0.7528445747800586, + "grad_norm": 2.4940549327241506, + "learning_rate": 1.7508135691538852e-06, + "loss": 0.5852, + "step": 3209 + }, + { + "epoch": 0.7530791788856305, + "grad_norm": 1.4418911966589327, + "learning_rate": 1.7477014382591317e-06, + "loss": 0.6765, + "step": 3210 + }, + { + "epoch": 0.7533137829912023, + "grad_norm": 1.6796218309635522, + "learning_rate": 1.7445914898902072e-06, + "loss": 0.596, + "step": 3211 + }, + { + "epoch": 0.7535483870967742, + "grad_norm": 2.4018444163431614, + "learning_rate": 1.7414837261341095e-06, + "loss": 0.6741, + "step": 3212 + }, + { + "epoch": 0.753782991202346, + "grad_norm": 1.9184834069018, + "learning_rate": 1.7383781490763678e-06, + "loss": 0.6114, + "step": 3213 + }, + { + "epoch": 0.7540175953079179, + "grad_norm": 2.32381471595833, + "learning_rate": 1.7352747608010495e-06, + "loss": 0.6452, + "step": 3214 + }, + { + "epoch": 0.7542521994134898, + "grad_norm": 1.4461303924946431, + "learning_rate": 1.732173563390751e-06, + "loss": 0.6317, + "step": 3215 + }, + { + "epoch": 0.7544868035190616, + "grad_norm": 1.882361661059761, + "learning_rate": 1.7290745589265979e-06, + "loss": 0.6106, + "step": 3216 + }, + { + "epoch": 0.7547214076246335, + "grad_norm": 1.237694940304862, + "learning_rate": 1.7259777494882452e-06, + "loss": 0.5929, + "step": 3217 + }, + { + "epoch": 0.7549560117302053, + "grad_norm": 2.42434472479123, + "learning_rate": 1.722883137153874e-06, + "loss": 0.5957, + "step": 3218 + }, + { + "epoch": 0.7551906158357772, + "grad_norm": 1.168549637164787, + "learning_rate": 1.7197907240001915e-06, + "loss": 0.5911, + "step": 3219 + }, + { + "epoch": 0.7554252199413489, + "grad_norm": 1.4714442871452822, + "learning_rate": 1.716700512102429e-06, + "loss": 0.5573, + "step": 3220 + }, + { + "epoch": 0.7556598240469208, + "grad_norm": 1.2759502742647681, + "learning_rate": 1.7136125035343416e-06, + "loss": 0.5863, + "step": 3221 + }, + { + "epoch": 0.7558944281524926, + "grad_norm": 1.105407342353222, + "learning_rate": 1.7105267003682019e-06, + "loss": 0.6292, + "step": 3222 + }, + { + "epoch": 0.7561290322580645, + "grad_norm": 2.1463339906352097, + "learning_rate": 1.7074431046748075e-06, + "loss": 0.6516, + "step": 3223 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 1.3566818248828167, + "learning_rate": 1.7043617185234718e-06, + "loss": 0.6561, + "step": 3224 + }, + { + "epoch": 0.7565982404692082, + "grad_norm": 1.391925597739797, + "learning_rate": 1.701282543982027e-06, + "loss": 0.6495, + "step": 3225 + }, + { + "epoch": 0.7568328445747801, + "grad_norm": 1.052403619890732, + "learning_rate": 1.6982055831168187e-06, + "loss": 0.6365, + "step": 3226 + }, + { + "epoch": 0.7570674486803519, + "grad_norm": 0.9718241393674357, + "learning_rate": 1.6951308379927102e-06, + "loss": 0.6318, + "step": 3227 + }, + { + "epoch": 0.7573020527859238, + "grad_norm": 1.5025567820685952, + "learning_rate": 1.6920583106730749e-06, + "loss": 0.6091, + "step": 3228 + }, + { + "epoch": 0.7575366568914956, + "grad_norm": 1.082148010090495, + "learning_rate": 1.688988003219802e-06, + "loss": 0.641, + "step": 3229 + }, + { + "epoch": 0.7577712609970675, + "grad_norm": 1.183491718195806, + "learning_rate": 1.685919917693284e-06, + "loss": 0.6996, + "step": 3230 + }, + { + "epoch": 0.7580058651026393, + "grad_norm": 1.0837438815945954, + "learning_rate": 1.682854056152427e-06, + "loss": 0.6328, + "step": 3231 + }, + { + "epoch": 0.7582404692082112, + "grad_norm": 1.8852225998302086, + "learning_rate": 1.6797904206546495e-06, + "loss": 0.6047, + "step": 3232 + }, + { + "epoch": 0.7584750733137829, + "grad_norm": 3.0390157560807944, + "learning_rate": 1.676729013255865e-06, + "loss": 0.5883, + "step": 3233 + }, + { + "epoch": 0.7587096774193548, + "grad_norm": 1.4808623864273756, + "learning_rate": 1.6736698360105003e-06, + "loss": 0.582, + "step": 3234 + }, + { + "epoch": 0.7589442815249267, + "grad_norm": 1.3712063641076089, + "learning_rate": 1.6706128909714819e-06, + "loss": 0.6305, + "step": 3235 + }, + { + "epoch": 0.7591788856304985, + "grad_norm": 1.4806061227165328, + "learning_rate": 1.6675581801902403e-06, + "loss": 0.5892, + "step": 3236 + }, + { + "epoch": 0.7594134897360704, + "grad_norm": 0.8762509533048858, + "learning_rate": 1.6645057057167059e-06, + "loss": 0.616, + "step": 3237 + }, + { + "epoch": 0.7596480938416422, + "grad_norm": 1.1895038329000147, + "learning_rate": 1.6614554695993085e-06, + "loss": 0.6203, + "step": 3238 + }, + { + "epoch": 0.7598826979472141, + "grad_norm": 2.3442459561327187, + "learning_rate": 1.658407473884972e-06, + "loss": 0.6605, + "step": 3239 + }, + { + "epoch": 0.7601173020527859, + "grad_norm": 2.0882630174993557, + "learning_rate": 1.6553617206191247e-06, + "loss": 0.6289, + "step": 3240 + }, + { + "epoch": 0.7603519061583578, + "grad_norm": 2.046616949155505, + "learning_rate": 1.6523182118456855e-06, + "loss": 0.6084, + "step": 3241 + }, + { + "epoch": 0.7605865102639296, + "grad_norm": 1.489896729297481, + "learning_rate": 1.6492769496070642e-06, + "loss": 0.6511, + "step": 3242 + }, + { + "epoch": 0.7608211143695015, + "grad_norm": 0.9596870948622265, + "learning_rate": 1.6462379359441682e-06, + "loss": 0.5915, + "step": 3243 + }, + { + "epoch": 0.7610557184750734, + "grad_norm": 0.8713245464286309, + "learning_rate": 1.6432011728963936e-06, + "loss": 0.6004, + "step": 3244 + }, + { + "epoch": 0.7612903225806451, + "grad_norm": 0.9299015133859897, + "learning_rate": 1.640166662501626e-06, + "loss": 0.5891, + "step": 3245 + }, + { + "epoch": 0.761524926686217, + "grad_norm": 2.007429978690424, + "learning_rate": 1.6371344067962413e-06, + "loss": 0.5883, + "step": 3246 + }, + { + "epoch": 0.7617595307917888, + "grad_norm": 0.7109673104179589, + "learning_rate": 1.6341044078150964e-06, + "loss": 0.6513, + "step": 3247 + }, + { + "epoch": 0.7619941348973607, + "grad_norm": 2.522006058325781, + "learning_rate": 1.631076667591543e-06, + "loss": 0.5992, + "step": 3248 + }, + { + "epoch": 0.7622287390029325, + "grad_norm": 1.1163809240656386, + "learning_rate": 1.6280511881574119e-06, + "loss": 0.6283, + "step": 3249 + }, + { + "epoch": 0.7624633431085044, + "grad_norm": 1.186669010120512, + "learning_rate": 1.625027971543014e-06, + "loss": 0.5698, + "step": 3250 + }, + { + "epoch": 0.7626979472140762, + "grad_norm": 1.3475417859065295, + "learning_rate": 1.622007019777146e-06, + "loss": 0.5819, + "step": 3251 + }, + { + "epoch": 0.7629325513196481, + "grad_norm": 1.3745759835709592, + "learning_rate": 1.6189883348870837e-06, + "loss": 0.6516, + "step": 3252 + }, + { + "epoch": 0.76316715542522, + "grad_norm": 0.9443505990754515, + "learning_rate": 1.615971918898581e-06, + "loss": 0.5694, + "step": 3253 + }, + { + "epoch": 0.7634017595307918, + "grad_norm": 0.7468947245991708, + "learning_rate": 1.6129577738358704e-06, + "loss": 0.6225, + "step": 3254 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 1.9303938491739363, + "learning_rate": 1.6099459017216606e-06, + "loss": 0.6212, + "step": 3255 + }, + { + "epoch": 0.7638709677419355, + "grad_norm": 3.6901644762067445, + "learning_rate": 1.606936304577129e-06, + "loss": 0.6265, + "step": 3256 + }, + { + "epoch": 0.7641055718475074, + "grad_norm": 2.807876702342237, + "learning_rate": 1.6039289844219369e-06, + "loss": 0.5838, + "step": 3257 + }, + { + "epoch": 0.7643401759530791, + "grad_norm": 9.607632025709526, + "learning_rate": 1.6009239432742114e-06, + "loss": 0.6052, + "step": 3258 + }, + { + "epoch": 0.764574780058651, + "grad_norm": 2.4273439455654806, + "learning_rate": 1.5979211831505475e-06, + "loss": 0.6034, + "step": 3259 + }, + { + "epoch": 0.7648093841642228, + "grad_norm": 2.565759391994739, + "learning_rate": 1.5949207060660138e-06, + "loss": 0.6173, + "step": 3260 + }, + { + "epoch": 0.7650439882697947, + "grad_norm": 1.960429815561841, + "learning_rate": 1.5919225140341493e-06, + "loss": 0.6673, + "step": 3261 + }, + { + "epoch": 0.7652785923753666, + "grad_norm": 0.725331766460454, + "learning_rate": 1.5889266090669525e-06, + "loss": 0.653, + "step": 3262 + }, + { + "epoch": 0.7655131964809384, + "grad_norm": 2.4796172938947345, + "learning_rate": 1.5859329931748918e-06, + "loss": 0.6185, + "step": 3263 + }, + { + "epoch": 0.7657478005865103, + "grad_norm": 1.4892279586421546, + "learning_rate": 1.582941668366898e-06, + "loss": 0.6473, + "step": 3264 + }, + { + "epoch": 0.7659824046920821, + "grad_norm": 2.410209132399163, + "learning_rate": 1.579952636650366e-06, + "loss": 0.6977, + "step": 3265 + }, + { + "epoch": 0.766217008797654, + "grad_norm": 4.559312836778912, + "learning_rate": 1.5769659000311489e-06, + "loss": 0.6302, + "step": 3266 + }, + { + "epoch": 0.7664516129032258, + "grad_norm": 1.389797322576221, + "learning_rate": 1.5739814605135645e-06, + "loss": 0.5953, + "step": 3267 + }, + { + "epoch": 0.7666862170087977, + "grad_norm": 3.8276136266417145, + "learning_rate": 1.5709993201003827e-06, + "loss": 0.6052, + "step": 3268 + }, + { + "epoch": 0.7669208211143695, + "grad_norm": 0.8715327110089265, + "learning_rate": 1.5680194807928328e-06, + "loss": 0.5987, + "step": 3269 + }, + { + "epoch": 0.7671554252199414, + "grad_norm": 2.118969297046594, + "learning_rate": 1.5650419445906062e-06, + "loss": 0.5737, + "step": 3270 + }, + { + "epoch": 0.7673900293255131, + "grad_norm": 3.9105429891345294, + "learning_rate": 1.5620667134918389e-06, + "loss": 0.6153, + "step": 3271 + }, + { + "epoch": 0.767624633431085, + "grad_norm": 1.7709376221271529, + "learning_rate": 1.5590937894931263e-06, + "loss": 0.6535, + "step": 3272 + }, + { + "epoch": 0.7678592375366569, + "grad_norm": 1.6513279442972413, + "learning_rate": 1.5561231745895127e-06, + "loss": 0.6863, + "step": 3273 + }, + { + "epoch": 0.7680938416422287, + "grad_norm": 1.1747444830281086, + "learning_rate": 1.5531548707744958e-06, + "loss": 0.6106, + "step": 3274 + }, + { + "epoch": 0.7683284457478006, + "grad_norm": 1.035057859924388, + "learning_rate": 1.5501888800400205e-06, + "loss": 0.6253, + "step": 3275 + }, + { + "epoch": 0.7685630498533724, + "grad_norm": 1.3552243673443785, + "learning_rate": 1.5472252043764775e-06, + "loss": 0.5905, + "step": 3276 + }, + { + "epoch": 0.7687976539589443, + "grad_norm": 2.0780799228519258, + "learning_rate": 1.544263845772705e-06, + "loss": 0.6417, + "step": 3277 + }, + { + "epoch": 0.7690322580645161, + "grad_norm": 1.105916728228006, + "learning_rate": 1.541304806215993e-06, + "loss": 0.6249, + "step": 3278 + }, + { + "epoch": 0.769266862170088, + "grad_norm": 1.7097551252747039, + "learning_rate": 1.5383480876920636e-06, + "loss": 0.5781, + "step": 3279 + }, + { + "epoch": 0.7695014662756599, + "grad_norm": 1.759568915164601, + "learning_rate": 1.5353936921850898e-06, + "loss": 0.6555, + "step": 3280 + }, + { + "epoch": 0.7697360703812317, + "grad_norm": 1.8502303481125513, + "learning_rate": 1.532441621677682e-06, + "loss": 0.6013, + "step": 3281 + }, + { + "epoch": 0.7699706744868036, + "grad_norm": 1.5160790920259053, + "learning_rate": 1.5294918781508927e-06, + "loss": 0.5995, + "step": 3282 + }, + { + "epoch": 0.7702052785923754, + "grad_norm": 1.2596422018253846, + "learning_rate": 1.5265444635842109e-06, + "loss": 0.6085, + "step": 3283 + }, + { + "epoch": 0.7704398826979472, + "grad_norm": 1.2229870670497078, + "learning_rate": 1.5235993799555648e-06, + "loss": 0.6861, + "step": 3284 + }, + { + "epoch": 0.770674486803519, + "grad_norm": 2.460475240227371, + "learning_rate": 1.5206566292413123e-06, + "loss": 0.5767, + "step": 3285 + }, + { + "epoch": 0.7709090909090909, + "grad_norm": 1.4910074561987794, + "learning_rate": 1.5177162134162554e-06, + "loss": 0.5958, + "step": 3286 + }, + { + "epoch": 0.7711436950146627, + "grad_norm": 1.411344643686666, + "learning_rate": 1.5147781344536244e-06, + "loss": 0.6117, + "step": 3287 + }, + { + "epoch": 0.7713782991202346, + "grad_norm": 1.0327102545913112, + "learning_rate": 1.511842394325077e-06, + "loss": 0.6305, + "step": 3288 + }, + { + "epoch": 0.7716129032258064, + "grad_norm": 6.381707169698441, + "learning_rate": 1.5089089950007085e-06, + "loss": 0.6463, + "step": 3289 + }, + { + "epoch": 0.7718475073313783, + "grad_norm": 1.1841337846414857, + "learning_rate": 1.5059779384490398e-06, + "loss": 0.6157, + "step": 3290 + }, + { + "epoch": 0.7720821114369502, + "grad_norm": 0.984782978062202, + "learning_rate": 1.5030492266370206e-06, + "loss": 0.5964, + "step": 3291 + }, + { + "epoch": 0.772316715542522, + "grad_norm": 2.110256966634448, + "learning_rate": 1.5001228615300263e-06, + "loss": 0.6114, + "step": 3292 + }, + { + "epoch": 0.7725513196480939, + "grad_norm": 1.8412856984855048, + "learning_rate": 1.497198845091858e-06, + "loss": 0.6028, + "step": 3293 + }, + { + "epoch": 0.7727859237536657, + "grad_norm": 1.0422277408556229, + "learning_rate": 1.494277179284741e-06, + "loss": 0.6307, + "step": 3294 + }, + { + "epoch": 0.7730205278592376, + "grad_norm": 3.1943671941253546, + "learning_rate": 1.491357866069324e-06, + "loss": 0.6182, + "step": 3295 + }, + { + "epoch": 0.7732551319648094, + "grad_norm": 1.3588517103236077, + "learning_rate": 1.488440907404673e-06, + "loss": 0.6184, + "step": 3296 + }, + { + "epoch": 0.7734897360703812, + "grad_norm": 1.028187402162887, + "learning_rate": 1.4855263052482783e-06, + "loss": 0.626, + "step": 3297 + }, + { + "epoch": 0.773724340175953, + "grad_norm": 4.933856366207103, + "learning_rate": 1.482614061556047e-06, + "loss": 0.5675, + "step": 3298 + }, + { + "epoch": 0.7739589442815249, + "grad_norm": 1.287848599817723, + "learning_rate": 1.4797041782823035e-06, + "loss": 0.6397, + "step": 3299 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 1.5406686761677237, + "learning_rate": 1.4767966573797886e-06, + "loss": 0.6583, + "step": 3300 + }, + { + "epoch": 0.7744281524926686, + "grad_norm": 1.8461556875770175, + "learning_rate": 1.4738915007996574e-06, + "loss": 0.6391, + "step": 3301 + }, + { + "epoch": 0.7746627565982405, + "grad_norm": 1.3167080216353213, + "learning_rate": 1.470988710491479e-06, + "loss": 0.6444, + "step": 3302 + }, + { + "epoch": 0.7748973607038123, + "grad_norm": 1.5707265556655843, + "learning_rate": 1.4680882884032333e-06, + "loss": 0.6416, + "step": 3303 + }, + { + "epoch": 0.7751319648093842, + "grad_norm": 0.8937236046784137, + "learning_rate": 1.465190236481314e-06, + "loss": 0.6134, + "step": 3304 + }, + { + "epoch": 0.775366568914956, + "grad_norm": 2.3669902915170695, + "learning_rate": 1.4622945566705188e-06, + "loss": 0.6163, + "step": 3305 + }, + { + "epoch": 0.7756011730205279, + "grad_norm": 1.6036349563529806, + "learning_rate": 1.4594012509140566e-06, + "loss": 0.577, + "step": 3306 + }, + { + "epoch": 0.7758357771260997, + "grad_norm": 1.2137070662818732, + "learning_rate": 1.456510321153548e-06, + "loss": 0.6103, + "step": 3307 + }, + { + "epoch": 0.7760703812316716, + "grad_norm": 1.0992178316279348, + "learning_rate": 1.4536217693290094e-06, + "loss": 0.6119, + "step": 3308 + }, + { + "epoch": 0.7763049853372433, + "grad_norm": 0.7671658076737455, + "learning_rate": 1.450735597378869e-06, + "loss": 0.679, + "step": 3309 + }, + { + "epoch": 0.7765395894428152, + "grad_norm": 5.060920076699434, + "learning_rate": 1.447851807239954e-06, + "loss": 0.6404, + "step": 3310 + }, + { + "epoch": 0.7767741935483871, + "grad_norm": 1.260822906990352, + "learning_rate": 1.4449704008474958e-06, + "loss": 0.6321, + "step": 3311 + }, + { + "epoch": 0.7770087976539589, + "grad_norm": 1.6548344036130198, + "learning_rate": 1.4420913801351238e-06, + "loss": 0.576, + "step": 3312 + }, + { + "epoch": 0.7772434017595308, + "grad_norm": 2.153777299956326, + "learning_rate": 1.4392147470348706e-06, + "loss": 0.6446, + "step": 3313 + }, + { + "epoch": 0.7774780058651026, + "grad_norm": 1.823554274049185, + "learning_rate": 1.4363405034771578e-06, + "loss": 0.6322, + "step": 3314 + }, + { + "epoch": 0.7777126099706745, + "grad_norm": 1.5847096202332276, + "learning_rate": 1.4334686513908142e-06, + "loss": 0.5185, + "step": 3315 + }, + { + "epoch": 0.7779472140762463, + "grad_norm": 3.054059340863301, + "learning_rate": 1.4305991927030583e-06, + "loss": 0.6468, + "step": 3316 + }, + { + "epoch": 0.7781818181818182, + "grad_norm": 3.064431629624091, + "learning_rate": 1.4277321293395002e-06, + "loss": 0.6333, + "step": 3317 + }, + { + "epoch": 0.77841642228739, + "grad_norm": 1.2313262936700589, + "learning_rate": 1.424867463224147e-06, + "loss": 0.635, + "step": 3318 + }, + { + "epoch": 0.7786510263929619, + "grad_norm": 1.9063299113047427, + "learning_rate": 1.4220051962793952e-06, + "loss": 0.6024, + "step": 3319 + }, + { + "epoch": 0.7788856304985338, + "grad_norm": 1.9564053300211146, + "learning_rate": 1.4191453304260305e-06, + "loss": 0.6241, + "step": 3320 + }, + { + "epoch": 0.7791202346041056, + "grad_norm": 8.484623799086485, + "learning_rate": 1.4162878675832313e-06, + "loss": 0.6786, + "step": 3321 + }, + { + "epoch": 0.7793548387096774, + "grad_norm": 3.435443621340251, + "learning_rate": 1.4134328096685546e-06, + "loss": 0.6067, + "step": 3322 + }, + { + "epoch": 0.7795894428152492, + "grad_norm": 1.2425308672133275, + "learning_rate": 1.4105801585979546e-06, + "loss": 0.6332, + "step": 3323 + }, + { + "epoch": 0.7798240469208211, + "grad_norm": 1.1949842893128475, + "learning_rate": 1.407729916285764e-06, + "loss": 0.5866, + "step": 3324 + }, + { + "epoch": 0.7800586510263929, + "grad_norm": 2.9335507153002567, + "learning_rate": 1.4048820846446976e-06, + "loss": 0.5974, + "step": 3325 + }, + { + "epoch": 0.7802932551319648, + "grad_norm": 2.6593441141100866, + "learning_rate": 1.4020366655858565e-06, + "loss": 0.6377, + "step": 3326 + }, + { + "epoch": 0.7805278592375366, + "grad_norm": 2.245946615688388, + "learning_rate": 1.3991936610187208e-06, + "loss": 0.6443, + "step": 3327 + }, + { + "epoch": 0.7807624633431085, + "grad_norm": 2.714745053001885, + "learning_rate": 1.396353072851151e-06, + "loss": 0.6029, + "step": 3328 + }, + { + "epoch": 0.7809970674486804, + "grad_norm": 2.998846477985479, + "learning_rate": 1.3935149029893852e-06, + "loss": 0.6465, + "step": 3329 + }, + { + "epoch": 0.7812316715542522, + "grad_norm": 1.791661294773031, + "learning_rate": 1.3906791533380398e-06, + "loss": 0.6322, + "step": 3330 + }, + { + "epoch": 0.7814662756598241, + "grad_norm": 2.223055666219806, + "learning_rate": 1.387845825800106e-06, + "loss": 0.6332, + "step": 3331 + }, + { + "epoch": 0.7817008797653959, + "grad_norm": 1.040131659888429, + "learning_rate": 1.38501492227695e-06, + "loss": 0.6324, + "step": 3332 + }, + { + "epoch": 0.7819354838709678, + "grad_norm": 1.9881791871211159, + "learning_rate": 1.3821864446683126e-06, + "loss": 0.6359, + "step": 3333 + }, + { + "epoch": 0.7821700879765396, + "grad_norm": 1.6620906650821152, + "learning_rate": 1.3793603948723027e-06, + "loss": 0.5877, + "step": 3334 + }, + { + "epoch": 0.7824046920821114, + "grad_norm": 1.7234082345660948, + "learning_rate": 1.3765367747854031e-06, + "loss": 0.6428, + "step": 3335 + }, + { + "epoch": 0.7826392961876832, + "grad_norm": 1.3111511248431982, + "learning_rate": 1.3737155863024692e-06, + "loss": 0.5781, + "step": 3336 + }, + { + "epoch": 0.7828739002932551, + "grad_norm": 0.8901252471828437, + "learning_rate": 1.3708968313167165e-06, + "loss": 0.5707, + "step": 3337 + }, + { + "epoch": 0.783108504398827, + "grad_norm": 2.656244122080412, + "learning_rate": 1.3680805117197343e-06, + "loss": 0.6339, + "step": 3338 + }, + { + "epoch": 0.7833431085043988, + "grad_norm": 0.9263622289797381, + "learning_rate": 1.3652666294014744e-06, + "loss": 0.5491, + "step": 3339 + }, + { + "epoch": 0.7835777126099707, + "grad_norm": 0.9188490814641627, + "learning_rate": 1.362455186250254e-06, + "loss": 0.6536, + "step": 3340 + }, + { + "epoch": 0.7838123167155425, + "grad_norm": 1.1099594772663544, + "learning_rate": 1.3596461841527553e-06, + "loss": 0.6266, + "step": 3341 + }, + { + "epoch": 0.7840469208211144, + "grad_norm": 2.1744789612847817, + "learning_rate": 1.356839624994017e-06, + "loss": 0.6167, + "step": 3342 + }, + { + "epoch": 0.7842815249266862, + "grad_norm": 1.2617211581651355, + "learning_rate": 1.3540355106574415e-06, + "loss": 0.6239, + "step": 3343 + }, + { + "epoch": 0.7845161290322581, + "grad_norm": 1.9742936170049474, + "learning_rate": 1.3512338430247962e-06, + "loss": 0.5839, + "step": 3344 + }, + { + "epoch": 0.78475073313783, + "grad_norm": 1.0826767865312767, + "learning_rate": 1.348434623976196e-06, + "loss": 0.6618, + "step": 3345 + }, + { + "epoch": 0.7849853372434018, + "grad_norm": 2.185132353567675, + "learning_rate": 1.3456378553901195e-06, + "loss": 0.5943, + "step": 3346 + }, + { + "epoch": 0.7852199413489737, + "grad_norm": 1.048066889707301, + "learning_rate": 1.3428435391433992e-06, + "loss": 0.6473, + "step": 3347 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 1.231194490593031, + "learning_rate": 1.340051677111222e-06, + "loss": 0.5767, + "step": 3348 + }, + { + "epoch": 0.7856891495601173, + "grad_norm": 1.4452062806085455, + "learning_rate": 1.3372622711671274e-06, + "loss": 0.6548, + "step": 3349 + }, + { + "epoch": 0.7859237536656891, + "grad_norm": 1.1827767162128813, + "learning_rate": 1.3344753231830094e-06, + "loss": 0.6323, + "step": 3350 + }, + { + "epoch": 0.786158357771261, + "grad_norm": 0.9452392352350582, + "learning_rate": 1.3316908350291047e-06, + "loss": 0.6468, + "step": 3351 + }, + { + "epoch": 0.7863929618768328, + "grad_norm": 2.4247209122080378, + "learning_rate": 1.3289088085740097e-06, + "loss": 0.5795, + "step": 3352 + }, + { + "epoch": 0.7866275659824047, + "grad_norm": 1.2327284477556488, + "learning_rate": 1.3261292456846648e-06, + "loss": 0.6387, + "step": 3353 + }, + { + "epoch": 0.7868621700879765, + "grad_norm": 0.950831340782894, + "learning_rate": 1.323352148226351e-06, + "loss": 0.5497, + "step": 3354 + }, + { + "epoch": 0.7870967741935484, + "grad_norm": 1.0809008969675262, + "learning_rate": 1.320577518062704e-06, + "loss": 0.6244, + "step": 3355 + }, + { + "epoch": 0.7873313782991203, + "grad_norm": 1.1115781736671435, + "learning_rate": 1.317805357055698e-06, + "loss": 0.5876, + "step": 3356 + }, + { + "epoch": 0.7875659824046921, + "grad_norm": 1.3060102063465862, + "learning_rate": 1.3150356670656533e-06, + "loss": 0.6285, + "step": 3357 + }, + { + "epoch": 0.787800586510264, + "grad_norm": 2.1245940015772193, + "learning_rate": 1.31226844995123e-06, + "loss": 0.6092, + "step": 3358 + }, + { + "epoch": 0.7880351906158358, + "grad_norm": 1.8045950580074606, + "learning_rate": 1.3095037075694312e-06, + "loss": 0.6315, + "step": 3359 + }, + { + "epoch": 0.7882697947214077, + "grad_norm": 1.0368790165036073, + "learning_rate": 1.3067414417755931e-06, + "loss": 0.6301, + "step": 3360 + }, + { + "epoch": 0.7885043988269794, + "grad_norm": 12.983901457301942, + "learning_rate": 1.3039816544233986e-06, + "loss": 0.5728, + "step": 3361 + }, + { + "epoch": 0.7887390029325513, + "grad_norm": 1.354932499189322, + "learning_rate": 1.3012243473648633e-06, + "loss": 0.6179, + "step": 3362 + }, + { + "epoch": 0.7889736070381231, + "grad_norm": 3.908539668487798, + "learning_rate": 1.2984695224503351e-06, + "loss": 0.618, + "step": 3363 + }, + { + "epoch": 0.789208211143695, + "grad_norm": 0.860678478738657, + "learning_rate": 1.2957171815285014e-06, + "loss": 0.6253, + "step": 3364 + }, + { + "epoch": 0.7894428152492668, + "grad_norm": 1.5337010548258105, + "learning_rate": 1.2929673264463798e-06, + "loss": 0.6361, + "step": 3365 + }, + { + "epoch": 0.7896774193548387, + "grad_norm": 1.8146665153796193, + "learning_rate": 1.2902199590493209e-06, + "loss": 0.5863, + "step": 3366 + }, + { + "epoch": 0.7899120234604106, + "grad_norm": 1.1848692429594583, + "learning_rate": 1.2874750811810062e-06, + "loss": 0.5762, + "step": 3367 + }, + { + "epoch": 0.7901466275659824, + "grad_norm": 1.0084090285521348, + "learning_rate": 1.2847326946834427e-06, + "loss": 0.5974, + "step": 3368 + }, + { + "epoch": 0.7903812316715543, + "grad_norm": 1.8962677897193514, + "learning_rate": 1.2819928013969717e-06, + "loss": 0.645, + "step": 3369 + }, + { + "epoch": 0.7906158357771261, + "grad_norm": 2.5131656674707563, + "learning_rate": 1.279255403160259e-06, + "loss": 0.6142, + "step": 3370 + }, + { + "epoch": 0.790850439882698, + "grad_norm": 1.1443527817593289, + "learning_rate": 1.2765205018102928e-06, + "loss": 0.6114, + "step": 3371 + }, + { + "epoch": 0.7910850439882698, + "grad_norm": 2.162940758826294, + "learning_rate": 1.2737880991823888e-06, + "loss": 0.6757, + "step": 3372 + }, + { + "epoch": 0.7913196480938416, + "grad_norm": 2.711688830845445, + "learning_rate": 1.2710581971101855e-06, + "loss": 0.6145, + "step": 3373 + }, + { + "epoch": 0.7915542521994134, + "grad_norm": 1.7964302931137748, + "learning_rate": 1.2683307974256436e-06, + "loss": 0.6805, + "step": 3374 + }, + { + "epoch": 0.7917888563049853, + "grad_norm": 1.47766141071686, + "learning_rate": 1.2656059019590445e-06, + "loss": 0.6007, + "step": 3375 + }, + { + "epoch": 0.7920234604105572, + "grad_norm": 1.13335750370577, + "learning_rate": 1.2628835125389877e-06, + "loss": 0.6222, + "step": 3376 + }, + { + "epoch": 0.792258064516129, + "grad_norm": 1.947783932586144, + "learning_rate": 1.2601636309923925e-06, + "loss": 0.5923, + "step": 3377 + }, + { + "epoch": 0.7924926686217009, + "grad_norm": 1.1599492959540594, + "learning_rate": 1.257446259144494e-06, + "loss": 0.6269, + "step": 3378 + }, + { + "epoch": 0.7927272727272727, + "grad_norm": 1.2256969852581998, + "learning_rate": 1.2547313988188464e-06, + "loss": 0.6487, + "step": 3379 + }, + { + "epoch": 0.7929618768328446, + "grad_norm": 1.1128353455031734, + "learning_rate": 1.2520190518373126e-06, + "loss": 0.6368, + "step": 3380 + }, + { + "epoch": 0.7931964809384164, + "grad_norm": 1.3646467602570638, + "learning_rate": 1.2493092200200713e-06, + "loss": 0.5822, + "step": 3381 + }, + { + "epoch": 0.7934310850439883, + "grad_norm": 1.9955223075484694, + "learning_rate": 1.24660190518562e-06, + "loss": 0.5729, + "step": 3382 + }, + { + "epoch": 0.7936656891495601, + "grad_norm": 1.0226382679391863, + "learning_rate": 1.2438971091507552e-06, + "loss": 0.5821, + "step": 3383 + }, + { + "epoch": 0.793900293255132, + "grad_norm": 0.8494424787549543, + "learning_rate": 1.2411948337305922e-06, + "loss": 0.5865, + "step": 3384 + }, + { + "epoch": 0.7941348973607039, + "grad_norm": 1.443145038082585, + "learning_rate": 1.238495080738551e-06, + "loss": 0.6101, + "step": 3385 + }, + { + "epoch": 0.7943695014662756, + "grad_norm": 2.338883443189615, + "learning_rate": 1.23579785198636e-06, + "loss": 0.6457, + "step": 3386 + }, + { + "epoch": 0.7946041055718475, + "grad_norm": 0.9019626554116807, + "learning_rate": 1.233103149284055e-06, + "loss": 0.6284, + "step": 3387 + }, + { + "epoch": 0.7948387096774193, + "grad_norm": 1.958804857967841, + "learning_rate": 1.2304109744399717e-06, + "loss": 0.6538, + "step": 3388 + }, + { + "epoch": 0.7950733137829912, + "grad_norm": 0.733177226538545, + "learning_rate": 1.2277213292607526e-06, + "loss": 0.6726, + "step": 3389 + }, + { + "epoch": 0.795307917888563, + "grad_norm": 1.2349663035512626, + "learning_rate": 1.2250342155513473e-06, + "loss": 0.6444, + "step": 3390 + }, + { + "epoch": 0.7955425219941349, + "grad_norm": 1.1684590968148054, + "learning_rate": 1.2223496351149977e-06, + "loss": 0.5985, + "step": 3391 + }, + { + "epoch": 0.7957771260997067, + "grad_norm": 0.9992866474664069, + "learning_rate": 1.2196675897532512e-06, + "loss": 0.6353, + "step": 3392 + }, + { + "epoch": 0.7960117302052786, + "grad_norm": 1.2238212730894584, + "learning_rate": 1.216988081265953e-06, + "loss": 0.5725, + "step": 3393 + }, + { + "epoch": 0.7962463343108505, + "grad_norm": 1.8022939523151673, + "learning_rate": 1.2143111114512457e-06, + "loss": 0.7001, + "step": 3394 + }, + { + "epoch": 0.7964809384164223, + "grad_norm": 0.9958332458784978, + "learning_rate": 1.211636682105568e-06, + "loss": 0.6095, + "step": 3395 + }, + { + "epoch": 0.7967155425219942, + "grad_norm": 1.2532084596987885, + "learning_rate": 1.2089647950236556e-06, + "loss": 0.572, + "step": 3396 + }, + { + "epoch": 0.796950146627566, + "grad_norm": 2.4333055534970245, + "learning_rate": 1.2062954519985321e-06, + "loss": 0.5893, + "step": 3397 + }, + { + "epoch": 0.7971847507331379, + "grad_norm": 5.836441593581866, + "learning_rate": 1.203628654821523e-06, + "loss": 0.5637, + "step": 3398 + }, + { + "epoch": 0.7974193548387096, + "grad_norm": 1.2065652498080184, + "learning_rate": 1.2009644052822407e-06, + "loss": 0.573, + "step": 3399 + }, + { + "epoch": 0.7976539589442815, + "grad_norm": 11.18946386473759, + "learning_rate": 1.1983027051685841e-06, + "loss": 0.6198, + "step": 3400 + }, + { + "epoch": 0.7978885630498533, + "grad_norm": 1.2357894570784649, + "learning_rate": 1.1956435562667479e-06, + "loss": 0.6572, + "step": 3401 + }, + { + "epoch": 0.7981231671554252, + "grad_norm": 1.8245800883873866, + "learning_rate": 1.1929869603612104e-06, + "loss": 0.6301, + "step": 3402 + }, + { + "epoch": 0.798357771260997, + "grad_norm": 0.778615321641996, + "learning_rate": 1.1903329192347397e-06, + "loss": 0.5784, + "step": 3403 + }, + { + "epoch": 0.7985923753665689, + "grad_norm": 0.9277145250923433, + "learning_rate": 1.1876814346683868e-06, + "loss": 0.6406, + "step": 3404 + }, + { + "epoch": 0.7988269794721408, + "grad_norm": 1.15173702577741, + "learning_rate": 1.1850325084414882e-06, + "loss": 0.5859, + "step": 3405 + }, + { + "epoch": 0.7990615835777126, + "grad_norm": 2.1054307208618273, + "learning_rate": 1.1823861423316647e-06, + "loss": 0.6402, + "step": 3406 + }, + { + "epoch": 0.7992961876832845, + "grad_norm": 2.9105516733225145, + "learning_rate": 1.1797423381148165e-06, + "loss": 0.6764, + "step": 3407 + }, + { + "epoch": 0.7995307917888563, + "grad_norm": 1.2208509199172028, + "learning_rate": 1.1771010975651287e-06, + "loss": 0.607, + "step": 3408 + }, + { + "epoch": 0.7997653958944282, + "grad_norm": 3.9541231802179593, + "learning_rate": 1.17446242245506e-06, + "loss": 0.6052, + "step": 3409 + }, + { + "epoch": 0.8, + "grad_norm": 1.2953208369938987, + "learning_rate": 1.1718263145553522e-06, + "loss": 0.634, + "step": 3410 + }, + { + "epoch": 0.8002346041055719, + "grad_norm": 4.350314084605691, + "learning_rate": 1.169192775635023e-06, + "loss": 0.5889, + "step": 3411 + }, + { + "epoch": 0.8004692082111436, + "grad_norm": 1.2319447970218724, + "learning_rate": 1.1665618074613655e-06, + "loss": 0.6296, + "step": 3412 + }, + { + "epoch": 0.8007038123167155, + "grad_norm": 3.177908540022959, + "learning_rate": 1.1639334117999495e-06, + "loss": 0.5964, + "step": 3413 + }, + { + "epoch": 0.8009384164222874, + "grad_norm": 3.1498970263816672, + "learning_rate": 1.1613075904146159e-06, + "loss": 0.611, + "step": 3414 + }, + { + "epoch": 0.8011730205278592, + "grad_norm": 2.3301323900155784, + "learning_rate": 1.1586843450674807e-06, + "loss": 0.5801, + "step": 3415 + }, + { + "epoch": 0.8014076246334311, + "grad_norm": 2.028950348561728, + "learning_rate": 1.15606367751893e-06, + "loss": 0.6185, + "step": 3416 + }, + { + "epoch": 0.8016422287390029, + "grad_norm": 5.647118137567237, + "learning_rate": 1.1534455895276176e-06, + "loss": 0.6362, + "step": 3417 + }, + { + "epoch": 0.8018768328445748, + "grad_norm": 1.8848009142357725, + "learning_rate": 1.1508300828504682e-06, + "loss": 0.6238, + "step": 3418 + }, + { + "epoch": 0.8021114369501466, + "grad_norm": 1.6414677876576313, + "learning_rate": 1.1482171592426793e-06, + "loss": 0.5847, + "step": 3419 + }, + { + "epoch": 0.8023460410557185, + "grad_norm": 1.5514532825097258, + "learning_rate": 1.145606820457706e-06, + "loss": 0.6086, + "step": 3420 + }, + { + "epoch": 0.8025806451612904, + "grad_norm": 1.2375321438653892, + "learning_rate": 1.1429990682472736e-06, + "loss": 0.5699, + "step": 3421 + }, + { + "epoch": 0.8028152492668622, + "grad_norm": 2.131660061599189, + "learning_rate": 1.140393904361372e-06, + "loss": 0.6265, + "step": 3422 + }, + { + "epoch": 0.8030498533724341, + "grad_norm": 1.2230117810356314, + "learning_rate": 1.137791330548253e-06, + "loss": 0.6236, + "step": 3423 + }, + { + "epoch": 0.8032844574780059, + "grad_norm": 3.017257196280345, + "learning_rate": 1.1351913485544302e-06, + "loss": 0.5798, + "step": 3424 + }, + { + "epoch": 0.8035190615835777, + "grad_norm": 1.0228870218795352, + "learning_rate": 1.13259396012468e-06, + "loss": 0.6636, + "step": 3425 + }, + { + "epoch": 0.8037536656891495, + "grad_norm": 0.9908655019649436, + "learning_rate": 1.1299991670020316e-06, + "loss": 0.6472, + "step": 3426 + }, + { + "epoch": 0.8039882697947214, + "grad_norm": 1.2559780700329926, + "learning_rate": 1.1274069709277817e-06, + "loss": 0.6223, + "step": 3427 + }, + { + "epoch": 0.8042228739002932, + "grad_norm": 2.5058246431430704, + "learning_rate": 1.1248173736414807e-06, + "loss": 0.6046, + "step": 3428 + }, + { + "epoch": 0.8044574780058651, + "grad_norm": 1.4441822987176873, + "learning_rate": 1.1222303768809317e-06, + "loss": 0.5347, + "step": 3429 + }, + { + "epoch": 0.804692082111437, + "grad_norm": 0.9891278592157156, + "learning_rate": 1.1196459823821954e-06, + "loss": 0.6416, + "step": 3430 + }, + { + "epoch": 0.8049266862170088, + "grad_norm": 3.709255185597513, + "learning_rate": 1.117064191879587e-06, + "loss": 0.6121, + "step": 3431 + }, + { + "epoch": 0.8051612903225807, + "grad_norm": 1.0709487276616474, + "learning_rate": 1.1144850071056729e-06, + "loss": 0.5815, + "step": 3432 + }, + { + "epoch": 0.8053958944281525, + "grad_norm": 1.6016675274815613, + "learning_rate": 1.111908429791273e-06, + "loss": 0.6087, + "step": 3433 + }, + { + "epoch": 0.8056304985337244, + "grad_norm": 2.5992323255194822, + "learning_rate": 1.1093344616654512e-06, + "loss": 0.6105, + "step": 3434 + }, + { + "epoch": 0.8058651026392962, + "grad_norm": 1.2465891584533022, + "learning_rate": 1.1067631044555287e-06, + "loss": 0.6321, + "step": 3435 + }, + { + "epoch": 0.8060997067448681, + "grad_norm": 1.0656886684813924, + "learning_rate": 1.104194359887072e-06, + "loss": 0.6944, + "step": 3436 + }, + { + "epoch": 0.8063343108504398, + "grad_norm": 0.8576471430758463, + "learning_rate": 1.1016282296838887e-06, + "loss": 0.5715, + "step": 3437 + }, + { + "epoch": 0.8065689149560117, + "grad_norm": 3.7196268162761603, + "learning_rate": 1.099064715568039e-06, + "loss": 0.64, + "step": 3438 + }, + { + "epoch": 0.8068035190615835, + "grad_norm": 0.9884368668341875, + "learning_rate": 1.0965038192598248e-06, + "loss": 0.5934, + "step": 3439 + }, + { + "epoch": 0.8070381231671554, + "grad_norm": 0.917770902955031, + "learning_rate": 1.0939455424777912e-06, + "loss": 0.6006, + "step": 3440 + }, + { + "epoch": 0.8072727272727273, + "grad_norm": 1.085614535387496, + "learning_rate": 1.091389886938725e-06, + "loss": 0.6211, + "step": 3441 + }, + { + "epoch": 0.8075073313782991, + "grad_norm": 1.0339829162989158, + "learning_rate": 1.0888368543576566e-06, + "loss": 0.5927, + "step": 3442 + }, + { + "epoch": 0.807741935483871, + "grad_norm": 0.978303960663678, + "learning_rate": 1.08628644644785e-06, + "loss": 0.6304, + "step": 3443 + }, + { + "epoch": 0.8079765395894428, + "grad_norm": 0.9835665862573097, + "learning_rate": 1.0837386649208165e-06, + "loss": 0.622, + "step": 3444 + }, + { + "epoch": 0.8082111436950147, + "grad_norm": 1.105736761674846, + "learning_rate": 1.0811935114862992e-06, + "loss": 0.6267, + "step": 3445 + }, + { + "epoch": 0.8084457478005865, + "grad_norm": 1.3358721873164068, + "learning_rate": 1.0786509878522773e-06, + "loss": 0.608, + "step": 3446 + }, + { + "epoch": 0.8086803519061584, + "grad_norm": 1.4031151785314153, + "learning_rate": 1.0761110957249654e-06, + "loss": 0.6285, + "step": 3447 + }, + { + "epoch": 0.8089149560117302, + "grad_norm": 1.2800841677325638, + "learning_rate": 1.0735738368088188e-06, + "loss": 0.6121, + "step": 3448 + }, + { + "epoch": 0.8091495601173021, + "grad_norm": 2.996863614109977, + "learning_rate": 1.0710392128065161e-06, + "loss": 0.6352, + "step": 3449 + }, + { + "epoch": 0.8093841642228738, + "grad_norm": 1.2062631636769132, + "learning_rate": 1.0685072254189726e-06, + "loss": 0.587, + "step": 3450 + }, + { + "epoch": 0.8096187683284457, + "grad_norm": 1.2920479404932967, + "learning_rate": 1.0659778763453343e-06, + "loss": 0.6107, + "step": 3451 + }, + { + "epoch": 0.8098533724340176, + "grad_norm": 1.8474260191248044, + "learning_rate": 1.0634511672829756e-06, + "loss": 0.589, + "step": 3452 + }, + { + "epoch": 0.8100879765395894, + "grad_norm": 0.9470352861129286, + "learning_rate": 1.0609270999275e-06, + "loss": 0.5796, + "step": 3453 + }, + { + "epoch": 0.8103225806451613, + "grad_norm": 8.115248280785327, + "learning_rate": 1.058405675972739e-06, + "loss": 0.5878, + "step": 3454 + }, + { + "epoch": 0.8105571847507331, + "grad_norm": 1.1784816455084388, + "learning_rate": 1.055886897110745e-06, + "loss": 0.5983, + "step": 3455 + }, + { + "epoch": 0.810791788856305, + "grad_norm": 0.9124461933172028, + "learning_rate": 1.0533707650318043e-06, + "loss": 0.5784, + "step": 3456 + }, + { + "epoch": 0.8110263929618768, + "grad_norm": 1.049205448579507, + "learning_rate": 1.0508572814244206e-06, + "loss": 0.5951, + "step": 3457 + }, + { + "epoch": 0.8112609970674487, + "grad_norm": 1.431606379787145, + "learning_rate": 1.0483464479753208e-06, + "loss": 0.6573, + "step": 3458 + }, + { + "epoch": 0.8114956011730206, + "grad_norm": 0.9809468144743023, + "learning_rate": 1.045838266369455e-06, + "loss": 0.6089, + "step": 3459 + }, + { + "epoch": 0.8117302052785924, + "grad_norm": 3.727388887782009, + "learning_rate": 1.0433327382899927e-06, + "loss": 0.5923, + "step": 3460 + }, + { + "epoch": 0.8119648093841643, + "grad_norm": 0.8896991320966477, + "learning_rate": 1.040829865418324e-06, + "loss": 0.5877, + "step": 3461 + }, + { + "epoch": 0.8121994134897361, + "grad_norm": 1.018395009550909, + "learning_rate": 1.0383296494340567e-06, + "loss": 0.587, + "step": 3462 + }, + { + "epoch": 0.8124340175953079, + "grad_norm": 1.287468978398467, + "learning_rate": 1.0358320920150133e-06, + "loss": 0.5934, + "step": 3463 + }, + { + "epoch": 0.8126686217008797, + "grad_norm": 2.7926171161451867, + "learning_rate": 1.0333371948372344e-06, + "loss": 0.5986, + "step": 3464 + }, + { + "epoch": 0.8129032258064516, + "grad_norm": 1.8736280494825253, + "learning_rate": 1.0308449595749781e-06, + "loss": 0.613, + "step": 3465 + }, + { + "epoch": 0.8131378299120234, + "grad_norm": 0.9349203319508304, + "learning_rate": 1.0283553879007097e-06, + "loss": 0.6019, + "step": 3466 + }, + { + "epoch": 0.8133724340175953, + "grad_norm": 2.1523798510918186, + "learning_rate": 1.0258684814851116e-06, + "loss": 0.5917, + "step": 3467 + }, + { + "epoch": 0.8136070381231671, + "grad_norm": 0.9878462442135967, + "learning_rate": 1.0233842419970773e-06, + "loss": 0.6228, + "step": 3468 + }, + { + "epoch": 0.813841642228739, + "grad_norm": 2.3537190534706305, + "learning_rate": 1.0209026711037095e-06, + "loss": 0.5871, + "step": 3469 + }, + { + "epoch": 0.8140762463343109, + "grad_norm": 1.29018941978892, + "learning_rate": 1.0184237704703198e-06, + "loss": 0.6462, + "step": 3470 + }, + { + "epoch": 0.8143108504398827, + "grad_norm": 2.022691422348032, + "learning_rate": 1.0159475417604302e-06, + "loss": 0.5954, + "step": 3471 + }, + { + "epoch": 0.8145454545454546, + "grad_norm": 0.8903482850727282, + "learning_rate": 1.0134739866357645e-06, + "loss": 0.5839, + "step": 3472 + }, + { + "epoch": 0.8147800586510264, + "grad_norm": 4.840718158812474, + "learning_rate": 1.0110031067562592e-06, + "loss": 0.6187, + "step": 3473 + }, + { + "epoch": 0.8150146627565983, + "grad_norm": 2.0406564338103657, + "learning_rate": 1.0085349037800517e-06, + "loss": 0.6272, + "step": 3474 + }, + { + "epoch": 0.8152492668621701, + "grad_norm": 3.409140244776667, + "learning_rate": 1.0060693793634817e-06, + "loss": 0.6034, + "step": 3475 + }, + { + "epoch": 0.8154838709677419, + "grad_norm": 7.243806883878034, + "learning_rate": 1.0036065351610941e-06, + "loss": 0.6321, + "step": 3476 + }, + { + "epoch": 0.8157184750733137, + "grad_norm": 7.195637294241193, + "learning_rate": 1.0011463728256338e-06, + "loss": 0.5766, + "step": 3477 + }, + { + "epoch": 0.8159530791788856, + "grad_norm": 1.3114893035862916, + "learning_rate": 9.986888940080468e-07, + "loss": 0.6126, + "step": 3478 + }, + { + "epoch": 0.8161876832844575, + "grad_norm": 2.4355311049465334, + "learning_rate": 9.962341003574777e-07, + "loss": 0.6483, + "step": 3479 + }, + { + "epoch": 0.8164222873900293, + "grad_norm": 1.6477126802661715, + "learning_rate": 9.937819935212689e-07, + "loss": 0.5685, + "step": 3480 + }, + { + "epoch": 0.8166568914956012, + "grad_norm": 0.9376230126087468, + "learning_rate": 9.913325751449604e-07, + "loss": 0.6642, + "step": 3481 + }, + { + "epoch": 0.816891495601173, + "grad_norm": 3.068464537116145, + "learning_rate": 9.88885846872289e-07, + "loss": 0.6068, + "step": 3482 + }, + { + "epoch": 0.8171260997067449, + "grad_norm": 1.012110598195739, + "learning_rate": 9.86441810345183e-07, + "loss": 0.6291, + "step": 3483 + }, + { + "epoch": 0.8173607038123167, + "grad_norm": 4.890928410756853, + "learning_rate": 9.840004672037662e-07, + "loss": 0.6299, + "step": 3484 + }, + { + "epoch": 0.8175953079178886, + "grad_norm": 0.9490324308707202, + "learning_rate": 9.815618190863569e-07, + "loss": 0.6318, + "step": 3485 + }, + { + "epoch": 0.8178299120234604, + "grad_norm": 1.0164596010633509, + "learning_rate": 9.791258676294624e-07, + "loss": 0.674, + "step": 3486 + }, + { + "epoch": 0.8180645161290323, + "grad_norm": 1.5936458082809284, + "learning_rate": 9.7669261446778e-07, + "loss": 0.7048, + "step": 3487 + }, + { + "epoch": 0.8182991202346042, + "grad_norm": 1.3760063676507426, + "learning_rate": 9.742620612341992e-07, + "loss": 0.592, + "step": 3488 + }, + { + "epoch": 0.8185337243401759, + "grad_norm": 1.2236214272978867, + "learning_rate": 9.718342095597938e-07, + "loss": 0.6422, + "step": 3489 + }, + { + "epoch": 0.8187683284457478, + "grad_norm": 0.9393637249751821, + "learning_rate": 9.694090610738282e-07, + "loss": 0.5863, + "step": 3490 + }, + { + "epoch": 0.8190029325513196, + "grad_norm": 3.0544605828552607, + "learning_rate": 9.669866174037512e-07, + "loss": 0.6415, + "step": 3491 + }, + { + "epoch": 0.8192375366568915, + "grad_norm": 0.9127177445804946, + "learning_rate": 9.645668801751945e-07, + "loss": 0.5778, + "step": 3492 + }, + { + "epoch": 0.8194721407624633, + "grad_norm": 4.315266733744367, + "learning_rate": 9.621498510119753e-07, + "loss": 0.656, + "step": 3493 + }, + { + "epoch": 0.8197067448680352, + "grad_norm": 1.6398331279048695, + "learning_rate": 9.597355315360972e-07, + "loss": 0.6588, + "step": 3494 + }, + { + "epoch": 0.819941348973607, + "grad_norm": 1.4045317259022378, + "learning_rate": 9.57323923367739e-07, + "loss": 0.5761, + "step": 3495 + }, + { + "epoch": 0.8201759530791789, + "grad_norm": 0.7886031222397599, + "learning_rate": 9.549150281252633e-07, + "loss": 0.5902, + "step": 3496 + }, + { + "epoch": 0.8204105571847508, + "grad_norm": 1.5585839263058723, + "learning_rate": 9.52508847425212e-07, + "loss": 0.5764, + "step": 3497 + }, + { + "epoch": 0.8206451612903226, + "grad_norm": 1.0925902592568077, + "learning_rate": 9.501053828823054e-07, + "loss": 0.6187, + "step": 3498 + }, + { + "epoch": 0.8208797653958945, + "grad_norm": 2.38334965345538, + "learning_rate": 9.4770463610944e-07, + "loss": 0.6244, + "step": 3499 + }, + { + "epoch": 0.8211143695014663, + "grad_norm": 1.0805049509677784, + "learning_rate": 9.453066087176916e-07, + "loss": 0.6368, + "step": 3500 + }, + { + "epoch": 0.8211143695014663, + "eval_loss": 0.6160818338394165, + "eval_runtime": 35.8884, + "eval_samples_per_second": 15.214, + "eval_steps_per_second": 0.139, + "step": 3500 + } + ], + "logging_steps": 1, + "max_steps": 4262, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.006925861060608e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}