{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3872, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025826446280991736, "grad_norm": 53.840389251708984, "learning_rate": 5e-05, "loss": 6.6687, "step": 1 }, { "epoch": 0.0005165289256198347, "grad_norm": 27.229652404785156, "learning_rate": 4.9999991771152054e-05, "loss": 6.6279, "step": 2 }, { "epoch": 0.0007747933884297521, "grad_norm": 15.289543151855469, "learning_rate": 4.999996708461365e-05, "loss": 5.0008, "step": 3 }, { "epoch": 0.0010330578512396695, "grad_norm": 13.504425048828125, "learning_rate": 4.9999925940401013e-05, "loss": 5.3606, "step": 4 }, { "epoch": 0.0012913223140495868, "grad_norm": 12.114389419555664, "learning_rate": 4.999986833854125e-05, "loss": 5.1593, "step": 5 }, { "epoch": 0.0015495867768595042, "grad_norm": 11.176727294921875, "learning_rate": 4.999979427907227e-05, "loss": 5.0326, "step": 6 }, { "epoch": 0.0018078512396694215, "grad_norm": 14.477932929992676, "learning_rate": 4.999970376204284e-05, "loss": 5.0365, "step": 7 }, { "epoch": 0.002066115702479339, "grad_norm": 12.142794609069824, "learning_rate": 4.999959678751253e-05, "loss": 4.5666, "step": 8 }, { "epoch": 0.0023243801652892563, "grad_norm": 7.995607852935791, "learning_rate": 4.999947335555177e-05, "loss": 5.18, "step": 9 }, { "epoch": 0.0025826446280991736, "grad_norm": 17.40962028503418, "learning_rate": 4.999933346624182e-05, "loss": 4.7613, "step": 10 }, { "epoch": 0.002840909090909091, "grad_norm": 9.160776138305664, "learning_rate": 4.999917711967477e-05, "loss": 4.4082, "step": 11 }, { "epoch": 0.0030991735537190084, "grad_norm": 13.619319915771484, "learning_rate": 4.999900431595353e-05, "loss": 4.4938, "step": 12 }, { "epoch": 0.0033574380165289257, "grad_norm": 10.708993911743164, "learning_rate": 4.999881505519189e-05, "loss": 4.8479, "step": 13 }, { "epoch": 0.003615702479338843, "grad_norm": 11.576859474182129, "learning_rate": 4.999860933751441e-05, "loss": 3.5541, "step": 14 }, { "epoch": 0.0038739669421487604, "grad_norm": 10.250454902648926, "learning_rate": 4.999838716305653e-05, "loss": 4.7155, "step": 15 }, { "epoch": 0.004132231404958678, "grad_norm": 14.48772144317627, "learning_rate": 4.9998148531964504e-05, "loss": 4.6896, "step": 16 }, { "epoch": 0.004390495867768595, "grad_norm": 7.666966438293457, "learning_rate": 4.999789344439543e-05, "loss": 4.6982, "step": 17 }, { "epoch": 0.0046487603305785125, "grad_norm": 7.573092937469482, "learning_rate": 4.9997621900517224e-05, "loss": 3.7451, "step": 18 }, { "epoch": 0.00490702479338843, "grad_norm": 14.484883308410645, "learning_rate": 4.999733390050866e-05, "loss": 4.8409, "step": 19 }, { "epoch": 0.005165289256198347, "grad_norm": 13.713484764099121, "learning_rate": 4.9997029444559315e-05, "loss": 3.9651, "step": 20 }, { "epoch": 0.005423553719008265, "grad_norm": 8.872648239135742, "learning_rate": 4.999670853286963e-05, "loss": 4.6157, "step": 21 }, { "epoch": 0.005681818181818182, "grad_norm": 5.978562355041504, "learning_rate": 4.9996371165650854e-05, "loss": 5.0112, "step": 22 }, { "epoch": 0.005940082644628099, "grad_norm": 5.327659606933594, "learning_rate": 4.9996017343125085e-05, "loss": 4.4115, "step": 23 }, { "epoch": 0.006198347107438017, "grad_norm": 10.541975021362305, "learning_rate": 4.9995647065525244e-05, "loss": 4.9645, "step": 24 }, { "epoch": 0.006456611570247934, "grad_norm": 14.819223403930664, "learning_rate": 4.9995260333095086e-05, "loss": 4.5675, "step": 25 }, { "epoch": 0.006714876033057851, "grad_norm": 4.339598655700684, "learning_rate": 4.999485714608919e-05, "loss": 3.5312, "step": 26 }, { "epoch": 0.006973140495867769, "grad_norm": 16.896474838256836, "learning_rate": 4.9994437504773e-05, "loss": 4.4091, "step": 27 }, { "epoch": 0.007231404958677686, "grad_norm": 11.386642456054688, "learning_rate": 4.999400140942275e-05, "loss": 3.9448, "step": 28 }, { "epoch": 0.0074896694214876035, "grad_norm": 5.738588333129883, "learning_rate": 4.999354886032554e-05, "loss": 4.533, "step": 29 }, { "epoch": 0.007747933884297521, "grad_norm": 8.14008903503418, "learning_rate": 4.999307985777928e-05, "loss": 4.7326, "step": 30 }, { "epoch": 0.008006198347107437, "grad_norm": 6.116652965545654, "learning_rate": 4.999259440209271e-05, "loss": 4.4588, "step": 31 }, { "epoch": 0.008264462809917356, "grad_norm": 4.593879699707031, "learning_rate": 4.999209249358542e-05, "loss": 4.1308, "step": 32 }, { "epoch": 0.008522727272727272, "grad_norm": 4.517784118652344, "learning_rate": 4.9991574132587815e-05, "loss": 4.4279, "step": 33 }, { "epoch": 0.00878099173553719, "grad_norm": 4.5291852951049805, "learning_rate": 4.999103931944114e-05, "loss": 4.3191, "step": 34 }, { "epoch": 0.009039256198347107, "grad_norm": 6.284876823425293, "learning_rate": 4.999048805449746e-05, "loss": 4.0255, "step": 35 }, { "epoch": 0.009297520661157025, "grad_norm": 5.008647918701172, "learning_rate": 4.9989920338119685e-05, "loss": 4.2156, "step": 36 }, { "epoch": 0.009555785123966942, "grad_norm": 7.349335670471191, "learning_rate": 4.998933617068154e-05, "loss": 4.5376, "step": 37 }, { "epoch": 0.00981404958677686, "grad_norm": 4.096324443817139, "learning_rate": 4.998873555256759e-05, "loss": 4.9153, "step": 38 }, { "epoch": 0.010072314049586776, "grad_norm": 3.7914671897888184, "learning_rate": 4.998811848417323e-05, "loss": 4.8065, "step": 39 }, { "epoch": 0.010330578512396695, "grad_norm": 5.458169460296631, "learning_rate": 4.998748496590468e-05, "loss": 3.922, "step": 40 }, { "epoch": 0.010588842975206611, "grad_norm": 4.8252272605896, "learning_rate": 4.998683499817899e-05, "loss": 4.1211, "step": 41 }, { "epoch": 0.01084710743801653, "grad_norm": 5.078761577606201, "learning_rate": 4.9986168581424034e-05, "loss": 4.2454, "step": 42 }, { "epoch": 0.011105371900826446, "grad_norm": 4.522867679595947, "learning_rate": 4.998548571607852e-05, "loss": 4.0788, "step": 43 }, { "epoch": 0.011363636363636364, "grad_norm": 5.034168243408203, "learning_rate": 4.9984786402591986e-05, "loss": 4.4371, "step": 44 }, { "epoch": 0.01162190082644628, "grad_norm": 14.258532524108887, "learning_rate": 4.99840706414248e-05, "loss": 4.7075, "step": 45 }, { "epoch": 0.011880165289256199, "grad_norm": 43.27157974243164, "learning_rate": 4.9983338433048146e-05, "loss": 4.2055, "step": 46 }, { "epoch": 0.012138429752066115, "grad_norm": 5.479022026062012, "learning_rate": 4.9982589777944035e-05, "loss": 4.5322, "step": 47 }, { "epoch": 0.012396694214876033, "grad_norm": 53.97672653198242, "learning_rate": 4.998182467660534e-05, "loss": 4.3036, "step": 48 }, { "epoch": 0.01265495867768595, "grad_norm": 2.785889148712158, "learning_rate": 4.9981043129535704e-05, "loss": 3.6971, "step": 49 }, { "epoch": 0.012913223140495868, "grad_norm": 8.316186904907227, "learning_rate": 4.9980245137249644e-05, "loss": 3.9946, "step": 50 }, { "epoch": 0.013171487603305785, "grad_norm": 3.754406213760376, "learning_rate": 4.997943070027248e-05, "loss": 4.4776, "step": 51 }, { "epoch": 0.013429752066115703, "grad_norm": 5.386639595031738, "learning_rate": 4.997859981914035e-05, "loss": 4.9292, "step": 52 }, { "epoch": 0.01368801652892562, "grad_norm": 4.383195877075195, "learning_rate": 4.9977752494400246e-05, "loss": 4.2677, "step": 53 }, { "epoch": 0.013946280991735538, "grad_norm": 3.746431589126587, "learning_rate": 4.997688872660996e-05, "loss": 4.4961, "step": 54 }, { "epoch": 0.014204545454545454, "grad_norm": 5.871412754058838, "learning_rate": 4.9976008516338124e-05, "loss": 4.0702, "step": 55 }, { "epoch": 0.014462809917355372, "grad_norm": 8.554112434387207, "learning_rate": 4.997511186416418e-05, "loss": 4.4877, "step": 56 }, { "epoch": 0.014721074380165289, "grad_norm": 4.625486373901367, "learning_rate": 4.9974198770678404e-05, "loss": 4.3615, "step": 57 }, { "epoch": 0.014979338842975207, "grad_norm": 3.3607544898986816, "learning_rate": 4.9973269236481895e-05, "loss": 4.0811, "step": 58 }, { "epoch": 0.015237603305785124, "grad_norm": 2.2759134769439697, "learning_rate": 4.997232326218657e-05, "loss": 4.4594, "step": 59 }, { "epoch": 0.015495867768595042, "grad_norm": 4.068732261657715, "learning_rate": 4.9971360848415174e-05, "loss": 4.2588, "step": 60 }, { "epoch": 0.015754132231404958, "grad_norm": 5.904776573181152, "learning_rate": 4.997038199580126e-05, "loss": 4.3282, "step": 61 }, { "epoch": 0.016012396694214875, "grad_norm": 4.295011043548584, "learning_rate": 4.996938670498923e-05, "loss": 4.5112, "step": 62 }, { "epoch": 0.016270661157024795, "grad_norm": 12.721244812011719, "learning_rate": 4.996837497663428e-05, "loss": 4.8132, "step": 63 }, { "epoch": 0.01652892561983471, "grad_norm": 6.3944621086120605, "learning_rate": 4.996734681140245e-05, "loss": 4.2785, "step": 64 }, { "epoch": 0.016787190082644628, "grad_norm": 4.764975547790527, "learning_rate": 4.996630220997058e-05, "loss": 4.2217, "step": 65 }, { "epoch": 0.017045454545454544, "grad_norm": 6.782361030578613, "learning_rate": 4.996524117302635e-05, "loss": 4.3614, "step": 66 }, { "epoch": 0.017303719008264464, "grad_norm": 5.226018905639648, "learning_rate": 4.9964163701268224e-05, "loss": 4.4106, "step": 67 }, { "epoch": 0.01756198347107438, "grad_norm": 7.595004558563232, "learning_rate": 4.9963069795405535e-05, "loss": 4.8534, "step": 68 }, { "epoch": 0.017820247933884297, "grad_norm": 6.699203968048096, "learning_rate": 4.996195945615841e-05, "loss": 3.7455, "step": 69 }, { "epoch": 0.018078512396694214, "grad_norm": 4.171411037445068, "learning_rate": 4.996083268425778e-05, "loss": 3.8616, "step": 70 }, { "epoch": 0.018336776859504134, "grad_norm": 3.505398988723755, "learning_rate": 4.9959689480445414e-05, "loss": 4.4466, "step": 71 }, { "epoch": 0.01859504132231405, "grad_norm": 2.7879478931427, "learning_rate": 4.995852984547389e-05, "loss": 4.8603, "step": 72 }, { "epoch": 0.018853305785123967, "grad_norm": 6.860620975494385, "learning_rate": 4.9957353780106606e-05, "loss": 4.9207, "step": 73 }, { "epoch": 0.019111570247933883, "grad_norm": 5.668856143951416, "learning_rate": 4.995616128511778e-05, "loss": 4.3603, "step": 74 }, { "epoch": 0.019369834710743803, "grad_norm": 5.666945457458496, "learning_rate": 4.995495236129244e-05, "loss": 4.7967, "step": 75 }, { "epoch": 0.01962809917355372, "grad_norm": 6.0366339683532715, "learning_rate": 4.995372700942642e-05, "loss": 4.6242, "step": 76 }, { "epoch": 0.019886363636363636, "grad_norm": 2.4756109714508057, "learning_rate": 4.995248523032639e-05, "loss": 4.6441, "step": 77 }, { "epoch": 0.020144628099173553, "grad_norm": 3.7523810863494873, "learning_rate": 4.9951227024809816e-05, "loss": 3.7976, "step": 78 }, { "epoch": 0.020402892561983473, "grad_norm": 16.27356719970703, "learning_rate": 4.994995239370498e-05, "loss": 4.1691, "step": 79 }, { "epoch": 0.02066115702479339, "grad_norm": 3.7802422046661377, "learning_rate": 4.9948661337851e-05, "loss": 4.3627, "step": 80 }, { "epoch": 0.020919421487603305, "grad_norm": 7.6312150955200195, "learning_rate": 4.994735385809777e-05, "loss": 4.6262, "step": 81 }, { "epoch": 0.021177685950413222, "grad_norm": 15.234808921813965, "learning_rate": 4.994602995530603e-05, "loss": 4.5146, "step": 82 }, { "epoch": 0.021435950413223142, "grad_norm": 10.243242263793945, "learning_rate": 4.9944689630347296e-05, "loss": 4.2094, "step": 83 }, { "epoch": 0.02169421487603306, "grad_norm": 3.343564033508301, "learning_rate": 4.994333288410393e-05, "loss": 4.01, "step": 84 }, { "epoch": 0.021952479338842975, "grad_norm": 2.5226852893829346, "learning_rate": 4.994195971746908e-05, "loss": 4.3948, "step": 85 }, { "epoch": 0.02221074380165289, "grad_norm": 4.347932815551758, "learning_rate": 4.994057013134672e-05, "loss": 4.592, "step": 86 }, { "epoch": 0.02246900826446281, "grad_norm": 6.666423320770264, "learning_rate": 4.993916412665162e-05, "loss": 4.3461, "step": 87 }, { "epoch": 0.022727272727272728, "grad_norm": 4.659460067749023, "learning_rate": 4.993774170430936e-05, "loss": 4.7975, "step": 88 }, { "epoch": 0.022985537190082644, "grad_norm": 3.7171742916107178, "learning_rate": 4.993630286525634e-05, "loss": 4.3346, "step": 89 }, { "epoch": 0.02324380165289256, "grad_norm": 5.627862453460693, "learning_rate": 4.993484761043976e-05, "loss": 4.5265, "step": 90 }, { "epoch": 0.02350206611570248, "grad_norm": 4.604198455810547, "learning_rate": 4.993337594081762e-05, "loss": 4.4974, "step": 91 }, { "epoch": 0.023760330578512397, "grad_norm": 5.0029425621032715, "learning_rate": 4.9931887857358726e-05, "loss": 4.0715, "step": 92 }, { "epoch": 0.024018595041322314, "grad_norm": 5.457329750061035, "learning_rate": 4.993038336104271e-05, "loss": 4.2757, "step": 93 }, { "epoch": 0.02427685950413223, "grad_norm": 4.700888156890869, "learning_rate": 4.9928862452859984e-05, "loss": 4.2882, "step": 94 }, { "epoch": 0.02453512396694215, "grad_norm": 11.442171096801758, "learning_rate": 4.992732513381178e-05, "loss": 4.5355, "step": 95 }, { "epoch": 0.024793388429752067, "grad_norm": 11.37818431854248, "learning_rate": 4.992577140491012e-05, "loss": 3.8256, "step": 96 }, { "epoch": 0.025051652892561983, "grad_norm": 9.722389221191406, "learning_rate": 4.992420126717784e-05, "loss": 4.3105, "step": 97 }, { "epoch": 0.0253099173553719, "grad_norm": 5.016815662384033, "learning_rate": 4.9922614721648565e-05, "loss": 3.7693, "step": 98 }, { "epoch": 0.02556818181818182, "grad_norm": 4.705469608306885, "learning_rate": 4.992101176936674e-05, "loss": 4.2579, "step": 99 }, { "epoch": 0.025826446280991736, "grad_norm": 4.018533706665039, "learning_rate": 4.991939241138761e-05, "loss": 4.639, "step": 100 }, { "epoch": 0.026084710743801653, "grad_norm": 4.770525932312012, "learning_rate": 4.991775664877719e-05, "loss": 4.5522, "step": 101 }, { "epoch": 0.02634297520661157, "grad_norm": 9.878790855407715, "learning_rate": 4.9916104482612335e-05, "loss": 4.4499, "step": 102 }, { "epoch": 0.02660123966942149, "grad_norm": 3.022998809814453, "learning_rate": 4.991443591398066e-05, "loss": 4.3418, "step": 103 }, { "epoch": 0.026859504132231406, "grad_norm": 3.3290250301361084, "learning_rate": 4.9912750943980616e-05, "loss": 4.0149, "step": 104 }, { "epoch": 0.027117768595041322, "grad_norm": 9.788814544677734, "learning_rate": 4.9911049573721413e-05, "loss": 4.3167, "step": 105 }, { "epoch": 0.02737603305785124, "grad_norm": 3.9538798332214355, "learning_rate": 4.990933180432309e-05, "loss": 4.0665, "step": 106 }, { "epoch": 0.02763429752066116, "grad_norm": 4.416702747344971, "learning_rate": 4.990759763691646e-05, "loss": 3.8938, "step": 107 }, { "epoch": 0.027892561983471075, "grad_norm": 3.857116460800171, "learning_rate": 4.9905847072643146e-05, "loss": 4.4948, "step": 108 }, { "epoch": 0.02815082644628099, "grad_norm": 3.1803622245788574, "learning_rate": 4.990408011265556e-05, "loss": 4.2333, "step": 109 }, { "epoch": 0.028409090909090908, "grad_norm": 4.34283971786499, "learning_rate": 4.99022967581169e-05, "loss": 4.5169, "step": 110 }, { "epoch": 0.028667355371900828, "grad_norm": 2.8400394916534424, "learning_rate": 4.990049701020115e-05, "loss": 4.3241, "step": 111 }, { "epoch": 0.028925619834710745, "grad_norm": 8.408182144165039, "learning_rate": 4.989868087009312e-05, "loss": 4.6846, "step": 112 }, { "epoch": 0.02918388429752066, "grad_norm": 2.6488664150238037, "learning_rate": 4.989684833898838e-05, "loss": 4.4083, "step": 113 }, { "epoch": 0.029442148760330578, "grad_norm": 4.068680763244629, "learning_rate": 4.9894999418093294e-05, "loss": 4.2266, "step": 114 }, { "epoch": 0.029700413223140498, "grad_norm": 4.06151008605957, "learning_rate": 4.989313410862504e-05, "loss": 4.1865, "step": 115 }, { "epoch": 0.029958677685950414, "grad_norm": 2.733849287033081, "learning_rate": 4.989125241181154e-05, "loss": 3.9368, "step": 116 }, { "epoch": 0.03021694214876033, "grad_norm": 3.380722761154175, "learning_rate": 4.988935432889155e-05, "loss": 4.4911, "step": 117 }, { "epoch": 0.030475206611570247, "grad_norm": 3.7356367111206055, "learning_rate": 4.9887439861114585e-05, "loss": 3.7288, "step": 118 }, { "epoch": 0.030733471074380167, "grad_norm": 23.165063858032227, "learning_rate": 4.988550900974095e-05, "loss": 4.7676, "step": 119 }, { "epoch": 0.030991735537190084, "grad_norm": 4.693371772766113, "learning_rate": 4.9883561776041745e-05, "loss": 4.3925, "step": 120 }, { "epoch": 0.03125, "grad_norm": 3.2422921657562256, "learning_rate": 4.9881598161298846e-05, "loss": 4.9324, "step": 121 }, { "epoch": 0.031508264462809916, "grad_norm": 5.02814245223999, "learning_rate": 4.987961816680492e-05, "loss": 4.2515, "step": 122 }, { "epoch": 0.03176652892561983, "grad_norm": 3.9163026809692383, "learning_rate": 4.987762179386342e-05, "loss": 4.3568, "step": 123 }, { "epoch": 0.03202479338842975, "grad_norm": 2.9507973194122314, "learning_rate": 4.987560904378856e-05, "loss": 4.5584, "step": 124 }, { "epoch": 0.032283057851239666, "grad_norm": 2.3179731369018555, "learning_rate": 4.9873579917905345e-05, "loss": 4.3986, "step": 125 }, { "epoch": 0.03254132231404959, "grad_norm": 2.487847089767456, "learning_rate": 4.987153441754958e-05, "loss": 4.1565, "step": 126 }, { "epoch": 0.032799586776859506, "grad_norm": 2.913970947265625, "learning_rate": 4.986947254406783e-05, "loss": 3.8632, "step": 127 }, { "epoch": 0.03305785123966942, "grad_norm": 3.3636367321014404, "learning_rate": 4.9867394298817435e-05, "loss": 3.964, "step": 128 }, { "epoch": 0.03331611570247934, "grad_norm": 10.071113586425781, "learning_rate": 4.986529968316653e-05, "loss": 4.4414, "step": 129 }, { "epoch": 0.033574380165289255, "grad_norm": 3.0183660984039307, "learning_rate": 4.986318869849402e-05, "loss": 4.56, "step": 130 }, { "epoch": 0.03383264462809917, "grad_norm": 7.945610523223877, "learning_rate": 4.9861061346189565e-05, "loss": 4.3411, "step": 131 }, { "epoch": 0.03409090909090909, "grad_norm": 3.8863184452056885, "learning_rate": 4.985891762765363e-05, "loss": 4.3815, "step": 132 }, { "epoch": 0.034349173553719005, "grad_norm": 4.191023826599121, "learning_rate": 4.985675754429744e-05, "loss": 4.0397, "step": 133 }, { "epoch": 0.03460743801652893, "grad_norm": 3.30517840385437, "learning_rate": 4.985458109754298e-05, "loss": 4.1065, "step": 134 }, { "epoch": 0.034865702479338845, "grad_norm": 6.06351900100708, "learning_rate": 4.9852388288823054e-05, "loss": 4.1124, "step": 135 }, { "epoch": 0.03512396694214876, "grad_norm": 3.6495954990386963, "learning_rate": 4.9850179119581186e-05, "loss": 4.9393, "step": 136 }, { "epoch": 0.03538223140495868, "grad_norm": 2.772559404373169, "learning_rate": 4.984795359127168e-05, "loss": 4.5008, "step": 137 }, { "epoch": 0.035640495867768594, "grad_norm": 3.098214626312256, "learning_rate": 4.984571170535964e-05, "loss": 4.2548, "step": 138 }, { "epoch": 0.03589876033057851, "grad_norm": 6.730007171630859, "learning_rate": 4.98434534633209e-05, "loss": 3.752, "step": 139 }, { "epoch": 0.03615702479338843, "grad_norm": 2.2908971309661865, "learning_rate": 4.9841178866642085e-05, "loss": 4.2323, "step": 140 }, { "epoch": 0.036415289256198344, "grad_norm": 4.095367431640625, "learning_rate": 4.983888791682058e-05, "loss": 4.6213, "step": 141 }, { "epoch": 0.03667355371900827, "grad_norm": 3.3654677867889404, "learning_rate": 4.983658061536454e-05, "loss": 4.2887, "step": 142 }, { "epoch": 0.036931818181818184, "grad_norm": 4.5996222496032715, "learning_rate": 4.9834256963792866e-05, "loss": 4.2816, "step": 143 }, { "epoch": 0.0371900826446281, "grad_norm": 4.038592338562012, "learning_rate": 4.983191696363525e-05, "loss": 3.9801, "step": 144 }, { "epoch": 0.03744834710743802, "grad_norm": 3.0726122856140137, "learning_rate": 4.982956061643212e-05, "loss": 4.5149, "step": 145 }, { "epoch": 0.03770661157024793, "grad_norm": 4.887876033782959, "learning_rate": 4.9827187923734695e-05, "loss": 3.8485, "step": 146 }, { "epoch": 0.03796487603305785, "grad_norm": 3.9160537719726562, "learning_rate": 4.982479888710493e-05, "loss": 4.4737, "step": 147 }, { "epoch": 0.038223140495867766, "grad_norm": 3.0038321018218994, "learning_rate": 4.9822393508115536e-05, "loss": 4.7921, "step": 148 }, { "epoch": 0.03848140495867768, "grad_norm": 4.034693717956543, "learning_rate": 4.981997178835e-05, "loss": 4.8058, "step": 149 }, { "epoch": 0.038739669421487606, "grad_norm": 4.038486957550049, "learning_rate": 4.9817533729402553e-05, "loss": 4.7706, "step": 150 }, { "epoch": 0.03899793388429752, "grad_norm": 2.630828380584717, "learning_rate": 4.981507933287821e-05, "loss": 4.3564, "step": 151 }, { "epoch": 0.03925619834710744, "grad_norm": 3.202615737915039, "learning_rate": 4.9812608600392694e-05, "loss": 4.4376, "step": 152 }, { "epoch": 0.039514462809917356, "grad_norm": 3.9663283824920654, "learning_rate": 4.981012153357252e-05, "loss": 3.8367, "step": 153 }, { "epoch": 0.03977272727272727, "grad_norm": 3.8060214519500732, "learning_rate": 4.980761813405494e-05, "loss": 4.5492, "step": 154 }, { "epoch": 0.04003099173553719, "grad_norm": 4.062107086181641, "learning_rate": 4.980509840348796e-05, "loss": 4.0062, "step": 155 }, { "epoch": 0.040289256198347105, "grad_norm": 3.9136059284210205, "learning_rate": 4.9802562343530345e-05, "loss": 3.7075, "step": 156 }, { "epoch": 0.04054752066115702, "grad_norm": 2.513056516647339, "learning_rate": 4.9800009955851604e-05, "loss": 4.5533, "step": 157 }, { "epoch": 0.040805785123966945, "grad_norm": 3.0869741439819336, "learning_rate": 4.979744124213199e-05, "loss": 4.3089, "step": 158 }, { "epoch": 0.04106404958677686, "grad_norm": 9.562580108642578, "learning_rate": 4.97948562040625e-05, "loss": 3.8639, "step": 159 }, { "epoch": 0.04132231404958678, "grad_norm": 2.799509286880493, "learning_rate": 4.9792254843344886e-05, "loss": 4.463, "step": 160 }, { "epoch": 0.041580578512396695, "grad_norm": 2.3364462852478027, "learning_rate": 4.978963716169166e-05, "loss": 4.277, "step": 161 }, { "epoch": 0.04183884297520661, "grad_norm": 4.806636810302734, "learning_rate": 4.978700316082605e-05, "loss": 4.4326, "step": 162 }, { "epoch": 0.04209710743801653, "grad_norm": 3.435563087463379, "learning_rate": 4.978435284248204e-05, "loss": 4.2589, "step": 163 }, { "epoch": 0.042355371900826444, "grad_norm": 2.1915886402130127, "learning_rate": 4.978168620840435e-05, "loss": 3.7539, "step": 164 }, { "epoch": 0.04261363636363636, "grad_norm": 3.5137360095977783, "learning_rate": 4.9779003260348465e-05, "loss": 4.8633, "step": 165 }, { "epoch": 0.042871900826446284, "grad_norm": 4.3233642578125, "learning_rate": 4.977630400008057e-05, "loss": 4.5868, "step": 166 }, { "epoch": 0.0431301652892562, "grad_norm": 5.085021495819092, "learning_rate": 4.9773588429377624e-05, "loss": 4.2138, "step": 167 }, { "epoch": 0.04338842975206612, "grad_norm": 2.3626906871795654, "learning_rate": 4.9770856550027296e-05, "loss": 4.57, "step": 168 }, { "epoch": 0.04364669421487603, "grad_norm": 3.6927638053894043, "learning_rate": 4.976810836382802e-05, "loss": 4.4396, "step": 169 }, { "epoch": 0.04390495867768595, "grad_norm": 3.7683534622192383, "learning_rate": 4.976534387258892e-05, "loss": 4.2618, "step": 170 }, { "epoch": 0.044163223140495866, "grad_norm": 2.4252023696899414, "learning_rate": 4.976256307812991e-05, "loss": 4.7329, "step": 171 }, { "epoch": 0.04442148760330578, "grad_norm": 2.7693023681640625, "learning_rate": 4.97597659822816e-05, "loss": 4.3043, "step": 172 }, { "epoch": 0.0446797520661157, "grad_norm": 3.720137119293213, "learning_rate": 4.9756952586885345e-05, "loss": 5.0129, "step": 173 }, { "epoch": 0.04493801652892562, "grad_norm": 3.4952056407928467, "learning_rate": 4.975412289379321e-05, "loss": 4.2098, "step": 174 }, { "epoch": 0.04519628099173554, "grad_norm": 5.7649335861206055, "learning_rate": 4.975127690486802e-05, "loss": 4.2672, "step": 175 }, { "epoch": 0.045454545454545456, "grad_norm": 4.174394607543945, "learning_rate": 4.97484146219833e-05, "loss": 4.2654, "step": 176 }, { "epoch": 0.04571280991735537, "grad_norm": 3.189321517944336, "learning_rate": 4.9745536047023324e-05, "loss": 4.9332, "step": 177 }, { "epoch": 0.04597107438016529, "grad_norm": 3.7539098262786865, "learning_rate": 4.974264118188307e-05, "loss": 4.4166, "step": 178 }, { "epoch": 0.046229338842975205, "grad_norm": 3.347956418991089, "learning_rate": 4.973973002846826e-05, "loss": 4.1361, "step": 179 }, { "epoch": 0.04648760330578512, "grad_norm": 2.8486969470977783, "learning_rate": 4.973680258869532e-05, "loss": 4.7267, "step": 180 }, { "epoch": 0.04674586776859504, "grad_norm": 6.4123663902282715, "learning_rate": 4.9733858864491414e-05, "loss": 4.0173, "step": 181 }, { "epoch": 0.04700413223140496, "grad_norm": 4.235849857330322, "learning_rate": 4.9730898857794416e-05, "loss": 4.7252, "step": 182 }, { "epoch": 0.04726239669421488, "grad_norm": 3.420592784881592, "learning_rate": 4.972792257055292e-05, "loss": 3.6732, "step": 183 }, { "epoch": 0.047520661157024795, "grad_norm": 4.555610179901123, "learning_rate": 4.9724930004726245e-05, "loss": 4.4998, "step": 184 }, { "epoch": 0.04777892561983471, "grad_norm": 3.737490177154541, "learning_rate": 4.972192116228441e-05, "loss": 3.861, "step": 185 }, { "epoch": 0.04803719008264463, "grad_norm": 2.8203542232513428, "learning_rate": 4.971889604520817e-05, "loss": 4.2769, "step": 186 }, { "epoch": 0.048295454545454544, "grad_norm": 4.513956069946289, "learning_rate": 4.971585465548898e-05, "loss": 4.4349, "step": 187 }, { "epoch": 0.04855371900826446, "grad_norm": 11.399883270263672, "learning_rate": 4.971279699512901e-05, "loss": 4.3311, "step": 188 }, { "epoch": 0.04881198347107438, "grad_norm": 5.689480304718018, "learning_rate": 4.970972306614114e-05, "loss": 3.7772, "step": 189 }, { "epoch": 0.0490702479338843, "grad_norm": 3.3621368408203125, "learning_rate": 4.9706632870548964e-05, "loss": 4.5215, "step": 190 }, { "epoch": 0.04932851239669422, "grad_norm": 3.5370137691497803, "learning_rate": 4.970352641038678e-05, "loss": 3.8737, "step": 191 }, { "epoch": 0.049586776859504134, "grad_norm": 2.799914836883545, "learning_rate": 4.97004036876996e-05, "loss": 4.2502, "step": 192 }, { "epoch": 0.04984504132231405, "grad_norm": 3.2626450061798096, "learning_rate": 4.969726470454313e-05, "loss": 4.8077, "step": 193 }, { "epoch": 0.05010330578512397, "grad_norm": 4.800884246826172, "learning_rate": 4.969410946298379e-05, "loss": 3.9856, "step": 194 }, { "epoch": 0.05036157024793388, "grad_norm": 2.131112575531006, "learning_rate": 4.96909379650987e-05, "loss": 4.6066, "step": 195 }, { "epoch": 0.0506198347107438, "grad_norm": 4.762937068939209, "learning_rate": 4.968775021297568e-05, "loss": 4.0115, "step": 196 }, { "epoch": 0.050878099173553716, "grad_norm": 2.302441358566284, "learning_rate": 4.968454620871326e-05, "loss": 3.7559, "step": 197 }, { "epoch": 0.05113636363636364, "grad_norm": 3.0902583599090576, "learning_rate": 4.9681325954420655e-05, "loss": 4.2461, "step": 198 }, { "epoch": 0.051394628099173556, "grad_norm": 3.9115631580352783, "learning_rate": 4.967808945221778e-05, "loss": 4.7368, "step": 199 }, { "epoch": 0.05165289256198347, "grad_norm": 2.4413836002349854, "learning_rate": 4.967483670423526e-05, "loss": 4.2189, "step": 200 }, { "epoch": 0.05191115702479339, "grad_norm": 3.0176079273223877, "learning_rate": 4.967156771261439e-05, "loss": 4.5489, "step": 201 }, { "epoch": 0.052169421487603305, "grad_norm": 5.948681831359863, "learning_rate": 4.966828247950719e-05, "loss": 4.1854, "step": 202 }, { "epoch": 0.05242768595041322, "grad_norm": 3.9825966358184814, "learning_rate": 4.966498100707634e-05, "loss": 3.9795, "step": 203 }, { "epoch": 0.05268595041322314, "grad_norm": 2.409513473510742, "learning_rate": 4.966166329749523e-05, "loss": 4.3284, "step": 204 }, { "epoch": 0.052944214876033055, "grad_norm": 3.4758553504943848, "learning_rate": 4.965832935294794e-05, "loss": 4.4845, "step": 205 }, { "epoch": 0.05320247933884298, "grad_norm": 2.307767391204834, "learning_rate": 4.9654979175629226e-05, "loss": 4.3931, "step": 206 }, { "epoch": 0.053460743801652895, "grad_norm": 4.839701175689697, "learning_rate": 4.9651612767744535e-05, "loss": 4.002, "step": 207 }, { "epoch": 0.05371900826446281, "grad_norm": 4.272008895874023, "learning_rate": 4.964823013151001e-05, "loss": 4.1945, "step": 208 }, { "epoch": 0.05397727272727273, "grad_norm": 3.6931703090667725, "learning_rate": 4.964483126915245e-05, "loss": 4.0644, "step": 209 }, { "epoch": 0.054235537190082644, "grad_norm": 4.448047637939453, "learning_rate": 4.9641416182909364e-05, "loss": 3.9563, "step": 210 }, { "epoch": 0.05449380165289256, "grad_norm": 4.601946830749512, "learning_rate": 4.963798487502893e-05, "loss": 4.3265, "step": 211 }, { "epoch": 0.05475206611570248, "grad_norm": 3.967369556427002, "learning_rate": 4.9634537347770005e-05, "loss": 4.3102, "step": 212 }, { "epoch": 0.055010330578512394, "grad_norm": 3.376329183578491, "learning_rate": 4.9631073603402114e-05, "loss": 4.0273, "step": 213 }, { "epoch": 0.05526859504132232, "grad_norm": 2.6277973651885986, "learning_rate": 4.962759364420547e-05, "loss": 4.2329, "step": 214 }, { "epoch": 0.055526859504132234, "grad_norm": 2.1323187351226807, "learning_rate": 4.962409747247097e-05, "loss": 4.2965, "step": 215 }, { "epoch": 0.05578512396694215, "grad_norm": 12.565481185913086, "learning_rate": 4.962058509050016e-05, "loss": 4.1646, "step": 216 }, { "epoch": 0.05604338842975207, "grad_norm": 3.3220667839050293, "learning_rate": 4.961705650060528e-05, "loss": 4.41, "step": 217 }, { "epoch": 0.05630165289256198, "grad_norm": 2.9350833892822266, "learning_rate": 4.96135117051092e-05, "loss": 4.4435, "step": 218 }, { "epoch": 0.0565599173553719, "grad_norm": 6.540204048156738, "learning_rate": 4.960995070634552e-05, "loss": 3.1962, "step": 219 }, { "epoch": 0.056818181818181816, "grad_norm": 6.5188889503479, "learning_rate": 4.9606373506658456e-05, "loss": 4.1192, "step": 220 }, { "epoch": 0.05707644628099173, "grad_norm": 4.663192272186279, "learning_rate": 4.96027801084029e-05, "loss": 4.3002, "step": 221 }, { "epoch": 0.057334710743801656, "grad_norm": 3.508042335510254, "learning_rate": 4.959917051394444e-05, "loss": 4.8093, "step": 222 }, { "epoch": 0.05759297520661157, "grad_norm": 3.388587713241577, "learning_rate": 4.9595544725659275e-05, "loss": 4.3135, "step": 223 }, { "epoch": 0.05785123966942149, "grad_norm": 3.2106237411499023, "learning_rate": 4.959190274593431e-05, "loss": 3.7329, "step": 224 }, { "epoch": 0.058109504132231406, "grad_norm": 5.04145622253418, "learning_rate": 4.9588244577167065e-05, "loss": 4.4788, "step": 225 }, { "epoch": 0.05836776859504132, "grad_norm": 2.6704213619232178, "learning_rate": 4.958457022176576e-05, "loss": 4.4042, "step": 226 }, { "epoch": 0.05862603305785124, "grad_norm": 5.906059741973877, "learning_rate": 4.958087968214924e-05, "loss": 4.123, "step": 227 }, { "epoch": 0.058884297520661155, "grad_norm": 2.8475260734558105, "learning_rate": 4.957717296074703e-05, "loss": 4.1531, "step": 228 }, { "epoch": 0.05914256198347107, "grad_norm": 4.067889213562012, "learning_rate": 4.957345005999928e-05, "loss": 4.2535, "step": 229 }, { "epoch": 0.059400826446280995, "grad_norm": 3.020292043685913, "learning_rate": 4.956971098235682e-05, "loss": 4.1811, "step": 230 }, { "epoch": 0.05965909090909091, "grad_norm": 2.9406447410583496, "learning_rate": 4.95659557302811e-05, "loss": 4.0091, "step": 231 }, { "epoch": 0.05991735537190083, "grad_norm": 3.1298396587371826, "learning_rate": 4.9562184306244234e-05, "loss": 4.026, "step": 232 }, { "epoch": 0.060175619834710745, "grad_norm": 2.888399600982666, "learning_rate": 4.955839671272899e-05, "loss": 3.9579, "step": 233 }, { "epoch": 0.06043388429752066, "grad_norm": 4.131311416625977, "learning_rate": 4.9554592952228754e-05, "loss": 3.9565, "step": 234 }, { "epoch": 0.06069214876033058, "grad_norm": 4.115970611572266, "learning_rate": 4.9550773027247586e-05, "loss": 4.1271, "step": 235 }, { "epoch": 0.060950413223140494, "grad_norm": 2.3268158435821533, "learning_rate": 4.954693694030017e-05, "loss": 3.6085, "step": 236 }, { "epoch": 0.06120867768595041, "grad_norm": 2.8340978622436523, "learning_rate": 4.9543084693911826e-05, "loss": 4.7223, "step": 237 }, { "epoch": 0.061466942148760334, "grad_norm": 5.3101677894592285, "learning_rate": 4.953921629061852e-05, "loss": 4.9228, "step": 238 }, { "epoch": 0.06172520661157025, "grad_norm": 4.083702564239502, "learning_rate": 4.953533173296685e-05, "loss": 4.4234, "step": 239 }, { "epoch": 0.06198347107438017, "grad_norm": 3.5520925521850586, "learning_rate": 4.9531431023514065e-05, "loss": 4.3761, "step": 240 }, { "epoch": 0.062241735537190084, "grad_norm": 2.6579904556274414, "learning_rate": 4.952751416482801e-05, "loss": 4.2581, "step": 241 }, { "epoch": 0.0625, "grad_norm": 3.3205533027648926, "learning_rate": 4.9523581159487206e-05, "loss": 4.0199, "step": 242 }, { "epoch": 0.06275826446280992, "grad_norm": 3.2058026790618896, "learning_rate": 4.951963201008076e-05, "loss": 4.2566, "step": 243 }, { "epoch": 0.06301652892561983, "grad_norm": 7.030323505401611, "learning_rate": 4.951566671920845e-05, "loss": 4.2355, "step": 244 }, { "epoch": 0.06327479338842976, "grad_norm": 5.20966100692749, "learning_rate": 4.951168528948063e-05, "loss": 4.9382, "step": 245 }, { "epoch": 0.06353305785123967, "grad_norm": 3.6411569118499756, "learning_rate": 4.950768772351835e-05, "loss": 4.3791, "step": 246 }, { "epoch": 0.06379132231404959, "grad_norm": 2.7272262573242188, "learning_rate": 4.95036740239532e-05, "loss": 4.5444, "step": 247 }, { "epoch": 0.0640495867768595, "grad_norm": 3.0384621620178223, "learning_rate": 4.949964419342743e-05, "loss": 4.4751, "step": 248 }, { "epoch": 0.06430785123966942, "grad_norm": 2.2613348960876465, "learning_rate": 4.949559823459393e-05, "loss": 4.2281, "step": 249 }, { "epoch": 0.06456611570247933, "grad_norm": 7.897278785705566, "learning_rate": 4.949153615011618e-05, "loss": 4.3327, "step": 250 }, { "epoch": 0.06482438016528926, "grad_norm": 5.87761116027832, "learning_rate": 4.948745794266828e-05, "loss": 3.7273, "step": 251 }, { "epoch": 0.06508264462809918, "grad_norm": 3.300201177597046, "learning_rate": 4.948336361493494e-05, "loss": 4.5439, "step": 252 }, { "epoch": 0.06534090909090909, "grad_norm": 4.523470878601074, "learning_rate": 4.9479253169611494e-05, "loss": 4.1422, "step": 253 }, { "epoch": 0.06559917355371901, "grad_norm": 2.9071879386901855, "learning_rate": 4.947512660940388e-05, "loss": 4.4657, "step": 254 }, { "epoch": 0.06585743801652892, "grad_norm": 3.7290151119232178, "learning_rate": 4.947098393702865e-05, "loss": 3.58, "step": 255 }, { "epoch": 0.06611570247933884, "grad_norm": 4.3736677169799805, "learning_rate": 4.946682515521295e-05, "loss": 4.4153, "step": 256 }, { "epoch": 0.06637396694214875, "grad_norm": 3.3906376361846924, "learning_rate": 4.9462650266694544e-05, "loss": 4.198, "step": 257 }, { "epoch": 0.06663223140495868, "grad_norm": 2.697543144226074, "learning_rate": 4.945845927422178e-05, "loss": 4.5726, "step": 258 }, { "epoch": 0.0668904958677686, "grad_norm": 3.3882362842559814, "learning_rate": 4.945425218055364e-05, "loss": 3.8089, "step": 259 }, { "epoch": 0.06714876033057851, "grad_norm": 2.6826095581054688, "learning_rate": 4.9450028988459686e-05, "loss": 4.0397, "step": 260 }, { "epoch": 0.06740702479338843, "grad_norm": 2.516768217086792, "learning_rate": 4.944578970072006e-05, "loss": 4.3379, "step": 261 }, { "epoch": 0.06766528925619834, "grad_norm": 4.2431793212890625, "learning_rate": 4.9441534320125535e-05, "loss": 3.9981, "step": 262 }, { "epoch": 0.06792355371900827, "grad_norm": 3.9999237060546875, "learning_rate": 4.943726284947745e-05, "loss": 3.9183, "step": 263 }, { "epoch": 0.06818181818181818, "grad_norm": 2.451709747314453, "learning_rate": 4.9432975291587756e-05, "loss": 4.3482, "step": 264 }, { "epoch": 0.0684400826446281, "grad_norm": 2.290229082107544, "learning_rate": 4.942867164927899e-05, "loss": 4.2646, "step": 265 }, { "epoch": 0.06869834710743801, "grad_norm": 3.2290444374084473, "learning_rate": 4.942435192538426e-05, "loss": 4.4105, "step": 266 }, { "epoch": 0.06895661157024793, "grad_norm": 5.709512233734131, "learning_rate": 4.9420016122747286e-05, "loss": 3.9893, "step": 267 }, { "epoch": 0.06921487603305786, "grad_norm": 2.9337809085845947, "learning_rate": 4.941566424422236e-05, "loss": 4.2683, "step": 268 }, { "epoch": 0.06947314049586777, "grad_norm": 2.7263827323913574, "learning_rate": 4.941129629267434e-05, "loss": 4.364, "step": 269 }, { "epoch": 0.06973140495867769, "grad_norm": 6.661377906799316, "learning_rate": 4.9406912270978704e-05, "loss": 5.0535, "step": 270 }, { "epoch": 0.0699896694214876, "grad_norm": 3.1108593940734863, "learning_rate": 4.9402512182021485e-05, "loss": 4.2481, "step": 271 }, { "epoch": 0.07024793388429752, "grad_norm": 5.562469005584717, "learning_rate": 4.939809602869928e-05, "loss": 3.8494, "step": 272 }, { "epoch": 0.07050619834710743, "grad_norm": 7.0484747886657715, "learning_rate": 4.93936638139193e-05, "loss": 4.1979, "step": 273 }, { "epoch": 0.07076446280991736, "grad_norm": 1.8418248891830444, "learning_rate": 4.938921554059929e-05, "loss": 4.6068, "step": 274 }, { "epoch": 0.07102272727272728, "grad_norm": 3.3992538452148438, "learning_rate": 4.938475121166759e-05, "loss": 3.6932, "step": 275 }, { "epoch": 0.07128099173553719, "grad_norm": 3.3403046131134033, "learning_rate": 4.93802708300631e-05, "loss": 4.0431, "step": 276 }, { "epoch": 0.07153925619834711, "grad_norm": 6.694905757904053, "learning_rate": 4.937577439873529e-05, "loss": 4.2685, "step": 277 }, { "epoch": 0.07179752066115702, "grad_norm": 3.434704303741455, "learning_rate": 4.9371261920644196e-05, "loss": 4.1055, "step": 278 }, { "epoch": 0.07205578512396695, "grad_norm": 3.187915086746216, "learning_rate": 4.9366733398760424e-05, "loss": 4.2916, "step": 279 }, { "epoch": 0.07231404958677685, "grad_norm": 3.8759007453918457, "learning_rate": 4.936218883606513e-05, "loss": 4.0829, "step": 280 }, { "epoch": 0.07257231404958678, "grad_norm": 4.045225143432617, "learning_rate": 4.935762823555004e-05, "loss": 3.4967, "step": 281 }, { "epoch": 0.07283057851239669, "grad_norm": 2.8516764640808105, "learning_rate": 4.935305160021742e-05, "loss": 4.1327, "step": 282 }, { "epoch": 0.07308884297520661, "grad_norm": 3.572760820388794, "learning_rate": 4.934845893308012e-05, "loss": 3.6957, "step": 283 }, { "epoch": 0.07334710743801653, "grad_norm": 2.657180070877075, "learning_rate": 4.934385023716152e-05, "loss": 4.2341, "step": 284 }, { "epoch": 0.07360537190082644, "grad_norm": 2.8591058254241943, "learning_rate": 4.933922551549556e-05, "loss": 4.4077, "step": 285 }, { "epoch": 0.07386363636363637, "grad_norm": 4.204916477203369, "learning_rate": 4.9334584771126743e-05, "loss": 4.2273, "step": 286 }, { "epoch": 0.07412190082644628, "grad_norm": 3.6107823848724365, "learning_rate": 4.932992800711009e-05, "loss": 4.4042, "step": 287 }, { "epoch": 0.0743801652892562, "grad_norm": 3.8509185314178467, "learning_rate": 4.932525522651119e-05, "loss": 3.8752, "step": 288 }, { "epoch": 0.07463842975206611, "grad_norm": 3.5519165992736816, "learning_rate": 4.932056643240618e-05, "loss": 4.0918, "step": 289 }, { "epoch": 0.07489669421487603, "grad_norm": 2.879927396774292, "learning_rate": 4.931586162788173e-05, "loss": 4.245, "step": 290 }, { "epoch": 0.07515495867768596, "grad_norm": 3.1901206970214844, "learning_rate": 4.931114081603504e-05, "loss": 4.4168, "step": 291 }, { "epoch": 0.07541322314049587, "grad_norm": 2.687814474105835, "learning_rate": 4.930640399997386e-05, "loss": 4.5022, "step": 292 }, { "epoch": 0.07567148760330579, "grad_norm": 7.261200428009033, "learning_rate": 4.930165118281648e-05, "loss": 3.3319, "step": 293 }, { "epoch": 0.0759297520661157, "grad_norm": 5.631179332733154, "learning_rate": 4.92968823676917e-05, "loss": 4.1023, "step": 294 }, { "epoch": 0.07618801652892562, "grad_norm": 3.4509215354919434, "learning_rate": 4.929209755773889e-05, "loss": 4.377, "step": 295 }, { "epoch": 0.07644628099173553, "grad_norm": 4.24806022644043, "learning_rate": 4.928729675610791e-05, "loss": 4.2663, "step": 296 }, { "epoch": 0.07670454545454546, "grad_norm": 3.4682419300079346, "learning_rate": 4.9282479965959175e-05, "loss": 5.0662, "step": 297 }, { "epoch": 0.07696280991735537, "grad_norm": 2.678131341934204, "learning_rate": 4.927764719046362e-05, "loss": 4.3411, "step": 298 }, { "epoch": 0.07722107438016529, "grad_norm": 2.929517984390259, "learning_rate": 4.927279843280268e-05, "loss": 3.7486, "step": 299 }, { "epoch": 0.07747933884297521, "grad_norm": 4.933820724487305, "learning_rate": 4.9267933696168356e-05, "loss": 3.7245, "step": 300 }, { "epoch": 0.07773760330578512, "grad_norm": 3.910463571548462, "learning_rate": 4.926305298376313e-05, "loss": 4.3431, "step": 301 }, { "epoch": 0.07799586776859505, "grad_norm": 4.166701793670654, "learning_rate": 4.9258156298799995e-05, "loss": 4.1259, "step": 302 }, { "epoch": 0.07825413223140495, "grad_norm": 5.588757038116455, "learning_rate": 4.925324364450251e-05, "loss": 4.55, "step": 303 }, { "epoch": 0.07851239669421488, "grad_norm": 4.676442623138428, "learning_rate": 4.924831502410469e-05, "loss": 4.1928, "step": 304 }, { "epoch": 0.07877066115702479, "grad_norm": 2.0846738815307617, "learning_rate": 4.92433704408511e-05, "loss": 4.6723, "step": 305 }, { "epoch": 0.07902892561983471, "grad_norm": 2.694462776184082, "learning_rate": 4.923840989799678e-05, "loss": 4.4494, "step": 306 }, { "epoch": 0.07928719008264463, "grad_norm": 3.2807676792144775, "learning_rate": 4.9233433398807315e-05, "loss": 4.2158, "step": 307 }, { "epoch": 0.07954545454545454, "grad_norm": 3.999415159225464, "learning_rate": 4.922844094655876e-05, "loss": 4.7524, "step": 308 }, { "epoch": 0.07980371900826447, "grad_norm": 3.0611307621002197, "learning_rate": 4.922343254453768e-05, "loss": 4.4963, "step": 309 }, { "epoch": 0.08006198347107438, "grad_norm": 3.3428456783294678, "learning_rate": 4.9218408196041166e-05, "loss": 4.1267, "step": 310 }, { "epoch": 0.0803202479338843, "grad_norm": 2.991830825805664, "learning_rate": 4.9213367904376775e-05, "loss": 4.3456, "step": 311 }, { "epoch": 0.08057851239669421, "grad_norm": 2.938211441040039, "learning_rate": 4.920831167286256e-05, "loss": 4.574, "step": 312 }, { "epoch": 0.08083677685950413, "grad_norm": 5.896919250488281, "learning_rate": 4.920323950482709e-05, "loss": 4.3214, "step": 313 }, { "epoch": 0.08109504132231404, "grad_norm": 3.137040615081787, "learning_rate": 4.919815140360941e-05, "loss": 4.1911, "step": 314 }, { "epoch": 0.08135330578512397, "grad_norm": 3.255697011947632, "learning_rate": 4.9193047372559056e-05, "loss": 4.4076, "step": 315 }, { "epoch": 0.08161157024793389, "grad_norm": 3.0316996574401855, "learning_rate": 4.918792741503606e-05, "loss": 4.4522, "step": 316 }, { "epoch": 0.0818698347107438, "grad_norm": 3.236427068710327, "learning_rate": 4.918279153441091e-05, "loss": 4.1626, "step": 317 }, { "epoch": 0.08212809917355372, "grad_norm": 2.60062575340271, "learning_rate": 4.917763973406462e-05, "loss": 4.2029, "step": 318 }, { "epoch": 0.08238636363636363, "grad_norm": 4.7989182472229, "learning_rate": 4.917247201738865e-05, "loss": 4.5074, "step": 319 }, { "epoch": 0.08264462809917356, "grad_norm": 3.276747226715088, "learning_rate": 4.916728838778494e-05, "loss": 4.5506, "step": 320 }, { "epoch": 0.08290289256198347, "grad_norm": 3.414419651031494, "learning_rate": 4.916208884866593e-05, "loss": 4.2696, "step": 321 }, { "epoch": 0.08316115702479339, "grad_norm": 2.9038968086242676, "learning_rate": 4.915687340345451e-05, "loss": 4.4467, "step": 322 }, { "epoch": 0.08341942148760331, "grad_norm": 3.3383615016937256, "learning_rate": 4.915164205558404e-05, "loss": 4.6865, "step": 323 }, { "epoch": 0.08367768595041322, "grad_norm": 4.643346786499023, "learning_rate": 4.914639480849837e-05, "loss": 4.7906, "step": 324 }, { "epoch": 0.08393595041322315, "grad_norm": 2.4287960529327393, "learning_rate": 4.914113166565179e-05, "loss": 4.2757, "step": 325 }, { "epoch": 0.08419421487603305, "grad_norm": 4.286943435668945, "learning_rate": 4.9135852630509087e-05, "loss": 4.6959, "step": 326 }, { "epoch": 0.08445247933884298, "grad_norm": 2.9254109859466553, "learning_rate": 4.913055770654548e-05, "loss": 3.9103, "step": 327 }, { "epoch": 0.08471074380165289, "grad_norm": 5.163883686065674, "learning_rate": 4.912524689724666e-05, "loss": 4.7698, "step": 328 }, { "epoch": 0.08496900826446281, "grad_norm": 4.514895915985107, "learning_rate": 4.9119920206108776e-05, "loss": 4.3189, "step": 329 }, { "epoch": 0.08522727272727272, "grad_norm": 2.699148416519165, "learning_rate": 4.911457763663842e-05, "loss": 4.07, "step": 330 }, { "epoch": 0.08548553719008264, "grad_norm": 4.968969821929932, "learning_rate": 4.910921919235267e-05, "loss": 4.1844, "step": 331 }, { "epoch": 0.08574380165289257, "grad_norm": 5.383642196655273, "learning_rate": 4.910384487677901e-05, "loss": 4.4612, "step": 332 }, { "epoch": 0.08600206611570248, "grad_norm": 2.6665968894958496, "learning_rate": 4.9098454693455417e-05, "loss": 4.1793, "step": 333 }, { "epoch": 0.0862603305785124, "grad_norm": 11.511502265930176, "learning_rate": 4.909304864593026e-05, "loss": 4.2547, "step": 334 }, { "epoch": 0.08651859504132231, "grad_norm": 3.2964212894439697, "learning_rate": 4.90876267377624e-05, "loss": 4.4916, "step": 335 }, { "epoch": 0.08677685950413223, "grad_norm": 3.943655252456665, "learning_rate": 4.908218897252113e-05, "loss": 4.0591, "step": 336 }, { "epoch": 0.08703512396694214, "grad_norm": 4.311870098114014, "learning_rate": 4.907673535378616e-05, "loss": 4.1372, "step": 337 }, { "epoch": 0.08729338842975207, "grad_norm": 2.109882354736328, "learning_rate": 4.907126588514766e-05, "loss": 4.7245, "step": 338 }, { "epoch": 0.08755165289256199, "grad_norm": 3.140096426010132, "learning_rate": 4.906578057020621e-05, "loss": 4.1189, "step": 339 }, { "epoch": 0.0878099173553719, "grad_norm": 4.02254581451416, "learning_rate": 4.906027941257284e-05, "loss": 4.098, "step": 340 }, { "epoch": 0.08806818181818182, "grad_norm": 3.7174644470214844, "learning_rate": 4.905476241586901e-05, "loss": 4.1668, "step": 341 }, { "epoch": 0.08832644628099173, "grad_norm": 3.1885805130004883, "learning_rate": 4.904922958372661e-05, "loss": 4.3888, "step": 342 }, { "epoch": 0.08858471074380166, "grad_norm": 2.686063051223755, "learning_rate": 4.9043680919787925e-05, "loss": 4.3323, "step": 343 }, { "epoch": 0.08884297520661157, "grad_norm": 2.8979225158691406, "learning_rate": 4.9038116427705707e-05, "loss": 4.352, "step": 344 }, { "epoch": 0.08910123966942149, "grad_norm": 5.826053142547607, "learning_rate": 4.903253611114309e-05, "loss": 4.5379, "step": 345 }, { "epoch": 0.0893595041322314, "grad_norm": 3.7370429039001465, "learning_rate": 4.902693997377365e-05, "loss": 4.6625, "step": 346 }, { "epoch": 0.08961776859504132, "grad_norm": 3.5591065883636475, "learning_rate": 4.902132801928136e-05, "loss": 4.067, "step": 347 }, { "epoch": 0.08987603305785125, "grad_norm": 2.944754123687744, "learning_rate": 4.901570025136062e-05, "loss": 4.1409, "step": 348 }, { "epoch": 0.09013429752066116, "grad_norm": 3.4933669567108154, "learning_rate": 4.901005667371622e-05, "loss": 4.8254, "step": 349 }, { "epoch": 0.09039256198347108, "grad_norm": 2.420107364654541, "learning_rate": 4.900439729006338e-05, "loss": 4.5807, "step": 350 }, { "epoch": 0.09065082644628099, "grad_norm": 5.218144416809082, "learning_rate": 4.8998722104127735e-05, "loss": 3.9034, "step": 351 }, { "epoch": 0.09090909090909091, "grad_norm": 3.883570671081543, "learning_rate": 4.8993031119645276e-05, "loss": 4.2412, "step": 352 }, { "epoch": 0.09116735537190082, "grad_norm": 2.49178147315979, "learning_rate": 4.898732434036244e-05, "loss": 4.2519, "step": 353 }, { "epoch": 0.09142561983471074, "grad_norm": 3.893573760986328, "learning_rate": 4.8981601770036034e-05, "loss": 3.5557, "step": 354 }, { "epoch": 0.09168388429752067, "grad_norm": 4.1029839515686035, "learning_rate": 4.8975863412433276e-05, "loss": 4.1882, "step": 355 }, { "epoch": 0.09194214876033058, "grad_norm": 3.1988184452056885, "learning_rate": 4.897010927133176e-05, "loss": 4.7054, "step": 356 }, { "epoch": 0.0922004132231405, "grad_norm": 4.7963547706604, "learning_rate": 4.8964339350519515e-05, "loss": 3.6825, "step": 357 }, { "epoch": 0.09245867768595041, "grad_norm": 3.4981300830841064, "learning_rate": 4.895855365379489e-05, "loss": 4.1829, "step": 358 }, { "epoch": 0.09271694214876033, "grad_norm": 5.2450785636901855, "learning_rate": 4.895275218496668e-05, "loss": 4.139, "step": 359 }, { "epoch": 0.09297520661157024, "grad_norm": 3.245875597000122, "learning_rate": 4.894693494785401e-05, "loss": 4.3136, "step": 360 }, { "epoch": 0.09323347107438017, "grad_norm": 3.9221534729003906, "learning_rate": 4.894110194628644e-05, "loss": 4.2822, "step": 361 }, { "epoch": 0.09349173553719008, "grad_norm": 3.569499969482422, "learning_rate": 4.893525318410386e-05, "loss": 4.2597, "step": 362 }, { "epoch": 0.09375, "grad_norm": 4.168821334838867, "learning_rate": 4.892938866515657e-05, "loss": 4.1577, "step": 363 }, { "epoch": 0.09400826446280992, "grad_norm": 4.4107279777526855, "learning_rate": 4.892350839330522e-05, "loss": 3.9026, "step": 364 }, { "epoch": 0.09426652892561983, "grad_norm": 2.6410112380981445, "learning_rate": 4.891761237242085e-05, "loss": 4.5286, "step": 365 }, { "epoch": 0.09452479338842976, "grad_norm": 3.7202372550964355, "learning_rate": 4.8911700606384844e-05, "loss": 4.0774, "step": 366 }, { "epoch": 0.09478305785123967, "grad_norm": 6.481695175170898, "learning_rate": 4.890577309908897e-05, "loss": 4.4517, "step": 367 }, { "epoch": 0.09504132231404959, "grad_norm": 3.4685723781585693, "learning_rate": 4.889982985443535e-05, "loss": 4.0002, "step": 368 }, { "epoch": 0.0952995867768595, "grad_norm": 2.463899850845337, "learning_rate": 4.889387087633647e-05, "loss": 4.6455, "step": 369 }, { "epoch": 0.09555785123966942, "grad_norm": 2.676372766494751, "learning_rate": 4.888789616871517e-05, "loss": 4.5486, "step": 370 }, { "epoch": 0.09581611570247933, "grad_norm": 3.3557162284851074, "learning_rate": 4.8881905735504665e-05, "loss": 3.9425, "step": 371 }, { "epoch": 0.09607438016528926, "grad_norm": 2.6369903087615967, "learning_rate": 4.887589958064848e-05, "loss": 4.1108, "step": 372 }, { "epoch": 0.09633264462809918, "grad_norm": 6.581792831420898, "learning_rate": 4.886987770810051e-05, "loss": 4.6352, "step": 373 }, { "epoch": 0.09659090909090909, "grad_norm": 3.299222707748413, "learning_rate": 4.886384012182502e-05, "loss": 4.497, "step": 374 }, { "epoch": 0.09684917355371901, "grad_norm": 3.3572399616241455, "learning_rate": 4.88577868257966e-05, "loss": 4.1092, "step": 375 }, { "epoch": 0.09710743801652892, "grad_norm": 5.5388007164001465, "learning_rate": 4.885171782400017e-05, "loss": 4.1151, "step": 376 }, { "epoch": 0.09736570247933884, "grad_norm": 5.8736796379089355, "learning_rate": 4.8845633120431004e-05, "loss": 3.8625, "step": 377 }, { "epoch": 0.09762396694214875, "grad_norm": 2.7073659896850586, "learning_rate": 4.883953271909472e-05, "loss": 3.4939, "step": 378 }, { "epoch": 0.09788223140495868, "grad_norm": 5.0317583084106445, "learning_rate": 4.8833416624007244e-05, "loss": 4.5042, "step": 379 }, { "epoch": 0.0981404958677686, "grad_norm": 2.874643325805664, "learning_rate": 4.8827284839194866e-05, "loss": 4.2888, "step": 380 }, { "epoch": 0.09839876033057851, "grad_norm": 2.956871509552002, "learning_rate": 4.8821137368694184e-05, "loss": 3.8034, "step": 381 }, { "epoch": 0.09865702479338843, "grad_norm": 4.042211532592773, "learning_rate": 4.881497421655211e-05, "loss": 4.2274, "step": 382 }, { "epoch": 0.09891528925619834, "grad_norm": 7.291164398193359, "learning_rate": 4.880879538682591e-05, "loss": 4.4696, "step": 383 }, { "epoch": 0.09917355371900827, "grad_norm": 3.7647910118103027, "learning_rate": 4.8802600883583164e-05, "loss": 4.369, "step": 384 }, { "epoch": 0.09943181818181818, "grad_norm": 3.33284068107605, "learning_rate": 4.879639071090174e-05, "loss": 4.1323, "step": 385 }, { "epoch": 0.0996900826446281, "grad_norm": 4.151761054992676, "learning_rate": 4.879016487286986e-05, "loss": 4.2058, "step": 386 }, { "epoch": 0.09994834710743801, "grad_norm": 2.6773219108581543, "learning_rate": 4.878392337358604e-05, "loss": 4.7263, "step": 387 }, { "epoch": 0.10020661157024793, "grad_norm": 3.505990982055664, "learning_rate": 4.87776662171591e-05, "loss": 4.3285, "step": 388 }, { "epoch": 0.10046487603305786, "grad_norm": 2.9558258056640625, "learning_rate": 4.877139340770818e-05, "loss": 4.3758, "step": 389 }, { "epoch": 0.10072314049586777, "grad_norm": 3.3224751949310303, "learning_rate": 4.876510494936272e-05, "loss": 4.7256, "step": 390 }, { "epoch": 0.10098140495867769, "grad_norm": 4.350029468536377, "learning_rate": 4.8758800846262464e-05, "loss": 4.3606, "step": 391 }, { "epoch": 0.1012396694214876, "grad_norm": 2.806070566177368, "learning_rate": 4.875248110255745e-05, "loss": 3.9198, "step": 392 }, { "epoch": 0.10149793388429752, "grad_norm": 2.967585802078247, "learning_rate": 4.874614572240801e-05, "loss": 3.7918, "step": 393 }, { "epoch": 0.10175619834710743, "grad_norm": 2.281514883041382, "learning_rate": 4.873979470998478e-05, "loss": 4.2498, "step": 394 }, { "epoch": 0.10201446280991736, "grad_norm": 2.6014010906219482, "learning_rate": 4.8733428069468675e-05, "loss": 4.2668, "step": 395 }, { "epoch": 0.10227272727272728, "grad_norm": 4.515266418457031, "learning_rate": 4.872704580505092e-05, "loss": 4.0848, "step": 396 }, { "epoch": 0.10253099173553719, "grad_norm": 3.7145237922668457, "learning_rate": 4.872064792093299e-05, "loss": 4.2215, "step": 397 }, { "epoch": 0.10278925619834711, "grad_norm": 4.1907525062561035, "learning_rate": 4.871423442132668e-05, "loss": 4.9588, "step": 398 }, { "epoch": 0.10304752066115702, "grad_norm": 2.5968103408813477, "learning_rate": 4.870780531045403e-05, "loss": 4.1731, "step": 399 }, { "epoch": 0.10330578512396695, "grad_norm": 3.498263359069824, "learning_rate": 4.8701360592547385e-05, "loss": 4.3387, "step": 400 }, { "epoch": 0.10356404958677685, "grad_norm": 3.7038209438323975, "learning_rate": 4.869490027184935e-05, "loss": 3.993, "step": 401 }, { "epoch": 0.10382231404958678, "grad_norm": 2.2199714183807373, "learning_rate": 4.868842435261281e-05, "loss": 4.2934, "step": 402 }, { "epoch": 0.10408057851239669, "grad_norm": 2.3624000549316406, "learning_rate": 4.86819328391009e-05, "loss": 4.0064, "step": 403 }, { "epoch": 0.10433884297520661, "grad_norm": 3.8801276683807373, "learning_rate": 4.867542573558705e-05, "loss": 4.3171, "step": 404 }, { "epoch": 0.10459710743801653, "grad_norm": 5.345362186431885, "learning_rate": 4.866890304635492e-05, "loss": 4.3268, "step": 405 }, { "epoch": 0.10485537190082644, "grad_norm": 2.0130910873413086, "learning_rate": 4.866236477569847e-05, "loss": 4.6494, "step": 406 }, { "epoch": 0.10511363636363637, "grad_norm": 3.5157954692840576, "learning_rate": 4.865581092792187e-05, "loss": 4.1957, "step": 407 }, { "epoch": 0.10537190082644628, "grad_norm": 2.765255928039551, "learning_rate": 4.864924150733959e-05, "loss": 3.9281, "step": 408 }, { "epoch": 0.1056301652892562, "grad_norm": 6.162137031555176, "learning_rate": 4.864265651827632e-05, "loss": 4.3837, "step": 409 }, { "epoch": 0.10588842975206611, "grad_norm": 3.0127923488616943, "learning_rate": 4.863605596506702e-05, "loss": 4.4616, "step": 410 }, { "epoch": 0.10614669421487603, "grad_norm": 4.026800632476807, "learning_rate": 4.862943985205687e-05, "loss": 4.3965, "step": 411 }, { "epoch": 0.10640495867768596, "grad_norm": 5.61582088470459, "learning_rate": 4.862280818360132e-05, "loss": 4.395, "step": 412 }, { "epoch": 0.10666322314049587, "grad_norm": 2.3633649349212646, "learning_rate": 4.861616096406604e-05, "loss": 4.2741, "step": 413 }, { "epoch": 0.10692148760330579, "grad_norm": 4.938944339752197, "learning_rate": 4.860949819782696e-05, "loss": 3.9691, "step": 414 }, { "epoch": 0.1071797520661157, "grad_norm": 3.2308902740478516, "learning_rate": 4.860281988927023e-05, "loss": 4.5326, "step": 415 }, { "epoch": 0.10743801652892562, "grad_norm": 4.0965681076049805, "learning_rate": 4.8596126042792225e-05, "loss": 4.0764, "step": 416 }, { "epoch": 0.10769628099173553, "grad_norm": 3.7899227142333984, "learning_rate": 4.858941666279955e-05, "loss": 4.3483, "step": 417 }, { "epoch": 0.10795454545454546, "grad_norm": 4.264508247375488, "learning_rate": 4.858269175370906e-05, "loss": 4.1178, "step": 418 }, { "epoch": 0.10821280991735537, "grad_norm": 2.8881983757019043, "learning_rate": 4.857595131994781e-05, "loss": 4.187, "step": 419 }, { "epoch": 0.10847107438016529, "grad_norm": 4.702012538909912, "learning_rate": 4.8569195365953065e-05, "loss": 4.1068, "step": 420 }, { "epoch": 0.10872933884297521, "grad_norm": 2.5262558460235596, "learning_rate": 4.856242389617234e-05, "loss": 4.7629, "step": 421 }, { "epoch": 0.10898760330578512, "grad_norm": 3.1902360916137695, "learning_rate": 4.8555636915063345e-05, "loss": 4.1476, "step": 422 }, { "epoch": 0.10924586776859505, "grad_norm": 2.5026755332946777, "learning_rate": 4.8548834427094e-05, "loss": 4.4348, "step": 423 }, { "epoch": 0.10950413223140495, "grad_norm": 4.032755374908447, "learning_rate": 4.854201643674243e-05, "loss": 3.6908, "step": 424 }, { "epoch": 0.10976239669421488, "grad_norm": 2.416297197341919, "learning_rate": 4.8535182948496984e-05, "loss": 4.2931, "step": 425 }, { "epoch": 0.11002066115702479, "grad_norm": 7.452066421508789, "learning_rate": 4.852833396685619e-05, "loss": 4.2468, "step": 426 }, { "epoch": 0.11027892561983471, "grad_norm": 2.399571180343628, "learning_rate": 4.852146949632879e-05, "loss": 4.1965, "step": 427 }, { "epoch": 0.11053719008264463, "grad_norm": 5.3663482666015625, "learning_rate": 4.8514589541433714e-05, "loss": 4.3169, "step": 428 }, { "epoch": 0.11079545454545454, "grad_norm": 2.8073689937591553, "learning_rate": 4.85076941067001e-05, "loss": 4.2736, "step": 429 }, { "epoch": 0.11105371900826447, "grad_norm": 2.7947630882263184, "learning_rate": 4.8500783196667256e-05, "loss": 3.9661, "step": 430 }, { "epoch": 0.11131198347107438, "grad_norm": 2.8923792839050293, "learning_rate": 4.84938568158847e-05, "loss": 4.3446, "step": 431 }, { "epoch": 0.1115702479338843, "grad_norm": 5.21348762512207, "learning_rate": 4.8486914968912115e-05, "loss": 4.2574, "step": 432 }, { "epoch": 0.11182851239669421, "grad_norm": 3.487698554992676, "learning_rate": 4.847995766031937e-05, "loss": 4.559, "step": 433 }, { "epoch": 0.11208677685950413, "grad_norm": 2.108779191970825, "learning_rate": 4.847298489468653e-05, "loss": 3.9903, "step": 434 }, { "epoch": 0.11234504132231404, "grad_norm": 9.249544143676758, "learning_rate": 4.846599667660381e-05, "loss": 4.016, "step": 435 }, { "epoch": 0.11260330578512397, "grad_norm": 3.9998090267181396, "learning_rate": 4.8458993010671604e-05, "loss": 3.986, "step": 436 }, { "epoch": 0.11286157024793389, "grad_norm": 3.234348773956299, "learning_rate": 4.845197390150049e-05, "loss": 4.9303, "step": 437 }, { "epoch": 0.1131198347107438, "grad_norm": 3.2988414764404297, "learning_rate": 4.84449393537112e-05, "loss": 4.0984, "step": 438 }, { "epoch": 0.11337809917355372, "grad_norm": 2.350144386291504, "learning_rate": 4.843788937193463e-05, "loss": 4.1802, "step": 439 }, { "epoch": 0.11363636363636363, "grad_norm": 3.0476694107055664, "learning_rate": 4.843082396081185e-05, "loss": 3.8013, "step": 440 }, { "epoch": 0.11389462809917356, "grad_norm": 3.9199581146240234, "learning_rate": 4.842374312499405e-05, "loss": 3.8853, "step": 441 }, { "epoch": 0.11415289256198347, "grad_norm": 3.4650304317474365, "learning_rate": 4.8416646869142614e-05, "loss": 4.2905, "step": 442 }, { "epoch": 0.11441115702479339, "grad_norm": 3.608509063720703, "learning_rate": 4.840953519792907e-05, "loss": 4.3406, "step": 443 }, { "epoch": 0.11466942148760331, "grad_norm": 6.975587844848633, "learning_rate": 4.840240811603508e-05, "loss": 3.5109, "step": 444 }, { "epoch": 0.11492768595041322, "grad_norm": 3.2671689987182617, "learning_rate": 4.8395265628152456e-05, "loss": 4.401, "step": 445 }, { "epoch": 0.11518595041322315, "grad_norm": 3.7310142517089844, "learning_rate": 4.838810773898316e-05, "loss": 4.683, "step": 446 }, { "epoch": 0.11544421487603305, "grad_norm": 5.255064964294434, "learning_rate": 4.838093445323928e-05, "loss": 4.3009, "step": 447 }, { "epoch": 0.11570247933884298, "grad_norm": 3.543912887573242, "learning_rate": 4.8373745775643056e-05, "loss": 4.4647, "step": 448 }, { "epoch": 0.11596074380165289, "grad_norm": 4.361945629119873, "learning_rate": 4.8366541710926825e-05, "loss": 3.9911, "step": 449 }, { "epoch": 0.11621900826446281, "grad_norm": 3.891491651535034, "learning_rate": 4.835932226383312e-05, "loss": 4.0036, "step": 450 }, { "epoch": 0.11647727272727272, "grad_norm": 3.576981782913208, "learning_rate": 4.835208743911452e-05, "loss": 4.4616, "step": 451 }, { "epoch": 0.11673553719008264, "grad_norm": 3.5000741481781006, "learning_rate": 4.834483724153379e-05, "loss": 4.3966, "step": 452 }, { "epoch": 0.11699380165289257, "grad_norm": 2.7199764251708984, "learning_rate": 4.8337571675863794e-05, "loss": 4.302, "step": 453 }, { "epoch": 0.11725206611570248, "grad_norm": 2.793304920196533, "learning_rate": 4.833029074688749e-05, "loss": 4.6235, "step": 454 }, { "epoch": 0.1175103305785124, "grad_norm": 2.70456600189209, "learning_rate": 4.8322994459397983e-05, "loss": 4.249, "step": 455 }, { "epoch": 0.11776859504132231, "grad_norm": 2.725092887878418, "learning_rate": 4.831568281819849e-05, "loss": 4.0957, "step": 456 }, { "epoch": 0.11802685950413223, "grad_norm": 2.3365604877471924, "learning_rate": 4.83083558281023e-05, "loss": 4.1207, "step": 457 }, { "epoch": 0.11828512396694214, "grad_norm": 1.8987153768539429, "learning_rate": 4.8301013493932836e-05, "loss": 4.2206, "step": 458 }, { "epoch": 0.11854338842975207, "grad_norm": 3.3715627193450928, "learning_rate": 4.829365582052362e-05, "loss": 4.4645, "step": 459 }, { "epoch": 0.11880165289256199, "grad_norm": 3.5247058868408203, "learning_rate": 4.8286282812718256e-05, "loss": 4.5088, "step": 460 }, { "epoch": 0.1190599173553719, "grad_norm": 3.0877256393432617, "learning_rate": 4.827889447537045e-05, "loss": 4.1189, "step": 461 }, { "epoch": 0.11931818181818182, "grad_norm": 3.1084835529327393, "learning_rate": 4.827149081334402e-05, "loss": 4.8253, "step": 462 }, { "epoch": 0.11957644628099173, "grad_norm": 2.406656265258789, "learning_rate": 4.826407183151284e-05, "loss": 4.0015, "step": 463 }, { "epoch": 0.11983471074380166, "grad_norm": 3.146512746810913, "learning_rate": 4.82566375347609e-05, "loss": 4.1191, "step": 464 }, { "epoch": 0.12009297520661157, "grad_norm": 3.582761764526367, "learning_rate": 4.824918792798222e-05, "loss": 4.1178, "step": 465 }, { "epoch": 0.12035123966942149, "grad_norm": 3.4898977279663086, "learning_rate": 4.8241723016080974e-05, "loss": 4.5054, "step": 466 }, { "epoch": 0.1206095041322314, "grad_norm": 2.237847089767456, "learning_rate": 4.8234242803971366e-05, "loss": 4.3795, "step": 467 }, { "epoch": 0.12086776859504132, "grad_norm": 2.151704788208008, "learning_rate": 4.822674729657765e-05, "loss": 4.2423, "step": 468 }, { "epoch": 0.12112603305785125, "grad_norm": 1.990654706954956, "learning_rate": 4.821923649883421e-05, "loss": 4.7203, "step": 469 }, { "epoch": 0.12138429752066116, "grad_norm": 2.3743534088134766, "learning_rate": 4.821171041568544e-05, "loss": 4.8761, "step": 470 }, { "epoch": 0.12164256198347108, "grad_norm": 3.448927879333496, "learning_rate": 4.820416905208583e-05, "loss": 3.9047, "step": 471 }, { "epoch": 0.12190082644628099, "grad_norm": 3.4794440269470215, "learning_rate": 4.819661241299992e-05, "loss": 4.4389, "step": 472 }, { "epoch": 0.12215909090909091, "grad_norm": 4.219357013702393, "learning_rate": 4.81890405034023e-05, "loss": 4.3475, "step": 473 }, { "epoch": 0.12241735537190082, "grad_norm": 2.9895553588867188, "learning_rate": 4.818145332827762e-05, "loss": 3.8559, "step": 474 }, { "epoch": 0.12267561983471074, "grad_norm": 2.6828505992889404, "learning_rate": 4.817385089262058e-05, "loss": 4.3463, "step": 475 }, { "epoch": 0.12293388429752067, "grad_norm": 6.0823283195495605, "learning_rate": 4.816623320143591e-05, "loss": 3.4472, "step": 476 }, { "epoch": 0.12319214876033058, "grad_norm": 2.9521889686584473, "learning_rate": 4.8158600259738415e-05, "loss": 4.2892, "step": 477 }, { "epoch": 0.1234504132231405, "grad_norm": 3.7767374515533447, "learning_rate": 4.81509520725529e-05, "loss": 4.1529, "step": 478 }, { "epoch": 0.12370867768595041, "grad_norm": 5.446708679199219, "learning_rate": 4.814328864491425e-05, "loss": 4.594, "step": 479 }, { "epoch": 0.12396694214876033, "grad_norm": 2.639927864074707, "learning_rate": 4.8135609981867336e-05, "loss": 4.1356, "step": 480 }, { "epoch": 0.12422520661157024, "grad_norm": 6.366829872131348, "learning_rate": 4.81279160884671e-05, "loss": 4.0635, "step": 481 }, { "epoch": 0.12448347107438017, "grad_norm": 4.173468112945557, "learning_rate": 4.812020696977848e-05, "loss": 4.8007, "step": 482 }, { "epoch": 0.12474173553719008, "grad_norm": 3.13634991645813, "learning_rate": 4.8112482630876456e-05, "loss": 4.0859, "step": 483 }, { "epoch": 0.125, "grad_norm": 6.823054790496826, "learning_rate": 4.810474307684601e-05, "loss": 4.2597, "step": 484 }, { "epoch": 0.1252582644628099, "grad_norm": 4.3503313064575195, "learning_rate": 4.8096988312782174e-05, "loss": 3.766, "step": 485 }, { "epoch": 0.12551652892561985, "grad_norm": 3.1858136653900146, "learning_rate": 4.808921834378994e-05, "loss": 4.2493, "step": 486 }, { "epoch": 0.12577479338842976, "grad_norm": 2.075404644012451, "learning_rate": 4.808143317498437e-05, "loss": 4.1571, "step": 487 }, { "epoch": 0.12603305785123967, "grad_norm": 7.117515563964844, "learning_rate": 4.8073632811490483e-05, "loss": 4.2658, "step": 488 }, { "epoch": 0.12629132231404958, "grad_norm": 5.851067543029785, "learning_rate": 4.806581725844332e-05, "loss": 3.8833, "step": 489 }, { "epoch": 0.1265495867768595, "grad_norm": 2.58473539352417, "learning_rate": 4.805798652098793e-05, "loss": 4.2675, "step": 490 }, { "epoch": 0.12680785123966942, "grad_norm": 4.994014739990234, "learning_rate": 4.8050140604279334e-05, "loss": 4.1219, "step": 491 }, { "epoch": 0.12706611570247933, "grad_norm": 3.336108446121216, "learning_rate": 4.804227951348258e-05, "loss": 3.5849, "step": 492 }, { "epoch": 0.12732438016528927, "grad_norm": 5.2521891593933105, "learning_rate": 4.803440325377267e-05, "loss": 3.5624, "step": 493 }, { "epoch": 0.12758264462809918, "grad_norm": 2.2049899101257324, "learning_rate": 4.802651183033461e-05, "loss": 4.1777, "step": 494 }, { "epoch": 0.1278409090909091, "grad_norm": 2.7883236408233643, "learning_rate": 4.801860524836339e-05, "loss": 4.6746, "step": 495 }, { "epoch": 0.128099173553719, "grad_norm": 2.4612860679626465, "learning_rate": 4.801068351306397e-05, "loss": 4.6662, "step": 496 }, { "epoch": 0.12835743801652894, "grad_norm": 2.9493658542633057, "learning_rate": 4.80027466296513e-05, "loss": 4.1396, "step": 497 }, { "epoch": 0.12861570247933884, "grad_norm": 2.364104747772217, "learning_rate": 4.7994794603350276e-05, "loss": 4.1861, "step": 498 }, { "epoch": 0.12887396694214875, "grad_norm": 4.943397045135498, "learning_rate": 4.798682743939581e-05, "loss": 4.0795, "step": 499 }, { "epoch": 0.12913223140495866, "grad_norm": 4.84872579574585, "learning_rate": 4.7978845143032706e-05, "loss": 4.3015, "step": 500 }, { "epoch": 0.1293904958677686, "grad_norm": 4.4985809326171875, "learning_rate": 4.79708477195158e-05, "loss": 4.1386, "step": 501 }, { "epoch": 0.1296487603305785, "grad_norm": 3.8653507232666016, "learning_rate": 4.796283517410986e-05, "loss": 4.2541, "step": 502 }, { "epoch": 0.12990702479338842, "grad_norm": 4.245180606842041, "learning_rate": 4.7954807512089594e-05, "loss": 4.2266, "step": 503 }, { "epoch": 0.13016528925619836, "grad_norm": 4.227993965148926, "learning_rate": 4.7946764738739685e-05, "loss": 3.7684, "step": 504 }, { "epoch": 0.13042355371900827, "grad_norm": 3.8329241275787354, "learning_rate": 4.793870685935475e-05, "loss": 4.3012, "step": 505 }, { "epoch": 0.13068181818181818, "grad_norm": 2.927421808242798, "learning_rate": 4.793063387923935e-05, "loss": 4.385, "step": 506 }, { "epoch": 0.1309400826446281, "grad_norm": 3.89184832572937, "learning_rate": 4.7922545803707995e-05, "loss": 4.4167, "step": 507 }, { "epoch": 0.13119834710743802, "grad_norm": 2.7736072540283203, "learning_rate": 4.791444263808513e-05, "loss": 3.9269, "step": 508 }, { "epoch": 0.13145661157024793, "grad_norm": 2.4557313919067383, "learning_rate": 4.790632438770513e-05, "loss": 4.3115, "step": 509 }, { "epoch": 0.13171487603305784, "grad_norm": 5.68839168548584, "learning_rate": 4.7898191057912304e-05, "loss": 3.9793, "step": 510 }, { "epoch": 0.13197314049586778, "grad_norm": 3.0405900478363037, "learning_rate": 4.7890042654060884e-05, "loss": 4.3696, "step": 511 }, { "epoch": 0.1322314049586777, "grad_norm": 3.639803886413574, "learning_rate": 4.7881879181515036e-05, "loss": 3.9986, "step": 512 }, { "epoch": 0.1324896694214876, "grad_norm": 3.1459505558013916, "learning_rate": 4.787370064564883e-05, "loss": 4.1287, "step": 513 }, { "epoch": 0.1327479338842975, "grad_norm": 5.266561985015869, "learning_rate": 4.786550705184626e-05, "loss": 4.2881, "step": 514 }, { "epoch": 0.13300619834710745, "grad_norm": 4.25514030456543, "learning_rate": 4.7857298405501236e-05, "loss": 4.6326, "step": 515 }, { "epoch": 0.13326446280991736, "grad_norm": 2.5748279094696045, "learning_rate": 4.784907471201758e-05, "loss": 4.6321, "step": 516 }, { "epoch": 0.13352272727272727, "grad_norm": 3.6999270915985107, "learning_rate": 4.7840835976809007e-05, "loss": 4.4276, "step": 517 }, { "epoch": 0.1337809917355372, "grad_norm": 5.31508207321167, "learning_rate": 4.7832582205299134e-05, "loss": 4.3547, "step": 518 }, { "epoch": 0.1340392561983471, "grad_norm": 4.474201202392578, "learning_rate": 4.7824313402921495e-05, "loss": 4.0976, "step": 519 }, { "epoch": 0.13429752066115702, "grad_norm": 3.583083152770996, "learning_rate": 4.7816029575119505e-05, "loss": 4.4159, "step": 520 }, { "epoch": 0.13455578512396693, "grad_norm": 2.71618390083313, "learning_rate": 4.780773072734647e-05, "loss": 3.7429, "step": 521 }, { "epoch": 0.13481404958677687, "grad_norm": 3.0197532176971436, "learning_rate": 4.7799416865065586e-05, "loss": 3.9968, "step": 522 }, { "epoch": 0.13507231404958678, "grad_norm": 4.843269348144531, "learning_rate": 4.779108799374994e-05, "loss": 3.9914, "step": 523 }, { "epoch": 0.1353305785123967, "grad_norm": 3.018805980682373, "learning_rate": 4.778274411888248e-05, "loss": 3.7576, "step": 524 }, { "epoch": 0.13558884297520662, "grad_norm": 2.8648924827575684, "learning_rate": 4.777438524595607e-05, "loss": 4.2871, "step": 525 }, { "epoch": 0.13584710743801653, "grad_norm": 3.9527881145477295, "learning_rate": 4.776601138047338e-05, "loss": 4.9434, "step": 526 }, { "epoch": 0.13610537190082644, "grad_norm": 4.133703231811523, "learning_rate": 4.7757622527947035e-05, "loss": 4.3546, "step": 527 }, { "epoch": 0.13636363636363635, "grad_norm": 2.8476991653442383, "learning_rate": 4.7749218693899456e-05, "loss": 4.1503, "step": 528 }, { "epoch": 0.1366219008264463, "grad_norm": 3.7078473567962646, "learning_rate": 4.774079988386296e-05, "loss": 4.2078, "step": 529 }, { "epoch": 0.1368801652892562, "grad_norm": 20.020771026611328, "learning_rate": 4.773236610337972e-05, "loss": 4.0211, "step": 530 }, { "epoch": 0.1371384297520661, "grad_norm": 4.032477855682373, "learning_rate": 4.772391735800175e-05, "loss": 4.1135, "step": 531 }, { "epoch": 0.13739669421487602, "grad_norm": 3.2555131912231445, "learning_rate": 4.771545365329093e-05, "loss": 4.3735, "step": 532 }, { "epoch": 0.13765495867768596, "grad_norm": 4.997492790222168, "learning_rate": 4.7706974994818985e-05, "loss": 3.3815, "step": 533 }, { "epoch": 0.13791322314049587, "grad_norm": 2.776726722717285, "learning_rate": 4.7698481388167484e-05, "loss": 4.0008, "step": 534 }, { "epoch": 0.13817148760330578, "grad_norm": 3.686534881591797, "learning_rate": 4.7689972838927834e-05, "loss": 3.7231, "step": 535 }, { "epoch": 0.1384297520661157, "grad_norm": 5.7028489112854, "learning_rate": 4.768144935270127e-05, "loss": 4.0767, "step": 536 }, { "epoch": 0.13868801652892562, "grad_norm": 2.753481388092041, "learning_rate": 4.767291093509888e-05, "loss": 3.5691, "step": 537 }, { "epoch": 0.13894628099173553, "grad_norm": 3.262737274169922, "learning_rate": 4.7664357591741574e-05, "loss": 4.1927, "step": 538 }, { "epoch": 0.13920454545454544, "grad_norm": 4.598397731781006, "learning_rate": 4.765578932826007e-05, "loss": 4.5123, "step": 539 }, { "epoch": 0.13946280991735538, "grad_norm": 3.0009686946868896, "learning_rate": 4.7647206150294934e-05, "loss": 4.0751, "step": 540 }, { "epoch": 0.1397210743801653, "grad_norm": 3.3911259174346924, "learning_rate": 4.7638608063496534e-05, "loss": 4.5956, "step": 541 }, { "epoch": 0.1399793388429752, "grad_norm": 3.0753774642944336, "learning_rate": 4.762999507352507e-05, "loss": 4.1012, "step": 542 }, { "epoch": 0.14023760330578514, "grad_norm": 4.775676250457764, "learning_rate": 4.762136718605054e-05, "loss": 4.689, "step": 543 }, { "epoch": 0.14049586776859505, "grad_norm": 6.635494232177734, "learning_rate": 4.761272440675272e-05, "loss": 3.7701, "step": 544 }, { "epoch": 0.14075413223140495, "grad_norm": 4.115767478942871, "learning_rate": 4.760406674132126e-05, "loss": 4.3061, "step": 545 }, { "epoch": 0.14101239669421486, "grad_norm": 6.918871879577637, "learning_rate": 4.759539419545555e-05, "loss": 4.1871, "step": 546 }, { "epoch": 0.1412706611570248, "grad_norm": 3.60444712638855, "learning_rate": 4.7586706774864784e-05, "loss": 4.123, "step": 547 }, { "epoch": 0.1415289256198347, "grad_norm": 6.308271884918213, "learning_rate": 4.7578004485267976e-05, "loss": 4.3284, "step": 548 }, { "epoch": 0.14178719008264462, "grad_norm": 2.4336934089660645, "learning_rate": 4.7569287332393906e-05, "loss": 4.1276, "step": 549 }, { "epoch": 0.14204545454545456, "grad_norm": 2.955442428588867, "learning_rate": 4.756055532198114e-05, "loss": 4.2672, "step": 550 }, { "epoch": 0.14230371900826447, "grad_norm": 4.593275547027588, "learning_rate": 4.755180845977804e-05, "loss": 4.2629, "step": 551 }, { "epoch": 0.14256198347107438, "grad_norm": 4.422439098358154, "learning_rate": 4.7543046751542716e-05, "loss": 3.8406, "step": 552 }, { "epoch": 0.1428202479338843, "grad_norm": 9.05133056640625, "learning_rate": 4.7534270203043096e-05, "loss": 4.1609, "step": 553 }, { "epoch": 0.14307851239669422, "grad_norm": 3.1181893348693848, "learning_rate": 4.752547882005682e-05, "loss": 4.3597, "step": 554 }, { "epoch": 0.14333677685950413, "grad_norm": 4.914687156677246, "learning_rate": 4.751667260837135e-05, "loss": 4.4082, "step": 555 }, { "epoch": 0.14359504132231404, "grad_norm": 7.492733955383301, "learning_rate": 4.750785157378387e-05, "loss": 4.7121, "step": 556 }, { "epoch": 0.14385330578512398, "grad_norm": 2.793487071990967, "learning_rate": 4.7499015722101335e-05, "loss": 4.5668, "step": 557 }, { "epoch": 0.1441115702479339, "grad_norm": 3.479344129562378, "learning_rate": 4.749016505914047e-05, "loss": 4.3117, "step": 558 }, { "epoch": 0.1443698347107438, "grad_norm": 3.494668483734131, "learning_rate": 4.7481299590727716e-05, "loss": 4.2655, "step": 559 }, { "epoch": 0.1446280991735537, "grad_norm": 3.34570050239563, "learning_rate": 4.7472419322699294e-05, "loss": 3.8629, "step": 560 }, { "epoch": 0.14488636363636365, "grad_norm": 4.16757345199585, "learning_rate": 4.746352426090114e-05, "loss": 4.2321, "step": 561 }, { "epoch": 0.14514462809917356, "grad_norm": 2.886615753173828, "learning_rate": 4.7454614411188956e-05, "loss": 4.1649, "step": 562 }, { "epoch": 0.14540289256198347, "grad_norm": 3.0501339435577393, "learning_rate": 4.744568977942817e-05, "loss": 4.1037, "step": 563 }, { "epoch": 0.14566115702479338, "grad_norm": 4.805057525634766, "learning_rate": 4.743675037149393e-05, "loss": 4.1057, "step": 564 }, { "epoch": 0.1459194214876033, "grad_norm": 2.6603481769561768, "learning_rate": 4.7427796193271115e-05, "loss": 4.4278, "step": 565 }, { "epoch": 0.14617768595041322, "grad_norm": 2.906229257583618, "learning_rate": 4.741882725065433e-05, "loss": 4.2971, "step": 566 }, { "epoch": 0.14643595041322313, "grad_norm": 2.9982309341430664, "learning_rate": 4.740984354954791e-05, "loss": 4.2127, "step": 567 }, { "epoch": 0.14669421487603307, "grad_norm": 3.1446757316589355, "learning_rate": 4.740084509586588e-05, "loss": 4.7396, "step": 568 }, { "epoch": 0.14695247933884298, "grad_norm": 3.8845112323760986, "learning_rate": 4.739183189553201e-05, "loss": 4.1259, "step": 569 }, { "epoch": 0.1472107438016529, "grad_norm": 3.0983452796936035, "learning_rate": 4.7382803954479745e-05, "loss": 4.0158, "step": 570 }, { "epoch": 0.1474690082644628, "grad_norm": 3.0103871822357178, "learning_rate": 4.737376127865226e-05, "loss": 4.0286, "step": 571 }, { "epoch": 0.14772727272727273, "grad_norm": 3.1424522399902344, "learning_rate": 4.736470387400243e-05, "loss": 4.7572, "step": 572 }, { "epoch": 0.14798553719008264, "grad_norm": 5.305017471313477, "learning_rate": 4.735563174649278e-05, "loss": 4.7386, "step": 573 }, { "epoch": 0.14824380165289255, "grad_norm": 3.1116836071014404, "learning_rate": 4.73465449020956e-05, "loss": 4.2641, "step": 574 }, { "epoch": 0.1485020661157025, "grad_norm": 3.2409989833831787, "learning_rate": 4.733744334679281e-05, "loss": 4.529, "step": 575 }, { "epoch": 0.1487603305785124, "grad_norm": 11.367789268493652, "learning_rate": 4.732832708657604e-05, "loss": 3.9992, "step": 576 }, { "epoch": 0.1490185950413223, "grad_norm": 2.5663325786590576, "learning_rate": 4.73191961274466e-05, "loss": 4.5928, "step": 577 }, { "epoch": 0.14927685950413222, "grad_norm": 2.782188892364502, "learning_rate": 4.731005047541546e-05, "loss": 4.5828, "step": 578 }, { "epoch": 0.14953512396694216, "grad_norm": 4.093466281890869, "learning_rate": 4.730089013650328e-05, "loss": 4.0301, "step": 579 }, { "epoch": 0.14979338842975207, "grad_norm": 4.196178913116455, "learning_rate": 4.729171511674039e-05, "loss": 4.4993, "step": 580 }, { "epoch": 0.15005165289256198, "grad_norm": 3.0709402561187744, "learning_rate": 4.728252542216678e-05, "loss": 4.164, "step": 581 }, { "epoch": 0.1503099173553719, "grad_norm": 6.842238903045654, "learning_rate": 4.727332105883208e-05, "loss": 3.8465, "step": 582 }, { "epoch": 0.15056818181818182, "grad_norm": 2.660494327545166, "learning_rate": 4.726410203279561e-05, "loss": 4.2335, "step": 583 }, { "epoch": 0.15082644628099173, "grad_norm": 2.9883899688720703, "learning_rate": 4.725486835012632e-05, "loss": 4.4023, "step": 584 }, { "epoch": 0.15108471074380164, "grad_norm": 3.8854992389678955, "learning_rate": 4.724562001690282e-05, "loss": 4.8016, "step": 585 }, { "epoch": 0.15134297520661158, "grad_norm": 2.838876485824585, "learning_rate": 4.723635703921335e-05, "loss": 4.6217, "step": 586 }, { "epoch": 0.1516012396694215, "grad_norm": 3.1815407276153564, "learning_rate": 4.722707942315582e-05, "loss": 4.8413, "step": 587 }, { "epoch": 0.1518595041322314, "grad_norm": 3.883784055709839, "learning_rate": 4.721778717483774e-05, "loss": 3.8104, "step": 588 }, { "epoch": 0.15211776859504134, "grad_norm": 4.636765003204346, "learning_rate": 4.720848030037628e-05, "loss": 3.8442, "step": 589 }, { "epoch": 0.15237603305785125, "grad_norm": 2.606038808822632, "learning_rate": 4.7199158805898214e-05, "loss": 3.8259, "step": 590 }, { "epoch": 0.15263429752066116, "grad_norm": 3.5006654262542725, "learning_rate": 4.7189822697539974e-05, "loss": 3.9808, "step": 591 }, { "epoch": 0.15289256198347106, "grad_norm": 3.0077250003814697, "learning_rate": 4.718047198144758e-05, "loss": 4.1247, "step": 592 }, { "epoch": 0.153150826446281, "grad_norm": 2.5759973526000977, "learning_rate": 4.7171106663776694e-05, "loss": 4.2819, "step": 593 }, { "epoch": 0.1534090909090909, "grad_norm": 2.466933488845825, "learning_rate": 4.716172675069256e-05, "loss": 4.6462, "step": 594 }, { "epoch": 0.15366735537190082, "grad_norm": 3.578263759613037, "learning_rate": 4.715233224837007e-05, "loss": 4.0663, "step": 595 }, { "epoch": 0.15392561983471073, "grad_norm": 2.1775879859924316, "learning_rate": 4.714292316299368e-05, "loss": 3.6588, "step": 596 }, { "epoch": 0.15418388429752067, "grad_norm": 5.9521379470825195, "learning_rate": 4.7133499500757465e-05, "loss": 3.993, "step": 597 }, { "epoch": 0.15444214876033058, "grad_norm": 2.341494083404541, "learning_rate": 4.712406126786511e-05, "loss": 4.1119, "step": 598 }, { "epoch": 0.1547004132231405, "grad_norm": 2.360034465789795, "learning_rate": 4.711460847052987e-05, "loss": 4.1812, "step": 599 }, { "epoch": 0.15495867768595042, "grad_norm": 2.214731216430664, "learning_rate": 4.7105141114974595e-05, "loss": 4.3453, "step": 600 }, { "epoch": 0.15521694214876033, "grad_norm": 3.2088797092437744, "learning_rate": 4.709565920743172e-05, "loss": 4.4623, "step": 601 }, { "epoch": 0.15547520661157024, "grad_norm": 3.223482847213745, "learning_rate": 4.7086162754143255e-05, "loss": 4.713, "step": 602 }, { "epoch": 0.15573347107438015, "grad_norm": 3.694664716720581, "learning_rate": 4.707665176136079e-05, "loss": 3.6768, "step": 603 }, { "epoch": 0.1559917355371901, "grad_norm": 3.1965219974517822, "learning_rate": 4.70671262353455e-05, "loss": 4.795, "step": 604 }, { "epoch": 0.15625, "grad_norm": 17.701778411865234, "learning_rate": 4.7057586182368094e-05, "loss": 4.4679, "step": 605 }, { "epoch": 0.1565082644628099, "grad_norm": 2.7801413536071777, "learning_rate": 4.7048031608708876e-05, "loss": 3.9044, "step": 606 }, { "epoch": 0.15676652892561985, "grad_norm": 4.230818271636963, "learning_rate": 4.703846252065769e-05, "loss": 4.096, "step": 607 }, { "epoch": 0.15702479338842976, "grad_norm": 3.1791999340057373, "learning_rate": 4.702887892451395e-05, "loss": 4.2882, "step": 608 }, { "epoch": 0.15728305785123967, "grad_norm": 2.3889377117156982, "learning_rate": 4.7019280826586606e-05, "loss": 4.2898, "step": 609 }, { "epoch": 0.15754132231404958, "grad_norm": 4.633270263671875, "learning_rate": 4.7009668233194165e-05, "loss": 3.4976, "step": 610 }, { "epoch": 0.1577995867768595, "grad_norm": 2.9156076908111572, "learning_rate": 4.700004115066467e-05, "loss": 4.1398, "step": 611 }, { "epoch": 0.15805785123966942, "grad_norm": 4.672552585601807, "learning_rate": 4.6990399585335695e-05, "loss": 4.5429, "step": 612 }, { "epoch": 0.15831611570247933, "grad_norm": 1.7687879800796509, "learning_rate": 4.698074354355438e-05, "loss": 4.5378, "step": 613 }, { "epoch": 0.15857438016528927, "grad_norm": 2.956759452819824, "learning_rate": 4.697107303167735e-05, "loss": 4.1556, "step": 614 }, { "epoch": 0.15883264462809918, "grad_norm": 2.5030198097229004, "learning_rate": 4.69613880560708e-05, "loss": 3.9833, "step": 615 }, { "epoch": 0.1590909090909091, "grad_norm": 4.137096881866455, "learning_rate": 4.695168862311041e-05, "loss": 4.416, "step": 616 }, { "epoch": 0.159349173553719, "grad_norm": 2.970672369003296, "learning_rate": 4.6941974739181395e-05, "loss": 4.1228, "step": 617 }, { "epoch": 0.15960743801652894, "grad_norm": 4.19533109664917, "learning_rate": 4.693224641067848e-05, "loss": 4.3508, "step": 618 }, { "epoch": 0.15986570247933884, "grad_norm": 3.895484209060669, "learning_rate": 4.692250364400591e-05, "loss": 4.1751, "step": 619 }, { "epoch": 0.16012396694214875, "grad_norm": 2.9780566692352295, "learning_rate": 4.6912746445577414e-05, "loss": 4.5123, "step": 620 }, { "epoch": 0.16038223140495866, "grad_norm": 11.878155708312988, "learning_rate": 4.6902974821816245e-05, "loss": 4.5698, "step": 621 }, { "epoch": 0.1606404958677686, "grad_norm": 2.7093777656555176, "learning_rate": 4.689318877915512e-05, "loss": 4.1999, "step": 622 }, { "epoch": 0.1608987603305785, "grad_norm": 2.0223894119262695, "learning_rate": 4.688338832403628e-05, "loss": 4.0967, "step": 623 }, { "epoch": 0.16115702479338842, "grad_norm": 1.727833867073059, "learning_rate": 4.6873573462911445e-05, "loss": 4.3927, "step": 624 }, { "epoch": 0.16141528925619836, "grad_norm": 3.4175846576690674, "learning_rate": 4.6863744202241805e-05, "loss": 4.2432, "step": 625 }, { "epoch": 0.16167355371900827, "grad_norm": 3.4108035564422607, "learning_rate": 4.6853900548498044e-05, "loss": 3.9887, "step": 626 }, { "epoch": 0.16193181818181818, "grad_norm": 3.1254096031188965, "learning_rate": 4.6844042508160316e-05, "loss": 3.8505, "step": 627 }, { "epoch": 0.1621900826446281, "grad_norm": 3.645775556564331, "learning_rate": 4.6834170087718255e-05, "loss": 4.0256, "step": 628 }, { "epoch": 0.16244834710743802, "grad_norm": 2.1857399940490723, "learning_rate": 4.6824283293670936e-05, "loss": 3.8484, "step": 629 }, { "epoch": 0.16270661157024793, "grad_norm": 6.26563024520874, "learning_rate": 4.681438213252692e-05, "loss": 3.8437, "step": 630 }, { "epoch": 0.16296487603305784, "grad_norm": 3.197622299194336, "learning_rate": 4.680446661080422e-05, "loss": 4.3491, "step": 631 }, { "epoch": 0.16322314049586778, "grad_norm": 7.646901607513428, "learning_rate": 4.67945367350303e-05, "loss": 4.7124, "step": 632 }, { "epoch": 0.1634814049586777, "grad_norm": 3.271996259689331, "learning_rate": 4.678459251174209e-05, "loss": 4.0133, "step": 633 }, { "epoch": 0.1637396694214876, "grad_norm": 3.3476314544677734, "learning_rate": 4.6774633947485926e-05, "loss": 4.6516, "step": 634 }, { "epoch": 0.1639979338842975, "grad_norm": 3.8009138107299805, "learning_rate": 4.6764661048817615e-05, "loss": 4.2809, "step": 635 }, { "epoch": 0.16425619834710745, "grad_norm": 6.159664630889893, "learning_rate": 4.6754673822302405e-05, "loss": 4.0684, "step": 636 }, { "epoch": 0.16451446280991736, "grad_norm": 3.7636635303497314, "learning_rate": 4.674467227451496e-05, "loss": 4.4107, "step": 637 }, { "epoch": 0.16477272727272727, "grad_norm": 2.411583185195923, "learning_rate": 4.673465641203938e-05, "loss": 4.4635, "step": 638 }, { "epoch": 0.1650309917355372, "grad_norm": 3.1356215476989746, "learning_rate": 4.6724626241469186e-05, "loss": 4.3313, "step": 639 }, { "epoch": 0.1652892561983471, "grad_norm": 4.462066650390625, "learning_rate": 4.67145817694073e-05, "loss": 4.5629, "step": 640 }, { "epoch": 0.16554752066115702, "grad_norm": 7.024421215057373, "learning_rate": 4.67045230024661e-05, "loss": 4.6201, "step": 641 }, { "epoch": 0.16580578512396693, "grad_norm": 4.363746643066406, "learning_rate": 4.6694449947267335e-05, "loss": 4.2096, "step": 642 }, { "epoch": 0.16606404958677687, "grad_norm": 2.493779182434082, "learning_rate": 4.66843626104422e-05, "loss": 4.1464, "step": 643 }, { "epoch": 0.16632231404958678, "grad_norm": 2.40006947517395, "learning_rate": 4.667426099863124e-05, "loss": 4.2154, "step": 644 }, { "epoch": 0.1665805785123967, "grad_norm": 2.4670660495758057, "learning_rate": 4.666414511848443e-05, "loss": 4.3675, "step": 645 }, { "epoch": 0.16683884297520662, "grad_norm": 3.030200958251953, "learning_rate": 4.665401497666115e-05, "loss": 4.3452, "step": 646 }, { "epoch": 0.16709710743801653, "grad_norm": 2.5324289798736572, "learning_rate": 4.664387057983014e-05, "loss": 4.5135, "step": 647 }, { "epoch": 0.16735537190082644, "grad_norm": 5.2699689865112305, "learning_rate": 4.6633711934669535e-05, "loss": 4.1329, "step": 648 }, { "epoch": 0.16761363636363635, "grad_norm": 3.070150136947632, "learning_rate": 4.662353904786685e-05, "loss": 4.2605, "step": 649 }, { "epoch": 0.1678719008264463, "grad_norm": 2.018260955810547, "learning_rate": 4.6613351926118986e-05, "loss": 4.3207, "step": 650 }, { "epoch": 0.1681301652892562, "grad_norm": 4.301268577575684, "learning_rate": 4.660315057613219e-05, "loss": 4.1837, "step": 651 }, { "epoch": 0.1683884297520661, "grad_norm": 3.084486722946167, "learning_rate": 4.6592935004622104e-05, "loss": 4.467, "step": 652 }, { "epoch": 0.16864669421487602, "grad_norm": 3.2445919513702393, "learning_rate": 4.6582705218313714e-05, "loss": 4.1945, "step": 653 }, { "epoch": 0.16890495867768596, "grad_norm": 3.917997360229492, "learning_rate": 4.6572461223941364e-05, "loss": 4.2247, "step": 654 }, { "epoch": 0.16916322314049587, "grad_norm": 4.514252662658691, "learning_rate": 4.656220302824877e-05, "loss": 3.8777, "step": 655 }, { "epoch": 0.16942148760330578, "grad_norm": 3.1491525173187256, "learning_rate": 4.655193063798896e-05, "loss": 4.1507, "step": 656 }, { "epoch": 0.1696797520661157, "grad_norm": 2.923213005065918, "learning_rate": 4.654164405992435e-05, "loss": 4.3596, "step": 657 }, { "epoch": 0.16993801652892562, "grad_norm": 3.1053693294525146, "learning_rate": 4.653134330082666e-05, "loss": 3.8403, "step": 658 }, { "epoch": 0.17019628099173553, "grad_norm": 2.7208895683288574, "learning_rate": 4.652102836747697e-05, "loss": 5.0034, "step": 659 }, { "epoch": 0.17045454545454544, "grad_norm": 7.22312593460083, "learning_rate": 4.651069926666568e-05, "loss": 3.8389, "step": 660 }, { "epoch": 0.17071280991735538, "grad_norm": 4.367091178894043, "learning_rate": 4.6500356005192514e-05, "loss": 4.3774, "step": 661 }, { "epoch": 0.1709710743801653, "grad_norm": 2.5496153831481934, "learning_rate": 4.6489998589866526e-05, "loss": 3.9707, "step": 662 }, { "epoch": 0.1712293388429752, "grad_norm": 2.692811965942383, "learning_rate": 4.6479627027506075e-05, "loss": 4.3195, "step": 663 }, { "epoch": 0.17148760330578514, "grad_norm": 4.2653326988220215, "learning_rate": 4.646924132493886e-05, "loss": 4.4219, "step": 664 }, { "epoch": 0.17174586776859505, "grad_norm": 3.749643325805664, "learning_rate": 4.645884148900185e-05, "loss": 4.2087, "step": 665 }, { "epoch": 0.17200413223140495, "grad_norm": 3.634293794631958, "learning_rate": 4.6448427526541356e-05, "loss": 4.5318, "step": 666 }, { "epoch": 0.17226239669421486, "grad_norm": 3.1257143020629883, "learning_rate": 4.6437999444412955e-05, "loss": 4.071, "step": 667 }, { "epoch": 0.1725206611570248, "grad_norm": 3.048043966293335, "learning_rate": 4.6427557249481545e-05, "loss": 4.3329, "step": 668 }, { "epoch": 0.1727789256198347, "grad_norm": 3.4774153232574463, "learning_rate": 4.641710094862131e-05, "loss": 4.1681, "step": 669 }, { "epoch": 0.17303719008264462, "grad_norm": 2.093449115753174, "learning_rate": 4.640663054871569e-05, "loss": 4.0665, "step": 670 }, { "epoch": 0.17329545454545456, "grad_norm": 2.9763972759246826, "learning_rate": 4.6396146056657465e-05, "loss": 3.8579, "step": 671 }, { "epoch": 0.17355371900826447, "grad_norm": 3.788087844848633, "learning_rate": 4.638564747934863e-05, "loss": 4.2824, "step": 672 }, { "epoch": 0.17381198347107438, "grad_norm": 2.7998640537261963, "learning_rate": 4.6375134823700505e-05, "loss": 4.1895, "step": 673 }, { "epoch": 0.1740702479338843, "grad_norm": 2.954819679260254, "learning_rate": 4.6364608096633634e-05, "loss": 4.0323, "step": 674 }, { "epoch": 0.17432851239669422, "grad_norm": 3.3264992237091064, "learning_rate": 4.6354067305077856e-05, "loss": 4.0716, "step": 675 }, { "epoch": 0.17458677685950413, "grad_norm": 3.264514684677124, "learning_rate": 4.6343512455972236e-05, "loss": 4.7946, "step": 676 }, { "epoch": 0.17484504132231404, "grad_norm": 4.02720308303833, "learning_rate": 4.6332943556265147e-05, "loss": 4.3998, "step": 677 }, { "epoch": 0.17510330578512398, "grad_norm": 4.299478054046631, "learning_rate": 4.632236061291416e-05, "loss": 3.8476, "step": 678 }, { "epoch": 0.1753615702479339, "grad_norm": 2.53835129737854, "learning_rate": 4.63117636328861e-05, "loss": 4.3206, "step": 679 }, { "epoch": 0.1756198347107438, "grad_norm": 3.675610065460205, "learning_rate": 4.630115262315706e-05, "loss": 4.0801, "step": 680 }, { "epoch": 0.1758780991735537, "grad_norm": 4.361423969268799, "learning_rate": 4.629052759071234e-05, "loss": 4.2555, "step": 681 }, { "epoch": 0.17613636363636365, "grad_norm": 5.225686550140381, "learning_rate": 4.627988854254649e-05, "loss": 4.2284, "step": 682 }, { "epoch": 0.17639462809917356, "grad_norm": 5.003320217132568, "learning_rate": 4.626923548566328e-05, "loss": 4.0445, "step": 683 }, { "epoch": 0.17665289256198347, "grad_norm": 5.266183853149414, "learning_rate": 4.6258568427075676e-05, "loss": 4.0353, "step": 684 }, { "epoch": 0.17691115702479338, "grad_norm": 2.965686321258545, "learning_rate": 4.624788737380592e-05, "loss": 4.1524, "step": 685 }, { "epoch": 0.1771694214876033, "grad_norm": 3.752406597137451, "learning_rate": 4.6237192332885415e-05, "loss": 4.5414, "step": 686 }, { "epoch": 0.17742768595041322, "grad_norm": 4.472149848937988, "learning_rate": 4.62264833113548e-05, "loss": 3.8797, "step": 687 }, { "epoch": 0.17768595041322313, "grad_norm": 6.891357421875, "learning_rate": 4.621576031626389e-05, "loss": 4.2332, "step": 688 }, { "epoch": 0.17794421487603307, "grad_norm": 3.618795394897461, "learning_rate": 4.620502335467174e-05, "loss": 4.5693, "step": 689 }, { "epoch": 0.17820247933884298, "grad_norm": 2.2550673484802246, "learning_rate": 4.619427243364656e-05, "loss": 4.2346, "step": 690 }, { "epoch": 0.1784607438016529, "grad_norm": 2.8594627380371094, "learning_rate": 4.618350756026576e-05, "loss": 4.3209, "step": 691 }, { "epoch": 0.1787190082644628, "grad_norm": 2.707707166671753, "learning_rate": 4.617272874161596e-05, "loss": 4.1189, "step": 692 }, { "epoch": 0.17897727272727273, "grad_norm": 2.7378482818603516, "learning_rate": 4.6161935984792934e-05, "loss": 4.0931, "step": 693 }, { "epoch": 0.17923553719008264, "grad_norm": 4.366027355194092, "learning_rate": 4.615112929690163e-05, "loss": 3.2993, "step": 694 }, { "epoch": 0.17949380165289255, "grad_norm": 5.891956806182861, "learning_rate": 4.6140308685056176e-05, "loss": 4.5711, "step": 695 }, { "epoch": 0.1797520661157025, "grad_norm": 3.2935025691986084, "learning_rate": 4.612947415637988e-05, "loss": 4.6284, "step": 696 }, { "epoch": 0.1800103305785124, "grad_norm": 2.4272849559783936, "learning_rate": 4.611862571800518e-05, "loss": 4.2542, "step": 697 }, { "epoch": 0.1802685950413223, "grad_norm": 3.0966804027557373, "learning_rate": 4.6107763377073696e-05, "loss": 4.0081, "step": 698 }, { "epoch": 0.18052685950413222, "grad_norm": 3.2276155948638916, "learning_rate": 4.60968871407362e-05, "loss": 4.3634, "step": 699 }, { "epoch": 0.18078512396694216, "grad_norm": 2.8379063606262207, "learning_rate": 4.608599701615259e-05, "loss": 3.6047, "step": 700 }, { "epoch": 0.18104338842975207, "grad_norm": 5.248605728149414, "learning_rate": 4.607509301049192e-05, "loss": 3.6477, "step": 701 }, { "epoch": 0.18130165289256198, "grad_norm": 3.053730010986328, "learning_rate": 4.60641751309324e-05, "loss": 4.1593, "step": 702 }, { "epoch": 0.1815599173553719, "grad_norm": 4.720952987670898, "learning_rate": 4.605324338466132e-05, "loss": 3.9935, "step": 703 }, { "epoch": 0.18181818181818182, "grad_norm": 4.797332763671875, "learning_rate": 4.6042297778875184e-05, "loss": 3.5437, "step": 704 }, { "epoch": 0.18207644628099173, "grad_norm": 3.2876484394073486, "learning_rate": 4.6031338320779534e-05, "loss": 4.0829, "step": 705 }, { "epoch": 0.18233471074380164, "grad_norm": 3.0326340198516846, "learning_rate": 4.6020365017589074e-05, "loss": 4.5322, "step": 706 }, { "epoch": 0.18259297520661158, "grad_norm": 3.723104953765869, "learning_rate": 4.600937787652762e-05, "loss": 4.2996, "step": 707 }, { "epoch": 0.1828512396694215, "grad_norm": 2.260082483291626, "learning_rate": 4.599837690482809e-05, "loss": 4.2176, "step": 708 }, { "epoch": 0.1831095041322314, "grad_norm": 6.526580810546875, "learning_rate": 4.598736210973251e-05, "loss": 3.9277, "step": 709 }, { "epoch": 0.18336776859504134, "grad_norm": 3.677022933959961, "learning_rate": 4.597633349849202e-05, "loss": 4.2978, "step": 710 }, { "epoch": 0.18362603305785125, "grad_norm": 2.7963755130767822, "learning_rate": 4.596529107836681e-05, "loss": 4.1913, "step": 711 }, { "epoch": 0.18388429752066116, "grad_norm": 3.4244275093078613, "learning_rate": 4.595423485662622e-05, "loss": 3.7108, "step": 712 }, { "epoch": 0.18414256198347106, "grad_norm": 3.942742347717285, "learning_rate": 4.5943164840548645e-05, "loss": 3.9479, "step": 713 }, { "epoch": 0.184400826446281, "grad_norm": 3.3552279472351074, "learning_rate": 4.5932081037421545e-05, "loss": 4.5146, "step": 714 }, { "epoch": 0.1846590909090909, "grad_norm": 8.690105438232422, "learning_rate": 4.592098345454149e-05, "loss": 3.537, "step": 715 }, { "epoch": 0.18491735537190082, "grad_norm": 4.022606372833252, "learning_rate": 4.590987209921409e-05, "loss": 4.0681, "step": 716 }, { "epoch": 0.18517561983471073, "grad_norm": 3.7529807090759277, "learning_rate": 4.589874697875406e-05, "loss": 4.6164, "step": 717 }, { "epoch": 0.18543388429752067, "grad_norm": 1.9943974018096924, "learning_rate": 4.5887608100485144e-05, "loss": 4.1008, "step": 718 }, { "epoch": 0.18569214876033058, "grad_norm": 7.0456719398498535, "learning_rate": 4.5876455471740144e-05, "loss": 3.9182, "step": 719 }, { "epoch": 0.1859504132231405, "grad_norm": 3.0178842544555664, "learning_rate": 4.586528909986093e-05, "loss": 4.1122, "step": 720 }, { "epoch": 0.18620867768595042, "grad_norm": 2.627761125564575, "learning_rate": 4.5854108992198417e-05, "loss": 4.4926, "step": 721 }, { "epoch": 0.18646694214876033, "grad_norm": 3.285837411880493, "learning_rate": 4.5842915156112545e-05, "loss": 3.9857, "step": 722 }, { "epoch": 0.18672520661157024, "grad_norm": 2.8536934852600098, "learning_rate": 4.583170759897232e-05, "loss": 4.105, "step": 723 }, { "epoch": 0.18698347107438015, "grad_norm": 2.7731990814208984, "learning_rate": 4.582048632815575e-05, "loss": 4.1469, "step": 724 }, { "epoch": 0.1872417355371901, "grad_norm": 4.295103073120117, "learning_rate": 4.58092513510499e-05, "loss": 3.7924, "step": 725 }, { "epoch": 0.1875, "grad_norm": 2.0342023372650146, "learning_rate": 4.5798002675050825e-05, "loss": 4.1444, "step": 726 }, { "epoch": 0.1877582644628099, "grad_norm": 3.8536291122436523, "learning_rate": 4.5786740307563636e-05, "loss": 4.0467, "step": 727 }, { "epoch": 0.18801652892561985, "grad_norm": 6.070218086242676, "learning_rate": 4.577546425600242e-05, "loss": 3.8843, "step": 728 }, { "epoch": 0.18827479338842976, "grad_norm": 4.444738388061523, "learning_rate": 4.57641745277903e-05, "loss": 4.3611, "step": 729 }, { "epoch": 0.18853305785123967, "grad_norm": 3.238269567489624, "learning_rate": 4.57528711303594e-05, "loss": 4.1707, "step": 730 }, { "epoch": 0.18879132231404958, "grad_norm": 3.0562374591827393, "learning_rate": 4.574155407115082e-05, "loss": 4.2517, "step": 731 }, { "epoch": 0.1890495867768595, "grad_norm": 3.1925742626190186, "learning_rate": 4.5730223357614675e-05, "loss": 3.9799, "step": 732 }, { "epoch": 0.18930785123966942, "grad_norm": 7.386104106903076, "learning_rate": 4.571887899721006e-05, "loss": 4.3074, "step": 733 }, { "epoch": 0.18956611570247933, "grad_norm": 3.030938148498535, "learning_rate": 4.5707520997405064e-05, "loss": 4.2273, "step": 734 }, { "epoch": 0.18982438016528927, "grad_norm": 2.6118249893188477, "learning_rate": 4.5696149365676746e-05, "loss": 3.619, "step": 735 }, { "epoch": 0.19008264462809918, "grad_norm": 3.38704252243042, "learning_rate": 4.568476410951112e-05, "loss": 4.2929, "step": 736 }, { "epoch": 0.1903409090909091, "grad_norm": 2.8154659271240234, "learning_rate": 4.567336523640322e-05, "loss": 4.4094, "step": 737 }, { "epoch": 0.190599173553719, "grad_norm": 4.667613983154297, "learning_rate": 4.5661952753857e-05, "loss": 4.2366, "step": 738 }, { "epoch": 0.19085743801652894, "grad_norm": 7.281397342681885, "learning_rate": 4.5650526669385375e-05, "loss": 4.7011, "step": 739 }, { "epoch": 0.19111570247933884, "grad_norm": 1.8602728843688965, "learning_rate": 4.563908699051024e-05, "loss": 4.2782, "step": 740 }, { "epoch": 0.19137396694214875, "grad_norm": 1.825709342956543, "learning_rate": 4.5627633724762434e-05, "loss": 4.122, "step": 741 }, { "epoch": 0.19163223140495866, "grad_norm": 3.187619924545288, "learning_rate": 4.56161668796817e-05, "loss": 4.6874, "step": 742 }, { "epoch": 0.1918904958677686, "grad_norm": 3.1053085327148438, "learning_rate": 4.560468646281679e-05, "loss": 4.701, "step": 743 }, { "epoch": 0.1921487603305785, "grad_norm": 4.168273448944092, "learning_rate": 4.5593192481725324e-05, "loss": 4.5986, "step": 744 }, { "epoch": 0.19240702479338842, "grad_norm": 3.0925402641296387, "learning_rate": 4.558168494397389e-05, "loss": 3.7921, "step": 745 }, { "epoch": 0.19266528925619836, "grad_norm": 5.3987531661987305, "learning_rate": 4.5570163857137994e-05, "loss": 4.3444, "step": 746 }, { "epoch": 0.19292355371900827, "grad_norm": 3.2354071140289307, "learning_rate": 4.555862922880205e-05, "loss": 3.6015, "step": 747 }, { "epoch": 0.19318181818181818, "grad_norm": 3.857442855834961, "learning_rate": 4.5547081066559406e-05, "loss": 3.9365, "step": 748 }, { "epoch": 0.1934400826446281, "grad_norm": 5.576103687286377, "learning_rate": 4.5535519378012295e-05, "loss": 3.8465, "step": 749 }, { "epoch": 0.19369834710743802, "grad_norm": 3.4592318534851074, "learning_rate": 4.552394417077187e-05, "loss": 3.9785, "step": 750 }, { "epoch": 0.19395661157024793, "grad_norm": 3.6066362857818604, "learning_rate": 4.551235545245818e-05, "loss": 4.8342, "step": 751 }, { "epoch": 0.19421487603305784, "grad_norm": 3.544792652130127, "learning_rate": 4.550075323070018e-05, "loss": 4.0825, "step": 752 }, { "epoch": 0.19447314049586778, "grad_norm": 2.991112232208252, "learning_rate": 4.548913751313568e-05, "loss": 4.5375, "step": 753 }, { "epoch": 0.1947314049586777, "grad_norm": 3.0077528953552246, "learning_rate": 4.547750830741143e-05, "loss": 4.5117, "step": 754 }, { "epoch": 0.1949896694214876, "grad_norm": 3.173919200897217, "learning_rate": 4.5465865621183e-05, "loss": 4.2248, "step": 755 }, { "epoch": 0.1952479338842975, "grad_norm": 4.571810245513916, "learning_rate": 4.545420946211488e-05, "loss": 3.7754, "step": 756 }, { "epoch": 0.19550619834710745, "grad_norm": 2.0384573936462402, "learning_rate": 4.5442539837880394e-05, "loss": 4.2051, "step": 757 }, { "epoch": 0.19576446280991736, "grad_norm": 2.147340774536133, "learning_rate": 4.543085675616176e-05, "loss": 4.0037, "step": 758 }, { "epoch": 0.19602272727272727, "grad_norm": 2.9238438606262207, "learning_rate": 4.5419160224650035e-05, "loss": 4.2435, "step": 759 }, { "epoch": 0.1962809917355372, "grad_norm": 3.4503426551818848, "learning_rate": 4.540745025104515e-05, "loss": 4.3578, "step": 760 }, { "epoch": 0.1965392561983471, "grad_norm": 3.9356212615966797, "learning_rate": 4.539572684305585e-05, "loss": 4.2101, "step": 761 }, { "epoch": 0.19679752066115702, "grad_norm": 4.1168084144592285, "learning_rate": 4.5383990008399765e-05, "loss": 4.2849, "step": 762 }, { "epoch": 0.19705578512396693, "grad_norm": 2.196627616882324, "learning_rate": 4.537223975480334e-05, "loss": 4.3203, "step": 763 }, { "epoch": 0.19731404958677687, "grad_norm": 2.248948812484741, "learning_rate": 4.536047609000186e-05, "loss": 4.3427, "step": 764 }, { "epoch": 0.19757231404958678, "grad_norm": 3.396857500076294, "learning_rate": 4.5348699021739425e-05, "loss": 4.4511, "step": 765 }, { "epoch": 0.1978305785123967, "grad_norm": 3.368814706802368, "learning_rate": 4.533690855776899e-05, "loss": 4.684, "step": 766 }, { "epoch": 0.19808884297520662, "grad_norm": 6.546929359436035, "learning_rate": 4.53251047058523e-05, "loss": 3.9275, "step": 767 }, { "epoch": 0.19834710743801653, "grad_norm": 1.7663954496383667, "learning_rate": 4.531328747375994e-05, "loss": 3.997, "step": 768 }, { "epoch": 0.19860537190082644, "grad_norm": 3.008127450942993, "learning_rate": 4.5301456869271255e-05, "loss": 4.1335, "step": 769 }, { "epoch": 0.19886363636363635, "grad_norm": 7.042539119720459, "learning_rate": 4.5289612900174446e-05, "loss": 3.854, "step": 770 }, { "epoch": 0.1991219008264463, "grad_norm": 5.385906219482422, "learning_rate": 4.527775557426648e-05, "loss": 4.2171, "step": 771 }, { "epoch": 0.1993801652892562, "grad_norm": 3.580716371536255, "learning_rate": 4.526588489935314e-05, "loss": 4.414, "step": 772 }, { "epoch": 0.1996384297520661, "grad_norm": 2.52182674407959, "learning_rate": 4.5254000883248983e-05, "loss": 4.0829, "step": 773 }, { "epoch": 0.19989669421487602, "grad_norm": 3.6583306789398193, "learning_rate": 4.5242103533777334e-05, "loss": 4.0606, "step": 774 }, { "epoch": 0.20015495867768596, "grad_norm": 2.5405819416046143, "learning_rate": 4.523019285877033e-05, "loss": 4.1307, "step": 775 }, { "epoch": 0.20041322314049587, "grad_norm": 3.104281425476074, "learning_rate": 4.521826886606885e-05, "loss": 4.3555, "step": 776 }, { "epoch": 0.20067148760330578, "grad_norm": 3.244199752807617, "learning_rate": 4.5206331563522556e-05, "loss": 3.7304, "step": 777 }, { "epoch": 0.2009297520661157, "grad_norm": 7.347205638885498, "learning_rate": 4.519438095898987e-05, "loss": 4.3938, "step": 778 }, { "epoch": 0.20118801652892562, "grad_norm": 3.5583062171936035, "learning_rate": 4.518241706033796e-05, "loss": 4.0937, "step": 779 }, { "epoch": 0.20144628099173553, "grad_norm": 4.263914585113525, "learning_rate": 4.517043987544277e-05, "loss": 4.0452, "step": 780 }, { "epoch": 0.20170454545454544, "grad_norm": 4.166163444519043, "learning_rate": 4.515844941218895e-05, "loss": 4.0298, "step": 781 }, { "epoch": 0.20196280991735538, "grad_norm": 4.218254566192627, "learning_rate": 4.514644567846995e-05, "loss": 3.6993, "step": 782 }, { "epoch": 0.2022210743801653, "grad_norm": 3.3387298583984375, "learning_rate": 4.513442868218789e-05, "loss": 4.8478, "step": 783 }, { "epoch": 0.2024793388429752, "grad_norm": 3.7691876888275146, "learning_rate": 4.5122398431253674e-05, "loss": 3.6377, "step": 784 }, { "epoch": 0.20273760330578514, "grad_norm": 2.3479816913604736, "learning_rate": 4.5110354933586896e-05, "loss": 4.2979, "step": 785 }, { "epoch": 0.20299586776859505, "grad_norm": 3.026317596435547, "learning_rate": 4.509829819711589e-05, "loss": 4.2349, "step": 786 }, { "epoch": 0.20325413223140495, "grad_norm": 4.469359874725342, "learning_rate": 4.50862282297777e-05, "loss": 4.0826, "step": 787 }, { "epoch": 0.20351239669421486, "grad_norm": 1.9642670154571533, "learning_rate": 4.507414503951809e-05, "loss": 4.113, "step": 788 }, { "epoch": 0.2037706611570248, "grad_norm": 1.974671483039856, "learning_rate": 4.50620486342915e-05, "loss": 3.7563, "step": 789 }, { "epoch": 0.2040289256198347, "grad_norm": 6.337873935699463, "learning_rate": 4.5049939022061106e-05, "loss": 4.3487, "step": 790 }, { "epoch": 0.20428719008264462, "grad_norm": 2.5474801063537598, "learning_rate": 4.503781621079875e-05, "loss": 4.7519, "step": 791 }, { "epoch": 0.20454545454545456, "grad_norm": 2.955261707305908, "learning_rate": 4.502568020848498e-05, "loss": 4.2722, "step": 792 }, { "epoch": 0.20480371900826447, "grad_norm": 2.7820627689361572, "learning_rate": 4.5013531023109014e-05, "loss": 3.8388, "step": 793 }, { "epoch": 0.20506198347107438, "grad_norm": 3.136518955230713, "learning_rate": 4.5001368662668764e-05, "loss": 4.0557, "step": 794 }, { "epoch": 0.2053202479338843, "grad_norm": 7.473355770111084, "learning_rate": 4.498919313517079e-05, "loss": 3.9837, "step": 795 }, { "epoch": 0.20557851239669422, "grad_norm": 1.9485678672790527, "learning_rate": 4.497700444863037e-05, "loss": 3.9541, "step": 796 }, { "epoch": 0.20583677685950413, "grad_norm": 2.848452091217041, "learning_rate": 4.496480261107138e-05, "loss": 4.0974, "step": 797 }, { "epoch": 0.20609504132231404, "grad_norm": 1.9788488149642944, "learning_rate": 4.49525876305264e-05, "loss": 3.9326, "step": 798 }, { "epoch": 0.20635330578512398, "grad_norm": 3.2778806686401367, "learning_rate": 4.4940359515036644e-05, "loss": 4.0894, "step": 799 }, { "epoch": 0.2066115702479339, "grad_norm": 2.2333524227142334, "learning_rate": 4.4928118272651974e-05, "loss": 3.9516, "step": 800 }, { "epoch": 0.2068698347107438, "grad_norm": 3.5251846313476562, "learning_rate": 4.49158639114309e-05, "loss": 4.1429, "step": 801 }, { "epoch": 0.2071280991735537, "grad_norm": 7.7503252029418945, "learning_rate": 4.490359643944057e-05, "loss": 4.1691, "step": 802 }, { "epoch": 0.20738636363636365, "grad_norm": 2.2253003120422363, "learning_rate": 4.489131586475674e-05, "loss": 4.733, "step": 803 }, { "epoch": 0.20764462809917356, "grad_norm": 3.437422275543213, "learning_rate": 4.4879022195463825e-05, "loss": 3.6177, "step": 804 }, { "epoch": 0.20790289256198347, "grad_norm": 2.4864017963409424, "learning_rate": 4.486671543965484e-05, "loss": 4.4713, "step": 805 }, { "epoch": 0.20816115702479338, "grad_norm": 2.523683786392212, "learning_rate": 4.485439560543141e-05, "loss": 3.9539, "step": 806 }, { "epoch": 0.2084194214876033, "grad_norm": 2.870737075805664, "learning_rate": 4.4842062700903784e-05, "loss": 3.9689, "step": 807 }, { "epoch": 0.20867768595041322, "grad_norm": 2.139826536178589, "learning_rate": 4.482971673419081e-05, "loss": 4.4155, "step": 808 }, { "epoch": 0.20893595041322313, "grad_norm": 3.01000714302063, "learning_rate": 4.4817357713419935e-05, "loss": 4.4979, "step": 809 }, { "epoch": 0.20919421487603307, "grad_norm": 2.556209087371826, "learning_rate": 4.480498564672721e-05, "loss": 4.1049, "step": 810 }, { "epoch": 0.20945247933884298, "grad_norm": 2.159088611602783, "learning_rate": 4.4792600542257234e-05, "loss": 4.424, "step": 811 }, { "epoch": 0.2097107438016529, "grad_norm": 2.892057180404663, "learning_rate": 4.478020240816325e-05, "loss": 3.8141, "step": 812 }, { "epoch": 0.2099690082644628, "grad_norm": 2.6357154846191406, "learning_rate": 4.476779125260703e-05, "loss": 4.6252, "step": 813 }, { "epoch": 0.21022727272727273, "grad_norm": 3.0445687770843506, "learning_rate": 4.475536708375894e-05, "loss": 3.8323, "step": 814 }, { "epoch": 0.21048553719008264, "grad_norm": 4.832922458648682, "learning_rate": 4.47429299097979e-05, "loss": 4.0934, "step": 815 }, { "epoch": 0.21074380165289255, "grad_norm": 2.5932726860046387, "learning_rate": 4.4730479738911405e-05, "loss": 4.1387, "step": 816 }, { "epoch": 0.2110020661157025, "grad_norm": 3.0785789489746094, "learning_rate": 4.471801657929551e-05, "loss": 3.8914, "step": 817 }, { "epoch": 0.2112603305785124, "grad_norm": 3.756147861480713, "learning_rate": 4.470554043915479e-05, "loss": 4.1519, "step": 818 }, { "epoch": 0.2115185950413223, "grad_norm": 4.3295488357543945, "learning_rate": 4.46930513267024e-05, "loss": 4.0396, "step": 819 }, { "epoch": 0.21177685950413222, "grad_norm": 3.278559684753418, "learning_rate": 4.468054925016002e-05, "loss": 3.7943, "step": 820 }, { "epoch": 0.21203512396694216, "grad_norm": 3.1391477584838867, "learning_rate": 4.466803421775786e-05, "loss": 3.8171, "step": 821 }, { "epoch": 0.21229338842975207, "grad_norm": 3.000504732131958, "learning_rate": 4.4655506237734667e-05, "loss": 3.6223, "step": 822 }, { "epoch": 0.21255165289256198, "grad_norm": 4.056110382080078, "learning_rate": 4.4642965318337706e-05, "loss": 3.6174, "step": 823 }, { "epoch": 0.2128099173553719, "grad_norm": 3.8793773651123047, "learning_rate": 4.4630411467822764e-05, "loss": 3.641, "step": 824 }, { "epoch": 0.21306818181818182, "grad_norm": 9.166484832763672, "learning_rate": 4.4617844694454136e-05, "loss": 4.3761, "step": 825 }, { "epoch": 0.21332644628099173, "grad_norm": 7.283550262451172, "learning_rate": 4.460526500650464e-05, "loss": 5.2454, "step": 826 }, { "epoch": 0.21358471074380164, "grad_norm": 2.628862142562866, "learning_rate": 4.4592672412255565e-05, "loss": 4.2042, "step": 827 }, { "epoch": 0.21384297520661158, "grad_norm": 3.039041042327881, "learning_rate": 4.458006691999673e-05, "loss": 4.28, "step": 828 }, { "epoch": 0.2141012396694215, "grad_norm": 3.01914119720459, "learning_rate": 4.456744853802641e-05, "loss": 4.5731, "step": 829 }, { "epoch": 0.2143595041322314, "grad_norm": 4.560688018798828, "learning_rate": 4.455481727465141e-05, "loss": 4.2718, "step": 830 }, { "epoch": 0.21461776859504134, "grad_norm": 3.6328976154327393, "learning_rate": 4.454217313818697e-05, "loss": 3.9249, "step": 831 }, { "epoch": 0.21487603305785125, "grad_norm": 3.733856201171875, "learning_rate": 4.452951613695684e-05, "loss": 4.4751, "step": 832 }, { "epoch": 0.21513429752066116, "grad_norm": 5.364536762237549, "learning_rate": 4.45168462792932e-05, "loss": 4.4885, "step": 833 }, { "epoch": 0.21539256198347106, "grad_norm": 3.2331275939941406, "learning_rate": 4.450416357353674e-05, "loss": 4.636, "step": 834 }, { "epoch": 0.215650826446281, "grad_norm": 4.264749526977539, "learning_rate": 4.449146802803658e-05, "loss": 4.0546, "step": 835 }, { "epoch": 0.2159090909090909, "grad_norm": 3.437258005142212, "learning_rate": 4.447875965115028e-05, "loss": 4.2188, "step": 836 }, { "epoch": 0.21616735537190082, "grad_norm": 3.521293878555298, "learning_rate": 4.446603845124388e-05, "loss": 4.3785, "step": 837 }, { "epoch": 0.21642561983471073, "grad_norm": 1.5226248502731323, "learning_rate": 4.445330443669184e-05, "loss": 4.1082, "step": 838 }, { "epoch": 0.21668388429752067, "grad_norm": 2.578216791152954, "learning_rate": 4.444055761587708e-05, "loss": 4.0917, "step": 839 }, { "epoch": 0.21694214876033058, "grad_norm": 3.384460687637329, "learning_rate": 4.4427797997190896e-05, "loss": 4.2848, "step": 840 }, { "epoch": 0.2172004132231405, "grad_norm": 2.1781795024871826, "learning_rate": 4.4415025589033063e-05, "loss": 4.5481, "step": 841 }, { "epoch": 0.21745867768595042, "grad_norm": 4.577703475952148, "learning_rate": 4.440224039981178e-05, "loss": 3.4503, "step": 842 }, { "epoch": 0.21771694214876033, "grad_norm": 3.0125694274902344, "learning_rate": 4.438944243794359e-05, "loss": 4.2079, "step": 843 }, { "epoch": 0.21797520661157024, "grad_norm": 5.102847099304199, "learning_rate": 4.437663171185353e-05, "loss": 4.2557, "step": 844 }, { "epoch": 0.21823347107438015, "grad_norm": 5.496633052825928, "learning_rate": 4.4363808229974994e-05, "loss": 4.0074, "step": 845 }, { "epoch": 0.2184917355371901, "grad_norm": 4.407866954803467, "learning_rate": 4.4350972000749766e-05, "loss": 4.2241, "step": 846 }, { "epoch": 0.21875, "grad_norm": 2.376802682876587, "learning_rate": 4.433812303262805e-05, "loss": 4.5994, "step": 847 }, { "epoch": 0.2190082644628099, "grad_norm": 6.889456272125244, "learning_rate": 4.4325261334068426e-05, "loss": 4.076, "step": 848 }, { "epoch": 0.21926652892561985, "grad_norm": 3.544464588165283, "learning_rate": 4.431238691353784e-05, "loss": 4.156, "step": 849 }, { "epoch": 0.21952479338842976, "grad_norm": 2.7103078365325928, "learning_rate": 4.429949977951162e-05, "loss": 4.3353, "step": 850 }, { "epoch": 0.21978305785123967, "grad_norm": 12.386162757873535, "learning_rate": 4.4286599940473476e-05, "loss": 5.1442, "step": 851 }, { "epoch": 0.22004132231404958, "grad_norm": 3.5653157234191895, "learning_rate": 4.4273687404915475e-05, "loss": 3.9538, "step": 852 }, { "epoch": 0.2202995867768595, "grad_norm": 2.684201955795288, "learning_rate": 4.4260762181338035e-05, "loss": 4.1296, "step": 853 }, { "epoch": 0.22055785123966942, "grad_norm": 2.320798635482788, "learning_rate": 4.4247824278249936e-05, "loss": 4.2167, "step": 854 }, { "epoch": 0.22081611570247933, "grad_norm": 3.592247486114502, "learning_rate": 4.4234873704168296e-05, "loss": 3.9509, "step": 855 }, { "epoch": 0.22107438016528927, "grad_norm": 3.5105020999908447, "learning_rate": 4.4221910467618584e-05, "loss": 4.1076, "step": 856 }, { "epoch": 0.22133264462809918, "grad_norm": 2.848118782043457, "learning_rate": 4.4208934577134595e-05, "loss": 4.2825, "step": 857 }, { "epoch": 0.2215909090909091, "grad_norm": 2.649441719055176, "learning_rate": 4.419594604125846e-05, "loss": 3.6569, "step": 858 }, { "epoch": 0.221849173553719, "grad_norm": 3.3287434577941895, "learning_rate": 4.4182944868540646e-05, "loss": 4.1172, "step": 859 }, { "epoch": 0.22210743801652894, "grad_norm": 4.613007068634033, "learning_rate": 4.416993106753992e-05, "loss": 4.6822, "step": 860 }, { "epoch": 0.22236570247933884, "grad_norm": 4.854801654815674, "learning_rate": 4.415690464682335e-05, "loss": 4.3457, "step": 861 }, { "epoch": 0.22262396694214875, "grad_norm": 2.0689849853515625, "learning_rate": 4.4143865614966364e-05, "loss": 3.6008, "step": 862 }, { "epoch": 0.22288223140495866, "grad_norm": 5.494517803192139, "learning_rate": 4.413081398055263e-05, "loss": 4.6979, "step": 863 }, { "epoch": 0.2231404958677686, "grad_norm": 3.6004061698913574, "learning_rate": 4.411774975217416e-05, "loss": 4.5544, "step": 864 }, { "epoch": 0.2233987603305785, "grad_norm": 3.3661012649536133, "learning_rate": 4.410467293843123e-05, "loss": 3.4557, "step": 865 }, { "epoch": 0.22365702479338842, "grad_norm": 3.6384708881378174, "learning_rate": 4.409158354793242e-05, "loss": 4.3085, "step": 866 }, { "epoch": 0.22391528925619836, "grad_norm": 2.2689030170440674, "learning_rate": 4.407848158929455e-05, "loss": 4.1799, "step": 867 }, { "epoch": 0.22417355371900827, "grad_norm": 2.136688470840454, "learning_rate": 4.4065367071142775e-05, "loss": 4.5497, "step": 868 }, { "epoch": 0.22443181818181818, "grad_norm": 2.900409698486328, "learning_rate": 4.405224000211047e-05, "loss": 4.6585, "step": 869 }, { "epoch": 0.2246900826446281, "grad_norm": 2.5901622772216797, "learning_rate": 4.403910039083928e-05, "loss": 4.1039, "step": 870 }, { "epoch": 0.22494834710743802, "grad_norm": 2.075059652328491, "learning_rate": 4.4025948245979126e-05, "loss": 4.139, "step": 871 }, { "epoch": 0.22520661157024793, "grad_norm": 3.1953718662261963, "learning_rate": 4.4012783576188165e-05, "loss": 4.4573, "step": 872 }, { "epoch": 0.22546487603305784, "grad_norm": 3.088961362838745, "learning_rate": 4.3999606390132806e-05, "loss": 4.018, "step": 873 }, { "epoch": 0.22572314049586778, "grad_norm": 2.4549880027770996, "learning_rate": 4.398641669648769e-05, "loss": 4.4989, "step": 874 }, { "epoch": 0.2259814049586777, "grad_norm": 4.473962306976318, "learning_rate": 4.3973214503935685e-05, "loss": 4.4688, "step": 875 }, { "epoch": 0.2262396694214876, "grad_norm": 2.4728336334228516, "learning_rate": 4.395999982116791e-05, "loss": 3.9551, "step": 876 }, { "epoch": 0.2264979338842975, "grad_norm": 3.8104357719421387, "learning_rate": 4.39467726568837e-05, "loss": 4.2868, "step": 877 }, { "epoch": 0.22675619834710745, "grad_norm": 3.6108832359313965, "learning_rate": 4.3933533019790585e-05, "loss": 4.5502, "step": 878 }, { "epoch": 0.22701446280991736, "grad_norm": 3.074371576309204, "learning_rate": 4.392028091860434e-05, "loss": 3.9417, "step": 879 }, { "epoch": 0.22727272727272727, "grad_norm": 3.0894317626953125, "learning_rate": 4.39070163620489e-05, "loss": 4.5363, "step": 880 }, { "epoch": 0.2275309917355372, "grad_norm": 3.798907518386841, "learning_rate": 4.389373935885646e-05, "loss": 4.4148, "step": 881 }, { "epoch": 0.2277892561983471, "grad_norm": 2.875974416732788, "learning_rate": 4.388044991776735e-05, "loss": 4.4033, "step": 882 }, { "epoch": 0.22804752066115702, "grad_norm": 2.674808979034424, "learning_rate": 4.3867148047530126e-05, "loss": 4.2849, "step": 883 }, { "epoch": 0.22830578512396693, "grad_norm": 2.14864182472229, "learning_rate": 4.385383375690151e-05, "loss": 4.1571, "step": 884 }, { "epoch": 0.22856404958677687, "grad_norm": 4.357775688171387, "learning_rate": 4.38405070546464e-05, "loss": 4.5121, "step": 885 }, { "epoch": 0.22882231404958678, "grad_norm": 1.9463998079299927, "learning_rate": 4.3827167949537875e-05, "loss": 4.22, "step": 886 }, { "epoch": 0.2290805785123967, "grad_norm": 4.838544845581055, "learning_rate": 4.381381645035716e-05, "loss": 3.9622, "step": 887 }, { "epoch": 0.22933884297520662, "grad_norm": 4.577043533325195, "learning_rate": 4.380045256589367e-05, "loss": 4.6116, "step": 888 }, { "epoch": 0.22959710743801653, "grad_norm": 1.8865857124328613, "learning_rate": 4.378707630494494e-05, "loss": 4.1074, "step": 889 }, { "epoch": 0.22985537190082644, "grad_norm": 3.7228996753692627, "learning_rate": 4.377368767631668e-05, "loss": 3.8637, "step": 890 }, { "epoch": 0.23011363636363635, "grad_norm": 1.9472205638885498, "learning_rate": 4.3760286688822714e-05, "loss": 3.6949, "step": 891 }, { "epoch": 0.2303719008264463, "grad_norm": 2.6202075481414795, "learning_rate": 4.3746873351285027e-05, "loss": 3.8949, "step": 892 }, { "epoch": 0.2306301652892562, "grad_norm": 4.317259788513184, "learning_rate": 4.3733447672533724e-05, "loss": 4.1851, "step": 893 }, { "epoch": 0.2308884297520661, "grad_norm": 3.1590700149536133, "learning_rate": 4.372000966140704e-05, "loss": 4.0922, "step": 894 }, { "epoch": 0.23114669421487602, "grad_norm": 10.871603965759277, "learning_rate": 4.3706559326751295e-05, "loss": 3.4922, "step": 895 }, { "epoch": 0.23140495867768596, "grad_norm": 2.7782299518585205, "learning_rate": 4.369309667742099e-05, "loss": 3.8421, "step": 896 }, { "epoch": 0.23166322314049587, "grad_norm": 2.49131441116333, "learning_rate": 4.367962172227866e-05, "loss": 4.0689, "step": 897 }, { "epoch": 0.23192148760330578, "grad_norm": 2.855863332748413, "learning_rate": 4.3666134470194994e-05, "loss": 4.623, "step": 898 }, { "epoch": 0.2321797520661157, "grad_norm": 5.905206680297852, "learning_rate": 4.3652634930048745e-05, "loss": 3.1105, "step": 899 }, { "epoch": 0.23243801652892562, "grad_norm": 3.153353214263916, "learning_rate": 4.363912311072678e-05, "loss": 4.0054, "step": 900 }, { "epoch": 0.23269628099173553, "grad_norm": 3.186532735824585, "learning_rate": 4.362559902112401e-05, "loss": 4.0115, "step": 901 }, { "epoch": 0.23295454545454544, "grad_norm": 2.9772372245788574, "learning_rate": 4.361206267014347e-05, "loss": 3.7727, "step": 902 }, { "epoch": 0.23321280991735538, "grad_norm": 2.681910753250122, "learning_rate": 4.359851406669624e-05, "loss": 4.2157, "step": 903 }, { "epoch": 0.2334710743801653, "grad_norm": 3.157335042953491, "learning_rate": 4.358495321970147e-05, "loss": 4.1369, "step": 904 }, { "epoch": 0.2337293388429752, "grad_norm": 2.4806666374206543, "learning_rate": 4.3571380138086373e-05, "loss": 4.2868, "step": 905 }, { "epoch": 0.23398760330578514, "grad_norm": 5.0754852294921875, "learning_rate": 4.355779483078622e-05, "loss": 4.0493, "step": 906 }, { "epoch": 0.23424586776859505, "grad_norm": 3.0326695442199707, "learning_rate": 4.3544197306744316e-05, "loss": 3.6005, "step": 907 }, { "epoch": 0.23450413223140495, "grad_norm": 5.225083351135254, "learning_rate": 4.353058757491202e-05, "loss": 3.7361, "step": 908 }, { "epoch": 0.23476239669421486, "grad_norm": 2.5658252239227295, "learning_rate": 4.3516965644248734e-05, "loss": 4.3769, "step": 909 }, { "epoch": 0.2350206611570248, "grad_norm": 3.9115495681762695, "learning_rate": 4.350333152372188e-05, "loss": 3.9403, "step": 910 }, { "epoch": 0.2352789256198347, "grad_norm": 3.2285561561584473, "learning_rate": 4.348968522230689e-05, "loss": 4.529, "step": 911 }, { "epoch": 0.23553719008264462, "grad_norm": 3.064793348312378, "learning_rate": 4.347602674898725e-05, "loss": 4.3386, "step": 912 }, { "epoch": 0.23579545454545456, "grad_norm": 3.734269618988037, "learning_rate": 4.346235611275443e-05, "loss": 3.9988, "step": 913 }, { "epoch": 0.23605371900826447, "grad_norm": 1.8592121601104736, "learning_rate": 4.3448673322607916e-05, "loss": 4.6762, "step": 914 }, { "epoch": 0.23631198347107438, "grad_norm": 3.8882291316986084, "learning_rate": 4.34349783875552e-05, "loss": 4.7109, "step": 915 }, { "epoch": 0.2365702479338843, "grad_norm": 2.41532564163208, "learning_rate": 4.342127131661177e-05, "loss": 4.0183, "step": 916 }, { "epoch": 0.23682851239669422, "grad_norm": 2.3233115673065186, "learning_rate": 4.34075521188011e-05, "loss": 4.0851, "step": 917 }, { "epoch": 0.23708677685950413, "grad_norm": 2.623234987258911, "learning_rate": 4.339382080315462e-05, "loss": 3.9296, "step": 918 }, { "epoch": 0.23734504132231404, "grad_norm": 2.5202085971832275, "learning_rate": 4.338007737871179e-05, "loss": 4.1267, "step": 919 }, { "epoch": 0.23760330578512398, "grad_norm": 2.1604621410369873, "learning_rate": 4.336632185452001e-05, "loss": 4.671, "step": 920 }, { "epoch": 0.2378615702479339, "grad_norm": 3.659377336502075, "learning_rate": 4.3352554239634636e-05, "loss": 4.164, "step": 921 }, { "epoch": 0.2381198347107438, "grad_norm": 3.0518481731414795, "learning_rate": 4.333877454311901e-05, "loss": 4.6791, "step": 922 }, { "epoch": 0.2383780991735537, "grad_norm": 1.8310980796813965, "learning_rate": 4.332498277404441e-05, "loss": 4.214, "step": 923 }, { "epoch": 0.23863636363636365, "grad_norm": 2.159632921218872, "learning_rate": 4.331117894149006e-05, "loss": 4.4342, "step": 924 }, { "epoch": 0.23889462809917356, "grad_norm": 3.430116653442383, "learning_rate": 4.329736305454314e-05, "loss": 5.1181, "step": 925 }, { "epoch": 0.23915289256198347, "grad_norm": 3.195340633392334, "learning_rate": 4.3283535122298746e-05, "loss": 4.0403, "step": 926 }, { "epoch": 0.23941115702479338, "grad_norm": 4.278591632843018, "learning_rate": 4.3269695153859926e-05, "loss": 4.3194, "step": 927 }, { "epoch": 0.2396694214876033, "grad_norm": 2.874901294708252, "learning_rate": 4.325584315833764e-05, "loss": 4.298, "step": 928 }, { "epoch": 0.23992768595041322, "grad_norm": 3.4892256259918213, "learning_rate": 4.324197914485075e-05, "loss": 4.2697, "step": 929 }, { "epoch": 0.24018595041322313, "grad_norm": 6.47683048248291, "learning_rate": 4.322810312252606e-05, "loss": 4.2134, "step": 930 }, { "epoch": 0.24044421487603307, "grad_norm": 3.7023160457611084, "learning_rate": 4.3214215100498255e-05, "loss": 4.2087, "step": 931 }, { "epoch": 0.24070247933884298, "grad_norm": 2.218562602996826, "learning_rate": 4.320031508790994e-05, "loss": 4.1136, "step": 932 }, { "epoch": 0.2409607438016529, "grad_norm": 3.1495847702026367, "learning_rate": 4.3186403093911585e-05, "loss": 4.2735, "step": 933 }, { "epoch": 0.2412190082644628, "grad_norm": 3.51042103767395, "learning_rate": 4.317247912766158e-05, "loss": 4.269, "step": 934 }, { "epoch": 0.24147727272727273, "grad_norm": 5.290986061096191, "learning_rate": 4.315854319832617e-05, "loss": 4.2107, "step": 935 }, { "epoch": 0.24173553719008264, "grad_norm": 8.237841606140137, "learning_rate": 4.314459531507949e-05, "loss": 4.1172, "step": 936 }, { "epoch": 0.24199380165289255, "grad_norm": 2.791114568710327, "learning_rate": 4.3130635487103555e-05, "loss": 4.4769, "step": 937 }, { "epoch": 0.2422520661157025, "grad_norm": 3.3846542835235596, "learning_rate": 4.3116663723588214e-05, "loss": 4.1614, "step": 938 }, { "epoch": 0.2425103305785124, "grad_norm": 1.8117622137069702, "learning_rate": 4.310268003373118e-05, "loss": 4.1227, "step": 939 }, { "epoch": 0.2427685950413223, "grad_norm": 2.3627421855926514, "learning_rate": 4.308868442673805e-05, "loss": 4.0362, "step": 940 }, { "epoch": 0.24302685950413222, "grad_norm": 4.308487415313721, "learning_rate": 4.307467691182222e-05, "loss": 4.2588, "step": 941 }, { "epoch": 0.24328512396694216, "grad_norm": 4.417941570281982, "learning_rate": 4.3060657498204954e-05, "loss": 3.8574, "step": 942 }, { "epoch": 0.24354338842975207, "grad_norm": 3.7312204837799072, "learning_rate": 4.304662619511535e-05, "loss": 3.8287, "step": 943 }, { "epoch": 0.24380165289256198, "grad_norm": 2.168752908706665, "learning_rate": 4.3032583011790304e-05, "loss": 4.1341, "step": 944 }, { "epoch": 0.2440599173553719, "grad_norm": 2.3002216815948486, "learning_rate": 4.301852795747458e-05, "loss": 4.044, "step": 945 }, { "epoch": 0.24431818181818182, "grad_norm": 2.5160176753997803, "learning_rate": 4.3004461041420715e-05, "loss": 4.207, "step": 946 }, { "epoch": 0.24457644628099173, "grad_norm": 5.469270706176758, "learning_rate": 4.299038227288907e-05, "loss": 4.365, "step": 947 }, { "epoch": 0.24483471074380164, "grad_norm": 3.209669589996338, "learning_rate": 4.297629166114782e-05, "loss": 3.7353, "step": 948 }, { "epoch": 0.24509297520661158, "grad_norm": 2.2678818702697754, "learning_rate": 4.296218921547291e-05, "loss": 4.4308, "step": 949 }, { "epoch": 0.2453512396694215, "grad_norm": 3.724682569503784, "learning_rate": 4.29480749451481e-05, "loss": 4.3202, "step": 950 }, { "epoch": 0.2456095041322314, "grad_norm": 3.7267355918884277, "learning_rate": 4.293394885946492e-05, "loss": 3.7113, "step": 951 }, { "epoch": 0.24586776859504134, "grad_norm": 3.8634958267211914, "learning_rate": 4.291981096772268e-05, "loss": 4.4734, "step": 952 }, { "epoch": 0.24612603305785125, "grad_norm": 5.0777764320373535, "learning_rate": 4.290566127922848e-05, "loss": 4.0934, "step": 953 }, { "epoch": 0.24638429752066116, "grad_norm": 8.540276527404785, "learning_rate": 4.289149980329715e-05, "loss": 3.6062, "step": 954 }, { "epoch": 0.24664256198347106, "grad_norm": 3.117661952972412, "learning_rate": 4.2877326549251326e-05, "loss": 4.7231, "step": 955 }, { "epoch": 0.246900826446281, "grad_norm": 3.710587739944458, "learning_rate": 4.2863141526421346e-05, "loss": 4.0312, "step": 956 }, { "epoch": 0.2471590909090909, "grad_norm": 3.204016923904419, "learning_rate": 4.284894474414533e-05, "loss": 4.8839, "step": 957 }, { "epoch": 0.24741735537190082, "grad_norm": 4.610998153686523, "learning_rate": 4.283473621176914e-05, "loss": 3.9641, "step": 958 }, { "epoch": 0.24767561983471073, "grad_norm": 3.1295664310455322, "learning_rate": 4.282051593864635e-05, "loss": 4.3894, "step": 959 }, { "epoch": 0.24793388429752067, "grad_norm": 2.8181183338165283, "learning_rate": 4.28062839341383e-05, "loss": 4.1948, "step": 960 }, { "epoch": 0.24819214876033058, "grad_norm": 6.70087194442749, "learning_rate": 4.2792040207614005e-05, "loss": 3.7748, "step": 961 }, { "epoch": 0.2484504132231405, "grad_norm": 5.295668601989746, "learning_rate": 4.2777784768450237e-05, "loss": 4.2435, "step": 962 }, { "epoch": 0.24870867768595042, "grad_norm": 1.9001904726028442, "learning_rate": 4.276351762603146e-05, "loss": 4.2316, "step": 963 }, { "epoch": 0.24896694214876033, "grad_norm": 2.8024916648864746, "learning_rate": 4.2749238789749835e-05, "loss": 4.2828, "step": 964 }, { "epoch": 0.24922520661157024, "grad_norm": 3.3703365325927734, "learning_rate": 4.273494826900525e-05, "loss": 4.1863, "step": 965 }, { "epoch": 0.24948347107438015, "grad_norm": 3.663123846054077, "learning_rate": 4.272064607320525e-05, "loss": 4.194, "step": 966 }, { "epoch": 0.2497417355371901, "grad_norm": 3.7679970264434814, "learning_rate": 4.27063322117651e-05, "loss": 4.6305, "step": 967 }, { "epoch": 0.25, "grad_norm": 2.2279958724975586, "learning_rate": 4.269200669410771e-05, "loss": 3.9988, "step": 968 }, { "epoch": 0.2502582644628099, "grad_norm": 2.692396879196167, "learning_rate": 4.267766952966369e-05, "loss": 4.5393, "step": 969 }, { "epoch": 0.2505165289256198, "grad_norm": 5.140891075134277, "learning_rate": 4.2663320727871305e-05, "loss": 4.3359, "step": 970 }, { "epoch": 0.25077479338842973, "grad_norm": 5.090999126434326, "learning_rate": 4.264896029817649e-05, "loss": 3.7616, "step": 971 }, { "epoch": 0.2510330578512397, "grad_norm": 2.0044045448303223, "learning_rate": 4.263458825003282e-05, "loss": 4.1177, "step": 972 }, { "epoch": 0.2512913223140496, "grad_norm": 2.801265239715576, "learning_rate": 4.262020459290152e-05, "loss": 4.3287, "step": 973 }, { "epoch": 0.2515495867768595, "grad_norm": 2.9384589195251465, "learning_rate": 4.2605809336251476e-05, "loss": 4.8009, "step": 974 }, { "epoch": 0.2518078512396694, "grad_norm": 3.982542037963867, "learning_rate": 4.2591402489559205e-05, "loss": 4.6742, "step": 975 }, { "epoch": 0.25206611570247933, "grad_norm": 3.405592203140259, "learning_rate": 4.257698406230884e-05, "loss": 4.4247, "step": 976 }, { "epoch": 0.25232438016528924, "grad_norm": 4.526679992675781, "learning_rate": 4.256255406399213e-05, "loss": 4.1347, "step": 977 }, { "epoch": 0.25258264462809915, "grad_norm": 5.792008876800537, "learning_rate": 4.254811250410849e-05, "loss": 4.0011, "step": 978 }, { "epoch": 0.2528409090909091, "grad_norm": 2.838899612426758, "learning_rate": 4.2533659392164874e-05, "loss": 4.7916, "step": 979 }, { "epoch": 0.253099173553719, "grad_norm": 2.886744499206543, "learning_rate": 4.251919473767589e-05, "loss": 3.9895, "step": 980 }, { "epoch": 0.25335743801652894, "grad_norm": 2.2625539302825928, "learning_rate": 4.2504718550163745e-05, "loss": 4.281, "step": 981 }, { "epoch": 0.25361570247933884, "grad_norm": 2.3116164207458496, "learning_rate": 4.249023083915823e-05, "loss": 4.6059, "step": 982 }, { "epoch": 0.25387396694214875, "grad_norm": 4.1350016593933105, "learning_rate": 4.2475731614196704e-05, "loss": 3.8765, "step": 983 }, { "epoch": 0.25413223140495866, "grad_norm": 3.198376178741455, "learning_rate": 4.246122088482412e-05, "loss": 4.5437, "step": 984 }, { "epoch": 0.2543904958677686, "grad_norm": 2.8990695476531982, "learning_rate": 4.244669866059302e-05, "loss": 3.7207, "step": 985 }, { "epoch": 0.25464876033057854, "grad_norm": 2.205800771713257, "learning_rate": 4.243216495106348e-05, "loss": 4.7103, "step": 986 }, { "epoch": 0.25490702479338845, "grad_norm": 2.1716272830963135, "learning_rate": 4.241761976580316e-05, "loss": 4.0538, "step": 987 }, { "epoch": 0.25516528925619836, "grad_norm": 4.29656982421875, "learning_rate": 4.2403063114387286e-05, "loss": 4.2143, "step": 988 }, { "epoch": 0.25542355371900827, "grad_norm": 2.191084623336792, "learning_rate": 4.238849500639859e-05, "loss": 4.3029, "step": 989 }, { "epoch": 0.2556818181818182, "grad_norm": 3.1089742183685303, "learning_rate": 4.2373915451427394e-05, "loss": 4.3218, "step": 990 }, { "epoch": 0.2559400826446281, "grad_norm": 1.9347580671310425, "learning_rate": 4.235932445907152e-05, "loss": 3.9737, "step": 991 }, { "epoch": 0.256198347107438, "grad_norm": 3.4153876304626465, "learning_rate": 4.234472203893634e-05, "loss": 4.397, "step": 992 }, { "epoch": 0.25645661157024796, "grad_norm": 2.3127739429473877, "learning_rate": 4.233010820063473e-05, "loss": 3.7235, "step": 993 }, { "epoch": 0.25671487603305787, "grad_norm": 2.109673261642456, "learning_rate": 4.23154829537871e-05, "loss": 4.2191, "step": 994 }, { "epoch": 0.2569731404958678, "grad_norm": 5.036170959472656, "learning_rate": 4.230084630802137e-05, "loss": 4.499, "step": 995 }, { "epoch": 0.2572314049586777, "grad_norm": 5.0232110023498535, "learning_rate": 4.228619827297296e-05, "loss": 3.8685, "step": 996 }, { "epoch": 0.2574896694214876, "grad_norm": 2.7369680404663086, "learning_rate": 4.227153885828478e-05, "loss": 4.1708, "step": 997 }, { "epoch": 0.2577479338842975, "grad_norm": 3.0924153327941895, "learning_rate": 4.2256868073607235e-05, "loss": 4.127, "step": 998 }, { "epoch": 0.2580061983471074, "grad_norm": 3.9515185356140137, "learning_rate": 4.224218592859822e-05, "loss": 3.8606, "step": 999 }, { "epoch": 0.25826446280991733, "grad_norm": 2.0813560485839844, "learning_rate": 4.222749243292311e-05, "loss": 4.0674, "step": 1000 }, { "epoch": 0.2585227272727273, "grad_norm": 2.20292592048645, "learning_rate": 4.2212787596254757e-05, "loss": 4.1582, "step": 1001 }, { "epoch": 0.2587809917355372, "grad_norm": 3.86102294921875, "learning_rate": 4.2198071428273444e-05, "loss": 4.3126, "step": 1002 }, { "epoch": 0.2590392561983471, "grad_norm": 1.869868278503418, "learning_rate": 4.218334393866696e-05, "loss": 4.1637, "step": 1003 }, { "epoch": 0.259297520661157, "grad_norm": 5.46482515335083, "learning_rate": 4.2168605137130516e-05, "loss": 4.1872, "step": 1004 }, { "epoch": 0.25955578512396693, "grad_norm": 2.130531072616577, "learning_rate": 4.2153855033366796e-05, "loss": 4.4283, "step": 1005 }, { "epoch": 0.25981404958677684, "grad_norm": 4.729911804199219, "learning_rate": 4.213909363708589e-05, "loss": 3.93, "step": 1006 }, { "epoch": 0.26007231404958675, "grad_norm": 4.158443927764893, "learning_rate": 4.212432095800536e-05, "loss": 4.6786, "step": 1007 }, { "epoch": 0.2603305785123967, "grad_norm": 2.7735049724578857, "learning_rate": 4.2109537005850153e-05, "loss": 4.0379, "step": 1008 }, { "epoch": 0.2605888429752066, "grad_norm": 6.6612162590026855, "learning_rate": 4.2094741790352675e-05, "loss": 3.712, "step": 1009 }, { "epoch": 0.26084710743801653, "grad_norm": 7.1550092697143555, "learning_rate": 4.207993532125274e-05, "loss": 4.5475, "step": 1010 }, { "epoch": 0.26110537190082644, "grad_norm": 2.0233993530273438, "learning_rate": 4.2065117608297556e-05, "loss": 4.6104, "step": 1011 }, { "epoch": 0.26136363636363635, "grad_norm": 2.4000604152679443, "learning_rate": 4.205028866124173e-05, "loss": 4.4392, "step": 1012 }, { "epoch": 0.26162190082644626, "grad_norm": 3.743776798248291, "learning_rate": 4.2035448489847284e-05, "loss": 3.7928, "step": 1013 }, { "epoch": 0.2618801652892562, "grad_norm": 2.160031795501709, "learning_rate": 4.202059710388361e-05, "loss": 4.0695, "step": 1014 }, { "epoch": 0.26213842975206614, "grad_norm": 3.746325731277466, "learning_rate": 4.20057345131275e-05, "loss": 4.1082, "step": 1015 }, { "epoch": 0.26239669421487605, "grad_norm": 3.5613245964050293, "learning_rate": 4.199086072736311e-05, "loss": 3.9396, "step": 1016 }, { "epoch": 0.26265495867768596, "grad_norm": 4.533037185668945, "learning_rate": 4.197597575638198e-05, "loss": 4.6382, "step": 1017 }, { "epoch": 0.26291322314049587, "grad_norm": 3.698241949081421, "learning_rate": 4.1961079609982994e-05, "loss": 4.6569, "step": 1018 }, { "epoch": 0.2631714876033058, "grad_norm": 5.22740364074707, "learning_rate": 4.19461722979724e-05, "loss": 4.138, "step": 1019 }, { "epoch": 0.2634297520661157, "grad_norm": 4.092823028564453, "learning_rate": 4.19312538301638e-05, "loss": 4.2261, "step": 1020 }, { "epoch": 0.2636880165289256, "grad_norm": 2.708958864212036, "learning_rate": 4.191632421637815e-05, "loss": 4.13, "step": 1021 }, { "epoch": 0.26394628099173556, "grad_norm": 3.197927236557007, "learning_rate": 4.19013834664437e-05, "loss": 4.5138, "step": 1022 }, { "epoch": 0.26420454545454547, "grad_norm": 4.1251912117004395, "learning_rate": 4.1886431590196096e-05, "loss": 4.4913, "step": 1023 }, { "epoch": 0.2644628099173554, "grad_norm": 3.269256114959717, "learning_rate": 4.187146859747826e-05, "loss": 4.3209, "step": 1024 }, { "epoch": 0.2647210743801653, "grad_norm": 3.185358762741089, "learning_rate": 4.1856494498140454e-05, "loss": 4.0905, "step": 1025 }, { "epoch": 0.2649793388429752, "grad_norm": 3.5503063201904297, "learning_rate": 4.184150930204024e-05, "loss": 4.4266, "step": 1026 }, { "epoch": 0.2652376033057851, "grad_norm": 3.6543872356414795, "learning_rate": 4.182651301904249e-05, "loss": 4.0172, "step": 1027 }, { "epoch": 0.265495867768595, "grad_norm": 5.779666423797607, "learning_rate": 4.181150565901938e-05, "loss": 4.0199, "step": 1028 }, { "epoch": 0.265754132231405, "grad_norm": 3.1108529567718506, "learning_rate": 4.179648723185036e-05, "loss": 4.1383, "step": 1029 }, { "epoch": 0.2660123966942149, "grad_norm": 2.1474010944366455, "learning_rate": 4.178145774742219e-05, "loss": 4.1521, "step": 1030 }, { "epoch": 0.2662706611570248, "grad_norm": 2.8138372898101807, "learning_rate": 4.176641721562889e-05, "loss": 4.4746, "step": 1031 }, { "epoch": 0.2665289256198347, "grad_norm": 2.225350856781006, "learning_rate": 4.1751365646371776e-05, "loss": 4.076, "step": 1032 }, { "epoch": 0.2667871900826446, "grad_norm": 3.6185245513916016, "learning_rate": 4.173630304955939e-05, "loss": 4.4646, "step": 1033 }, { "epoch": 0.26704545454545453, "grad_norm": 4.271774768829346, "learning_rate": 4.172122943510758e-05, "loss": 4.2733, "step": 1034 }, { "epoch": 0.26730371900826444, "grad_norm": 3.6717050075531006, "learning_rate": 4.170614481293941e-05, "loss": 4.2623, "step": 1035 }, { "epoch": 0.2675619834710744, "grad_norm": 4.598880767822266, "learning_rate": 4.169104919298521e-05, "loss": 4.273, "step": 1036 }, { "epoch": 0.2678202479338843, "grad_norm": 5.0541276931762695, "learning_rate": 4.167594258518254e-05, "loss": 4.038, "step": 1037 }, { "epoch": 0.2680785123966942, "grad_norm": 2.616177558898926, "learning_rate": 4.16608249994762e-05, "loss": 3.9789, "step": 1038 }, { "epoch": 0.26833677685950413, "grad_norm": 2.7061588764190674, "learning_rate": 4.164569644581823e-05, "loss": 4.1141, "step": 1039 }, { "epoch": 0.26859504132231404, "grad_norm": 4.4221272468566895, "learning_rate": 4.163055693416785e-05, "loss": 4.0555, "step": 1040 }, { "epoch": 0.26885330578512395, "grad_norm": 2.433444023132324, "learning_rate": 4.161540647449154e-05, "loss": 4.2466, "step": 1041 }, { "epoch": 0.26911157024793386, "grad_norm": 2.9873366355895996, "learning_rate": 4.1600245076762955e-05, "loss": 4.5674, "step": 1042 }, { "epoch": 0.2693698347107438, "grad_norm": 5.345172882080078, "learning_rate": 4.1585072750962964e-05, "loss": 3.6472, "step": 1043 }, { "epoch": 0.26962809917355374, "grad_norm": 3.6425552368164062, "learning_rate": 4.156988950707963e-05, "loss": 4.2707, "step": 1044 }, { "epoch": 0.26988636363636365, "grad_norm": 3.5044994354248047, "learning_rate": 4.15546953551082e-05, "loss": 3.8413, "step": 1045 }, { "epoch": 0.27014462809917356, "grad_norm": 3.2339987754821777, "learning_rate": 4.1539490305051107e-05, "loss": 3.8149, "step": 1046 }, { "epoch": 0.27040289256198347, "grad_norm": 3.139009952545166, "learning_rate": 4.152427436691795e-05, "loss": 4.0161, "step": 1047 }, { "epoch": 0.2706611570247934, "grad_norm": 1.8700014352798462, "learning_rate": 4.15090475507255e-05, "loss": 3.9803, "step": 1048 }, { "epoch": 0.2709194214876033, "grad_norm": 3.7616944313049316, "learning_rate": 4.1493809866497694e-05, "loss": 4.2395, "step": 1049 }, { "epoch": 0.27117768595041325, "grad_norm": 2.421318531036377, "learning_rate": 4.147856132426561e-05, "loss": 4.0041, "step": 1050 }, { "epoch": 0.27143595041322316, "grad_norm": 2.838829755783081, "learning_rate": 4.146330193406749e-05, "loss": 4.3821, "step": 1051 }, { "epoch": 0.27169421487603307, "grad_norm": 2.72395396232605, "learning_rate": 4.144803170594871e-05, "loss": 4.1345, "step": 1052 }, { "epoch": 0.271952479338843, "grad_norm": 2.807156562805176, "learning_rate": 4.1432750649961785e-05, "loss": 4.0784, "step": 1053 }, { "epoch": 0.2722107438016529, "grad_norm": 3.293426275253296, "learning_rate": 4.141745877616635e-05, "loss": 4.1106, "step": 1054 }, { "epoch": 0.2724690082644628, "grad_norm": 1.9212918281555176, "learning_rate": 4.140215609462915e-05, "loss": 3.922, "step": 1055 }, { "epoch": 0.2727272727272727, "grad_norm": 4.678307056427002, "learning_rate": 4.138684261542409e-05, "loss": 3.6036, "step": 1056 }, { "epoch": 0.27298553719008267, "grad_norm": 3.258934736251831, "learning_rate": 4.137151834863213e-05, "loss": 4.3222, "step": 1057 }, { "epoch": 0.2732438016528926, "grad_norm": 2.5442395210266113, "learning_rate": 4.135618330434136e-05, "loss": 4.54, "step": 1058 }, { "epoch": 0.2735020661157025, "grad_norm": 1.9547654390335083, "learning_rate": 4.134083749264697e-05, "loss": 4.3036, "step": 1059 }, { "epoch": 0.2737603305785124, "grad_norm": 2.2999308109283447, "learning_rate": 4.132548092365122e-05, "loss": 4.5731, "step": 1060 }, { "epoch": 0.2740185950413223, "grad_norm": 1.948661208152771, "learning_rate": 4.131011360746345e-05, "loss": 4.6524, "step": 1061 }, { "epoch": 0.2742768595041322, "grad_norm": 3.2606709003448486, "learning_rate": 4.1294735554200103e-05, "loss": 3.6862, "step": 1062 }, { "epoch": 0.27453512396694213, "grad_norm": 2.1198394298553467, "learning_rate": 4.127934677398466e-05, "loss": 4.3393, "step": 1063 }, { "epoch": 0.27479338842975204, "grad_norm": 3.1867964267730713, "learning_rate": 4.1263947276947686e-05, "loss": 4.4048, "step": 1064 }, { "epoch": 0.275051652892562, "grad_norm": 2.775479793548584, "learning_rate": 4.1248537073226775e-05, "loss": 3.5927, "step": 1065 }, { "epoch": 0.2753099173553719, "grad_norm": 3.2473018169403076, "learning_rate": 4.1233116172966595e-05, "loss": 4.4045, "step": 1066 }, { "epoch": 0.2755681818181818, "grad_norm": 4.158466815948486, "learning_rate": 4.121768458631885e-05, "loss": 4.6994, "step": 1067 }, { "epoch": 0.27582644628099173, "grad_norm": 4.224048614501953, "learning_rate": 4.120224232344227e-05, "loss": 4.1222, "step": 1068 }, { "epoch": 0.27608471074380164, "grad_norm": 3.064734697341919, "learning_rate": 4.118678939450261e-05, "loss": 3.8419, "step": 1069 }, { "epoch": 0.27634297520661155, "grad_norm": 2.1094415187835693, "learning_rate": 4.117132580967265e-05, "loss": 3.5989, "step": 1070 }, { "epoch": 0.27660123966942146, "grad_norm": 3.1439149379730225, "learning_rate": 4.115585157913222e-05, "loss": 3.7952, "step": 1071 }, { "epoch": 0.2768595041322314, "grad_norm": 3.799440622329712, "learning_rate": 4.1140366713068104e-05, "loss": 4.0071, "step": 1072 }, { "epoch": 0.27711776859504134, "grad_norm": 3.5201382637023926, "learning_rate": 4.1124871221674096e-05, "loss": 4.4098, "step": 1073 }, { "epoch": 0.27737603305785125, "grad_norm": 2.3562259674072266, "learning_rate": 4.110936511515103e-05, "loss": 4.0899, "step": 1074 }, { "epoch": 0.27763429752066116, "grad_norm": 2.767244815826416, "learning_rate": 4.109384840370668e-05, "loss": 4.2832, "step": 1075 }, { "epoch": 0.27789256198347106, "grad_norm": 6.16216516494751, "learning_rate": 4.107832109755583e-05, "loss": 3.9588, "step": 1076 }, { "epoch": 0.278150826446281, "grad_norm": 9.287568092346191, "learning_rate": 4.106278320692022e-05, "loss": 3.602, "step": 1077 }, { "epoch": 0.2784090909090909, "grad_norm": 2.0297181606292725, "learning_rate": 4.104723474202856e-05, "loss": 4.3725, "step": 1078 }, { "epoch": 0.27866735537190085, "grad_norm": 2.9879045486450195, "learning_rate": 4.1031675713116524e-05, "loss": 3.8679, "step": 1079 }, { "epoch": 0.27892561983471076, "grad_norm": 2.7868008613586426, "learning_rate": 4.1016106130426755e-05, "loss": 3.9017, "step": 1080 }, { "epoch": 0.27918388429752067, "grad_norm": 2.9436705112457275, "learning_rate": 4.100052600420884e-05, "loss": 4.7159, "step": 1081 }, { "epoch": 0.2794421487603306, "grad_norm": 2.1844942569732666, "learning_rate": 4.098493534471927e-05, "loss": 4.4853, "step": 1082 }, { "epoch": 0.2797004132231405, "grad_norm": 2.5950803756713867, "learning_rate": 4.0969334162221514e-05, "loss": 4.2893, "step": 1083 }, { "epoch": 0.2799586776859504, "grad_norm": 3.474708080291748, "learning_rate": 4.095372246698596e-05, "loss": 3.5148, "step": 1084 }, { "epoch": 0.2802169421487603, "grad_norm": 3.3742032051086426, "learning_rate": 4.09381002692899e-05, "loss": 4.6324, "step": 1085 }, { "epoch": 0.28047520661157027, "grad_norm": 2.508314609527588, "learning_rate": 4.0922467579417546e-05, "loss": 4.3283, "step": 1086 }, { "epoch": 0.2807334710743802, "grad_norm": 2.5059738159179688, "learning_rate": 4.090682440766002e-05, "loss": 4.7232, "step": 1087 }, { "epoch": 0.2809917355371901, "grad_norm": 7.259554862976074, "learning_rate": 4.089117076431536e-05, "loss": 4.7916, "step": 1088 }, { "epoch": 0.28125, "grad_norm": 2.623628616333008, "learning_rate": 4.0875506659688465e-05, "loss": 4.2425, "step": 1089 }, { "epoch": 0.2815082644628099, "grad_norm": 2.1941311359405518, "learning_rate": 4.085983210409114e-05, "loss": 4.5893, "step": 1090 }, { "epoch": 0.2817665289256198, "grad_norm": 4.286355972290039, "learning_rate": 4.084414710784208e-05, "loss": 3.9765, "step": 1091 }, { "epoch": 0.28202479338842973, "grad_norm": 2.464715003967285, "learning_rate": 4.082845168126682e-05, "loss": 4.3371, "step": 1092 }, { "epoch": 0.2822830578512397, "grad_norm": 2.680999517440796, "learning_rate": 4.081274583469781e-05, "loss": 4.1573, "step": 1093 }, { "epoch": 0.2825413223140496, "grad_norm": 3.410165548324585, "learning_rate": 4.07970295784743e-05, "loss": 4.2994, "step": 1094 }, { "epoch": 0.2827995867768595, "grad_norm": 2.512529134750366, "learning_rate": 4.0781302922942455e-05, "loss": 3.7748, "step": 1095 }, { "epoch": 0.2830578512396694, "grad_norm": 5.2254180908203125, "learning_rate": 4.076556587845524e-05, "loss": 3.3508, "step": 1096 }, { "epoch": 0.28331611570247933, "grad_norm": 2.831851005554199, "learning_rate": 4.074981845537247e-05, "loss": 4.031, "step": 1097 }, { "epoch": 0.28357438016528924, "grad_norm": 2.716183662414551, "learning_rate": 4.073406066406081e-05, "loss": 4.0839, "step": 1098 }, { "epoch": 0.28383264462809915, "grad_norm": 2.735152006149292, "learning_rate": 4.071829251489373e-05, "loss": 4.2465, "step": 1099 }, { "epoch": 0.2840909090909091, "grad_norm": 3.078917980194092, "learning_rate": 4.0702514018251524e-05, "loss": 3.8521, "step": 1100 }, { "epoch": 0.284349173553719, "grad_norm": 1.6595752239227295, "learning_rate": 4.06867251845213e-05, "loss": 4.1685, "step": 1101 }, { "epoch": 0.28460743801652894, "grad_norm": 2.1214449405670166, "learning_rate": 4.067092602409698e-05, "loss": 3.8018, "step": 1102 }, { "epoch": 0.28486570247933884, "grad_norm": 3.256396532058716, "learning_rate": 4.065511654737927e-05, "loss": 3.9279, "step": 1103 }, { "epoch": 0.28512396694214875, "grad_norm": 2.665863513946533, "learning_rate": 4.063929676477567e-05, "loss": 4.1118, "step": 1104 }, { "epoch": 0.28538223140495866, "grad_norm": 2.5126500129699707, "learning_rate": 4.062346668670046e-05, "loss": 4.0836, "step": 1105 }, { "epoch": 0.2856404958677686, "grad_norm": 2.807429075241089, "learning_rate": 4.060762632357473e-05, "loss": 4.6796, "step": 1106 }, { "epoch": 0.28589876033057854, "grad_norm": 3.9827442169189453, "learning_rate": 4.0591775685826285e-05, "loss": 3.9171, "step": 1107 }, { "epoch": 0.28615702479338845, "grad_norm": 2.4633195400238037, "learning_rate": 4.0575914783889735e-05, "loss": 3.7057, "step": 1108 }, { "epoch": 0.28641528925619836, "grad_norm": 2.5921459197998047, "learning_rate": 4.0560043628206446e-05, "loss": 4.8307, "step": 1109 }, { "epoch": 0.28667355371900827, "grad_norm": 3.456768274307251, "learning_rate": 4.054416222922452e-05, "loss": 4.0895, "step": 1110 }, { "epoch": 0.2869318181818182, "grad_norm": 3.669999599456787, "learning_rate": 4.052827059739879e-05, "loss": 4.4822, "step": 1111 }, { "epoch": 0.2871900826446281, "grad_norm": 3.465320110321045, "learning_rate": 4.051236874319086e-05, "loss": 4.3501, "step": 1112 }, { "epoch": 0.287448347107438, "grad_norm": 3.8518128395080566, "learning_rate": 4.0496456677069045e-05, "loss": 4.2856, "step": 1113 }, { "epoch": 0.28770661157024796, "grad_norm": 1.6459428071975708, "learning_rate": 4.048053440950838e-05, "loss": 4.2049, "step": 1114 }, { "epoch": 0.28796487603305787, "grad_norm": 1.9270769357681274, "learning_rate": 4.04646019509906e-05, "loss": 4.2239, "step": 1115 }, { "epoch": 0.2882231404958678, "grad_norm": 2.5791163444519043, "learning_rate": 4.044865931200419e-05, "loss": 4.2397, "step": 1116 }, { "epoch": 0.2884814049586777, "grad_norm": 2.995785713195801, "learning_rate": 4.0432706503044315e-05, "loss": 4.6991, "step": 1117 }, { "epoch": 0.2887396694214876, "grad_norm": 3.6049246788024902, "learning_rate": 4.041674353461282e-05, "loss": 4.076, "step": 1118 }, { "epoch": 0.2889979338842975, "grad_norm": 2.9096322059631348, "learning_rate": 4.040077041721826e-05, "loss": 3.9904, "step": 1119 }, { "epoch": 0.2892561983471074, "grad_norm": 4.937403202056885, "learning_rate": 4.038478716137586e-05, "loss": 3.7756, "step": 1120 }, { "epoch": 0.28951446280991733, "grad_norm": 3.1214945316314697, "learning_rate": 4.0368793777607524e-05, "loss": 4.2775, "step": 1121 }, { "epoch": 0.2897727272727273, "grad_norm": 2.1926424503326416, "learning_rate": 4.0352790276441823e-05, "loss": 4.5154, "step": 1122 }, { "epoch": 0.2900309917355372, "grad_norm": 3.151686668395996, "learning_rate": 4.033677666841399e-05, "loss": 4.1591, "step": 1123 }, { "epoch": 0.2902892561983471, "grad_norm": 3.0238983631134033, "learning_rate": 4.03207529640659e-05, "loss": 3.6172, "step": 1124 }, { "epoch": 0.290547520661157, "grad_norm": 4.593628406524658, "learning_rate": 4.0304719173946096e-05, "loss": 3.6979, "step": 1125 }, { "epoch": 0.29080578512396693, "grad_norm": 3.221773147583008, "learning_rate": 4.028867530860974e-05, "loss": 4.0212, "step": 1126 }, { "epoch": 0.29106404958677684, "grad_norm": 3.5841665267944336, "learning_rate": 4.027262137861863e-05, "loss": 4.3814, "step": 1127 }, { "epoch": 0.29132231404958675, "grad_norm": 2.8071744441986084, "learning_rate": 4.02565573945412e-05, "loss": 4.5151, "step": 1128 }, { "epoch": 0.2915805785123967, "grad_norm": 11.023234367370605, "learning_rate": 4.024048336695248e-05, "loss": 4.6423, "step": 1129 }, { "epoch": 0.2918388429752066, "grad_norm": 3.6653239727020264, "learning_rate": 4.0224399306434157e-05, "loss": 4.2194, "step": 1130 }, { "epoch": 0.29209710743801653, "grad_norm": 3.646267890930176, "learning_rate": 4.020830522357448e-05, "loss": 3.3341, "step": 1131 }, { "epoch": 0.29235537190082644, "grad_norm": 4.283847808837891, "learning_rate": 4.019220112896831e-05, "loss": 4.2417, "step": 1132 }, { "epoch": 0.29261363636363635, "grad_norm": 2.8627328872680664, "learning_rate": 4.01760870332171e-05, "loss": 4.2451, "step": 1133 }, { "epoch": 0.29287190082644626, "grad_norm": 3.1673972606658936, "learning_rate": 4.015996294692889e-05, "loss": 4.4063, "step": 1134 }, { "epoch": 0.2931301652892562, "grad_norm": 3.6003966331481934, "learning_rate": 4.014382888071827e-05, "loss": 4.3143, "step": 1135 }, { "epoch": 0.29338842975206614, "grad_norm": 2.212770700454712, "learning_rate": 4.012768484520645e-05, "loss": 4.301, "step": 1136 }, { "epoch": 0.29364669421487605, "grad_norm": 4.477606296539307, "learning_rate": 4.0111530851021164e-05, "loss": 4.5305, "step": 1137 }, { "epoch": 0.29390495867768596, "grad_norm": 2.974325656890869, "learning_rate": 4.009536690879671e-05, "loss": 4.6485, "step": 1138 }, { "epoch": 0.29416322314049587, "grad_norm": 2.1131114959716797, "learning_rate": 4.007919302917393e-05, "loss": 3.7645, "step": 1139 }, { "epoch": 0.2944214876033058, "grad_norm": 2.856687068939209, "learning_rate": 4.0063009222800243e-05, "loss": 3.9655, "step": 1140 }, { "epoch": 0.2946797520661157, "grad_norm": 3.2794432640075684, "learning_rate": 4.004681550032955e-05, "loss": 4.2807, "step": 1141 }, { "epoch": 0.2949380165289256, "grad_norm": 3.044412612915039, "learning_rate": 4.003061187242232e-05, "loss": 4.0139, "step": 1142 }, { "epoch": 0.29519628099173556, "grad_norm": 3.8245506286621094, "learning_rate": 4.001439834974552e-05, "loss": 3.7324, "step": 1143 }, { "epoch": 0.29545454545454547, "grad_norm": 2.664884090423584, "learning_rate": 3.999817494297264e-05, "loss": 4.4551, "step": 1144 }, { "epoch": 0.2957128099173554, "grad_norm": 5.277992248535156, "learning_rate": 3.9981941662783674e-05, "loss": 5.0818, "step": 1145 }, { "epoch": 0.2959710743801653, "grad_norm": 3.2974421977996826, "learning_rate": 3.996569851986513e-05, "loss": 3.7167, "step": 1146 }, { "epoch": 0.2962293388429752, "grad_norm": 3.6730399131774902, "learning_rate": 3.994944552490998e-05, "loss": 4.7171, "step": 1147 }, { "epoch": 0.2964876033057851, "grad_norm": 2.2121524810791016, "learning_rate": 3.9933182688617705e-05, "loss": 4.0241, "step": 1148 }, { "epoch": 0.296745867768595, "grad_norm": 3.0910520553588867, "learning_rate": 3.991691002169426e-05, "loss": 4.0246, "step": 1149 }, { "epoch": 0.297004132231405, "grad_norm": 4.244194030761719, "learning_rate": 3.9900627534852066e-05, "loss": 3.9135, "step": 1150 }, { "epoch": 0.2972623966942149, "grad_norm": 3.144059419631958, "learning_rate": 3.988433523881001e-05, "loss": 4.2575, "step": 1151 }, { "epoch": 0.2975206611570248, "grad_norm": 3.372464179992676, "learning_rate": 3.986803314429344e-05, "loss": 4.4718, "step": 1152 }, { "epoch": 0.2977789256198347, "grad_norm": 4.980547904968262, "learning_rate": 3.9851721262034156e-05, "loss": 4.1783, "step": 1153 }, { "epoch": 0.2980371900826446, "grad_norm": 3.2398624420166016, "learning_rate": 3.9835399602770396e-05, "loss": 4.1571, "step": 1154 }, { "epoch": 0.29829545454545453, "grad_norm": 3.4737911224365234, "learning_rate": 3.9819068177246834e-05, "loss": 3.842, "step": 1155 }, { "epoch": 0.29855371900826444, "grad_norm": 2.5916380882263184, "learning_rate": 3.980272699621458e-05, "loss": 4.3094, "step": 1156 }, { "epoch": 0.2988119834710744, "grad_norm": 3.1258223056793213, "learning_rate": 3.978637607043115e-05, "loss": 4.2375, "step": 1157 }, { "epoch": 0.2990702479338843, "grad_norm": 4.3700103759765625, "learning_rate": 3.9770015410660494e-05, "loss": 3.6347, "step": 1158 }, { "epoch": 0.2993285123966942, "grad_norm": 6.239528179168701, "learning_rate": 3.975364502767297e-05, "loss": 3.554, "step": 1159 }, { "epoch": 0.29958677685950413, "grad_norm": 2.7190301418304443, "learning_rate": 3.973726493224532e-05, "loss": 4.315, "step": 1160 }, { "epoch": 0.29984504132231404, "grad_norm": 2.2138125896453857, "learning_rate": 3.972087513516069e-05, "loss": 4.6675, "step": 1161 }, { "epoch": 0.30010330578512395, "grad_norm": 4.922652244567871, "learning_rate": 3.9704475647208616e-05, "loss": 4.2642, "step": 1162 }, { "epoch": 0.30036157024793386, "grad_norm": 4.200195789337158, "learning_rate": 3.968806647918501e-05, "loss": 3.9931, "step": 1163 }, { "epoch": 0.3006198347107438, "grad_norm": 2.880664348602295, "learning_rate": 3.967164764189215e-05, "loss": 4.022, "step": 1164 }, { "epoch": 0.30087809917355374, "grad_norm": 2.8593130111694336, "learning_rate": 3.965521914613868e-05, "loss": 4.4281, "step": 1165 }, { "epoch": 0.30113636363636365, "grad_norm": 3.4476590156555176, "learning_rate": 3.963878100273962e-05, "loss": 4.0936, "step": 1166 }, { "epoch": 0.30139462809917356, "grad_norm": 3.2103140354156494, "learning_rate": 3.9622333222516326e-05, "loss": 4.3911, "step": 1167 }, { "epoch": 0.30165289256198347, "grad_norm": 1.7922616004943848, "learning_rate": 3.96058758162965e-05, "loss": 4.2156, "step": 1168 }, { "epoch": 0.3019111570247934, "grad_norm": 3.011524200439453, "learning_rate": 3.958940879491418e-05, "loss": 3.7695, "step": 1169 }, { "epoch": 0.3021694214876033, "grad_norm": 4.612856388092041, "learning_rate": 3.9572932169209735e-05, "loss": 3.8906, "step": 1170 }, { "epoch": 0.30242768595041325, "grad_norm": 3.142853260040283, "learning_rate": 3.9556445950029866e-05, "loss": 4.1352, "step": 1171 }, { "epoch": 0.30268595041322316, "grad_norm": 3.2091598510742188, "learning_rate": 3.953995014822755e-05, "loss": 4.2702, "step": 1172 }, { "epoch": 0.30294421487603307, "grad_norm": 2.988811492919922, "learning_rate": 3.9523444774662136e-05, "loss": 3.9668, "step": 1173 }, { "epoch": 0.303202479338843, "grad_norm": 3.713587999343872, "learning_rate": 3.950692984019923e-05, "loss": 3.9916, "step": 1174 }, { "epoch": 0.3034607438016529, "grad_norm": 2.510329246520996, "learning_rate": 3.9490405355710745e-05, "loss": 4.2005, "step": 1175 }, { "epoch": 0.3037190082644628, "grad_norm": 4.187905311584473, "learning_rate": 3.947387133207487e-05, "loss": 3.7942, "step": 1176 }, { "epoch": 0.3039772727272727, "grad_norm": 2.087900400161743, "learning_rate": 3.945732778017609e-05, "loss": 4.0724, "step": 1177 }, { "epoch": 0.30423553719008267, "grad_norm": 5.214295864105225, "learning_rate": 3.944077471090515e-05, "loss": 3.7119, "step": 1178 }, { "epoch": 0.3044938016528926, "grad_norm": 3.689541816711426, "learning_rate": 3.942421213515906e-05, "loss": 4.174, "step": 1179 }, { "epoch": 0.3047520661157025, "grad_norm": 3.7565529346466064, "learning_rate": 3.940764006384111e-05, "loss": 4.224, "step": 1180 }, { "epoch": 0.3050103305785124, "grad_norm": 2.9743542671203613, "learning_rate": 3.939105850786081e-05, "loss": 4.0479, "step": 1181 }, { "epoch": 0.3052685950413223, "grad_norm": 2.7155861854553223, "learning_rate": 3.937446747813394e-05, "loss": 4.2272, "step": 1182 }, { "epoch": 0.3055268595041322, "grad_norm": 2.1422119140625, "learning_rate": 3.93578669855825e-05, "loss": 4.2466, "step": 1183 }, { "epoch": 0.30578512396694213, "grad_norm": 5.181281566619873, "learning_rate": 3.9341257041134716e-05, "loss": 3.8919, "step": 1184 }, { "epoch": 0.30604338842975204, "grad_norm": 2.9527764320373535, "learning_rate": 3.9324637655725055e-05, "loss": 4.204, "step": 1185 }, { "epoch": 0.306301652892562, "grad_norm": 4.260028839111328, "learning_rate": 3.930800884029419e-05, "loss": 4.0491, "step": 1186 }, { "epoch": 0.3065599173553719, "grad_norm": 3.5401971340179443, "learning_rate": 3.929137060578898e-05, "loss": 4.13, "step": 1187 }, { "epoch": 0.3068181818181818, "grad_norm": 3.4431381225585938, "learning_rate": 3.927472296316253e-05, "loss": 4.078, "step": 1188 }, { "epoch": 0.30707644628099173, "grad_norm": 2.3062708377838135, "learning_rate": 3.92580659233741e-05, "loss": 4.6002, "step": 1189 }, { "epoch": 0.30733471074380164, "grad_norm": 2.082777738571167, "learning_rate": 3.924139949738916e-05, "loss": 3.8658, "step": 1190 }, { "epoch": 0.30759297520661155, "grad_norm": 4.062329292297363, "learning_rate": 3.922472369617934e-05, "loss": 3.8035, "step": 1191 }, { "epoch": 0.30785123966942146, "grad_norm": 3.1088407039642334, "learning_rate": 3.920803853072246e-05, "loss": 4.8293, "step": 1192 }, { "epoch": 0.3081095041322314, "grad_norm": 6.427333831787109, "learning_rate": 3.9191344012002476e-05, "loss": 4.4556, "step": 1193 }, { "epoch": 0.30836776859504134, "grad_norm": 2.808093547821045, "learning_rate": 3.9174640151009534e-05, "loss": 3.8395, "step": 1194 }, { "epoch": 0.30862603305785125, "grad_norm": 1.7760009765625, "learning_rate": 3.915792695873992e-05, "loss": 4.3956, "step": 1195 }, { "epoch": 0.30888429752066116, "grad_norm": 3.261389970779419, "learning_rate": 3.914120444619606e-05, "loss": 4.47, "step": 1196 }, { "epoch": 0.30914256198347106, "grad_norm": 3.269498109817505, "learning_rate": 3.912447262438651e-05, "loss": 3.7782, "step": 1197 }, { "epoch": 0.309400826446281, "grad_norm": 3.077920913696289, "learning_rate": 3.910773150432595e-05, "loss": 4.0892, "step": 1198 }, { "epoch": 0.3096590909090909, "grad_norm": 4.549795150756836, "learning_rate": 3.9090981097035204e-05, "loss": 4.1716, "step": 1199 }, { "epoch": 0.30991735537190085, "grad_norm": 2.9768402576446533, "learning_rate": 3.9074221413541194e-05, "loss": 4.8455, "step": 1200 }, { "epoch": 0.31017561983471076, "grad_norm": 5.735114574432373, "learning_rate": 3.905745246487695e-05, "loss": 2.8083, "step": 1201 }, { "epoch": 0.31043388429752067, "grad_norm": 2.737816572189331, "learning_rate": 3.90406742620816e-05, "loss": 4.8118, "step": 1202 }, { "epoch": 0.3106921487603306, "grad_norm": 3.0264482498168945, "learning_rate": 3.9023886816200364e-05, "loss": 3.9949, "step": 1203 }, { "epoch": 0.3109504132231405, "grad_norm": 3.449899196624756, "learning_rate": 3.900709013828456e-05, "loss": 3.8994, "step": 1204 }, { "epoch": 0.3112086776859504, "grad_norm": 1.6357377767562866, "learning_rate": 3.899028423939156e-05, "loss": 3.747, "step": 1205 }, { "epoch": 0.3114669421487603, "grad_norm": 3.360771656036377, "learning_rate": 3.897346913058483e-05, "loss": 4.6533, "step": 1206 }, { "epoch": 0.31172520661157027, "grad_norm": 9.382525444030762, "learning_rate": 3.8956644822933883e-05, "loss": 3.9901, "step": 1207 }, { "epoch": 0.3119834710743802, "grad_norm": 4.022988796234131, "learning_rate": 3.893981132751429e-05, "loss": 4.7237, "step": 1208 }, { "epoch": 0.3122417355371901, "grad_norm": 2.415750741958618, "learning_rate": 3.892296865540767e-05, "loss": 4.3025, "step": 1209 }, { "epoch": 0.3125, "grad_norm": 2.0257508754730225, "learning_rate": 3.89061168177017e-05, "loss": 4.5949, "step": 1210 }, { "epoch": 0.3127582644628099, "grad_norm": 4.626664638519287, "learning_rate": 3.888925582549006e-05, "loss": 4.3733, "step": 1211 }, { "epoch": 0.3130165289256198, "grad_norm": 2.490605354309082, "learning_rate": 3.887238568987248e-05, "loss": 4.2241, "step": 1212 }, { "epoch": 0.31327479338842973, "grad_norm": 2.915437698364258, "learning_rate": 3.885550642195471e-05, "loss": 3.6835, "step": 1213 }, { "epoch": 0.3135330578512397, "grad_norm": 2.9427144527435303, "learning_rate": 3.8838618032848494e-05, "loss": 4.0665, "step": 1214 }, { "epoch": 0.3137913223140496, "grad_norm": 2.567456007003784, "learning_rate": 3.882172053367159e-05, "loss": 3.7869, "step": 1215 }, { "epoch": 0.3140495867768595, "grad_norm": 1.8157142400741577, "learning_rate": 3.880481393554777e-05, "loss": 4.5422, "step": 1216 }, { "epoch": 0.3143078512396694, "grad_norm": 2.7577998638153076, "learning_rate": 3.878789824960677e-05, "loss": 3.7513, "step": 1217 }, { "epoch": 0.31456611570247933, "grad_norm": 3.0959396362304688, "learning_rate": 3.8770973486984305e-05, "loss": 4.2232, "step": 1218 }, { "epoch": 0.31482438016528924, "grad_norm": 3.6297755241394043, "learning_rate": 3.8754039658822106e-05, "loss": 4.4643, "step": 1219 }, { "epoch": 0.31508264462809915, "grad_norm": 3.545289993286133, "learning_rate": 3.8737096776267825e-05, "loss": 4.4334, "step": 1220 }, { "epoch": 0.3153409090909091, "grad_norm": 5.181464672088623, "learning_rate": 3.8720144850475095e-05, "loss": 4.1211, "step": 1221 }, { "epoch": 0.315599173553719, "grad_norm": 2.324563503265381, "learning_rate": 3.87031838926035e-05, "loss": 4.0648, "step": 1222 }, { "epoch": 0.31585743801652894, "grad_norm": 2.338939905166626, "learning_rate": 3.868621391381859e-05, "loss": 3.8306, "step": 1223 }, { "epoch": 0.31611570247933884, "grad_norm": 2.005732297897339, "learning_rate": 3.866923492529182e-05, "loss": 4.0701, "step": 1224 }, { "epoch": 0.31637396694214875, "grad_norm": 3.353872299194336, "learning_rate": 3.865224693820059e-05, "loss": 4.5479, "step": 1225 }, { "epoch": 0.31663223140495866, "grad_norm": 3.2015693187713623, "learning_rate": 3.863524996372823e-05, "loss": 4.0772, "step": 1226 }, { "epoch": 0.3168904958677686, "grad_norm": 3.404008626937866, "learning_rate": 3.861824401306399e-05, "loss": 4.1809, "step": 1227 }, { "epoch": 0.31714876033057854, "grad_norm": 3.9996485710144043, "learning_rate": 3.8601229097402994e-05, "loss": 3.4584, "step": 1228 }, { "epoch": 0.31740702479338845, "grad_norm": 1.9859007596969604, "learning_rate": 3.858420522794631e-05, "loss": 4.4546, "step": 1229 }, { "epoch": 0.31766528925619836, "grad_norm": 1.8640894889831543, "learning_rate": 3.8567172415900904e-05, "loss": 4.1543, "step": 1230 }, { "epoch": 0.31792355371900827, "grad_norm": 2.939577102661133, "learning_rate": 3.855013067247958e-05, "loss": 4.1695, "step": 1231 }, { "epoch": 0.3181818181818182, "grad_norm": 3.0711116790771484, "learning_rate": 3.853308000890107e-05, "loss": 3.9842, "step": 1232 }, { "epoch": 0.3184400826446281, "grad_norm": 3.384723424911499, "learning_rate": 3.851602043638994e-05, "loss": 4.4163, "step": 1233 }, { "epoch": 0.318698347107438, "grad_norm": 2.5083999633789062, "learning_rate": 3.849895196617666e-05, "loss": 4.2224, "step": 1234 }, { "epoch": 0.31895661157024796, "grad_norm": 3.6702280044555664, "learning_rate": 3.848187460949753e-05, "loss": 4.3274, "step": 1235 }, { "epoch": 0.31921487603305787, "grad_norm": 3.946148157119751, "learning_rate": 3.84647883775947e-05, "loss": 4.1001, "step": 1236 }, { "epoch": 0.3194731404958678, "grad_norm": 3.1886885166168213, "learning_rate": 3.844769328171619e-05, "loss": 4.4177, "step": 1237 }, { "epoch": 0.3197314049586777, "grad_norm": 2.8486039638519287, "learning_rate": 3.843058933311582e-05, "loss": 3.6429, "step": 1238 }, { "epoch": 0.3199896694214876, "grad_norm": 3.6725971698760986, "learning_rate": 3.841347654305325e-05, "loss": 4.3184, "step": 1239 }, { "epoch": 0.3202479338842975, "grad_norm": 2.8490939140319824, "learning_rate": 3.8396354922793984e-05, "loss": 4.2971, "step": 1240 }, { "epoch": 0.3205061983471074, "grad_norm": 1.6440829038619995, "learning_rate": 3.8379224483609286e-05, "loss": 3.7402, "step": 1241 }, { "epoch": 0.32076446280991733, "grad_norm": 3.803656816482544, "learning_rate": 3.83620852367763e-05, "loss": 4.5061, "step": 1242 }, { "epoch": 0.3210227272727273, "grad_norm": 1.926193118095398, "learning_rate": 3.834493719357789e-05, "loss": 4.1155, "step": 1243 }, { "epoch": 0.3212809917355372, "grad_norm": 3.4450414180755615, "learning_rate": 3.832778036530276e-05, "loss": 4.0712, "step": 1244 }, { "epoch": 0.3215392561983471, "grad_norm": 2.802874803543091, "learning_rate": 3.8310614763245394e-05, "loss": 4.4553, "step": 1245 }, { "epoch": 0.321797520661157, "grad_norm": 3.4974217414855957, "learning_rate": 3.8293440398706036e-05, "loss": 4.0005, "step": 1246 }, { "epoch": 0.32205578512396693, "grad_norm": 2.7278506755828857, "learning_rate": 3.827625728299069e-05, "loss": 4.087, "step": 1247 }, { "epoch": 0.32231404958677684, "grad_norm": 3.4943089485168457, "learning_rate": 3.825906542741117e-05, "loss": 4.3279, "step": 1248 }, { "epoch": 0.32257231404958675, "grad_norm": 2.128812551498413, "learning_rate": 3.824186484328497e-05, "loss": 4.0822, "step": 1249 }, { "epoch": 0.3228305785123967, "grad_norm": 3.92044997215271, "learning_rate": 3.82246555419354e-05, "loss": 3.8756, "step": 1250 }, { "epoch": 0.3230888429752066, "grad_norm": 5.141569137573242, "learning_rate": 3.820743753469145e-05, "loss": 3.1012, "step": 1251 }, { "epoch": 0.32334710743801653, "grad_norm": 3.435981512069702, "learning_rate": 3.819021083288788e-05, "loss": 4.6115, "step": 1252 }, { "epoch": 0.32360537190082644, "grad_norm": 3.638812780380249, "learning_rate": 3.817297544786519e-05, "loss": 4.1825, "step": 1253 }, { "epoch": 0.32386363636363635, "grad_norm": 4.674869060516357, "learning_rate": 3.8155731390969534e-05, "loss": 4.1567, "step": 1254 }, { "epoch": 0.32412190082644626, "grad_norm": 3.4055280685424805, "learning_rate": 3.813847867355282e-05, "loss": 3.949, "step": 1255 }, { "epoch": 0.3243801652892562, "grad_norm": 2.9093103408813477, "learning_rate": 3.812121730697266e-05, "loss": 4.2693, "step": 1256 }, { "epoch": 0.32463842975206614, "grad_norm": 3.733391046524048, "learning_rate": 3.8103947302592324e-05, "loss": 4.3075, "step": 1257 }, { "epoch": 0.32489669421487605, "grad_norm": 3.6927363872528076, "learning_rate": 3.808666867178081e-05, "loss": 4.0627, "step": 1258 }, { "epoch": 0.32515495867768596, "grad_norm": 2.2583870887756348, "learning_rate": 3.8069381425912764e-05, "loss": 4.2458, "step": 1259 }, { "epoch": 0.32541322314049587, "grad_norm": 2.140232563018799, "learning_rate": 3.805208557636852e-05, "loss": 4.3802, "step": 1260 }, { "epoch": 0.3256714876033058, "grad_norm": 2.348125696182251, "learning_rate": 3.803478113453408e-05, "loss": 4.3885, "step": 1261 }, { "epoch": 0.3259297520661157, "grad_norm": 2.2502338886260986, "learning_rate": 3.801746811180108e-05, "loss": 4.1082, "step": 1262 }, { "epoch": 0.3261880165289256, "grad_norm": 3.0430049896240234, "learning_rate": 3.8000146519566826e-05, "loss": 4.2313, "step": 1263 }, { "epoch": 0.32644628099173556, "grad_norm": 3.549550771713257, "learning_rate": 3.7982816369234254e-05, "loss": 4.0416, "step": 1264 }, { "epoch": 0.32670454545454547, "grad_norm": 1.9970018863677979, "learning_rate": 3.796547767221194e-05, "loss": 4.0459, "step": 1265 }, { "epoch": 0.3269628099173554, "grad_norm": 4.200958728790283, "learning_rate": 3.794813043991408e-05, "loss": 4.0001, "step": 1266 }, { "epoch": 0.3272210743801653, "grad_norm": 3.642282247543335, "learning_rate": 3.7930774683760504e-05, "loss": 4.1472, "step": 1267 }, { "epoch": 0.3274793388429752, "grad_norm": 8.575886726379395, "learning_rate": 3.791341041517663e-05, "loss": 4.7137, "step": 1268 }, { "epoch": 0.3277376033057851, "grad_norm": 2.7681798934936523, "learning_rate": 3.7896037645593495e-05, "loss": 4.16, "step": 1269 }, { "epoch": 0.327995867768595, "grad_norm": 2.506439208984375, "learning_rate": 3.787865638644774e-05, "loss": 4.5732, "step": 1270 }, { "epoch": 0.328254132231405, "grad_norm": 2.6471879482269287, "learning_rate": 3.786126664918157e-05, "loss": 4.1958, "step": 1271 }, { "epoch": 0.3285123966942149, "grad_norm": 2.789843797683716, "learning_rate": 3.7843868445242785e-05, "loss": 4.0642, "step": 1272 }, { "epoch": 0.3287706611570248, "grad_norm": 3.710859537124634, "learning_rate": 3.782646178608477e-05, "loss": 4.1584, "step": 1273 }, { "epoch": 0.3290289256198347, "grad_norm": 2.601902484893799, "learning_rate": 3.780904668316646e-05, "loss": 3.5627, "step": 1274 }, { "epoch": 0.3292871900826446, "grad_norm": 3.2482893466949463, "learning_rate": 3.779162314795235e-05, "loss": 3.8015, "step": 1275 }, { "epoch": 0.32954545454545453, "grad_norm": 3.4456779956817627, "learning_rate": 3.77741911919125e-05, "loss": 4.4533, "step": 1276 }, { "epoch": 0.32980371900826444, "grad_norm": 2.940495491027832, "learning_rate": 3.77567508265225e-05, "loss": 3.7223, "step": 1277 }, { "epoch": 0.3300619834710744, "grad_norm": 3.973440170288086, "learning_rate": 3.773930206326346e-05, "loss": 3.9705, "step": 1278 }, { "epoch": 0.3303202479338843, "grad_norm": 4.7453083992004395, "learning_rate": 3.7721844913622065e-05, "loss": 4.0706, "step": 1279 }, { "epoch": 0.3305785123966942, "grad_norm": 2.7942557334899902, "learning_rate": 3.770437938909048e-05, "loss": 4.2806, "step": 1280 }, { "epoch": 0.33083677685950413, "grad_norm": 2.4743993282318115, "learning_rate": 3.768690550116639e-05, "loss": 4.1905, "step": 1281 }, { "epoch": 0.33109504132231404, "grad_norm": 4.031190872192383, "learning_rate": 3.766942326135301e-05, "loss": 4.7636, "step": 1282 }, { "epoch": 0.33135330578512395, "grad_norm": 3.275761842727661, "learning_rate": 3.7651932681159026e-05, "loss": 3.8918, "step": 1283 }, { "epoch": 0.33161157024793386, "grad_norm": 2.841240644454956, "learning_rate": 3.763443377209862e-05, "loss": 4.1559, "step": 1284 }, { "epoch": 0.3318698347107438, "grad_norm": 3.6043238639831543, "learning_rate": 3.761692654569147e-05, "loss": 4.3547, "step": 1285 }, { "epoch": 0.33212809917355374, "grad_norm": 3.4755682945251465, "learning_rate": 3.75994110134627e-05, "loss": 4.1628, "step": 1286 }, { "epoch": 0.33238636363636365, "grad_norm": 1.6819132566452026, "learning_rate": 3.758188718694296e-05, "loss": 3.739, "step": 1287 }, { "epoch": 0.33264462809917356, "grad_norm": 3.5312886238098145, "learning_rate": 3.756435507766829e-05, "loss": 4.2125, "step": 1288 }, { "epoch": 0.33290289256198347, "grad_norm": 3.525326728820801, "learning_rate": 3.7546814697180225e-05, "loss": 4.2066, "step": 1289 }, { "epoch": 0.3331611570247934, "grad_norm": 2.972268581390381, "learning_rate": 3.7529266057025735e-05, "loss": 4.8439, "step": 1290 }, { "epoch": 0.3334194214876033, "grad_norm": 2.7216956615448, "learning_rate": 3.751170916875723e-05, "loss": 3.9259, "step": 1291 }, { "epoch": 0.33367768595041325, "grad_norm": 1.6108466386795044, "learning_rate": 3.7494144043932534e-05, "loss": 3.7054, "step": 1292 }, { "epoch": 0.33393595041322316, "grad_norm": 2.77410888671875, "learning_rate": 3.747657069411492e-05, "loss": 4.4288, "step": 1293 }, { "epoch": 0.33419421487603307, "grad_norm": 2.2693235874176025, "learning_rate": 3.745898913087306e-05, "loss": 4.54, "step": 1294 }, { "epoch": 0.334452479338843, "grad_norm": 2.1651835441589355, "learning_rate": 3.7441399365781035e-05, "loss": 4.671, "step": 1295 }, { "epoch": 0.3347107438016529, "grad_norm": 2.233532190322876, "learning_rate": 3.742380141041832e-05, "loss": 4.1049, "step": 1296 }, { "epoch": 0.3349690082644628, "grad_norm": 4.697624683380127, "learning_rate": 3.7406195276369796e-05, "loss": 4.2544, "step": 1297 }, { "epoch": 0.3352272727272727, "grad_norm": 4.0248870849609375, "learning_rate": 3.738858097522571e-05, "loss": 3.6235, "step": 1298 }, { "epoch": 0.33548553719008267, "grad_norm": 2.68284273147583, "learning_rate": 3.7370958518581695e-05, "loss": 4.092, "step": 1299 }, { "epoch": 0.3357438016528926, "grad_norm": 3.3764069080352783, "learning_rate": 3.735332791803875e-05, "loss": 3.977, "step": 1300 }, { "epoch": 0.3360020661157025, "grad_norm": 2.790510654449463, "learning_rate": 3.733568918520325e-05, "loss": 4.2089, "step": 1301 }, { "epoch": 0.3362603305785124, "grad_norm": 2.5207536220550537, "learning_rate": 3.73180423316869e-05, "loss": 4.6166, "step": 1302 }, { "epoch": 0.3365185950413223, "grad_norm": 1.8799091577529907, "learning_rate": 3.730038736910677e-05, "loss": 4.1663, "step": 1303 }, { "epoch": 0.3367768595041322, "grad_norm": 3.705777883529663, "learning_rate": 3.728272430908526e-05, "loss": 3.6518, "step": 1304 }, { "epoch": 0.33703512396694213, "grad_norm": 3.0182251930236816, "learning_rate": 3.7265053163250084e-05, "loss": 4.4383, "step": 1305 }, { "epoch": 0.33729338842975204, "grad_norm": 2.845249891281128, "learning_rate": 3.724737394323431e-05, "loss": 4.5966, "step": 1306 }, { "epoch": 0.337551652892562, "grad_norm": 2.7136614322662354, "learning_rate": 3.722968666067631e-05, "loss": 4.4727, "step": 1307 }, { "epoch": 0.3378099173553719, "grad_norm": 3.2898108959198, "learning_rate": 3.7211991327219755e-05, "loss": 4.3365, "step": 1308 }, { "epoch": 0.3380681818181818, "grad_norm": 2.980485677719116, "learning_rate": 3.719428795451362e-05, "loss": 4.2372, "step": 1309 }, { "epoch": 0.33832644628099173, "grad_norm": 3.5819649696350098, "learning_rate": 3.717657655421218e-05, "loss": 4.1693, "step": 1310 }, { "epoch": 0.33858471074380164, "grad_norm": 2.2865288257598877, "learning_rate": 3.715885713797499e-05, "loss": 4.2491, "step": 1311 }, { "epoch": 0.33884297520661155, "grad_norm": 2.3451197147369385, "learning_rate": 3.714112971746686e-05, "loss": 4.3031, "step": 1312 }, { "epoch": 0.33910123966942146, "grad_norm": 2.283020257949829, "learning_rate": 3.712339430435792e-05, "loss": 4.2679, "step": 1313 }, { "epoch": 0.3393595041322314, "grad_norm": 7.1566033363342285, "learning_rate": 3.710565091032351e-05, "loss": 3.9113, "step": 1314 }, { "epoch": 0.33961776859504134, "grad_norm": 2.8174047470092773, "learning_rate": 3.7087899547044254e-05, "loss": 3.9677, "step": 1315 }, { "epoch": 0.33987603305785125, "grad_norm": 2.0859649181365967, "learning_rate": 3.7070140226206016e-05, "loss": 4.2202, "step": 1316 }, { "epoch": 0.34013429752066116, "grad_norm": 2.733835220336914, "learning_rate": 3.705237295949988e-05, "loss": 4.3221, "step": 1317 }, { "epoch": 0.34039256198347106, "grad_norm": 2.8703086376190186, "learning_rate": 3.7034597758622194e-05, "loss": 4.1443, "step": 1318 }, { "epoch": 0.340650826446281, "grad_norm": 3.860724925994873, "learning_rate": 3.701681463527451e-05, "loss": 4.3042, "step": 1319 }, { "epoch": 0.3409090909090909, "grad_norm": 1.9063694477081299, "learning_rate": 3.6999023601163595e-05, "loss": 3.9907, "step": 1320 }, { "epoch": 0.34116735537190085, "grad_norm": 2.4398624897003174, "learning_rate": 3.6981224668001424e-05, "loss": 4.3506, "step": 1321 }, { "epoch": 0.34142561983471076, "grad_norm": 2.723149299621582, "learning_rate": 3.696341784750517e-05, "loss": 3.9064, "step": 1322 }, { "epoch": 0.34168388429752067, "grad_norm": 2.287135124206543, "learning_rate": 3.6945603151397214e-05, "loss": 3.7054, "step": 1323 }, { "epoch": 0.3419421487603306, "grad_norm": 3.4263715744018555, "learning_rate": 3.6927780591405107e-05, "loss": 4.1689, "step": 1324 }, { "epoch": 0.3422004132231405, "grad_norm": 2.751624584197998, "learning_rate": 3.690995017926157e-05, "loss": 4.7041, "step": 1325 }, { "epoch": 0.3424586776859504, "grad_norm": 3.2719125747680664, "learning_rate": 3.6892111926704516e-05, "loss": 3.9756, "step": 1326 }, { "epoch": 0.3427169421487603, "grad_norm": 4.880646228790283, "learning_rate": 3.687426584547699e-05, "loss": 4.0854, "step": 1327 }, { "epoch": 0.34297520661157027, "grad_norm": 3.897780179977417, "learning_rate": 3.685641194732723e-05, "loss": 4.3274, "step": 1328 }, { "epoch": 0.3432334710743802, "grad_norm": 2.884481191635132, "learning_rate": 3.6838550244008576e-05, "loss": 4.3101, "step": 1329 }, { "epoch": 0.3434917355371901, "grad_norm": 4.61041784286499, "learning_rate": 3.6820680747279534e-05, "loss": 4.0892, "step": 1330 }, { "epoch": 0.34375, "grad_norm": 3.0480778217315674, "learning_rate": 3.680280346890374e-05, "loss": 3.4825, "step": 1331 }, { "epoch": 0.3440082644628099, "grad_norm": 2.595197916030884, "learning_rate": 3.678491842064995e-05, "loss": 4.1765, "step": 1332 }, { "epoch": 0.3442665289256198, "grad_norm": 2.5224053859710693, "learning_rate": 3.676702561429201e-05, "loss": 3.856, "step": 1333 }, { "epoch": 0.34452479338842973, "grad_norm": 3.020293951034546, "learning_rate": 3.674912506160891e-05, "loss": 4.7619, "step": 1334 }, { "epoch": 0.3447830578512397, "grad_norm": 3.640674591064453, "learning_rate": 3.673121677438472e-05, "loss": 3.9636, "step": 1335 }, { "epoch": 0.3450413223140496, "grad_norm": 2.9328644275665283, "learning_rate": 3.671330076440862e-05, "loss": 4.1783, "step": 1336 }, { "epoch": 0.3452995867768595, "grad_norm": 2.8988354206085205, "learning_rate": 3.669537704347484e-05, "loss": 3.8139, "step": 1337 }, { "epoch": 0.3455578512396694, "grad_norm": 3.7619879245758057, "learning_rate": 3.667744562338271e-05, "loss": 4.6013, "step": 1338 }, { "epoch": 0.34581611570247933, "grad_norm": 3.398977279663086, "learning_rate": 3.665950651593663e-05, "loss": 3.8189, "step": 1339 }, { "epoch": 0.34607438016528924, "grad_norm": 3.5327072143554688, "learning_rate": 3.664155973294605e-05, "loss": 4.2062, "step": 1340 }, { "epoch": 0.34633264462809915, "grad_norm": 4.977265357971191, "learning_rate": 3.662360528622549e-05, "loss": 3.8977, "step": 1341 }, { "epoch": 0.3465909090909091, "grad_norm": 3.6207988262176514, "learning_rate": 3.660564318759449e-05, "loss": 4.0005, "step": 1342 }, { "epoch": 0.346849173553719, "grad_norm": 2.64888334274292, "learning_rate": 3.658767344887764e-05, "loss": 4.6733, "step": 1343 }, { "epoch": 0.34710743801652894, "grad_norm": 3.975313425064087, "learning_rate": 3.656969608190456e-05, "loss": 3.5567, "step": 1344 }, { "epoch": 0.34736570247933884, "grad_norm": 3.683774471282959, "learning_rate": 3.6551711098509906e-05, "loss": 3.9755, "step": 1345 }, { "epoch": 0.34762396694214875, "grad_norm": 3.125692129135132, "learning_rate": 3.6533718510533315e-05, "loss": 4.6234, "step": 1346 }, { "epoch": 0.34788223140495866, "grad_norm": 2.2079780101776123, "learning_rate": 3.651571832981946e-05, "loss": 3.9825, "step": 1347 }, { "epoch": 0.3481404958677686, "grad_norm": 2.521200180053711, "learning_rate": 3.6497710568217994e-05, "loss": 4.6691, "step": 1348 }, { "epoch": 0.34839876033057854, "grad_norm": 3.0615806579589844, "learning_rate": 3.6479695237583576e-05, "loss": 4.6379, "step": 1349 }, { "epoch": 0.34865702479338845, "grad_norm": 2.181386709213257, "learning_rate": 3.646167234977583e-05, "loss": 4.1101, "step": 1350 }, { "epoch": 0.34891528925619836, "grad_norm": 2.214554786682129, "learning_rate": 3.6443641916659364e-05, "loss": 4.1375, "step": 1351 }, { "epoch": 0.34917355371900827, "grad_norm": 3.088381290435791, "learning_rate": 3.642560395010377e-05, "loss": 4.2235, "step": 1352 }, { "epoch": 0.3494318181818182, "grad_norm": 2.4733569622039795, "learning_rate": 3.640755846198356e-05, "loss": 4.3287, "step": 1353 }, { "epoch": 0.3496900826446281, "grad_norm": 2.846127510070801, "learning_rate": 3.6389505464178244e-05, "loss": 4.059, "step": 1354 }, { "epoch": 0.349948347107438, "grad_norm": 4.557474136352539, "learning_rate": 3.637144496857223e-05, "loss": 4.2449, "step": 1355 }, { "epoch": 0.35020661157024796, "grad_norm": 2.0103397369384766, "learning_rate": 3.635337698705489e-05, "loss": 3.9009, "step": 1356 }, { "epoch": 0.35046487603305787, "grad_norm": 3.2153234481811523, "learning_rate": 3.6335301531520513e-05, "loss": 4.7778, "step": 1357 }, { "epoch": 0.3507231404958678, "grad_norm": 2.576779365539551, "learning_rate": 3.631721861386834e-05, "loss": 4.4234, "step": 1358 }, { "epoch": 0.3509814049586777, "grad_norm": 6.077549457550049, "learning_rate": 3.629912824600246e-05, "loss": 4.3894, "step": 1359 }, { "epoch": 0.3512396694214876, "grad_norm": 3.8731882572174072, "learning_rate": 3.628103043983194e-05, "loss": 4.0619, "step": 1360 }, { "epoch": 0.3514979338842975, "grad_norm": 5.183117866516113, "learning_rate": 3.626292520727067e-05, "loss": 3.3408, "step": 1361 }, { "epoch": 0.3517561983471074, "grad_norm": 3.49296236038208, "learning_rate": 3.624481256023749e-05, "loss": 4.0046, "step": 1362 }, { "epoch": 0.35201446280991733, "grad_norm": 3.7836225032806396, "learning_rate": 3.622669251065611e-05, "loss": 4.0472, "step": 1363 }, { "epoch": 0.3522727272727273, "grad_norm": 3.4153881072998047, "learning_rate": 3.620856507045507e-05, "loss": 4.3435, "step": 1364 }, { "epoch": 0.3525309917355372, "grad_norm": 11.205126762390137, "learning_rate": 3.619043025156782e-05, "loss": 4.1018, "step": 1365 }, { "epoch": 0.3527892561983471, "grad_norm": 4.704075336456299, "learning_rate": 3.6172288065932645e-05, "loss": 4.2042, "step": 1366 }, { "epoch": 0.353047520661157, "grad_norm": 3.7386176586151123, "learning_rate": 3.615413852549271e-05, "loss": 3.9709, "step": 1367 }, { "epoch": 0.35330578512396693, "grad_norm": 5.252556324005127, "learning_rate": 3.613598164219598e-05, "loss": 4.4396, "step": 1368 }, { "epoch": 0.35356404958677684, "grad_norm": 2.8290553092956543, "learning_rate": 3.611781742799528e-05, "loss": 4.1124, "step": 1369 }, { "epoch": 0.35382231404958675, "grad_norm": 1.9053707122802734, "learning_rate": 3.6099645894848254e-05, "loss": 4.2032, "step": 1370 }, { "epoch": 0.3540805785123967, "grad_norm": 3.009881019592285, "learning_rate": 3.6081467054717366e-05, "loss": 4.096, "step": 1371 }, { "epoch": 0.3543388429752066, "grad_norm": 5.54469633102417, "learning_rate": 3.6063280919569884e-05, "loss": 3.0835, "step": 1372 }, { "epoch": 0.35459710743801653, "grad_norm": 3.951551675796509, "learning_rate": 3.6045087501377886e-05, "loss": 4.1153, "step": 1373 }, { "epoch": 0.35485537190082644, "grad_norm": 3.3583273887634277, "learning_rate": 3.602688681211824e-05, "loss": 3.8713, "step": 1374 }, { "epoch": 0.35511363636363635, "grad_norm": 2.4913361072540283, "learning_rate": 3.6008678863772615e-05, "loss": 4.0039, "step": 1375 }, { "epoch": 0.35537190082644626, "grad_norm": 2.808061361312866, "learning_rate": 3.599046366832743e-05, "loss": 3.7164, "step": 1376 }, { "epoch": 0.3556301652892562, "grad_norm": 2.7915732860565186, "learning_rate": 3.59722412377739e-05, "loss": 4.2409, "step": 1377 }, { "epoch": 0.35588842975206614, "grad_norm": 2.5032448768615723, "learning_rate": 3.595401158410798e-05, "loss": 3.983, "step": 1378 }, { "epoch": 0.35614669421487605, "grad_norm": 5.303885459899902, "learning_rate": 3.5935774719330416e-05, "loss": 3.3486, "step": 1379 }, { "epoch": 0.35640495867768596, "grad_norm": 2.798370838165283, "learning_rate": 3.5917530655446656e-05, "loss": 4.0621, "step": 1380 }, { "epoch": 0.35666322314049587, "grad_norm": 2.211796283721924, "learning_rate": 3.589927940446693e-05, "loss": 4.2275, "step": 1381 }, { "epoch": 0.3569214876033058, "grad_norm": 2.6099021434783936, "learning_rate": 3.5881020978406164e-05, "loss": 3.8815, "step": 1382 }, { "epoch": 0.3571797520661157, "grad_norm": 1.886354684829712, "learning_rate": 3.586275538928404e-05, "loss": 4.5862, "step": 1383 }, { "epoch": 0.3574380165289256, "grad_norm": 6.305332660675049, "learning_rate": 3.584448264912492e-05, "loss": 4.4543, "step": 1384 }, { "epoch": 0.35769628099173556, "grad_norm": 4.737603187561035, "learning_rate": 3.582620276995791e-05, "loss": 4.0779, "step": 1385 }, { "epoch": 0.35795454545454547, "grad_norm": 4.231253623962402, "learning_rate": 3.580791576381678e-05, "loss": 4.24, "step": 1386 }, { "epoch": 0.3582128099173554, "grad_norm": 1.9510223865509033, "learning_rate": 3.578962164274001e-05, "loss": 3.9759, "step": 1387 }, { "epoch": 0.3584710743801653, "grad_norm": 3.630009174346924, "learning_rate": 3.577132041877079e-05, "loss": 4.0913, "step": 1388 }, { "epoch": 0.3587293388429752, "grad_norm": 3.909250497817993, "learning_rate": 3.575301210395693e-05, "loss": 4.346, "step": 1389 }, { "epoch": 0.3589876033057851, "grad_norm": 3.379882335662842, "learning_rate": 3.5734696710350945e-05, "loss": 4.2267, "step": 1390 }, { "epoch": 0.359245867768595, "grad_norm": 3.275658369064331, "learning_rate": 3.571637425001001e-05, "loss": 4.158, "step": 1391 }, { "epoch": 0.359504132231405, "grad_norm": 2.914551019668579, "learning_rate": 3.569804473499593e-05, "loss": 4.5417, "step": 1392 }, { "epoch": 0.3597623966942149, "grad_norm": 3.8111627101898193, "learning_rate": 3.567970817737518e-05, "loss": 4.0272, "step": 1393 }, { "epoch": 0.3600206611570248, "grad_norm": 2.8224968910217285, "learning_rate": 3.566136458921886e-05, "loss": 4.5994, "step": 1394 }, { "epoch": 0.3602789256198347, "grad_norm": 2.5129661560058594, "learning_rate": 3.564301398260269e-05, "loss": 4.1385, "step": 1395 }, { "epoch": 0.3605371900826446, "grad_norm": 3.0676372051239014, "learning_rate": 3.5624656369607025e-05, "loss": 4.6485, "step": 1396 }, { "epoch": 0.36079545454545453, "grad_norm": 2.403143882751465, "learning_rate": 3.560629176231682e-05, "loss": 4.1501, "step": 1397 }, { "epoch": 0.36105371900826444, "grad_norm": 3.023664951324463, "learning_rate": 3.558792017282164e-05, "loss": 4.558, "step": 1398 }, { "epoch": 0.3613119834710744, "grad_norm": 3.662849187850952, "learning_rate": 3.556954161321566e-05, "loss": 3.9671, "step": 1399 }, { "epoch": 0.3615702479338843, "grad_norm": 2.708634614944458, "learning_rate": 3.55511560955976e-05, "loss": 3.8958, "step": 1400 }, { "epoch": 0.3618285123966942, "grad_norm": 2.328787326812744, "learning_rate": 3.5532763632070824e-05, "loss": 4.1826, "step": 1401 }, { "epoch": 0.36208677685950413, "grad_norm": 2.826362133026123, "learning_rate": 3.5514364234743216e-05, "loss": 3.7962, "step": 1402 }, { "epoch": 0.36234504132231404, "grad_norm": 3.263254165649414, "learning_rate": 3.5495957915727254e-05, "loss": 4.1873, "step": 1403 }, { "epoch": 0.36260330578512395, "grad_norm": 5.072592735290527, "learning_rate": 3.547754468713995e-05, "loss": 3.9201, "step": 1404 }, { "epoch": 0.36286157024793386, "grad_norm": 3.5237441062927246, "learning_rate": 3.5459124561102874e-05, "loss": 4.5983, "step": 1405 }, { "epoch": 0.3631198347107438, "grad_norm": 2.385533332824707, "learning_rate": 3.5440697549742156e-05, "loss": 4.5157, "step": 1406 }, { "epoch": 0.36337809917355374, "grad_norm": 3.6961398124694824, "learning_rate": 3.5422263665188424e-05, "loss": 3.8282, "step": 1407 }, { "epoch": 0.36363636363636365, "grad_norm": 3.094862222671509, "learning_rate": 3.540382291957687e-05, "loss": 4.3174, "step": 1408 }, { "epoch": 0.36389462809917356, "grad_norm": 3.933861255645752, "learning_rate": 3.5385375325047166e-05, "loss": 4.4503, "step": 1409 }, { "epoch": 0.36415289256198347, "grad_norm": 3.001427173614502, "learning_rate": 3.536692089374351e-05, "loss": 4.4132, "step": 1410 }, { "epoch": 0.3644111570247934, "grad_norm": 2.7955636978149414, "learning_rate": 3.534845963781459e-05, "loss": 5.0064, "step": 1411 }, { "epoch": 0.3646694214876033, "grad_norm": 2.4714677333831787, "learning_rate": 3.532999156941362e-05, "loss": 4.1953, "step": 1412 }, { "epoch": 0.36492768595041325, "grad_norm": 4.480599403381348, "learning_rate": 3.5311516700698254e-05, "loss": 4.063, "step": 1413 }, { "epoch": 0.36518595041322316, "grad_norm": 3.118659019470215, "learning_rate": 3.5293035043830646e-05, "loss": 4.3021, "step": 1414 }, { "epoch": 0.36544421487603307, "grad_norm": 5.395598411560059, "learning_rate": 3.527454661097742e-05, "loss": 3.703, "step": 1415 }, { "epoch": 0.365702479338843, "grad_norm": 4.28107213973999, "learning_rate": 3.5256051414309655e-05, "loss": 4.222, "step": 1416 }, { "epoch": 0.3659607438016529, "grad_norm": 3.465691328048706, "learning_rate": 3.5237549466002884e-05, "loss": 4.1862, "step": 1417 }, { "epoch": 0.3662190082644628, "grad_norm": 3.2960867881774902, "learning_rate": 3.521904077823708e-05, "loss": 4.1469, "step": 1418 }, { "epoch": 0.3664772727272727, "grad_norm": 3.483687400817871, "learning_rate": 3.520052536319666e-05, "loss": 4.2767, "step": 1419 }, { "epoch": 0.36673553719008267, "grad_norm": 2.9861674308776855, "learning_rate": 3.518200323307047e-05, "loss": 3.7027, "step": 1420 }, { "epoch": 0.3669938016528926, "grad_norm": 2.019256830215454, "learning_rate": 3.516347440005177e-05, "loss": 4.5465, "step": 1421 }, { "epoch": 0.3672520661157025, "grad_norm": 5.170139789581299, "learning_rate": 3.5144938876338234e-05, "loss": 4.6232, "step": 1422 }, { "epoch": 0.3675103305785124, "grad_norm": 5.342817306518555, "learning_rate": 3.512639667413195e-05, "loss": 3.655, "step": 1423 }, { "epoch": 0.3677685950413223, "grad_norm": 3.1239655017852783, "learning_rate": 3.510784780563939e-05, "loss": 3.8292, "step": 1424 }, { "epoch": 0.3680268595041322, "grad_norm": 2.311816453933716, "learning_rate": 3.508929228307142e-05, "loss": 3.5382, "step": 1425 }, { "epoch": 0.36828512396694213, "grad_norm": 3.5545096397399902, "learning_rate": 3.507073011864328e-05, "loss": 3.5056, "step": 1426 }, { "epoch": 0.36854338842975204, "grad_norm": 3.469788074493408, "learning_rate": 3.50521613245746e-05, "loss": 3.8293, "step": 1427 }, { "epoch": 0.368801652892562, "grad_norm": 2.0624890327453613, "learning_rate": 3.5033585913089356e-05, "loss": 3.9983, "step": 1428 }, { "epoch": 0.3690599173553719, "grad_norm": 3.037075996398926, "learning_rate": 3.5015003896415874e-05, "loss": 3.9, "step": 1429 }, { "epoch": 0.3693181818181818, "grad_norm": 2.7492637634277344, "learning_rate": 3.4996415286786874e-05, "loss": 4.4391, "step": 1430 }, { "epoch": 0.36957644628099173, "grad_norm": 6.6199517250061035, "learning_rate": 3.4977820096439353e-05, "loss": 3.4672, "step": 1431 }, { "epoch": 0.36983471074380164, "grad_norm": 2.647416591644287, "learning_rate": 3.4959218337614686e-05, "loss": 4.2858, "step": 1432 }, { "epoch": 0.37009297520661155, "grad_norm": 2.2044785022735596, "learning_rate": 3.494061002255856e-05, "loss": 4.1933, "step": 1433 }, { "epoch": 0.37035123966942146, "grad_norm": 3.2155566215515137, "learning_rate": 3.4921995163520945e-05, "loss": 4.3726, "step": 1434 }, { "epoch": 0.3706095041322314, "grad_norm": 2.4257612228393555, "learning_rate": 3.4903373772756196e-05, "loss": 4.086, "step": 1435 }, { "epoch": 0.37086776859504134, "grad_norm": 2.8614070415496826, "learning_rate": 3.488474586252288e-05, "loss": 4.6368, "step": 1436 }, { "epoch": 0.37112603305785125, "grad_norm": 2.8015503883361816, "learning_rate": 3.48661114450839e-05, "loss": 3.6835, "step": 1437 }, { "epoch": 0.37138429752066116, "grad_norm": 3.8880372047424316, "learning_rate": 3.4847470532706474e-05, "loss": 4.5208, "step": 1438 }, { "epoch": 0.37164256198347106, "grad_norm": 2.139502763748169, "learning_rate": 3.482882313766202e-05, "loss": 4.0528, "step": 1439 }, { "epoch": 0.371900826446281, "grad_norm": 4.939480781555176, "learning_rate": 3.481016927222629e-05, "loss": 3.9798, "step": 1440 }, { "epoch": 0.3721590909090909, "grad_norm": 3.1824746131896973, "learning_rate": 3.479150894867926e-05, "loss": 3.3598, "step": 1441 }, { "epoch": 0.37241735537190085, "grad_norm": 2.0180723667144775, "learning_rate": 3.477284217930517e-05, "loss": 3.7374, "step": 1442 }, { "epoch": 0.37267561983471076, "grad_norm": 2.0631678104400635, "learning_rate": 3.475416897639249e-05, "loss": 3.6638, "step": 1443 }, { "epoch": 0.37293388429752067, "grad_norm": 2.8717048168182373, "learning_rate": 3.4735489352233945e-05, "loss": 4.0426, "step": 1444 }, { "epoch": 0.3731921487603306, "grad_norm": 3.033634662628174, "learning_rate": 3.471680331912648e-05, "loss": 3.8522, "step": 1445 }, { "epoch": 0.3734504132231405, "grad_norm": 3.985208034515381, "learning_rate": 3.469811088937126e-05, "loss": 3.7654, "step": 1446 }, { "epoch": 0.3737086776859504, "grad_norm": 3.7770657539367676, "learning_rate": 3.467941207527364e-05, "loss": 3.8891, "step": 1447 }, { "epoch": 0.3739669421487603, "grad_norm": 2.1837124824523926, "learning_rate": 3.466070688914322e-05, "loss": 4.0673, "step": 1448 }, { "epoch": 0.37422520661157027, "grad_norm": 3.171627998352051, "learning_rate": 3.464199534329376e-05, "loss": 4.1747, "step": 1449 }, { "epoch": 0.3744834710743802, "grad_norm": 2.005190372467041, "learning_rate": 3.46232774500432e-05, "loss": 4.6512, "step": 1450 }, { "epoch": 0.3747417355371901, "grad_norm": 2.9170804023742676, "learning_rate": 3.46045532217137e-05, "loss": 4.37, "step": 1451 }, { "epoch": 0.375, "grad_norm": 4.236720561981201, "learning_rate": 3.458582267063156e-05, "loss": 3.8422, "step": 1452 }, { "epoch": 0.3752582644628099, "grad_norm": 3.5228922367095947, "learning_rate": 3.456708580912725e-05, "loss": 4.1266, "step": 1453 }, { "epoch": 0.3755165289256198, "grad_norm": 2.7660067081451416, "learning_rate": 3.454834264953538e-05, "loss": 3.6681, "step": 1454 }, { "epoch": 0.37577479338842973, "grad_norm": 3.2324509620666504, "learning_rate": 3.452959320419473e-05, "loss": 4.2986, "step": 1455 }, { "epoch": 0.3760330578512397, "grad_norm": 1.9157782793045044, "learning_rate": 3.451083748544822e-05, "loss": 4.1721, "step": 1456 }, { "epoch": 0.3762913223140496, "grad_norm": 2.009516954421997, "learning_rate": 3.449207550564285e-05, "loss": 4.0905, "step": 1457 }, { "epoch": 0.3765495867768595, "grad_norm": 4.563449382781982, "learning_rate": 3.447330727712981e-05, "loss": 4.6263, "step": 1458 }, { "epoch": 0.3768078512396694, "grad_norm": 1.9512206315994263, "learning_rate": 3.445453281226436e-05, "loss": 4.1646, "step": 1459 }, { "epoch": 0.37706611570247933, "grad_norm": 2.7737932205200195, "learning_rate": 3.443575212340588e-05, "loss": 4.3413, "step": 1460 }, { "epoch": 0.37732438016528924, "grad_norm": 2.9051618576049805, "learning_rate": 3.441696522291784e-05, "loss": 4.226, "step": 1461 }, { "epoch": 0.37758264462809915, "grad_norm": 3.894015073776245, "learning_rate": 3.439817212316781e-05, "loss": 3.9534, "step": 1462 }, { "epoch": 0.3778409090909091, "grad_norm": 3.3437156677246094, "learning_rate": 3.4379372836527436e-05, "loss": 4.334, "step": 1463 }, { "epoch": 0.378099173553719, "grad_norm": 3.1641151905059814, "learning_rate": 3.436056737537243e-05, "loss": 3.8497, "step": 1464 }, { "epoch": 0.37835743801652894, "grad_norm": 2.312209129333496, "learning_rate": 3.434175575208257e-05, "loss": 4.1501, "step": 1465 }, { "epoch": 0.37861570247933884, "grad_norm": 3.0568151473999023, "learning_rate": 3.432293797904171e-05, "loss": 4.2184, "step": 1466 }, { "epoch": 0.37887396694214875, "grad_norm": 2.4925975799560547, "learning_rate": 3.430411406863772e-05, "loss": 4.076, "step": 1467 }, { "epoch": 0.37913223140495866, "grad_norm": 3.252171039581299, "learning_rate": 3.428528403326254e-05, "loss": 4.1462, "step": 1468 }, { "epoch": 0.3793904958677686, "grad_norm": 3.355541706085205, "learning_rate": 3.426644788531213e-05, "loss": 3.6127, "step": 1469 }, { "epoch": 0.37964876033057854, "grad_norm": 3.437406063079834, "learning_rate": 3.4247605637186465e-05, "loss": 4.0366, "step": 1470 }, { "epoch": 0.37990702479338845, "grad_norm": 3.4507312774658203, "learning_rate": 3.422875730128955e-05, "loss": 3.9744, "step": 1471 }, { "epoch": 0.38016528925619836, "grad_norm": 3.832880735397339, "learning_rate": 3.42099028900294e-05, "loss": 4.1651, "step": 1472 }, { "epoch": 0.38042355371900827, "grad_norm": 3.6166322231292725, "learning_rate": 3.4191042415818e-05, "loss": 3.4032, "step": 1473 }, { "epoch": 0.3806818181818182, "grad_norm": 3.1012327671051025, "learning_rate": 3.417217589107137e-05, "loss": 3.9298, "step": 1474 }, { "epoch": 0.3809400826446281, "grad_norm": 2.652313709259033, "learning_rate": 3.415330332820949e-05, "loss": 4.1763, "step": 1475 }, { "epoch": 0.381198347107438, "grad_norm": 2.7399027347564697, "learning_rate": 3.41344247396563e-05, "loss": 4.0756, "step": 1476 }, { "epoch": 0.38145661157024796, "grad_norm": 4.953134536743164, "learning_rate": 3.411554013783973e-05, "loss": 3.5954, "step": 1477 }, { "epoch": 0.38171487603305787, "grad_norm": 2.9637362957000732, "learning_rate": 3.409664953519166e-05, "loss": 4.483, "step": 1478 }, { "epoch": 0.3819731404958678, "grad_norm": 2.7354795932769775, "learning_rate": 3.407775294414793e-05, "loss": 4.1035, "step": 1479 }, { "epoch": 0.3822314049586777, "grad_norm": 2.8257837295532227, "learning_rate": 3.405885037714831e-05, "loss": 4.2495, "step": 1480 }, { "epoch": 0.3824896694214876, "grad_norm": 2.9075214862823486, "learning_rate": 3.40399418466365e-05, "loss": 4.2507, "step": 1481 }, { "epoch": 0.3827479338842975, "grad_norm": 2.250159740447998, "learning_rate": 3.402102736506014e-05, "loss": 4.2412, "step": 1482 }, { "epoch": 0.3830061983471074, "grad_norm": 2.8215858936309814, "learning_rate": 3.400210694487079e-05, "loss": 3.6577, "step": 1483 }, { "epoch": 0.38326446280991733, "grad_norm": 1.7298365831375122, "learning_rate": 3.39831805985239e-05, "loss": 4.0648, "step": 1484 }, { "epoch": 0.3835227272727273, "grad_norm": 3.279632806777954, "learning_rate": 3.396424833847883e-05, "loss": 4.7183, "step": 1485 }, { "epoch": 0.3837809917355372, "grad_norm": 1.9400554895401, "learning_rate": 3.3945310177198834e-05, "loss": 3.9732, "step": 1486 }, { "epoch": 0.3840392561983471, "grad_norm": 2.6699090003967285, "learning_rate": 3.3926366127151065e-05, "loss": 4.2994, "step": 1487 }, { "epoch": 0.384297520661157, "grad_norm": 2.995440721511841, "learning_rate": 3.390741620080654e-05, "loss": 4.3979, "step": 1488 }, { "epoch": 0.38455578512396693, "grad_norm": 3.212944746017456, "learning_rate": 3.388846041064012e-05, "loss": 3.3287, "step": 1489 }, { "epoch": 0.38481404958677684, "grad_norm": 1.6843606233596802, "learning_rate": 3.386949876913058e-05, "loss": 4.2866, "step": 1490 }, { "epoch": 0.38507231404958675, "grad_norm": 2.854306936264038, "learning_rate": 3.385053128876049e-05, "loss": 3.4898, "step": 1491 }, { "epoch": 0.3853305785123967, "grad_norm": 6.066675662994385, "learning_rate": 3.383155798201632e-05, "loss": 4.2574, "step": 1492 }, { "epoch": 0.3855888429752066, "grad_norm": 3.254911422729492, "learning_rate": 3.381257886138832e-05, "loss": 4.7724, "step": 1493 }, { "epoch": 0.38584710743801653, "grad_norm": 4.7180585861206055, "learning_rate": 3.379359393937061e-05, "loss": 3.7569, "step": 1494 }, { "epoch": 0.38610537190082644, "grad_norm": 1.930472493171692, "learning_rate": 3.377460322846111e-05, "loss": 4.5953, "step": 1495 }, { "epoch": 0.38636363636363635, "grad_norm": 6.549254417419434, "learning_rate": 3.375560674116154e-05, "loss": 3.1763, "step": 1496 }, { "epoch": 0.38662190082644626, "grad_norm": 3.0583009719848633, "learning_rate": 3.3736604489977466e-05, "loss": 4.4434, "step": 1497 }, { "epoch": 0.3868801652892562, "grad_norm": 2.289255142211914, "learning_rate": 3.371759648741819e-05, "loss": 3.8665, "step": 1498 }, { "epoch": 0.38713842975206614, "grad_norm": 2.6989338397979736, "learning_rate": 3.369858274599684e-05, "loss": 3.9385, "step": 1499 }, { "epoch": 0.38739669421487605, "grad_norm": 3.563199281692505, "learning_rate": 3.367956327823031e-05, "loss": 3.961, "step": 1500 }, { "epoch": 0.38765495867768596, "grad_norm": 3.1557154655456543, "learning_rate": 3.366053809663927e-05, "loss": 3.5685, "step": 1501 }, { "epoch": 0.38791322314049587, "grad_norm": 2.779142141342163, "learning_rate": 3.364150721374813e-05, "loss": 4.2036, "step": 1502 }, { "epoch": 0.3881714876033058, "grad_norm": 3.1040751934051514, "learning_rate": 3.362247064208509e-05, "loss": 4.2946, "step": 1503 }, { "epoch": 0.3884297520661157, "grad_norm": 3.987267017364502, "learning_rate": 3.360342839418206e-05, "loss": 3.7069, "step": 1504 }, { "epoch": 0.3886880165289256, "grad_norm": 3.1913902759552, "learning_rate": 3.3584380482574716e-05, "loss": 4.5353, "step": 1505 }, { "epoch": 0.38894628099173556, "grad_norm": 2.911602020263672, "learning_rate": 3.3565326919802435e-05, "loss": 3.931, "step": 1506 }, { "epoch": 0.38920454545454547, "grad_norm": 3.9681243896484375, "learning_rate": 3.354626771840832e-05, "loss": 4.659, "step": 1507 }, { "epoch": 0.3894628099173554, "grad_norm": 3.0187318325042725, "learning_rate": 3.352720289093921e-05, "loss": 4.0983, "step": 1508 }, { "epoch": 0.3897210743801653, "grad_norm": 3.711454391479492, "learning_rate": 3.350813244994562e-05, "loss": 4.2161, "step": 1509 }, { "epoch": 0.3899793388429752, "grad_norm": 5.474893093109131, "learning_rate": 3.348905640798177e-05, "loss": 4.2154, "step": 1510 }, { "epoch": 0.3902376033057851, "grad_norm": 3.9692161083221436, "learning_rate": 3.346997477760558e-05, "loss": 3.9732, "step": 1511 }, { "epoch": 0.390495867768595, "grad_norm": 2.5617494583129883, "learning_rate": 3.345088757137862e-05, "loss": 4.5613, "step": 1512 }, { "epoch": 0.390754132231405, "grad_norm": 2.6944758892059326, "learning_rate": 3.343179480186616e-05, "loss": 3.9683, "step": 1513 }, { "epoch": 0.3910123966942149, "grad_norm": 2.59171199798584, "learning_rate": 3.3412696481637105e-05, "loss": 4.2364, "step": 1514 }, { "epoch": 0.3912706611570248, "grad_norm": 3.0318658351898193, "learning_rate": 3.339359262326405e-05, "loss": 4.2062, "step": 1515 }, { "epoch": 0.3915289256198347, "grad_norm": 1.884788990020752, "learning_rate": 3.337448323932319e-05, "loss": 3.7921, "step": 1516 }, { "epoch": 0.3917871900826446, "grad_norm": 3.1412365436553955, "learning_rate": 3.335536834239441e-05, "loss": 4.5064, "step": 1517 }, { "epoch": 0.39204545454545453, "grad_norm": 4.082050800323486, "learning_rate": 3.333624794506118e-05, "loss": 4.206, "step": 1518 }, { "epoch": 0.39230371900826444, "grad_norm": 2.4154744148254395, "learning_rate": 3.3317122059910607e-05, "loss": 4.1587, "step": 1519 }, { "epoch": 0.3925619834710744, "grad_norm": 2.8097083568573, "learning_rate": 3.3297990699533406e-05, "loss": 3.9159, "step": 1520 }, { "epoch": 0.3928202479338843, "grad_norm": 2.838141679763794, "learning_rate": 3.327885387652391e-05, "loss": 4.0303, "step": 1521 }, { "epoch": 0.3930785123966942, "grad_norm": 2.98055362701416, "learning_rate": 3.325971160348005e-05, "loss": 4.7969, "step": 1522 }, { "epoch": 0.39333677685950413, "grad_norm": 2.7643237113952637, "learning_rate": 3.324056389300331e-05, "loss": 4.0498, "step": 1523 }, { "epoch": 0.39359504132231404, "grad_norm": 2.129709482192993, "learning_rate": 3.32214107576988e-05, "loss": 4.4148, "step": 1524 }, { "epoch": 0.39385330578512395, "grad_norm": 3.0507147312164307, "learning_rate": 3.320225221017516e-05, "loss": 4.3885, "step": 1525 }, { "epoch": 0.39411157024793386, "grad_norm": 2.7983949184417725, "learning_rate": 3.318308826304463e-05, "loss": 4.2968, "step": 1526 }, { "epoch": 0.3943698347107438, "grad_norm": 1.6506847143173218, "learning_rate": 3.316391892892298e-05, "loss": 4.4803, "step": 1527 }, { "epoch": 0.39462809917355374, "grad_norm": 2.734753131866455, "learning_rate": 3.314474422042952e-05, "loss": 4.477, "step": 1528 }, { "epoch": 0.39488636363636365, "grad_norm": 1.9003530740737915, "learning_rate": 3.312556415018713e-05, "loss": 4.1404, "step": 1529 }, { "epoch": 0.39514462809917356, "grad_norm": 2.4210503101348877, "learning_rate": 3.310637873082219e-05, "loss": 4.7817, "step": 1530 }, { "epoch": 0.39540289256198347, "grad_norm": 2.492231607437134, "learning_rate": 3.308718797496461e-05, "loss": 4.1216, "step": 1531 }, { "epoch": 0.3956611570247934, "grad_norm": 2.1365137100219727, "learning_rate": 3.306799189524783e-05, "loss": 3.7124, "step": 1532 }, { "epoch": 0.3959194214876033, "grad_norm": 2.7555830478668213, "learning_rate": 3.304879050430876e-05, "loss": 4.4604, "step": 1533 }, { "epoch": 0.39617768595041325, "grad_norm": 2.948671340942383, "learning_rate": 3.302958381478783e-05, "loss": 4.0001, "step": 1534 }, { "epoch": 0.39643595041322316, "grad_norm": 2.8566126823425293, "learning_rate": 3.301037183932897e-05, "loss": 4.6552, "step": 1535 }, { "epoch": 0.39669421487603307, "grad_norm": 1.6037946939468384, "learning_rate": 3.299115459057955e-05, "loss": 3.742, "step": 1536 }, { "epoch": 0.396952479338843, "grad_norm": 2.7295069694519043, "learning_rate": 3.297193208119047e-05, "loss": 3.9938, "step": 1537 }, { "epoch": 0.3972107438016529, "grad_norm": 3.900174140930176, "learning_rate": 3.295270432381602e-05, "loss": 4.8562, "step": 1538 }, { "epoch": 0.3974690082644628, "grad_norm": 3.387183427810669, "learning_rate": 3.293347133111401e-05, "loss": 4.4996, "step": 1539 }, { "epoch": 0.3977272727272727, "grad_norm": 2.165419340133667, "learning_rate": 3.291423311574566e-05, "loss": 4.5184, "step": 1540 }, { "epoch": 0.39798553719008267, "grad_norm": 2.793762445449829, "learning_rate": 3.2894989690375626e-05, "loss": 4.0005, "step": 1541 }, { "epoch": 0.3982438016528926, "grad_norm": 2.9241037368774414, "learning_rate": 3.2875741067672023e-05, "loss": 3.9704, "step": 1542 }, { "epoch": 0.3985020661157025, "grad_norm": 1.9642356634140015, "learning_rate": 3.285648726030637e-05, "loss": 3.9635, "step": 1543 }, { "epoch": 0.3987603305785124, "grad_norm": 3.5589518547058105, "learning_rate": 3.283722828095359e-05, "loss": 4.2375, "step": 1544 }, { "epoch": 0.3990185950413223, "grad_norm": 2.1440846920013428, "learning_rate": 3.2817964142292026e-05, "loss": 3.609, "step": 1545 }, { "epoch": 0.3992768595041322, "grad_norm": 3.414443016052246, "learning_rate": 3.279869485700341e-05, "loss": 3.8639, "step": 1546 }, { "epoch": 0.39953512396694213, "grad_norm": 2.6095941066741943, "learning_rate": 3.2779420437772866e-05, "loss": 3.9339, "step": 1547 }, { "epoch": 0.39979338842975204, "grad_norm": 3.1918158531188965, "learning_rate": 3.2760140897288884e-05, "loss": 4.2123, "step": 1548 }, { "epoch": 0.400051652892562, "grad_norm": 4.017918586730957, "learning_rate": 3.274085624824335e-05, "loss": 4.0485, "step": 1549 }, { "epoch": 0.4003099173553719, "grad_norm": 4.778039932250977, "learning_rate": 3.2721566503331496e-05, "loss": 3.9002, "step": 1550 }, { "epoch": 0.4005681818181818, "grad_norm": 4.700753688812256, "learning_rate": 3.270227167525191e-05, "loss": 3.8376, "step": 1551 }, { "epoch": 0.40082644628099173, "grad_norm": 2.782362222671509, "learning_rate": 3.2682971776706525e-05, "loss": 4.4478, "step": 1552 }, { "epoch": 0.40108471074380164, "grad_norm": 3.9505422115325928, "learning_rate": 3.266366682040063e-05, "loss": 4.2706, "step": 1553 }, { "epoch": 0.40134297520661155, "grad_norm": 2.746795654296875, "learning_rate": 3.2644356819042806e-05, "loss": 4.1164, "step": 1554 }, { "epoch": 0.40160123966942146, "grad_norm": 1.821284294128418, "learning_rate": 3.2625041785345e-05, "loss": 3.6323, "step": 1555 }, { "epoch": 0.4018595041322314, "grad_norm": 2.893434524536133, "learning_rate": 3.260572173202243e-05, "loss": 3.9763, "step": 1556 }, { "epoch": 0.40211776859504134, "grad_norm": 2.800421714782715, "learning_rate": 3.258639667179366e-05, "loss": 4.0569, "step": 1557 }, { "epoch": 0.40237603305785125, "grad_norm": 4.2209014892578125, "learning_rate": 3.256706661738052e-05, "loss": 3.8292, "step": 1558 }, { "epoch": 0.40263429752066116, "grad_norm": 4.693404674530029, "learning_rate": 3.254773158150812e-05, "loss": 3.6741, "step": 1559 }, { "epoch": 0.40289256198347106, "grad_norm": 2.878765344619751, "learning_rate": 3.252839157690489e-05, "loss": 4.5284, "step": 1560 }, { "epoch": 0.403150826446281, "grad_norm": 3.3946845531463623, "learning_rate": 3.2509046616302496e-05, "loss": 4.1067, "step": 1561 }, { "epoch": 0.4034090909090909, "grad_norm": 2.9824695587158203, "learning_rate": 3.248969671243588e-05, "loss": 4.5337, "step": 1562 }, { "epoch": 0.40366735537190085, "grad_norm": 6.0578999519348145, "learning_rate": 3.247034187804323e-05, "loss": 3.7826, "step": 1563 }, { "epoch": 0.40392561983471076, "grad_norm": 2.8193359375, "learning_rate": 3.245098212586598e-05, "loss": 4.5645, "step": 1564 }, { "epoch": 0.40418388429752067, "grad_norm": 3.9396791458129883, "learning_rate": 3.243161746864883e-05, "loss": 3.7722, "step": 1565 }, { "epoch": 0.4044421487603306, "grad_norm": 3.799791097640991, "learning_rate": 3.241224791913966e-05, "loss": 4.033, "step": 1566 }, { "epoch": 0.4047004132231405, "grad_norm": 2.36106538772583, "learning_rate": 3.239287349008961e-05, "loss": 4.1936, "step": 1567 }, { "epoch": 0.4049586776859504, "grad_norm": 2.684955358505249, "learning_rate": 3.2373494194253015e-05, "loss": 4.0798, "step": 1568 }, { "epoch": 0.4052169421487603, "grad_norm": 2.9782497882843018, "learning_rate": 3.235411004438741e-05, "loss": 3.9025, "step": 1569 }, { "epoch": 0.40547520661157027, "grad_norm": 3.4963202476501465, "learning_rate": 3.233472105325355e-05, "loss": 4.3298, "step": 1570 }, { "epoch": 0.4057334710743802, "grad_norm": 6.207714080810547, "learning_rate": 3.2315327233615345e-05, "loss": 4.3204, "step": 1571 }, { "epoch": 0.4059917355371901, "grad_norm": 3.4804279804229736, "learning_rate": 3.22959285982399e-05, "loss": 4.6289, "step": 1572 }, { "epoch": 0.40625, "grad_norm": 2.540593385696411, "learning_rate": 3.2276525159897496e-05, "loss": 3.8441, "step": 1573 }, { "epoch": 0.4065082644628099, "grad_norm": 4.057863235473633, "learning_rate": 3.225711693136156e-05, "loss": 3.5476, "step": 1574 }, { "epoch": 0.4067665289256198, "grad_norm": 3.149552822113037, "learning_rate": 3.223770392540869e-05, "loss": 4.0861, "step": 1575 }, { "epoch": 0.40702479338842973, "grad_norm": 3.022362232208252, "learning_rate": 3.221828615481861e-05, "loss": 3.997, "step": 1576 }, { "epoch": 0.4072830578512397, "grad_norm": 3.0078306198120117, "learning_rate": 3.21988636323742e-05, "loss": 3.6059, "step": 1577 }, { "epoch": 0.4075413223140496, "grad_norm": 2.546801805496216, "learning_rate": 3.2179436370861454e-05, "loss": 3.7231, "step": 1578 }, { "epoch": 0.4077995867768595, "grad_norm": 2.103842258453369, "learning_rate": 3.216000438306949e-05, "loss": 4.2588, "step": 1579 }, { "epoch": 0.4080578512396694, "grad_norm": 2.623440980911255, "learning_rate": 3.214056768179053e-05, "loss": 3.9113, "step": 1580 }, { "epoch": 0.40831611570247933, "grad_norm": 3.2465739250183105, "learning_rate": 3.2121126279819934e-05, "loss": 4.1684, "step": 1581 }, { "epoch": 0.40857438016528924, "grad_norm": 4.308035850524902, "learning_rate": 3.210168018995609e-05, "loss": 3.9953, "step": 1582 }, { "epoch": 0.40883264462809915, "grad_norm": 1.7719393968582153, "learning_rate": 3.208222942500055e-05, "loss": 4.1677, "step": 1583 }, { "epoch": 0.4090909090909091, "grad_norm": 4.237754821777344, "learning_rate": 3.2062773997757886e-05, "loss": 4.0933, "step": 1584 }, { "epoch": 0.409349173553719, "grad_norm": 3.706975221633911, "learning_rate": 3.2043313921035743e-05, "loss": 4.2943, "step": 1585 }, { "epoch": 0.40960743801652894, "grad_norm": 6.579211235046387, "learning_rate": 3.2023849207644866e-05, "loss": 4.0764, "step": 1586 }, { "epoch": 0.40986570247933884, "grad_norm": 3.674304485321045, "learning_rate": 3.2004379870399026e-05, "loss": 4.2592, "step": 1587 }, { "epoch": 0.41012396694214875, "grad_norm": 2.9180479049682617, "learning_rate": 3.1984905922115025e-05, "loss": 4.0492, "step": 1588 }, { "epoch": 0.41038223140495866, "grad_norm": 3.0587246417999268, "learning_rate": 3.196542737561274e-05, "loss": 3.923, "step": 1589 }, { "epoch": 0.4106404958677686, "grad_norm": 3.063551664352417, "learning_rate": 3.1945944243715024e-05, "loss": 3.8205, "step": 1590 }, { "epoch": 0.41089876033057854, "grad_norm": 2.253952980041504, "learning_rate": 3.192645653924779e-05, "loss": 3.8061, "step": 1591 }, { "epoch": 0.41115702479338845, "grad_norm": 4.8911452293396, "learning_rate": 3.190696427503994e-05, "loss": 3.7745, "step": 1592 }, { "epoch": 0.41141528925619836, "grad_norm": 2.9209914207458496, "learning_rate": 3.188746746392339e-05, "loss": 4.2115, "step": 1593 }, { "epoch": 0.41167355371900827, "grad_norm": 2.4483530521392822, "learning_rate": 3.186796611873305e-05, "loss": 3.9093, "step": 1594 }, { "epoch": 0.4119318181818182, "grad_norm": 3.2114717960357666, "learning_rate": 3.18484602523068e-05, "loss": 4.0459, "step": 1595 }, { "epoch": 0.4121900826446281, "grad_norm": 2.456500291824341, "learning_rate": 3.18289498774855e-05, "loss": 3.7556, "step": 1596 }, { "epoch": 0.412448347107438, "grad_norm": 2.1316123008728027, "learning_rate": 3.1809435007112986e-05, "loss": 4.4177, "step": 1597 }, { "epoch": 0.41270661157024796, "grad_norm": 3.3907411098480225, "learning_rate": 3.178991565403605e-05, "loss": 4.2227, "step": 1598 }, { "epoch": 0.41296487603305787, "grad_norm": 3.0257813930511475, "learning_rate": 3.177039183110444e-05, "loss": 4.1909, "step": 1599 }, { "epoch": 0.4132231404958678, "grad_norm": 3.092495918273926, "learning_rate": 3.175086355117084e-05, "loss": 4.1059, "step": 1600 }, { "epoch": 0.4134814049586777, "grad_norm": 1.8857272863388062, "learning_rate": 3.1731330827090865e-05, "loss": 4.2275, "step": 1601 }, { "epoch": 0.4137396694214876, "grad_norm": 4.083542823791504, "learning_rate": 3.1711793671723064e-05, "loss": 4.4218, "step": 1602 }, { "epoch": 0.4139979338842975, "grad_norm": 3.763951539993286, "learning_rate": 3.16922520979289e-05, "loss": 4.0448, "step": 1603 }, { "epoch": 0.4142561983471074, "grad_norm": 2.1454262733459473, "learning_rate": 3.167270611857275e-05, "loss": 4.5829, "step": 1604 }, { "epoch": 0.41451446280991733, "grad_norm": 4.696751594543457, "learning_rate": 3.165315574652187e-05, "loss": 3.9673, "step": 1605 }, { "epoch": 0.4147727272727273, "grad_norm": 2.019141435623169, "learning_rate": 3.163360099464643e-05, "loss": 4.5097, "step": 1606 }, { "epoch": 0.4150309917355372, "grad_norm": 3.8082048892974854, "learning_rate": 3.1614041875819475e-05, "loss": 4.6133, "step": 1607 }, { "epoch": 0.4152892561983471, "grad_norm": 2.287543773651123, "learning_rate": 3.1594478402916934e-05, "loss": 3.8935, "step": 1608 }, { "epoch": 0.415547520661157, "grad_norm": 7.507081508636475, "learning_rate": 3.157491058881759e-05, "loss": 4.7616, "step": 1609 }, { "epoch": 0.41580578512396693, "grad_norm": 1.9803156852722168, "learning_rate": 3.155533844640309e-05, "loss": 3.5025, "step": 1610 }, { "epoch": 0.41606404958677684, "grad_norm": 5.488758087158203, "learning_rate": 3.153576198855792e-05, "loss": 4.7053, "step": 1611 }, { "epoch": 0.41632231404958675, "grad_norm": 5.566988945007324, "learning_rate": 3.151618122816943e-05, "loss": 3.1203, "step": 1612 }, { "epoch": 0.4165805785123967, "grad_norm": 3.3268418312072754, "learning_rate": 3.149659617812777e-05, "loss": 3.4088, "step": 1613 }, { "epoch": 0.4168388429752066, "grad_norm": 3.1198720932006836, "learning_rate": 3.1477006851325954e-05, "loss": 4.0478, "step": 1614 }, { "epoch": 0.41709710743801653, "grad_norm": 4.4500813484191895, "learning_rate": 3.145741326065978e-05, "loss": 4.2673, "step": 1615 }, { "epoch": 0.41735537190082644, "grad_norm": 2.9488563537597656, "learning_rate": 3.1437815419027836e-05, "loss": 3.9254, "step": 1616 }, { "epoch": 0.41761363636363635, "grad_norm": 2.2801692485809326, "learning_rate": 3.141821333933158e-05, "loss": 4.1712, "step": 1617 }, { "epoch": 0.41787190082644626, "grad_norm": 2.1293351650238037, "learning_rate": 3.139860703447519e-05, "loss": 4.4486, "step": 1618 }, { "epoch": 0.4181301652892562, "grad_norm": 4.94828987121582, "learning_rate": 3.1378996517365645e-05, "loss": 4.2177, "step": 1619 }, { "epoch": 0.41838842975206614, "grad_norm": 2.2338430881500244, "learning_rate": 3.135938180091272e-05, "loss": 4.1279, "step": 1620 }, { "epoch": 0.41864669421487605, "grad_norm": 1.9729859828948975, "learning_rate": 3.1339762898028915e-05, "loss": 4.4262, "step": 1621 }, { "epoch": 0.41890495867768596, "grad_norm": 2.883769989013672, "learning_rate": 3.1320139821629526e-05, "loss": 4.167, "step": 1622 }, { "epoch": 0.41916322314049587, "grad_norm": 3.465716600418091, "learning_rate": 3.130051258463257e-05, "loss": 3.4909, "step": 1623 }, { "epoch": 0.4194214876033058, "grad_norm": 2.079549789428711, "learning_rate": 3.128088119995881e-05, "loss": 3.8399, "step": 1624 }, { "epoch": 0.4196797520661157, "grad_norm": 1.94605553150177, "learning_rate": 3.1261245680531757e-05, "loss": 4.4797, "step": 1625 }, { "epoch": 0.4199380165289256, "grad_norm": 2.737880229949951, "learning_rate": 3.1241606039277594e-05, "loss": 4.6629, "step": 1626 }, { "epoch": 0.42019628099173556, "grad_norm": 3.3258659839630127, "learning_rate": 3.122196228912528e-05, "loss": 3.712, "step": 1627 }, { "epoch": 0.42045454545454547, "grad_norm": 2.302323818206787, "learning_rate": 3.1202314443006433e-05, "loss": 4.4254, "step": 1628 }, { "epoch": 0.4207128099173554, "grad_norm": 2.3684535026550293, "learning_rate": 3.118266251385539e-05, "loss": 4.2236, "step": 1629 }, { "epoch": 0.4209710743801653, "grad_norm": 3.116687297821045, "learning_rate": 3.116300651460917e-05, "loss": 4.0084, "step": 1630 }, { "epoch": 0.4212293388429752, "grad_norm": 2.3154942989349365, "learning_rate": 3.114334645820748e-05, "loss": 4.0321, "step": 1631 }, { "epoch": 0.4214876033057851, "grad_norm": 2.82209849357605, "learning_rate": 3.112368235759267e-05, "loss": 4.2308, "step": 1632 }, { "epoch": 0.421745867768595, "grad_norm": 2.748415946960449, "learning_rate": 3.110401422570979e-05, "loss": 4.0843, "step": 1633 }, { "epoch": 0.422004132231405, "grad_norm": 2.8514435291290283, "learning_rate": 3.1084342075506514e-05, "loss": 4.3863, "step": 1634 }, { "epoch": 0.4222623966942149, "grad_norm": 3.1054141521453857, "learning_rate": 3.106466591993317e-05, "loss": 4.1192, "step": 1635 }, { "epoch": 0.4225206611570248, "grad_norm": 4.784148693084717, "learning_rate": 3.1044985771942735e-05, "loss": 3.8477, "step": 1636 }, { "epoch": 0.4227789256198347, "grad_norm": 2.925445556640625, "learning_rate": 3.102530164449081e-05, "loss": 4.2836, "step": 1637 }, { "epoch": 0.4230371900826446, "grad_norm": 4.283999919891357, "learning_rate": 3.100561355053559e-05, "loss": 4.0906, "step": 1638 }, { "epoch": 0.42329545454545453, "grad_norm": 2.968247890472412, "learning_rate": 3.098592150303792e-05, "loss": 4.3075, "step": 1639 }, { "epoch": 0.42355371900826444, "grad_norm": 3.3480608463287354, "learning_rate": 3.096622551496122e-05, "loss": 4.2179, "step": 1640 }, { "epoch": 0.4238119834710744, "grad_norm": 3.0659172534942627, "learning_rate": 3.0946525599271515e-05, "loss": 4.0351, "step": 1641 }, { "epoch": 0.4240702479338843, "grad_norm": 1.8807321786880493, "learning_rate": 3.092682176893741e-05, "loss": 3.781, "step": 1642 }, { "epoch": 0.4243285123966942, "grad_norm": 2.4058938026428223, "learning_rate": 3.0907114036930094e-05, "loss": 4.2586, "step": 1643 }, { "epoch": 0.42458677685950413, "grad_norm": 2.7189409732818604, "learning_rate": 3.0887402416223334e-05, "loss": 4.083, "step": 1644 }, { "epoch": 0.42484504132231404, "grad_norm": 3.081798791885376, "learning_rate": 3.086768691979342e-05, "loss": 3.9805, "step": 1645 }, { "epoch": 0.42510330578512395, "grad_norm": 2.479400396347046, "learning_rate": 3.084796756061924e-05, "loss": 4.128, "step": 1646 }, { "epoch": 0.42536157024793386, "grad_norm": 3.3229594230651855, "learning_rate": 3.082824435168219e-05, "loss": 4.053, "step": 1647 }, { "epoch": 0.4256198347107438, "grad_norm": 4.313965797424316, "learning_rate": 3.080851730596622e-05, "loss": 4.4873, "step": 1648 }, { "epoch": 0.42587809917355374, "grad_norm": 3.830145835876465, "learning_rate": 3.078878643645778e-05, "loss": 3.9227, "step": 1649 }, { "epoch": 0.42613636363636365, "grad_norm": 4.721293926239014, "learning_rate": 3.076905175614588e-05, "loss": 3.7798, "step": 1650 }, { "epoch": 0.42639462809917356, "grad_norm": 3.452042818069458, "learning_rate": 3.074931327802202e-05, "loss": 4.4544, "step": 1651 }, { "epoch": 0.42665289256198347, "grad_norm": 5.542668342590332, "learning_rate": 3.072957101508017e-05, "loss": 4.4133, "step": 1652 }, { "epoch": 0.4269111570247934, "grad_norm": 1.8205065727233887, "learning_rate": 3.070982498031682e-05, "loss": 4.2181, "step": 1653 }, { "epoch": 0.4271694214876033, "grad_norm": 2.2841217517852783, "learning_rate": 3.069007518673095e-05, "loss": 4.0369, "step": 1654 }, { "epoch": 0.42742768595041325, "grad_norm": 2.996622323989868, "learning_rate": 3.0670321647324e-05, "loss": 4.0715, "step": 1655 }, { "epoch": 0.42768595041322316, "grad_norm": 2.131052255630493, "learning_rate": 3.0650564375099876e-05, "loss": 4.157, "step": 1656 }, { "epoch": 0.42794421487603307, "grad_norm": 2.246185302734375, "learning_rate": 3.063080338306496e-05, "loss": 4.1217, "step": 1657 }, { "epoch": 0.428202479338843, "grad_norm": 2.090320587158203, "learning_rate": 3.061103868422805e-05, "loss": 3.921, "step": 1658 }, { "epoch": 0.4284607438016529, "grad_norm": 4.121866703033447, "learning_rate": 3.059127029160041e-05, "loss": 3.8583, "step": 1659 }, { "epoch": 0.4287190082644628, "grad_norm": 3.263059139251709, "learning_rate": 3.0571498218195726e-05, "loss": 3.9179, "step": 1660 }, { "epoch": 0.4289772727272727, "grad_norm": 3.1610984802246094, "learning_rate": 3.055172247703011e-05, "loss": 4.4017, "step": 1661 }, { "epoch": 0.42923553719008267, "grad_norm": 1.6838912963867188, "learning_rate": 3.0531943081122096e-05, "loss": 3.9923, "step": 1662 }, { "epoch": 0.4294938016528926, "grad_norm": 4.263238430023193, "learning_rate": 3.0512160043492587e-05, "loss": 4.2634, "step": 1663 }, { "epoch": 0.4297520661157025, "grad_norm": 2.791141986846924, "learning_rate": 3.0492373377164946e-05, "loss": 4.2381, "step": 1664 }, { "epoch": 0.4300103305785124, "grad_norm": 2.630615234375, "learning_rate": 3.0472583095164874e-05, "loss": 3.7775, "step": 1665 }, { "epoch": 0.4302685950413223, "grad_norm": 2.819124698638916, "learning_rate": 3.0452789210520465e-05, "loss": 3.8842, "step": 1666 }, { "epoch": 0.4305268595041322, "grad_norm": 2.675503730773926, "learning_rate": 3.0432991736262197e-05, "loss": 3.6623, "step": 1667 }, { "epoch": 0.43078512396694213, "grad_norm": 1.8803517818450928, "learning_rate": 3.0413190685422898e-05, "loss": 4.0096, "step": 1668 }, { "epoch": 0.43104338842975204, "grad_norm": 3.190197229385376, "learning_rate": 3.0393386071037754e-05, "loss": 4.0035, "step": 1669 }, { "epoch": 0.431301652892562, "grad_norm": 3.8462607860565186, "learning_rate": 3.0373577906144307e-05, "loss": 4.2009, "step": 1670 }, { "epoch": 0.4315599173553719, "grad_norm": 24.30170440673828, "learning_rate": 3.0353766203782406e-05, "loss": 4.0966, "step": 1671 }, { "epoch": 0.4318181818181818, "grad_norm": 2.171119213104248, "learning_rate": 3.0333950976994274e-05, "loss": 4.1651, "step": 1672 }, { "epoch": 0.43207644628099173, "grad_norm": 4.866224765777588, "learning_rate": 3.0314132238824415e-05, "loss": 2.7333, "step": 1673 }, { "epoch": 0.43233471074380164, "grad_norm": 3.1120691299438477, "learning_rate": 3.029431000231967e-05, "loss": 4.3683, "step": 1674 }, { "epoch": 0.43259297520661155, "grad_norm": 3.6422669887542725, "learning_rate": 3.027448428052917e-05, "loss": 4.5047, "step": 1675 }, { "epoch": 0.43285123966942146, "grad_norm": 2.517775297164917, "learning_rate": 3.025465508650433e-05, "loss": 4.1419, "step": 1676 }, { "epoch": 0.4331095041322314, "grad_norm": 3.311671495437622, "learning_rate": 3.023482243329888e-05, "loss": 3.7102, "step": 1677 }, { "epoch": 0.43336776859504134, "grad_norm": 2.952918529510498, "learning_rate": 3.0214986333968804e-05, "loss": 4.0754, "step": 1678 }, { "epoch": 0.43362603305785125, "grad_norm": 4.419524192810059, "learning_rate": 3.0195146801572354e-05, "loss": 3.7729, "step": 1679 }, { "epoch": 0.43388429752066116, "grad_norm": 1.5300257205963135, "learning_rate": 3.0175303849170073e-05, "loss": 3.9086, "step": 1680 }, { "epoch": 0.43414256198347106, "grad_norm": 2.9411394596099854, "learning_rate": 3.0155457489824705e-05, "loss": 4.0196, "step": 1681 }, { "epoch": 0.434400826446281, "grad_norm": 3.093928337097168, "learning_rate": 3.0135607736601283e-05, "loss": 4.0731, "step": 1682 }, { "epoch": 0.4346590909090909, "grad_norm": 3.3194568157196045, "learning_rate": 3.0115754602567046e-05, "loss": 3.9688, "step": 1683 }, { "epoch": 0.43491735537190085, "grad_norm": 2.555009126663208, "learning_rate": 3.009589810079147e-05, "loss": 4.3262, "step": 1684 }, { "epoch": 0.43517561983471076, "grad_norm": 1.9933103322982788, "learning_rate": 3.0076038244346242e-05, "loss": 4.2663, "step": 1685 }, { "epoch": 0.43543388429752067, "grad_norm": 4.2757344245910645, "learning_rate": 3.005617504630527e-05, "loss": 4.2043, "step": 1686 }, { "epoch": 0.4356921487603306, "grad_norm": 5.298089504241943, "learning_rate": 3.0036308519744645e-05, "loss": 4.1243, "step": 1687 }, { "epoch": 0.4359504132231405, "grad_norm": 3.7608559131622314, "learning_rate": 3.001643867774266e-05, "loss": 3.6018, "step": 1688 }, { "epoch": 0.4362086776859504, "grad_norm": 6.776874542236328, "learning_rate": 2.9996565533379782e-05, "loss": 3.4538, "step": 1689 }, { "epoch": 0.4364669421487603, "grad_norm": 4.729469299316406, "learning_rate": 2.9976689099738675e-05, "loss": 3.9431, "step": 1690 }, { "epoch": 0.43672520661157027, "grad_norm": 3.3374555110931396, "learning_rate": 2.9956809389904128e-05, "loss": 3.8278, "step": 1691 }, { "epoch": 0.4369834710743802, "grad_norm": 2.711296558380127, "learning_rate": 2.9936926416963124e-05, "loss": 4.0532, "step": 1692 }, { "epoch": 0.4372417355371901, "grad_norm": 3.0675761699676514, "learning_rate": 2.9917040194004776e-05, "loss": 4.0803, "step": 1693 }, { "epoch": 0.4375, "grad_norm": 2.3976833820343018, "learning_rate": 2.9897150734120337e-05, "loss": 4.2412, "step": 1694 }, { "epoch": 0.4377582644628099, "grad_norm": 3.4480934143066406, "learning_rate": 2.9877258050403212e-05, "loss": 4.0411, "step": 1695 }, { "epoch": 0.4380165289256198, "grad_norm": 5.1025824546813965, "learning_rate": 2.985736215594889e-05, "loss": 3.0016, "step": 1696 }, { "epoch": 0.43827479338842973, "grad_norm": 2.9990336894989014, "learning_rate": 2.9837463063854994e-05, "loss": 3.9543, "step": 1697 }, { "epoch": 0.4385330578512397, "grad_norm": 2.7700276374816895, "learning_rate": 2.9817560787221265e-05, "loss": 4.5487, "step": 1698 }, { "epoch": 0.4387913223140496, "grad_norm": 3.700376272201538, "learning_rate": 2.979765533914952e-05, "loss": 4.2446, "step": 1699 }, { "epoch": 0.4390495867768595, "grad_norm": 1.7893661260604858, "learning_rate": 2.9777746732743673e-05, "loss": 3.9117, "step": 1700 }, { "epoch": 0.4393078512396694, "grad_norm": 1.764548420906067, "learning_rate": 2.9757834981109723e-05, "loss": 4.0915, "step": 1701 }, { "epoch": 0.43956611570247933, "grad_norm": 3.1768229007720947, "learning_rate": 2.9737920097355714e-05, "loss": 3.8216, "step": 1702 }, { "epoch": 0.43982438016528924, "grad_norm": 2.825714111328125, "learning_rate": 2.971800209459179e-05, "loss": 4.0446, "step": 1703 }, { "epoch": 0.44008264462809915, "grad_norm": 2.0447728633880615, "learning_rate": 2.969808098593011e-05, "loss": 4.2363, "step": 1704 }, { "epoch": 0.4403409090909091, "grad_norm": 3.2622058391571045, "learning_rate": 2.9678156784484917e-05, "loss": 4.2785, "step": 1705 }, { "epoch": 0.440599173553719, "grad_norm": 2.9501278400421143, "learning_rate": 2.965822950337245e-05, "loss": 4.2915, "step": 1706 }, { "epoch": 0.44085743801652894, "grad_norm": 3.4121174812316895, "learning_rate": 2.9638299155710996e-05, "loss": 4.3872, "step": 1707 }, { "epoch": 0.44111570247933884, "grad_norm": 2.406754970550537, "learning_rate": 2.9618365754620875e-05, "loss": 4.5373, "step": 1708 }, { "epoch": 0.44137396694214875, "grad_norm": 4.707063674926758, "learning_rate": 2.9598429313224392e-05, "loss": 4.4269, "step": 1709 }, { "epoch": 0.44163223140495866, "grad_norm": 7.742244243621826, "learning_rate": 2.957848984464585e-05, "loss": 3.8505, "step": 1710 }, { "epoch": 0.4418904958677686, "grad_norm": 2.418008327484131, "learning_rate": 2.955854736201158e-05, "loss": 4.0833, "step": 1711 }, { "epoch": 0.44214876033057854, "grad_norm": 2.2276968955993652, "learning_rate": 2.9538601878449855e-05, "loss": 4.0147, "step": 1712 }, { "epoch": 0.44240702479338845, "grad_norm": 3.4329075813293457, "learning_rate": 2.951865340709095e-05, "loss": 3.8066, "step": 1713 }, { "epoch": 0.44266528925619836, "grad_norm": 5.152897834777832, "learning_rate": 2.949870196106711e-05, "loss": 4.3957, "step": 1714 }, { "epoch": 0.44292355371900827, "grad_norm": 2.87094783782959, "learning_rate": 2.947874755351251e-05, "loss": 4.7525, "step": 1715 }, { "epoch": 0.4431818181818182, "grad_norm": 4.0503106117248535, "learning_rate": 2.9458790197563312e-05, "loss": 3.6955, "step": 1716 }, { "epoch": 0.4434400826446281, "grad_norm": 1.9228317737579346, "learning_rate": 2.943882990635759e-05, "loss": 4.084, "step": 1717 }, { "epoch": 0.443698347107438, "grad_norm": 2.2104313373565674, "learning_rate": 2.941886669303535e-05, "loss": 4.0007, "step": 1718 }, { "epoch": 0.44395661157024796, "grad_norm": 2.748295545578003, "learning_rate": 2.9398900570738553e-05, "loss": 4.0878, "step": 1719 }, { "epoch": 0.44421487603305787, "grad_norm": 4.339565753936768, "learning_rate": 2.9378931552611026e-05, "loss": 4.3116, "step": 1720 }, { "epoch": 0.4444731404958678, "grad_norm": 2.7125892639160156, "learning_rate": 2.935895965179856e-05, "loss": 4.0358, "step": 1721 }, { "epoch": 0.4447314049586777, "grad_norm": 2.4661707878112793, "learning_rate": 2.9338984881448805e-05, "loss": 4.259, "step": 1722 }, { "epoch": 0.4449896694214876, "grad_norm": 3.9815948009490967, "learning_rate": 2.9319007254711295e-05, "loss": 4.2229, "step": 1723 }, { "epoch": 0.4452479338842975, "grad_norm": 3.923283815383911, "learning_rate": 2.9299026784737475e-05, "loss": 3.6816, "step": 1724 }, { "epoch": 0.4455061983471074, "grad_norm": 2.847902297973633, "learning_rate": 2.9279043484680634e-05, "loss": 4.2968, "step": 1725 }, { "epoch": 0.44576446280991733, "grad_norm": 3.2058260440826416, "learning_rate": 2.925905736769594e-05, "loss": 3.9307, "step": 1726 }, { "epoch": 0.4460227272727273, "grad_norm": 1.7897080183029175, "learning_rate": 2.923906844694041e-05, "loss": 4.1633, "step": 1727 }, { "epoch": 0.4462809917355372, "grad_norm": 6.9924163818359375, "learning_rate": 2.92190767355729e-05, "loss": 4.1069, "step": 1728 }, { "epoch": 0.4465392561983471, "grad_norm": 4.135317802429199, "learning_rate": 2.9199082246754122e-05, "loss": 3.9981, "step": 1729 }, { "epoch": 0.446797520661157, "grad_norm": 3.1107470989227295, "learning_rate": 2.9179084993646598e-05, "loss": 4.1722, "step": 1730 }, { "epoch": 0.44705578512396693, "grad_norm": 3.38948392868042, "learning_rate": 2.9159084989414682e-05, "loss": 4.0354, "step": 1731 }, { "epoch": 0.44731404958677684, "grad_norm": 2.664581537246704, "learning_rate": 2.9139082247224524e-05, "loss": 4.2703, "step": 1732 }, { "epoch": 0.44757231404958675, "grad_norm": 2.375717878341675, "learning_rate": 2.9119076780244093e-05, "loss": 4.1314, "step": 1733 }, { "epoch": 0.4478305785123967, "grad_norm": 5.011549949645996, "learning_rate": 2.909906860164314e-05, "loss": 3.669, "step": 1734 }, { "epoch": 0.4480888429752066, "grad_norm": 3.3623626232147217, "learning_rate": 2.9079057724593206e-05, "loss": 4.1037, "step": 1735 }, { "epoch": 0.44834710743801653, "grad_norm": 4.44000768661499, "learning_rate": 2.9059044162267614e-05, "loss": 4.1844, "step": 1736 }, { "epoch": 0.44860537190082644, "grad_norm": 3.0052876472473145, "learning_rate": 2.903902792784145e-05, "loss": 4.5239, "step": 1737 }, { "epoch": 0.44886363636363635, "grad_norm": 2.7046828269958496, "learning_rate": 2.9019009034491547e-05, "loss": 3.8716, "step": 1738 }, { "epoch": 0.44912190082644626, "grad_norm": 3.589193820953369, "learning_rate": 2.8998987495396505e-05, "loss": 4.1781, "step": 1739 }, { "epoch": 0.4493801652892562, "grad_norm": 3.0077967643737793, "learning_rate": 2.8978963323736667e-05, "loss": 3.9861, "step": 1740 }, { "epoch": 0.44963842975206614, "grad_norm": 2.777775287628174, "learning_rate": 2.895893653269409e-05, "loss": 4.6459, "step": 1741 }, { "epoch": 0.44989669421487605, "grad_norm": 2.4114463329315186, "learning_rate": 2.8938907135452577e-05, "loss": 4.0899, "step": 1742 }, { "epoch": 0.45015495867768596, "grad_norm": 7.0574517250061035, "learning_rate": 2.8918875145197632e-05, "loss": 4.2179, "step": 1743 }, { "epoch": 0.45041322314049587, "grad_norm": 9.378207206726074, "learning_rate": 2.889884057511647e-05, "loss": 3.7921, "step": 1744 }, { "epoch": 0.4506714876033058, "grad_norm": 3.5357308387756348, "learning_rate": 2.8878803438398016e-05, "loss": 4.2334, "step": 1745 }, { "epoch": 0.4509297520661157, "grad_norm": 5.367270469665527, "learning_rate": 2.885876374823286e-05, "loss": 4.3198, "step": 1746 }, { "epoch": 0.4511880165289256, "grad_norm": 2.999501943588257, "learning_rate": 2.88387215178133e-05, "loss": 4.1714, "step": 1747 }, { "epoch": 0.45144628099173556, "grad_norm": 5.397332191467285, "learning_rate": 2.8818676760333284e-05, "loss": 4.2982, "step": 1748 }, { "epoch": 0.45170454545454547, "grad_norm": 2.9499502182006836, "learning_rate": 2.8798629488988422e-05, "loss": 3.9805, "step": 1749 }, { "epoch": 0.4519628099173554, "grad_norm": 2.4803664684295654, "learning_rate": 2.8778579716976017e-05, "loss": 4.3586, "step": 1750 }, { "epoch": 0.4522210743801653, "grad_norm": 3.2696382999420166, "learning_rate": 2.8758527457494976e-05, "loss": 4.4817, "step": 1751 }, { "epoch": 0.4524793388429752, "grad_norm": 5.131446361541748, "learning_rate": 2.8738472723745858e-05, "loss": 3.9318, "step": 1752 }, { "epoch": 0.4527376033057851, "grad_norm": 2.7887513637542725, "learning_rate": 2.871841552893086e-05, "loss": 3.9877, "step": 1753 }, { "epoch": 0.452995867768595, "grad_norm": 4.4968953132629395, "learning_rate": 2.869835588625377e-05, "loss": 3.899, "step": 1754 }, { "epoch": 0.453254132231405, "grad_norm": 2.3841803073883057, "learning_rate": 2.8678293808920033e-05, "loss": 4.0347, "step": 1755 }, { "epoch": 0.4535123966942149, "grad_norm": 4.2543768882751465, "learning_rate": 2.8658229310136654e-05, "loss": 4.2045, "step": 1756 }, { "epoch": 0.4537706611570248, "grad_norm": 1.6524581909179688, "learning_rate": 2.8638162403112255e-05, "loss": 3.9408, "step": 1757 }, { "epoch": 0.4540289256198347, "grad_norm": 3.276524543762207, "learning_rate": 2.861809310105705e-05, "loss": 4.0503, "step": 1758 }, { "epoch": 0.4542871900826446, "grad_norm": 6.069855213165283, "learning_rate": 2.8598021417182795e-05, "loss": 3.599, "step": 1759 }, { "epoch": 0.45454545454545453, "grad_norm": 5.0176777839660645, "learning_rate": 2.8577947364702858e-05, "loss": 4.6321, "step": 1760 }, { "epoch": 0.45480371900826444, "grad_norm": 4.5872721672058105, "learning_rate": 2.8557870956832132e-05, "loss": 4.2261, "step": 1761 }, { "epoch": 0.4550619834710744, "grad_norm": 2.5436525344848633, "learning_rate": 2.853779220678708e-05, "loss": 4.5956, "step": 1762 }, { "epoch": 0.4553202479338843, "grad_norm": 3.0165934562683105, "learning_rate": 2.851771112778569e-05, "loss": 3.8733, "step": 1763 }, { "epoch": 0.4555785123966942, "grad_norm": 2.463062047958374, "learning_rate": 2.849762773304751e-05, "loss": 3.9156, "step": 1764 }, { "epoch": 0.45583677685950413, "grad_norm": 1.9068245887756348, "learning_rate": 2.847754203579358e-05, "loss": 3.8908, "step": 1765 }, { "epoch": 0.45609504132231404, "grad_norm": 3.4355671405792236, "learning_rate": 2.845745404924649e-05, "loss": 3.9322, "step": 1766 }, { "epoch": 0.45635330578512395, "grad_norm": 1.7773760557174683, "learning_rate": 2.84373637866303e-05, "loss": 3.9819, "step": 1767 }, { "epoch": 0.45661157024793386, "grad_norm": 2.530264139175415, "learning_rate": 2.8417271261170602e-05, "loss": 4.1178, "step": 1768 }, { "epoch": 0.4568698347107438, "grad_norm": 8.063155174255371, "learning_rate": 2.8397176486094452e-05, "loss": 4.2834, "step": 1769 }, { "epoch": 0.45712809917355374, "grad_norm": 2.4646098613739014, "learning_rate": 2.83770794746304e-05, "loss": 4.2161, "step": 1770 }, { "epoch": 0.45738636363636365, "grad_norm": 4.025990009307861, "learning_rate": 2.8356980240008478e-05, "loss": 3.6945, "step": 1771 }, { "epoch": 0.45764462809917356, "grad_norm": 5.334585666656494, "learning_rate": 2.8336878795460153e-05, "loss": 3.7958, "step": 1772 }, { "epoch": 0.45790289256198347, "grad_norm": 7.023785591125488, "learning_rate": 2.831677515421838e-05, "loss": 2.9523, "step": 1773 }, { "epoch": 0.4581611570247934, "grad_norm": 4.392211437225342, "learning_rate": 2.829666932951753e-05, "loss": 3.8643, "step": 1774 }, { "epoch": 0.4584194214876033, "grad_norm": 3.40824556350708, "learning_rate": 2.827656133459342e-05, "loss": 3.8152, "step": 1775 }, { "epoch": 0.45867768595041325, "grad_norm": 3.5755231380462646, "learning_rate": 2.8256451182683325e-05, "loss": 4.8038, "step": 1776 }, { "epoch": 0.45893595041322316, "grad_norm": 3.380646228790283, "learning_rate": 2.8236338887025886e-05, "loss": 3.76, "step": 1777 }, { "epoch": 0.45919421487603307, "grad_norm": 3.730395793914795, "learning_rate": 2.8216224460861206e-05, "loss": 4.4078, "step": 1778 }, { "epoch": 0.459452479338843, "grad_norm": 5.016434192657471, "learning_rate": 2.819610791743077e-05, "loss": 4.1168, "step": 1779 }, { "epoch": 0.4597107438016529, "grad_norm": 4.159393310546875, "learning_rate": 2.8175989269977436e-05, "loss": 3.7721, "step": 1780 }, { "epoch": 0.4599690082644628, "grad_norm": 3.116375207901001, "learning_rate": 2.8155868531745487e-05, "loss": 4.1593, "step": 1781 }, { "epoch": 0.4602272727272727, "grad_norm": 4.302967071533203, "learning_rate": 2.813574571598056e-05, "loss": 4.4467, "step": 1782 }, { "epoch": 0.46048553719008267, "grad_norm": 3.616736650466919, "learning_rate": 2.811562083592965e-05, "loss": 3.9005, "step": 1783 }, { "epoch": 0.4607438016528926, "grad_norm": 3.454592227935791, "learning_rate": 2.8095493904841136e-05, "loss": 3.5606, "step": 1784 }, { "epoch": 0.4610020661157025, "grad_norm": 2.019392967224121, "learning_rate": 2.8075364935964726e-05, "loss": 3.5921, "step": 1785 }, { "epoch": 0.4612603305785124, "grad_norm": 2.463850259780884, "learning_rate": 2.8055233942551483e-05, "loss": 4.2441, "step": 1786 }, { "epoch": 0.4615185950413223, "grad_norm": 1.91970694065094, "learning_rate": 2.8035100937853804e-05, "loss": 4.0385, "step": 1787 }, { "epoch": 0.4617768595041322, "grad_norm": 2.3196070194244385, "learning_rate": 2.8014965935125386e-05, "loss": 3.9492, "step": 1788 }, { "epoch": 0.46203512396694213, "grad_norm": 2.923067092895508, "learning_rate": 2.7994828947621276e-05, "loss": 4.1729, "step": 1789 }, { "epoch": 0.46229338842975204, "grad_norm": 7.294532775878906, "learning_rate": 2.7974689988597796e-05, "loss": 3.9957, "step": 1790 }, { "epoch": 0.462551652892562, "grad_norm": 6.095966815948486, "learning_rate": 2.7954549071312597e-05, "loss": 3.9341, "step": 1791 }, { "epoch": 0.4628099173553719, "grad_norm": 2.841609001159668, "learning_rate": 2.7934406209024588e-05, "loss": 4.0975, "step": 1792 }, { "epoch": 0.4630681818181818, "grad_norm": 5.175579071044922, "learning_rate": 2.7914261414993982e-05, "loss": 4.0259, "step": 1793 }, { "epoch": 0.46332644628099173, "grad_norm": 3.2830910682678223, "learning_rate": 2.7894114702482254e-05, "loss": 3.94, "step": 1794 }, { "epoch": 0.46358471074380164, "grad_norm": 2.624756097793579, "learning_rate": 2.787396608475214e-05, "loss": 4.1314, "step": 1795 }, { "epoch": 0.46384297520661155, "grad_norm": 3.5991199016571045, "learning_rate": 2.7853815575067627e-05, "loss": 4.2583, "step": 1796 }, { "epoch": 0.46410123966942146, "grad_norm": 5.3154449462890625, "learning_rate": 2.783366318669397e-05, "loss": 4.7554, "step": 1797 }, { "epoch": 0.4643595041322314, "grad_norm": 10.312579154968262, "learning_rate": 2.7813508932897625e-05, "loss": 3.9411, "step": 1798 }, { "epoch": 0.46461776859504134, "grad_norm": 3.3173320293426514, "learning_rate": 2.7793352826946306e-05, "loss": 4.3656, "step": 1799 }, { "epoch": 0.46487603305785125, "grad_norm": 4.1480584144592285, "learning_rate": 2.7773194882108938e-05, "loss": 4.0919, "step": 1800 }, { "epoch": 0.46513429752066116, "grad_norm": 2.7512450218200684, "learning_rate": 2.7753035111655645e-05, "loss": 4.0868, "step": 1801 }, { "epoch": 0.46539256198347106, "grad_norm": 3.7541894912719727, "learning_rate": 2.7732873528857773e-05, "loss": 4.2816, "step": 1802 }, { "epoch": 0.465650826446281, "grad_norm": 6.903682231903076, "learning_rate": 2.7712710146987837e-05, "loss": 3.9316, "step": 1803 }, { "epoch": 0.4659090909090909, "grad_norm": 2.050029754638672, "learning_rate": 2.7692544979319558e-05, "loss": 4.4644, "step": 1804 }, { "epoch": 0.46616735537190085, "grad_norm": 2.1512880325317383, "learning_rate": 2.767237803912783e-05, "loss": 4.5436, "step": 1805 }, { "epoch": 0.46642561983471076, "grad_norm": 3.3165907859802246, "learning_rate": 2.765220933968868e-05, "loss": 4.3002, "step": 1806 }, { "epoch": 0.46668388429752067, "grad_norm": 7.48758602142334, "learning_rate": 2.7632038894279354e-05, "loss": 4.4735, "step": 1807 }, { "epoch": 0.4669421487603306, "grad_norm": 2.932154417037964, "learning_rate": 2.7611866716178205e-05, "loss": 4.3487, "step": 1808 }, { "epoch": 0.4672004132231405, "grad_norm": 2.037371873855591, "learning_rate": 2.7591692818664723e-05, "loss": 4.0928, "step": 1809 }, { "epoch": 0.4674586776859504, "grad_norm": 2.4417684078216553, "learning_rate": 2.7571517215019556e-05, "loss": 4.0854, "step": 1810 }, { "epoch": 0.4677169421487603, "grad_norm": 4.043039798736572, "learning_rate": 2.755133991852445e-05, "loss": 4.0799, "step": 1811 }, { "epoch": 0.46797520661157027, "grad_norm": 3.024540901184082, "learning_rate": 2.753116094246229e-05, "loss": 4.1023, "step": 1812 }, { "epoch": 0.4682334710743802, "grad_norm": 4.541862964630127, "learning_rate": 2.751098030011705e-05, "loss": 4.4648, "step": 1813 }, { "epoch": 0.4684917355371901, "grad_norm": 3.7706000804901123, "learning_rate": 2.74907980047738e-05, "loss": 4.0639, "step": 1814 }, { "epoch": 0.46875, "grad_norm": 5.0221848487854, "learning_rate": 2.747061406971871e-05, "loss": 2.5604, "step": 1815 }, { "epoch": 0.4690082644628099, "grad_norm": 3.959690570831299, "learning_rate": 2.7450428508239024e-05, "loss": 3.8871, "step": 1816 }, { "epoch": 0.4692665289256198, "grad_norm": 2.5116074085235596, "learning_rate": 2.743024133362304e-05, "loss": 4.744, "step": 1817 }, { "epoch": 0.46952479338842973, "grad_norm": 2.9242846965789795, "learning_rate": 2.741005255916015e-05, "loss": 4.0304, "step": 1818 }, { "epoch": 0.4697830578512397, "grad_norm": 1.5886365175247192, "learning_rate": 2.7389862198140776e-05, "loss": 3.5849, "step": 1819 }, { "epoch": 0.4700413223140496, "grad_norm": 8.535863876342773, "learning_rate": 2.7369670263856385e-05, "loss": 4.4842, "step": 1820 }, { "epoch": 0.4702995867768595, "grad_norm": 2.7128500938415527, "learning_rate": 2.7349476769599503e-05, "loss": 4.0534, "step": 1821 }, { "epoch": 0.4705578512396694, "grad_norm": 6.599178791046143, "learning_rate": 2.7329281728663648e-05, "loss": 3.0224, "step": 1822 }, { "epoch": 0.47081611570247933, "grad_norm": 3.233739137649536, "learning_rate": 2.7309085154343377e-05, "loss": 4.3519, "step": 1823 }, { "epoch": 0.47107438016528924, "grad_norm": 3.2747466564178467, "learning_rate": 2.728888705993426e-05, "loss": 4.6778, "step": 1824 }, { "epoch": 0.47133264462809915, "grad_norm": 9.265226364135742, "learning_rate": 2.726868745873286e-05, "loss": 4.104, "step": 1825 }, { "epoch": 0.4715909090909091, "grad_norm": 3.1642603874206543, "learning_rate": 2.724848636403673e-05, "loss": 4.5646, "step": 1826 }, { "epoch": 0.471849173553719, "grad_norm": 3.963876485824585, "learning_rate": 2.72282837891444e-05, "loss": 4.0165, "step": 1827 }, { "epoch": 0.47210743801652894, "grad_norm": 3.9111664295196533, "learning_rate": 2.7208079747355398e-05, "loss": 4.14, "step": 1828 }, { "epoch": 0.47236570247933884, "grad_norm": 2.8701674938201904, "learning_rate": 2.7187874251970198e-05, "loss": 4.4097, "step": 1829 }, { "epoch": 0.47262396694214875, "grad_norm": 2.9988832473754883, "learning_rate": 2.716766731629023e-05, "loss": 4.1199, "step": 1830 }, { "epoch": 0.47288223140495866, "grad_norm": 2.611564874649048, "learning_rate": 2.7147458953617887e-05, "loss": 4.1563, "step": 1831 }, { "epoch": 0.4731404958677686, "grad_norm": 2.2720789909362793, "learning_rate": 2.7127249177256485e-05, "loss": 3.9537, "step": 1832 }, { "epoch": 0.47339876033057854, "grad_norm": 3.6977620124816895, "learning_rate": 2.7107038000510287e-05, "loss": 3.8859, "step": 1833 }, { "epoch": 0.47365702479338845, "grad_norm": 3.795361280441284, "learning_rate": 2.708682543668446e-05, "loss": 3.863, "step": 1834 }, { "epoch": 0.47391528925619836, "grad_norm": 2.6322739124298096, "learning_rate": 2.7066611499085093e-05, "loss": 3.8493, "step": 1835 }, { "epoch": 0.47417355371900827, "grad_norm": 2.714700937271118, "learning_rate": 2.7046396201019192e-05, "loss": 3.8247, "step": 1836 }, { "epoch": 0.4744318181818182, "grad_norm": 4.6045026779174805, "learning_rate": 2.702617955579463e-05, "loss": 3.817, "step": 1837 }, { "epoch": 0.4746900826446281, "grad_norm": 4.397767066955566, "learning_rate": 2.7005961576720196e-05, "loss": 4.144, "step": 1838 }, { "epoch": 0.474948347107438, "grad_norm": 2.8092076778411865, "learning_rate": 2.698574227710554e-05, "loss": 3.8992, "step": 1839 }, { "epoch": 0.47520661157024796, "grad_norm": 6.42744255065918, "learning_rate": 2.6965521670261175e-05, "loss": 4.3716, "step": 1840 }, { "epoch": 0.47546487603305787, "grad_norm": 3.6388182640075684, "learning_rate": 2.6945299769498494e-05, "loss": 4.2765, "step": 1841 }, { "epoch": 0.4757231404958678, "grad_norm": 3.1239395141601562, "learning_rate": 2.6925076588129743e-05, "loss": 3.6859, "step": 1842 }, { "epoch": 0.4759814049586777, "grad_norm": 1.7787977457046509, "learning_rate": 2.690485213946798e-05, "loss": 3.8662, "step": 1843 }, { "epoch": 0.4762396694214876, "grad_norm": 2.7938220500946045, "learning_rate": 2.6884626436827137e-05, "loss": 3.42, "step": 1844 }, { "epoch": 0.4764979338842975, "grad_norm": 2.116191864013672, "learning_rate": 2.6864399493521934e-05, "loss": 3.9133, "step": 1845 }, { "epoch": 0.4767561983471074, "grad_norm": 3.7266318798065186, "learning_rate": 2.6844171322867943e-05, "loss": 3.9286, "step": 1846 }, { "epoch": 0.47701446280991733, "grad_norm": 5.8897905349731445, "learning_rate": 2.6823941938181524e-05, "loss": 3.6804, "step": 1847 }, { "epoch": 0.4772727272727273, "grad_norm": 3.3995652198791504, "learning_rate": 2.6803711352779826e-05, "loss": 3.8935, "step": 1848 }, { "epoch": 0.4775309917355372, "grad_norm": 3.119176149368286, "learning_rate": 2.6783479579980807e-05, "loss": 4.0955, "step": 1849 }, { "epoch": 0.4777892561983471, "grad_norm": 4.7045722007751465, "learning_rate": 2.676324663310321e-05, "loss": 4.1659, "step": 1850 }, { "epoch": 0.478047520661157, "grad_norm": 2.422605037689209, "learning_rate": 2.674301252546655e-05, "loss": 4.1714, "step": 1851 }, { "epoch": 0.47830578512396693, "grad_norm": 5.001206874847412, "learning_rate": 2.6722777270391082e-05, "loss": 4.4286, "step": 1852 }, { "epoch": 0.47856404958677684, "grad_norm": 2.7714691162109375, "learning_rate": 2.6702540881197835e-05, "loss": 3.9544, "step": 1853 }, { "epoch": 0.47882231404958675, "grad_norm": 2.4849352836608887, "learning_rate": 2.668230337120859e-05, "loss": 4.029, "step": 1854 }, { "epoch": 0.4790805785123967, "grad_norm": 5.870811939239502, "learning_rate": 2.6662064753745846e-05, "loss": 4.1931, "step": 1855 }, { "epoch": 0.4793388429752066, "grad_norm": 3.3110744953155518, "learning_rate": 2.6641825042132857e-05, "loss": 4.1248, "step": 1856 }, { "epoch": 0.47959710743801653, "grad_norm": 2.94197154045105, "learning_rate": 2.6621584249693575e-05, "loss": 4.7185, "step": 1857 }, { "epoch": 0.47985537190082644, "grad_norm": 3.708906412124634, "learning_rate": 2.6601342389752677e-05, "loss": 4.577, "step": 1858 }, { "epoch": 0.48011363636363635, "grad_norm": 2.8957741260528564, "learning_rate": 2.6581099475635542e-05, "loss": 3.9062, "step": 1859 }, { "epoch": 0.48037190082644626, "grad_norm": 3.114271640777588, "learning_rate": 2.656085552066823e-05, "loss": 4.634, "step": 1860 }, { "epoch": 0.4806301652892562, "grad_norm": 3.1002933979034424, "learning_rate": 2.6540610538177496e-05, "loss": 4.1463, "step": 1861 }, { "epoch": 0.48088842975206614, "grad_norm": 2.6316182613372803, "learning_rate": 2.6520364541490778e-05, "loss": 4.0535, "step": 1862 }, { "epoch": 0.48114669421487605, "grad_norm": 3.7838964462280273, "learning_rate": 2.6500117543936163e-05, "loss": 3.3224, "step": 1863 }, { "epoch": 0.48140495867768596, "grad_norm": 3.036574363708496, "learning_rate": 2.6479869558842417e-05, "loss": 4.4629, "step": 1864 }, { "epoch": 0.48166322314049587, "grad_norm": 2.715531349182129, "learning_rate": 2.6459620599538947e-05, "loss": 4.2076, "step": 1865 }, { "epoch": 0.4819214876033058, "grad_norm": 2.7972030639648438, "learning_rate": 2.6439370679355797e-05, "loss": 3.9665, "step": 1866 }, { "epoch": 0.4821797520661157, "grad_norm": 1.984796166419983, "learning_rate": 2.6419119811623654e-05, "loss": 4.4577, "step": 1867 }, { "epoch": 0.4824380165289256, "grad_norm": 4.2574334144592285, "learning_rate": 2.6398868009673816e-05, "loss": 3.8932, "step": 1868 }, { "epoch": 0.48269628099173556, "grad_norm": 2.389324426651001, "learning_rate": 2.6378615286838217e-05, "loss": 4.056, "step": 1869 }, { "epoch": 0.48295454545454547, "grad_norm": 3.316059112548828, "learning_rate": 2.635836165644936e-05, "loss": 3.9686, "step": 1870 }, { "epoch": 0.4832128099173554, "grad_norm": 5.194189071655273, "learning_rate": 2.6338107131840377e-05, "loss": 3.9812, "step": 1871 }, { "epoch": 0.4834710743801653, "grad_norm": 5.806455135345459, "learning_rate": 2.6317851726345e-05, "loss": 3.2415, "step": 1872 }, { "epoch": 0.4837293388429752, "grad_norm": 4.5264458656311035, "learning_rate": 2.6297595453297498e-05, "loss": 4.2793, "step": 1873 }, { "epoch": 0.4839876033057851, "grad_norm": 3.008270263671875, "learning_rate": 2.627733832603274e-05, "loss": 4.1956, "step": 1874 }, { "epoch": 0.484245867768595, "grad_norm": 3.6027321815490723, "learning_rate": 2.625708035788616e-05, "loss": 4.0804, "step": 1875 }, { "epoch": 0.484504132231405, "grad_norm": 4.537043571472168, "learning_rate": 2.623682156219372e-05, "loss": 3.9806, "step": 1876 }, { "epoch": 0.4847623966942149, "grad_norm": 5.993073463439941, "learning_rate": 2.621656195229196e-05, "loss": 4.1324, "step": 1877 }, { "epoch": 0.4850206611570248, "grad_norm": 5.365727424621582, "learning_rate": 2.6196301541517936e-05, "loss": 4.568, "step": 1878 }, { "epoch": 0.4852789256198347, "grad_norm": 4.368423938751221, "learning_rate": 2.617604034320923e-05, "loss": 4.0445, "step": 1879 }, { "epoch": 0.4855371900826446, "grad_norm": 2.3297054767608643, "learning_rate": 2.615577837070396e-05, "loss": 4.0192, "step": 1880 }, { "epoch": 0.48579545454545453, "grad_norm": 2.5703353881835938, "learning_rate": 2.6135515637340717e-05, "loss": 4.023, "step": 1881 }, { "epoch": 0.48605371900826444, "grad_norm": 3.2965118885040283, "learning_rate": 2.6115252156458646e-05, "loss": 3.3171, "step": 1882 }, { "epoch": 0.4863119834710744, "grad_norm": 2.3523786067962646, "learning_rate": 2.609498794139734e-05, "loss": 4.3772, "step": 1883 }, { "epoch": 0.4865702479338843, "grad_norm": 3.788217067718506, "learning_rate": 2.607472300549688e-05, "loss": 3.9219, "step": 1884 }, { "epoch": 0.4868285123966942, "grad_norm": 3.042837619781494, "learning_rate": 2.6054457362097862e-05, "loss": 3.7421, "step": 1885 }, { "epoch": 0.48708677685950413, "grad_norm": 2.7305068969726562, "learning_rate": 2.6034191024541293e-05, "loss": 3.8984, "step": 1886 }, { "epoch": 0.48734504132231404, "grad_norm": 5.499004364013672, "learning_rate": 2.601392400616867e-05, "loss": 3.827, "step": 1887 }, { "epoch": 0.48760330578512395, "grad_norm": 2.060112714767456, "learning_rate": 2.599365632032193e-05, "loss": 4.2119, "step": 1888 }, { "epoch": 0.48786157024793386, "grad_norm": 3.3475241661071777, "learning_rate": 2.5973387980343443e-05, "loss": 4.2504, "step": 1889 }, { "epoch": 0.4881198347107438, "grad_norm": 2.6464385986328125, "learning_rate": 2.5953118999576032e-05, "loss": 3.6232, "step": 1890 }, { "epoch": 0.48837809917355374, "grad_norm": 2.161728620529175, "learning_rate": 2.5932849391362907e-05, "loss": 4.6496, "step": 1891 }, { "epoch": 0.48863636363636365, "grad_norm": 2.014249324798584, "learning_rate": 2.5912579169047714e-05, "loss": 3.9331, "step": 1892 }, { "epoch": 0.48889462809917356, "grad_norm": 3.40820574760437, "learning_rate": 2.5892308345974515e-05, "loss": 4.3464, "step": 1893 }, { "epoch": 0.48915289256198347, "grad_norm": 4.088630199432373, "learning_rate": 2.5872036935487738e-05, "loss": 3.9822, "step": 1894 }, { "epoch": 0.4894111570247934, "grad_norm": 3.0152666568756104, "learning_rate": 2.5851764950932204e-05, "loss": 4.0109, "step": 1895 }, { "epoch": 0.4896694214876033, "grad_norm": 3.9502511024475098, "learning_rate": 2.583149240565314e-05, "loss": 3.4147, "step": 1896 }, { "epoch": 0.48992768595041325, "grad_norm": 2.0753111839294434, "learning_rate": 2.5811219312996106e-05, "loss": 3.5536, "step": 1897 }, { "epoch": 0.49018595041322316, "grad_norm": 2.520763874053955, "learning_rate": 2.579094568630704e-05, "loss": 3.8854, "step": 1898 }, { "epoch": 0.49044421487603307, "grad_norm": 2.9955644607543945, "learning_rate": 2.5770671538932235e-05, "loss": 4.0529, "step": 1899 }, { "epoch": 0.490702479338843, "grad_norm": 2.93401837348938, "learning_rate": 2.575039688421831e-05, "loss": 4.2603, "step": 1900 }, { "epoch": 0.4909607438016529, "grad_norm": 7.896149635314941, "learning_rate": 2.573012173551224e-05, "loss": 4.4483, "step": 1901 }, { "epoch": 0.4912190082644628, "grad_norm": 2.136359214782715, "learning_rate": 2.570984610616131e-05, "loss": 3.8936, "step": 1902 }, { "epoch": 0.4914772727272727, "grad_norm": 3.598496675491333, "learning_rate": 2.5689570009513136e-05, "loss": 3.6333, "step": 1903 }, { "epoch": 0.49173553719008267, "grad_norm": 3.4918642044067383, "learning_rate": 2.5669293458915616e-05, "loss": 3.7095, "step": 1904 }, { "epoch": 0.4919938016528926, "grad_norm": 2.9919300079345703, "learning_rate": 2.564901646771696e-05, "loss": 4.2212, "step": 1905 }, { "epoch": 0.4922520661157025, "grad_norm": 3.2543392181396484, "learning_rate": 2.5628739049265683e-05, "loss": 3.9717, "step": 1906 }, { "epoch": 0.4925103305785124, "grad_norm": 4.012360095977783, "learning_rate": 2.5608461216910566e-05, "loss": 3.7779, "step": 1907 }, { "epoch": 0.4927685950413223, "grad_norm": 4.270179271697998, "learning_rate": 2.558818298400066e-05, "loss": 4.0236, "step": 1908 }, { "epoch": 0.4930268595041322, "grad_norm": 3.613255262374878, "learning_rate": 2.5567904363885294e-05, "loss": 4.1507, "step": 1909 }, { "epoch": 0.49328512396694213, "grad_norm": 3.0425662994384766, "learning_rate": 2.5547625369914024e-05, "loss": 4.086, "step": 1910 }, { "epoch": 0.49354338842975204, "grad_norm": 2.1840972900390625, "learning_rate": 2.552734601543669e-05, "loss": 4.3377, "step": 1911 }, { "epoch": 0.493801652892562, "grad_norm": 5.837273120880127, "learning_rate": 2.550706631380334e-05, "loss": 3.7729, "step": 1912 }, { "epoch": 0.4940599173553719, "grad_norm": 2.8767244815826416, "learning_rate": 2.5486786278364256e-05, "loss": 4.4046, "step": 1913 }, { "epoch": 0.4943181818181818, "grad_norm": 3.0186548233032227, "learning_rate": 2.5466505922469957e-05, "loss": 4.2273, "step": 1914 }, { "epoch": 0.49457644628099173, "grad_norm": 2.701868772506714, "learning_rate": 2.544622525947115e-05, "loss": 4.1659, "step": 1915 }, { "epoch": 0.49483471074380164, "grad_norm": 2.7400989532470703, "learning_rate": 2.5425944302718768e-05, "loss": 4.2098, "step": 1916 }, { "epoch": 0.49509297520661155, "grad_norm": 3.8796279430389404, "learning_rate": 2.5405663065563906e-05, "loss": 3.7941, "step": 1917 }, { "epoch": 0.49535123966942146, "grad_norm": 2.422184705734253, "learning_rate": 2.538538156135787e-05, "loss": 3.99, "step": 1918 }, { "epoch": 0.4956095041322314, "grad_norm": 4.1061553955078125, "learning_rate": 2.5365099803452136e-05, "loss": 4.1973, "step": 1919 }, { "epoch": 0.49586776859504134, "grad_norm": 2.0414998531341553, "learning_rate": 2.5344817805198334e-05, "loss": 3.8297, "step": 1920 }, { "epoch": 0.49612603305785125, "grad_norm": 3.54904842376709, "learning_rate": 2.5324535579948274e-05, "loss": 4.217, "step": 1921 }, { "epoch": 0.49638429752066116, "grad_norm": 2.324327230453491, "learning_rate": 2.5304253141053902e-05, "loss": 3.8389, "step": 1922 }, { "epoch": 0.49664256198347106, "grad_norm": 2.882051944732666, "learning_rate": 2.5283970501867294e-05, "loss": 4.2125, "step": 1923 }, { "epoch": 0.496900826446281, "grad_norm": 2.451080560684204, "learning_rate": 2.5263687675740687e-05, "loss": 4.5661, "step": 1924 }, { "epoch": 0.4971590909090909, "grad_norm": 2.3273870944976807, "learning_rate": 2.5243404676026418e-05, "loss": 4.5794, "step": 1925 }, { "epoch": 0.49741735537190085, "grad_norm": 6.498219966888428, "learning_rate": 2.5223121516076937e-05, "loss": 4.4428, "step": 1926 }, { "epoch": 0.49767561983471076, "grad_norm": 2.5934102535247803, "learning_rate": 2.5202838209244818e-05, "loss": 4.1102, "step": 1927 }, { "epoch": 0.49793388429752067, "grad_norm": 2.2858669757843018, "learning_rate": 2.5182554768882715e-05, "loss": 4.08, "step": 1928 }, { "epoch": 0.4981921487603306, "grad_norm": 4.241889476776123, "learning_rate": 2.516227120834338e-05, "loss": 3.8959, "step": 1929 }, { "epoch": 0.4984504132231405, "grad_norm": 2.4177911281585693, "learning_rate": 2.5141987540979634e-05, "loss": 3.9719, "step": 1930 }, { "epoch": 0.4987086776859504, "grad_norm": 3.1189942359924316, "learning_rate": 2.512170378014438e-05, "loss": 4.5534, "step": 1931 }, { "epoch": 0.4989669421487603, "grad_norm": 2.642329216003418, "learning_rate": 2.5101419939190574e-05, "loss": 4.215, "step": 1932 }, { "epoch": 0.49922520661157027, "grad_norm": 2.944613456726074, "learning_rate": 2.508113603147122e-05, "loss": 3.8049, "step": 1933 }, { "epoch": 0.4994834710743802, "grad_norm": 1.7876752614974976, "learning_rate": 2.5060852070339386e-05, "loss": 4.0677, "step": 1934 }, { "epoch": 0.4997417355371901, "grad_norm": 5.040082931518555, "learning_rate": 2.5040568069148156e-05, "loss": 4.2831, "step": 1935 }, { "epoch": 0.5, "grad_norm": 2.731046199798584, "learning_rate": 2.5020284041250648e-05, "loss": 3.9564, "step": 1936 }, { "epoch": 0.50025826446281, "grad_norm": 2.926952362060547, "learning_rate": 2.5e-05, "loss": 4.7076, "step": 1937 }, { "epoch": 0.5005165289256198, "grad_norm": 3.964996099472046, "learning_rate": 2.497971595874935e-05, "loss": 3.9943, "step": 1938 }, { "epoch": 0.5007747933884298, "grad_norm": 4.351245880126953, "learning_rate": 2.495943193085185e-05, "loss": 3.9315, "step": 1939 }, { "epoch": 0.5010330578512396, "grad_norm": 2.7590630054473877, "learning_rate": 2.4939147929660617e-05, "loss": 3.7076, "step": 1940 }, { "epoch": 0.5012913223140496, "grad_norm": 3.1733834743499756, "learning_rate": 2.4918863968528787e-05, "loss": 3.7829, "step": 1941 }, { "epoch": 0.5015495867768595, "grad_norm": 2.06733775138855, "learning_rate": 2.4898580060809435e-05, "loss": 4.6029, "step": 1942 }, { "epoch": 0.5018078512396694, "grad_norm": 4.149465084075928, "learning_rate": 2.4878296219855636e-05, "loss": 3.7477, "step": 1943 }, { "epoch": 0.5020661157024794, "grad_norm": 16.553647994995117, "learning_rate": 2.4858012459020372e-05, "loss": 3.8865, "step": 1944 }, { "epoch": 0.5023243801652892, "grad_norm": 2.7020928859710693, "learning_rate": 2.483772879165662e-05, "loss": 4.5556, "step": 1945 }, { "epoch": 0.5025826446280992, "grad_norm": 2.5165369510650635, "learning_rate": 2.481744523111729e-05, "loss": 3.9884, "step": 1946 }, { "epoch": 0.5028409090909091, "grad_norm": 3.0807645320892334, "learning_rate": 2.4797161790755184e-05, "loss": 3.8659, "step": 1947 }, { "epoch": 0.503099173553719, "grad_norm": 1.6643083095550537, "learning_rate": 2.4776878483923072e-05, "loss": 4.2406, "step": 1948 }, { "epoch": 0.5033574380165289, "grad_norm": 2.38100004196167, "learning_rate": 2.4756595323973584e-05, "loss": 4.4893, "step": 1949 }, { "epoch": 0.5036157024793388, "grad_norm": 5.918996334075928, "learning_rate": 2.4736312324259322e-05, "loss": 4.2002, "step": 1950 }, { "epoch": 0.5038739669421488, "grad_norm": 1.9740667343139648, "learning_rate": 2.471602949813271e-05, "loss": 4.468, "step": 1951 }, { "epoch": 0.5041322314049587, "grad_norm": 2.3833425045013428, "learning_rate": 2.46957468589461e-05, "loss": 3.969, "step": 1952 }, { "epoch": 0.5043904958677686, "grad_norm": 3.148618221282959, "learning_rate": 2.4675464420051732e-05, "loss": 3.4393, "step": 1953 }, { "epoch": 0.5046487603305785, "grad_norm": 2.7098236083984375, "learning_rate": 2.465518219480167e-05, "loss": 4.0221, "step": 1954 }, { "epoch": 0.5049070247933884, "grad_norm": 4.344113349914551, "learning_rate": 2.4634900196547873e-05, "loss": 4.7314, "step": 1955 }, { "epoch": 0.5051652892561983, "grad_norm": 4.198580265045166, "learning_rate": 2.4614618438642133e-05, "loss": 4.6156, "step": 1956 }, { "epoch": 0.5054235537190083, "grad_norm": 2.926774263381958, "learning_rate": 2.4594336934436103e-05, "loss": 3.7091, "step": 1957 }, { "epoch": 0.5056818181818182, "grad_norm": 5.485823631286621, "learning_rate": 2.4574055697281238e-05, "loss": 4.0223, "step": 1958 }, { "epoch": 0.5059400826446281, "grad_norm": 4.686841011047363, "learning_rate": 2.4553774740528848e-05, "loss": 3.9817, "step": 1959 }, { "epoch": 0.506198347107438, "grad_norm": 6.220475196838379, "learning_rate": 2.453349407753005e-05, "loss": 3.5989, "step": 1960 }, { "epoch": 0.5064566115702479, "grad_norm": 2.8119428157806396, "learning_rate": 2.4513213721635743e-05, "loss": 4.0276, "step": 1961 }, { "epoch": 0.5067148760330579, "grad_norm": 5.069904804229736, "learning_rate": 2.449293368619667e-05, "loss": 3.8979, "step": 1962 }, { "epoch": 0.5069731404958677, "grad_norm": 9.279296875, "learning_rate": 2.4472653984563314e-05, "loss": 4.7558, "step": 1963 }, { "epoch": 0.5072314049586777, "grad_norm": 6.518516540527344, "learning_rate": 2.4452374630085982e-05, "loss": 4.1707, "step": 1964 }, { "epoch": 0.5074896694214877, "grad_norm": 1.9224673509597778, "learning_rate": 2.4432095636114715e-05, "loss": 3.9154, "step": 1965 }, { "epoch": 0.5077479338842975, "grad_norm": 7.537302017211914, "learning_rate": 2.441181701599934e-05, "loss": 3.9669, "step": 1966 }, { "epoch": 0.5080061983471075, "grad_norm": 3.49808406829834, "learning_rate": 2.4391538783089436e-05, "loss": 4.1713, "step": 1967 }, { "epoch": 0.5082644628099173, "grad_norm": 3.78783917427063, "learning_rate": 2.4371260950734316e-05, "loss": 4.3451, "step": 1968 }, { "epoch": 0.5085227272727273, "grad_norm": 2.704267740249634, "learning_rate": 2.4350983532283047e-05, "loss": 4.4535, "step": 1969 }, { "epoch": 0.5087809917355371, "grad_norm": 2.0639190673828125, "learning_rate": 2.433070654108439e-05, "loss": 3.9498, "step": 1970 }, { "epoch": 0.5090392561983471, "grad_norm": 1.8092585802078247, "learning_rate": 2.4310429990486877e-05, "loss": 4.0172, "step": 1971 }, { "epoch": 0.5092975206611571, "grad_norm": 5.24125337600708, "learning_rate": 2.429015389383869e-05, "loss": 3.9164, "step": 1972 }, { "epoch": 0.5095557851239669, "grad_norm": 3.351649045944214, "learning_rate": 2.4269878264487755e-05, "loss": 4.1806, "step": 1973 }, { "epoch": 0.5098140495867769, "grad_norm": 1.8503297567367554, "learning_rate": 2.4249603115781696e-05, "loss": 4.1535, "step": 1974 }, { "epoch": 0.5100723140495868, "grad_norm": 4.258581161499023, "learning_rate": 2.4229328461067774e-05, "loss": 4.4652, "step": 1975 }, { "epoch": 0.5103305785123967, "grad_norm": 2.5755558013916016, "learning_rate": 2.420905431369297e-05, "loss": 4.1194, "step": 1976 }, { "epoch": 0.5105888429752066, "grad_norm": 2.7339389324188232, "learning_rate": 2.41887806870039e-05, "loss": 4.3661, "step": 1977 }, { "epoch": 0.5108471074380165, "grad_norm": 2.6899476051330566, "learning_rate": 2.416850759434687e-05, "loss": 4.1224, "step": 1978 }, { "epoch": 0.5111053719008265, "grad_norm": 2.4282124042510986, "learning_rate": 2.4148235049067798e-05, "loss": 3.7764, "step": 1979 }, { "epoch": 0.5113636363636364, "grad_norm": 4.538363933563232, "learning_rate": 2.4127963064512268e-05, "loss": 4.3554, "step": 1980 }, { "epoch": 0.5116219008264463, "grad_norm": 3.0995328426361084, "learning_rate": 2.410769165402549e-05, "loss": 4.2871, "step": 1981 }, { "epoch": 0.5118801652892562, "grad_norm": 3.4265289306640625, "learning_rate": 2.4087420830952285e-05, "loss": 4.3587, "step": 1982 }, { "epoch": 0.5121384297520661, "grad_norm": 2.494460344314575, "learning_rate": 2.4067150608637102e-05, "loss": 4.3605, "step": 1983 }, { "epoch": 0.512396694214876, "grad_norm": 6.567265510559082, "learning_rate": 2.4046881000423977e-05, "loss": 3.7986, "step": 1984 }, { "epoch": 0.512654958677686, "grad_norm": 5.061009883880615, "learning_rate": 2.4026612019656562e-05, "loss": 4.0395, "step": 1985 }, { "epoch": 0.5129132231404959, "grad_norm": 6.9253973960876465, "learning_rate": 2.4006343679678077e-05, "loss": 3.8665, "step": 1986 }, { "epoch": 0.5131714876033058, "grad_norm": 5.742090702056885, "learning_rate": 2.3986075993831332e-05, "loss": 3.7354, "step": 1987 }, { "epoch": 0.5134297520661157, "grad_norm": 2.615996837615967, "learning_rate": 2.396580897545871e-05, "loss": 4.2446, "step": 1988 }, { "epoch": 0.5136880165289256, "grad_norm": 2.0699284076690674, "learning_rate": 2.3945542637902137e-05, "loss": 3.7936, "step": 1989 }, { "epoch": 0.5139462809917356, "grad_norm": 2.854994773864746, "learning_rate": 2.3925276994503122e-05, "loss": 4.4653, "step": 1990 }, { "epoch": 0.5142045454545454, "grad_norm": 3.2336740493774414, "learning_rate": 2.3905012058602664e-05, "loss": 4.4695, "step": 1991 }, { "epoch": 0.5144628099173554, "grad_norm": 3.200010061264038, "learning_rate": 2.3884747843541363e-05, "loss": 3.8762, "step": 1992 }, { "epoch": 0.5147210743801653, "grad_norm": 4.02729606628418, "learning_rate": 2.3864484362659285e-05, "loss": 3.4512, "step": 1993 }, { "epoch": 0.5149793388429752, "grad_norm": 4.768741130828857, "learning_rate": 2.3844221629296042e-05, "loss": 4.0605, "step": 1994 }, { "epoch": 0.5152376033057852, "grad_norm": 2.0445666313171387, "learning_rate": 2.3823959656790776e-05, "loss": 4.151, "step": 1995 }, { "epoch": 0.515495867768595, "grad_norm": 3.3895483016967773, "learning_rate": 2.3803698458482063e-05, "loss": 3.8873, "step": 1996 }, { "epoch": 0.515754132231405, "grad_norm": 1.7420521974563599, "learning_rate": 2.3783438047708044e-05, "loss": 4.0564, "step": 1997 }, { "epoch": 0.5160123966942148, "grad_norm": 2.9066686630249023, "learning_rate": 2.3763178437806285e-05, "loss": 4.6167, "step": 1998 }, { "epoch": 0.5162706611570248, "grad_norm": 3.25767183303833, "learning_rate": 2.3742919642113852e-05, "loss": 3.7672, "step": 1999 }, { "epoch": 0.5165289256198347, "grad_norm": 2.8557276725769043, "learning_rate": 2.3722661673967266e-05, "loss": 3.7265, "step": 2000 }, { "epoch": 0.5167871900826446, "grad_norm": 1.9848840236663818, "learning_rate": 2.370240454670251e-05, "loss": 3.9676, "step": 2001 }, { "epoch": 0.5170454545454546, "grad_norm": 1.7790645360946655, "learning_rate": 2.368214827365501e-05, "loss": 3.9999, "step": 2002 }, { "epoch": 0.5173037190082644, "grad_norm": 6.006354808807373, "learning_rate": 2.3661892868159622e-05, "loss": 3.5601, "step": 2003 }, { "epoch": 0.5175619834710744, "grad_norm": 3.354269027709961, "learning_rate": 2.364163834355065e-05, "loss": 4.3771, "step": 2004 }, { "epoch": 0.5178202479338843, "grad_norm": 3.7459824085235596, "learning_rate": 2.3621384713161792e-05, "loss": 3.8165, "step": 2005 }, { "epoch": 0.5180785123966942, "grad_norm": 2.8648898601531982, "learning_rate": 2.360113199032619e-05, "loss": 4.5231, "step": 2006 }, { "epoch": 0.5183367768595041, "grad_norm": 3.1760098934173584, "learning_rate": 2.358088018837635e-05, "loss": 3.9734, "step": 2007 }, { "epoch": 0.518595041322314, "grad_norm": 2.4705071449279785, "learning_rate": 2.3560629320644212e-05, "loss": 3.9944, "step": 2008 }, { "epoch": 0.518853305785124, "grad_norm": 3.331120491027832, "learning_rate": 2.354037940046106e-05, "loss": 4.5702, "step": 2009 }, { "epoch": 0.5191115702479339, "grad_norm": 2.86440372467041, "learning_rate": 2.3520130441157586e-05, "loss": 3.6781, "step": 2010 }, { "epoch": 0.5193698347107438, "grad_norm": 3.1246376037597656, "learning_rate": 2.349988245606384e-05, "loss": 3.9911, "step": 2011 }, { "epoch": 0.5196280991735537, "grad_norm": 3.308562994003296, "learning_rate": 2.3479635458509228e-05, "loss": 3.973, "step": 2012 }, { "epoch": 0.5198863636363636, "grad_norm": 3.1326961517333984, "learning_rate": 2.3459389461822513e-05, "loss": 4.0265, "step": 2013 }, { "epoch": 0.5201446280991735, "grad_norm": 4.181568622589111, "learning_rate": 2.3439144479331775e-05, "loss": 3.8285, "step": 2014 }, { "epoch": 0.5204028925619835, "grad_norm": 1.9792567491531372, "learning_rate": 2.341890052436447e-05, "loss": 4.1818, "step": 2015 }, { "epoch": 0.5206611570247934, "grad_norm": 5.632180690765381, "learning_rate": 2.3398657610247325e-05, "loss": 4.4944, "step": 2016 }, { "epoch": 0.5209194214876033, "grad_norm": 3.7293241024017334, "learning_rate": 2.3378415750306424e-05, "loss": 3.9825, "step": 2017 }, { "epoch": 0.5211776859504132, "grad_norm": 3.502537250518799, "learning_rate": 2.3358174957867152e-05, "loss": 3.9978, "step": 2018 }, { "epoch": 0.5214359504132231, "grad_norm": 2.0940396785736084, "learning_rate": 2.333793524625416e-05, "loss": 4.5007, "step": 2019 }, { "epoch": 0.5216942148760331, "grad_norm": 4.932079792022705, "learning_rate": 2.3317696628791423e-05, "loss": 3.3417, "step": 2020 }, { "epoch": 0.5219524793388429, "grad_norm": 3.3012614250183105, "learning_rate": 2.3297459118802174e-05, "loss": 3.9597, "step": 2021 }, { "epoch": 0.5222107438016529, "grad_norm": 3.387645959854126, "learning_rate": 2.3277222729608927e-05, "loss": 3.8699, "step": 2022 }, { "epoch": 0.5224690082644629, "grad_norm": 2.824035406112671, "learning_rate": 2.3256987474533457e-05, "loss": 3.6509, "step": 2023 }, { "epoch": 0.5227272727272727, "grad_norm": 2.627838611602783, "learning_rate": 2.3236753366896786e-05, "loss": 4.0087, "step": 2024 }, { "epoch": 0.5229855371900827, "grad_norm": 2.495760917663574, "learning_rate": 2.3216520420019195e-05, "loss": 3.9073, "step": 2025 }, { "epoch": 0.5232438016528925, "grad_norm": 1.9028764963150024, "learning_rate": 2.3196288647220183e-05, "loss": 3.9823, "step": 2026 }, { "epoch": 0.5235020661157025, "grad_norm": 3.1327435970306396, "learning_rate": 2.317605806181849e-05, "loss": 4.6136, "step": 2027 }, { "epoch": 0.5237603305785123, "grad_norm": 2.6736581325531006, "learning_rate": 2.3155828677132063e-05, "loss": 4.17, "step": 2028 }, { "epoch": 0.5240185950413223, "grad_norm": 2.1827499866485596, "learning_rate": 2.313560050647807e-05, "loss": 4.3122, "step": 2029 }, { "epoch": 0.5242768595041323, "grad_norm": 3.4970338344573975, "learning_rate": 2.3115373563172872e-05, "loss": 3.9696, "step": 2030 }, { "epoch": 0.5245351239669421, "grad_norm": 2.0970401763916016, "learning_rate": 2.3095147860532017e-05, "loss": 4.3622, "step": 2031 }, { "epoch": 0.5247933884297521, "grad_norm": 3.8703246116638184, "learning_rate": 2.3074923411870263e-05, "loss": 4.3522, "step": 2032 }, { "epoch": 0.525051652892562, "grad_norm": 2.76654052734375, "learning_rate": 2.3054700230501502e-05, "loss": 4.514, "step": 2033 }, { "epoch": 0.5253099173553719, "grad_norm": 3.368591785430908, "learning_rate": 2.3034478329738835e-05, "loss": 4.183, "step": 2034 }, { "epoch": 0.5255681818181818, "grad_norm": 3.75972318649292, "learning_rate": 2.3014257722894468e-05, "loss": 4.037, "step": 2035 }, { "epoch": 0.5258264462809917, "grad_norm": 1.9173861742019653, "learning_rate": 2.2994038423279814e-05, "loss": 4.0164, "step": 2036 }, { "epoch": 0.5260847107438017, "grad_norm": 2.1156904697418213, "learning_rate": 2.2973820444205374e-05, "loss": 3.9927, "step": 2037 }, { "epoch": 0.5263429752066116, "grad_norm": 2.253958225250244, "learning_rate": 2.2953603798980807e-05, "loss": 4.4925, "step": 2038 }, { "epoch": 0.5266012396694215, "grad_norm": 2.84348464012146, "learning_rate": 2.293338850091491e-05, "loss": 4.3728, "step": 2039 }, { "epoch": 0.5268595041322314, "grad_norm": 2.565922737121582, "learning_rate": 2.2913174563315546e-05, "loss": 4.6461, "step": 2040 }, { "epoch": 0.5271177685950413, "grad_norm": 4.127674579620361, "learning_rate": 2.2892961999489722e-05, "loss": 3.6403, "step": 2041 }, { "epoch": 0.5273760330578512, "grad_norm": 2.3123416900634766, "learning_rate": 2.2872750822743517e-05, "loss": 3.9914, "step": 2042 }, { "epoch": 0.5276342975206612, "grad_norm": 3.106808662414551, "learning_rate": 2.2852541046382122e-05, "loss": 4.0627, "step": 2043 }, { "epoch": 0.5278925619834711, "grad_norm": 2.148070812225342, "learning_rate": 2.2832332683709775e-05, "loss": 4.5248, "step": 2044 }, { "epoch": 0.528150826446281, "grad_norm": 2.0835041999816895, "learning_rate": 2.2812125748029808e-05, "loss": 3.8868, "step": 2045 }, { "epoch": 0.5284090909090909, "grad_norm": 2.2816548347473145, "learning_rate": 2.2791920252644608e-05, "loss": 4.4162, "step": 2046 }, { "epoch": 0.5286673553719008, "grad_norm": 2.307335376739502, "learning_rate": 2.2771716210855603e-05, "loss": 4.2689, "step": 2047 }, { "epoch": 0.5289256198347108, "grad_norm": 3.9138717651367188, "learning_rate": 2.275151363596328e-05, "loss": 4.1554, "step": 2048 }, { "epoch": 0.5291838842975206, "grad_norm": 2.4723072052001953, "learning_rate": 2.2731312541267145e-05, "loss": 4.2461, "step": 2049 }, { "epoch": 0.5294421487603306, "grad_norm": 2.6585214138031006, "learning_rate": 2.2711112940065747e-05, "loss": 4.4149, "step": 2050 }, { "epoch": 0.5297004132231405, "grad_norm": 1.7052085399627686, "learning_rate": 2.2690914845656625e-05, "loss": 4.261, "step": 2051 }, { "epoch": 0.5299586776859504, "grad_norm": 2.950557231903076, "learning_rate": 2.2670718271336355e-05, "loss": 3.8072, "step": 2052 }, { "epoch": 0.5302169421487604, "grad_norm": 3.190040111541748, "learning_rate": 2.2650523230400506e-05, "loss": 4.4589, "step": 2053 }, { "epoch": 0.5304752066115702, "grad_norm": 6.58162784576416, "learning_rate": 2.2630329736143614e-05, "loss": 3.9594, "step": 2054 }, { "epoch": 0.5307334710743802, "grad_norm": 3.9728708267211914, "learning_rate": 2.2610137801859237e-05, "loss": 3.7973, "step": 2055 }, { "epoch": 0.53099173553719, "grad_norm": 2.65495228767395, "learning_rate": 2.2589947440839853e-05, "loss": 4.2092, "step": 2056 }, { "epoch": 0.53125, "grad_norm": 2.335630178451538, "learning_rate": 2.256975866637697e-05, "loss": 3.9948, "step": 2057 }, { "epoch": 0.53150826446281, "grad_norm": 4.6577887535095215, "learning_rate": 2.2549571491760986e-05, "loss": 4.0094, "step": 2058 }, { "epoch": 0.5317665289256198, "grad_norm": 1.5628714561462402, "learning_rate": 2.252938593028129e-05, "loss": 4.5446, "step": 2059 }, { "epoch": 0.5320247933884298, "grad_norm": 2.098445177078247, "learning_rate": 2.250920199522621e-05, "loss": 4.1696, "step": 2060 }, { "epoch": 0.5322830578512396, "grad_norm": 2.1863439083099365, "learning_rate": 2.248901969988295e-05, "loss": 3.7209, "step": 2061 }, { "epoch": 0.5325413223140496, "grad_norm": 2.8241658210754395, "learning_rate": 2.246883905753772e-05, "loss": 4.2679, "step": 2062 }, { "epoch": 0.5327995867768595, "grad_norm": 3.407252311706543, "learning_rate": 2.2448660081475555e-05, "loss": 4.3639, "step": 2063 }, { "epoch": 0.5330578512396694, "grad_norm": 14.84586238861084, "learning_rate": 2.2428482784980457e-05, "loss": 3.9406, "step": 2064 }, { "epoch": 0.5333161157024794, "grad_norm": 4.281606197357178, "learning_rate": 2.2408307181335286e-05, "loss": 4.3173, "step": 2065 }, { "epoch": 0.5335743801652892, "grad_norm": 2.9959259033203125, "learning_rate": 2.23881332838218e-05, "loss": 3.853, "step": 2066 }, { "epoch": 0.5338326446280992, "grad_norm": 2.7823057174682617, "learning_rate": 2.236796110572065e-05, "loss": 4.7075, "step": 2067 }, { "epoch": 0.5340909090909091, "grad_norm": 3.1502320766448975, "learning_rate": 2.234779066031132e-05, "loss": 4.5225, "step": 2068 }, { "epoch": 0.534349173553719, "grad_norm": 2.5258002281188965, "learning_rate": 2.2327621960872187e-05, "loss": 3.8392, "step": 2069 }, { "epoch": 0.5346074380165289, "grad_norm": 3.3009307384490967, "learning_rate": 2.2307455020680445e-05, "loss": 3.8165, "step": 2070 }, { "epoch": 0.5348657024793388, "grad_norm": 2.4839577674865723, "learning_rate": 2.2287289853012172e-05, "loss": 3.8291, "step": 2071 }, { "epoch": 0.5351239669421488, "grad_norm": 2.2513580322265625, "learning_rate": 2.2267126471142236e-05, "loss": 4.0133, "step": 2072 }, { "epoch": 0.5353822314049587, "grad_norm": 3.325650691986084, "learning_rate": 2.2246964888344357e-05, "loss": 4.5229, "step": 2073 }, { "epoch": 0.5356404958677686, "grad_norm": 1.9438029527664185, "learning_rate": 2.222680511789107e-05, "loss": 3.594, "step": 2074 }, { "epoch": 0.5358987603305785, "grad_norm": 3.432126522064209, "learning_rate": 2.2206647173053697e-05, "loss": 3.7659, "step": 2075 }, { "epoch": 0.5361570247933884, "grad_norm": 2.700282335281372, "learning_rate": 2.218649106710238e-05, "loss": 3.4027, "step": 2076 }, { "epoch": 0.5364152892561983, "grad_norm": 2.3463828563690186, "learning_rate": 2.2166336813306037e-05, "loss": 3.7874, "step": 2077 }, { "epoch": 0.5366735537190083, "grad_norm": 2.347322463989258, "learning_rate": 2.2146184424932385e-05, "loss": 4.1885, "step": 2078 }, { "epoch": 0.5369318181818182, "grad_norm": 2.295785427093506, "learning_rate": 2.2126033915247866e-05, "loss": 3.7955, "step": 2079 }, { "epoch": 0.5371900826446281, "grad_norm": 2.8048830032348633, "learning_rate": 2.2105885297517748e-05, "loss": 4.2328, "step": 2080 }, { "epoch": 0.537448347107438, "grad_norm": 3.0383849143981934, "learning_rate": 2.2085738585006024e-05, "loss": 3.8683, "step": 2081 }, { "epoch": 0.5377066115702479, "grad_norm": 5.413688659667969, "learning_rate": 2.206559379097541e-05, "loss": 3.8323, "step": 2082 }, { "epoch": 0.5379648760330579, "grad_norm": 3.413266658782959, "learning_rate": 2.2045450928687412e-05, "loss": 3.9393, "step": 2083 }, { "epoch": 0.5382231404958677, "grad_norm": 3.166489839553833, "learning_rate": 2.2025310011402207e-05, "loss": 4.1072, "step": 2084 }, { "epoch": 0.5384814049586777, "grad_norm": 6.512360095977783, "learning_rate": 2.2005171052378733e-05, "loss": 3.9051, "step": 2085 }, { "epoch": 0.5387396694214877, "grad_norm": 2.1127331256866455, "learning_rate": 2.198503406487462e-05, "loss": 4.1373, "step": 2086 }, { "epoch": 0.5389979338842975, "grad_norm": 3.3756492137908936, "learning_rate": 2.1964899062146202e-05, "loss": 4.1196, "step": 2087 }, { "epoch": 0.5392561983471075, "grad_norm": 2.85337233543396, "learning_rate": 2.194476605744852e-05, "loss": 3.614, "step": 2088 }, { "epoch": 0.5395144628099173, "grad_norm": 2.259730339050293, "learning_rate": 2.1924635064035277e-05, "loss": 4.051, "step": 2089 }, { "epoch": 0.5397727272727273, "grad_norm": 3.031968832015991, "learning_rate": 2.1904506095158873e-05, "loss": 4.2913, "step": 2090 }, { "epoch": 0.5400309917355371, "grad_norm": 5.743002414703369, "learning_rate": 2.1884379164070352e-05, "loss": 2.9224, "step": 2091 }, { "epoch": 0.5402892561983471, "grad_norm": 2.8635876178741455, "learning_rate": 2.1864254284019452e-05, "loss": 4.1359, "step": 2092 }, { "epoch": 0.5405475206611571, "grad_norm": 7.5449934005737305, "learning_rate": 2.1844131468254516e-05, "loss": 4.3356, "step": 2093 }, { "epoch": 0.5408057851239669, "grad_norm": 3.427532196044922, "learning_rate": 2.1824010730022566e-05, "loss": 4.2577, "step": 2094 }, { "epoch": 0.5410640495867769, "grad_norm": 3.2952358722686768, "learning_rate": 2.1803892082569237e-05, "loss": 3.9295, "step": 2095 }, { "epoch": 0.5413223140495868, "grad_norm": 2.383007287979126, "learning_rate": 2.1783775539138793e-05, "loss": 4.1074, "step": 2096 }, { "epoch": 0.5415805785123967, "grad_norm": 4.535576820373535, "learning_rate": 2.1763661112974117e-05, "loss": 3.5806, "step": 2097 }, { "epoch": 0.5418388429752066, "grad_norm": 3.5958423614501953, "learning_rate": 2.1743548817316684e-05, "loss": 3.4838, "step": 2098 }, { "epoch": 0.5420971074380165, "grad_norm": 2.1547582149505615, "learning_rate": 2.1723438665406586e-05, "loss": 3.8694, "step": 2099 }, { "epoch": 0.5423553719008265, "grad_norm": 2.9033498764038086, "learning_rate": 2.1703330670482477e-05, "loss": 3.9705, "step": 2100 }, { "epoch": 0.5426136363636364, "grad_norm": 1.783887505531311, "learning_rate": 2.1683224845781624e-05, "loss": 4.1368, "step": 2101 }, { "epoch": 0.5428719008264463, "grad_norm": 1.897125482559204, "learning_rate": 2.1663121204539853e-05, "loss": 3.7772, "step": 2102 }, { "epoch": 0.5431301652892562, "grad_norm": 2.7667105197906494, "learning_rate": 2.1643019759991524e-05, "loss": 3.7551, "step": 2103 }, { "epoch": 0.5433884297520661, "grad_norm": 2.7878212928771973, "learning_rate": 2.1622920525369604e-05, "loss": 3.6856, "step": 2104 }, { "epoch": 0.543646694214876, "grad_norm": 2.3180243968963623, "learning_rate": 2.1602823513905554e-05, "loss": 3.9076, "step": 2105 }, { "epoch": 0.543904958677686, "grad_norm": 2.930795192718506, "learning_rate": 2.1582728738829407e-05, "loss": 3.8115, "step": 2106 }, { "epoch": 0.5441632231404959, "grad_norm": 2.4990174770355225, "learning_rate": 2.1562636213369704e-05, "loss": 4.1309, "step": 2107 }, { "epoch": 0.5444214876033058, "grad_norm": 3.112484931945801, "learning_rate": 2.154254595075351e-05, "loss": 4.3971, "step": 2108 }, { "epoch": 0.5446797520661157, "grad_norm": 1.6754344701766968, "learning_rate": 2.152245796420642e-05, "loss": 3.9505, "step": 2109 }, { "epoch": 0.5449380165289256, "grad_norm": 3.0708210468292236, "learning_rate": 2.1502372266952495e-05, "loss": 3.6083, "step": 2110 }, { "epoch": 0.5451962809917356, "grad_norm": 3.860401153564453, "learning_rate": 2.1482288872214315e-05, "loss": 4.023, "step": 2111 }, { "epoch": 0.5454545454545454, "grad_norm": 2.833482265472412, "learning_rate": 2.146220779321293e-05, "loss": 4.6072, "step": 2112 }, { "epoch": 0.5457128099173554, "grad_norm": 2.2243456840515137, "learning_rate": 2.1442129043167874e-05, "loss": 3.8242, "step": 2113 }, { "epoch": 0.5459710743801653, "grad_norm": 6.762054443359375, "learning_rate": 2.1422052635297145e-05, "loss": 4.2401, "step": 2114 }, { "epoch": 0.5462293388429752, "grad_norm": 2.017421245574951, "learning_rate": 2.1401978582817204e-05, "loss": 4.1307, "step": 2115 }, { "epoch": 0.5464876033057852, "grad_norm": 2.260934591293335, "learning_rate": 2.1381906898942957e-05, "loss": 4.0784, "step": 2116 }, { "epoch": 0.546745867768595, "grad_norm": 2.0512888431549072, "learning_rate": 2.1361837596887744e-05, "loss": 4.4971, "step": 2117 }, { "epoch": 0.547004132231405, "grad_norm": 4.443727493286133, "learning_rate": 2.134177068986335e-05, "loss": 3.8771, "step": 2118 }, { "epoch": 0.5472623966942148, "grad_norm": 2.646944284439087, "learning_rate": 2.132170619107997e-05, "loss": 3.8036, "step": 2119 }, { "epoch": 0.5475206611570248, "grad_norm": 3.390573501586914, "learning_rate": 2.1301644113746238e-05, "loss": 4.4733, "step": 2120 }, { "epoch": 0.5477789256198347, "grad_norm": 2.0315675735473633, "learning_rate": 2.128158447106915e-05, "loss": 4.1223, "step": 2121 }, { "epoch": 0.5480371900826446, "grad_norm": 3.043318748474121, "learning_rate": 2.126152727625415e-05, "loss": 4.2112, "step": 2122 }, { "epoch": 0.5482954545454546, "grad_norm": 3.5445799827575684, "learning_rate": 2.1241472542505026e-05, "loss": 4.013, "step": 2123 }, { "epoch": 0.5485537190082644, "grad_norm": 2.1088123321533203, "learning_rate": 2.1221420283023982e-05, "loss": 4.2043, "step": 2124 }, { "epoch": 0.5488119834710744, "grad_norm": 6.508898735046387, "learning_rate": 2.120137051101158e-05, "loss": 3.9264, "step": 2125 }, { "epoch": 0.5490702479338843, "grad_norm": 4.01951265335083, "learning_rate": 2.1181323239666722e-05, "loss": 4.0157, "step": 2126 }, { "epoch": 0.5493285123966942, "grad_norm": 5.376931190490723, "learning_rate": 2.1161278482186712e-05, "loss": 3.4664, "step": 2127 }, { "epoch": 0.5495867768595041, "grad_norm": 3.2179553508758545, "learning_rate": 2.1141236251767142e-05, "loss": 3.7533, "step": 2128 }, { "epoch": 0.549845041322314, "grad_norm": 2.422914981842041, "learning_rate": 2.1121196561601993e-05, "loss": 3.6765, "step": 2129 }, { "epoch": 0.550103305785124, "grad_norm": 3.322899580001831, "learning_rate": 2.1101159424883532e-05, "loss": 3.697, "step": 2130 }, { "epoch": 0.5503615702479339, "grad_norm": 2.778306007385254, "learning_rate": 2.108112485480237e-05, "loss": 4.4249, "step": 2131 }, { "epoch": 0.5506198347107438, "grad_norm": 9.042973518371582, "learning_rate": 2.106109286454743e-05, "loss": 4.1947, "step": 2132 }, { "epoch": 0.5508780991735537, "grad_norm": 2.377422571182251, "learning_rate": 2.104106346730591e-05, "loss": 3.9665, "step": 2133 }, { "epoch": 0.5511363636363636, "grad_norm": 1.7801884412765503, "learning_rate": 2.102103667626334e-05, "loss": 4.3922, "step": 2134 }, { "epoch": 0.5513946280991735, "grad_norm": 1.5145620107650757, "learning_rate": 2.1001012504603497e-05, "loss": 3.8435, "step": 2135 }, { "epoch": 0.5516528925619835, "grad_norm": 3.93962025642395, "learning_rate": 2.0980990965508463e-05, "loss": 3.878, "step": 2136 }, { "epoch": 0.5519111570247934, "grad_norm": 2.3497326374053955, "learning_rate": 2.096097207215856e-05, "loss": 4.2825, "step": 2137 }, { "epoch": 0.5521694214876033, "grad_norm": 2.121587038040161, "learning_rate": 2.094095583773239e-05, "loss": 4.4138, "step": 2138 }, { "epoch": 0.5524276859504132, "grad_norm": 5.4650959968566895, "learning_rate": 2.0920942275406796e-05, "loss": 4.2339, "step": 2139 }, { "epoch": 0.5526859504132231, "grad_norm": 3.2831575870513916, "learning_rate": 2.0900931398356864e-05, "loss": 3.8855, "step": 2140 }, { "epoch": 0.5529442148760331, "grad_norm": 2.3868441581726074, "learning_rate": 2.0880923219755917e-05, "loss": 3.955, "step": 2141 }, { "epoch": 0.5532024793388429, "grad_norm": 3.1645398139953613, "learning_rate": 2.086091775277548e-05, "loss": 4.1889, "step": 2142 }, { "epoch": 0.5534607438016529, "grad_norm": 2.3450798988342285, "learning_rate": 2.0840915010585333e-05, "loss": 4.6479, "step": 2143 }, { "epoch": 0.5537190082644629, "grad_norm": 2.4047155380249023, "learning_rate": 2.0820915006353405e-05, "loss": 4.1702, "step": 2144 }, { "epoch": 0.5539772727272727, "grad_norm": 2.652066946029663, "learning_rate": 2.0800917753245877e-05, "loss": 3.8863, "step": 2145 }, { "epoch": 0.5542355371900827, "grad_norm": 1.8701763153076172, "learning_rate": 2.0780923264427107e-05, "loss": 4.1777, "step": 2146 }, { "epoch": 0.5544938016528925, "grad_norm": 2.8368287086486816, "learning_rate": 2.0760931553059593e-05, "loss": 4.0622, "step": 2147 }, { "epoch": 0.5547520661157025, "grad_norm": 2.5828654766082764, "learning_rate": 2.074094263230407e-05, "loss": 4.0131, "step": 2148 }, { "epoch": 0.5550103305785123, "grad_norm": 2.7674684524536133, "learning_rate": 2.072095651531937e-05, "loss": 4.2097, "step": 2149 }, { "epoch": 0.5552685950413223, "grad_norm": 5.386947154998779, "learning_rate": 2.0700973215262534e-05, "loss": 4.1477, "step": 2150 }, { "epoch": 0.5555268595041323, "grad_norm": 5.670682907104492, "learning_rate": 2.0680992745288707e-05, "loss": 4.5019, "step": 2151 }, { "epoch": 0.5557851239669421, "grad_norm": 4.41996955871582, "learning_rate": 2.06610151185512e-05, "loss": 4.0582, "step": 2152 }, { "epoch": 0.5560433884297521, "grad_norm": 2.773594856262207, "learning_rate": 2.0641040348201442e-05, "loss": 4.5221, "step": 2153 }, { "epoch": 0.556301652892562, "grad_norm": 5.652645111083984, "learning_rate": 2.0621068447388973e-05, "loss": 2.983, "step": 2154 }, { "epoch": 0.5565599173553719, "grad_norm": 4.750703811645508, "learning_rate": 2.060109942926146e-05, "loss": 3.8737, "step": 2155 }, { "epoch": 0.5568181818181818, "grad_norm": 6.256467819213867, "learning_rate": 2.0581133306964658e-05, "loss": 3.0724, "step": 2156 }, { "epoch": 0.5570764462809917, "grad_norm": 2.127408027648926, "learning_rate": 2.0561170093642423e-05, "loss": 4.4542, "step": 2157 }, { "epoch": 0.5573347107438017, "grad_norm": 2.733389377593994, "learning_rate": 2.0541209802436693e-05, "loss": 4.0661, "step": 2158 }, { "epoch": 0.5575929752066116, "grad_norm": 5.221953392028809, "learning_rate": 2.0521252446487487e-05, "loss": 4.3345, "step": 2159 }, { "epoch": 0.5578512396694215, "grad_norm": 2.575435161590576, "learning_rate": 2.0501298038932898e-05, "loss": 3.9733, "step": 2160 }, { "epoch": 0.5581095041322314, "grad_norm": 2.3399317264556885, "learning_rate": 2.0481346592909052e-05, "loss": 4.1176, "step": 2161 }, { "epoch": 0.5583677685950413, "grad_norm": 3.8443291187286377, "learning_rate": 2.046139812155015e-05, "loss": 4.1893, "step": 2162 }, { "epoch": 0.5586260330578512, "grad_norm": 1.8030303716659546, "learning_rate": 2.0441452637988427e-05, "loss": 3.8392, "step": 2163 }, { "epoch": 0.5588842975206612, "grad_norm": 4.658824443817139, "learning_rate": 2.042151015535416e-05, "loss": 3.9973, "step": 2164 }, { "epoch": 0.5591425619834711, "grad_norm": 2.2627904415130615, "learning_rate": 2.0401570686775617e-05, "loss": 4.2316, "step": 2165 }, { "epoch": 0.559400826446281, "grad_norm": 2.802483320236206, "learning_rate": 2.0381634245379124e-05, "loss": 3.8598, "step": 2166 }, { "epoch": 0.5596590909090909, "grad_norm": 3.9154064655303955, "learning_rate": 2.036170084428901e-05, "loss": 3.9761, "step": 2167 }, { "epoch": 0.5599173553719008, "grad_norm": 2.488293409347534, "learning_rate": 2.0341770496627553e-05, "loss": 4.27, "step": 2168 }, { "epoch": 0.5601756198347108, "grad_norm": 2.4951179027557373, "learning_rate": 2.0321843215515096e-05, "loss": 3.9327, "step": 2169 }, { "epoch": 0.5604338842975206, "grad_norm": 3.2368781566619873, "learning_rate": 2.0301919014069893e-05, "loss": 3.7643, "step": 2170 }, { "epoch": 0.5606921487603306, "grad_norm": 3.2761354446411133, "learning_rate": 2.0281997905408224e-05, "loss": 3.8106, "step": 2171 }, { "epoch": 0.5609504132231405, "grad_norm": 3.316318988800049, "learning_rate": 2.0262079902644292e-05, "loss": 4.2177, "step": 2172 }, { "epoch": 0.5612086776859504, "grad_norm": 2.4586586952209473, "learning_rate": 2.024216501889028e-05, "loss": 3.6985, "step": 2173 }, { "epoch": 0.5614669421487604, "grad_norm": 3.4111905097961426, "learning_rate": 2.0222253267256333e-05, "loss": 4.1422, "step": 2174 }, { "epoch": 0.5617252066115702, "grad_norm": 2.3756275177001953, "learning_rate": 2.0202344660850486e-05, "loss": 3.7288, "step": 2175 }, { "epoch": 0.5619834710743802, "grad_norm": 5.721994876861572, "learning_rate": 2.018243921277874e-05, "loss": 4.0894, "step": 2176 }, { "epoch": 0.56224173553719, "grad_norm": 4.040802955627441, "learning_rate": 2.016253693614501e-05, "loss": 4.4051, "step": 2177 }, { "epoch": 0.5625, "grad_norm": 2.571516275405884, "learning_rate": 2.014263784405112e-05, "loss": 4.2451, "step": 2178 }, { "epoch": 0.56275826446281, "grad_norm": 2.5260887145996094, "learning_rate": 2.0122741949596797e-05, "loss": 4.0366, "step": 2179 }, { "epoch": 0.5630165289256198, "grad_norm": 2.837096691131592, "learning_rate": 2.010284926587966e-05, "loss": 3.5373, "step": 2180 }, { "epoch": 0.5632747933884298, "grad_norm": 3.3041112422943115, "learning_rate": 2.0082959805995226e-05, "loss": 3.7266, "step": 2181 }, { "epoch": 0.5635330578512396, "grad_norm": 3.08404541015625, "learning_rate": 2.0063073583036878e-05, "loss": 4.228, "step": 2182 }, { "epoch": 0.5637913223140496, "grad_norm": 3.236569404602051, "learning_rate": 2.0043190610095878e-05, "loss": 4.1651, "step": 2183 }, { "epoch": 0.5640495867768595, "grad_norm": 8.235170364379883, "learning_rate": 2.0023310900261334e-05, "loss": 4.7798, "step": 2184 }, { "epoch": 0.5643078512396694, "grad_norm": 2.3345072269439697, "learning_rate": 2.0003434466620224e-05, "loss": 4.402, "step": 2185 }, { "epoch": 0.5645661157024794, "grad_norm": 4.060367107391357, "learning_rate": 1.9983561322257344e-05, "loss": 4.0952, "step": 2186 }, { "epoch": 0.5648243801652892, "grad_norm": 1.6631745100021362, "learning_rate": 1.9963691480255357e-05, "loss": 3.3974, "step": 2187 }, { "epoch": 0.5650826446280992, "grad_norm": 2.4966559410095215, "learning_rate": 1.9943824953694734e-05, "loss": 3.667, "step": 2188 }, { "epoch": 0.5653409090909091, "grad_norm": 2.0368990898132324, "learning_rate": 1.992396175565376e-05, "loss": 3.2835, "step": 2189 }, { "epoch": 0.565599173553719, "grad_norm": 3.0432450771331787, "learning_rate": 1.990410189920854e-05, "loss": 3.7796, "step": 2190 }, { "epoch": 0.5658574380165289, "grad_norm": 2.9783811569213867, "learning_rate": 1.9884245397432956e-05, "loss": 4.5825, "step": 2191 }, { "epoch": 0.5661157024793388, "grad_norm": 3.0814547538757324, "learning_rate": 1.9864392263398726e-05, "loss": 3.9803, "step": 2192 }, { "epoch": 0.5663739669421488, "grad_norm": 3.494887590408325, "learning_rate": 1.98445425101753e-05, "loss": 3.599, "step": 2193 }, { "epoch": 0.5666322314049587, "grad_norm": 3.5893635749816895, "learning_rate": 1.982469615082993e-05, "loss": 4.5638, "step": 2194 }, { "epoch": 0.5668904958677686, "grad_norm": 2.9300010204315186, "learning_rate": 1.9804853198427648e-05, "loss": 4.1573, "step": 2195 }, { "epoch": 0.5671487603305785, "grad_norm": 3.701960325241089, "learning_rate": 1.9785013666031205e-05, "loss": 3.7823, "step": 2196 }, { "epoch": 0.5674070247933884, "grad_norm": 1.9410642385482788, "learning_rate": 1.9765177566701128e-05, "loss": 3.783, "step": 2197 }, { "epoch": 0.5676652892561983, "grad_norm": 2.8512678146362305, "learning_rate": 1.9745344913495675e-05, "loss": 3.9125, "step": 2198 }, { "epoch": 0.5679235537190083, "grad_norm": 2.8550798892974854, "learning_rate": 1.972551571947084e-05, "loss": 4.084, "step": 2199 }, { "epoch": 0.5681818181818182, "grad_norm": 3.8925414085388184, "learning_rate": 1.9705689997680332e-05, "loss": 4.0999, "step": 2200 }, { "epoch": 0.5684400826446281, "grad_norm": 4.6611714363098145, "learning_rate": 1.9685867761175584e-05, "loss": 4.1946, "step": 2201 }, { "epoch": 0.568698347107438, "grad_norm": 2.5305678844451904, "learning_rate": 1.9666049023005732e-05, "loss": 4.6467, "step": 2202 }, { "epoch": 0.5689566115702479, "grad_norm": 2.940437078475952, "learning_rate": 1.9646233796217596e-05, "loss": 4.1689, "step": 2203 }, { "epoch": 0.5692148760330579, "grad_norm": 4.009382247924805, "learning_rate": 1.9626422093855705e-05, "loss": 3.79, "step": 2204 }, { "epoch": 0.5694731404958677, "grad_norm": 2.348818063735962, "learning_rate": 1.9606613928962248e-05, "loss": 4.075, "step": 2205 }, { "epoch": 0.5697314049586777, "grad_norm": 2.974756956100464, "learning_rate": 1.9586809314577108e-05, "loss": 3.7744, "step": 2206 }, { "epoch": 0.5699896694214877, "grad_norm": 2.7221362590789795, "learning_rate": 1.9567008263737806e-05, "loss": 4.2378, "step": 2207 }, { "epoch": 0.5702479338842975, "grad_norm": 2.1880557537078857, "learning_rate": 1.9547210789479537e-05, "loss": 3.7308, "step": 2208 }, { "epoch": 0.5705061983471075, "grad_norm": 3.4069883823394775, "learning_rate": 1.9527416904835132e-05, "loss": 3.9534, "step": 2209 }, { "epoch": 0.5707644628099173, "grad_norm": 1.951165795326233, "learning_rate": 1.9507626622835053e-05, "loss": 3.8723, "step": 2210 }, { "epoch": 0.5710227272727273, "grad_norm": 4.607691287994385, "learning_rate": 1.9487839956507416e-05, "loss": 4.5672, "step": 2211 }, { "epoch": 0.5712809917355371, "grad_norm": 4.301609992980957, "learning_rate": 1.946805691887791e-05, "loss": 3.7064, "step": 2212 }, { "epoch": 0.5715392561983471, "grad_norm": 2.8235957622528076, "learning_rate": 1.9448277522969897e-05, "loss": 3.7785, "step": 2213 }, { "epoch": 0.5717975206611571, "grad_norm": 2.610536813735962, "learning_rate": 1.942850178180428e-05, "loss": 4.185, "step": 2214 }, { "epoch": 0.5720557851239669, "grad_norm": 3.633051633834839, "learning_rate": 1.9408729708399585e-05, "loss": 4.2462, "step": 2215 }, { "epoch": 0.5723140495867769, "grad_norm": 3.288809299468994, "learning_rate": 1.9388961315771955e-05, "loss": 4.102, "step": 2216 }, { "epoch": 0.5725723140495868, "grad_norm": 2.062317132949829, "learning_rate": 1.9369196616935044e-05, "loss": 3.8024, "step": 2217 }, { "epoch": 0.5728305785123967, "grad_norm": 3.4712440967559814, "learning_rate": 1.9349435624900126e-05, "loss": 3.6235, "step": 2218 }, { "epoch": 0.5730888429752066, "grad_norm": 3.3351693153381348, "learning_rate": 1.9329678352676003e-05, "loss": 3.7668, "step": 2219 }, { "epoch": 0.5733471074380165, "grad_norm": 2.5271973609924316, "learning_rate": 1.930992481326906e-05, "loss": 3.7174, "step": 2220 }, { "epoch": 0.5736053719008265, "grad_norm": 2.9140005111694336, "learning_rate": 1.9290175019683188e-05, "loss": 4.2598, "step": 2221 }, { "epoch": 0.5738636363636364, "grad_norm": 2.041261911392212, "learning_rate": 1.927042898491984e-05, "loss": 3.7365, "step": 2222 }, { "epoch": 0.5741219008264463, "grad_norm": 2.4056966304779053, "learning_rate": 1.925068672197799e-05, "loss": 3.4784, "step": 2223 }, { "epoch": 0.5743801652892562, "grad_norm": 3.442289352416992, "learning_rate": 1.9230948243854115e-05, "loss": 4.403, "step": 2224 }, { "epoch": 0.5746384297520661, "grad_norm": 3.8329217433929443, "learning_rate": 1.921121356354222e-05, "loss": 4.4816, "step": 2225 }, { "epoch": 0.574896694214876, "grad_norm": 3.4969322681427, "learning_rate": 1.9191482694033787e-05, "loss": 4.2207, "step": 2226 }, { "epoch": 0.575154958677686, "grad_norm": 5.416886329650879, "learning_rate": 1.9171755648317817e-05, "loss": 4.4145, "step": 2227 }, { "epoch": 0.5754132231404959, "grad_norm": 2.008687734603882, "learning_rate": 1.9152032439380764e-05, "loss": 3.9269, "step": 2228 }, { "epoch": 0.5756714876033058, "grad_norm": 2.3947935104370117, "learning_rate": 1.9132313080206575e-05, "loss": 4.0225, "step": 2229 }, { "epoch": 0.5759297520661157, "grad_norm": 3.6828510761260986, "learning_rate": 1.911259758377667e-05, "loss": 4.0956, "step": 2230 }, { "epoch": 0.5761880165289256, "grad_norm": 3.776862382888794, "learning_rate": 1.9092885963069902e-05, "loss": 3.9504, "step": 2231 }, { "epoch": 0.5764462809917356, "grad_norm": 3.099039316177368, "learning_rate": 1.90731782310626e-05, "loss": 3.9032, "step": 2232 }, { "epoch": 0.5767045454545454, "grad_norm": 3.128770351409912, "learning_rate": 1.905347440072849e-05, "loss": 4.5402, "step": 2233 }, { "epoch": 0.5769628099173554, "grad_norm": 2.8161051273345947, "learning_rate": 1.9033774485038794e-05, "loss": 4.0798, "step": 2234 }, { "epoch": 0.5772210743801653, "grad_norm": 2.45316481590271, "learning_rate": 1.901407849696209e-05, "loss": 4.0377, "step": 2235 }, { "epoch": 0.5774793388429752, "grad_norm": 2.978213310241699, "learning_rate": 1.8994386449464407e-05, "loss": 4.0052, "step": 2236 }, { "epoch": 0.5777376033057852, "grad_norm": 3.4868674278259277, "learning_rate": 1.8974698355509202e-05, "loss": 4.4435, "step": 2237 }, { "epoch": 0.577995867768595, "grad_norm": 2.906374454498291, "learning_rate": 1.895501422805726e-05, "loss": 3.9298, "step": 2238 }, { "epoch": 0.578254132231405, "grad_norm": 4.697603702545166, "learning_rate": 1.8935334080066834e-05, "loss": 3.9426, "step": 2239 }, { "epoch": 0.5785123966942148, "grad_norm": 5.012669563293457, "learning_rate": 1.8915657924493495e-05, "loss": 4.2455, "step": 2240 }, { "epoch": 0.5787706611570248, "grad_norm": 3.3224472999572754, "learning_rate": 1.889598577429022e-05, "loss": 3.9938, "step": 2241 }, { "epoch": 0.5790289256198347, "grad_norm": 3.221034049987793, "learning_rate": 1.8876317642407335e-05, "loss": 4.6931, "step": 2242 }, { "epoch": 0.5792871900826446, "grad_norm": 2.6499240398406982, "learning_rate": 1.8856653541792532e-05, "loss": 3.4447, "step": 2243 }, { "epoch": 0.5795454545454546, "grad_norm": 1.8786113262176514, "learning_rate": 1.8836993485390835e-05, "loss": 4.091, "step": 2244 }, { "epoch": 0.5798037190082644, "grad_norm": 4.141083717346191, "learning_rate": 1.881733748614461e-05, "loss": 3.8766, "step": 2245 }, { "epoch": 0.5800619834710744, "grad_norm": 2.0331175327301025, "learning_rate": 1.8797685556993576e-05, "loss": 3.5554, "step": 2246 }, { "epoch": 0.5803202479338843, "grad_norm": 2.2101058959960938, "learning_rate": 1.8778037710874725e-05, "loss": 4.5707, "step": 2247 }, { "epoch": 0.5805785123966942, "grad_norm": 2.7982230186462402, "learning_rate": 1.875839396072241e-05, "loss": 3.689, "step": 2248 }, { "epoch": 0.5808367768595041, "grad_norm": 2.5229599475860596, "learning_rate": 1.8738754319468256e-05, "loss": 3.8036, "step": 2249 }, { "epoch": 0.581095041322314, "grad_norm": 5.517892837524414, "learning_rate": 1.8719118800041197e-05, "loss": 4.4017, "step": 2250 }, { "epoch": 0.581353305785124, "grad_norm": 4.21243143081665, "learning_rate": 1.8699487415367434e-05, "loss": 4.3296, "step": 2251 }, { "epoch": 0.5816115702479339, "grad_norm": 4.15452766418457, "learning_rate": 1.8679860178370473e-05, "loss": 4.4377, "step": 2252 }, { "epoch": 0.5818698347107438, "grad_norm": 3.5320396423339844, "learning_rate": 1.8660237101971087e-05, "loss": 4.1963, "step": 2253 }, { "epoch": 0.5821280991735537, "grad_norm": 3.3216946125030518, "learning_rate": 1.8640618199087286e-05, "loss": 3.9362, "step": 2254 }, { "epoch": 0.5823863636363636, "grad_norm": 6.466681957244873, "learning_rate": 1.8621003482634364e-05, "loss": 3.2974, "step": 2255 }, { "epoch": 0.5826446280991735, "grad_norm": 4.730312347412109, "learning_rate": 1.8601392965524816e-05, "loss": 3.9195, "step": 2256 }, { "epoch": 0.5829028925619835, "grad_norm": 2.3328497409820557, "learning_rate": 1.8581786660668434e-05, "loss": 3.9626, "step": 2257 }, { "epoch": 0.5831611570247934, "grad_norm": 4.305302143096924, "learning_rate": 1.8562184580972163e-05, "loss": 4.3235, "step": 2258 }, { "epoch": 0.5834194214876033, "grad_norm": 3.581531524658203, "learning_rate": 1.854258673934023e-05, "loss": 4.0817, "step": 2259 }, { "epoch": 0.5836776859504132, "grad_norm": 2.37381649017334, "learning_rate": 1.8522993148674055e-05, "loss": 4.5148, "step": 2260 }, { "epoch": 0.5839359504132231, "grad_norm": 4.026546955108643, "learning_rate": 1.8503403821872228e-05, "loss": 4.2097, "step": 2261 }, { "epoch": 0.5841942148760331, "grad_norm": 3.8212454319000244, "learning_rate": 1.848381877183058e-05, "loss": 3.9587, "step": 2262 }, { "epoch": 0.5844524793388429, "grad_norm": 2.3206300735473633, "learning_rate": 1.846423801144208e-05, "loss": 3.9937, "step": 2263 }, { "epoch": 0.5847107438016529, "grad_norm": 4.079465389251709, "learning_rate": 1.844466155359692e-05, "loss": 4.1938, "step": 2264 }, { "epoch": 0.5849690082644629, "grad_norm": 3.1588265895843506, "learning_rate": 1.8425089411182414e-05, "loss": 3.8224, "step": 2265 }, { "epoch": 0.5852272727272727, "grad_norm": 5.135882377624512, "learning_rate": 1.8405521597083068e-05, "loss": 3.4313, "step": 2266 }, { "epoch": 0.5854855371900827, "grad_norm": 2.864413022994995, "learning_rate": 1.838595812418053e-05, "loss": 3.8241, "step": 2267 }, { "epoch": 0.5857438016528925, "grad_norm": 3.884568214416504, "learning_rate": 1.836639900535358e-05, "loss": 3.8654, "step": 2268 }, { "epoch": 0.5860020661157025, "grad_norm": 2.481536865234375, "learning_rate": 1.834684425347814e-05, "loss": 3.9641, "step": 2269 }, { "epoch": 0.5862603305785123, "grad_norm": 2.05098295211792, "learning_rate": 1.832729388142726e-05, "loss": 4.0784, "step": 2270 }, { "epoch": 0.5865185950413223, "grad_norm": 2.2716503143310547, "learning_rate": 1.8307747902071105e-05, "loss": 3.9294, "step": 2271 }, { "epoch": 0.5867768595041323, "grad_norm": 4.062479496002197, "learning_rate": 1.828820632827694e-05, "loss": 3.6688, "step": 2272 }, { "epoch": 0.5870351239669421, "grad_norm": 3.1173758506774902, "learning_rate": 1.8268669172909137e-05, "loss": 4.1798, "step": 2273 }, { "epoch": 0.5872933884297521, "grad_norm": 2.041224718093872, "learning_rate": 1.8249136448829165e-05, "loss": 3.8381, "step": 2274 }, { "epoch": 0.587551652892562, "grad_norm": 4.5398454666137695, "learning_rate": 1.8229608168895562e-05, "loss": 3.7408, "step": 2275 }, { "epoch": 0.5878099173553719, "grad_norm": 4.630671501159668, "learning_rate": 1.821008434596396e-05, "loss": 3.8276, "step": 2276 }, { "epoch": 0.5880681818181818, "grad_norm": 3.3699328899383545, "learning_rate": 1.819056499288702e-05, "loss": 4.7188, "step": 2277 }, { "epoch": 0.5883264462809917, "grad_norm": 3.3578271865844727, "learning_rate": 1.8171050122514516e-05, "loss": 4.315, "step": 2278 }, { "epoch": 0.5885847107438017, "grad_norm": 2.784668445587158, "learning_rate": 1.8151539747693215e-05, "loss": 4.3637, "step": 2279 }, { "epoch": 0.5888429752066116, "grad_norm": 2.7371859550476074, "learning_rate": 1.8132033881266948e-05, "loss": 3.9615, "step": 2280 }, { "epoch": 0.5891012396694215, "grad_norm": 3.677658796310425, "learning_rate": 1.8112532536076613e-05, "loss": 3.881, "step": 2281 }, { "epoch": 0.5893595041322314, "grad_norm": 3.677386999130249, "learning_rate": 1.809303572496006e-05, "loss": 3.7813, "step": 2282 }, { "epoch": 0.5896177685950413, "grad_norm": 4.320640563964844, "learning_rate": 1.807354346075222e-05, "loss": 4.0881, "step": 2283 }, { "epoch": 0.5898760330578512, "grad_norm": 1.7659944295883179, "learning_rate": 1.8054055756284986e-05, "loss": 3.8243, "step": 2284 }, { "epoch": 0.5901342975206612, "grad_norm": 2.7172458171844482, "learning_rate": 1.8034572624387276e-05, "loss": 4.4989, "step": 2285 }, { "epoch": 0.5903925619834711, "grad_norm": 3.1327197551727295, "learning_rate": 1.8015094077884974e-05, "loss": 3.4441, "step": 2286 }, { "epoch": 0.590650826446281, "grad_norm": 7.34801721572876, "learning_rate": 1.799562012960098e-05, "loss": 3.9818, "step": 2287 }, { "epoch": 0.5909090909090909, "grad_norm": 1.850011944770813, "learning_rate": 1.7976150792355137e-05, "loss": 4.4204, "step": 2288 }, { "epoch": 0.5911673553719008, "grad_norm": 1.7189111709594727, "learning_rate": 1.795668607896426e-05, "loss": 4.4247, "step": 2289 }, { "epoch": 0.5914256198347108, "grad_norm": 2.394246816635132, "learning_rate": 1.7937226002242126e-05, "loss": 3.95, "step": 2290 }, { "epoch": 0.5916838842975206, "grad_norm": 8.844282150268555, "learning_rate": 1.7917770574999454e-05, "loss": 3.2642, "step": 2291 }, { "epoch": 0.5919421487603306, "grad_norm": 3.5724618434906006, "learning_rate": 1.789831981004391e-05, "loss": 4.0963, "step": 2292 }, { "epoch": 0.5922004132231405, "grad_norm": 2.911865711212158, "learning_rate": 1.7878873720180072e-05, "loss": 3.9848, "step": 2293 }, { "epoch": 0.5924586776859504, "grad_norm": 3.2339344024658203, "learning_rate": 1.7859432318209463e-05, "loss": 3.9967, "step": 2294 }, { "epoch": 0.5927169421487604, "grad_norm": 2.0651049613952637, "learning_rate": 1.7839995616930516e-05, "loss": 3.9213, "step": 2295 }, { "epoch": 0.5929752066115702, "grad_norm": 1.6257367134094238, "learning_rate": 1.782056362913855e-05, "loss": 4.0626, "step": 2296 }, { "epoch": 0.5932334710743802, "grad_norm": 2.2014119625091553, "learning_rate": 1.780113636762581e-05, "loss": 4.1007, "step": 2297 }, { "epoch": 0.59349173553719, "grad_norm": 5.113912582397461, "learning_rate": 1.7781713845181393e-05, "loss": 2.3965, "step": 2298 }, { "epoch": 0.59375, "grad_norm": 3.390679121017456, "learning_rate": 1.7762296074591324e-05, "loss": 4.0056, "step": 2299 }, { "epoch": 0.59400826446281, "grad_norm": 3.4651570320129395, "learning_rate": 1.7742883068638447e-05, "loss": 3.3384, "step": 2300 }, { "epoch": 0.5942665289256198, "grad_norm": 9.469547271728516, "learning_rate": 1.7723474840102506e-05, "loss": 4.3744, "step": 2301 }, { "epoch": 0.5945247933884298, "grad_norm": 3.221461772918701, "learning_rate": 1.770407140176011e-05, "loss": 4.104, "step": 2302 }, { "epoch": 0.5947830578512396, "grad_norm": 2.791409969329834, "learning_rate": 1.7684672766384657e-05, "loss": 4.0702, "step": 2303 }, { "epoch": 0.5950413223140496, "grad_norm": 2.6922590732574463, "learning_rate": 1.766527894674646e-05, "loss": 4.3209, "step": 2304 }, { "epoch": 0.5952995867768595, "grad_norm": 2.784687042236328, "learning_rate": 1.7645889955612593e-05, "loss": 3.9176, "step": 2305 }, { "epoch": 0.5955578512396694, "grad_norm": 3.102848529815674, "learning_rate": 1.7626505805746994e-05, "loss": 4.0415, "step": 2306 }, { "epoch": 0.5958161157024794, "grad_norm": 2.9058754444122314, "learning_rate": 1.76071265099104e-05, "loss": 4.5335, "step": 2307 }, { "epoch": 0.5960743801652892, "grad_norm": 1.817257285118103, "learning_rate": 1.7587752080860343e-05, "loss": 4.1754, "step": 2308 }, { "epoch": 0.5963326446280992, "grad_norm": 5.319660663604736, "learning_rate": 1.756838253135118e-05, "loss": 3.9265, "step": 2309 }, { "epoch": 0.5965909090909091, "grad_norm": 3.2738583087921143, "learning_rate": 1.7549017874134014e-05, "loss": 4.0954, "step": 2310 }, { "epoch": 0.596849173553719, "grad_norm": 3.1948940753936768, "learning_rate": 1.7529658121956777e-05, "loss": 3.5776, "step": 2311 }, { "epoch": 0.5971074380165289, "grad_norm": 2.3429083824157715, "learning_rate": 1.7510303287564127e-05, "loss": 4.0453, "step": 2312 }, { "epoch": 0.5973657024793388, "grad_norm": 2.9847490787506104, "learning_rate": 1.749095338369751e-05, "loss": 3.728, "step": 2313 }, { "epoch": 0.5976239669421488, "grad_norm": 2.246284246444702, "learning_rate": 1.7471608423095108e-05, "loss": 4.622, "step": 2314 }, { "epoch": 0.5978822314049587, "grad_norm": 4.0547566413879395, "learning_rate": 1.745226841849188e-05, "loss": 4.107, "step": 2315 }, { "epoch": 0.5981404958677686, "grad_norm": 2.6953964233398438, "learning_rate": 1.7432933382619487e-05, "loss": 4.0196, "step": 2316 }, { "epoch": 0.5983987603305785, "grad_norm": 2.142134428024292, "learning_rate": 1.741360332820634e-05, "loss": 4.0132, "step": 2317 }, { "epoch": 0.5986570247933884, "grad_norm": 4.123167514801025, "learning_rate": 1.7394278267977572e-05, "loss": 4.0675, "step": 2318 }, { "epoch": 0.5989152892561983, "grad_norm": 4.865847587585449, "learning_rate": 1.737495821465501e-05, "loss": 4.4292, "step": 2319 }, { "epoch": 0.5991735537190083, "grad_norm": 2.532170534133911, "learning_rate": 1.7355643180957203e-05, "loss": 4.609, "step": 2320 }, { "epoch": 0.5994318181818182, "grad_norm": 2.4765126705169678, "learning_rate": 1.733633317959938e-05, "loss": 4.091, "step": 2321 }, { "epoch": 0.5996900826446281, "grad_norm": 5.837951183319092, "learning_rate": 1.7317028223293474e-05, "loss": 3.8719, "step": 2322 }, { "epoch": 0.599948347107438, "grad_norm": 3.2989156246185303, "learning_rate": 1.72977283247481e-05, "loss": 4.1648, "step": 2323 }, { "epoch": 0.6002066115702479, "grad_norm": 2.594320058822632, "learning_rate": 1.7278433496668506e-05, "loss": 3.8366, "step": 2324 }, { "epoch": 0.6004648760330579, "grad_norm": 1.9712904691696167, "learning_rate": 1.725914375175666e-05, "loss": 4.3982, "step": 2325 }, { "epoch": 0.6007231404958677, "grad_norm": 2.9629571437835693, "learning_rate": 1.723985910271112e-05, "loss": 3.5456, "step": 2326 }, { "epoch": 0.6009814049586777, "grad_norm": 4.314224720001221, "learning_rate": 1.7220579562227146e-05, "loss": 4.1245, "step": 2327 }, { "epoch": 0.6012396694214877, "grad_norm": 3.9807605743408203, "learning_rate": 1.7201305142996595e-05, "loss": 3.6927, "step": 2328 }, { "epoch": 0.6014979338842975, "grad_norm": 5.377614974975586, "learning_rate": 1.718203585770798e-05, "loss": 4.4202, "step": 2329 }, { "epoch": 0.6017561983471075, "grad_norm": 1.9772826433181763, "learning_rate": 1.7162771719046416e-05, "loss": 4.0938, "step": 2330 }, { "epoch": 0.6020144628099173, "grad_norm": 5.9398369789123535, "learning_rate": 1.7143512739693634e-05, "loss": 4.2072, "step": 2331 }, { "epoch": 0.6022727272727273, "grad_norm": 2.1534368991851807, "learning_rate": 1.712425893232798e-05, "loss": 3.6954, "step": 2332 }, { "epoch": 0.6025309917355371, "grad_norm": 3.062079906463623, "learning_rate": 1.710501030962438e-05, "loss": 3.7464, "step": 2333 }, { "epoch": 0.6027892561983471, "grad_norm": 3.062324285507202, "learning_rate": 1.7085766884254355e-05, "loss": 4.1142, "step": 2334 }, { "epoch": 0.6030475206611571, "grad_norm": 6.975207805633545, "learning_rate": 1.7066528668885998e-05, "loss": 3.8319, "step": 2335 }, { "epoch": 0.6033057851239669, "grad_norm": 4.161407470703125, "learning_rate": 1.704729567618398e-05, "loss": 3.7682, "step": 2336 }, { "epoch": 0.6035640495867769, "grad_norm": 7.741718292236328, "learning_rate": 1.7028067918809537e-05, "loss": 3.6962, "step": 2337 }, { "epoch": 0.6038223140495868, "grad_norm": 4.129680633544922, "learning_rate": 1.7008845409420445e-05, "loss": 3.7725, "step": 2338 }, { "epoch": 0.6040805785123967, "grad_norm": 4.0300679206848145, "learning_rate": 1.6989628160671034e-05, "loss": 4.4432, "step": 2339 }, { "epoch": 0.6043388429752066, "grad_norm": 2.3935325145721436, "learning_rate": 1.6970416185212166e-05, "loss": 3.5105, "step": 2340 }, { "epoch": 0.6045971074380165, "grad_norm": 3.720930576324463, "learning_rate": 1.6951209495691252e-05, "loss": 4.2949, "step": 2341 }, { "epoch": 0.6048553719008265, "grad_norm": 3.022219657897949, "learning_rate": 1.6932008104752177e-05, "loss": 4.285, "step": 2342 }, { "epoch": 0.6051136363636364, "grad_norm": 3.627777338027954, "learning_rate": 1.6912812025035388e-05, "loss": 3.6985, "step": 2343 }, { "epoch": 0.6053719008264463, "grad_norm": 2.9288432598114014, "learning_rate": 1.689362126917782e-05, "loss": 3.8179, "step": 2344 }, { "epoch": 0.6056301652892562, "grad_norm": 2.871403217315674, "learning_rate": 1.6874435849812873e-05, "loss": 3.9922, "step": 2345 }, { "epoch": 0.6058884297520661, "grad_norm": 3.6124958992004395, "learning_rate": 1.6855255779570488e-05, "loss": 4.4437, "step": 2346 }, { "epoch": 0.606146694214876, "grad_norm": 2.3642938137054443, "learning_rate": 1.6836081071077033e-05, "loss": 4.2603, "step": 2347 }, { "epoch": 0.606404958677686, "grad_norm": 2.216829538345337, "learning_rate": 1.681691173695538e-05, "loss": 3.9953, "step": 2348 }, { "epoch": 0.6066632231404959, "grad_norm": 3.6398065090179443, "learning_rate": 1.6797747789824845e-05, "loss": 4.0656, "step": 2349 }, { "epoch": 0.6069214876033058, "grad_norm": 1.834940791130066, "learning_rate": 1.67785892423012e-05, "loss": 4.0437, "step": 2350 }, { "epoch": 0.6071797520661157, "grad_norm": 3.946444272994995, "learning_rate": 1.6759436106996696e-05, "loss": 3.1952, "step": 2351 }, { "epoch": 0.6074380165289256, "grad_norm": 3.6441354751586914, "learning_rate": 1.6740288396519958e-05, "loss": 3.551, "step": 2352 }, { "epoch": 0.6076962809917356, "grad_norm": 2.950549602508545, "learning_rate": 1.6721146123476093e-05, "loss": 3.7126, "step": 2353 }, { "epoch": 0.6079545454545454, "grad_norm": 4.9929938316345215, "learning_rate": 1.6702009300466596e-05, "loss": 3.9833, "step": 2354 }, { "epoch": 0.6082128099173554, "grad_norm": 2.7445414066314697, "learning_rate": 1.6682877940089406e-05, "loss": 3.6524, "step": 2355 }, { "epoch": 0.6084710743801653, "grad_norm": 3.200406551361084, "learning_rate": 1.666375205493883e-05, "loss": 4.1434, "step": 2356 }, { "epoch": 0.6087293388429752, "grad_norm": 3.4537856578826904, "learning_rate": 1.664463165760559e-05, "loss": 4.0282, "step": 2357 }, { "epoch": 0.6089876033057852, "grad_norm": 2.954744338989258, "learning_rate": 1.662551676067681e-05, "loss": 4.2222, "step": 2358 }, { "epoch": 0.609245867768595, "grad_norm": 1.8219741582870483, "learning_rate": 1.6606407376735955e-05, "loss": 4.0508, "step": 2359 }, { "epoch": 0.609504132231405, "grad_norm": 3.7898495197296143, "learning_rate": 1.6587303518362898e-05, "loss": 4.1678, "step": 2360 }, { "epoch": 0.6097623966942148, "grad_norm": 4.5912675857543945, "learning_rate": 1.6568205198133846e-05, "loss": 4.3974, "step": 2361 }, { "epoch": 0.6100206611570248, "grad_norm": 1.5059895515441895, "learning_rate": 1.6549112428621396e-05, "loss": 3.9809, "step": 2362 }, { "epoch": 0.6102789256198347, "grad_norm": 3.6889588832855225, "learning_rate": 1.6530025222394424e-05, "loss": 4.3219, "step": 2363 }, { "epoch": 0.6105371900826446, "grad_norm": 2.0447909832000732, "learning_rate": 1.6510943592018238e-05, "loss": 3.7444, "step": 2364 }, { "epoch": 0.6107954545454546, "grad_norm": 2.5134620666503906, "learning_rate": 1.6491867550054384e-05, "loss": 4.4159, "step": 2365 }, { "epoch": 0.6110537190082644, "grad_norm": 3.023559808731079, "learning_rate": 1.647279710906079e-05, "loss": 4.4801, "step": 2366 }, { "epoch": 0.6113119834710744, "grad_norm": 2.944206953048706, "learning_rate": 1.6453732281591687e-05, "loss": 4.107, "step": 2367 }, { "epoch": 0.6115702479338843, "grad_norm": 2.6673731803894043, "learning_rate": 1.643467308019757e-05, "loss": 3.8649, "step": 2368 }, { "epoch": 0.6118285123966942, "grad_norm": 2.343238115310669, "learning_rate": 1.6415619517425296e-05, "loss": 4.5493, "step": 2369 }, { "epoch": 0.6120867768595041, "grad_norm": 3.777601718902588, "learning_rate": 1.6396571605817943e-05, "loss": 3.3817, "step": 2370 }, { "epoch": 0.612345041322314, "grad_norm": 3.247659921646118, "learning_rate": 1.6377529357914917e-05, "loss": 4.1721, "step": 2371 }, { "epoch": 0.612603305785124, "grad_norm": 1.9639047384262085, "learning_rate": 1.6358492786251876e-05, "loss": 4.3425, "step": 2372 }, { "epoch": 0.6128615702479339, "grad_norm": 3.8846609592437744, "learning_rate": 1.633946190336074e-05, "loss": 4.095, "step": 2373 }, { "epoch": 0.6131198347107438, "grad_norm": 2.1129233837127686, "learning_rate": 1.63204367217697e-05, "loss": 4.0525, "step": 2374 }, { "epoch": 0.6133780991735537, "grad_norm": 2.096823215484619, "learning_rate": 1.630141725400317e-05, "loss": 3.9243, "step": 2375 }, { "epoch": 0.6136363636363636, "grad_norm": 2.1487491130828857, "learning_rate": 1.6282403512581822e-05, "loss": 4.3736, "step": 2376 }, { "epoch": 0.6138946280991735, "grad_norm": 2.6024086475372314, "learning_rate": 1.6263395510022543e-05, "loss": 4.6213, "step": 2377 }, { "epoch": 0.6141528925619835, "grad_norm": 2.1888177394866943, "learning_rate": 1.6244393258838463e-05, "loss": 3.9691, "step": 2378 }, { "epoch": 0.6144111570247934, "grad_norm": 3.553144693374634, "learning_rate": 1.6225396771538898e-05, "loss": 4.3209, "step": 2379 }, { "epoch": 0.6146694214876033, "grad_norm": 5.402708530426025, "learning_rate": 1.6206406060629393e-05, "loss": 3.5521, "step": 2380 }, { "epoch": 0.6149276859504132, "grad_norm": 2.102248430252075, "learning_rate": 1.618742113861168e-05, "loss": 3.5617, "step": 2381 }, { "epoch": 0.6151859504132231, "grad_norm": 2.432037591934204, "learning_rate": 1.6168442017983683e-05, "loss": 3.8314, "step": 2382 }, { "epoch": 0.6154442148760331, "grad_norm": 3.029008388519287, "learning_rate": 1.6149468711239506e-05, "loss": 4.0512, "step": 2383 }, { "epoch": 0.6157024793388429, "grad_norm": 4.383819103240967, "learning_rate": 1.6130501230869425e-05, "loss": 3.8623, "step": 2384 }, { "epoch": 0.6159607438016529, "grad_norm": 6.214782238006592, "learning_rate": 1.6111539589359887e-05, "loss": 4.1565, "step": 2385 }, { "epoch": 0.6162190082644629, "grad_norm": 2.903116464614868, "learning_rate": 1.6092583799193467e-05, "loss": 3.8018, "step": 2386 }, { "epoch": 0.6164772727272727, "grad_norm": 2.6210432052612305, "learning_rate": 1.607363387284893e-05, "loss": 3.8603, "step": 2387 }, { "epoch": 0.6167355371900827, "grad_norm": 1.7998887300491333, "learning_rate": 1.605468982280117e-05, "loss": 4.1956, "step": 2388 }, { "epoch": 0.6169938016528925, "grad_norm": 2.990771770477295, "learning_rate": 1.6035751661521174e-05, "loss": 3.9989, "step": 2389 }, { "epoch": 0.6172520661157025, "grad_norm": 4.759575366973877, "learning_rate": 1.601681940147611e-05, "loss": 3.631, "step": 2390 }, { "epoch": 0.6175103305785123, "grad_norm": 2.5851833820343018, "learning_rate": 1.599789305512922e-05, "loss": 3.4467, "step": 2391 }, { "epoch": 0.6177685950413223, "grad_norm": 2.0283496379852295, "learning_rate": 1.5978972634939866e-05, "loss": 3.995, "step": 2392 }, { "epoch": 0.6180268595041323, "grad_norm": 3.511977195739746, "learning_rate": 1.5960058153363506e-05, "loss": 4.0548, "step": 2393 }, { "epoch": 0.6182851239669421, "grad_norm": 3.3162331581115723, "learning_rate": 1.5941149622851698e-05, "loss": 4.3984, "step": 2394 }, { "epoch": 0.6185433884297521, "grad_norm": 2.166520357131958, "learning_rate": 1.592224705585208e-05, "loss": 4.2455, "step": 2395 }, { "epoch": 0.618801652892562, "grad_norm": 2.560520887374878, "learning_rate": 1.590335046480834e-05, "loss": 3.9528, "step": 2396 }, { "epoch": 0.6190599173553719, "grad_norm": 2.9791624546051025, "learning_rate": 1.588445986216028e-05, "loss": 4.2192, "step": 2397 }, { "epoch": 0.6193181818181818, "grad_norm": 2.768242120742798, "learning_rate": 1.5865575260343705e-05, "loss": 3.6028, "step": 2398 }, { "epoch": 0.6195764462809917, "grad_norm": 6.098625183105469, "learning_rate": 1.5846696671790522e-05, "loss": 3.8351, "step": 2399 }, { "epoch": 0.6198347107438017, "grad_norm": 3.4308722019195557, "learning_rate": 1.582782410892863e-05, "loss": 3.9055, "step": 2400 }, { "epoch": 0.6200929752066116, "grad_norm": 2.89959716796875, "learning_rate": 1.5808957584181998e-05, "loss": 3.6878, "step": 2401 }, { "epoch": 0.6203512396694215, "grad_norm": 3.498173475265503, "learning_rate": 1.5790097109970608e-05, "loss": 4.0154, "step": 2402 }, { "epoch": 0.6206095041322314, "grad_norm": 4.182631492614746, "learning_rate": 1.577124269871045e-05, "loss": 4.4187, "step": 2403 }, { "epoch": 0.6208677685950413, "grad_norm": 4.183931827545166, "learning_rate": 1.5752394362813538e-05, "loss": 3.551, "step": 2404 }, { "epoch": 0.6211260330578512, "grad_norm": 2.9325530529022217, "learning_rate": 1.573355211468787e-05, "loss": 4.1661, "step": 2405 }, { "epoch": 0.6213842975206612, "grad_norm": 3.401374578475952, "learning_rate": 1.571471596673747e-05, "loss": 3.5787, "step": 2406 }, { "epoch": 0.6216425619834711, "grad_norm": 3.9609649181365967, "learning_rate": 1.569588593136228e-05, "loss": 4.5394, "step": 2407 }, { "epoch": 0.621900826446281, "grad_norm": 2.690058708190918, "learning_rate": 1.5677062020958294e-05, "loss": 4.2399, "step": 2408 }, { "epoch": 0.6221590909090909, "grad_norm": 2.0011260509490967, "learning_rate": 1.5658244247917435e-05, "loss": 3.763, "step": 2409 }, { "epoch": 0.6224173553719008, "grad_norm": 2.5424203872680664, "learning_rate": 1.5639432624627576e-05, "loss": 4.043, "step": 2410 }, { "epoch": 0.6226756198347108, "grad_norm": 2.072235584259033, "learning_rate": 1.5620627163472573e-05, "loss": 3.9811, "step": 2411 }, { "epoch": 0.6229338842975206, "grad_norm": 2.9008991718292236, "learning_rate": 1.5601827876832194e-05, "loss": 3.9547, "step": 2412 }, { "epoch": 0.6231921487603306, "grad_norm": 4.77957820892334, "learning_rate": 1.5583034777082167e-05, "loss": 3.8377, "step": 2413 }, { "epoch": 0.6234504132231405, "grad_norm": 2.883493185043335, "learning_rate": 1.556424787659413e-05, "loss": 3.8072, "step": 2414 }, { "epoch": 0.6237086776859504, "grad_norm": 1.9520262479782104, "learning_rate": 1.554546718773564e-05, "loss": 4.3803, "step": 2415 }, { "epoch": 0.6239669421487604, "grad_norm": 2.2976386547088623, "learning_rate": 1.5526692722870195e-05, "loss": 3.7329, "step": 2416 }, { "epoch": 0.6242252066115702, "grad_norm": 2.465898036956787, "learning_rate": 1.5507924494357153e-05, "loss": 4.0324, "step": 2417 }, { "epoch": 0.6244834710743802, "grad_norm": 2.6700456142425537, "learning_rate": 1.5489162514551797e-05, "loss": 4.4752, "step": 2418 }, { "epoch": 0.62474173553719, "grad_norm": 3.0588338375091553, "learning_rate": 1.547040679580527e-05, "loss": 3.4844, "step": 2419 }, { "epoch": 0.625, "grad_norm": 3.599912405014038, "learning_rate": 1.5451657350464628e-05, "loss": 4.5833, "step": 2420 }, { "epoch": 0.62525826446281, "grad_norm": 2.23374605178833, "learning_rate": 1.5432914190872757e-05, "loss": 4.2747, "step": 2421 }, { "epoch": 0.6255165289256198, "grad_norm": 4.961491107940674, "learning_rate": 1.541417732936844e-05, "loss": 4.3145, "step": 2422 }, { "epoch": 0.6257747933884298, "grad_norm": 3.588346481323242, "learning_rate": 1.5395446778286304e-05, "loss": 4.1631, "step": 2423 }, { "epoch": 0.6260330578512396, "grad_norm": 3.876347064971924, "learning_rate": 1.53767225499568e-05, "loss": 4.032, "step": 2424 }, { "epoch": 0.6262913223140496, "grad_norm": 2.0347065925598145, "learning_rate": 1.5358004656706255e-05, "loss": 4.566, "step": 2425 }, { "epoch": 0.6265495867768595, "grad_norm": 2.508654832839966, "learning_rate": 1.5339293110856784e-05, "loss": 4.4264, "step": 2426 }, { "epoch": 0.6268078512396694, "grad_norm": 3.4442927837371826, "learning_rate": 1.5320587924726368e-05, "loss": 4.2342, "step": 2427 }, { "epoch": 0.6270661157024794, "grad_norm": 2.996325969696045, "learning_rate": 1.5301889110628747e-05, "loss": 3.8828, "step": 2428 }, { "epoch": 0.6273243801652892, "grad_norm": 3.532625198364258, "learning_rate": 1.5283196680873518e-05, "loss": 4.3357, "step": 2429 }, { "epoch": 0.6275826446280992, "grad_norm": 3.391566514968872, "learning_rate": 1.5264510647766058e-05, "loss": 4.0355, "step": 2430 }, { "epoch": 0.6278409090909091, "grad_norm": 3.217043161392212, "learning_rate": 1.5245831023607515e-05, "loss": 3.6708, "step": 2431 }, { "epoch": 0.628099173553719, "grad_norm": 6.2368974685668945, "learning_rate": 1.5227157820694845e-05, "loss": 3.4169, "step": 2432 }, { "epoch": 0.6283574380165289, "grad_norm": 3.365903854370117, "learning_rate": 1.5208491051320744e-05, "loss": 4.4037, "step": 2433 }, { "epoch": 0.6286157024793388, "grad_norm": 2.702695608139038, "learning_rate": 1.5189830727773719e-05, "loss": 4.2397, "step": 2434 }, { "epoch": 0.6288739669421488, "grad_norm": 3.5615181922912598, "learning_rate": 1.5171176862337983e-05, "loss": 3.9192, "step": 2435 }, { "epoch": 0.6291322314049587, "grad_norm": 2.2201147079467773, "learning_rate": 1.5152529467293527e-05, "loss": 4.2649, "step": 2436 }, { "epoch": 0.6293904958677686, "grad_norm": 6.542872428894043, "learning_rate": 1.5133888554916098e-05, "loss": 4.2227, "step": 2437 }, { "epoch": 0.6296487603305785, "grad_norm": 3.833235263824463, "learning_rate": 1.511525413747713e-05, "loss": 3.9484, "step": 2438 }, { "epoch": 0.6299070247933884, "grad_norm": 2.8173415660858154, "learning_rate": 1.5096626227243818e-05, "loss": 3.9642, "step": 2439 }, { "epoch": 0.6301652892561983, "grad_norm": 4.313471794128418, "learning_rate": 1.5078004836479054e-05, "loss": 4.5425, "step": 2440 }, { "epoch": 0.6304235537190083, "grad_norm": 2.107729911804199, "learning_rate": 1.5059389977441454e-05, "loss": 3.8735, "step": 2441 }, { "epoch": 0.6306818181818182, "grad_norm": 3.246063232421875, "learning_rate": 1.5040781662385318e-05, "loss": 4.1538, "step": 2442 }, { "epoch": 0.6309400826446281, "grad_norm": 2.2307674884796143, "learning_rate": 1.5022179903560646e-05, "loss": 4.1019, "step": 2443 }, { "epoch": 0.631198347107438, "grad_norm": 3.299220323562622, "learning_rate": 1.5003584713213132e-05, "loss": 4.5117, "step": 2444 }, { "epoch": 0.6314566115702479, "grad_norm": 3.500929355621338, "learning_rate": 1.498499610358412e-05, "loss": 3.9982, "step": 2445 }, { "epoch": 0.6317148760330579, "grad_norm": 3.5237233638763428, "learning_rate": 1.4966414086910655e-05, "loss": 3.9716, "step": 2446 }, { "epoch": 0.6319731404958677, "grad_norm": 4.284485340118408, "learning_rate": 1.4947838675425407e-05, "loss": 4.0268, "step": 2447 }, { "epoch": 0.6322314049586777, "grad_norm": 5.625925540924072, "learning_rate": 1.4929269881356728e-05, "loss": 2.6651, "step": 2448 }, { "epoch": 0.6324896694214877, "grad_norm": 2.86814546585083, "learning_rate": 1.4910707716928587e-05, "loss": 3.7793, "step": 2449 }, { "epoch": 0.6327479338842975, "grad_norm": 3.649440050125122, "learning_rate": 1.4892152194360615e-05, "loss": 3.6545, "step": 2450 }, { "epoch": 0.6330061983471075, "grad_norm": 1.9800738096237183, "learning_rate": 1.4873603325868055e-05, "loss": 4.3503, "step": 2451 }, { "epoch": 0.6332644628099173, "grad_norm": 1.8514809608459473, "learning_rate": 1.4855061123661768e-05, "loss": 3.8093, "step": 2452 }, { "epoch": 0.6335227272727273, "grad_norm": 2.804696798324585, "learning_rate": 1.4836525599948242e-05, "loss": 4.0498, "step": 2453 }, { "epoch": 0.6337809917355371, "grad_norm": 3.1930477619171143, "learning_rate": 1.4817996766929531e-05, "loss": 3.8645, "step": 2454 }, { "epoch": 0.6340392561983471, "grad_norm": 2.6687393188476562, "learning_rate": 1.4799474636803351e-05, "loss": 3.6465, "step": 2455 }, { "epoch": 0.6342975206611571, "grad_norm": 3.439314603805542, "learning_rate": 1.478095922176293e-05, "loss": 4.119, "step": 2456 }, { "epoch": 0.6345557851239669, "grad_norm": 2.5719988346099854, "learning_rate": 1.4762450533997118e-05, "loss": 4.2487, "step": 2457 }, { "epoch": 0.6348140495867769, "grad_norm": 4.910063743591309, "learning_rate": 1.4743948585690353e-05, "loss": 3.3503, "step": 2458 }, { "epoch": 0.6350723140495868, "grad_norm": 3.409777879714966, "learning_rate": 1.4725453389022586e-05, "loss": 4.5292, "step": 2459 }, { "epoch": 0.6353305785123967, "grad_norm": 2.104436159133911, "learning_rate": 1.4706964956169361e-05, "loss": 3.6724, "step": 2460 }, { "epoch": 0.6355888429752066, "grad_norm": 3.4180333614349365, "learning_rate": 1.4688483299301754e-05, "loss": 3.8655, "step": 2461 }, { "epoch": 0.6358471074380165, "grad_norm": 2.12410306930542, "learning_rate": 1.467000843058639e-05, "loss": 3.9771, "step": 2462 }, { "epoch": 0.6361053719008265, "grad_norm": 4.114750862121582, "learning_rate": 1.4651540362185411e-05, "loss": 4.4011, "step": 2463 }, { "epoch": 0.6363636363636364, "grad_norm": 2.860107421875, "learning_rate": 1.4633079106256497e-05, "loss": 3.8453, "step": 2464 }, { "epoch": 0.6366219008264463, "grad_norm": 4.18625545501709, "learning_rate": 1.4614624674952842e-05, "loss": 4.0099, "step": 2465 }, { "epoch": 0.6368801652892562, "grad_norm": 2.3243296146392822, "learning_rate": 1.4596177080423135e-05, "loss": 3.5089, "step": 2466 }, { "epoch": 0.6371384297520661, "grad_norm": 2.745657444000244, "learning_rate": 1.4577736334811576e-05, "loss": 4.2201, "step": 2467 }, { "epoch": 0.637396694214876, "grad_norm": 2.8939473628997803, "learning_rate": 1.4559302450257856e-05, "loss": 3.7256, "step": 2468 }, { "epoch": 0.637654958677686, "grad_norm": 1.9320701360702515, "learning_rate": 1.4540875438897137e-05, "loss": 3.5385, "step": 2469 }, { "epoch": 0.6379132231404959, "grad_norm": 1.668217420578003, "learning_rate": 1.452245531286006e-05, "loss": 3.4812, "step": 2470 }, { "epoch": 0.6381714876033058, "grad_norm": 2.064049243927002, "learning_rate": 1.4504042084272745e-05, "loss": 3.8451, "step": 2471 }, { "epoch": 0.6384297520661157, "grad_norm": 2.4846439361572266, "learning_rate": 1.4485635765256788e-05, "loss": 4.1417, "step": 2472 }, { "epoch": 0.6386880165289256, "grad_norm": 2.1112656593322754, "learning_rate": 1.4467236367929174e-05, "loss": 4.3253, "step": 2473 }, { "epoch": 0.6389462809917356, "grad_norm": 2.470085620880127, "learning_rate": 1.4448843904402398e-05, "loss": 4.0147, "step": 2474 }, { "epoch": 0.6392045454545454, "grad_norm": 2.8040499687194824, "learning_rate": 1.4430458386784351e-05, "loss": 3.9232, "step": 2475 }, { "epoch": 0.6394628099173554, "grad_norm": 2.0722548961639404, "learning_rate": 1.4412079827178366e-05, "loss": 4.0115, "step": 2476 }, { "epoch": 0.6397210743801653, "grad_norm": 1.7370831966400146, "learning_rate": 1.4393708237683182e-05, "loss": 4.2014, "step": 2477 }, { "epoch": 0.6399793388429752, "grad_norm": 2.7165796756744385, "learning_rate": 1.4375343630392982e-05, "loss": 3.8779, "step": 2478 }, { "epoch": 0.6402376033057852, "grad_norm": 4.3221235275268555, "learning_rate": 1.4356986017397316e-05, "loss": 3.8598, "step": 2479 }, { "epoch": 0.640495867768595, "grad_norm": 2.9736692905426025, "learning_rate": 1.4338635410781143e-05, "loss": 3.7851, "step": 2480 }, { "epoch": 0.640754132231405, "grad_norm": 3.9834766387939453, "learning_rate": 1.432029182262482e-05, "loss": 4.1995, "step": 2481 }, { "epoch": 0.6410123966942148, "grad_norm": 2.301617383956909, "learning_rate": 1.4301955265004075e-05, "loss": 3.9082, "step": 2482 }, { "epoch": 0.6412706611570248, "grad_norm": 2.951587438583374, "learning_rate": 1.428362574999e-05, "loss": 4.0006, "step": 2483 }, { "epoch": 0.6415289256198347, "grad_norm": 3.1861071586608887, "learning_rate": 1.4265303289649058e-05, "loss": 4.2314, "step": 2484 }, { "epoch": 0.6417871900826446, "grad_norm": 4.938000202178955, "learning_rate": 1.4246987896043085e-05, "loss": 3.3103, "step": 2485 }, { "epoch": 0.6420454545454546, "grad_norm": 2.523110866546631, "learning_rate": 1.4228679581229218e-05, "loss": 3.9216, "step": 2486 }, { "epoch": 0.6423037190082644, "grad_norm": 2.800584316253662, "learning_rate": 1.421037835725998e-05, "loss": 4.3645, "step": 2487 }, { "epoch": 0.6425619834710744, "grad_norm": 2.9792120456695557, "learning_rate": 1.4192084236183232e-05, "loss": 3.792, "step": 2488 }, { "epoch": 0.6428202479338843, "grad_norm": 5.241537570953369, "learning_rate": 1.4173797230042097e-05, "loss": 4.0641, "step": 2489 }, { "epoch": 0.6430785123966942, "grad_norm": 2.9031989574432373, "learning_rate": 1.415551735087508e-05, "loss": 4.3532, "step": 2490 }, { "epoch": 0.6433367768595041, "grad_norm": 3.792815685272217, "learning_rate": 1.4137244610715965e-05, "loss": 4.2727, "step": 2491 }, { "epoch": 0.643595041322314, "grad_norm": 2.862121105194092, "learning_rate": 1.4118979021593842e-05, "loss": 3.5982, "step": 2492 }, { "epoch": 0.643853305785124, "grad_norm": 2.3379335403442383, "learning_rate": 1.4100720595533073e-05, "loss": 4.0424, "step": 2493 }, { "epoch": 0.6441115702479339, "grad_norm": 2.0415163040161133, "learning_rate": 1.4082469344553345e-05, "loss": 3.871, "step": 2494 }, { "epoch": 0.6443698347107438, "grad_norm": 2.7574589252471924, "learning_rate": 1.4064225280669597e-05, "loss": 3.782, "step": 2495 }, { "epoch": 0.6446280991735537, "grad_norm": 1.840571403503418, "learning_rate": 1.404598841589202e-05, "loss": 4.1059, "step": 2496 }, { "epoch": 0.6448863636363636, "grad_norm": 2.7024762630462646, "learning_rate": 1.402775876222611e-05, "loss": 4.1147, "step": 2497 }, { "epoch": 0.6451446280991735, "grad_norm": 5.189191818237305, "learning_rate": 1.400953633167258e-05, "loss": 4.2229, "step": 2498 }, { "epoch": 0.6454028925619835, "grad_norm": 2.0053586959838867, "learning_rate": 1.3991321136227394e-05, "loss": 4.0956, "step": 2499 }, { "epoch": 0.6456611570247934, "grad_norm": 4.439626693725586, "learning_rate": 1.3973113187881756e-05, "loss": 4.0471, "step": 2500 }, { "epoch": 0.6459194214876033, "grad_norm": 2.2177376747131348, "learning_rate": 1.395491249862212e-05, "loss": 3.9376, "step": 2501 }, { "epoch": 0.6461776859504132, "grad_norm": 6.064826965332031, "learning_rate": 1.3936719080430124e-05, "loss": 4.0938, "step": 2502 }, { "epoch": 0.6464359504132231, "grad_norm": 2.0208330154418945, "learning_rate": 1.3918532945282636e-05, "loss": 4.0581, "step": 2503 }, { "epoch": 0.6466942148760331, "grad_norm": 2.3392467498779297, "learning_rate": 1.3900354105151747e-05, "loss": 4.0304, "step": 2504 }, { "epoch": 0.6469524793388429, "grad_norm": 3.9035260677337646, "learning_rate": 1.3882182572004725e-05, "loss": 3.8835, "step": 2505 }, { "epoch": 0.6472107438016529, "grad_norm": 2.2512435913085938, "learning_rate": 1.3864018357804026e-05, "loss": 3.8751, "step": 2506 }, { "epoch": 0.6474690082644629, "grad_norm": 2.519193649291992, "learning_rate": 1.384586147450729e-05, "loss": 3.6458, "step": 2507 }, { "epoch": 0.6477272727272727, "grad_norm": 3.9335339069366455, "learning_rate": 1.3827711934067353e-05, "loss": 4.6604, "step": 2508 }, { "epoch": 0.6479855371900827, "grad_norm": 2.338005542755127, "learning_rate": 1.3809569748432189e-05, "loss": 4.3346, "step": 2509 }, { "epoch": 0.6482438016528925, "grad_norm": 3.607882261276245, "learning_rate": 1.3791434929544933e-05, "loss": 3.9675, "step": 2510 }, { "epoch": 0.6485020661157025, "grad_norm": 2.567333698272705, "learning_rate": 1.3773307489343907e-05, "loss": 4.1988, "step": 2511 }, { "epoch": 0.6487603305785123, "grad_norm": 2.47320294380188, "learning_rate": 1.3755187439762507e-05, "loss": 3.8259, "step": 2512 }, { "epoch": 0.6490185950413223, "grad_norm": 3.877255439758301, "learning_rate": 1.3737074792729331e-05, "loss": 3.7117, "step": 2513 }, { "epoch": 0.6492768595041323, "grad_norm": 3.497339963912964, "learning_rate": 1.3718969560168078e-05, "loss": 4.4809, "step": 2514 }, { "epoch": 0.6495351239669421, "grad_norm": 2.31484317779541, "learning_rate": 1.3700871753997535e-05, "loss": 4.62, "step": 2515 }, { "epoch": 0.6497933884297521, "grad_norm": 1.9031718969345093, "learning_rate": 1.3682781386131665e-05, "loss": 3.5066, "step": 2516 }, { "epoch": 0.650051652892562, "grad_norm": 2.2807817459106445, "learning_rate": 1.3664698468479487e-05, "loss": 4.5765, "step": 2517 }, { "epoch": 0.6503099173553719, "grad_norm": 3.034844160079956, "learning_rate": 1.364662301294512e-05, "loss": 4.4517, "step": 2518 }, { "epoch": 0.6505681818181818, "grad_norm": 2.534339189529419, "learning_rate": 1.3628555031427776e-05, "loss": 4.514, "step": 2519 }, { "epoch": 0.6508264462809917, "grad_norm": 5.104548454284668, "learning_rate": 1.3610494535821767e-05, "loss": 3.845, "step": 2520 }, { "epoch": 0.6510847107438017, "grad_norm": 3.5255606174468994, "learning_rate": 1.3592441538016442e-05, "loss": 3.5496, "step": 2521 }, { "epoch": 0.6513429752066116, "grad_norm": 7.322320938110352, "learning_rate": 1.3574396049896232e-05, "loss": 3.6884, "step": 2522 }, { "epoch": 0.6516012396694215, "grad_norm": 2.6864771842956543, "learning_rate": 1.3556358083340632e-05, "loss": 3.6337, "step": 2523 }, { "epoch": 0.6518595041322314, "grad_norm": 2.561821460723877, "learning_rate": 1.3538327650224175e-05, "loss": 4.7299, "step": 2524 }, { "epoch": 0.6521177685950413, "grad_norm": 1.772194743156433, "learning_rate": 1.3520304762416434e-05, "loss": 4.2667, "step": 2525 }, { "epoch": 0.6523760330578512, "grad_norm": 8.138372421264648, "learning_rate": 1.3502289431782005e-05, "loss": 3.5707, "step": 2526 }, { "epoch": 0.6526342975206612, "grad_norm": 2.568851947784424, "learning_rate": 1.3484281670180554e-05, "loss": 3.7814, "step": 2527 }, { "epoch": 0.6528925619834711, "grad_norm": 4.497409343719482, "learning_rate": 1.3466281489466689e-05, "loss": 4.0167, "step": 2528 }, { "epoch": 0.653150826446281, "grad_norm": 3.2648940086364746, "learning_rate": 1.3448288901490095e-05, "loss": 4.4112, "step": 2529 }, { "epoch": 0.6534090909090909, "grad_norm": 2.811567783355713, "learning_rate": 1.3430303918095446e-05, "loss": 3.8607, "step": 2530 }, { "epoch": 0.6536673553719008, "grad_norm": 3.277019500732422, "learning_rate": 1.3412326551122365e-05, "loss": 3.7383, "step": 2531 }, { "epoch": 0.6539256198347108, "grad_norm": 2.998791217803955, "learning_rate": 1.3394356812405517e-05, "loss": 3.6499, "step": 2532 }, { "epoch": 0.6541838842975206, "grad_norm": 2.026315212249756, "learning_rate": 1.337639471377452e-05, "loss": 3.9534, "step": 2533 }, { "epoch": 0.6544421487603306, "grad_norm": 2.999838352203369, "learning_rate": 1.3358440267053957e-05, "loss": 3.8267, "step": 2534 }, { "epoch": 0.6547004132231405, "grad_norm": 1.6496365070343018, "learning_rate": 1.3340493484063374e-05, "loss": 3.8707, "step": 2535 }, { "epoch": 0.6549586776859504, "grad_norm": 1.8778349161148071, "learning_rate": 1.3322554376617285e-05, "loss": 4.0089, "step": 2536 }, { "epoch": 0.6552169421487604, "grad_norm": 5.316661834716797, "learning_rate": 1.3304622956525173e-05, "loss": 4.2276, "step": 2537 }, { "epoch": 0.6554752066115702, "grad_norm": 2.5141611099243164, "learning_rate": 1.3286699235591386e-05, "loss": 4.0617, "step": 2538 }, { "epoch": 0.6557334710743802, "grad_norm": 2.4553158283233643, "learning_rate": 1.3268783225615278e-05, "loss": 3.9007, "step": 2539 }, { "epoch": 0.65599173553719, "grad_norm": 4.979299545288086, "learning_rate": 1.3250874938391095e-05, "loss": 4.8319, "step": 2540 }, { "epoch": 0.65625, "grad_norm": 2.6282334327697754, "learning_rate": 1.3232974385708002e-05, "loss": 3.6636, "step": 2541 }, { "epoch": 0.65650826446281, "grad_norm": 2.8585569858551025, "learning_rate": 1.3215081579350058e-05, "loss": 3.6299, "step": 2542 }, { "epoch": 0.6567665289256198, "grad_norm": 2.8718130588531494, "learning_rate": 1.3197196531096262e-05, "loss": 4.0002, "step": 2543 }, { "epoch": 0.6570247933884298, "grad_norm": 4.355833530426025, "learning_rate": 1.317931925272047e-05, "loss": 3.3409, "step": 2544 }, { "epoch": 0.6572830578512396, "grad_norm": 2.0614237785339355, "learning_rate": 1.3161449755991425e-05, "loss": 4.2462, "step": 2545 }, { "epoch": 0.6575413223140496, "grad_norm": 1.9563151597976685, "learning_rate": 1.3143588052672779e-05, "loss": 4.0209, "step": 2546 }, { "epoch": 0.6577995867768595, "grad_norm": 4.55440616607666, "learning_rate": 1.3125734154523011e-05, "loss": 3.7758, "step": 2547 }, { "epoch": 0.6580578512396694, "grad_norm": 7.875830173492432, "learning_rate": 1.3107888073295496e-05, "loss": 4.6374, "step": 2548 }, { "epoch": 0.6583161157024794, "grad_norm": 2.7738397121429443, "learning_rate": 1.3090049820738431e-05, "loss": 4.0503, "step": 2549 }, { "epoch": 0.6585743801652892, "grad_norm": 2.5805954933166504, "learning_rate": 1.30722194085949e-05, "loss": 4.2434, "step": 2550 }, { "epoch": 0.6588326446280992, "grad_norm": 3.9883460998535156, "learning_rate": 1.305439684860279e-05, "loss": 4.6417, "step": 2551 }, { "epoch": 0.6590909090909091, "grad_norm": 4.980700969696045, "learning_rate": 1.3036582152494825e-05, "loss": 3.4885, "step": 2552 }, { "epoch": 0.659349173553719, "grad_norm": 2.3241074085235596, "learning_rate": 1.301877533199859e-05, "loss": 3.9775, "step": 2553 }, { "epoch": 0.6596074380165289, "grad_norm": 4.6501288414001465, "learning_rate": 1.300097639883641e-05, "loss": 3.8402, "step": 2554 }, { "epoch": 0.6598657024793388, "grad_norm": 2.1598780155181885, "learning_rate": 1.2983185364725494e-05, "loss": 3.9306, "step": 2555 }, { "epoch": 0.6601239669421488, "grad_norm": 2.289466381072998, "learning_rate": 1.296540224137781e-05, "loss": 3.7302, "step": 2556 }, { "epoch": 0.6603822314049587, "grad_norm": 3.046708583831787, "learning_rate": 1.2947627040500121e-05, "loss": 3.9497, "step": 2557 }, { "epoch": 0.6606404958677686, "grad_norm": 5.922731876373291, "learning_rate": 1.2929859773793995e-05, "loss": 4.3864, "step": 2558 }, { "epoch": 0.6608987603305785, "grad_norm": 2.5032122135162354, "learning_rate": 1.2912100452955756e-05, "loss": 4.238, "step": 2559 }, { "epoch": 0.6611570247933884, "grad_norm": 2.7594521045684814, "learning_rate": 1.28943490896765e-05, "loss": 4.0474, "step": 2560 }, { "epoch": 0.6614152892561983, "grad_norm": 2.851264238357544, "learning_rate": 1.2876605695642086e-05, "loss": 3.9575, "step": 2561 }, { "epoch": 0.6616735537190083, "grad_norm": 3.3948562145233154, "learning_rate": 1.2858870282533141e-05, "loss": 4.1602, "step": 2562 }, { "epoch": 0.6619318181818182, "grad_norm": 2.700173854827881, "learning_rate": 1.2841142862025023e-05, "loss": 4.1002, "step": 2563 }, { "epoch": 0.6621900826446281, "grad_norm": 5.353178977966309, "learning_rate": 1.2823423445787819e-05, "loss": 3.691, "step": 2564 }, { "epoch": 0.662448347107438, "grad_norm": 2.7646877765655518, "learning_rate": 1.2805712045486379e-05, "loss": 4.0289, "step": 2565 }, { "epoch": 0.6627066115702479, "grad_norm": 2.3019301891326904, "learning_rate": 1.2788008672780249e-05, "loss": 3.628, "step": 2566 }, { "epoch": 0.6629648760330579, "grad_norm": 1.9650393724441528, "learning_rate": 1.2770313339323697e-05, "loss": 4.1604, "step": 2567 }, { "epoch": 0.6632231404958677, "grad_norm": 2.7472598552703857, "learning_rate": 1.2752626056765687e-05, "loss": 3.7647, "step": 2568 }, { "epoch": 0.6634814049586777, "grad_norm": 2.5415873527526855, "learning_rate": 1.273494683674992e-05, "loss": 4.1543, "step": 2569 }, { "epoch": 0.6637396694214877, "grad_norm": 2.877643585205078, "learning_rate": 1.271727569091475e-05, "loss": 3.4369, "step": 2570 }, { "epoch": 0.6639979338842975, "grad_norm": 3.86422061920166, "learning_rate": 1.2699612630893227e-05, "loss": 4.0046, "step": 2571 }, { "epoch": 0.6642561983471075, "grad_norm": 2.741687536239624, "learning_rate": 1.2681957668313099e-05, "loss": 4.1847, "step": 2572 }, { "epoch": 0.6645144628099173, "grad_norm": 3.917059898376465, "learning_rate": 1.2664310814796754e-05, "loss": 4.0482, "step": 2573 }, { "epoch": 0.6647727272727273, "grad_norm": 2.9613773822784424, "learning_rate": 1.2646672081961253e-05, "loss": 3.9706, "step": 2574 }, { "epoch": 0.6650309917355371, "grad_norm": 4.114764213562012, "learning_rate": 1.2629041481418307e-05, "loss": 4.1244, "step": 2575 }, { "epoch": 0.6652892561983471, "grad_norm": 3.024317502975464, "learning_rate": 1.2611419024774307e-05, "loss": 3.8142, "step": 2576 }, { "epoch": 0.6655475206611571, "grad_norm": 3.454766273498535, "learning_rate": 1.2593804723630209e-05, "loss": 3.9786, "step": 2577 }, { "epoch": 0.6658057851239669, "grad_norm": 2.914076566696167, "learning_rate": 1.2576198589581675e-05, "loss": 3.5014, "step": 2578 }, { "epoch": 0.6660640495867769, "grad_norm": 2.126948833465576, "learning_rate": 1.2558600634218976e-05, "loss": 3.9073, "step": 2579 }, { "epoch": 0.6663223140495868, "grad_norm": 3.0187106132507324, "learning_rate": 1.254101086912694e-05, "loss": 3.8449, "step": 2580 }, { "epoch": 0.6665805785123967, "grad_norm": 3.2848355770111084, "learning_rate": 1.2523429305885082e-05, "loss": 4.0175, "step": 2581 }, { "epoch": 0.6668388429752066, "grad_norm": 1.4772266149520874, "learning_rate": 1.2505855956067475e-05, "loss": 4.1219, "step": 2582 }, { "epoch": 0.6670971074380165, "grad_norm": 5.416191577911377, "learning_rate": 1.2488290831242786e-05, "loss": 3.8301, "step": 2583 }, { "epoch": 0.6673553719008265, "grad_norm": 3.2209882736206055, "learning_rate": 1.247073394297427e-05, "loss": 3.8554, "step": 2584 }, { "epoch": 0.6676136363636364, "grad_norm": 2.765143871307373, "learning_rate": 1.2453185302819781e-05, "loss": 3.6542, "step": 2585 }, { "epoch": 0.6678719008264463, "grad_norm": 2.674966335296631, "learning_rate": 1.2435644922331719e-05, "loss": 3.8419, "step": 2586 }, { "epoch": 0.6681301652892562, "grad_norm": 2.2157909870147705, "learning_rate": 1.241811281305704e-05, "loss": 4.0008, "step": 2587 }, { "epoch": 0.6683884297520661, "grad_norm": 1.9975755214691162, "learning_rate": 1.2400588986537293e-05, "loss": 3.9521, "step": 2588 }, { "epoch": 0.668646694214876, "grad_norm": 2.500840902328491, "learning_rate": 1.2383073454308537e-05, "loss": 3.4697, "step": 2589 }, { "epoch": 0.668904958677686, "grad_norm": 2.91398549079895, "learning_rate": 1.2365566227901388e-05, "loss": 4.257, "step": 2590 }, { "epoch": 0.6691632231404959, "grad_norm": 2.4089415073394775, "learning_rate": 1.2348067318840975e-05, "loss": 4.0605, "step": 2591 }, { "epoch": 0.6694214876033058, "grad_norm": 2.5414037704467773, "learning_rate": 1.2330576738646992e-05, "loss": 4.1287, "step": 2592 }, { "epoch": 0.6696797520661157, "grad_norm": 4.68294620513916, "learning_rate": 1.2313094498833611e-05, "loss": 3.9565, "step": 2593 }, { "epoch": 0.6699380165289256, "grad_norm": 2.5951712131500244, "learning_rate": 1.2295620610909522e-05, "loss": 3.9472, "step": 2594 }, { "epoch": 0.6701962809917356, "grad_norm": 1.8839662075042725, "learning_rate": 1.2278155086377949e-05, "loss": 4.0336, "step": 2595 }, { "epoch": 0.6704545454545454, "grad_norm": 2.4208035469055176, "learning_rate": 1.2260697936736543e-05, "loss": 4.2471, "step": 2596 }, { "epoch": 0.6707128099173554, "grad_norm": 2.863168716430664, "learning_rate": 1.2243249173477513e-05, "loss": 4.3878, "step": 2597 }, { "epoch": 0.6709710743801653, "grad_norm": 3.230649709701538, "learning_rate": 1.222580880808751e-05, "loss": 4.1346, "step": 2598 }, { "epoch": 0.6712293388429752, "grad_norm": 2.5933687686920166, "learning_rate": 1.2208376852047647e-05, "loss": 3.9327, "step": 2599 }, { "epoch": 0.6714876033057852, "grad_norm": 2.453645944595337, "learning_rate": 1.2190953316833545e-05, "loss": 3.8982, "step": 2600 }, { "epoch": 0.671745867768595, "grad_norm": 1.9229648113250732, "learning_rate": 1.2173538213915228e-05, "loss": 3.6607, "step": 2601 }, { "epoch": 0.672004132231405, "grad_norm": 2.890679121017456, "learning_rate": 1.2156131554757224e-05, "loss": 3.9299, "step": 2602 }, { "epoch": 0.6722623966942148, "grad_norm": 3.009424924850464, "learning_rate": 1.2138733350818438e-05, "loss": 4.321, "step": 2603 }, { "epoch": 0.6725206611570248, "grad_norm": 3.461172342300415, "learning_rate": 1.2121343613552269e-05, "loss": 4.0219, "step": 2604 }, { "epoch": 0.6727789256198347, "grad_norm": 6.76215124130249, "learning_rate": 1.2103962354406512e-05, "loss": 3.991, "step": 2605 }, { "epoch": 0.6730371900826446, "grad_norm": 2.609424591064453, "learning_rate": 1.2086589584823382e-05, "loss": 3.8789, "step": 2606 }, { "epoch": 0.6732954545454546, "grad_norm": 2.0354549884796143, "learning_rate": 1.2069225316239501e-05, "loss": 3.9814, "step": 2607 }, { "epoch": 0.6735537190082644, "grad_norm": 1.8215985298156738, "learning_rate": 1.2051869560085924e-05, "loss": 3.931, "step": 2608 }, { "epoch": 0.6738119834710744, "grad_norm": 2.7947890758514404, "learning_rate": 1.203452232778807e-05, "loss": 4.4185, "step": 2609 }, { "epoch": 0.6740702479338843, "grad_norm": 3.0716333389282227, "learning_rate": 1.201718363076575e-05, "loss": 4.032, "step": 2610 }, { "epoch": 0.6743285123966942, "grad_norm": 4.401652812957764, "learning_rate": 1.199985348043318e-05, "loss": 3.8144, "step": 2611 }, { "epoch": 0.6745867768595041, "grad_norm": 2.5803990364074707, "learning_rate": 1.1982531888198925e-05, "loss": 3.9293, "step": 2612 }, { "epoch": 0.674845041322314, "grad_norm": 2.4310388565063477, "learning_rate": 1.196521886546593e-05, "loss": 3.3753, "step": 2613 }, { "epoch": 0.675103305785124, "grad_norm": 3.0296037197113037, "learning_rate": 1.1947914423631477e-05, "loss": 3.7947, "step": 2614 }, { "epoch": 0.6753615702479339, "grad_norm": 4.779835224151611, "learning_rate": 1.1930618574087237e-05, "loss": 3.7638, "step": 2615 }, { "epoch": 0.6756198347107438, "grad_norm": 2.2556252479553223, "learning_rate": 1.1913331328219195e-05, "loss": 4.3551, "step": 2616 }, { "epoch": 0.6758780991735537, "grad_norm": 4.130800247192383, "learning_rate": 1.1896052697407672e-05, "loss": 4.1772, "step": 2617 }, { "epoch": 0.6761363636363636, "grad_norm": 1.97135591506958, "learning_rate": 1.1878782693027354e-05, "loss": 4.4447, "step": 2618 }, { "epoch": 0.6763946280991735, "grad_norm": 2.2743289470672607, "learning_rate": 1.1861521326447181e-05, "loss": 3.9812, "step": 2619 }, { "epoch": 0.6766528925619835, "grad_norm": 3.021549701690674, "learning_rate": 1.184426860903047e-05, "loss": 4.5026, "step": 2620 }, { "epoch": 0.6769111570247934, "grad_norm": 2.6152188777923584, "learning_rate": 1.182702455213482e-05, "loss": 4.2171, "step": 2621 }, { "epoch": 0.6771694214876033, "grad_norm": 3.256300687789917, "learning_rate": 1.1809789167112113e-05, "loss": 3.7202, "step": 2622 }, { "epoch": 0.6774276859504132, "grad_norm": 2.764793634414673, "learning_rate": 1.1792562465308556e-05, "loss": 3.3969, "step": 2623 }, { "epoch": 0.6776859504132231, "grad_norm": 2.6796085834503174, "learning_rate": 1.1775344458064619e-05, "loss": 4.0209, "step": 2624 }, { "epoch": 0.6779442148760331, "grad_norm": 2.2530627250671387, "learning_rate": 1.1758135156715042e-05, "loss": 3.7197, "step": 2625 }, { "epoch": 0.6782024793388429, "grad_norm": 5.15383768081665, "learning_rate": 1.174093457258884e-05, "loss": 3.8957, "step": 2626 }, { "epoch": 0.6784607438016529, "grad_norm": 3.0105857849121094, "learning_rate": 1.1723742717009311e-05, "loss": 3.7101, "step": 2627 }, { "epoch": 0.6787190082644629, "grad_norm": 3.332848072052002, "learning_rate": 1.1706559601293977e-05, "loss": 4.1596, "step": 2628 }, { "epoch": 0.6789772727272727, "grad_norm": 3.2635531425476074, "learning_rate": 1.1689385236754607e-05, "loss": 4.0526, "step": 2629 }, { "epoch": 0.6792355371900827, "grad_norm": 2.528571605682373, "learning_rate": 1.167221963469724e-05, "loss": 3.7679, "step": 2630 }, { "epoch": 0.6794938016528925, "grad_norm": 3.148775577545166, "learning_rate": 1.1655062806422118e-05, "loss": 4.5422, "step": 2631 }, { "epoch": 0.6797520661157025, "grad_norm": 2.389357328414917, "learning_rate": 1.1637914763223712e-05, "loss": 4.0549, "step": 2632 }, { "epoch": 0.6800103305785123, "grad_norm": 2.5488297939300537, "learning_rate": 1.1620775516390708e-05, "loss": 3.69, "step": 2633 }, { "epoch": 0.6802685950413223, "grad_norm": 2.6987922191619873, "learning_rate": 1.1603645077206025e-05, "loss": 3.948, "step": 2634 }, { "epoch": 0.6805268595041323, "grad_norm": 3.469059467315674, "learning_rate": 1.1586523456946751e-05, "loss": 3.8812, "step": 2635 }, { "epoch": 0.6807851239669421, "grad_norm": 3.5399844646453857, "learning_rate": 1.1569410666884181e-05, "loss": 4.3205, "step": 2636 }, { "epoch": 0.6810433884297521, "grad_norm": 1.8726420402526855, "learning_rate": 1.1552306718283811e-05, "loss": 3.7955, "step": 2637 }, { "epoch": 0.681301652892562, "grad_norm": 2.753680467605591, "learning_rate": 1.1535211622405298e-05, "loss": 4.2202, "step": 2638 }, { "epoch": 0.6815599173553719, "grad_norm": 3.1988861560821533, "learning_rate": 1.1518125390502476e-05, "loss": 4.4609, "step": 2639 }, { "epoch": 0.6818181818181818, "grad_norm": 2.6664438247680664, "learning_rate": 1.1501048033823339e-05, "loss": 3.9588, "step": 2640 }, { "epoch": 0.6820764462809917, "grad_norm": 3.4732413291931152, "learning_rate": 1.148397956361007e-05, "loss": 4.2738, "step": 2641 }, { "epoch": 0.6823347107438017, "grad_norm": 2.9504435062408447, "learning_rate": 1.146691999109894e-05, "loss": 4.231, "step": 2642 }, { "epoch": 0.6825929752066116, "grad_norm": 2.54361629486084, "learning_rate": 1.1449869327520416e-05, "loss": 3.8675, "step": 2643 }, { "epoch": 0.6828512396694215, "grad_norm": 3.645799398422241, "learning_rate": 1.1432827584099106e-05, "loss": 3.6667, "step": 2644 }, { "epoch": 0.6831095041322314, "grad_norm": 2.482815742492676, "learning_rate": 1.1415794772053684e-05, "loss": 3.997, "step": 2645 }, { "epoch": 0.6833677685950413, "grad_norm": 2.7791240215301514, "learning_rate": 1.139877090259701e-05, "loss": 4.1609, "step": 2646 }, { "epoch": 0.6836260330578512, "grad_norm": 3.7855193614959717, "learning_rate": 1.1381755986936024e-05, "loss": 3.7392, "step": 2647 }, { "epoch": 0.6838842975206612, "grad_norm": 3.785027027130127, "learning_rate": 1.1364750036271779e-05, "loss": 3.7835, "step": 2648 }, { "epoch": 0.6841425619834711, "grad_norm": 3.5942022800445557, "learning_rate": 1.1347753061799413e-05, "loss": 4.504, "step": 2649 }, { "epoch": 0.684400826446281, "grad_norm": 2.501452684402466, "learning_rate": 1.1330765074708185e-05, "loss": 3.8777, "step": 2650 }, { "epoch": 0.6846590909090909, "grad_norm": 3.525526523590088, "learning_rate": 1.1313786086181416e-05, "loss": 4.4926, "step": 2651 }, { "epoch": 0.6849173553719008, "grad_norm": 1.9708095788955688, "learning_rate": 1.1296816107396497e-05, "loss": 3.8672, "step": 2652 }, { "epoch": 0.6851756198347108, "grad_norm": 2.9574241638183594, "learning_rate": 1.1279855149524912e-05, "loss": 4.3774, "step": 2653 }, { "epoch": 0.6854338842975206, "grad_norm": 2.3730602264404297, "learning_rate": 1.1262903223732186e-05, "loss": 4.0244, "step": 2654 }, { "epoch": 0.6856921487603306, "grad_norm": 3.578773021697998, "learning_rate": 1.1245960341177904e-05, "loss": 4.0832, "step": 2655 }, { "epoch": 0.6859504132231405, "grad_norm": 2.842273473739624, "learning_rate": 1.1229026513015692e-05, "loss": 3.8496, "step": 2656 }, { "epoch": 0.6862086776859504, "grad_norm": 2.6179428100585938, "learning_rate": 1.1212101750393237e-05, "loss": 3.9584, "step": 2657 }, { "epoch": 0.6864669421487604, "grad_norm": 2.9966506958007812, "learning_rate": 1.1195186064452232e-05, "loss": 4.1895, "step": 2658 }, { "epoch": 0.6867252066115702, "grad_norm": 3.519920825958252, "learning_rate": 1.11782794663284e-05, "loss": 4.3676, "step": 2659 }, { "epoch": 0.6869834710743802, "grad_norm": 2.4911084175109863, "learning_rate": 1.1161381967151515e-05, "loss": 4.2437, "step": 2660 }, { "epoch": 0.68724173553719, "grad_norm": 4.073802947998047, "learning_rate": 1.1144493578045296e-05, "loss": 3.8046, "step": 2661 }, { "epoch": 0.6875, "grad_norm": 3.4176578521728516, "learning_rate": 1.1127614310127524e-05, "loss": 4.0824, "step": 2662 }, { "epoch": 0.68775826446281, "grad_norm": 5.045443534851074, "learning_rate": 1.1110744174509952e-05, "loss": 3.8826, "step": 2663 }, { "epoch": 0.6880165289256198, "grad_norm": 2.3134779930114746, "learning_rate": 1.1093883182298306e-05, "loss": 3.9629, "step": 2664 }, { "epoch": 0.6882747933884298, "grad_norm": 4.559762001037598, "learning_rate": 1.1077031344592335e-05, "loss": 3.9495, "step": 2665 }, { "epoch": 0.6885330578512396, "grad_norm": 3.2732841968536377, "learning_rate": 1.1060188672485713e-05, "loss": 3.7758, "step": 2666 }, { "epoch": 0.6887913223140496, "grad_norm": 2.2601165771484375, "learning_rate": 1.104335517706613e-05, "loss": 3.876, "step": 2667 }, { "epoch": 0.6890495867768595, "grad_norm": 3.4805314540863037, "learning_rate": 1.1026530869415172e-05, "loss": 3.6719, "step": 2668 }, { "epoch": 0.6893078512396694, "grad_norm": 2.915435791015625, "learning_rate": 1.1009715760608444e-05, "loss": 3.5883, "step": 2669 }, { "epoch": 0.6895661157024794, "grad_norm": 2.242459774017334, "learning_rate": 1.0992909861715448e-05, "loss": 3.9686, "step": 2670 }, { "epoch": 0.6898243801652892, "grad_norm": 2.608036756515503, "learning_rate": 1.0976113183799633e-05, "loss": 3.8282, "step": 2671 }, { "epoch": 0.6900826446280992, "grad_norm": 3.176844358444214, "learning_rate": 1.0959325737918406e-05, "loss": 3.9589, "step": 2672 }, { "epoch": 0.6903409090909091, "grad_norm": 2.5153696537017822, "learning_rate": 1.0942547535123057e-05, "loss": 4.108, "step": 2673 }, { "epoch": 0.690599173553719, "grad_norm": 2.7261431217193604, "learning_rate": 1.0925778586458812e-05, "loss": 3.8702, "step": 2674 }, { "epoch": 0.6908574380165289, "grad_norm": 2.7979652881622314, "learning_rate": 1.0909018902964795e-05, "loss": 3.9767, "step": 2675 }, { "epoch": 0.6911157024793388, "grad_norm": 3.7434964179992676, "learning_rate": 1.0892268495674052e-05, "loss": 4.6653, "step": 2676 }, { "epoch": 0.6913739669421488, "grad_norm": 2.5525646209716797, "learning_rate": 1.0875527375613501e-05, "loss": 3.6327, "step": 2677 }, { "epoch": 0.6916322314049587, "grad_norm": 3.55446720123291, "learning_rate": 1.0858795553803939e-05, "loss": 3.5364, "step": 2678 }, { "epoch": 0.6918904958677686, "grad_norm": 2.0748980045318604, "learning_rate": 1.0842073041260076e-05, "loss": 3.9842, "step": 2679 }, { "epoch": 0.6921487603305785, "grad_norm": 2.729785919189453, "learning_rate": 1.0825359848990465e-05, "loss": 4.0676, "step": 2680 }, { "epoch": 0.6924070247933884, "grad_norm": 2.607487440109253, "learning_rate": 1.080865598799753e-05, "loss": 4.0134, "step": 2681 }, { "epoch": 0.6926652892561983, "grad_norm": 2.7261645793914795, "learning_rate": 1.0791961469277545e-05, "loss": 4.3555, "step": 2682 }, { "epoch": 0.6929235537190083, "grad_norm": 2.952617883682251, "learning_rate": 1.077527630382067e-05, "loss": 4.2955, "step": 2683 }, { "epoch": 0.6931818181818182, "grad_norm": 1.8297278881072998, "learning_rate": 1.0758600502610844e-05, "loss": 4.049, "step": 2684 }, { "epoch": 0.6934400826446281, "grad_norm": 2.67610502243042, "learning_rate": 1.0741934076625895e-05, "loss": 4.1221, "step": 2685 }, { "epoch": 0.693698347107438, "grad_norm": 4.351108074188232, "learning_rate": 1.0725277036837477e-05, "loss": 4.4334, "step": 2686 }, { "epoch": 0.6939566115702479, "grad_norm": 2.680431365966797, "learning_rate": 1.0708629394211023e-05, "loss": 4.0431, "step": 2687 }, { "epoch": 0.6942148760330579, "grad_norm": 3.294079065322876, "learning_rate": 1.0691991159705822e-05, "loss": 3.8739, "step": 2688 }, { "epoch": 0.6944731404958677, "grad_norm": 3.8998804092407227, "learning_rate": 1.0675362344274953e-05, "loss": 4.0055, "step": 2689 }, { "epoch": 0.6947314049586777, "grad_norm": 2.190553665161133, "learning_rate": 1.0658742958865294e-05, "loss": 3.9096, "step": 2690 }, { "epoch": 0.6949896694214877, "grad_norm": 3.85815167427063, "learning_rate": 1.0642133014417505e-05, "loss": 3.5275, "step": 2691 }, { "epoch": 0.6952479338842975, "grad_norm": 3.3055806159973145, "learning_rate": 1.0625532521866063e-05, "loss": 4.0906, "step": 2692 }, { "epoch": 0.6955061983471075, "grad_norm": 3.026310682296753, "learning_rate": 1.0608941492139193e-05, "loss": 3.8624, "step": 2693 }, { "epoch": 0.6957644628099173, "grad_norm": 4.900149822235107, "learning_rate": 1.0592359936158889e-05, "loss": 3.5802, "step": 2694 }, { "epoch": 0.6960227272727273, "grad_norm": 3.530491590499878, "learning_rate": 1.0575787864840942e-05, "loss": 3.6953, "step": 2695 }, { "epoch": 0.6962809917355371, "grad_norm": 4.973317623138428, "learning_rate": 1.055922528909486e-05, "loss": 3.8084, "step": 2696 }, { "epoch": 0.6965392561983471, "grad_norm": 2.432056427001953, "learning_rate": 1.0542672219823923e-05, "loss": 3.7735, "step": 2697 }, { "epoch": 0.6967975206611571, "grad_norm": 3.077096462249756, "learning_rate": 1.0526128667925134e-05, "loss": 4.2722, "step": 2698 }, { "epoch": 0.6970557851239669, "grad_norm": 2.0571391582489014, "learning_rate": 1.050959464428926e-05, "loss": 4.3764, "step": 2699 }, { "epoch": 0.6973140495867769, "grad_norm": 2.710184335708618, "learning_rate": 1.0493070159800774e-05, "loss": 3.674, "step": 2700 }, { "epoch": 0.6975723140495868, "grad_norm": 2.7000627517700195, "learning_rate": 1.0476555225337861e-05, "loss": 4.0173, "step": 2701 }, { "epoch": 0.6978305785123967, "grad_norm": 3.0661556720733643, "learning_rate": 1.046004985177245e-05, "loss": 3.8321, "step": 2702 }, { "epoch": 0.6980888429752066, "grad_norm": 2.705256462097168, "learning_rate": 1.0443554049970147e-05, "loss": 3.9843, "step": 2703 }, { "epoch": 0.6983471074380165, "grad_norm": 2.0381898880004883, "learning_rate": 1.042706783079027e-05, "loss": 3.9588, "step": 2704 }, { "epoch": 0.6986053719008265, "grad_norm": 5.959049224853516, "learning_rate": 1.0410591205085818e-05, "loss": 4.3057, "step": 2705 }, { "epoch": 0.6988636363636364, "grad_norm": 3.129629373550415, "learning_rate": 1.03941241837035e-05, "loss": 4.0441, "step": 2706 }, { "epoch": 0.6991219008264463, "grad_norm": 2.477238893508911, "learning_rate": 1.0377666777483674e-05, "loss": 3.7081, "step": 2707 }, { "epoch": 0.6993801652892562, "grad_norm": 2.8885657787323, "learning_rate": 1.0361218997260374e-05, "loss": 4.4206, "step": 2708 }, { "epoch": 0.6996384297520661, "grad_norm": 2.6590566635131836, "learning_rate": 1.034478085386133e-05, "loss": 3.8506, "step": 2709 }, { "epoch": 0.699896694214876, "grad_norm": 2.8292393684387207, "learning_rate": 1.0328352358107859e-05, "loss": 3.8055, "step": 2710 }, { "epoch": 0.700154958677686, "grad_norm": 3.6792399883270264, "learning_rate": 1.0311933520815001e-05, "loss": 3.9371, "step": 2711 }, { "epoch": 0.7004132231404959, "grad_norm": 2.537079334259033, "learning_rate": 1.0295524352791392e-05, "loss": 3.7806, "step": 2712 }, { "epoch": 0.7006714876033058, "grad_norm": 2.662325143814087, "learning_rate": 1.027912486483931e-05, "loss": 4.2886, "step": 2713 }, { "epoch": 0.7009297520661157, "grad_norm": 2.9822397232055664, "learning_rate": 1.0262735067754684e-05, "loss": 3.9092, "step": 2714 }, { "epoch": 0.7011880165289256, "grad_norm": 3.5473685264587402, "learning_rate": 1.0246354972327035e-05, "loss": 4.4783, "step": 2715 }, { "epoch": 0.7014462809917356, "grad_norm": 3.186060905456543, "learning_rate": 1.022998458933951e-05, "loss": 3.5441, "step": 2716 }, { "epoch": 0.7017045454545454, "grad_norm": 4.925805568695068, "learning_rate": 1.0213623929568852e-05, "loss": 4.0034, "step": 2717 }, { "epoch": 0.7019628099173554, "grad_norm": 2.2811272144317627, "learning_rate": 1.0197273003785428e-05, "loss": 3.975, "step": 2718 }, { "epoch": 0.7022210743801653, "grad_norm": 3.3083603382110596, "learning_rate": 1.0180931822753172e-05, "loss": 4.3321, "step": 2719 }, { "epoch": 0.7024793388429752, "grad_norm": 3.9744820594787598, "learning_rate": 1.0164600397229603e-05, "loss": 3.8559, "step": 2720 }, { "epoch": 0.7027376033057852, "grad_norm": 3.650798797607422, "learning_rate": 1.0148278737965845e-05, "loss": 4.0749, "step": 2721 }, { "epoch": 0.702995867768595, "grad_norm": 2.589496374130249, "learning_rate": 1.0131966855706562e-05, "loss": 3.646, "step": 2722 }, { "epoch": 0.703254132231405, "grad_norm": 3.6694583892822266, "learning_rate": 1.0115664761189997e-05, "loss": 3.2928, "step": 2723 }, { "epoch": 0.7035123966942148, "grad_norm": 2.913130283355713, "learning_rate": 1.0099372465147935e-05, "loss": 3.9144, "step": 2724 }, { "epoch": 0.7037706611570248, "grad_norm": 2.6197102069854736, "learning_rate": 1.0083089978305754e-05, "loss": 3.895, "step": 2725 }, { "epoch": 0.7040289256198347, "grad_norm": 2.9186434745788574, "learning_rate": 1.0066817311382301e-05, "loss": 3.7934, "step": 2726 }, { "epoch": 0.7042871900826446, "grad_norm": 5.1971235275268555, "learning_rate": 1.0050554475090029e-05, "loss": 3.8502, "step": 2727 }, { "epoch": 0.7045454545454546, "grad_norm": 2.140105962753296, "learning_rate": 1.0034301480134876e-05, "loss": 4.0322, "step": 2728 }, { "epoch": 0.7048037190082644, "grad_norm": 2.9154953956604004, "learning_rate": 1.0018058337216327e-05, "loss": 3.9602, "step": 2729 }, { "epoch": 0.7050619834710744, "grad_norm": 2.372314453125, "learning_rate": 1.0001825057027367e-05, "loss": 4.0713, "step": 2730 }, { "epoch": 0.7053202479338843, "grad_norm": 2.7284188270568848, "learning_rate": 9.98560165025448e-06, "loss": 3.6848, "step": 2731 }, { "epoch": 0.7055785123966942, "grad_norm": 7.62946891784668, "learning_rate": 9.969388127577692e-06, "loss": 3.5409, "step": 2732 }, { "epoch": 0.7058367768595041, "grad_norm": 2.9538321495056152, "learning_rate": 9.95318449967045e-06, "loss": 3.6957, "step": 2733 }, { "epoch": 0.706095041322314, "grad_norm": 1.747124195098877, "learning_rate": 9.936990777199764e-06, "loss": 4.4816, "step": 2734 }, { "epoch": 0.706353305785124, "grad_norm": 4.139410972595215, "learning_rate": 9.920806970826072e-06, "loss": 3.7388, "step": 2735 }, { "epoch": 0.7066115702479339, "grad_norm": 3.2198257446289062, "learning_rate": 9.904633091203297e-06, "loss": 4.103, "step": 2736 }, { "epoch": 0.7068698347107438, "grad_norm": 3.341402530670166, "learning_rate": 9.888469148978844e-06, "loss": 4.0397, "step": 2737 }, { "epoch": 0.7071280991735537, "grad_norm": 4.698668003082275, "learning_rate": 9.872315154793556e-06, "loss": 4.2754, "step": 2738 }, { "epoch": 0.7073863636363636, "grad_norm": 3.6700258255004883, "learning_rate": 9.856171119281737e-06, "loss": 4.3473, "step": 2739 }, { "epoch": 0.7076446280991735, "grad_norm": 1.9766031503677368, "learning_rate": 9.840037053071119e-06, "loss": 4.4904, "step": 2740 }, { "epoch": 0.7079028925619835, "grad_norm": 3.3070437908172607, "learning_rate": 9.823912966782903e-06, "loss": 3.7492, "step": 2741 }, { "epoch": 0.7081611570247934, "grad_norm": 1.7868192195892334, "learning_rate": 9.807798871031695e-06, "loss": 3.7822, "step": 2742 }, { "epoch": 0.7084194214876033, "grad_norm": 3.1456458568573, "learning_rate": 9.791694776425517e-06, "loss": 3.8543, "step": 2743 }, { "epoch": 0.7086776859504132, "grad_norm": 3.1163251399993896, "learning_rate": 9.775600693565842e-06, "loss": 3.9635, "step": 2744 }, { "epoch": 0.7089359504132231, "grad_norm": 2.935246467590332, "learning_rate": 9.75951663304752e-06, "loss": 4.0928, "step": 2745 }, { "epoch": 0.7091942148760331, "grad_norm": 2.4569039344787598, "learning_rate": 9.743442605458814e-06, "loss": 4.4622, "step": 2746 }, { "epoch": 0.7094524793388429, "grad_norm": 2.9804232120513916, "learning_rate": 9.727378621381375e-06, "loss": 3.9581, "step": 2747 }, { "epoch": 0.7097107438016529, "grad_norm": 3.239934206008911, "learning_rate": 9.711324691390278e-06, "loss": 3.9567, "step": 2748 }, { "epoch": 0.7099690082644629, "grad_norm": 3.805147886276245, "learning_rate": 9.69528082605391e-06, "loss": 4.149, "step": 2749 }, { "epoch": 0.7102272727272727, "grad_norm": 2.269317388534546, "learning_rate": 9.679247035934094e-06, "loss": 3.8249, "step": 2750 }, { "epoch": 0.7104855371900827, "grad_norm": 2.5298852920532227, "learning_rate": 9.663223331586018e-06, "loss": 4.2943, "step": 2751 }, { "epoch": 0.7107438016528925, "grad_norm": 2.110125780105591, "learning_rate": 9.647209723558179e-06, "loss": 3.8235, "step": 2752 }, { "epoch": 0.7110020661157025, "grad_norm": 2.3101372718811035, "learning_rate": 9.63120622239248e-06, "loss": 3.7848, "step": 2753 }, { "epoch": 0.7112603305785123, "grad_norm": 3.2274248600006104, "learning_rate": 9.61521283862415e-06, "loss": 3.9159, "step": 2754 }, { "epoch": 0.7115185950413223, "grad_norm": 3.9169423580169678, "learning_rate": 9.599229582781752e-06, "loss": 4.2267, "step": 2755 }, { "epoch": 0.7117768595041323, "grad_norm": 2.475529193878174, "learning_rate": 9.583256465387186e-06, "loss": 3.9515, "step": 2756 }, { "epoch": 0.7120351239669421, "grad_norm": 2.475114107131958, "learning_rate": 9.567293496955692e-06, "loss": 3.9071, "step": 2757 }, { "epoch": 0.7122933884297521, "grad_norm": 3.341503858566284, "learning_rate": 9.551340687995815e-06, "loss": 3.6707, "step": 2758 }, { "epoch": 0.712551652892562, "grad_norm": 3.012789487838745, "learning_rate": 9.5353980490094e-06, "loss": 4.4631, "step": 2759 }, { "epoch": 0.7128099173553719, "grad_norm": 3.1069223880767822, "learning_rate": 9.51946559049163e-06, "loss": 3.9969, "step": 2760 }, { "epoch": 0.7130681818181818, "grad_norm": 1.8301185369491577, "learning_rate": 9.503543322930961e-06, "loss": 3.8735, "step": 2761 }, { "epoch": 0.7133264462809917, "grad_norm": 3.490328550338745, "learning_rate": 9.487631256809146e-06, "loss": 4.0614, "step": 2762 }, { "epoch": 0.7135847107438017, "grad_norm": 3.1834073066711426, "learning_rate": 9.47172940260121e-06, "loss": 3.9606, "step": 2763 }, { "epoch": 0.7138429752066116, "grad_norm": 2.85671329498291, "learning_rate": 9.455837770775489e-06, "loss": 4.2192, "step": 2764 }, { "epoch": 0.7141012396694215, "grad_norm": 3.09222412109375, "learning_rate": 9.439956371793556e-06, "loss": 4.0246, "step": 2765 }, { "epoch": 0.7143595041322314, "grad_norm": 3.9685680866241455, "learning_rate": 9.424085216110259e-06, "loss": 4.0327, "step": 2766 }, { "epoch": 0.7146177685950413, "grad_norm": 5.057885646820068, "learning_rate": 9.408224314173719e-06, "loss": 4.2006, "step": 2767 }, { "epoch": 0.7148760330578512, "grad_norm": 3.3187103271484375, "learning_rate": 9.392373676425278e-06, "loss": 3.7493, "step": 2768 }, { "epoch": 0.7151342975206612, "grad_norm": 2.053683042526245, "learning_rate": 9.376533313299543e-06, "loss": 4.589, "step": 2769 }, { "epoch": 0.7153925619834711, "grad_norm": 3.7388951778411865, "learning_rate": 9.360703235224332e-06, "loss": 4.4191, "step": 2770 }, { "epoch": 0.715650826446281, "grad_norm": 1.9177273511886597, "learning_rate": 9.344883452620733e-06, "loss": 3.8479, "step": 2771 }, { "epoch": 0.7159090909090909, "grad_norm": 2.5572664737701416, "learning_rate": 9.329073975903025e-06, "loss": 3.8047, "step": 2772 }, { "epoch": 0.7161673553719008, "grad_norm": 2.089822292327881, "learning_rate": 9.313274815478698e-06, "loss": 4.2222, "step": 2773 }, { "epoch": 0.7164256198347108, "grad_norm": 4.074653625488281, "learning_rate": 9.297485981748488e-06, "loss": 3.4812, "step": 2774 }, { "epoch": 0.7166838842975206, "grad_norm": 2.694246768951416, "learning_rate": 9.281707485106278e-06, "loss": 3.5305, "step": 2775 }, { "epoch": 0.7169421487603306, "grad_norm": 2.2471559047698975, "learning_rate": 9.265939335939197e-06, "loss": 4.4153, "step": 2776 }, { "epoch": 0.7172004132231405, "grad_norm": 2.419767141342163, "learning_rate": 9.250181544627536e-06, "loss": 4.3084, "step": 2777 }, { "epoch": 0.7174586776859504, "grad_norm": 4.478872776031494, "learning_rate": 9.234434121544766e-06, "loss": 3.685, "step": 2778 }, { "epoch": 0.7177169421487604, "grad_norm": 4.175504684448242, "learning_rate": 9.21869707705755e-06, "loss": 4.508, "step": 2779 }, { "epoch": 0.7179752066115702, "grad_norm": 3.709007501602173, "learning_rate": 9.202970421525703e-06, "loss": 4.0597, "step": 2780 }, { "epoch": 0.7182334710743802, "grad_norm": 2.9699652194976807, "learning_rate": 9.187254165302203e-06, "loss": 3.9061, "step": 2781 }, { "epoch": 0.71849173553719, "grad_norm": 2.7298271656036377, "learning_rate": 9.17154831873318e-06, "loss": 3.8109, "step": 2782 }, { "epoch": 0.71875, "grad_norm": 3.256723165512085, "learning_rate": 9.15585289215793e-06, "loss": 4.3375, "step": 2783 }, { "epoch": 0.71900826446281, "grad_norm": 3.2058026790618896, "learning_rate": 9.140167895908867e-06, "loss": 3.8329, "step": 2784 }, { "epoch": 0.7192665289256198, "grad_norm": 2.626044750213623, "learning_rate": 9.124493340311537e-06, "loss": 4.193, "step": 2785 }, { "epoch": 0.7195247933884298, "grad_norm": 2.985292673110962, "learning_rate": 9.108829235684647e-06, "loss": 4.0576, "step": 2786 }, { "epoch": 0.7197830578512396, "grad_norm": 3.5671000480651855, "learning_rate": 9.093175592339984e-06, "loss": 3.8613, "step": 2787 }, { "epoch": 0.7200413223140496, "grad_norm": 3.314365863800049, "learning_rate": 9.077532420582463e-06, "loss": 4.1902, "step": 2788 }, { "epoch": 0.7202995867768595, "grad_norm": 2.4483914375305176, "learning_rate": 9.061899730710105e-06, "loss": 3.3313, "step": 2789 }, { "epoch": 0.7205578512396694, "grad_norm": 3.071042060852051, "learning_rate": 9.04627753301405e-06, "loss": 3.9688, "step": 2790 }, { "epoch": 0.7208161157024794, "grad_norm": 3.5377843379974365, "learning_rate": 9.030665837778488e-06, "loss": 3.9984, "step": 2791 }, { "epoch": 0.7210743801652892, "grad_norm": 4.139796257019043, "learning_rate": 9.015064655280728e-06, "loss": 4.231, "step": 2792 }, { "epoch": 0.7213326446280992, "grad_norm": 2.38219952583313, "learning_rate": 8.999473995791164e-06, "loss": 4.457, "step": 2793 }, { "epoch": 0.7215909090909091, "grad_norm": 2.629525661468506, "learning_rate": 8.983893869573242e-06, "loss": 4.2476, "step": 2794 }, { "epoch": 0.721849173553719, "grad_norm": 3.0038511753082275, "learning_rate": 8.968324286883479e-06, "loss": 4.1569, "step": 2795 }, { "epoch": 0.7221074380165289, "grad_norm": 2.142561197280884, "learning_rate": 8.952765257971444e-06, "loss": 4.3435, "step": 2796 }, { "epoch": 0.7223657024793388, "grad_norm": 2.0467820167541504, "learning_rate": 8.937216793079798e-06, "loss": 4.2617, "step": 2797 }, { "epoch": 0.7226239669421488, "grad_norm": 2.7581701278686523, "learning_rate": 8.921678902444175e-06, "loss": 3.6297, "step": 2798 }, { "epoch": 0.7228822314049587, "grad_norm": 4.400155067443848, "learning_rate": 8.906151596293313e-06, "loss": 3.1972, "step": 2799 }, { "epoch": 0.7231404958677686, "grad_norm": 2.404609203338623, "learning_rate": 8.890634884848978e-06, "loss": 4.0533, "step": 2800 }, { "epoch": 0.7233987603305785, "grad_norm": 3.198108196258545, "learning_rate": 8.875128778325903e-06, "loss": 4.1489, "step": 2801 }, { "epoch": 0.7236570247933884, "grad_norm": 1.8458307981491089, "learning_rate": 8.859633286931909e-06, "loss": 4.227, "step": 2802 }, { "epoch": 0.7239152892561983, "grad_norm": 3.262338638305664, "learning_rate": 8.844148420867787e-06, "loss": 3.5004, "step": 2803 }, { "epoch": 0.7241735537190083, "grad_norm": 1.859688401222229, "learning_rate": 8.828674190327352e-06, "loss": 3.9922, "step": 2804 }, { "epoch": 0.7244318181818182, "grad_norm": 2.42229962348938, "learning_rate": 8.813210605497396e-06, "loss": 4.2153, "step": 2805 }, { "epoch": 0.7246900826446281, "grad_norm": 2.694718837738037, "learning_rate": 8.79775767655774e-06, "loss": 4.1792, "step": 2806 }, { "epoch": 0.724948347107438, "grad_norm": 2.694899082183838, "learning_rate": 8.782315413681157e-06, "loss": 3.9188, "step": 2807 }, { "epoch": 0.7252066115702479, "grad_norm": 3.030303716659546, "learning_rate": 8.766883827033404e-06, "loss": 4.3722, "step": 2808 }, { "epoch": 0.7254648760330579, "grad_norm": 2.765895366668701, "learning_rate": 8.751462926773229e-06, "loss": 4.1822, "step": 2809 }, { "epoch": 0.7257231404958677, "grad_norm": 1.9684752225875854, "learning_rate": 8.736052723052324e-06, "loss": 3.9263, "step": 2810 }, { "epoch": 0.7259814049586777, "grad_norm": 3.3237712383270264, "learning_rate": 8.720653226015345e-06, "loss": 3.8316, "step": 2811 }, { "epoch": 0.7262396694214877, "grad_norm": 1.7968113422393799, "learning_rate": 8.705264445799899e-06, "loss": 3.9379, "step": 2812 }, { "epoch": 0.7264979338842975, "grad_norm": 4.038061618804932, "learning_rate": 8.689886392536553e-06, "loss": 4.0439, "step": 2813 }, { "epoch": 0.7267561983471075, "grad_norm": 2.677398920059204, "learning_rate": 8.674519076348792e-06, "loss": 3.7576, "step": 2814 }, { "epoch": 0.7270144628099173, "grad_norm": 3.34863543510437, "learning_rate": 8.65916250735303e-06, "loss": 4.1445, "step": 2815 }, { "epoch": 0.7272727272727273, "grad_norm": 1.7488305568695068, "learning_rate": 8.643816695658646e-06, "loss": 3.9694, "step": 2816 }, { "epoch": 0.7275309917355371, "grad_norm": 4.32069730758667, "learning_rate": 8.628481651367876e-06, "loss": 4.51, "step": 2817 }, { "epoch": 0.7277892561983471, "grad_norm": 3.9878151416778564, "learning_rate": 8.613157384575918e-06, "loss": 4.4075, "step": 2818 }, { "epoch": 0.7280475206611571, "grad_norm": 1.9938222169876099, "learning_rate": 8.597843905370853e-06, "loss": 4.3949, "step": 2819 }, { "epoch": 0.7283057851239669, "grad_norm": 4.798807621002197, "learning_rate": 8.582541223833657e-06, "loss": 3.7457, "step": 2820 }, { "epoch": 0.7285640495867769, "grad_norm": 3.021308660507202, "learning_rate": 8.56724935003822e-06, "loss": 4.4535, "step": 2821 }, { "epoch": 0.7288223140495868, "grad_norm": 5.02640962600708, "learning_rate": 8.551968294051294e-06, "loss": 5.1243, "step": 2822 }, { "epoch": 0.7290805785123967, "grad_norm": 2.92314076423645, "learning_rate": 8.536698065932516e-06, "loss": 3.7552, "step": 2823 }, { "epoch": 0.7293388429752066, "grad_norm": 4.572329521179199, "learning_rate": 8.521438675734395e-06, "loss": 4.3603, "step": 2824 }, { "epoch": 0.7295971074380165, "grad_norm": 6.3205132484436035, "learning_rate": 8.506190133502314e-06, "loss": 3.0527, "step": 2825 }, { "epoch": 0.7298553719008265, "grad_norm": 2.4753618240356445, "learning_rate": 8.49095244927451e-06, "loss": 4.0945, "step": 2826 }, { "epoch": 0.7301136363636364, "grad_norm": 5.73077917098999, "learning_rate": 8.475725633082055e-06, "loss": 3.292, "step": 2827 }, { "epoch": 0.7303719008264463, "grad_norm": 3.2941153049468994, "learning_rate": 8.460509694948897e-06, "loss": 4.0776, "step": 2828 }, { "epoch": 0.7306301652892562, "grad_norm": 1.8667281866073608, "learning_rate": 8.445304644891805e-06, "loss": 4.023, "step": 2829 }, { "epoch": 0.7308884297520661, "grad_norm": 2.2607996463775635, "learning_rate": 8.43011049292038e-06, "loss": 3.4181, "step": 2830 }, { "epoch": 0.731146694214876, "grad_norm": 4.8345537185668945, "learning_rate": 8.414927249037038e-06, "loss": 3.7896, "step": 2831 }, { "epoch": 0.731404958677686, "grad_norm": 3.246988296508789, "learning_rate": 8.399754923237049e-06, "loss": 3.7587, "step": 2832 }, { "epoch": 0.7316632231404959, "grad_norm": 3.6236793994903564, "learning_rate": 8.384593525508467e-06, "loss": 3.7976, "step": 2833 }, { "epoch": 0.7319214876033058, "grad_norm": 2.8460500240325928, "learning_rate": 8.369443065832147e-06, "loss": 4.5383, "step": 2834 }, { "epoch": 0.7321797520661157, "grad_norm": 4.4173173904418945, "learning_rate": 8.354303554181776e-06, "loss": 4.1855, "step": 2835 }, { "epoch": 0.7324380165289256, "grad_norm": 6.692438125610352, "learning_rate": 8.3391750005238e-06, "loss": 3.403, "step": 2836 }, { "epoch": 0.7326962809917356, "grad_norm": 3.978307008743286, "learning_rate": 8.324057414817465e-06, "loss": 4.4993, "step": 2837 }, { "epoch": 0.7329545454545454, "grad_norm": 2.900477647781372, "learning_rate": 8.308950807014792e-06, "loss": 4.0438, "step": 2838 }, { "epoch": 0.7332128099173554, "grad_norm": 2.5308454036712646, "learning_rate": 8.293855187060601e-06, "loss": 4.0387, "step": 2839 }, { "epoch": 0.7334710743801653, "grad_norm": 3.7543492317199707, "learning_rate": 8.278770564892424e-06, "loss": 3.8354, "step": 2840 }, { "epoch": 0.7337293388429752, "grad_norm": 2.631798267364502, "learning_rate": 8.263696950440602e-06, "loss": 4.1094, "step": 2841 }, { "epoch": 0.7339876033057852, "grad_norm": 2.788461446762085, "learning_rate": 8.248634353628231e-06, "loss": 4.3609, "step": 2842 }, { "epoch": 0.734245867768595, "grad_norm": 3.6363234519958496, "learning_rate": 8.233582784371108e-06, "loss": 4.1183, "step": 2843 }, { "epoch": 0.734504132231405, "grad_norm": 3.1176979541778564, "learning_rate": 8.218542252577816e-06, "loss": 4.3567, "step": 2844 }, { "epoch": 0.7347623966942148, "grad_norm": 2.625563144683838, "learning_rate": 8.20351276814965e-06, "loss": 3.947, "step": 2845 }, { "epoch": 0.7350206611570248, "grad_norm": 2.7010715007781982, "learning_rate": 8.188494340980634e-06, "loss": 3.2092, "step": 2846 }, { "epoch": 0.7352789256198347, "grad_norm": 1.5932724475860596, "learning_rate": 8.173486980957515e-06, "loss": 3.8409, "step": 2847 }, { "epoch": 0.7355371900826446, "grad_norm": 1.6282655000686646, "learning_rate": 8.158490697959767e-06, "loss": 3.3088, "step": 2848 }, { "epoch": 0.7357954545454546, "grad_norm": 3.3130621910095215, "learning_rate": 8.143505501859552e-06, "loss": 3.9805, "step": 2849 }, { "epoch": 0.7360537190082644, "grad_norm": 2.3704371452331543, "learning_rate": 8.128531402521739e-06, "loss": 3.8563, "step": 2850 }, { "epoch": 0.7363119834710744, "grad_norm": 3.9821114540100098, "learning_rate": 8.113568409803907e-06, "loss": 3.6143, "step": 2851 }, { "epoch": 0.7365702479338843, "grad_norm": 3.532677173614502, "learning_rate": 8.098616533556302e-06, "loss": 3.7226, "step": 2852 }, { "epoch": 0.7368285123966942, "grad_norm": 2.7446815967559814, "learning_rate": 8.083675783621865e-06, "loss": 3.7568, "step": 2853 }, { "epoch": 0.7370867768595041, "grad_norm": 2.796809434890747, "learning_rate": 8.0687461698362e-06, "loss": 4.1358, "step": 2854 }, { "epoch": 0.737345041322314, "grad_norm": 3.1896941661834717, "learning_rate": 8.053827702027611e-06, "loss": 3.8928, "step": 2855 }, { "epoch": 0.737603305785124, "grad_norm": 3.4477059841156006, "learning_rate": 8.03892039001701e-06, "loss": 3.8229, "step": 2856 }, { "epoch": 0.7378615702479339, "grad_norm": 4.527946949005127, "learning_rate": 8.024024243618014e-06, "loss": 3.4463, "step": 2857 }, { "epoch": 0.7381198347107438, "grad_norm": 2.731912612915039, "learning_rate": 8.009139272636884e-06, "loss": 4.0138, "step": 2858 }, { "epoch": 0.7383780991735537, "grad_norm": 2.043689489364624, "learning_rate": 7.994265486872502e-06, "loss": 3.7996, "step": 2859 }, { "epoch": 0.7386363636363636, "grad_norm": 2.437562942504883, "learning_rate": 7.979402896116395e-06, "loss": 3.8753, "step": 2860 }, { "epoch": 0.7388946280991735, "grad_norm": 3.570556879043579, "learning_rate": 7.96455151015272e-06, "loss": 4.7104, "step": 2861 }, { "epoch": 0.7391528925619835, "grad_norm": 4.341465950012207, "learning_rate": 7.94971133875828e-06, "loss": 4.0097, "step": 2862 }, { "epoch": 0.7394111570247934, "grad_norm": 2.8932065963745117, "learning_rate": 7.934882391702453e-06, "loss": 3.8174, "step": 2863 }, { "epoch": 0.7396694214876033, "grad_norm": 2.4075875282287598, "learning_rate": 7.920064678747256e-06, "loss": 4.2335, "step": 2864 }, { "epoch": 0.7399276859504132, "grad_norm": 2.966379165649414, "learning_rate": 7.905258209647326e-06, "loss": 4.4065, "step": 2865 }, { "epoch": 0.7401859504132231, "grad_norm": 2.0135457515716553, "learning_rate": 7.890462994149847e-06, "loss": 3.9855, "step": 2866 }, { "epoch": 0.7404442148760331, "grad_norm": 3.2935009002685547, "learning_rate": 7.875679041994649e-06, "loss": 4.4806, "step": 2867 }, { "epoch": 0.7407024793388429, "grad_norm": 2.590937852859497, "learning_rate": 7.860906362914117e-06, "loss": 3.6914, "step": 2868 }, { "epoch": 0.7409607438016529, "grad_norm": 4.472278594970703, "learning_rate": 7.846144966633215e-06, "loss": 3.9269, "step": 2869 }, { "epoch": 0.7412190082644629, "grad_norm": 3.843050479888916, "learning_rate": 7.831394862869485e-06, "loss": 3.0798, "step": 2870 }, { "epoch": 0.7414772727272727, "grad_norm": 2.884528875350952, "learning_rate": 7.816656061333046e-06, "loss": 4.4377, "step": 2871 }, { "epoch": 0.7417355371900827, "grad_norm": 2.5424461364746094, "learning_rate": 7.801928571726564e-06, "loss": 4.059, "step": 2872 }, { "epoch": 0.7419938016528925, "grad_norm": 2.411454677581787, "learning_rate": 7.787212403745251e-06, "loss": 4.4626, "step": 2873 }, { "epoch": 0.7422520661157025, "grad_norm": 4.411651611328125, "learning_rate": 7.772507567076889e-06, "loss": 3.7803, "step": 2874 }, { "epoch": 0.7425103305785123, "grad_norm": 3.540944814682007, "learning_rate": 7.757814071401781e-06, "loss": 4.3634, "step": 2875 }, { "epoch": 0.7427685950413223, "grad_norm": 4.402584075927734, "learning_rate": 7.743131926392772e-06, "loss": 3.5945, "step": 2876 }, { "epoch": 0.7430268595041323, "grad_norm": 4.173943996429443, "learning_rate": 7.728461141715223e-06, "loss": 3.7703, "step": 2877 }, { "epoch": 0.7432851239669421, "grad_norm": 2.9813954830169678, "learning_rate": 7.713801727027042e-06, "loss": 4.381, "step": 2878 }, { "epoch": 0.7435433884297521, "grad_norm": 2.4940710067749023, "learning_rate": 7.69915369197863e-06, "loss": 4.1053, "step": 2879 }, { "epoch": 0.743801652892562, "grad_norm": 3.323338031768799, "learning_rate": 7.684517046212897e-06, "loss": 3.7828, "step": 2880 }, { "epoch": 0.7440599173553719, "grad_norm": 2.3237931728363037, "learning_rate": 7.669891799365283e-06, "loss": 3.9812, "step": 2881 }, { "epoch": 0.7443181818181818, "grad_norm": 2.377835750579834, "learning_rate": 7.65527796106367e-06, "loss": 3.8157, "step": 2882 }, { "epoch": 0.7445764462809917, "grad_norm": 4.320372104644775, "learning_rate": 7.640675540928487e-06, "loss": 4.4344, "step": 2883 }, { "epoch": 0.7448347107438017, "grad_norm": 2.2802958488464355, "learning_rate": 7.6260845485726154e-06, "loss": 4.3565, "step": 2884 }, { "epoch": 0.7450929752066116, "grad_norm": 2.6516008377075195, "learning_rate": 7.611504993601409e-06, "loss": 3.8225, "step": 2885 }, { "epoch": 0.7453512396694215, "grad_norm": 3.764509439468384, "learning_rate": 7.596936885612721e-06, "loss": 4.3427, "step": 2886 }, { "epoch": 0.7456095041322314, "grad_norm": 2.6777844429016113, "learning_rate": 7.5823802341968425e-06, "loss": 3.6016, "step": 2887 }, { "epoch": 0.7458677685950413, "grad_norm": 3.2193987369537354, "learning_rate": 7.56783504893653e-06, "loss": 4.6108, "step": 2888 }, { "epoch": 0.7461260330578512, "grad_norm": 5.959073543548584, "learning_rate": 7.5533013394069875e-06, "loss": 3.4332, "step": 2889 }, { "epoch": 0.7463842975206612, "grad_norm": 2.678541660308838, "learning_rate": 7.538779115175884e-06, "loss": 3.629, "step": 2890 }, { "epoch": 0.7466425619834711, "grad_norm": 2.813674211502075, "learning_rate": 7.524268385803307e-06, "loss": 3.9767, "step": 2891 }, { "epoch": 0.746900826446281, "grad_norm": 2.7704219818115234, "learning_rate": 7.509769160841773e-06, "loss": 3.3167, "step": 2892 }, { "epoch": 0.7471590909090909, "grad_norm": 6.516298294067383, "learning_rate": 7.495281449836253e-06, "loss": 3.3152, "step": 2893 }, { "epoch": 0.7474173553719008, "grad_norm": 1.7606974840164185, "learning_rate": 7.4808052623241115e-06, "loss": 4.1398, "step": 2894 }, { "epoch": 0.7476756198347108, "grad_norm": 3.813417673110962, "learning_rate": 7.46634060783514e-06, "loss": 4.0955, "step": 2895 }, { "epoch": 0.7479338842975206, "grad_norm": 3.957828998565674, "learning_rate": 7.451887495891519e-06, "loss": 4.488, "step": 2896 }, { "epoch": 0.7481921487603306, "grad_norm": 3.6925883293151855, "learning_rate": 7.437445936007867e-06, "loss": 3.9314, "step": 2897 }, { "epoch": 0.7484504132231405, "grad_norm": 3.617288112640381, "learning_rate": 7.423015937691167e-06, "loss": 4.1248, "step": 2898 }, { "epoch": 0.7487086776859504, "grad_norm": 3.028498411178589, "learning_rate": 7.40859751044079e-06, "loss": 4.1414, "step": 2899 }, { "epoch": 0.7489669421487604, "grad_norm": 2.7381324768066406, "learning_rate": 7.394190663748518e-06, "loss": 4.0567, "step": 2900 }, { "epoch": 0.7492252066115702, "grad_norm": 2.842087984085083, "learning_rate": 7.379795407098483e-06, "loss": 3.8413, "step": 2901 }, { "epoch": 0.7494834710743802, "grad_norm": 1.8195914030075073, "learning_rate": 7.365411749967191e-06, "loss": 4.0262, "step": 2902 }, { "epoch": 0.74974173553719, "grad_norm": 2.8091883659362793, "learning_rate": 7.3510397018235146e-06, "loss": 3.9437, "step": 2903 }, { "epoch": 0.75, "grad_norm": 2.8522143363952637, "learning_rate": 7.336679272128705e-06, "loss": 4.1763, "step": 2904 }, { "epoch": 0.75025826446281, "grad_norm": 2.340480327606201, "learning_rate": 7.3223304703363135e-06, "loss": 3.7611, "step": 2905 }, { "epoch": 0.7505165289256198, "grad_norm": 3.1229753494262695, "learning_rate": 7.307993305892288e-06, "loss": 3.7118, "step": 2906 }, { "epoch": 0.7507747933884298, "grad_norm": 3.1998300552368164, "learning_rate": 7.293667788234909e-06, "loss": 3.9439, "step": 2907 }, { "epoch": 0.7510330578512396, "grad_norm": 3.4471333026885986, "learning_rate": 7.279353926794749e-06, "loss": 3.9681, "step": 2908 }, { "epoch": 0.7512913223140496, "grad_norm": 3.9866702556610107, "learning_rate": 7.2650517309947565e-06, "loss": 3.5462, "step": 2909 }, { "epoch": 0.7515495867768595, "grad_norm": 2.9294025897979736, "learning_rate": 7.25076121025017e-06, "loss": 4.1494, "step": 2910 }, { "epoch": 0.7518078512396694, "grad_norm": 3.3255562782287598, "learning_rate": 7.236482373968553e-06, "loss": 3.974, "step": 2911 }, { "epoch": 0.7520661157024794, "grad_norm": 3.1102874279022217, "learning_rate": 7.222215231549767e-06, "loss": 3.8444, "step": 2912 }, { "epoch": 0.7523243801652892, "grad_norm": 4.1661376953125, "learning_rate": 7.207959792385999e-06, "loss": 3.901, "step": 2913 }, { "epoch": 0.7525826446280992, "grad_norm": 2.9121317863464355, "learning_rate": 7.193716065861708e-06, "loss": 3.8811, "step": 2914 }, { "epoch": 0.7528409090909091, "grad_norm": 2.8842225074768066, "learning_rate": 7.179484061353645e-06, "loss": 4.117, "step": 2915 }, { "epoch": 0.753099173553719, "grad_norm": 3.145970582962036, "learning_rate": 7.165263788230864e-06, "loss": 3.7815, "step": 2916 }, { "epoch": 0.7533574380165289, "grad_norm": 2.320040464401245, "learning_rate": 7.151055255854675e-06, "loss": 4.0478, "step": 2917 }, { "epoch": 0.7536157024793388, "grad_norm": 3.7713613510131836, "learning_rate": 7.136858473578664e-06, "loss": 3.8483, "step": 2918 }, { "epoch": 0.7538739669421488, "grad_norm": 4.1975297927856445, "learning_rate": 7.122673450748679e-06, "loss": 3.9979, "step": 2919 }, { "epoch": 0.7541322314049587, "grad_norm": 3.292996883392334, "learning_rate": 7.108500196702849e-06, "loss": 4.0486, "step": 2920 }, { "epoch": 0.7543904958677686, "grad_norm": 3.051987886428833, "learning_rate": 7.094338720771527e-06, "loss": 3.9596, "step": 2921 }, { "epoch": 0.7546487603305785, "grad_norm": 3.4076247215270996, "learning_rate": 7.080189032277315e-06, "loss": 4.0928, "step": 2922 }, { "epoch": 0.7549070247933884, "grad_norm": 3.1021816730499268, "learning_rate": 7.066051140535082e-06, "loss": 3.837, "step": 2923 }, { "epoch": 0.7551652892561983, "grad_norm": 1.8252822160720825, "learning_rate": 7.051925054851904e-06, "loss": 3.8503, "step": 2924 }, { "epoch": 0.7554235537190083, "grad_norm": 3.8752174377441406, "learning_rate": 7.037810784527097e-06, "loss": 3.9279, "step": 2925 }, { "epoch": 0.7556818181818182, "grad_norm": 1.8702960014343262, "learning_rate": 7.023708338852181e-06, "loss": 4.4826, "step": 2926 }, { "epoch": 0.7559400826446281, "grad_norm": 2.632167100906372, "learning_rate": 7.0096177271109285e-06, "loss": 3.5457, "step": 2927 }, { "epoch": 0.756198347107438, "grad_norm": 2.678870439529419, "learning_rate": 6.995538958579287e-06, "loss": 4.1747, "step": 2928 }, { "epoch": 0.7564566115702479, "grad_norm": 2.646735906600952, "learning_rate": 6.981472042525416e-06, "loss": 3.8747, "step": 2929 }, { "epoch": 0.7567148760330579, "grad_norm": 4.238427639007568, "learning_rate": 6.967416988209699e-06, "loss": 3.2437, "step": 2930 }, { "epoch": 0.7569731404958677, "grad_norm": 6.299400806427002, "learning_rate": 6.953373804884658e-06, "loss": 3.6466, "step": 2931 }, { "epoch": 0.7572314049586777, "grad_norm": 2.62455677986145, "learning_rate": 6.939342501795051e-06, "loss": 4.0046, "step": 2932 }, { "epoch": 0.7574896694214877, "grad_norm": 3.3090708255767822, "learning_rate": 6.92532308817779e-06, "loss": 3.8771, "step": 2933 }, { "epoch": 0.7577479338842975, "grad_norm": 2.13889479637146, "learning_rate": 6.911315573261956e-06, "loss": 3.7899, "step": 2934 }, { "epoch": 0.7580061983471075, "grad_norm": 5.0639448165893555, "learning_rate": 6.897319966268823e-06, "loss": 3.5959, "step": 2935 }, { "epoch": 0.7582644628099173, "grad_norm": 2.1414501667022705, "learning_rate": 6.8833362764117975e-06, "loss": 4.3582, "step": 2936 }, { "epoch": 0.7585227272727273, "grad_norm": 2.7039763927459717, "learning_rate": 6.869364512896454e-06, "loss": 4.1298, "step": 2937 }, { "epoch": 0.7587809917355371, "grad_norm": 2.9768073558807373, "learning_rate": 6.855404684920505e-06, "loss": 4.1368, "step": 2938 }, { "epoch": 0.7590392561983471, "grad_norm": 4.799585819244385, "learning_rate": 6.841456801673832e-06, "loss": 4.5053, "step": 2939 }, { "epoch": 0.7592975206611571, "grad_norm": 4.757508754730225, "learning_rate": 6.8275208723384275e-06, "loss": 3.9056, "step": 2940 }, { "epoch": 0.7595557851239669, "grad_norm": 3.180004358291626, "learning_rate": 6.813596906088415e-06, "loss": 3.8934, "step": 2941 }, { "epoch": 0.7598140495867769, "grad_norm": 5.812942028045654, "learning_rate": 6.799684912090065e-06, "loss": 3.6882, "step": 2942 }, { "epoch": 0.7600723140495868, "grad_norm": 5.428582191467285, "learning_rate": 6.785784899501746e-06, "loss": 3.7229, "step": 2943 }, { "epoch": 0.7603305785123967, "grad_norm": 3.468083620071411, "learning_rate": 6.771896877473946e-06, "loss": 4.3464, "step": 2944 }, { "epoch": 0.7605888429752066, "grad_norm": 2.9291365146636963, "learning_rate": 6.75802085514925e-06, "loss": 3.8406, "step": 2945 }, { "epoch": 0.7608471074380165, "grad_norm": 2.1262993812561035, "learning_rate": 6.7441568416623715e-06, "loss": 4.5351, "step": 2946 }, { "epoch": 0.7611053719008265, "grad_norm": 2.917586326599121, "learning_rate": 6.730304846140076e-06, "loss": 4.1184, "step": 2947 }, { "epoch": 0.7613636363636364, "grad_norm": 4.205169677734375, "learning_rate": 6.7164648777012505e-06, "loss": 3.7447, "step": 2948 }, { "epoch": 0.7616219008264463, "grad_norm": 4.683629512786865, "learning_rate": 6.70263694545687e-06, "loss": 3.8833, "step": 2949 }, { "epoch": 0.7618801652892562, "grad_norm": 2.4400429725646973, "learning_rate": 6.688821058509942e-06, "loss": 4.608, "step": 2950 }, { "epoch": 0.7621384297520661, "grad_norm": 2.7212507724761963, "learning_rate": 6.675017225955599e-06, "loss": 4.2709, "step": 2951 }, { "epoch": 0.762396694214876, "grad_norm": 3.8093314170837402, "learning_rate": 6.661225456880999e-06, "loss": 3.997, "step": 2952 }, { "epoch": 0.762654958677686, "grad_norm": 3.272585153579712, "learning_rate": 6.647445760365373e-06, "loss": 3.3922, "step": 2953 }, { "epoch": 0.7629132231404959, "grad_norm": 4.211659908294678, "learning_rate": 6.633678145479996e-06, "loss": 3.875, "step": 2954 }, { "epoch": 0.7631714876033058, "grad_norm": 4.156970024108887, "learning_rate": 6.619922621288205e-06, "loss": 3.7705, "step": 2955 }, { "epoch": 0.7634297520661157, "grad_norm": 2.480146646499634, "learning_rate": 6.606179196845385e-06, "loss": 4.3986, "step": 2956 }, { "epoch": 0.7636880165289256, "grad_norm": 2.73637056350708, "learning_rate": 6.592447881198907e-06, "loss": 4.3211, "step": 2957 }, { "epoch": 0.7639462809917356, "grad_norm": 4.041680335998535, "learning_rate": 6.57872868338823e-06, "loss": 3.6793, "step": 2958 }, { "epoch": 0.7642045454545454, "grad_norm": 3.487213373184204, "learning_rate": 6.5650216124448e-06, "loss": 4.2224, "step": 2959 }, { "epoch": 0.7644628099173554, "grad_norm": 2.9754230976104736, "learning_rate": 6.5513266773920885e-06, "loss": 3.464, "step": 2960 }, { "epoch": 0.7647210743801653, "grad_norm": 1.768051266670227, "learning_rate": 6.537643887245573e-06, "loss": 3.4701, "step": 2961 }, { "epoch": 0.7649793388429752, "grad_norm": 2.6630423069000244, "learning_rate": 6.523973251012755e-06, "loss": 3.9513, "step": 2962 }, { "epoch": 0.7652376033057852, "grad_norm": 3.327787399291992, "learning_rate": 6.510314777693116e-06, "loss": 3.7778, "step": 2963 }, { "epoch": 0.765495867768595, "grad_norm": 3.9521801471710205, "learning_rate": 6.496668476278125e-06, "loss": 4.0683, "step": 2964 }, { "epoch": 0.765754132231405, "grad_norm": 3.243607521057129, "learning_rate": 6.483034355751266e-06, "loss": 3.4402, "step": 2965 }, { "epoch": 0.7660123966942148, "grad_norm": 4.246148586273193, "learning_rate": 6.469412425087979e-06, "loss": 3.7512, "step": 2966 }, { "epoch": 0.7662706611570248, "grad_norm": 2.5066354274749756, "learning_rate": 6.455802693255688e-06, "loss": 4.1619, "step": 2967 }, { "epoch": 0.7665289256198347, "grad_norm": 3.5038249492645264, "learning_rate": 6.442205169213783e-06, "loss": 4.1309, "step": 2968 }, { "epoch": 0.7667871900826446, "grad_norm": 2.7132728099823, "learning_rate": 6.428619861913637e-06, "loss": 3.8681, "step": 2969 }, { "epoch": 0.7670454545454546, "grad_norm": 3.1184370517730713, "learning_rate": 6.415046780298536e-06, "loss": 4.2356, "step": 2970 }, { "epoch": 0.7673037190082644, "grad_norm": 5.49888801574707, "learning_rate": 6.4014859333037606e-06, "loss": 3.4481, "step": 2971 }, { "epoch": 0.7675619834710744, "grad_norm": 3.5874717235565186, "learning_rate": 6.387937329856541e-06, "loss": 4.0694, "step": 2972 }, { "epoch": 0.7678202479338843, "grad_norm": 3.646803617477417, "learning_rate": 6.374400978875994e-06, "loss": 3.2913, "step": 2973 }, { "epoch": 0.7680785123966942, "grad_norm": 2.3262760639190674, "learning_rate": 6.3608768892732315e-06, "loss": 4.102, "step": 2974 }, { "epoch": 0.7683367768595041, "grad_norm": 8.474355697631836, "learning_rate": 6.347365069951261e-06, "loss": 3.8009, "step": 2975 }, { "epoch": 0.768595041322314, "grad_norm": 1.562949299812317, "learning_rate": 6.333865529805014e-06, "loss": 3.8844, "step": 2976 }, { "epoch": 0.768853305785124, "grad_norm": 2.687361717224121, "learning_rate": 6.320378277721342e-06, "loss": 4.5157, "step": 2977 }, { "epoch": 0.7691115702479339, "grad_norm": 2.5818934440612793, "learning_rate": 6.3069033225790195e-06, "loss": 3.9916, "step": 2978 }, { "epoch": 0.7693698347107438, "grad_norm": 3.9550228118896484, "learning_rate": 6.29344067324871e-06, "loss": 4.1777, "step": 2979 }, { "epoch": 0.7696280991735537, "grad_norm": 3.677687644958496, "learning_rate": 6.2799903385929725e-06, "loss": 4.1837, "step": 2980 }, { "epoch": 0.7698863636363636, "grad_norm": 1.912312388420105, "learning_rate": 6.26655232746628e-06, "loss": 4.1519, "step": 2981 }, { "epoch": 0.7701446280991735, "grad_norm": 4.874294757843018, "learning_rate": 6.25312664871498e-06, "loss": 3.5537, "step": 2982 }, { "epoch": 0.7704028925619835, "grad_norm": 4.81190299987793, "learning_rate": 6.2397133111772944e-06, "loss": 4.1128, "step": 2983 }, { "epoch": 0.7706611570247934, "grad_norm": 3.223020315170288, "learning_rate": 6.226312323683325e-06, "loss": 3.4963, "step": 2984 }, { "epoch": 0.7709194214876033, "grad_norm": 2.226142168045044, "learning_rate": 6.212923695055062e-06, "loss": 3.8504, "step": 2985 }, { "epoch": 0.7711776859504132, "grad_norm": 3.4806950092315674, "learning_rate": 6.199547434106337e-06, "loss": 3.9296, "step": 2986 }, { "epoch": 0.7714359504132231, "grad_norm": 7.442416667938232, "learning_rate": 6.1861835496428375e-06, "loss": 3.2454, "step": 2987 }, { "epoch": 0.7716942148760331, "grad_norm": 3.6820247173309326, "learning_rate": 6.172832050462132e-06, "loss": 4.731, "step": 2988 }, { "epoch": 0.7719524793388429, "grad_norm": 2.1720094680786133, "learning_rate": 6.159492945353607e-06, "loss": 3.9181, "step": 2989 }, { "epoch": 0.7722107438016529, "grad_norm": 3.655104398727417, "learning_rate": 6.146166243098497e-06, "loss": 4.4152, "step": 2990 }, { "epoch": 0.7724690082644629, "grad_norm": 3.3289873600006104, "learning_rate": 6.132851952469876e-06, "loss": 3.7119, "step": 2991 }, { "epoch": 0.7727272727272727, "grad_norm": 4.804155349731445, "learning_rate": 6.11955008223265e-06, "loss": 3.7436, "step": 2992 }, { "epoch": 0.7729855371900827, "grad_norm": 3.4114139080047607, "learning_rate": 6.106260641143546e-06, "loss": 3.9322, "step": 2993 }, { "epoch": 0.7732438016528925, "grad_norm": 3.8854551315307617, "learning_rate": 6.092983637951094e-06, "loss": 3.8774, "step": 2994 }, { "epoch": 0.7735020661157025, "grad_norm": 3.124720335006714, "learning_rate": 6.079719081395674e-06, "loss": 3.4386, "step": 2995 }, { "epoch": 0.7737603305785123, "grad_norm": 2.2647666931152344, "learning_rate": 6.066466980209417e-06, "loss": 4.0756, "step": 2996 }, { "epoch": 0.7740185950413223, "grad_norm": 2.182445764541626, "learning_rate": 6.0532273431163075e-06, "loss": 4.0848, "step": 2997 }, { "epoch": 0.7742768595041323, "grad_norm": 2.495110034942627, "learning_rate": 6.040000178832095e-06, "loss": 4.2169, "step": 2998 }, { "epoch": 0.7745351239669421, "grad_norm": 4.352891445159912, "learning_rate": 6.026785496064319e-06, "loss": 4.1742, "step": 2999 }, { "epoch": 0.7747933884297521, "grad_norm": 2.1302297115325928, "learning_rate": 6.0135833035123215e-06, "loss": 4.398, "step": 3000 }, { "epoch": 0.775051652892562, "grad_norm": 2.802227258682251, "learning_rate": 6.000393609867203e-06, "loss": 4.1736, "step": 3001 }, { "epoch": 0.7753099173553719, "grad_norm": 2.190290689468384, "learning_rate": 5.987216423811842e-06, "loss": 4.0381, "step": 3002 }, { "epoch": 0.7755681818181818, "grad_norm": 2.4693148136138916, "learning_rate": 5.974051754020876e-06, "loss": 3.7312, "step": 3003 }, { "epoch": 0.7758264462809917, "grad_norm": 2.1813268661499023, "learning_rate": 5.960899609160725e-06, "loss": 3.4439, "step": 3004 }, { "epoch": 0.7760847107438017, "grad_norm": 2.7288036346435547, "learning_rate": 5.94775999788954e-06, "loss": 3.8493, "step": 3005 }, { "epoch": 0.7763429752066116, "grad_norm": 3.169830799102783, "learning_rate": 5.934632928857229e-06, "loss": 3.978, "step": 3006 }, { "epoch": 0.7766012396694215, "grad_norm": 3.201216697692871, "learning_rate": 5.921518410705451e-06, "loss": 4.0421, "step": 3007 }, { "epoch": 0.7768595041322314, "grad_norm": 2.3736517429351807, "learning_rate": 5.9084164520675915e-06, "loss": 4.3302, "step": 3008 }, { "epoch": 0.7771177685950413, "grad_norm": 2.7193477153778076, "learning_rate": 5.895327061568776e-06, "loss": 4.0084, "step": 3009 }, { "epoch": 0.7773760330578512, "grad_norm": 3.064923048019409, "learning_rate": 5.882250247825841e-06, "loss": 3.438, "step": 3010 }, { "epoch": 0.7776342975206612, "grad_norm": 1.8658831119537354, "learning_rate": 5.869186019447379e-06, "loss": 3.762, "step": 3011 }, { "epoch": 0.7778925619834711, "grad_norm": 2.6229825019836426, "learning_rate": 5.856134385033646e-06, "loss": 3.5314, "step": 3012 }, { "epoch": 0.778150826446281, "grad_norm": 2.2160470485687256, "learning_rate": 5.8430953531766474e-06, "loss": 4.1256, "step": 3013 }, { "epoch": 0.7784090909090909, "grad_norm": 3.1417458057403564, "learning_rate": 5.830068932460098e-06, "loss": 3.9253, "step": 3014 }, { "epoch": 0.7786673553719008, "grad_norm": 2.240037202835083, "learning_rate": 5.817055131459359e-06, "loss": 3.7208, "step": 3015 }, { "epoch": 0.7789256198347108, "grad_norm": 3.9173498153686523, "learning_rate": 5.804053958741543e-06, "loss": 3.8589, "step": 3016 }, { "epoch": 0.7791838842975206, "grad_norm": 2.390720844268799, "learning_rate": 5.791065422865413e-06, "loss": 3.7276, "step": 3017 }, { "epoch": 0.7794421487603306, "grad_norm": 2.712533950805664, "learning_rate": 5.778089532381429e-06, "loss": 3.8185, "step": 3018 }, { "epoch": 0.7797004132231405, "grad_norm": 2.7469451427459717, "learning_rate": 5.76512629583171e-06, "loss": 3.845, "step": 3019 }, { "epoch": 0.7799586776859504, "grad_norm": 3.426276922225952, "learning_rate": 5.752175721750064e-06, "loss": 4.2501, "step": 3020 }, { "epoch": 0.7802169421487604, "grad_norm": 2.2316877841949463, "learning_rate": 5.739237818661971e-06, "loss": 4.0885, "step": 3021 }, { "epoch": 0.7804752066115702, "grad_norm": 2.271049976348877, "learning_rate": 5.726312595084526e-06, "loss": 3.7602, "step": 3022 }, { "epoch": 0.7807334710743802, "grad_norm": 3.179388999938965, "learning_rate": 5.713400059526527e-06, "loss": 4.4031, "step": 3023 }, { "epoch": 0.78099173553719, "grad_norm": 3.595670700073242, "learning_rate": 5.700500220488386e-06, "loss": 3.9467, "step": 3024 }, { "epoch": 0.78125, "grad_norm": 2.784013032913208, "learning_rate": 5.687613086462171e-06, "loss": 3.5234, "step": 3025 }, { "epoch": 0.78150826446281, "grad_norm": 3.4096200466156006, "learning_rate": 5.674738665931575e-06, "loss": 4.2117, "step": 3026 }, { "epoch": 0.7817665289256198, "grad_norm": 2.461766481399536, "learning_rate": 5.661876967371946e-06, "loss": 3.9837, "step": 3027 }, { "epoch": 0.7820247933884298, "grad_norm": 2.7916419506073, "learning_rate": 5.649027999250234e-06, "loss": 4.7405, "step": 3028 }, { "epoch": 0.7822830578512396, "grad_norm": 3.4178268909454346, "learning_rate": 5.6361917700250035e-06, "loss": 4.3774, "step": 3029 }, { "epoch": 0.7825413223140496, "grad_norm": 3.974893093109131, "learning_rate": 5.623368288146466e-06, "loss": 4.4005, "step": 3030 }, { "epoch": 0.7827995867768595, "grad_norm": 3.6239888668060303, "learning_rate": 5.610557562056409e-06, "loss": 3.8584, "step": 3031 }, { "epoch": 0.7830578512396694, "grad_norm": 3.0626046657562256, "learning_rate": 5.597759600188232e-06, "loss": 3.7247, "step": 3032 }, { "epoch": 0.7833161157024794, "grad_norm": 2.766003131866455, "learning_rate": 5.584974410966931e-06, "loss": 3.9193, "step": 3033 }, { "epoch": 0.7835743801652892, "grad_norm": 2.8055996894836426, "learning_rate": 5.572202002809107e-06, "loss": 3.4607, "step": 3034 }, { "epoch": 0.7838326446280992, "grad_norm": 2.0230352878570557, "learning_rate": 5.559442384122932e-06, "loss": 3.858, "step": 3035 }, { "epoch": 0.7840909090909091, "grad_norm": 2.5230295658111572, "learning_rate": 5.546695563308155e-06, "loss": 3.8701, "step": 3036 }, { "epoch": 0.784349173553719, "grad_norm": 4.961343765258789, "learning_rate": 5.533961548756128e-06, "loss": 3.7738, "step": 3037 }, { "epoch": 0.7846074380165289, "grad_norm": 2.909788131713867, "learning_rate": 5.521240348849724e-06, "loss": 3.5311, "step": 3038 }, { "epoch": 0.7848657024793388, "grad_norm": 3.7523868083953857, "learning_rate": 5.5085319719634296e-06, "loss": 3.4738, "step": 3039 }, { "epoch": 0.7851239669421488, "grad_norm": 3.9309418201446533, "learning_rate": 5.495836426463266e-06, "loss": 4.0796, "step": 3040 }, { "epoch": 0.7853822314049587, "grad_norm": 2.8273677825927734, "learning_rate": 5.483153720706799e-06, "loss": 3.8133, "step": 3041 }, { "epoch": 0.7856404958677686, "grad_norm": 2.0912039279937744, "learning_rate": 5.470483863043169e-06, "loss": 3.9071, "step": 3042 }, { "epoch": 0.7858987603305785, "grad_norm": 2.1477081775665283, "learning_rate": 5.457826861813034e-06, "loss": 3.6318, "step": 3043 }, { "epoch": 0.7861570247933884, "grad_norm": 2.829658269882202, "learning_rate": 5.445182725348596e-06, "loss": 3.5943, "step": 3044 }, { "epoch": 0.7864152892561983, "grad_norm": 3.448798894882202, "learning_rate": 5.432551461973587e-06, "loss": 4.0932, "step": 3045 }, { "epoch": 0.7866735537190083, "grad_norm": 2.6740174293518066, "learning_rate": 5.419933080003278e-06, "loss": 4.4823, "step": 3046 }, { "epoch": 0.7869318181818182, "grad_norm": 1.9111578464508057, "learning_rate": 5.4073275877444404e-06, "loss": 3.5069, "step": 3047 }, { "epoch": 0.7871900826446281, "grad_norm": 4.634271144866943, "learning_rate": 5.394734993495365e-06, "loss": 4.606, "step": 3048 }, { "epoch": 0.787448347107438, "grad_norm": 3.6830687522888184, "learning_rate": 5.3821553055458635e-06, "loss": 3.7661, "step": 3049 }, { "epoch": 0.7877066115702479, "grad_norm": 4.046968460083008, "learning_rate": 5.369588532177241e-06, "loss": 3.5037, "step": 3050 }, { "epoch": 0.7879648760330579, "grad_norm": 3.595989465713501, "learning_rate": 5.3570346816623015e-06, "loss": 3.9642, "step": 3051 }, { "epoch": 0.7882231404958677, "grad_norm": 7.2132439613342285, "learning_rate": 5.344493762265338e-06, "loss": 3.886, "step": 3052 }, { "epoch": 0.7884814049586777, "grad_norm": 3.2247462272644043, "learning_rate": 5.331965782242146e-06, "loss": 3.8912, "step": 3053 }, { "epoch": 0.7887396694214877, "grad_norm": 2.8054232597351074, "learning_rate": 5.319450749839988e-06, "loss": 4.1025, "step": 3054 }, { "epoch": 0.7889979338842975, "grad_norm": 6.14181661605835, "learning_rate": 5.3069486732976014e-06, "loss": 3.5622, "step": 3055 }, { "epoch": 0.7892561983471075, "grad_norm": 2.9143710136413574, "learning_rate": 5.294459560845213e-06, "loss": 3.923, "step": 3056 }, { "epoch": 0.7895144628099173, "grad_norm": 2.838334083557129, "learning_rate": 5.281983420704498e-06, "loss": 3.4441, "step": 3057 }, { "epoch": 0.7897727272727273, "grad_norm": 2.5445446968078613, "learning_rate": 5.269520261088598e-06, "loss": 4.3449, "step": 3058 }, { "epoch": 0.7900309917355371, "grad_norm": 4.778242111206055, "learning_rate": 5.257070090202102e-06, "loss": 4.0094, "step": 3059 }, { "epoch": 0.7902892561983471, "grad_norm": 2.7273833751678467, "learning_rate": 5.244632916241074e-06, "loss": 4.3894, "step": 3060 }, { "epoch": 0.7905475206611571, "grad_norm": 4.428411483764648, "learning_rate": 5.232208747392975e-06, "loss": 4.5691, "step": 3061 }, { "epoch": 0.7908057851239669, "grad_norm": 1.6998902559280396, "learning_rate": 5.219797591836751e-06, "loss": 3.7785, "step": 3062 }, { "epoch": 0.7910640495867769, "grad_norm": 1.900409460067749, "learning_rate": 5.207399457742768e-06, "loss": 3.7611, "step": 3063 }, { "epoch": 0.7913223140495868, "grad_norm": 3.0106942653656006, "learning_rate": 5.1950143532727955e-06, "loss": 3.8209, "step": 3064 }, { "epoch": 0.7915805785123967, "grad_norm": 3.718186140060425, "learning_rate": 5.182642286580064e-06, "loss": 3.5566, "step": 3065 }, { "epoch": 0.7918388429752066, "grad_norm": 2.2557811737060547, "learning_rate": 5.170283265809192e-06, "loss": 4.4365, "step": 3066 }, { "epoch": 0.7920971074380165, "grad_norm": 2.6692733764648438, "learning_rate": 5.157937299096221e-06, "loss": 4.1667, "step": 3067 }, { "epoch": 0.7923553719008265, "grad_norm": 1.9078664779663086, "learning_rate": 5.145604394568593e-06, "loss": 3.8023, "step": 3068 }, { "epoch": 0.7926136363636364, "grad_norm": 2.5996992588043213, "learning_rate": 5.133284560345167e-06, "loss": 4.0582, "step": 3069 }, { "epoch": 0.7928719008264463, "grad_norm": 2.5703012943267822, "learning_rate": 5.120977804536179e-06, "loss": 4.006, "step": 3070 }, { "epoch": 0.7931301652892562, "grad_norm": 3.890080213546753, "learning_rate": 5.108684135243255e-06, "loss": 4.133, "step": 3071 }, { "epoch": 0.7933884297520661, "grad_norm": 4.9146647453308105, "learning_rate": 5.096403560559434e-06, "loss": 4.0423, "step": 3072 }, { "epoch": 0.793646694214876, "grad_norm": 2.605792760848999, "learning_rate": 5.0841360885691e-06, "loss": 3.6567, "step": 3073 }, { "epoch": 0.793904958677686, "grad_norm": 3.184600353240967, "learning_rate": 5.07188172734803e-06, "loss": 3.9918, "step": 3074 }, { "epoch": 0.7941632231404959, "grad_norm": 4.198796272277832, "learning_rate": 5.05964048496336e-06, "loss": 3.8343, "step": 3075 }, { "epoch": 0.7944214876033058, "grad_norm": 4.2864603996276855, "learning_rate": 5.047412369473606e-06, "loss": 3.8176, "step": 3076 }, { "epoch": 0.7946797520661157, "grad_norm": 3.080073595046997, "learning_rate": 5.035197388928628e-06, "loss": 3.761, "step": 3077 }, { "epoch": 0.7949380165289256, "grad_norm": 2.05564022064209, "learning_rate": 5.0229955513696355e-06, "loss": 4.2784, "step": 3078 }, { "epoch": 0.7951962809917356, "grad_norm": 4.008758544921875, "learning_rate": 5.010806864829212e-06, "loss": 3.4971, "step": 3079 }, { "epoch": 0.7954545454545454, "grad_norm": 2.3405208587646484, "learning_rate": 4.9986313373312445e-06, "loss": 3.5995, "step": 3080 }, { "epoch": 0.7957128099173554, "grad_norm": 3.3963730335235596, "learning_rate": 4.986468976890993e-06, "loss": 3.8634, "step": 3081 }, { "epoch": 0.7959710743801653, "grad_norm": 1.7442933320999146, "learning_rate": 4.974319791515028e-06, "loss": 4.2795, "step": 3082 }, { "epoch": 0.7962293388429752, "grad_norm": 2.605700969696045, "learning_rate": 4.9621837892012526e-06, "loss": 4.0945, "step": 3083 }, { "epoch": 0.7964876033057852, "grad_norm": 1.976825475692749, "learning_rate": 4.9500609779388975e-06, "loss": 3.787, "step": 3084 }, { "epoch": 0.796745867768595, "grad_norm": 2.309537172317505, "learning_rate": 4.937951365708496e-06, "loss": 3.8897, "step": 3085 }, { "epoch": 0.797004132231405, "grad_norm": 1.833134412765503, "learning_rate": 4.925854960481918e-06, "loss": 3.9312, "step": 3086 }, { "epoch": 0.7972623966942148, "grad_norm": 3.282010316848755, "learning_rate": 4.913771770222297e-06, "loss": 3.8561, "step": 3087 }, { "epoch": 0.7975206611570248, "grad_norm": 2.872826099395752, "learning_rate": 4.901701802884115e-06, "loss": 3.9245, "step": 3088 }, { "epoch": 0.7977789256198347, "grad_norm": 3.74955153465271, "learning_rate": 4.889645066413112e-06, "loss": 3.7571, "step": 3089 }, { "epoch": 0.7980371900826446, "grad_norm": 2.091648578643799, "learning_rate": 4.877601568746337e-06, "loss": 3.9039, "step": 3090 }, { "epoch": 0.7982954545454546, "grad_norm": 6.677488327026367, "learning_rate": 4.865571317812112e-06, "loss": 3.8207, "step": 3091 }, { "epoch": 0.7985537190082644, "grad_norm": 2.8301141262054443, "learning_rate": 4.853554321530057e-06, "loss": 4.6069, "step": 3092 }, { "epoch": 0.7988119834710744, "grad_norm": 2.0875861644744873, "learning_rate": 4.84155058781105e-06, "loss": 4.1137, "step": 3093 }, { "epoch": 0.7990702479338843, "grad_norm": 4.1384053230285645, "learning_rate": 4.829560124557233e-06, "loss": 3.693, "step": 3094 }, { "epoch": 0.7993285123966942, "grad_norm": 3.2378692626953125, "learning_rate": 4.817582939662041e-06, "loss": 3.7779, "step": 3095 }, { "epoch": 0.7995867768595041, "grad_norm": 4.011788368225098, "learning_rate": 4.8056190410101374e-06, "loss": 3.8311, "step": 3096 }, { "epoch": 0.799845041322314, "grad_norm": 2.652003765106201, "learning_rate": 4.793668436477453e-06, "loss": 3.7713, "step": 3097 }, { "epoch": 0.800103305785124, "grad_norm": 2.1085405349731445, "learning_rate": 4.781731133931152e-06, "loss": 4.0237, "step": 3098 }, { "epoch": 0.8003615702479339, "grad_norm": 2.0412938594818115, "learning_rate": 4.769807141229676e-06, "loss": 3.9356, "step": 3099 }, { "epoch": 0.8006198347107438, "grad_norm": 3.5159549713134766, "learning_rate": 4.7578964662226726e-06, "loss": 3.457, "step": 3100 }, { "epoch": 0.8008780991735537, "grad_norm": 2.633368730545044, "learning_rate": 4.745999116751021e-06, "loss": 4.5431, "step": 3101 }, { "epoch": 0.8011363636363636, "grad_norm": 2.3001716136932373, "learning_rate": 4.734115100646868e-06, "loss": 4.0712, "step": 3102 }, { "epoch": 0.8013946280991735, "grad_norm": 2.8118252754211426, "learning_rate": 4.722244425733521e-06, "loss": 3.6357, "step": 3103 }, { "epoch": 0.8016528925619835, "grad_norm": 3.88752818107605, "learning_rate": 4.710387099825564e-06, "loss": 3.0263, "step": 3104 }, { "epoch": 0.8019111570247934, "grad_norm": 4.2198486328125, "learning_rate": 4.698543130728755e-06, "loss": 4.2192, "step": 3105 }, { "epoch": 0.8021694214876033, "grad_norm": 2.2946817874908447, "learning_rate": 4.68671252624007e-06, "loss": 3.8873, "step": 3106 }, { "epoch": 0.8024276859504132, "grad_norm": 2.1857426166534424, "learning_rate": 4.674895294147699e-06, "loss": 4.068, "step": 3107 }, { "epoch": 0.8026859504132231, "grad_norm": 2.0002620220184326, "learning_rate": 4.6630914422310125e-06, "loss": 4.0388, "step": 3108 }, { "epoch": 0.8029442148760331, "grad_norm": 5.736384391784668, "learning_rate": 4.6513009782605785e-06, "loss": 3.3491, "step": 3109 }, { "epoch": 0.8032024793388429, "grad_norm": 3.019831657409668, "learning_rate": 4.6395239099981475e-06, "loss": 4.1974, "step": 3110 }, { "epoch": 0.8034607438016529, "grad_norm": 6.886529445648193, "learning_rate": 4.627760245196666e-06, "loss": 2.7751, "step": 3111 }, { "epoch": 0.8037190082644629, "grad_norm": 5.153056621551514, "learning_rate": 4.61600999160024e-06, "loss": 3.6861, "step": 3112 }, { "epoch": 0.8039772727272727, "grad_norm": 3.6069653034210205, "learning_rate": 4.60427315694415e-06, "loss": 3.4872, "step": 3113 }, { "epoch": 0.8042355371900827, "grad_norm": 3.60599422454834, "learning_rate": 4.592549748954858e-06, "loss": 4.3806, "step": 3114 }, { "epoch": 0.8044938016528925, "grad_norm": 1.745474100112915, "learning_rate": 4.580839775349968e-06, "loss": 4.3725, "step": 3115 }, { "epoch": 0.8047520661157025, "grad_norm": 3.0892856121063232, "learning_rate": 4.569143243838244e-06, "loss": 3.7547, "step": 3116 }, { "epoch": 0.8050103305785123, "grad_norm": 3.036412239074707, "learning_rate": 4.557460162119606e-06, "loss": 3.9785, "step": 3117 }, { "epoch": 0.8052685950413223, "grad_norm": 7.081357002258301, "learning_rate": 4.545790537885125e-06, "loss": 4.3175, "step": 3118 }, { "epoch": 0.8055268595041323, "grad_norm": 2.513465404510498, "learning_rate": 4.534134378817003e-06, "loss": 4.1359, "step": 3119 }, { "epoch": 0.8057851239669421, "grad_norm": 3.189488410949707, "learning_rate": 4.52249169258857e-06, "loss": 3.217, "step": 3120 }, { "epoch": 0.8060433884297521, "grad_norm": 3.69801926612854, "learning_rate": 4.5108624868643175e-06, "loss": 4.2765, "step": 3121 }, { "epoch": 0.806301652892562, "grad_norm": 3.302111864089966, "learning_rate": 4.499246769299828e-06, "loss": 4.2489, "step": 3122 }, { "epoch": 0.8065599173553719, "grad_norm": 2.4787020683288574, "learning_rate": 4.487644547541825e-06, "loss": 3.9717, "step": 3123 }, { "epoch": 0.8068181818181818, "grad_norm": 3.3503825664520264, "learning_rate": 4.476055829228135e-06, "loss": 3.9015, "step": 3124 }, { "epoch": 0.8070764462809917, "grad_norm": 2.9496114253997803, "learning_rate": 4.4644806219877184e-06, "loss": 3.8175, "step": 3125 }, { "epoch": 0.8073347107438017, "grad_norm": 2.5519232749938965, "learning_rate": 4.4529189334406e-06, "loss": 3.9053, "step": 3126 }, { "epoch": 0.8075929752066116, "grad_norm": 1.902953863143921, "learning_rate": 4.441370771197948e-06, "loss": 4.1742, "step": 3127 }, { "epoch": 0.8078512396694215, "grad_norm": 2.286240577697754, "learning_rate": 4.429836142862012e-06, "loss": 3.9606, "step": 3128 }, { "epoch": 0.8081095041322314, "grad_norm": 3.0922954082489014, "learning_rate": 4.418315056026109e-06, "loss": 4.0317, "step": 3129 }, { "epoch": 0.8083677685950413, "grad_norm": 3.443214178085327, "learning_rate": 4.40680751827468e-06, "loss": 3.8774, "step": 3130 }, { "epoch": 0.8086260330578512, "grad_norm": 1.9670565128326416, "learning_rate": 4.395313537183218e-06, "loss": 4.0058, "step": 3131 }, { "epoch": 0.8088842975206612, "grad_norm": 5.4418253898620605, "learning_rate": 4.383833120318301e-06, "loss": 3.5668, "step": 3132 }, { "epoch": 0.8091425619834711, "grad_norm": 3.11191463470459, "learning_rate": 4.3723662752375745e-06, "loss": 4.2784, "step": 3133 }, { "epoch": 0.809400826446281, "grad_norm": 2.243457078933716, "learning_rate": 4.360913009489761e-06, "loss": 3.9649, "step": 3134 }, { "epoch": 0.8096590909090909, "grad_norm": 1.8260809183120728, "learning_rate": 4.349473330614631e-06, "loss": 3.8978, "step": 3135 }, { "epoch": 0.8099173553719008, "grad_norm": 2.710378408432007, "learning_rate": 4.3380472461430066e-06, "loss": 3.9044, "step": 3136 }, { "epoch": 0.8101756198347108, "grad_norm": 2.024078607559204, "learning_rate": 4.326634763596785e-06, "loss": 3.9716, "step": 3137 }, { "epoch": 0.8104338842975206, "grad_norm": 2.758868455886841, "learning_rate": 4.3152358904888805e-06, "loss": 4.1953, "step": 3138 }, { "epoch": 0.8106921487603306, "grad_norm": 2.3402938842773438, "learning_rate": 4.3038506343232674e-06, "loss": 3.9217, "step": 3139 }, { "epoch": 0.8109504132231405, "grad_norm": 2.7336788177490234, "learning_rate": 4.292479002594937e-06, "loss": 3.3343, "step": 3140 }, { "epoch": 0.8112086776859504, "grad_norm": 1.9625756740570068, "learning_rate": 4.281121002789942e-06, "loss": 3.79, "step": 3141 }, { "epoch": 0.8114669421487604, "grad_norm": 5.451760292053223, "learning_rate": 4.269776642385331e-06, "loss": 4.3542, "step": 3142 }, { "epoch": 0.8117252066115702, "grad_norm": 3.745269298553467, "learning_rate": 4.258445928849183e-06, "loss": 3.5843, "step": 3143 }, { "epoch": 0.8119834710743802, "grad_norm": 1.687119722366333, "learning_rate": 4.24712886964061e-06, "loss": 3.8418, "step": 3144 }, { "epoch": 0.81224173553719, "grad_norm": 6.173981189727783, "learning_rate": 4.235825472209701e-06, "loss": 4.0208, "step": 3145 }, { "epoch": 0.8125, "grad_norm": 2.018956422805786, "learning_rate": 4.224535743997584e-06, "loss": 4.3124, "step": 3146 }, { "epoch": 0.81275826446281, "grad_norm": 3.0678749084472656, "learning_rate": 4.213259692436367e-06, "loss": 4.1324, "step": 3147 }, { "epoch": 0.8130165289256198, "grad_norm": 6.117210388183594, "learning_rate": 4.201997324949175e-06, "loss": 4.1579, "step": 3148 }, { "epoch": 0.8132747933884298, "grad_norm": 2.2027077674865723, "learning_rate": 4.190748648950107e-06, "loss": 3.8435, "step": 3149 }, { "epoch": 0.8135330578512396, "grad_norm": 3.4036483764648438, "learning_rate": 4.1795136718442484e-06, "loss": 3.8577, "step": 3150 }, { "epoch": 0.8137913223140496, "grad_norm": 2.3444583415985107, "learning_rate": 4.168292401027687e-06, "loss": 4.0228, "step": 3151 }, { "epoch": 0.8140495867768595, "grad_norm": 1.5581039190292358, "learning_rate": 4.157084843887454e-06, "loss": 3.7556, "step": 3152 }, { "epoch": 0.8143078512396694, "grad_norm": 4.034010410308838, "learning_rate": 4.145891007801589e-06, "loss": 3.5933, "step": 3153 }, { "epoch": 0.8145661157024794, "grad_norm": 2.150984048843384, "learning_rate": 4.134710900139074e-06, "loss": 3.56, "step": 3154 }, { "epoch": 0.8148243801652892, "grad_norm": 2.815709352493286, "learning_rate": 4.12354452825986e-06, "loss": 3.8845, "step": 3155 }, { "epoch": 0.8150826446280992, "grad_norm": 4.093982696533203, "learning_rate": 4.112391899514864e-06, "loss": 3.7978, "step": 3156 }, { "epoch": 0.8153409090909091, "grad_norm": 4.3202080726623535, "learning_rate": 4.101253021245943e-06, "loss": 4.029, "step": 3157 }, { "epoch": 0.815599173553719, "grad_norm": 2.6700565814971924, "learning_rate": 4.090127900785912e-06, "loss": 4.2953, "step": 3158 }, { "epoch": 0.8158574380165289, "grad_norm": 2.817720651626587, "learning_rate": 4.079016545458516e-06, "loss": 3.7251, "step": 3159 }, { "epoch": 0.8161157024793388, "grad_norm": 2.9881536960601807, "learning_rate": 4.0679189625784596e-06, "loss": 4.4765, "step": 3160 }, { "epoch": 0.8163739669421488, "grad_norm": 6.2988104820251465, "learning_rate": 4.0568351594513635e-06, "loss": 4.06, "step": 3161 }, { "epoch": 0.8166322314049587, "grad_norm": 3.260664939880371, "learning_rate": 4.045765143373775e-06, "loss": 3.6327, "step": 3162 }, { "epoch": 0.8168904958677686, "grad_norm": 3.7498574256896973, "learning_rate": 4.034708921633188e-06, "loss": 4.2448, "step": 3163 }, { "epoch": 0.8171487603305785, "grad_norm": 2.874403953552246, "learning_rate": 4.023666501507989e-06, "loss": 3.5933, "step": 3164 }, { "epoch": 0.8174070247933884, "grad_norm": 2.4448890686035156, "learning_rate": 4.01263789026749e-06, "loss": 3.3198, "step": 3165 }, { "epoch": 0.8176652892561983, "grad_norm": 2.5400733947753906, "learning_rate": 4.001623095171908e-06, "loss": 4.29, "step": 3166 }, { "epoch": 0.8179235537190083, "grad_norm": 3.3781795501708984, "learning_rate": 3.990622123472387e-06, "loss": 3.8302, "step": 3167 }, { "epoch": 0.8181818181818182, "grad_norm": 2.554511308670044, "learning_rate": 3.979634982410929e-06, "loss": 3.9601, "step": 3168 }, { "epoch": 0.8184400826446281, "grad_norm": 3.6860246658325195, "learning_rate": 3.968661679220468e-06, "loss": 3.3482, "step": 3169 }, { "epoch": 0.818698347107438, "grad_norm": 3.3328230381011963, "learning_rate": 3.957702221124826e-06, "loss": 4.3383, "step": 3170 }, { "epoch": 0.8189566115702479, "grad_norm": 2.2283880710601807, "learning_rate": 3.946756615338673e-06, "loss": 3.8698, "step": 3171 }, { "epoch": 0.8192148760330579, "grad_norm": 1.8562620878219604, "learning_rate": 3.935824869067611e-06, "loss": 4.0162, "step": 3172 }, { "epoch": 0.8194731404958677, "grad_norm": 3.1247875690460205, "learning_rate": 3.924906989508084e-06, "loss": 4.561, "step": 3173 }, { "epoch": 0.8197314049586777, "grad_norm": 2.905747413635254, "learning_rate": 3.9140029838474205e-06, "loss": 4.7247, "step": 3174 }, { "epoch": 0.8199896694214877, "grad_norm": 2.2650914192199707, "learning_rate": 3.903112859263805e-06, "loss": 3.6224, "step": 3175 }, { "epoch": 0.8202479338842975, "grad_norm": 2.7449676990509033, "learning_rate": 3.892236622926304e-06, "loss": 3.4891, "step": 3176 }, { "epoch": 0.8205061983471075, "grad_norm": 2.1880366802215576, "learning_rate": 3.881374281994827e-06, "loss": 3.9115, "step": 3177 }, { "epoch": 0.8207644628099173, "grad_norm": 3.510692834854126, "learning_rate": 3.870525843620123e-06, "loss": 3.667, "step": 3178 }, { "epoch": 0.8210227272727273, "grad_norm": 4.581926345825195, "learning_rate": 3.859691314943825e-06, "loss": 4.287, "step": 3179 }, { "epoch": 0.8212809917355371, "grad_norm": 3.573695421218872, "learning_rate": 3.8488707030983785e-06, "loss": 3.7415, "step": 3180 }, { "epoch": 0.8215392561983471, "grad_norm": 3.427015542984009, "learning_rate": 3.838064015207077e-06, "loss": 4.0295, "step": 3181 }, { "epoch": 0.8217975206611571, "grad_norm": 1.7027958631515503, "learning_rate": 3.827271258384041e-06, "loss": 3.9022, "step": 3182 }, { "epoch": 0.8220557851239669, "grad_norm": 2.801062822341919, "learning_rate": 3.81649243973424e-06, "loss": 3.6108, "step": 3183 }, { "epoch": 0.8223140495867769, "grad_norm": 3.1397767066955566, "learning_rate": 3.805727566353451e-06, "loss": 3.6961, "step": 3184 }, { "epoch": 0.8225723140495868, "grad_norm": 3.0541188716888428, "learning_rate": 3.794976645328266e-06, "loss": 3.8607, "step": 3185 }, { "epoch": 0.8228305785123967, "grad_norm": 3.0976879596710205, "learning_rate": 3.78423968373611e-06, "loss": 3.8643, "step": 3186 }, { "epoch": 0.8230888429752066, "grad_norm": 1.8499300479888916, "learning_rate": 3.7735166886452065e-06, "loss": 4.2794, "step": 3187 }, { "epoch": 0.8233471074380165, "grad_norm": 2.1713483333587646, "learning_rate": 3.76280766711459e-06, "loss": 3.9204, "step": 3188 }, { "epoch": 0.8236053719008265, "grad_norm": 1.9648449420928955, "learning_rate": 3.7521126261940793e-06, "loss": 3.6748, "step": 3189 }, { "epoch": 0.8238636363636364, "grad_norm": 2.79081392288208, "learning_rate": 3.7414315729243226e-06, "loss": 3.9665, "step": 3190 }, { "epoch": 0.8241219008264463, "grad_norm": 2.930943727493286, "learning_rate": 3.730764514336732e-06, "loss": 3.4889, "step": 3191 }, { "epoch": 0.8243801652892562, "grad_norm": 4.098539352416992, "learning_rate": 3.7201114574535096e-06, "loss": 4.1931, "step": 3192 }, { "epoch": 0.8246384297520661, "grad_norm": 2.81147837638855, "learning_rate": 3.7094724092876647e-06, "loss": 3.9613, "step": 3193 }, { "epoch": 0.824896694214876, "grad_norm": 4.9157280921936035, "learning_rate": 3.6988473768429426e-06, "loss": 3.9543, "step": 3194 }, { "epoch": 0.825154958677686, "grad_norm": 2.050794839859009, "learning_rate": 3.6882363671139035e-06, "loss": 4.0385, "step": 3195 }, { "epoch": 0.8254132231404959, "grad_norm": 1.9615947008132935, "learning_rate": 3.6776393870858514e-06, "loss": 4.0752, "step": 3196 }, { "epoch": 0.8256714876033058, "grad_norm": 2.9092202186584473, "learning_rate": 3.6670564437348566e-06, "loss": 3.6714, "step": 3197 }, { "epoch": 0.8259297520661157, "grad_norm": 1.7281323671340942, "learning_rate": 3.6564875440277617e-06, "loss": 4.0551, "step": 3198 }, { "epoch": 0.8261880165289256, "grad_norm": 5.160204887390137, "learning_rate": 3.645932694922155e-06, "loss": 3.909, "step": 3199 }, { "epoch": 0.8264462809917356, "grad_norm": 3.5328826904296875, "learning_rate": 3.635391903366375e-06, "loss": 3.7871, "step": 3200 }, { "epoch": 0.8267045454545454, "grad_norm": 3.5692837238311768, "learning_rate": 3.6248651762994995e-06, "loss": 4.308, "step": 3201 }, { "epoch": 0.8269628099173554, "grad_norm": 2.0670406818389893, "learning_rate": 3.6143525206513707e-06, "loss": 3.9453, "step": 3202 }, { "epoch": 0.8272210743801653, "grad_norm": 2.750880479812622, "learning_rate": 3.6038539433425416e-06, "loss": 4.4434, "step": 3203 }, { "epoch": 0.8274793388429752, "grad_norm": 3.218442678451538, "learning_rate": 3.593369451284306e-06, "loss": 3.6455, "step": 3204 }, { "epoch": 0.8277376033057852, "grad_norm": 3.3082327842712402, "learning_rate": 3.582899051378699e-06, "loss": 3.6173, "step": 3205 }, { "epoch": 0.827995867768595, "grad_norm": 2.0444657802581787, "learning_rate": 3.5724427505184593e-06, "loss": 3.8604, "step": 3206 }, { "epoch": 0.828254132231405, "grad_norm": 2.9353392124176025, "learning_rate": 3.56200055558705e-06, "loss": 3.7008, "step": 3207 }, { "epoch": 0.8285123966942148, "grad_norm": 6.380825519561768, "learning_rate": 3.5515724734586476e-06, "loss": 3.451, "step": 3208 }, { "epoch": 0.8287706611570248, "grad_norm": 2.7229349613189697, "learning_rate": 3.5411585109981548e-06, "loss": 3.7026, "step": 3209 }, { "epoch": 0.8290289256198347, "grad_norm": 1.9942761659622192, "learning_rate": 3.5307586750611438e-06, "loss": 3.9016, "step": 3210 }, { "epoch": 0.8292871900826446, "grad_norm": 2.4609639644622803, "learning_rate": 3.520372972493924e-06, "loss": 3.9671, "step": 3211 }, { "epoch": 0.8295454545454546, "grad_norm": 5.345799446105957, "learning_rate": 3.510001410133476e-06, "loss": 4.0393, "step": 3212 }, { "epoch": 0.8298037190082644, "grad_norm": 4.429155349731445, "learning_rate": 3.499643994807486e-06, "loss": 3.9656, "step": 3213 }, { "epoch": 0.8300619834710744, "grad_norm": 2.1105799674987793, "learning_rate": 3.489300733334322e-06, "loss": 3.9425, "step": 3214 }, { "epoch": 0.8303202479338843, "grad_norm": 2.94343638420105, "learning_rate": 3.478971632523026e-06, "loss": 3.9002, "step": 3215 }, { "epoch": 0.8305785123966942, "grad_norm": 3.1997077465057373, "learning_rate": 3.468656699173345e-06, "loss": 3.2779, "step": 3216 }, { "epoch": 0.8308367768595041, "grad_norm": 3.47969651222229, "learning_rate": 3.458355940075653e-06, "loss": 3.9959, "step": 3217 }, { "epoch": 0.831095041322314, "grad_norm": 4.264750003814697, "learning_rate": 3.4480693620110416e-06, "loss": 3.8425, "step": 3218 }, { "epoch": 0.831353305785124, "grad_norm": 2.2381644248962402, "learning_rate": 3.4377969717512365e-06, "loss": 3.995, "step": 3219 }, { "epoch": 0.8316115702479339, "grad_norm": 5.879350662231445, "learning_rate": 3.4275387760586336e-06, "loss": 4.1071, "step": 3220 }, { "epoch": 0.8318698347107438, "grad_norm": 3.4872424602508545, "learning_rate": 3.4172947816862867e-06, "loss": 3.1089, "step": 3221 }, { "epoch": 0.8321280991735537, "grad_norm": 2.7239649295806885, "learning_rate": 3.4070649953778984e-06, "loss": 4.6844, "step": 3222 }, { "epoch": 0.8323863636363636, "grad_norm": 2.525874614715576, "learning_rate": 3.3968494238678124e-06, "loss": 4.0221, "step": 3223 }, { "epoch": 0.8326446280991735, "grad_norm": 5.014348983764648, "learning_rate": 3.3866480738810186e-06, "loss": 4.0579, "step": 3224 }, { "epoch": 0.8329028925619835, "grad_norm": 3.289677858352661, "learning_rate": 3.376460952133151e-06, "loss": 4.0644, "step": 3225 }, { "epoch": 0.8331611570247934, "grad_norm": 2.7192275524139404, "learning_rate": 3.3662880653304725e-06, "loss": 3.9845, "step": 3226 }, { "epoch": 0.8334194214876033, "grad_norm": 2.245616912841797, "learning_rate": 3.3561294201698622e-06, "loss": 3.8729, "step": 3227 }, { "epoch": 0.8336776859504132, "grad_norm": 2.1591696739196777, "learning_rate": 3.345985023338852e-06, "loss": 4.2432, "step": 3228 }, { "epoch": 0.8339359504132231, "grad_norm": 2.903212547302246, "learning_rate": 3.335854881515571e-06, "loss": 3.7913, "step": 3229 }, { "epoch": 0.8341942148760331, "grad_norm": 3.8137121200561523, "learning_rate": 3.325739001368769e-06, "loss": 3.9762, "step": 3230 }, { "epoch": 0.8344524793388429, "grad_norm": 3.8287549018859863, "learning_rate": 3.3156373895578073e-06, "loss": 4.2674, "step": 3231 }, { "epoch": 0.8347107438016529, "grad_norm": 1.9093163013458252, "learning_rate": 3.3055500527326672e-06, "loss": 4.4008, "step": 3232 }, { "epoch": 0.8349690082644629, "grad_norm": 2.687494993209839, "learning_rate": 3.2954769975339055e-06, "loss": 3.8324, "step": 3233 }, { "epoch": 0.8352272727272727, "grad_norm": 2.821951389312744, "learning_rate": 3.285418230592699e-06, "loss": 4.387, "step": 3234 }, { "epoch": 0.8354855371900827, "grad_norm": 3.158222198486328, "learning_rate": 3.2753737585308282e-06, "loss": 4.3587, "step": 3235 }, { "epoch": 0.8357438016528925, "grad_norm": 2.6298329830169678, "learning_rate": 3.265343587960623e-06, "loss": 4.2316, "step": 3236 }, { "epoch": 0.8360020661157025, "grad_norm": 2.9914846420288086, "learning_rate": 3.2553277254850412e-06, "loss": 4.3538, "step": 3237 }, { "epoch": 0.8362603305785123, "grad_norm": 2.927126884460449, "learning_rate": 3.2453261776975995e-06, "loss": 4.0482, "step": 3238 }, { "epoch": 0.8365185950413223, "grad_norm": 3.0722203254699707, "learning_rate": 3.23533895118239e-06, "loss": 3.9017, "step": 3239 }, { "epoch": 0.8367768595041323, "grad_norm": 5.61625337600708, "learning_rate": 3.225366052514081e-06, "loss": 4.6694, "step": 3240 }, { "epoch": 0.8370351239669421, "grad_norm": 1.8180378675460815, "learning_rate": 3.215407488257921e-06, "loss": 4.3628, "step": 3241 }, { "epoch": 0.8372933884297521, "grad_norm": 2.939183473587036, "learning_rate": 3.205463264969702e-06, "loss": 4.0669, "step": 3242 }, { "epoch": 0.837551652892562, "grad_norm": 3.4672558307647705, "learning_rate": 3.1955333891957815e-06, "loss": 3.6988, "step": 3243 }, { "epoch": 0.8378099173553719, "grad_norm": 2.3426365852355957, "learning_rate": 3.1856178674730867e-06, "loss": 3.7426, "step": 3244 }, { "epoch": 0.8380681818181818, "grad_norm": 2.3957138061523438, "learning_rate": 3.1757167063290738e-06, "loss": 4.4925, "step": 3245 }, { "epoch": 0.8383264462809917, "grad_norm": 1.8408613204956055, "learning_rate": 3.1658299122817565e-06, "loss": 3.7932, "step": 3246 }, { "epoch": 0.8385847107438017, "grad_norm": 3.544090986251831, "learning_rate": 3.155957491839684e-06, "loss": 3.9348, "step": 3247 }, { "epoch": 0.8388429752066116, "grad_norm": 5.411297798156738, "learning_rate": 3.1460994515019577e-06, "loss": 3.6072, "step": 3248 }, { "epoch": 0.8391012396694215, "grad_norm": 3.46323299407959, "learning_rate": 3.1362557977582004e-06, "loss": 3.7385, "step": 3249 }, { "epoch": 0.8393595041322314, "grad_norm": 4.092538833618164, "learning_rate": 3.126426537088556e-06, "loss": 4.3975, "step": 3250 }, { "epoch": 0.8396177685950413, "grad_norm": 2.7831227779388428, "learning_rate": 3.116611675963721e-06, "loss": 3.4414, "step": 3251 }, { "epoch": 0.8398760330578512, "grad_norm": 2.303703784942627, "learning_rate": 3.1068112208448824e-06, "loss": 4.3416, "step": 3252 }, { "epoch": 0.8401342975206612, "grad_norm": 5.382920265197754, "learning_rate": 3.0970251781837628e-06, "loss": 4.0949, "step": 3253 }, { "epoch": 0.8403925619834711, "grad_norm": 2.387969493865967, "learning_rate": 3.087253554422584e-06, "loss": 3.7787, "step": 3254 }, { "epoch": 0.840650826446281, "grad_norm": 1.8855735063552856, "learning_rate": 3.0774963559940916e-06, "loss": 3.644, "step": 3255 }, { "epoch": 0.8409090909090909, "grad_norm": 2.7446091175079346, "learning_rate": 3.0677535893215203e-06, "loss": 3.9876, "step": 3256 }, { "epoch": 0.8411673553719008, "grad_norm": 3.4922614097595215, "learning_rate": 3.058025260818609e-06, "loss": 3.3385, "step": 3257 }, { "epoch": 0.8414256198347108, "grad_norm": 2.3861703872680664, "learning_rate": 3.048311376889601e-06, "loss": 4.2635, "step": 3258 }, { "epoch": 0.8416838842975206, "grad_norm": 2.8297808170318604, "learning_rate": 3.0386119439292064e-06, "loss": 4.0832, "step": 3259 }, { "epoch": 0.8419421487603306, "grad_norm": 4.084082126617432, "learning_rate": 3.0289269683226513e-06, "loss": 4.0078, "step": 3260 }, { "epoch": 0.8422004132231405, "grad_norm": 6.224031925201416, "learning_rate": 3.019256456445629e-06, "loss": 4.0007, "step": 3261 }, { "epoch": 0.8424586776859504, "grad_norm": 2.3764243125915527, "learning_rate": 3.009600414664304e-06, "loss": 4.3144, "step": 3262 }, { "epoch": 0.8427169421487604, "grad_norm": 3.3738796710968018, "learning_rate": 2.9999588493353355e-06, "loss": 3.7054, "step": 3263 }, { "epoch": 0.8429752066115702, "grad_norm": 2.86564302444458, "learning_rate": 2.99033176680584e-06, "loss": 4.4712, "step": 3264 }, { "epoch": 0.8432334710743802, "grad_norm": 3.347743034362793, "learning_rate": 2.9807191734133962e-06, "loss": 3.7954, "step": 3265 }, { "epoch": 0.84349173553719, "grad_norm": 20.17072296142578, "learning_rate": 2.9711210754860497e-06, "loss": 4.416, "step": 3266 }, { "epoch": 0.84375, "grad_norm": 6.901514530181885, "learning_rate": 2.9615374793423077e-06, "loss": 2.3777, "step": 3267 }, { "epoch": 0.84400826446281, "grad_norm": 7.482487678527832, "learning_rate": 2.9519683912911266e-06, "loss": 4.3142, "step": 3268 }, { "epoch": 0.8442665289256198, "grad_norm": 2.941615581512451, "learning_rate": 2.942413817631906e-06, "loss": 3.7174, "step": 3269 }, { "epoch": 0.8445247933884298, "grad_norm": 1.6658852100372314, "learning_rate": 2.9328737646545037e-06, "loss": 3.9075, "step": 3270 }, { "epoch": 0.8447830578512396, "grad_norm": 5.922888278961182, "learning_rate": 2.923348238639212e-06, "loss": 3.4197, "step": 3271 }, { "epoch": 0.8450413223140496, "grad_norm": 3.9321846961975098, "learning_rate": 2.9138372458567512e-06, "loss": 3.9471, "step": 3272 }, { "epoch": 0.8452995867768595, "grad_norm": 2.744276762008667, "learning_rate": 2.904340792568286e-06, "loss": 3.9913, "step": 3273 }, { "epoch": 0.8455578512396694, "grad_norm": 3.1525607109069824, "learning_rate": 2.894858885025417e-06, "loss": 3.1699, "step": 3274 }, { "epoch": 0.8458161157024794, "grad_norm": 3.6125094890594482, "learning_rate": 2.8853915294701355e-06, "loss": 3.526, "step": 3275 }, { "epoch": 0.8460743801652892, "grad_norm": 5.7901763916015625, "learning_rate": 2.875938732134889e-06, "loss": 3.7253, "step": 3276 }, { "epoch": 0.8463326446280992, "grad_norm": 2.989886999130249, "learning_rate": 2.866500499242536e-06, "loss": 4.0315, "step": 3277 }, { "epoch": 0.8465909090909091, "grad_norm": 3.351362943649292, "learning_rate": 2.8570768370063285e-06, "loss": 3.3893, "step": 3278 }, { "epoch": 0.846849173553719, "grad_norm": 2.722827196121216, "learning_rate": 2.84766775162994e-06, "loss": 3.4898, "step": 3279 }, { "epoch": 0.8471074380165289, "grad_norm": 3.05431866645813, "learning_rate": 2.838273249307438e-06, "loss": 4.0295, "step": 3280 }, { "epoch": 0.8473657024793388, "grad_norm": 2.1528117656707764, "learning_rate": 2.828893336223315e-06, "loss": 3.8412, "step": 3281 }, { "epoch": 0.8476239669421488, "grad_norm": 4.143296241760254, "learning_rate": 2.8195280185524197e-06, "loss": 4.5443, "step": 3282 }, { "epoch": 0.8478822314049587, "grad_norm": 3.1096043586730957, "learning_rate": 2.810177302460024e-06, "loss": 4.3552, "step": 3283 }, { "epoch": 0.8481404958677686, "grad_norm": 2.278749942779541, "learning_rate": 2.800841194101789e-06, "loss": 3.8643, "step": 3284 }, { "epoch": 0.8483987603305785, "grad_norm": 2.4999918937683105, "learning_rate": 2.7915196996237265e-06, "loss": 4.0542, "step": 3285 }, { "epoch": 0.8486570247933884, "grad_norm": 3.6781015396118164, "learning_rate": 2.782212825162264e-06, "loss": 3.7529, "step": 3286 }, { "epoch": 0.8489152892561983, "grad_norm": 2.775178909301758, "learning_rate": 2.7729205768441846e-06, "loss": 3.8311, "step": 3287 }, { "epoch": 0.8491735537190083, "grad_norm": 1.8655985593795776, "learning_rate": 2.7636429607866527e-06, "loss": 3.8754, "step": 3288 }, { "epoch": 0.8494318181818182, "grad_norm": 4.383373260498047, "learning_rate": 2.7543799830971858e-06, "loss": 3.3937, "step": 3289 }, { "epoch": 0.8496900826446281, "grad_norm": 3.3575387001037598, "learning_rate": 2.7451316498736833e-06, "loss": 4.0975, "step": 3290 }, { "epoch": 0.849948347107438, "grad_norm": 2.4192349910736084, "learning_rate": 2.7358979672043957e-06, "loss": 4.1984, "step": 3291 }, { "epoch": 0.8502066115702479, "grad_norm": 3.2641398906707764, "learning_rate": 2.7266789411679207e-06, "loss": 3.8355, "step": 3292 }, { "epoch": 0.8504648760330579, "grad_norm": 2.862427234649658, "learning_rate": 2.717474577833226e-06, "loss": 4.192, "step": 3293 }, { "epoch": 0.8507231404958677, "grad_norm": 2.733106851577759, "learning_rate": 2.7082848832596095e-06, "loss": 4.1838, "step": 3294 }, { "epoch": 0.8509814049586777, "grad_norm": 3.110285520553589, "learning_rate": 2.699109863496721e-06, "loss": 3.7801, "step": 3295 }, { "epoch": 0.8512396694214877, "grad_norm": 2.949179172515869, "learning_rate": 2.689949524584545e-06, "loss": 4.0172, "step": 3296 }, { "epoch": 0.8514979338842975, "grad_norm": 2.6190640926361084, "learning_rate": 2.6808038725534084e-06, "loss": 4.1025, "step": 3297 }, { "epoch": 0.8517561983471075, "grad_norm": 3.1749491691589355, "learning_rate": 2.671672913423967e-06, "loss": 3.9868, "step": 3298 }, { "epoch": 0.8520144628099173, "grad_norm": 2.7600202560424805, "learning_rate": 2.662556653207193e-06, "loss": 3.8228, "step": 3299 }, { "epoch": 0.8522727272727273, "grad_norm": 3.0651867389678955, "learning_rate": 2.6534550979044086e-06, "loss": 3.8821, "step": 3300 }, { "epoch": 0.8525309917355371, "grad_norm": 2.7938334941864014, "learning_rate": 2.6443682535072177e-06, "loss": 4.3189, "step": 3301 }, { "epoch": 0.8527892561983471, "grad_norm": 2.614443778991699, "learning_rate": 2.63529612599758e-06, "loss": 3.6999, "step": 3302 }, { "epoch": 0.8530475206611571, "grad_norm": 3.3190858364105225, "learning_rate": 2.62623872134774e-06, "loss": 4.5127, "step": 3303 }, { "epoch": 0.8533057851239669, "grad_norm": 4.514482021331787, "learning_rate": 2.6171960455202536e-06, "loss": 5.0227, "step": 3304 }, { "epoch": 0.8535640495867769, "grad_norm": 4.097684383392334, "learning_rate": 2.6081681044679952e-06, "loss": 4.1776, "step": 3305 }, { "epoch": 0.8538223140495868, "grad_norm": 6.03476095199585, "learning_rate": 2.5991549041341238e-06, "loss": 4.0315, "step": 3306 }, { "epoch": 0.8540805785123967, "grad_norm": 1.9562312364578247, "learning_rate": 2.5901564504521026e-06, "loss": 3.8142, "step": 3307 }, { "epoch": 0.8543388429752066, "grad_norm": 1.7855273485183716, "learning_rate": 2.581172749345673e-06, "loss": 3.9116, "step": 3308 }, { "epoch": 0.8545971074380165, "grad_norm": 1.5924967527389526, "learning_rate": 2.5722038067288928e-06, "loss": 3.9056, "step": 3309 }, { "epoch": 0.8548553719008265, "grad_norm": 3.748455286026001, "learning_rate": 2.563249628506079e-06, "loss": 3.8707, "step": 3310 }, { "epoch": 0.8551136363636364, "grad_norm": 2.507373809814453, "learning_rate": 2.554310220571829e-06, "loss": 4.1336, "step": 3311 }, { "epoch": 0.8553719008264463, "grad_norm": 3.3440475463867188, "learning_rate": 2.5453855888110417e-06, "loss": 4.4814, "step": 3312 }, { "epoch": 0.8556301652892562, "grad_norm": 3.128073215484619, "learning_rate": 2.536475739098862e-06, "loss": 3.9918, "step": 3313 }, { "epoch": 0.8558884297520661, "grad_norm": 3.491370439529419, "learning_rate": 2.5275806773007173e-06, "loss": 4.0448, "step": 3314 }, { "epoch": 0.856146694214876, "grad_norm": 2.0319700241088867, "learning_rate": 2.51870040927229e-06, "loss": 3.8953, "step": 3315 }, { "epoch": 0.856404958677686, "grad_norm": 1.7394421100616455, "learning_rate": 2.50983494085954e-06, "loss": 4.0538, "step": 3316 }, { "epoch": 0.8566632231404959, "grad_norm": 3.2068982124328613, "learning_rate": 2.500984277898669e-06, "loss": 3.8038, "step": 3317 }, { "epoch": 0.8569214876033058, "grad_norm": 2.7238526344299316, "learning_rate": 2.4921484262161314e-06, "loss": 3.4368, "step": 3318 }, { "epoch": 0.8571797520661157, "grad_norm": 3.3359580039978027, "learning_rate": 2.483327391628651e-06, "loss": 3.9278, "step": 3319 }, { "epoch": 0.8574380165289256, "grad_norm": 2.3612728118896484, "learning_rate": 2.474521179943179e-06, "loss": 3.7719, "step": 3320 }, { "epoch": 0.8576962809917356, "grad_norm": 4.688827037811279, "learning_rate": 2.465729796956909e-06, "loss": 3.6946, "step": 3321 }, { "epoch": 0.8579545454545454, "grad_norm": 2.816591739654541, "learning_rate": 2.4569532484572777e-06, "loss": 4.3467, "step": 3322 }, { "epoch": 0.8582128099173554, "grad_norm": 1.628989577293396, "learning_rate": 2.448191540221967e-06, "loss": 3.9652, "step": 3323 }, { "epoch": 0.8584710743801653, "grad_norm": 2.445150136947632, "learning_rate": 2.4394446780188596e-06, "loss": 3.7517, "step": 3324 }, { "epoch": 0.8587293388429752, "grad_norm": 3.387211561203003, "learning_rate": 2.430712667606094e-06, "loss": 4.1262, "step": 3325 }, { "epoch": 0.8589876033057852, "grad_norm": 2.04414701461792, "learning_rate": 2.421995514732031e-06, "loss": 3.4369, "step": 3326 }, { "epoch": 0.859245867768595, "grad_norm": 4.0716118812561035, "learning_rate": 2.4132932251352213e-06, "loss": 3.922, "step": 3327 }, { "epoch": 0.859504132231405, "grad_norm": 2.450150728225708, "learning_rate": 2.40460580454446e-06, "loss": 4.1069, "step": 3328 }, { "epoch": 0.8597623966942148, "grad_norm": 2.487478494644165, "learning_rate": 2.3959332586787452e-06, "loss": 4.0515, "step": 3329 }, { "epoch": 0.8600206611570248, "grad_norm": 1.9560402631759644, "learning_rate": 2.387275593247282e-06, "loss": 3.7884, "step": 3330 }, { "epoch": 0.8602789256198347, "grad_norm": 2.6887025833129883, "learning_rate": 2.378632813949469e-06, "loss": 4.3718, "step": 3331 }, { "epoch": 0.8605371900826446, "grad_norm": 4.184412956237793, "learning_rate": 2.3700049264749306e-06, "loss": 4.0694, "step": 3332 }, { "epoch": 0.8607954545454546, "grad_norm": 5.0838541984558105, "learning_rate": 2.3613919365034653e-06, "loss": 4.3978, "step": 3333 }, { "epoch": 0.8610537190082644, "grad_norm": 4.707245826721191, "learning_rate": 2.352793849705068e-06, "loss": 4.3077, "step": 3334 }, { "epoch": 0.8613119834710744, "grad_norm": 1.856627106666565, "learning_rate": 2.344210671739935e-06, "loss": 4.0865, "step": 3335 }, { "epoch": 0.8615702479338843, "grad_norm": 3.239071846008301, "learning_rate": 2.335642408258437e-06, "loss": 3.8168, "step": 3336 }, { "epoch": 0.8618285123966942, "grad_norm": 2.5626232624053955, "learning_rate": 2.327089064901125e-06, "loss": 3.8518, "step": 3337 }, { "epoch": 0.8620867768595041, "grad_norm": 2.2752180099487305, "learning_rate": 2.318550647298734e-06, "loss": 4.2056, "step": 3338 }, { "epoch": 0.862345041322314, "grad_norm": 3.08626651763916, "learning_rate": 2.3100271610721787e-06, "loss": 3.7527, "step": 3339 }, { "epoch": 0.862603305785124, "grad_norm": 3.1925947666168213, "learning_rate": 2.3015186118325205e-06, "loss": 4.1035, "step": 3340 }, { "epoch": 0.8628615702479339, "grad_norm": 4.007467269897461, "learning_rate": 2.2930250051810125e-06, "loss": 4.5895, "step": 3341 }, { "epoch": 0.8631198347107438, "grad_norm": 3.6299121379852295, "learning_rate": 2.2845463467090713e-06, "loss": 4.1343, "step": 3342 }, { "epoch": 0.8633780991735537, "grad_norm": 6.6774773597717285, "learning_rate": 2.2760826419982535e-06, "loss": 3.9375, "step": 3343 }, { "epoch": 0.8636363636363636, "grad_norm": 2.7909927368164062, "learning_rate": 2.267633896620286e-06, "loss": 3.9834, "step": 3344 }, { "epoch": 0.8638946280991735, "grad_norm": 4.339539527893066, "learning_rate": 2.2592001161370392e-06, "loss": 3.9769, "step": 3345 }, { "epoch": 0.8641528925619835, "grad_norm": 3.283012628555298, "learning_rate": 2.2507813061005494e-06, "loss": 3.7596, "step": 3346 }, { "epoch": 0.8644111570247934, "grad_norm": 3.2643041610717773, "learning_rate": 2.24237747205297e-06, "loss": 3.9129, "step": 3347 }, { "epoch": 0.8646694214876033, "grad_norm": 4.137417793273926, "learning_rate": 2.233988619526614e-06, "loss": 3.5465, "step": 3348 }, { "epoch": 0.8649276859504132, "grad_norm": 3.3557534217834473, "learning_rate": 2.2256147540439422e-06, "loss": 4.4813, "step": 3349 }, { "epoch": 0.8651859504132231, "grad_norm": 2.7159531116485596, "learning_rate": 2.2172558811175193e-06, "loss": 4.2489, "step": 3350 }, { "epoch": 0.8654442148760331, "grad_norm": 1.8367173671722412, "learning_rate": 2.208912006250066e-06, "loss": 4.4926, "step": 3351 }, { "epoch": 0.8657024793388429, "grad_norm": 3.205218553543091, "learning_rate": 2.200583134934417e-06, "loss": 4.2086, "step": 3352 }, { "epoch": 0.8659607438016529, "grad_norm": 1.8473701477050781, "learning_rate": 2.1922692726535356e-06, "loss": 3.8765, "step": 3353 }, { "epoch": 0.8662190082644629, "grad_norm": 2.047015428543091, "learning_rate": 2.1839704248804993e-06, "loss": 3.8761, "step": 3354 }, { "epoch": 0.8664772727272727, "grad_norm": 2.0011143684387207, "learning_rate": 2.175686597078508e-06, "loss": 3.837, "step": 3355 }, { "epoch": 0.8667355371900827, "grad_norm": 4.062522888183594, "learning_rate": 2.1674177947008712e-06, "loss": 3.8684, "step": 3356 }, { "epoch": 0.8669938016528925, "grad_norm": 2.8642470836639404, "learning_rate": 2.1591640231909988e-06, "loss": 4.134, "step": 3357 }, { "epoch": 0.8672520661157025, "grad_norm": 2.6218760013580322, "learning_rate": 2.1509252879824222e-06, "loss": 4.0048, "step": 3358 }, { "epoch": 0.8675103305785123, "grad_norm": 3.055185556411743, "learning_rate": 2.142701594498764e-06, "loss": 3.8259, "step": 3359 }, { "epoch": 0.8677685950413223, "grad_norm": 3.7660911083221436, "learning_rate": 2.1344929481537436e-06, "loss": 3.8576, "step": 3360 }, { "epoch": 0.8680268595041323, "grad_norm": 3.3174123764038086, "learning_rate": 2.1262993543511717e-06, "loss": 3.7239, "step": 3361 }, { "epoch": 0.8682851239669421, "grad_norm": 3.113765001296997, "learning_rate": 2.1181208184849645e-06, "loss": 4.0702, "step": 3362 }, { "epoch": 0.8685433884297521, "grad_norm": 1.8923250436782837, "learning_rate": 2.1099573459391146e-06, "loss": 3.831, "step": 3363 }, { "epoch": 0.868801652892562, "grad_norm": 3.0733771324157715, "learning_rate": 2.1018089420876963e-06, "loss": 4.0345, "step": 3364 }, { "epoch": 0.8690599173553719, "grad_norm": 2.886568784713745, "learning_rate": 2.0936756122948738e-06, "loss": 3.7371, "step": 3365 }, { "epoch": 0.8693181818181818, "grad_norm": 3.048457145690918, "learning_rate": 2.085557361914872e-06, "loss": 3.7453, "step": 3366 }, { "epoch": 0.8695764462809917, "grad_norm": 2.6429896354675293, "learning_rate": 2.0774541962920086e-06, "loss": 4.7485, "step": 3367 }, { "epoch": 0.8698347107438017, "grad_norm": 2.694023847579956, "learning_rate": 2.0693661207606546e-06, "loss": 3.918, "step": 3368 }, { "epoch": 0.8700929752066116, "grad_norm": 2.4736437797546387, "learning_rate": 2.061293140645254e-06, "loss": 3.8064, "step": 3369 }, { "epoch": 0.8703512396694215, "grad_norm": 2.7916765213012695, "learning_rate": 2.053235261260317e-06, "loss": 3.7241, "step": 3370 }, { "epoch": 0.8706095041322314, "grad_norm": 3.8951451778411865, "learning_rate": 2.045192487910408e-06, "loss": 4.2386, "step": 3371 }, { "epoch": 0.8708677685950413, "grad_norm": 2.223626136779785, "learning_rate": 2.0371648258901447e-06, "loss": 3.7004, "step": 3372 }, { "epoch": 0.8711260330578512, "grad_norm": 2.634814500808716, "learning_rate": 2.029152280484198e-06, "loss": 3.8959, "step": 3373 }, { "epoch": 0.8713842975206612, "grad_norm": 4.991781711578369, "learning_rate": 2.0211548569672946e-06, "loss": 3.5767, "step": 3374 }, { "epoch": 0.8716425619834711, "grad_norm": 3.6380562782287598, "learning_rate": 2.0131725606042013e-06, "loss": 3.8193, "step": 3375 }, { "epoch": 0.871900826446281, "grad_norm": 2.4980459213256836, "learning_rate": 2.0052053966497204e-06, "loss": 3.9575, "step": 3376 }, { "epoch": 0.8721590909090909, "grad_norm": 2.561688184738159, "learning_rate": 1.9972533703487035e-06, "loss": 4.1568, "step": 3377 }, { "epoch": 0.8724173553719008, "grad_norm": 2.286518096923828, "learning_rate": 1.9893164869360324e-06, "loss": 3.9866, "step": 3378 }, { "epoch": 0.8726756198347108, "grad_norm": 2.858470916748047, "learning_rate": 1.981394751636617e-06, "loss": 4.0274, "step": 3379 }, { "epoch": 0.8729338842975206, "grad_norm": 2.7560088634490967, "learning_rate": 1.973488169665394e-06, "loss": 3.6959, "step": 3380 }, { "epoch": 0.8731921487603306, "grad_norm": 3.1334404945373535, "learning_rate": 1.9655967462273383e-06, "loss": 3.7026, "step": 3381 }, { "epoch": 0.8734504132231405, "grad_norm": 2.983370065689087, "learning_rate": 1.957720486517428e-06, "loss": 4.2244, "step": 3382 }, { "epoch": 0.8737086776859504, "grad_norm": 3.0055181980133057, "learning_rate": 1.9498593957206657e-06, "loss": 3.606, "step": 3383 }, { "epoch": 0.8739669421487604, "grad_norm": 2.5050644874572754, "learning_rate": 1.9420134790120754e-06, "loss": 3.6066, "step": 3384 }, { "epoch": 0.8742252066115702, "grad_norm": 2.851290702819824, "learning_rate": 1.934182741556681e-06, "loss": 3.7764, "step": 3385 }, { "epoch": 0.8744834710743802, "grad_norm": 2.5236024856567383, "learning_rate": 1.9263671885095232e-06, "loss": 3.7672, "step": 3386 }, { "epoch": 0.87474173553719, "grad_norm": 1.9330347776412964, "learning_rate": 1.9185668250156303e-06, "loss": 4.1399, "step": 3387 }, { "epoch": 0.875, "grad_norm": 2.5656964778900146, "learning_rate": 1.910781656210059e-06, "loss": 4.1019, "step": 3388 }, { "epoch": 0.87525826446281, "grad_norm": 3.4180235862731934, "learning_rate": 1.9030116872178316e-06, "loss": 3.9922, "step": 3389 }, { "epoch": 0.8755165289256198, "grad_norm": 2.650888204574585, "learning_rate": 1.8952569231539874e-06, "loss": 3.7119, "step": 3390 }, { "epoch": 0.8757747933884298, "grad_norm": 3.079531192779541, "learning_rate": 1.8875173691235537e-06, "loss": 4.1265, "step": 3391 }, { "epoch": 0.8760330578512396, "grad_norm": 8.220881462097168, "learning_rate": 1.8797930302215244e-06, "loss": 3.9571, "step": 3392 }, { "epoch": 0.8762913223140496, "grad_norm": 1.9178886413574219, "learning_rate": 1.872083911532907e-06, "loss": 4.0272, "step": 3393 }, { "epoch": 0.8765495867768595, "grad_norm": 1.701996088027954, "learning_rate": 1.8643900181326668e-06, "loss": 3.8491, "step": 3394 }, { "epoch": 0.8768078512396694, "grad_norm": 3.448707342147827, "learning_rate": 1.8567113550857578e-06, "loss": 3.8122, "step": 3395 }, { "epoch": 0.8770661157024794, "grad_norm": 3.214686870574951, "learning_rate": 1.8490479274470978e-06, "loss": 4.2586, "step": 3396 }, { "epoch": 0.8773243801652892, "grad_norm": 3.951704502105713, "learning_rate": 1.8413997402615902e-06, "loss": 3.9657, "step": 3397 }, { "epoch": 0.8775826446280992, "grad_norm": 1.9711072444915771, "learning_rate": 1.8337667985640932e-06, "loss": 4.1909, "step": 3398 }, { "epoch": 0.8778409090909091, "grad_norm": 2.556900978088379, "learning_rate": 1.8261491073794268e-06, "loss": 3.8241, "step": 3399 }, { "epoch": 0.878099173553719, "grad_norm": 2.2272233963012695, "learning_rate": 1.8185466717223849e-06, "loss": 4.1884, "step": 3400 }, { "epoch": 0.8783574380165289, "grad_norm": 2.773423910140991, "learning_rate": 1.8109594965977056e-06, "loss": 3.6758, "step": 3401 }, { "epoch": 0.8786157024793388, "grad_norm": 3.930060625076294, "learning_rate": 1.8033875870000882e-06, "loss": 3.8132, "step": 3402 }, { "epoch": 0.8788739669421488, "grad_norm": 4.648772716522217, "learning_rate": 1.7958309479141732e-06, "loss": 3.7004, "step": 3403 }, { "epoch": 0.8791322314049587, "grad_norm": 2.5668938159942627, "learning_rate": 1.7882895843145648e-06, "loss": 4.2233, "step": 3404 }, { "epoch": 0.8793904958677686, "grad_norm": 4.041262149810791, "learning_rate": 1.7807635011657997e-06, "loss": 3.4732, "step": 3405 }, { "epoch": 0.8796487603305785, "grad_norm": 2.010511875152588, "learning_rate": 1.7732527034223513e-06, "loss": 3.8098, "step": 3406 }, { "epoch": 0.8799070247933884, "grad_norm": 3.408688545227051, "learning_rate": 1.7657571960286418e-06, "loss": 4.5027, "step": 3407 }, { "epoch": 0.8801652892561983, "grad_norm": 3.06320858001709, "learning_rate": 1.7582769839190244e-06, "loss": 3.9839, "step": 3408 }, { "epoch": 0.8804235537190083, "grad_norm": 3.5746312141418457, "learning_rate": 1.7508120720177794e-06, "loss": 3.8067, "step": 3409 }, { "epoch": 0.8806818181818182, "grad_norm": 3.930293083190918, "learning_rate": 1.7433624652391096e-06, "loss": 3.8986, "step": 3410 }, { "epoch": 0.8809400826446281, "grad_norm": 2.4392547607421875, "learning_rate": 1.735928168487161e-06, "loss": 4.028, "step": 3411 }, { "epoch": 0.881198347107438, "grad_norm": 2.9954729080200195, "learning_rate": 1.7285091866559827e-06, "loss": 4.1595, "step": 3412 }, { "epoch": 0.8814566115702479, "grad_norm": 1.9812486171722412, "learning_rate": 1.721105524629546e-06, "loss": 3.8101, "step": 3413 }, { "epoch": 0.8817148760330579, "grad_norm": 3.85954213142395, "learning_rate": 1.713717187281752e-06, "loss": 3.8959, "step": 3414 }, { "epoch": 0.8819731404958677, "grad_norm": 3.7409253120422363, "learning_rate": 1.7063441794763852e-06, "loss": 3.9951, "step": 3415 }, { "epoch": 0.8822314049586777, "grad_norm": 2.3165061473846436, "learning_rate": 1.6989865060671684e-06, "loss": 3.7804, "step": 3416 }, { "epoch": 0.8824896694214877, "grad_norm": 3.811399459838867, "learning_rate": 1.691644171897705e-06, "loss": 4.0133, "step": 3417 }, { "epoch": 0.8827479338842975, "grad_norm": 3.1055095195770264, "learning_rate": 1.6843171818015147e-06, "loss": 3.723, "step": 3418 }, { "epoch": 0.8830061983471075, "grad_norm": 1.7939701080322266, "learning_rate": 1.677005540602014e-06, "loss": 3.7919, "step": 3419 }, { "epoch": 0.8832644628099173, "grad_norm": 3.217761278152466, "learning_rate": 1.6697092531125142e-06, "loss": 3.651, "step": 3420 }, { "epoch": 0.8835227272727273, "grad_norm": 2.7719264030456543, "learning_rate": 1.662428324136217e-06, "loss": 4.1831, "step": 3421 }, { "epoch": 0.8837809917355371, "grad_norm": 4.410435199737549, "learning_rate": 1.655162758466211e-06, "loss": 3.8051, "step": 3422 }, { "epoch": 0.8840392561983471, "grad_norm": 2.099574327468872, "learning_rate": 1.6479125608854812e-06, "loss": 4.0111, "step": 3423 }, { "epoch": 0.8842975206611571, "grad_norm": 2.5847878456115723, "learning_rate": 1.6406777361668902e-06, "loss": 3.7511, "step": 3424 }, { "epoch": 0.8845557851239669, "grad_norm": 3.2537429332733154, "learning_rate": 1.6334582890731698e-06, "loss": 3.674, "step": 3425 }, { "epoch": 0.8848140495867769, "grad_norm": 2.6913421154022217, "learning_rate": 1.6262542243569517e-06, "loss": 4.451, "step": 3426 }, { "epoch": 0.8850723140495868, "grad_norm": 2.1942100524902344, "learning_rate": 1.6190655467607229e-06, "loss": 4.3271, "step": 3427 }, { "epoch": 0.8853305785123967, "grad_norm": 2.364292860031128, "learning_rate": 1.6118922610168451e-06, "loss": 4.069, "step": 3428 }, { "epoch": 0.8855888429752066, "grad_norm": 3.3428244590759277, "learning_rate": 1.6047343718475438e-06, "loss": 3.807, "step": 3429 }, { "epoch": 0.8858471074380165, "grad_norm": 2.1298434734344482, "learning_rate": 1.5975918839649245e-06, "loss": 3.9321, "step": 3430 }, { "epoch": 0.8861053719008265, "grad_norm": 2.8968870639801025, "learning_rate": 1.5904648020709317e-06, "loss": 3.8653, "step": 3431 }, { "epoch": 0.8863636363636364, "grad_norm": 2.40994930267334, "learning_rate": 1.5833531308573845e-06, "loss": 3.7024, "step": 3432 }, { "epoch": 0.8866219008264463, "grad_norm": 3.909832715988159, "learning_rate": 1.5762568750059604e-06, "loss": 4.5858, "step": 3433 }, { "epoch": 0.8868801652892562, "grad_norm": 4.120272159576416, "learning_rate": 1.5691760391881611e-06, "loss": 4.08, "step": 3434 }, { "epoch": 0.8871384297520661, "grad_norm": 2.4024624824523926, "learning_rate": 1.562110628065372e-06, "loss": 3.9668, "step": 3435 }, { "epoch": 0.887396694214876, "grad_norm": 2.3347103595733643, "learning_rate": 1.5550606462888057e-06, "loss": 3.7969, "step": 3436 }, { "epoch": 0.887654958677686, "grad_norm": 2.1106293201446533, "learning_rate": 1.5480260984995166e-06, "loss": 4.2921, "step": 3437 }, { "epoch": 0.8879132231404959, "grad_norm": 3.3097736835479736, "learning_rate": 1.5410069893284002e-06, "loss": 3.8725, "step": 3438 }, { "epoch": 0.8881714876033058, "grad_norm": 2.3048887252807617, "learning_rate": 1.5340033233961937e-06, "loss": 3.9401, "step": 3439 }, { "epoch": 0.8884297520661157, "grad_norm": 3.376312494277954, "learning_rate": 1.5270151053134757e-06, "loss": 4.1373, "step": 3440 }, { "epoch": 0.8886880165289256, "grad_norm": 2.137563943862915, "learning_rate": 1.5200423396806274e-06, "loss": 3.9545, "step": 3441 }, { "epoch": 0.8889462809917356, "grad_norm": 2.716418981552124, "learning_rate": 1.5130850310878881e-06, "loss": 4.1368, "step": 3442 }, { "epoch": 0.8892045454545454, "grad_norm": 3.0952115058898926, "learning_rate": 1.5061431841153051e-06, "loss": 3.6921, "step": 3443 }, { "epoch": 0.8894628099173554, "grad_norm": 7.174221038818359, "learning_rate": 1.499216803332748e-06, "loss": 4.2986, "step": 3444 }, { "epoch": 0.8897210743801653, "grad_norm": 4.137556552886963, "learning_rate": 1.4923058932999058e-06, "loss": 3.7565, "step": 3445 }, { "epoch": 0.8899793388429752, "grad_norm": 2.8430159091949463, "learning_rate": 1.4854104585662915e-06, "loss": 4.0792, "step": 3446 }, { "epoch": 0.8902376033057852, "grad_norm": 3.147965431213379, "learning_rate": 1.4785305036712189e-06, "loss": 4.4854, "step": 3447 }, { "epoch": 0.890495867768595, "grad_norm": 2.5706870555877686, "learning_rate": 1.4716660331438147e-06, "loss": 3.9402, "step": 3448 }, { "epoch": 0.890754132231405, "grad_norm": 3.3165700435638428, "learning_rate": 1.4648170515030229e-06, "loss": 3.8854, "step": 3449 }, { "epoch": 0.8910123966942148, "grad_norm": 6.272552013397217, "learning_rate": 1.4579835632575728e-06, "loss": 3.7621, "step": 3450 }, { "epoch": 0.8912706611570248, "grad_norm": 7.483799934387207, "learning_rate": 1.4511655729060047e-06, "loss": 3.9266, "step": 3451 }, { "epoch": 0.8915289256198347, "grad_norm": 2.8220319747924805, "learning_rate": 1.4443630849366562e-06, "loss": 4.0014, "step": 3452 }, { "epoch": 0.8917871900826446, "grad_norm": 1.4340145587921143, "learning_rate": 1.4375761038276647e-06, "loss": 3.6846, "step": 3453 }, { "epoch": 0.8920454545454546, "grad_norm": 2.809190034866333, "learning_rate": 1.4308046340469366e-06, "loss": 4.0947, "step": 3454 }, { "epoch": 0.8923037190082644, "grad_norm": 3.3649113178253174, "learning_rate": 1.424048680052198e-06, "loss": 3.9043, "step": 3455 }, { "epoch": 0.8925619834710744, "grad_norm": 3.0595510005950928, "learning_rate": 1.4173082462909465e-06, "loss": 4.45, "step": 3456 }, { "epoch": 0.8928202479338843, "grad_norm": 2.793586492538452, "learning_rate": 1.4105833372004524e-06, "loss": 3.3847, "step": 3457 }, { "epoch": 0.8930785123966942, "grad_norm": 7.6289191246032715, "learning_rate": 1.4038739572077826e-06, "loss": 4.615, "step": 3458 }, { "epoch": 0.8933367768595041, "grad_norm": 2.115919828414917, "learning_rate": 1.397180110729776e-06, "loss": 3.9124, "step": 3459 }, { "epoch": 0.893595041322314, "grad_norm": 2.4956624507904053, "learning_rate": 1.3905018021730415e-06, "loss": 4.0411, "step": 3460 }, { "epoch": 0.893853305785124, "grad_norm": 2.4357903003692627, "learning_rate": 1.3838390359339593e-06, "loss": 4.0676, "step": 3461 }, { "epoch": 0.8941115702479339, "grad_norm": 2.69044828414917, "learning_rate": 1.3771918163986875e-06, "loss": 3.9968, "step": 3462 }, { "epoch": 0.8943698347107438, "grad_norm": 3.2288174629211426, "learning_rate": 1.370560147943137e-06, "loss": 3.781, "step": 3463 }, { "epoch": 0.8946280991735537, "grad_norm": 2.3274574279785156, "learning_rate": 1.3639440349329846e-06, "loss": 4.0389, "step": 3464 }, { "epoch": 0.8948863636363636, "grad_norm": 2.493269443511963, "learning_rate": 1.3573434817236802e-06, "loss": 4.09, "step": 3465 }, { "epoch": 0.8951446280991735, "grad_norm": 2.9811418056488037, "learning_rate": 1.3507584926604117e-06, "loss": 3.4057, "step": 3466 }, { "epoch": 0.8954028925619835, "grad_norm": 2.171727418899536, "learning_rate": 1.3441890720781309e-06, "loss": 3.7403, "step": 3467 }, { "epoch": 0.8956611570247934, "grad_norm": 3.3984124660491943, "learning_rate": 1.3376352243015367e-06, "loss": 4.2194, "step": 3468 }, { "epoch": 0.8959194214876033, "grad_norm": 6.718136787414551, "learning_rate": 1.3310969536450807e-06, "loss": 3.8947, "step": 3469 }, { "epoch": 0.8961776859504132, "grad_norm": 3.238020658493042, "learning_rate": 1.3245742644129589e-06, "loss": 3.5572, "step": 3470 }, { "epoch": 0.8964359504132231, "grad_norm": 2.6004586219787598, "learning_rate": 1.318067160899103e-06, "loss": 3.9488, "step": 3471 }, { "epoch": 0.8966942148760331, "grad_norm": 2.791968822479248, "learning_rate": 1.3115756473871975e-06, "loss": 3.7724, "step": 3472 }, { "epoch": 0.8969524793388429, "grad_norm": 2.9283454418182373, "learning_rate": 1.305099728150655e-06, "loss": 3.9544, "step": 3473 }, { "epoch": 0.8972107438016529, "grad_norm": 3.6008729934692383, "learning_rate": 1.2986394074526204e-06, "loss": 3.9685, "step": 3474 }, { "epoch": 0.8974690082644629, "grad_norm": 4.339023590087891, "learning_rate": 1.2921946895459724e-06, "loss": 3.5009, "step": 3475 }, { "epoch": 0.8977272727272727, "grad_norm": 2.977484703063965, "learning_rate": 1.2857655786733259e-06, "loss": 4.65, "step": 3476 }, { "epoch": 0.8979855371900827, "grad_norm": 3.275205135345459, "learning_rate": 1.2793520790670116e-06, "loss": 4.2729, "step": 3477 }, { "epoch": 0.8982438016528925, "grad_norm": 3.0364441871643066, "learning_rate": 1.2729541949490803e-06, "loss": 3.7868, "step": 3478 }, { "epoch": 0.8985020661157025, "grad_norm": 3.25525164604187, "learning_rate": 1.266571930531324e-06, "loss": 3.6386, "step": 3479 }, { "epoch": 0.8987603305785123, "grad_norm": 1.6969681978225708, "learning_rate": 1.2602052900152235e-06, "loss": 3.7668, "step": 3480 }, { "epoch": 0.8990185950413223, "grad_norm": 4.0706634521484375, "learning_rate": 1.2538542775919926e-06, "loss": 3.6319, "step": 3481 }, { "epoch": 0.8992768595041323, "grad_norm": 3.6964077949523926, "learning_rate": 1.2475188974425567e-06, "loss": 4.0246, "step": 3482 }, { "epoch": 0.8995351239669421, "grad_norm": 3.2918312549591064, "learning_rate": 1.2411991537375377e-06, "loss": 4.4386, "step": 3483 }, { "epoch": 0.8997933884297521, "grad_norm": 4.767353057861328, "learning_rate": 1.2348950506372803e-06, "loss": 3.8971, "step": 3484 }, { "epoch": 0.900051652892562, "grad_norm": 3.1855108737945557, "learning_rate": 1.2286065922918227e-06, "loss": 4.0585, "step": 3485 }, { "epoch": 0.9003099173553719, "grad_norm": 2.3574044704437256, "learning_rate": 1.2223337828409037e-06, "loss": 4.1218, "step": 3486 }, { "epoch": 0.9005681818181818, "grad_norm": 6.735174179077148, "learning_rate": 1.2160766264139645e-06, "loss": 3.7834, "step": 3487 }, { "epoch": 0.9008264462809917, "grad_norm": 2.76267147064209, "learning_rate": 1.2098351271301406e-06, "loss": 3.4616, "step": 3488 }, { "epoch": 0.9010847107438017, "grad_norm": 2.669605016708374, "learning_rate": 1.2036092890982619e-06, "loss": 4.311, "step": 3489 }, { "epoch": 0.9013429752066116, "grad_norm": 3.2821292877197266, "learning_rate": 1.197399116416839e-06, "loss": 3.6398, "step": 3490 }, { "epoch": 0.9016012396694215, "grad_norm": 2.0125632286071777, "learning_rate": 1.1912046131740877e-06, "loss": 3.9807, "step": 3491 }, { "epoch": 0.9018595041322314, "grad_norm": 5.374893665313721, "learning_rate": 1.1850257834478934e-06, "loss": 2.9842, "step": 3492 }, { "epoch": 0.9021177685950413, "grad_norm": 2.2299129962921143, "learning_rate": 1.1788626313058243e-06, "loss": 3.5551, "step": 3493 }, { "epoch": 0.9023760330578512, "grad_norm": 1.8811044692993164, "learning_rate": 1.172715160805135e-06, "loss": 3.2939, "step": 3494 }, { "epoch": 0.9026342975206612, "grad_norm": 3.008349895477295, "learning_rate": 1.166583375992758e-06, "loss": 3.7055, "step": 3495 }, { "epoch": 0.9028925619834711, "grad_norm": 2.8098244667053223, "learning_rate": 1.1604672809052864e-06, "loss": 3.9134, "step": 3496 }, { "epoch": 0.903150826446281, "grad_norm": 8.882136344909668, "learning_rate": 1.1543668795689939e-06, "loss": 3.7164, "step": 3497 }, { "epoch": 0.9034090909090909, "grad_norm": 1.9784690141677856, "learning_rate": 1.1482821759998374e-06, "loss": 4.4971, "step": 3498 }, { "epoch": 0.9036673553719008, "grad_norm": 4.22647762298584, "learning_rate": 1.1422131742034048e-06, "loss": 3.5474, "step": 3499 }, { "epoch": 0.9039256198347108, "grad_norm": 3.492882251739502, "learning_rate": 1.1361598781749804e-06, "loss": 4.2786, "step": 3500 }, { "epoch": 0.9041838842975206, "grad_norm": 3.0782439708709717, "learning_rate": 1.1301222918994936e-06, "loss": 3.5804, "step": 3501 }, { "epoch": 0.9044421487603306, "grad_norm": 4.71906852722168, "learning_rate": 1.1241004193515315e-06, "loss": 3.8378, "step": 3502 }, { "epoch": 0.9047004132231405, "grad_norm": 2.6081430912017822, "learning_rate": 1.1180942644953424e-06, "loss": 3.9551, "step": 3503 }, { "epoch": 0.9049586776859504, "grad_norm": 3.52667498588562, "learning_rate": 1.112103831284822e-06, "loss": 3.2153, "step": 3504 }, { "epoch": 0.9052169421487604, "grad_norm": 2.8831567764282227, "learning_rate": 1.1061291236635296e-06, "loss": 4.2668, "step": 3505 }, { "epoch": 0.9054752066115702, "grad_norm": 5.971936225891113, "learning_rate": 1.1001701455646496e-06, "loss": 2.0801, "step": 3506 }, { "epoch": 0.9057334710743802, "grad_norm": 5.6863179206848145, "learning_rate": 1.0942269009110329e-06, "loss": 3.6075, "step": 3507 }, { "epoch": 0.90599173553719, "grad_norm": 4.883625507354736, "learning_rate": 1.088299393615161e-06, "loss": 4.2672, "step": 3508 }, { "epoch": 0.90625, "grad_norm": 3.5639562606811523, "learning_rate": 1.0823876275791568e-06, "loss": 4.4123, "step": 3509 }, { "epoch": 0.90650826446281, "grad_norm": 1.7129499912261963, "learning_rate": 1.0764916066947794e-06, "loss": 3.8926, "step": 3510 }, { "epoch": 0.9067665289256198, "grad_norm": 2.531024694442749, "learning_rate": 1.0706113348434326e-06, "loss": 3.3479, "step": 3511 }, { "epoch": 0.9070247933884298, "grad_norm": 2.9449219703674316, "learning_rate": 1.064746815896142e-06, "loss": 3.7209, "step": 3512 }, { "epoch": 0.9072830578512396, "grad_norm": 3.705754041671753, "learning_rate": 1.058898053713564e-06, "loss": 3.6694, "step": 3513 }, { "epoch": 0.9075413223140496, "grad_norm": 4.070271015167236, "learning_rate": 1.0530650521459906e-06, "loss": 3.9521, "step": 3514 }, { "epoch": 0.9077995867768595, "grad_norm": 3.108389139175415, "learning_rate": 1.0472478150333282e-06, "loss": 4.5379, "step": 3515 }, { "epoch": 0.9080578512396694, "grad_norm": 2.9127018451690674, "learning_rate": 1.0414463462051106e-06, "loss": 4.0161, "step": 3516 }, { "epoch": 0.9083161157024794, "grad_norm": 3.026111125946045, "learning_rate": 1.0356606494804883e-06, "loss": 3.9158, "step": 3517 }, { "epoch": 0.9085743801652892, "grad_norm": 2.9682531356811523, "learning_rate": 1.0298907286682342e-06, "loss": 4.1932, "step": 3518 }, { "epoch": 0.9088326446280992, "grad_norm": 4.773813247680664, "learning_rate": 1.0241365875667291e-06, "loss": 4.2721, "step": 3519 }, { "epoch": 0.9090909090909091, "grad_norm": 2.7295033931732178, "learning_rate": 1.0183982299639683e-06, "loss": 4.1813, "step": 3520 }, { "epoch": 0.909349173553719, "grad_norm": 2.9368505477905273, "learning_rate": 1.0126756596375686e-06, "loss": 4.0295, "step": 3521 }, { "epoch": 0.9096074380165289, "grad_norm": 2.7192113399505615, "learning_rate": 1.006968880354725e-06, "loss": 3.7913, "step": 3522 }, { "epoch": 0.9098657024793388, "grad_norm": 1.602691888809204, "learning_rate": 1.0012778958722684e-06, "loss": 3.9047, "step": 3523 }, { "epoch": 0.9101239669421488, "grad_norm": 3.3259940147399902, "learning_rate": 9.95602709936616e-07, "loss": 4.1056, "step": 3524 }, { "epoch": 0.9103822314049587, "grad_norm": 3.2569944858551025, "learning_rate": 9.89943326283782e-07, "loss": 3.8926, "step": 3525 }, { "epoch": 0.9106404958677686, "grad_norm": 2.727327823638916, "learning_rate": 9.84299748639389e-07, "loss": 4.1814, "step": 3526 }, { "epoch": 0.9108987603305785, "grad_norm": 2.6677374839782715, "learning_rate": 9.786719807186457e-07, "loss": 3.8125, "step": 3527 }, { "epoch": 0.9111570247933884, "grad_norm": 3.611823081970215, "learning_rate": 9.73060026226355e-07, "loss": 3.3343, "step": 3528 }, { "epoch": 0.9114152892561983, "grad_norm": 4.13549280166626, "learning_rate": 9.674638888569093e-07, "loss": 3.9984, "step": 3529 }, { "epoch": 0.9116735537190083, "grad_norm": 3.3959436416625977, "learning_rate": 9.618835722942948e-07, "loss": 3.8421, "step": 3530 }, { "epoch": 0.9119318181818182, "grad_norm": 3.1556167602539062, "learning_rate": 9.56319080212076e-07, "loss": 3.8526, "step": 3531 }, { "epoch": 0.9121900826446281, "grad_norm": 2.6798808574676514, "learning_rate": 9.507704162733944e-07, "loss": 3.5442, "step": 3532 }, { "epoch": 0.912448347107438, "grad_norm": 3.90078067779541, "learning_rate": 9.452375841309924e-07, "loss": 4.005, "step": 3533 }, { "epoch": 0.9127066115702479, "grad_norm": 2.562237501144409, "learning_rate": 9.397205874271648e-07, "loss": 4.274, "step": 3534 }, { "epoch": 0.9129648760330579, "grad_norm": 3.465367555618286, "learning_rate": 9.342194297938006e-07, "loss": 3.9508, "step": 3535 }, { "epoch": 0.9132231404958677, "grad_norm": 2.176896810531616, "learning_rate": 9.287341148523504e-07, "loss": 3.9501, "step": 3536 }, { "epoch": 0.9134814049586777, "grad_norm": 3.828343629837036, "learning_rate": 9.232646462138422e-07, "loss": 3.9471, "step": 3537 }, { "epoch": 0.9137396694214877, "grad_norm": 2.658092975616455, "learning_rate": 9.178110274788737e-07, "loss": 3.7828, "step": 3538 }, { "epoch": 0.9139979338842975, "grad_norm": 2.9530375003814697, "learning_rate": 9.123732622375952e-07, "loss": 3.9387, "step": 3539 }, { "epoch": 0.9142561983471075, "grad_norm": 3.107215642929077, "learning_rate": 9.069513540697406e-07, "loss": 4.0971, "step": 3540 }, { "epoch": 0.9145144628099173, "grad_norm": 4.290931224822998, "learning_rate": 9.015453065445906e-07, "loss": 3.6953, "step": 3541 }, { "epoch": 0.9147727272727273, "grad_norm": 2.674926519393921, "learning_rate": 8.961551232209847e-07, "loss": 3.7598, "step": 3542 }, { "epoch": 0.9150309917355371, "grad_norm": 2.520958662033081, "learning_rate": 8.907808076473261e-07, "loss": 4.0788, "step": 3543 }, { "epoch": 0.9152892561983471, "grad_norm": 6.0664849281311035, "learning_rate": 8.854223633615766e-07, "loss": 2.9592, "step": 3544 }, { "epoch": 0.9155475206611571, "grad_norm": 1.8273793458938599, "learning_rate": 8.800797938912253e-07, "loss": 3.9364, "step": 3545 }, { "epoch": 0.9158057851239669, "grad_norm": 2.2911531925201416, "learning_rate": 8.747531027533396e-07, "loss": 3.5465, "step": 3546 }, { "epoch": 0.9160640495867769, "grad_norm": 1.7610814571380615, "learning_rate": 8.694422934545226e-07, "loss": 3.9775, "step": 3547 }, { "epoch": 0.9163223140495868, "grad_norm": 6.852576732635498, "learning_rate": 8.641473694909141e-07, "loss": 3.0203, "step": 3548 }, { "epoch": 0.9165805785123967, "grad_norm": 2.7168402671813965, "learning_rate": 8.588683343482091e-07, "loss": 3.4856, "step": 3549 }, { "epoch": 0.9168388429752066, "grad_norm": 1.9423593282699585, "learning_rate": 8.536051915016385e-07, "loss": 3.8008, "step": 3550 }, { "epoch": 0.9170971074380165, "grad_norm": 4.035871982574463, "learning_rate": 8.483579444159673e-07, "loss": 3.95, "step": 3551 }, { "epoch": 0.9173553719008265, "grad_norm": 1.4160363674163818, "learning_rate": 8.431265965454988e-07, "loss": 3.6164, "step": 3552 }, { "epoch": 0.9176136363636364, "grad_norm": 4.186315059661865, "learning_rate": 8.379111513340754e-07, "loss": 3.9976, "step": 3553 }, { "epoch": 0.9178719008264463, "grad_norm": 2.632674217224121, "learning_rate": 8.327116122150647e-07, "loss": 4.159, "step": 3554 }, { "epoch": 0.9181301652892562, "grad_norm": 25.313425064086914, "learning_rate": 8.275279826113563e-07, "loss": 3.9102, "step": 3555 }, { "epoch": 0.9183884297520661, "grad_norm": 2.918551445007324, "learning_rate": 8.223602659353819e-07, "loss": 3.7977, "step": 3556 }, { "epoch": 0.918646694214876, "grad_norm": 2.145977258682251, "learning_rate": 8.172084655890894e-07, "loss": 3.9878, "step": 3557 }, { "epoch": 0.918904958677686, "grad_norm": 3.7760541439056396, "learning_rate": 8.120725849639466e-07, "loss": 3.8918, "step": 3558 }, { "epoch": 0.9191632231404959, "grad_norm": 2.1841700077056885, "learning_rate": 8.069526274409434e-07, "loss": 4.2546, "step": 3559 }, { "epoch": 0.9194214876033058, "grad_norm": 4.07749605178833, "learning_rate": 8.01848596390592e-07, "loss": 3.9019, "step": 3560 }, { "epoch": 0.9196797520661157, "grad_norm": 3.4071943759918213, "learning_rate": 7.96760495172913e-07, "loss": 3.9261, "step": 3561 }, { "epoch": 0.9199380165289256, "grad_norm": 2.7749783992767334, "learning_rate": 7.916883271374437e-07, "loss": 3.763, "step": 3562 }, { "epoch": 0.9201962809917356, "grad_norm": 2.5384106636047363, "learning_rate": 7.866320956232354e-07, "loss": 4.4781, "step": 3563 }, { "epoch": 0.9204545454545454, "grad_norm": 3.222029447555542, "learning_rate": 7.815918039588366e-07, "loss": 4.1509, "step": 3564 }, { "epoch": 0.9207128099173554, "grad_norm": 1.9281995296478271, "learning_rate": 7.765674554623181e-07, "loss": 3.4706, "step": 3565 }, { "epoch": 0.9209710743801653, "grad_norm": 1.9244985580444336, "learning_rate": 7.715590534412454e-07, "loss": 3.8712, "step": 3566 }, { "epoch": 0.9212293388429752, "grad_norm": 4.350169658660889, "learning_rate": 7.665666011926892e-07, "loss": 3.885, "step": 3567 }, { "epoch": 0.9214876033057852, "grad_norm": 3.2592520713806152, "learning_rate": 7.615901020032207e-07, "loss": 3.7888, "step": 3568 }, { "epoch": 0.921745867768595, "grad_norm": 2.6862549781799316, "learning_rate": 7.566295591489053e-07, "loss": 4.531, "step": 3569 }, { "epoch": 0.922004132231405, "grad_norm": 2.453934669494629, "learning_rate": 7.516849758953143e-07, "loss": 3.7551, "step": 3570 }, { "epoch": 0.9222623966942148, "grad_norm": 3.1986212730407715, "learning_rate": 7.467563554974937e-07, "loss": 3.9773, "step": 3571 }, { "epoch": 0.9225206611570248, "grad_norm": 2.410470485687256, "learning_rate": 7.418437012000068e-07, "loss": 3.6328, "step": 3572 }, { "epoch": 0.9227789256198347, "grad_norm": 2.69417405128479, "learning_rate": 7.36947016236883e-07, "loss": 3.8412, "step": 3573 }, { "epoch": 0.9230371900826446, "grad_norm": 2.5853934288024902, "learning_rate": 7.320663038316495e-07, "loss": 3.4261, "step": 3574 }, { "epoch": 0.9232954545454546, "grad_norm": 2.7000441551208496, "learning_rate": 7.272015671973193e-07, "loss": 3.9085, "step": 3575 }, { "epoch": 0.9235537190082644, "grad_norm": 2.1995344161987305, "learning_rate": 7.223528095363862e-07, "loss": 3.3805, "step": 3576 }, { "epoch": 0.9238119834710744, "grad_norm": 4.606137752532959, "learning_rate": 7.175200340408278e-07, "loss": 3.9038, "step": 3577 }, { "epoch": 0.9240702479338843, "grad_norm": 3.294633388519287, "learning_rate": 7.127032438920933e-07, "loss": 3.7809, "step": 3578 }, { "epoch": 0.9243285123966942, "grad_norm": 2.9429337978363037, "learning_rate": 7.079024422611158e-07, "loss": 3.6757, "step": 3579 }, { "epoch": 0.9245867768595041, "grad_norm": 3.2297754287719727, "learning_rate": 7.031176323083033e-07, "loss": 3.6423, "step": 3580 }, { "epoch": 0.924845041322314, "grad_norm": 1.9332275390625, "learning_rate": 6.983488171835307e-07, "loss": 3.8889, "step": 3581 }, { "epoch": 0.925103305785124, "grad_norm": 2.453639507293701, "learning_rate": 6.935960000261421e-07, "loss": 3.5478, "step": 3582 }, { "epoch": 0.9253615702479339, "grad_norm": 2.6810505390167236, "learning_rate": 6.888591839649655e-07, "loss": 3.3324, "step": 3583 }, { "epoch": 0.9256198347107438, "grad_norm": 4.026587963104248, "learning_rate": 6.841383721182731e-07, "loss": 3.9957, "step": 3584 }, { "epoch": 0.9258780991735537, "grad_norm": 3.26335072517395, "learning_rate": 6.79433567593818e-07, "loss": 3.6117, "step": 3585 }, { "epoch": 0.9261363636363636, "grad_norm": 2.937220811843872, "learning_rate": 6.747447734888112e-07, "loss": 3.7804, "step": 3586 }, { "epoch": 0.9263946280991735, "grad_norm": 4.127540111541748, "learning_rate": 6.700719928899146e-07, "loss": 3.8031, "step": 3587 }, { "epoch": 0.9266528925619835, "grad_norm": 2.839317798614502, "learning_rate": 6.654152288732646e-07, "loss": 4.212, "step": 3588 }, { "epoch": 0.9269111570247934, "grad_norm": 2.022718906402588, "learning_rate": 6.607744845044423e-07, "loss": 4.0061, "step": 3589 }, { "epoch": 0.9271694214876033, "grad_norm": 2.500077486038208, "learning_rate": 6.561497628384844e-07, "loss": 3.9907, "step": 3590 }, { "epoch": 0.9274276859504132, "grad_norm": 3.331881523132324, "learning_rate": 6.515410669198863e-07, "loss": 3.7093, "step": 3591 }, { "epoch": 0.9276859504132231, "grad_norm": 2.213618278503418, "learning_rate": 6.469483997825848e-07, "loss": 3.9325, "step": 3592 }, { "epoch": 0.9279442148760331, "grad_norm": 2.4202346801757812, "learning_rate": 6.423717644499699e-07, "loss": 4.0531, "step": 3593 }, { "epoch": 0.9282024793388429, "grad_norm": 2.8435637950897217, "learning_rate": 6.378111639348732e-07, "loss": 3.3785, "step": 3594 }, { "epoch": 0.9284607438016529, "grad_norm": 5.658176422119141, "learning_rate": 6.332666012395766e-07, "loss": 3.9061, "step": 3595 }, { "epoch": 0.9287190082644629, "grad_norm": 7.229365825653076, "learning_rate": 6.287380793558062e-07, "loss": 3.915, "step": 3596 }, { "epoch": 0.9289772727272727, "grad_norm": 2.193605661392212, "learning_rate": 6.242256012647136e-07, "loss": 3.8973, "step": 3597 }, { "epoch": 0.9292355371900827, "grad_norm": 2.9365005493164062, "learning_rate": 6.197291699369057e-07, "loss": 3.9148, "step": 3598 }, { "epoch": 0.9294938016528925, "grad_norm": 3.3393795490264893, "learning_rate": 6.152487883324148e-07, "loss": 3.4458, "step": 3599 }, { "epoch": 0.9297520661157025, "grad_norm": 2.567018747329712, "learning_rate": 6.107844594007151e-07, "loss": 3.9864, "step": 3600 }, { "epoch": 0.9300103305785123, "grad_norm": 1.9808021783828735, "learning_rate": 6.063361860807026e-07, "loss": 4.2781, "step": 3601 }, { "epoch": 0.9302685950413223, "grad_norm": 4.03883171081543, "learning_rate": 6.019039713007185e-07, "loss": 3.5376, "step": 3602 }, { "epoch": 0.9305268595041323, "grad_norm": 2.2003190517425537, "learning_rate": 5.974878179785204e-07, "loss": 3.8032, "step": 3603 }, { "epoch": 0.9307851239669421, "grad_norm": 5.127209663391113, "learning_rate": 5.930877290212966e-07, "loss": 3.3527, "step": 3604 }, { "epoch": 0.9310433884297521, "grad_norm": 2.475587844848633, "learning_rate": 5.887037073256607e-07, "loss": 3.6034, "step": 3605 }, { "epoch": 0.931301652892562, "grad_norm": 3.4822230339050293, "learning_rate": 5.843357557776485e-07, "loss": 3.4549, "step": 3606 }, { "epoch": 0.9315599173553719, "grad_norm": 3.4666247367858887, "learning_rate": 5.799838772527155e-07, "loss": 3.797, "step": 3607 }, { "epoch": 0.9318181818181818, "grad_norm": 2.763679265975952, "learning_rate": 5.756480746157394e-07, "loss": 4.0025, "step": 3608 }, { "epoch": 0.9320764462809917, "grad_norm": 2.187828540802002, "learning_rate": 5.713283507210148e-07, "loss": 4.0391, "step": 3609 }, { "epoch": 0.9323347107438017, "grad_norm": 6.783918380737305, "learning_rate": 5.670247084122421e-07, "loss": 3.4075, "step": 3610 }, { "epoch": 0.9325929752066116, "grad_norm": 2.6813230514526367, "learning_rate": 5.627371505225492e-07, "loss": 3.9258, "step": 3611 }, { "epoch": 0.9328512396694215, "grad_norm": 1.845249056816101, "learning_rate": 5.584656798744703e-07, "loss": 4.0647, "step": 3612 }, { "epoch": 0.9331095041322314, "grad_norm": 3.758363723754883, "learning_rate": 5.54210299279942e-07, "loss": 3.674, "step": 3613 }, { "epoch": 0.9333677685950413, "grad_norm": 2.0308547019958496, "learning_rate": 5.499710115403206e-07, "loss": 4.059, "step": 3614 }, { "epoch": 0.9336260330578512, "grad_norm": 4.15519380569458, "learning_rate": 5.4574781944636e-07, "loss": 4.2786, "step": 3615 }, { "epoch": 0.9338842975206612, "grad_norm": 2.061561346054077, "learning_rate": 5.415407257782196e-07, "loss": 3.8487, "step": 3616 }, { "epoch": 0.9341425619834711, "grad_norm": 2.2659218311309814, "learning_rate": 5.373497333054617e-07, "loss": 4.4387, "step": 3617 }, { "epoch": 0.934400826446281, "grad_norm": 2.8847782611846924, "learning_rate": 5.331748447870544e-07, "loss": 4.2173, "step": 3618 }, { "epoch": 0.9346590909090909, "grad_norm": 3.14064359664917, "learning_rate": 5.290160629713519e-07, "loss": 4.1775, "step": 3619 }, { "epoch": 0.9349173553719008, "grad_norm": 2.5849497318267822, "learning_rate": 5.248733905961173e-07, "loss": 3.3148, "step": 3620 }, { "epoch": 0.9351756198347108, "grad_norm": 3.883075714111328, "learning_rate": 5.207468303885076e-07, "loss": 3.9116, "step": 3621 }, { "epoch": 0.9354338842975206, "grad_norm": 4.203197956085205, "learning_rate": 5.166363850650641e-07, "loss": 4.0373, "step": 3622 }, { "epoch": 0.9356921487603306, "grad_norm": 5.385982990264893, "learning_rate": 5.125420573317275e-07, "loss": 3.6048, "step": 3623 }, { "epoch": 0.9359504132231405, "grad_norm": 5.884946346282959, "learning_rate": 5.084638498838224e-07, "loss": 3.6407, "step": 3624 }, { "epoch": 0.9362086776859504, "grad_norm": 2.7006030082702637, "learning_rate": 5.044017654060706e-07, "loss": 3.576, "step": 3625 }, { "epoch": 0.9364669421487604, "grad_norm": 3.7900357246398926, "learning_rate": 5.003558065725722e-07, "loss": 4.5216, "step": 3626 }, { "epoch": 0.9367252066115702, "grad_norm": 2.7831337451934814, "learning_rate": 4.963259760468103e-07, "loss": 4.1246, "step": 3627 }, { "epoch": 0.9369834710743802, "grad_norm": 8.833452224731445, "learning_rate": 4.923122764816601e-07, "loss": 3.7038, "step": 3628 }, { "epoch": 0.93724173553719, "grad_norm": 2.1629490852355957, "learning_rate": 4.883147105193636e-07, "loss": 3.7966, "step": 3629 }, { "epoch": 0.9375, "grad_norm": 5.0555806159973145, "learning_rate": 4.843332807915546e-07, "loss": 3.5105, "step": 3630 }, { "epoch": 0.93775826446281, "grad_norm": 3.1834371089935303, "learning_rate": 4.803679899192392e-07, "loss": 3.6899, "step": 3631 }, { "epoch": 0.9380165289256198, "grad_norm": 3.5879034996032715, "learning_rate": 4.7641884051279894e-07, "loss": 3.9035, "step": 3632 }, { "epoch": 0.9382747933884298, "grad_norm": 3.2886576652526855, "learning_rate": 4.724858351719902e-07, "loss": 3.7003, "step": 3633 }, { "epoch": 0.9385330578512396, "grad_norm": 2.383491277694702, "learning_rate": 4.685689764859391e-07, "loss": 3.8176, "step": 3634 }, { "epoch": 0.9387913223140496, "grad_norm": 2.5970187187194824, "learning_rate": 4.646682670331498e-07, "loss": 3.9204, "step": 3635 }, { "epoch": 0.9390495867768595, "grad_norm": 4.201479911804199, "learning_rate": 4.6078370938148483e-07, "loss": 3.7808, "step": 3636 }, { "epoch": 0.9393078512396694, "grad_norm": 3.039266586303711, "learning_rate": 4.569153060881792e-07, "loss": 3.9689, "step": 3637 }, { "epoch": 0.9395661157024794, "grad_norm": 2.735351324081421, "learning_rate": 4.5306305969983476e-07, "loss": 4.4283, "step": 3638 }, { "epoch": 0.9398243801652892, "grad_norm": 1.879524827003479, "learning_rate": 4.4922697275241453e-07, "loss": 3.4413, "step": 3639 }, { "epoch": 0.9400826446280992, "grad_norm": 3.1176557540893555, "learning_rate": 4.4540704777124573e-07, "loss": 4.267, "step": 3640 }, { "epoch": 0.9403409090909091, "grad_norm": 5.534834861755371, "learning_rate": 4.4160328727101687e-07, "loss": 4.2729, "step": 3641 }, { "epoch": 0.940599173553719, "grad_norm": 4.826541900634766, "learning_rate": 4.3781569375576926e-07, "loss": 3.638, "step": 3642 }, { "epoch": 0.9408574380165289, "grad_norm": 2.3753552436828613, "learning_rate": 4.3404426971890287e-07, "loss": 3.6885, "step": 3643 }, { "epoch": 0.9411157024793388, "grad_norm": 2.6811978816986084, "learning_rate": 4.302890176431845e-07, "loss": 4.2142, "step": 3644 }, { "epoch": 0.9413739669421488, "grad_norm": 3.814085006713867, "learning_rate": 4.2654994000072e-07, "loss": 3.5891, "step": 3645 }, { "epoch": 0.9416322314049587, "grad_norm": 2.5375049114227295, "learning_rate": 4.22827039252971e-07, "loss": 3.8732, "step": 3646 }, { "epoch": 0.9418904958677686, "grad_norm": 2.354015350341797, "learning_rate": 4.191203178507602e-07, "loss": 4.5163, "step": 3647 }, { "epoch": 0.9421487603305785, "grad_norm": 2.702793598175049, "learning_rate": 4.15429778234247e-07, "loss": 4.057, "step": 3648 }, { "epoch": 0.9424070247933884, "grad_norm": 2.405747413635254, "learning_rate": 4.1175542283294064e-07, "loss": 3.8646, "step": 3649 }, { "epoch": 0.9426652892561983, "grad_norm": 3.5038275718688965, "learning_rate": 4.08097254065698e-07, "loss": 3.5723, "step": 3650 }, { "epoch": 0.9429235537190083, "grad_norm": 3.655653715133667, "learning_rate": 4.044552743407287e-07, "loss": 3.6455, "step": 3651 }, { "epoch": 0.9431818181818182, "grad_norm": 1.8909848928451538, "learning_rate": 4.0082948605556506e-07, "loss": 4.389, "step": 3652 }, { "epoch": 0.9434400826446281, "grad_norm": 3.0998072624206543, "learning_rate": 3.972198915970976e-07, "loss": 3.7829, "step": 3653 }, { "epoch": 0.943698347107438, "grad_norm": 2.5775530338287354, "learning_rate": 3.936264933415507e-07, "loss": 4.6002, "step": 3654 }, { "epoch": 0.9439566115702479, "grad_norm": 2.600149631500244, "learning_rate": 3.9004929365448473e-07, "loss": 3.9519, "step": 3655 }, { "epoch": 0.9442148760330579, "grad_norm": 2.9042296409606934, "learning_rate": 3.864882948908022e-07, "loss": 3.9944, "step": 3656 }, { "epoch": 0.9444731404958677, "grad_norm": 2.1986021995544434, "learning_rate": 3.8294349939473064e-07, "loss": 3.6855, "step": 3657 }, { "epoch": 0.9447314049586777, "grad_norm": 3.1400649547576904, "learning_rate": 3.7941490949984224e-07, "loss": 4.3104, "step": 3658 }, { "epoch": 0.9449896694214877, "grad_norm": 2.919401168823242, "learning_rate": 3.7590252752903177e-07, "loss": 4.3374, "step": 3659 }, { "epoch": 0.9452479338842975, "grad_norm": 2.4905686378479004, "learning_rate": 3.7240635579452733e-07, "loss": 4.0912, "step": 3660 }, { "epoch": 0.9455061983471075, "grad_norm": 1.9873576164245605, "learning_rate": 3.6892639659789066e-07, "loss": 3.8291, "step": 3661 }, { "epoch": 0.9457644628099173, "grad_norm": 3.1374504566192627, "learning_rate": 3.6546265223000034e-07, "loss": 3.8698, "step": 3662 }, { "epoch": 0.9460227272727273, "grad_norm": 4.210402965545654, "learning_rate": 3.6201512497107124e-07, "loss": 3.5665, "step": 3663 }, { "epoch": 0.9462809917355371, "grad_norm": 2.623227119445801, "learning_rate": 3.585838170906353e-07, "loss": 4.0072, "step": 3664 }, { "epoch": 0.9465392561983471, "grad_norm": 2.8956997394561768, "learning_rate": 3.551687308475521e-07, "loss": 3.961, "step": 3665 }, { "epoch": 0.9467975206611571, "grad_norm": 2.2702836990356445, "learning_rate": 3.517698684899928e-07, "loss": 4.0355, "step": 3666 }, { "epoch": 0.9470557851239669, "grad_norm": 1.8134617805480957, "learning_rate": 3.483872322554621e-07, "loss": 3.7891, "step": 3667 }, { "epoch": 0.9473140495867769, "grad_norm": 3.547464370727539, "learning_rate": 3.450208243707731e-07, "loss": 3.5185, "step": 3668 }, { "epoch": 0.9475723140495868, "grad_norm": 1.8650012016296387, "learning_rate": 3.416706470520586e-07, "loss": 3.832, "step": 3669 }, { "epoch": 0.9478305785123967, "grad_norm": 1.827328085899353, "learning_rate": 3.383367025047685e-07, "loss": 3.9037, "step": 3670 }, { "epoch": 0.9480888429752066, "grad_norm": 2.643094778060913, "learning_rate": 3.350189929236608e-07, "loss": 3.6632, "step": 3671 }, { "epoch": 0.9483471074380165, "grad_norm": 2.6356139183044434, "learning_rate": 3.317175204928136e-07, "loss": 4.1297, "step": 3672 }, { "epoch": 0.9486053719008265, "grad_norm": 2.8614132404327393, "learning_rate": 3.284322873856105e-07, "loss": 3.7939, "step": 3673 }, { "epoch": 0.9488636363636364, "grad_norm": 3.2617287635803223, "learning_rate": 3.251632957647438e-07, "loss": 3.4691, "step": 3674 }, { "epoch": 0.9491219008264463, "grad_norm": 1.9675489664077759, "learning_rate": 3.219105477822226e-07, "loss": 3.4108, "step": 3675 }, { "epoch": 0.9493801652892562, "grad_norm": 1.8828868865966797, "learning_rate": 3.1867404557934797e-07, "loss": 3.9808, "step": 3676 }, { "epoch": 0.9496384297520661, "grad_norm": 1.5962607860565186, "learning_rate": 3.1545379128674333e-07, "loss": 4.0234, "step": 3677 }, { "epoch": 0.949896694214876, "grad_norm": 4.955358982086182, "learning_rate": 3.122497870243185e-07, "loss": 3.5231, "step": 3678 }, { "epoch": 0.950154958677686, "grad_norm": 6.2165937423706055, "learning_rate": 3.0906203490130294e-07, "loss": 3.9117, "step": 3679 }, { "epoch": 0.9504132231404959, "grad_norm": 4.086905479431152, "learning_rate": 3.058905370162124e-07, "loss": 3.689, "step": 3680 }, { "epoch": 0.9506714876033058, "grad_norm": 3.228405237197876, "learning_rate": 3.027352954568713e-07, "loss": 4.4152, "step": 3681 }, { "epoch": 0.9509297520661157, "grad_norm": 4.46485710144043, "learning_rate": 2.995963123004014e-07, "loss": 3.7514, "step": 3682 }, { "epoch": 0.9511880165289256, "grad_norm": 3.8669095039367676, "learning_rate": 2.9647358961321925e-07, "loss": 3.9936, "step": 3683 }, { "epoch": 0.9514462809917356, "grad_norm": 3.09136962890625, "learning_rate": 2.9336712945103594e-07, "loss": 4.2159, "step": 3684 }, { "epoch": 0.9517045454545454, "grad_norm": 2.50604248046875, "learning_rate": 2.9027693385886014e-07, "loss": 3.7919, "step": 3685 }, { "epoch": 0.9519628099173554, "grad_norm": 3.223982334136963, "learning_rate": 2.872030048709895e-07, "loss": 3.6059, "step": 3686 }, { "epoch": 0.9522210743801653, "grad_norm": 2.721142053604126, "learning_rate": 2.841453445110193e-07, "loss": 3.893, "step": 3687 }, { "epoch": 0.9524793388429752, "grad_norm": 4.293535232543945, "learning_rate": 2.8110395479182815e-07, "loss": 3.8741, "step": 3688 }, { "epoch": 0.9527376033057852, "grad_norm": 3.934767723083496, "learning_rate": 2.7807883771558686e-07, "loss": 3.3655, "step": 3689 }, { "epoch": 0.952995867768595, "grad_norm": 16.960939407348633, "learning_rate": 2.750699952737579e-07, "loss": 3.9724, "step": 3690 }, { "epoch": 0.953254132231405, "grad_norm": 2.839562177658081, "learning_rate": 2.7207742944707905e-07, "loss": 3.6558, "step": 3691 }, { "epoch": 0.9535123966942148, "grad_norm": 3.5212130546569824, "learning_rate": 2.691011422055856e-07, "loss": 4.4495, "step": 3692 }, { "epoch": 0.9537706611570248, "grad_norm": 2.4836361408233643, "learning_rate": 2.66141135508588e-07, "loss": 3.5764, "step": 3693 }, { "epoch": 0.9540289256198347, "grad_norm": 2.1646218299865723, "learning_rate": 2.631974113046831e-07, "loss": 4.293, "step": 3694 }, { "epoch": 0.9542871900826446, "grad_norm": 3.2409098148345947, "learning_rate": 2.6026997153174583e-07, "loss": 3.4936, "step": 3695 }, { "epoch": 0.9545454545454546, "grad_norm": 4.4169182777404785, "learning_rate": 2.5735881811693176e-07, "loss": 3.8165, "step": 3696 }, { "epoch": 0.9548037190082644, "grad_norm": 4.380348205566406, "learning_rate": 2.544639529766829e-07, "loss": 4.389, "step": 3697 }, { "epoch": 0.9550619834710744, "grad_norm": 2.2903876304626465, "learning_rate": 2.515853780167027e-07, "loss": 3.9272, "step": 3698 }, { "epoch": 0.9553202479338843, "grad_norm": 2.9783577919006348, "learning_rate": 2.4872309513198634e-07, "loss": 3.5496, "step": 3699 }, { "epoch": 0.9555785123966942, "grad_norm": 2.26296067237854, "learning_rate": 2.4587710620679607e-07, "loss": 3.9011, "step": 3700 }, { "epoch": 0.9558367768595041, "grad_norm": 3.035733699798584, "learning_rate": 2.430474131146609e-07, "loss": 3.7971, "step": 3701 }, { "epoch": 0.956095041322314, "grad_norm": 2.918849468231201, "learning_rate": 2.4023401771840204e-07, "loss": 4.2294, "step": 3702 }, { "epoch": 0.956353305785124, "grad_norm": 2.6566898822784424, "learning_rate": 2.374369218700906e-07, "loss": 3.4507, "step": 3703 }, { "epoch": 0.9566115702479339, "grad_norm": 5.254640579223633, "learning_rate": 2.3465612741108144e-07, "loss": 3.093, "step": 3704 }, { "epoch": 0.9568698347107438, "grad_norm": 2.8269729614257812, "learning_rate": 2.3189163617199083e-07, "loss": 4.1941, "step": 3705 }, { "epoch": 0.9571280991735537, "grad_norm": 2.4934072494506836, "learning_rate": 2.2914344997270743e-07, "loss": 4.0287, "step": 3706 }, { "epoch": 0.9573863636363636, "grad_norm": 2.8003454208374023, "learning_rate": 2.2641157062238129e-07, "loss": 3.8878, "step": 3707 }, { "epoch": 0.9576446280991735, "grad_norm": 2.9515388011932373, "learning_rate": 2.2369599991943213e-07, "loss": 4.4049, "step": 3708 }, { "epoch": 0.9579028925619835, "grad_norm": 3.15842604637146, "learning_rate": 2.2099673965153834e-07, "loss": 4.0408, "step": 3709 }, { "epoch": 0.9581611570247934, "grad_norm": 2.8122758865356445, "learning_rate": 2.183137915956479e-07, "loss": 4.0698, "step": 3710 }, { "epoch": 0.9584194214876033, "grad_norm": 4.076789379119873, "learning_rate": 2.156471575179647e-07, "loss": 3.9091, "step": 3711 }, { "epoch": 0.9586776859504132, "grad_norm": 2.7649779319763184, "learning_rate": 2.1299683917395397e-07, "loss": 4.1204, "step": 3712 }, { "epoch": 0.9589359504132231, "grad_norm": 2.4461724758148193, "learning_rate": 2.1036283830834224e-07, "loss": 3.9315, "step": 3713 }, { "epoch": 0.9591942148760331, "grad_norm": 2.9369423389434814, "learning_rate": 2.0774515665511195e-07, "loss": 3.9407, "step": 3714 }, { "epoch": 0.9594524793388429, "grad_norm": 2.164090871810913, "learning_rate": 2.051437959375041e-07, "loss": 4.3452, "step": 3715 }, { "epoch": 0.9597107438016529, "grad_norm": 2.6939380168914795, "learning_rate": 2.0255875786801827e-07, "loss": 4.0194, "step": 3716 }, { "epoch": 0.9599690082644629, "grad_norm": 3.3126726150512695, "learning_rate": 1.9999004414839872e-07, "loss": 3.7825, "step": 3717 }, { "epoch": 0.9602272727272727, "grad_norm": 2.066408395767212, "learning_rate": 1.9743765646965118e-07, "loss": 3.6792, "step": 3718 }, { "epoch": 0.9604855371900827, "grad_norm": 3.171905040740967, "learning_rate": 1.9490159651203988e-07, "loss": 4.3359, "step": 3719 }, { "epoch": 0.9607438016528925, "grad_norm": 1.8101623058319092, "learning_rate": 1.9238186594506269e-07, "loss": 4.3427, "step": 3720 }, { "epoch": 0.9610020661157025, "grad_norm": 3.9175972938537598, "learning_rate": 1.8987846642748442e-07, "loss": 3.7765, "step": 3721 }, { "epoch": 0.9612603305785123, "grad_norm": 2.73000431060791, "learning_rate": 1.87391399607309e-07, "loss": 4.3477, "step": 3722 }, { "epoch": 0.9615185950413223, "grad_norm": 6.257915019989014, "learning_rate": 1.8492066712179622e-07, "loss": 3.8845, "step": 3723 }, { "epoch": 0.9617768595041323, "grad_norm": 3.2913451194763184, "learning_rate": 1.8246627059744504e-07, "loss": 3.919, "step": 3724 }, { "epoch": 0.9620351239669421, "grad_norm": 2.014899492263794, "learning_rate": 1.8002821165000461e-07, "loss": 3.8099, "step": 3725 }, { "epoch": 0.9622933884297521, "grad_norm": 4.362002372741699, "learning_rate": 1.7760649188446888e-07, "loss": 4.0233, "step": 3726 }, { "epoch": 0.962551652892562, "grad_norm": 1.5968549251556396, "learning_rate": 1.7520111289507646e-07, "loss": 4.0203, "step": 3727 }, { "epoch": 0.9628099173553719, "grad_norm": 3.3780195713043213, "learning_rate": 1.7281207626530238e-07, "loss": 3.82, "step": 3728 }, { "epoch": 0.9630681818181818, "grad_norm": 2.670426845550537, "learning_rate": 1.704393835678747e-07, "loss": 4.1838, "step": 3729 }, { "epoch": 0.9633264462809917, "grad_norm": 4.920481204986572, "learning_rate": 1.680830363647523e-07, "loss": 3.6248, "step": 3730 }, { "epoch": 0.9635847107438017, "grad_norm": 3.8281586170196533, "learning_rate": 1.6574303620713328e-07, "loss": 3.7173, "step": 3731 }, { "epoch": 0.9638429752066116, "grad_norm": 3.203110933303833, "learning_rate": 1.6341938463546313e-07, "loss": 3.9088, "step": 3732 }, { "epoch": 0.9641012396694215, "grad_norm": 1.9780921936035156, "learning_rate": 1.6111208317942106e-07, "loss": 3.8569, "step": 3733 }, { "epoch": 0.9643595041322314, "grad_norm": 1.769336462020874, "learning_rate": 1.5882113335791705e-07, "loss": 3.5172, "step": 3734 }, { "epoch": 0.9646177685950413, "grad_norm": 2.922206163406372, "learning_rate": 1.5654653667910302e-07, "loss": 3.7563, "step": 3735 }, { "epoch": 0.9648760330578512, "grad_norm": 2.4740402698516846, "learning_rate": 1.5428829464036455e-07, "loss": 3.9908, "step": 3736 }, { "epoch": 0.9651342975206612, "grad_norm": 3.8055756092071533, "learning_rate": 1.5204640872832076e-07, "loss": 3.8317, "step": 3737 }, { "epoch": 0.9653925619834711, "grad_norm": 3.6758317947387695, "learning_rate": 1.4982088041881892e-07, "loss": 3.4692, "step": 3738 }, { "epoch": 0.965650826446281, "grad_norm": 3.608020782470703, "learning_rate": 1.4761171117694817e-07, "loss": 3.3145, "step": 3739 }, { "epoch": 0.9659090909090909, "grad_norm": 1.8991438150405884, "learning_rate": 1.4541890245701462e-07, "loss": 3.3896, "step": 3740 }, { "epoch": 0.9661673553719008, "grad_norm": 3.2739222049713135, "learning_rate": 1.4324245570256633e-07, "loss": 4.4301, "step": 3741 }, { "epoch": 0.9664256198347108, "grad_norm": 2.9526984691619873, "learning_rate": 1.4108237234637667e-07, "loss": 3.864, "step": 3742 }, { "epoch": 0.9666838842975206, "grad_norm": 3.7908105850219727, "learning_rate": 1.389386538104387e-07, "loss": 4.2213, "step": 3743 }, { "epoch": 0.9669421487603306, "grad_norm": 3.3527090549468994, "learning_rate": 1.3681130150598463e-07, "loss": 3.9539, "step": 3744 }, { "epoch": 0.9672004132231405, "grad_norm": 1.6673051118850708, "learning_rate": 1.3470031683346651e-07, "loss": 3.7954, "step": 3745 }, { "epoch": 0.9674586776859504, "grad_norm": 2.6516566276550293, "learning_rate": 1.3260570118256156e-07, "loss": 3.6477, "step": 3746 }, { "epoch": 0.9677169421487604, "grad_norm": 2.9701693058013916, "learning_rate": 1.3052745593216952e-07, "loss": 4.0349, "step": 3747 }, { "epoch": 0.9679752066115702, "grad_norm": 3.2317473888397217, "learning_rate": 1.284655824504183e-07, "loss": 3.6846, "step": 3748 }, { "epoch": 0.9682334710743802, "grad_norm": 1.9116369485855103, "learning_rate": 1.264200820946554e-07, "loss": 3.474, "step": 3749 }, { "epoch": 0.96849173553719, "grad_norm": 3.2927606105804443, "learning_rate": 1.2439095621144536e-07, "loss": 3.8175, "step": 3750 }, { "epoch": 0.96875, "grad_norm": 4.106019973754883, "learning_rate": 1.2237820613658636e-07, "loss": 4.6622, "step": 3751 }, { "epoch": 0.96900826446281, "grad_norm": 2.786437749862671, "learning_rate": 1.2038183319507955e-07, "loss": 3.9879, "step": 3752 }, { "epoch": 0.9692665289256198, "grad_norm": 3.1503076553344727, "learning_rate": 1.1840183870115428e-07, "loss": 4.0546, "step": 3753 }, { "epoch": 0.9695247933884298, "grad_norm": 2.8748972415924072, "learning_rate": 1.1643822395825955e-07, "loss": 3.5166, "step": 3754 }, { "epoch": 0.9697830578512396, "grad_norm": 3.124781370162964, "learning_rate": 1.1449099025905574e-07, "loss": 3.6403, "step": 3755 }, { "epoch": 0.9700413223140496, "grad_norm": 2.3954668045043945, "learning_rate": 1.1256013888542306e-07, "loss": 4.115, "step": 3756 }, { "epoch": 0.9702995867768595, "grad_norm": 4.432156085968018, "learning_rate": 1.106456711084558e-07, "loss": 4.237, "step": 3757 }, { "epoch": 0.9705578512396694, "grad_norm": 3.1674132347106934, "learning_rate": 1.0874758818846253e-07, "loss": 4.233, "step": 3758 }, { "epoch": 0.9708161157024794, "grad_norm": 2.7982518672943115, "learning_rate": 1.0686589137496595e-07, "loss": 4.7042, "step": 3759 }, { "epoch": 0.9710743801652892, "grad_norm": 2.1420135498046875, "learning_rate": 1.050005819067057e-07, "loss": 3.6027, "step": 3760 }, { "epoch": 0.9713326446280992, "grad_norm": 3.091804265975952, "learning_rate": 1.0315166101162455e-07, "loss": 3.4556, "step": 3761 }, { "epoch": 0.9715909090909091, "grad_norm": 3.5762948989868164, "learning_rate": 1.0131912990688219e-07, "loss": 3.9391, "step": 3762 }, { "epoch": 0.971849173553719, "grad_norm": 3.6113154888153076, "learning_rate": 9.950298979885254e-08, "loss": 4.258, "step": 3763 }, { "epoch": 0.9721074380165289, "grad_norm": 3.8507139682769775, "learning_rate": 9.770324188310975e-08, "loss": 4.041, "step": 3764 }, { "epoch": 0.9723657024793388, "grad_norm": 4.201594352722168, "learning_rate": 9.591988734444779e-08, "loss": 4.1159, "step": 3765 }, { "epoch": 0.9726239669421488, "grad_norm": 2.372098445892334, "learning_rate": 9.415292735685532e-08, "loss": 3.4503, "step": 3766 }, { "epoch": 0.9728822314049587, "grad_norm": 4.540857791900635, "learning_rate": 9.240236308354077e-08, "loss": 3.3379, "step": 3767 }, { "epoch": 0.9731404958677686, "grad_norm": 2.4730288982391357, "learning_rate": 9.066819567691565e-08, "loss": 3.9177, "step": 3768 }, { "epoch": 0.9733987603305785, "grad_norm": 2.660264015197754, "learning_rate": 8.895042627858896e-08, "loss": 3.7236, "step": 3769 }, { "epoch": 0.9736570247933884, "grad_norm": 4.249515056610107, "learning_rate": 8.724905601939226e-08, "loss": 4.4593, "step": 3770 }, { "epoch": 0.9739152892561983, "grad_norm": 2.1504886150360107, "learning_rate": 8.556408601934075e-08, "loss": 4.2497, "step": 3771 }, { "epoch": 0.9741735537190083, "grad_norm": 5.52370023727417, "learning_rate": 8.38955173876721e-08, "loss": 4.1061, "step": 3772 }, { "epoch": 0.9744318181818182, "grad_norm": 2.1309311389923096, "learning_rate": 8.224335122281046e-08, "loss": 4.0396, "step": 3773 }, { "epoch": 0.9746900826446281, "grad_norm": 2.473555088043213, "learning_rate": 8.060758861239414e-08, "loss": 3.8411, "step": 3774 }, { "epoch": 0.974948347107438, "grad_norm": 2.1947848796844482, "learning_rate": 7.898823063325622e-08, "loss": 3.9069, "step": 3775 }, { "epoch": 0.9752066115702479, "grad_norm": 2.6744422912597656, "learning_rate": 7.738527835143561e-08, "loss": 3.7621, "step": 3776 }, { "epoch": 0.9754648760330579, "grad_norm": 5.313077449798584, "learning_rate": 7.579873282216599e-08, "loss": 4.7774, "step": 3777 }, { "epoch": 0.9757231404958677, "grad_norm": 2.9748306274414062, "learning_rate": 7.422859508988411e-08, "loss": 3.6893, "step": 3778 }, { "epoch": 0.9759814049586777, "grad_norm": 2.56394362449646, "learning_rate": 7.267486618822427e-08, "loss": 4.0334, "step": 3779 }, { "epoch": 0.9762396694214877, "grad_norm": 2.0437560081481934, "learning_rate": 7.113754714001552e-08, "loss": 3.9546, "step": 3780 }, { "epoch": 0.9764979338842975, "grad_norm": 2.1684954166412354, "learning_rate": 6.961663895728999e-08, "loss": 3.8235, "step": 3781 }, { "epoch": 0.9767561983471075, "grad_norm": 2.1603941917419434, "learning_rate": 6.811214264127453e-08, "loss": 4.1331, "step": 3782 }, { "epoch": 0.9770144628099173, "grad_norm": 3.1635611057281494, "learning_rate": 6.662405918238523e-08, "loss": 4.2055, "step": 3783 }, { "epoch": 0.9772727272727273, "grad_norm": 2.8396782875061035, "learning_rate": 6.515238956024405e-08, "loss": 3.7942, "step": 3784 }, { "epoch": 0.9775309917355371, "grad_norm": 3.25321102142334, "learning_rate": 6.369713474366212e-08, "loss": 4.1893, "step": 3785 }, { "epoch": 0.9777892561983471, "grad_norm": 1.6388353109359741, "learning_rate": 6.225829569064257e-08, "loss": 4.015, "step": 3786 }, { "epoch": 0.9780475206611571, "grad_norm": 4.08117151260376, "learning_rate": 6.083587334838603e-08, "loss": 3.6703, "step": 3787 }, { "epoch": 0.9783057851239669, "grad_norm": 2.6119368076324463, "learning_rate": 5.9429868653285147e-08, "loss": 3.9928, "step": 3788 }, { "epoch": 0.9785640495867769, "grad_norm": 3.2524871826171875, "learning_rate": 5.8040282530924526e-08, "loss": 3.9185, "step": 3789 }, { "epoch": 0.9788223140495868, "grad_norm": 2.822561025619507, "learning_rate": 5.666711589607521e-08, "loss": 4.1498, "step": 3790 }, { "epoch": 0.9790805785123967, "grad_norm": 2.732611656188965, "learning_rate": 5.5310369652708527e-08, "loss": 3.9251, "step": 3791 }, { "epoch": 0.9793388429752066, "grad_norm": 2.4466490745544434, "learning_rate": 5.397004469397671e-08, "loss": 4.853, "step": 3792 }, { "epoch": 0.9795971074380165, "grad_norm": 3.0260298252105713, "learning_rate": 5.2646141902229494e-08, "loss": 4.0974, "step": 3793 }, { "epoch": 0.9798553719008265, "grad_norm": 2.959453821182251, "learning_rate": 5.1338662149000294e-08, "loss": 3.7604, "step": 3794 }, { "epoch": 0.9801136363636364, "grad_norm": 2.862783193588257, "learning_rate": 5.004760629501726e-08, "loss": 3.7588, "step": 3795 }, { "epoch": 0.9803719008264463, "grad_norm": 4.212141036987305, "learning_rate": 4.877297519018664e-08, "loss": 4.1054, "step": 3796 }, { "epoch": 0.9806301652892562, "grad_norm": 2.8226680755615234, "learning_rate": 4.751476967361501e-08, "loss": 3.6735, "step": 3797 }, { "epoch": 0.9808884297520661, "grad_norm": 1.610856056213379, "learning_rate": 4.627299057358147e-08, "loss": 3.6304, "step": 3798 }, { "epoch": 0.981146694214876, "grad_norm": 2.6479296684265137, "learning_rate": 4.504763870756268e-08, "loss": 3.9615, "step": 3799 }, { "epoch": 0.981404958677686, "grad_norm": 3.985316753387451, "learning_rate": 4.383871488221891e-08, "loss": 3.3891, "step": 3800 }, { "epoch": 0.9816632231404959, "grad_norm": 1.89349365234375, "learning_rate": 4.264621989339135e-08, "loss": 3.7524, "step": 3801 }, { "epoch": 0.9819214876033058, "grad_norm": 2.7913386821746826, "learning_rate": 4.147015452611036e-08, "loss": 3.687, "step": 3802 }, { "epoch": 0.9821797520661157, "grad_norm": 2.9487271308898926, "learning_rate": 4.0310519554589977e-08, "loss": 3.7305, "step": 3803 }, { "epoch": 0.9824380165289256, "grad_norm": 1.9191807508468628, "learning_rate": 3.9167315742225094e-08, "loss": 3.9847, "step": 3804 }, { "epoch": 0.9826962809917356, "grad_norm": 3.2890465259552, "learning_rate": 3.804054384159428e-08, "loss": 3.2683, "step": 3805 }, { "epoch": 0.9829545454545454, "grad_norm": 2.414564847946167, "learning_rate": 3.693020459446528e-08, "loss": 3.8734, "step": 3806 }, { "epoch": 0.9832128099173554, "grad_norm": 6.40816068649292, "learning_rate": 3.5836298731778405e-08, "loss": 3.2054, "step": 3807 }, { "epoch": 0.9834710743801653, "grad_norm": 2.808929204940796, "learning_rate": 3.4758826973660394e-08, "loss": 4.7952, "step": 3808 }, { "epoch": 0.9837293388429752, "grad_norm": 2.660249710083008, "learning_rate": 3.369779002942442e-08, "loss": 4.2547, "step": 3809 }, { "epoch": 0.9839876033057852, "grad_norm": 3.2698280811309814, "learning_rate": 3.26531885975534e-08, "loss": 3.9706, "step": 3810 }, { "epoch": 0.984245867768595, "grad_norm": 2.994636297225952, "learning_rate": 3.162502336572226e-08, "loss": 4.0749, "step": 3811 }, { "epoch": 0.984504132231405, "grad_norm": 2.382948875427246, "learning_rate": 3.06132950107757e-08, "loss": 4.2929, "step": 3812 }, { "epoch": 0.9847623966942148, "grad_norm": 2.666175127029419, "learning_rate": 2.9618004198742056e-08, "loss": 3.5847, "step": 3813 }, { "epoch": 0.9850206611570248, "grad_norm": 2.9445669651031494, "learning_rate": 2.8639151584833324e-08, "loss": 3.5333, "step": 3814 }, { "epoch": 0.9852789256198347, "grad_norm": 2.730403184890747, "learning_rate": 2.767673781343405e-08, "loss": 4.3761, "step": 3815 }, { "epoch": 0.9855371900826446, "grad_norm": 5.03980827331543, "learning_rate": 2.6730763518106883e-08, "loss": 3.9679, "step": 3816 }, { "epoch": 0.9857954545454546, "grad_norm": 2.1362690925598145, "learning_rate": 2.5801229321595342e-08, "loss": 3.89, "step": 3817 }, { "epoch": 0.9860537190082644, "grad_norm": 2.0731263160705566, "learning_rate": 2.4888135835821058e-08, "loss": 4.0419, "step": 3818 }, { "epoch": 0.9863119834710744, "grad_norm": 3.2159321308135986, "learning_rate": 2.39914836618782e-08, "loss": 3.7587, "step": 3819 }, { "epoch": 0.9865702479338843, "grad_norm": 3.5625083446502686, "learning_rate": 2.3111273390039047e-08, "loss": 3.4397, "step": 3820 }, { "epoch": 0.9868285123966942, "grad_norm": 2.853739023208618, "learning_rate": 2.2247505599756746e-08, "loss": 3.7525, "step": 3821 }, { "epoch": 0.9870867768595041, "grad_norm": 2.632124662399292, "learning_rate": 2.1400180859651453e-08, "loss": 3.9496, "step": 3822 }, { "epoch": 0.987345041322314, "grad_norm": 3.2143917083740234, "learning_rate": 2.0569299727529744e-08, "loss": 4.5887, "step": 3823 }, { "epoch": 0.987603305785124, "grad_norm": 2.920564651489258, "learning_rate": 1.9754862750359647e-08, "loss": 4.3253, "step": 3824 }, { "epoch": 0.9878615702479339, "grad_norm": 3.417703628540039, "learning_rate": 1.8956870464298394e-08, "loss": 3.9315, "step": 3825 }, { "epoch": 0.9881198347107438, "grad_norm": 3.0080394744873047, "learning_rate": 1.817532339466743e-08, "loss": 3.527, "step": 3826 }, { "epoch": 0.9883780991735537, "grad_norm": 2.2923505306243896, "learning_rate": 1.7410222055963543e-08, "loss": 4.0141, "step": 3827 }, { "epoch": 0.9886363636363636, "grad_norm": 3.6300172805786133, "learning_rate": 1.6661566951861607e-08, "loss": 3.7848, "step": 3828 }, { "epoch": 0.9888946280991735, "grad_norm": 3.8843719959259033, "learning_rate": 1.5929358575206275e-08, "loss": 3.908, "step": 3829 }, { "epoch": 0.9891528925619835, "grad_norm": 1.8394577503204346, "learning_rate": 1.5213597408020307e-08, "loss": 3.9275, "step": 3830 }, { "epoch": 0.9894111570247934, "grad_norm": 4.292716979980469, "learning_rate": 1.4514283921485128e-08, "loss": 3.5938, "step": 3831 }, { "epoch": 0.9896694214876033, "grad_norm": 6.4566240310668945, "learning_rate": 1.3831418575974141e-08, "loss": 4.3647, "step": 3832 }, { "epoch": 0.9899276859504132, "grad_norm": 2.292285680770874, "learning_rate": 1.3165001821016654e-08, "loss": 3.7255, "step": 3833 }, { "epoch": 0.9901859504132231, "grad_norm": 3.484215497970581, "learning_rate": 1.2515034095322842e-08, "loss": 3.6859, "step": 3834 }, { "epoch": 0.9904442148760331, "grad_norm": 5.899170398712158, "learning_rate": 1.1881515826769885e-08, "loss": 3.8802, "step": 3835 }, { "epoch": 0.9907024793388429, "grad_norm": 1.8650264739990234, "learning_rate": 1.1264447432410286e-08, "loss": 4.2468, "step": 3836 }, { "epoch": 0.9909607438016529, "grad_norm": 3.1647677421569824, "learning_rate": 1.0663829318463547e-08, "loss": 3.7471, "step": 3837 }, { "epoch": 0.9912190082644629, "grad_norm": 5.678305625915527, "learning_rate": 1.0079661880318948e-08, "loss": 3.3638, "step": 3838 }, { "epoch": 0.9914772727272727, "grad_norm": 2.9323747158050537, "learning_rate": 9.511945502543862e-09, "loss": 3.7567, "step": 3839 }, { "epoch": 0.9917355371900827, "grad_norm": 3.95342755317688, "learning_rate": 8.960680558867118e-09, "loss": 3.8404, "step": 3840 }, { "epoch": 0.9919938016528925, "grad_norm": 3.0686604976654053, "learning_rate": 8.42586741219009e-09, "loss": 3.9015, "step": 3841 }, { "epoch": 0.9922520661157025, "grad_norm": 1.9059290885925293, "learning_rate": 7.907506414583932e-09, "loss": 3.8496, "step": 3842 }, { "epoch": 0.9925103305785123, "grad_norm": 2.6519007682800293, "learning_rate": 7.4055979072923386e-09, "loss": 4.1903, "step": 3843 }, { "epoch": 0.9927685950413223, "grad_norm": 4.5479841232299805, "learning_rate": 6.920142220726011e-09, "loss": 4.0758, "step": 3844 }, { "epoch": 0.9930268595041323, "grad_norm": 5.673126697540283, "learning_rate": 6.451139674459872e-09, "loss": 4.3556, "step": 3845 }, { "epoch": 0.9932851239669421, "grad_norm": 2.0551342964172363, "learning_rate": 5.998590577246943e-09, "loss": 3.9404, "step": 3846 }, { "epoch": 0.9935433884297521, "grad_norm": 4.944198131561279, "learning_rate": 5.562495227001696e-09, "loss": 3.6279, "step": 3847 }, { "epoch": 0.993801652892562, "grad_norm": 3.7643721103668213, "learning_rate": 5.142853910808376e-09, "loss": 3.8774, "step": 3848 }, { "epoch": 0.9940599173553719, "grad_norm": 3.213345527648926, "learning_rate": 4.739666904921003e-09, "loss": 4.3038, "step": 3849 }, { "epoch": 0.9943181818181818, "grad_norm": 2.724073648452759, "learning_rate": 4.352934474760595e-09, "loss": 4.1468, "step": 3850 }, { "epoch": 0.9945764462809917, "grad_norm": 6.293656826019287, "learning_rate": 3.982656874917945e-09, "loss": 4.1677, "step": 3851 }, { "epoch": 0.9948347107438017, "grad_norm": 1.9398272037506104, "learning_rate": 3.6288343491452936e-09, "loss": 4.3585, "step": 3852 }, { "epoch": 0.9950929752066116, "grad_norm": 3.517050266265869, "learning_rate": 3.291467130370207e-09, "loss": 3.9428, "step": 3853 }, { "epoch": 0.9953512396694215, "grad_norm": 1.9605659246444702, "learning_rate": 2.970555440684475e-09, "loss": 3.8963, "step": 3854 }, { "epoch": 0.9956095041322314, "grad_norm": 2.0712828636169434, "learning_rate": 2.666099491346885e-09, "loss": 4.2585, "step": 3855 }, { "epoch": 0.9958677685950413, "grad_norm": 1.767076849937439, "learning_rate": 2.3780994827776738e-09, "loss": 3.9502, "step": 3856 }, { "epoch": 0.9961260330578512, "grad_norm": 4.356866359710693, "learning_rate": 2.1065556045779534e-09, "loss": 3.8967, "step": 3857 }, { "epoch": 0.9963842975206612, "grad_norm": 2.5807902812957764, "learning_rate": 1.8514680355019577e-09, "loss": 4.1113, "step": 3858 }, { "epoch": 0.9966425619834711, "grad_norm": 3.530592441558838, "learning_rate": 1.6128369434764701e-09, "loss": 4.5562, "step": 3859 }, { "epoch": 0.996900826446281, "grad_norm": 1.9079642295837402, "learning_rate": 1.3906624855924977e-09, "loss": 4.3893, "step": 3860 }, { "epoch": 0.9971590909090909, "grad_norm": 2.869751453399658, "learning_rate": 1.1849448081135972e-09, "loss": 3.832, "step": 3861 }, { "epoch": 0.9974173553719008, "grad_norm": 3.273883819580078, "learning_rate": 9.956840464647732e-10, "loss": 4.4417, "step": 3862 }, { "epoch": 0.9976756198347108, "grad_norm": 2.6594390869140625, "learning_rate": 8.228803252324779e-10, "loss": 3.7716, "step": 3863 }, { "epoch": 0.9979338842975206, "grad_norm": 4.988260746002197, "learning_rate": 6.665337581812647e-10, "loss": 4.0487, "step": 3864 }, { "epoch": 0.9981921487603306, "grad_norm": 2.482924222946167, "learning_rate": 5.266444482315835e-10, "loss": 3.8237, "step": 3865 }, { "epoch": 0.9984504132231405, "grad_norm": 2.9042654037475586, "learning_rate": 4.0321248747643426e-10, "loss": 4.157, "step": 3866 }, { "epoch": 0.9987086776859504, "grad_norm": 2.915332317352295, "learning_rate": 2.962379571674889e-10, "loss": 4.0858, "step": 3867 }, { "epoch": 0.9989669421487604, "grad_norm": 3.2134788036346436, "learning_rate": 2.0572092773174513e-10, "loss": 3.9542, "step": 3868 }, { "epoch": 0.9992252066115702, "grad_norm": 3.2500338554382324, "learning_rate": 1.31661458752097e-10, "loss": 3.8375, "step": 3869 }, { "epoch": 0.9994834710743802, "grad_norm": 2.921463966369629, "learning_rate": 7.405959898953984e-11, "loss": 4.2789, "step": 3870 }, { "epoch": 0.99974173553719, "grad_norm": 3.375498056411743, "learning_rate": 3.291538635541436e-11, "loss": 4.284, "step": 3871 }, { "epoch": 1.0, "grad_norm": 2.078681468963623, "learning_rate": 8.228847944713459e-12, "loss": 3.843, "step": 3872 } ], "logging_steps": 1, "max_steps": 3872, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }